From ec70b60d3ecb603884d6c86e175654888ed9da3d Mon Sep 17 00:00:00 2001 From: eavanvalkenburg Date: Fri, 23 May 2025 15:05:20 +0200 Subject: [PATCH 1/3] moved vector field to new setup --- .../concepts/caching/semantic_caching.py | 20 +- .../store_chat_history_in_cosmosdb.py | 14 +- .../data_model.py | 39 +-- .../samples/concepts/memory/complex_memory.py | 24 +- python/samples/concepts/memory/data_models.py | 45 +-- .../concepts/memory/memory_with_pandas.py | 79 +++-- .../samples/concepts/memory/simple_memory.py | 35 +- .../rag/rag_with_vector_collection.py | 8 +- .../samples/concepts/rag/self_critique_rag.py | 8 +- .../third_party/postgres-memory.ipynb | 19 +- .../connectors/memory/azure_ai_search.py | 15 +- .../connectors/memory/azure_cosmos_db.py | 12 +- .../connectors/memory/faiss.py | 4 +- .../connectors/memory/in_memory.py | 4 +- .../connectors/memory/mongodb.py | 4 +- .../connectors/memory/pinecone.py | 6 +- .../connectors/memory/postgres.py | 26 +- .../connectors/memory/redis.py | 43 +-- .../connectors/memory/sql_server.py | 51 ++- .../connectors/memory/weaviate.py | 32 +- python/semantic_kernel/data/__init__.py | 12 +- python/semantic_kernel/data/definitions.py | 329 +++++++++--------- python/semantic_kernel/data/vectors.py | 86 ++--- python/tests/conftest.py | 55 +-- .../memory/azure_cosmos_db/conftest.py | 25 +- .../memory/postgres/test_postgres_int.py | 51 ++- ...test_azure_cosmos_db_mongodb_collection.py | 13 +- .../test_azure_cosmos_db_no_sql_collection.py | 3 +- .../unit/connectors/memory/test_faiss.py | 15 +- .../connectors/memory/test_postgres_store.py | 50 +-- .../unit/connectors/memory/test_qdrant.py | 4 +- .../unit/connectors/memory/test_sql_server.py | 37 +- python/tests/unit/data/conftest.py | 79 ++--- .../data/test_vector_store_model_decorator.py | 77 ++-- .../test_vector_store_record_definition.py | 46 +-- 35 files changed, 651 insertions(+), 719 deletions(-) diff --git a/python/samples/concepts/caching/semantic_caching.py 
b/python/samples/concepts/caching/semantic_caching.py index dedacdd7c682..2a175dd4ca8a 100644 --- a/python/samples/concepts/caching/semantic_caching.py +++ b/python/samples/concepts/caching/semantic_caching.py @@ -10,15 +10,7 @@ from semantic_kernel import Kernel from semantic_kernel.connectors.ai.open_ai import OpenAIChatCompletion, OpenAITextEmbedding from semantic_kernel.connectors.memory.in_memory import InMemoryStore -from semantic_kernel.data import ( - VectorSearchOptions, - VectorStore, - VectorStoreDataField, - VectorStoreKeyField, - VectorStoreRecordCollection, - VectorStoreVectorField, - vectorstoremodel, -) +from semantic_kernel.data import VectorStore, VectorStoreField, VectorStoreRecordCollection, vectorstoremodel from semantic_kernel.filters import FilterTypes, FunctionInvocationContext, PromptRenderContext from semantic_kernel.functions import FunctionResult @@ -32,9 +24,9 @@ @vectorstoremodel(collection_name=COLLECTION_NAME) @dataclass class CacheRecord: - result: Annotated[str, VectorStoreDataField(is_full_text_indexed=True)] - prompt: Annotated[str | None, VectorStoreVectorField(dimensions=1536)] = None - id: Annotated[str, VectorStoreKeyField] = field(default_factory=lambda: str(uuid4())) + result: Annotated[str, VectorStoreField("data", is_full_text_indexed=True)] + prompt: Annotated[str | None, VectorStoreField("vector", dimensions=1536)] = None + id: Annotated[str, VectorStoreField("key")] = field(default_factory=lambda: str(uuid4())) # Define the filters, one for caching the results and one for using the cache. 
@@ -66,9 +58,7 @@ async def on_prompt_render( """ await next(context) await self.collection.ensure_collection_exists() - results = await self.collection.search( - context.rendered_prompt, options=VectorSearchOptions(vector_property_name="prompt", top=1) - ) + results = await self.collection.search(context.rendered_prompt, vector_property_name="prompt", top=1) async for result in results.results: if result.score and result.score < self.score_threshold: context.function_result = FunctionResult( diff --git a/python/samples/concepts/chat_history/store_chat_history_in_cosmosdb.py b/python/samples/concepts/chat_history/store_chat_history_in_cosmosdb.py index 283e339cdaf3..c20ea1841e28 100644 --- a/python/samples/concepts/chat_history/store_chat_history_in_cosmosdb.py +++ b/python/samples/concepts/chat_history/store_chat_history_in_cosmosdb.py @@ -11,13 +11,7 @@ from semantic_kernel.contents import ChatHistory, ChatMessageContent from semantic_kernel.core_plugins.math_plugin import MathPlugin from semantic_kernel.core_plugins.time_plugin import TimePlugin -from semantic_kernel.data import ( - VectorStore, - VectorStoreDataField, - VectorStoreKeyField, - VectorStoreRecordCollection, - vectorstoremodel, -) +from semantic_kernel.data import VectorStore, VectorStoreField, VectorStoreRecordCollection, vectorstoremodel """ This sample demonstrates how to build a conversational chatbot @@ -39,9 +33,9 @@ @vectorstoremodel @dataclass class ChatHistoryModel: - session_id: Annotated[str, VectorStoreKeyField] - user_id: Annotated[str, VectorStoreDataField(is_indexed=True)] - messages: Annotated[list[dict[str, str]], VectorStoreDataField(is_indexed=True)] + session_id: Annotated[str, VectorStoreField("key")] + user_id: Annotated[str, VectorStoreField("data", is_indexed=True)] + messages: Annotated[list[dict[str, str]], VectorStoreField("data", is_indexed=True)] # 2. 
We then create a class that extends the ChatHistory class diff --git a/python/samples/concepts/memory/azure_ai_search_hotel_samples/data_model.py b/python/samples/concepts/memory/azure_ai_search_hotel_samples/data_model.py index c56824f57e21..6c290b4b4257 100644 --- a/python/samples/concepts/memory/azure_ai_search_hotel_samples/data_model.py +++ b/python/samples/concepts/memory/azure_ai_search_hotel_samples/data_model.py @@ -15,7 +15,7 @@ ) from pydantic import BaseModel, ConfigDict -from semantic_kernel.data import VectorStoreDataField, VectorStoreKeyField, VectorStoreVectorField, vectorstoremodel +from semantic_kernel.data import VectorStoreField, vectorstoremodel """ The data model used for this sample is based on the hotel data model from the Azure AI Search samples. @@ -55,29 +55,20 @@ class Address(BaseModel): @vectorstoremodel(collection_name="hotel-index") class HotelSampleClass(BaseModel): - HotelId: Annotated[str, VectorStoreKeyField] - HotelName: Annotated[str | None, VectorStoreDataField()] = None - Description: Annotated[ - str, - VectorStoreDataField(is_full_text_indexed=True), - ] - DescriptionVector: Annotated[ - list[float] | str | None, - VectorStoreVectorField(dimensions=1536), - ] = None - Description_fr: Annotated[str, VectorStoreDataField(is_full_text_indexed=True)] - DescriptionFrVector: Annotated[ - list[float] | str | None, - VectorStoreVectorField(dimensions=1536), - ] = None - Category: Annotated[str, VectorStoreDataField()] - Tags: Annotated[list[str], VectorStoreDataField(is_indexed=True)] - ParkingIncluded: Annotated[bool | None, VectorStoreDataField()] = None - LastRenovationDate: Annotated[str | None, VectorStoreDataField(type=SearchFieldDataType.DateTimeOffset)] = None - Rating: Annotated[float, VectorStoreDataField()] - Location: Annotated[dict[str, Any], VectorStoreDataField(type=SearchFieldDataType.GeographyPoint)] - Address: Annotated[Address, VectorStoreDataField()] - Rooms: Annotated[list[Rooms], VectorStoreDataField()] + 
HotelId: Annotated[str, VectorStoreField("key")] + HotelName: Annotated[str | None, VectorStoreField("data")] = None + Description: Annotated[str, VectorStoreField("data", is_full_text_indexed=True)] + DescriptionVector: Annotated[list[float] | str | None, VectorStoreField("vector", dimensions=1536)] = None + Description_fr: Annotated[str, VectorStoreField("data", is_full_text_indexed=True)] + DescriptionFrVector: Annotated[list[float] | str | None, VectorStoreField("vector", dimensions=1536)] = None + Category: Annotated[str, VectorStoreField("data")] + Tags: Annotated[list[str], VectorStoreField("data", is_indexed=True)] + ParkingIncluded: Annotated[bool | None, VectorStoreField("data")] = None + LastRenovationDate: Annotated[str | None, VectorStoreField("data", type=SearchFieldDataType.DateTimeOffset)] = None + Rating: Annotated[float, VectorStoreField("data")] + Location: Annotated[dict[str, Any], VectorStoreField("data", type=SearchFieldDataType.GeographyPoint)] + Address: Annotated[Address, VectorStoreField("data")] + Rooms: Annotated[list[Rooms], VectorStoreField("data")] model_config = ConfigDict(extra="ignore") diff --git a/python/samples/concepts/memory/complex_memory.py b/python/samples/concepts/memory/complex_memory.py index c9f0d8feebc3..16ebc3cf92af 100644 --- a/python/samples/concepts/memory/complex_memory.py +++ b/python/samples/concepts/memory/complex_memory.py @@ -26,13 +26,8 @@ SqlServerCollection, WeaviateCollection, ) -from semantic_kernel.data import ( - VectorStoreDataField, - VectorStoreKeyField, - VectorStoreRecordCollection, - VectorStoreVectorField, - vectorstoremodel, -) +from semantic_kernel.data import VectorStoreRecordCollection, vectorstoremodel +from semantic_kernel.data.definitions import VectorStoreField from semantic_kernel.data.vectors import SearchType, VectorSearch # This is a rather complex sample, showing how to use the vector store @@ -48,14 +43,19 @@ @vectorstoremodel(collection_name="test") @dataclass class DataModel: - 
title: Annotated[str, VectorStoreDataField(is_full_text_indexed=True)] - content: Annotated[str, VectorStoreDataField(is_full_text_indexed=True)] + title: Annotated[str, VectorStoreField("data", is_full_text_indexed=True)] + content: Annotated[str, VectorStoreField("data", is_full_text_indexed=True)] embedding: Annotated[ str | None, - VectorStoreVectorField(dimensions=1536, type_="float"), + VectorStoreField("vector", dimensions=1536, type_="float"), ] = None - id: Annotated[str, VectorStoreKeyField()] = field(default_factory=lambda: str(uuid4())) - tag: Annotated[str | None, VectorStoreDataField(type_="str", is_indexed=True)] = None + id: Annotated[ + str, + VectorStoreField( + "key", + ), + ] = field(default_factory=lambda: str(uuid4())) + tag: Annotated[str | None, VectorStoreField("data", type_="str", is_indexed=True)] = None def __post_init__(self, **kwargs): if self.embedding is None: diff --git a/python/samples/concepts/memory/data_models.py b/python/samples/concepts/memory/data_models.py index 998894f90281..4aaf8dfc74c4 100644 --- a/python/samples/concepts/memory/data_models.py +++ b/python/samples/concepts/memory/data_models.py @@ -7,13 +7,7 @@ from pandas import DataFrame from pydantic import BaseModel, Field -from semantic_kernel.data import ( - VectorStoreCollectionDefinition, - VectorStoreDataField, - VectorStoreKeyField, - VectorStoreVectorField, - vectorstoremodel, -) +from semantic_kernel.data import VectorStoreCollectionDefinition, VectorStoreField, vectorstoremodel # This concept shows the different ways you can create a vector store data model # using dataclasses, Pydantic, and Python classes. @@ -21,8 +15,7 @@ # There are a number of universal things about these data models: # they must specify the type of field through the annotation (or the definition). -# there must be at least one field of type VectorStoreRecordKeyField. -# If you set the embedding_property_name in the VectorStoreRecordDataField, that field must exist and be a vector field. 
+# there must be at least one field of type `key`. # A unannotated field is allowed but must have a default value. # The purpose of these models is to be what you pass to and get back from a vector store. @@ -32,7 +25,7 @@ # so defining the key with a int, might make some stores unusable. # The decorator takes the class and pulls out the fields and annotations to create a definition, -# of type VectorStoreRecordDefinition. +# of type VectorStoreCollectionDefinition. # This definition is used for the vector store to know how to handle the data model. # You can also create the definition yourself, and pass it to the vector stores together with a standard type, @@ -44,18 +37,18 @@ @vectorstoremodel @dataclass class DataModelDataclass: - vector: Annotated[list[float], VectorStoreVectorField] - key: Annotated[str, VectorStoreKeyField()] = field(default_factory=lambda: str(uuid4())) - content: Annotated[str, VectorStoreDataField(has_embedding=True, embedding_property_name="vector")] = "content1" + vector: Annotated[list[float] | None, VectorStoreField("vector", dimensions=3)] = None + key: Annotated[str, VectorStoreField("key")] = field(default_factory=lambda: str(uuid4())) + content: Annotated[str, VectorStoreField("data")] = "content1" other: str | None = None # Data model using Pydantic BaseModels @vectorstoremodel class DataModelPydantic(BaseModel): - id: Annotated[str, VectorStoreKeyField()] = Field(default_factory=lambda: str(uuid4())) - content: Annotated[str, VectorStoreDataField(has_embedding=True, embedding_property_name="vector")] = "content1" - vector: Annotated[list[float], VectorStoreVectorField] + id: Annotated[str, VectorStoreField("key")] = Field(default_factory=lambda: str(uuid4())) + content: Annotated[str, VectorStoreField("data")] = "content1" + vector: Annotated[list[float] | None, VectorStoreField("vector", dimensions=3)] = None other: str | None = None @@ -65,11 +58,9 @@ class DataModelPydantic(BaseModel): class DataModelPython: def __init__( 
self, - vector: Annotated[list[float], VectorStoreVectorField], - key: Annotated[str, VectorStoreKeyField] = None, - content: Annotated[ - str, VectorStoreDataField(has_embedding=True, embedding_property_name="vector") - ] = "content1", + key: Annotated[str | None, VectorStoreField("key")] = None, + vector: Annotated[list[float], VectorStoreField("vector", dimensions=3)] = None, + content: Annotated[str, VectorStoreField("data")] = "content1", other: str | None = None, ): self.vector = vector @@ -88,7 +79,7 @@ def serialize(self) -> dict[str, Any]: } @classmethod - def deserialize(cls, obj: dict[str, Any]) -> "DataModelDataclass": + def deserialize(cls, obj: dict[str, Any]) -> "DataModelPython": return cls( vector=obj["vector"], key=obj["key"], @@ -102,11 +93,11 @@ def deserialize(cls, obj: dict[str, Any]) -> "DataModelDataclass": # There is also a to_dict and from_dict method, which are used to convert the data model to and from a dict, # these should be specific to the type used, if using dict as type then these can be left off. 
definition_pandas = VectorStoreCollectionDefinition( - fields={ - "vector": VectorStoreVectorField(type_="list[float]"), - "key": VectorStoreKeyField(type_="str"), - "content": VectorStoreDataField(type_="str", has_embedding=True, embedding_property_name="vector"), - }, + fields=[ + VectorStoreField("vector", name="vector", type="float", dimensions=3), + VectorStoreField("key", name="key", type="str"), + VectorStoreField("data", name="content", type="str"), + ], container_mode=True, to_dict=lambda record, **_: record.to_dict(orient="records"), from_dict=lambda records, **_: DataFrame(records), diff --git a/python/samples/concepts/memory/memory_with_pandas.py b/python/samples/concepts/memory/memory_with_pandas.py index cae30e077307..956b643cd23c 100644 --- a/python/samples/concepts/memory/memory_with_pandas.py +++ b/python/samples/concepts/memory/memory_with_pandas.py @@ -5,62 +5,67 @@ import pandas as pd -from semantic_kernel import Kernel -from semantic_kernel.connectors.ai.open_ai import OpenAIEmbeddingPromptExecutionSettings, OpenAITextEmbedding +from semantic_kernel.connectors.ai.open_ai import OpenAITextEmbedding from semantic_kernel.connectors.memory.azure_ai_search import AzureAISearchCollection -from semantic_kernel.data import ( - VectorStoreCollectionDefinition, - VectorStoreDataField, - VectorStoreKeyField, - VectorStoreVectorField, -) -from semantic_kernel.data.vectors import add_vector_to_records +from semantic_kernel.data import VectorStoreCollectionDefinition, VectorStoreField -model_fields = VectorStoreCollectionDefinition( - container_mode=True, - fields={ - "content": VectorStoreDataField(has_embedding=True, embedding_property_name="vector"), - "id": VectorStoreKeyField(), - "vector": VectorStoreVectorField( - embedding_settings={"embedding": OpenAIEmbeddingPromptExecutionSettings(dimensions=1536)} +definition = VectorStoreCollectionDefinition( + collection_name="pandas_test_index", + fields=[ + VectorStoreField("key", name="id", type="str"), + 
VectorStoreField("data", name="title", type="str"), + VectorStoreField("data", name="content", type="str", is_full_text_indexed=True), + VectorStoreField( + "vector", + name="vector", + type="float", + dimensions=1536, + embedding_generator=OpenAITextEmbedding(ai_model_id="text-embedding-3-small"), ), - }, + ], to_dict=lambda record, **_: record.to_dict(orient="records"), from_dict=lambda records, **_: pd.DataFrame(records), + container_mode=True, ) async def main(): - # setup the kernel - kernel = Kernel() - kernel.add_service(OpenAITextEmbedding(service_id="embedding", ai_model_id="text-embedding-3-small")) - # create the record collection - async with AzureAISearchCollection[pd.DataFrame]( + async with AzureAISearchCollection[str, pd.DataFrame]( record_type=pd.DataFrame, - definition=model_fields, - ) as record_collection: + definition=definition, + ) as collection: + await collection.ensure_collection_exists() # create some records records = [ - {"id": str(uuid4()), "content": "my dict text", "vector": None}, - {"id": str(uuid4()), "content": "my second text", "vector": None}, + { + "id": str(uuid4()), + "title": "Document about Semantic Kernel.", + "content": "Semantic Kernel is a framework for building AI applications.", + }, + { + "id": str(uuid4()), + "title": "Document about Python", + "content": "Python is a programming language that lets you work quickly.", + }, ] - # create the dataframe and add the embeddings + # create the dataframe and add the content you want to embed to a new column df = pd.DataFrame(records) - df = await add_vector_to_records(kernel, df, None, definition=model_fields) - print("Records with embeddings:") - print(df.shape) - print(df.head(5)) - + df["vector"] = df.apply(lambda row: f"title: {row['title']}, content: {row['content']}", axis=1) + print(df.head(1)) # upsert the records (for a container, upsert and upsert_batch are equivalent) - await record_collection.upsert_batch(df) + await collection.upsert(df) # retrieve a record - 
result = await record_collection.get(records[0]["id"]) - print("Retrieved records:") - print(result.shape) - print(result.head(5)) + result = await collection.get(top=2) + if result is None: + print("No records found, this is sometimes because the get is too fast and the index is not ready yet.") + else: + print("Retrieved records:") + print(result.to_string()) + + await collection.ensure_collection_deleted() if __name__ == "__main__": diff --git a/python/samples/concepts/memory/simple_memory.py b/python/samples/concepts/memory/simple_memory.py index 6ee7b7fe355f..85ee6d7cb2a6 100644 --- a/python/samples/concepts/memory/simple_memory.py +++ b/python/samples/concepts/memory/simple_memory.py @@ -10,13 +10,7 @@ from samples.concepts.resources.utils import Colors, print_with_color from semantic_kernel.connectors.ai.open_ai import OpenAITextEmbedding from semantic_kernel.connectors.memory import InMemoryCollection -from semantic_kernel.data import ( - VectorSearchOptions, - VectorStoreRecordDataField, - VectorStoreRecordKeyField, - VectorStoreRecordVectorField, - vectorstoremodel, -) +from semantic_kernel.data import VectorStoreField, vectorstoremodel # This is the most basic example of a vector store and collection # For a more complex example, using different collection types, see "complex_memory.py" @@ -36,14 +30,14 @@ @vectorstoremodel(collection_name="test") @dataclass class DataModel: - content: Annotated[str, VectorStoreRecordDataField()] - id: Annotated[str, VectorStoreRecordKeyField()] = field(default_factory=lambda: str(uuid4())) + content: Annotated[str, VectorStoreField("data")] + id: Annotated[str, VectorStoreField("key")] = field(default_factory=lambda: str(uuid4())) vector: Annotated[ - str | list[float] | None, - VectorStoreRecordVectorField(dimensions=1536, property_type="float"), + list[float] | str | None, + VectorStoreField("vector", dimensions=1536), ] = None - title: Annotated[str, VectorStoreRecordDataField(property_type="str", 
is_full_text_indexed=True)] = "title" - tag: Annotated[str, VectorStoreRecordDataField(property_type="str", is_indexed=True)] = "tag" + title: Annotated[str, VectorStoreField("data", is_full_text_indexed=True)] = "title" + tag: Annotated[str, VectorStoreField("data", is_indexed=True)] = "tag" def __post_init__(self): if self.vector is None: @@ -80,13 +74,12 @@ async def main(): # for the in memory collection, this is just a no-op # but for other collections, like Azure AI Search, this will open and close the connection async with InMemoryCollection[str, DataModel]( - data_model_type=DataModel, + record_type=DataModel, embedding_generator=embedder, ) as record_collection: # Create the collection after wiping it print_with_color("Creating test collection!", Colors.CGREY) - await record_collection.ensure_collection_deleted() - await record_collection.create_collection_if_not_exists() + await record_collection.delete_create_collection() # First add vectors to the records print_with_color("Adding records!", Colors.CBLUE) @@ -108,10 +101,10 @@ async def main(): # The most important option is the vector_field_name, which is the name of the field that contains the vector # The other options are optional, but can be useful # The filter option is used to filter the results based on the tag field - options = VectorSearchOptions( - vector_property_name="vector", - filter=lambda x: x.tag == "general", - ) + options = { + "vector_property_name": "vector", + "filter": lambda x: x.tag == "general", + } query = "python" print_with_color(f"Searching for '{query}', with filter 'tag == general'", Colors.CBLUE) print_with_color( @@ -120,7 +113,7 @@ async def main(): ) search_results = await record_collection.search( values=query, - options=options, + **options, ) if search_results.total_count == 0: print("\nNothing found...\n") diff --git a/python/samples/concepts/rag/rag_with_vector_collection.py b/python/samples/concepts/rag/rag_with_vector_collection.py index 
f5c9dc3e3545..9c95d67789bf 100644 --- a/python/samples/concepts/rag/rag_with_vector_collection.py +++ b/python/samples/concepts/rag/rag_with_vector_collection.py @@ -11,7 +11,7 @@ OpenAITextEmbedding, ) from semantic_kernel.connectors.memory import InMemoryCollection -from semantic_kernel.data import VectorStoreDataField, VectorStoreKeyField, VectorStoreVectorField, vectorstoremodel +from semantic_kernel.data import VectorStoreField, vectorstoremodel from semantic_kernel.functions import KernelArguments """ @@ -27,11 +27,11 @@ @vectorstoremodel(collection_name="budget") @dataclass class BudgetItem: - id: Annotated[str, VectorStoreKeyField] - text: Annotated[str, VectorStoreDataField] + id: Annotated[str, VectorStoreField("key")] + text: Annotated[str, VectorStoreField("data")] embedding: Annotated[ list[float] | str | None, - VectorStoreVectorField(dimensions=1536, embedding_generator=OpenAITextEmbedding()), + VectorStoreField("vector", dimensions=1536, embedding_generator=OpenAITextEmbedding()), ] = None def __post_init__(self): diff --git a/python/samples/concepts/rag/self_critique_rag.py b/python/samples/concepts/rag/self_critique_rag.py index e95f0e54d288..7e131ab79747 100644 --- a/python/samples/concepts/rag/self_critique_rag.py +++ b/python/samples/concepts/rag/self_critique_rag.py @@ -9,7 +9,7 @@ from semantic_kernel.connectors.ai.open_ai import OpenAIChatCompletion, OpenAITextEmbedding from semantic_kernel.connectors.memory import AzureAISearchCollection from semantic_kernel.contents import ChatHistory -from semantic_kernel.data import VectorStoreDataField, VectorStoreKeyField, VectorStoreVectorField, vectorstoremodel +from semantic_kernel.data import VectorStoreField, vectorstoremodel from semantic_kernel.functions.kernel_function import KernelFunction """ @@ -26,11 +26,11 @@ @vectorstoremodel(collection_name="generic") @dataclass class InfoItem: - key: Annotated[str, VectorStoreKeyField] - text: Annotated[str, VectorStoreDataField] + key: Annotated[str, 
VectorStoreField("key")] + text: Annotated[str, VectorStoreField("data")] embedding: Annotated[ list[float] | str | None, - VectorStoreVectorField(dimensions=1536, embedding_generator=OpenAITextEmbedding()), + VectorStoreField("vector", dimensions=1536, embedding_generator=OpenAITextEmbedding()), ] = None def __post_init__(self): diff --git a/python/samples/getting_started/third_party/postgres-memory.ipynb b/python/samples/getting_started/third_party/postgres-memory.ipynb index bbf8ce694cde..0db378257585 100644 --- a/python/samples/getting_started/third_party/postgres-memory.ipynb +++ b/python/samples/getting_started/third_party/postgres-memory.ipynb @@ -40,9 +40,7 @@ "from semantic_kernel.data import (\n", " DistanceFunction,\n", " IndexKind,\n", - " VectorStoreDataField,\n", - " VectorStoreKeyField,\n", - " VectorStoreVectorField,\n", + " VectorStoreField,\n", " vectorstoremodel,\n", ")\n", "from semantic_kernel.functions import KernelParameterMetadata\n", @@ -152,15 +150,16 @@ "@vectorstoremodel\n", "@dataclass\n", "class ArxivPaper:\n", - " id: Annotated[str, VectorStoreKeyField()]\n", - " title: Annotated[str, VectorStoreDataField()]\n", - " abstract: Annotated[str, VectorStoreDataField()]\n", - " published: Annotated[datetime, VectorStoreDataField()]\n", - " authors: Annotated[list[str], VectorStoreDataField()]\n", - " link: Annotated[str | None, VectorStoreDataField()]\n", + " id: Annotated[str, VectorStoreField(\"key\")]\n", + " title: Annotated[str, VectorStoreField(\"data\")]\n", + " abstract: Annotated[str, VectorStoreField(\"data\")]\n", + " published: Annotated[datetime, VectorStoreField(\"data\")]\n", + " authors: Annotated[list[str], VectorStoreField(\"data\")]\n", + " link: Annotated[str | None, VectorStoreField(\"data\")]\n", " abstract_vector: Annotated[\n", " list[float] | str | None,\n", - " VectorStoreVectorField(\n", + " VectorStoreField(\n", + " \"vector\",\n", " index_kind=IndexKind.HNSW,\n", " dimensions=1536,\n", " 
distance_function=DistanceFunction.COSINE_DISTANCE,\n", diff --git a/python/semantic_kernel/connectors/memory/azure_ai_search.py b/python/semantic_kernel/connectors/memory/azure_ai_search.py index dcf4b2b5c989..ea25c749069d 100644 --- a/python/semantic_kernel/connectors/memory/azure_ai_search.py +++ b/python/semantic_kernel/connectors/memory/azure_ai_search.py @@ -30,12 +30,7 @@ from semantic_kernel.connectors.ai.embedding_generator_base import EmbeddingGeneratorBase from semantic_kernel.data.const import DistanceFunction, IndexKind -from semantic_kernel.data.definitions import ( - VectorStoreCollectionDefinition, - VectorStoreDataField, - VectorStoreKeyField, - VectorStoreVectorField, -) +from semantic_kernel.data.definitions import VectorStoreCollectionDefinition from semantic_kernel.data.search import KernelSearchResults from semantic_kernel.data.vectors import ( GetFilteredRecordOptions, @@ -206,7 +201,7 @@ def _definition_to_azure_ai_search_index( search_algos = [] for field in definition.fields: - if isinstance(field, VectorStoreDataField): + if field.field_type == "data": if not field.type_: logger.debug(f"Field {field.name} has not specified type, defaulting to Edm.String.") if field.type_ and field.type_ not in TYPE_MAP_DATA: @@ -232,7 +227,7 @@ def _definition_to_azure_ai_search_index( hidden=False, ) ) - elif isinstance(field, VectorStoreKeyField): + elif field.field_type == "key": fields.append( SimpleField( name=field.storage_name or field.name, @@ -242,7 +237,7 @@ def _definition_to_azure_ai_search_index( searchable=True, ) ) - elif isinstance(field, VectorStoreVectorField): + elif field.field_type == "vector": if not field.type_: logger.debug(f"Field {field.name} has not specified type, defaulting to Collection(Edm.Single).") if field.index_kind not in INDEX_ALGORITHM_MAP: @@ -563,7 +558,7 @@ async def _inner_search( else [ field.name for field in self.definition.fields - if isinstance(field, VectorStoreDataField) and field.is_full_text_indexed + if 
field.field_type == "data" and field.is_full_text_indexed ] ) if not search_args["search_fields"]: diff --git a/python/semantic_kernel/connectors/memory/azure_cosmos_db.py b/python/semantic_kernel/connectors/memory/azure_cosmos_db.py index 4649c20686af..32f46a6a1f8c 100644 --- a/python/semantic_kernel/connectors/memory/azure_cosmos_db.py +++ b/python/semantic_kernel/connectors/memory/azure_cosmos_db.py @@ -24,11 +24,7 @@ MongoDBAtlasStore, ) from semantic_kernel.data.const import DistanceFunction, IndexKind -from semantic_kernel.data.definitions import ( - VectorStoreCollectionDefinition, - VectorStoreDataField, - VectorStoreVectorField, -) +from semantic_kernel.data.definitions import VectorStoreCollectionDefinition from semantic_kernel.data.search import KernelSearchResults from semantic_kernel.data.vectors import ( GetFilteredRecordOptions, @@ -141,10 +137,10 @@ def _create_default_indexing_policy_nosql(definition: VectorStoreCollectionDefin } for field in definition.fields: - if isinstance(field, VectorStoreDataField) and (not field.is_full_text_indexed and not field.is_indexed): + if field.field_type == "data" and (not field.is_full_text_indexed and not field.is_indexed): indexing_policy["excludedPaths"].append({"path": f'/"{field.storage_name or field.name}"/*'}) - if isinstance(field, VectorStoreVectorField): + if field.field_type == "vector": if field.index_kind not in INDEX_KIND_MAP_NOSQL: raise VectorStoreModelException( f"Index kind '{field.index_kind}' is not supported by Azure Cosmos DB NoSQL container." 
@@ -177,7 +173,7 @@ def _create_default_vector_embedding_policy(definition: VectorStoreCollectionDef vector_embedding_policy: dict[str, Any] = {"vectorEmbeddings": []} for field in definition.fields: - if isinstance(field, VectorStoreVectorField): + if field.field_type == "vector": if field.distance_function not in DISTANCE_FUNCTION_MAP_NOSQL: raise VectorStoreModelException( f"Distance function '{field.distance_function}' is not supported by Azure Cosmos DB NoSQL." diff --git a/python/semantic_kernel/connectors/memory/faiss.py b/python/semantic_kernel/connectors/memory/faiss.py index f97fa0f194cc..d9aaa7038efa 100644 --- a/python/semantic_kernel/connectors/memory/faiss.py +++ b/python/semantic_kernel/connectors/memory/faiss.py @@ -11,7 +11,7 @@ from semantic_kernel.connectors.ai.embedding_generator_base import EmbeddingGeneratorBase from semantic_kernel.connectors.memory.in_memory import IN_MEMORY_SCORE_KEY, InMemoryCollection, InMemoryStore, TKey from semantic_kernel.data.const import DistanceFunction, IndexKind -from semantic_kernel.data.definitions import VectorStoreCollectionDefinition, VectorStoreVectorField +from semantic_kernel.data.definitions import VectorStoreCollectionDefinition, VectorStoreField from semantic_kernel.data.search import KernelSearchResults from semantic_kernel.data.vectors import SearchType, TModel, VectorSearchOptions, VectorSearchResult from semantic_kernel.exceptions import VectorStoreInitializationException, VectorStoreOperationException @@ -35,7 +35,7 @@ } -def _create_index(field: VectorStoreVectorField) -> faiss.Index: +def _create_index(field: VectorStoreField) -> faiss.Index: """Create a Faiss index.""" if field.index_kind not in INDEX_KIND_MAP: raise VectorStoreInitializationException(f"Index kind {field.index_kind} is not supported.") diff --git a/python/semantic_kernel/connectors/memory/in_memory.py b/python/semantic_kernel/connectors/memory/in_memory.py index 3040beee9367..636f0526ed53 100644 --- 
a/python/semantic_kernel/connectors/memory/in_memory.py +++ b/python/semantic_kernel/connectors/memory/in_memory.py @@ -179,7 +179,7 @@ async def _inner_search( f"Distance function '{field.distance_function}' is not supported. " f"Supported functions are: {list(DISTANCE_FUNCTION_MAP.keys())}" ) - distance_func = DISTANCE_FUNCTION_MAP[field.distance_function] + distance_func = DISTANCE_FUNCTION_MAP[field.distance_function] # type: ignore[assignment] for key, record in self._get_filtered_records(options).items(): if vector and field is not None: @@ -192,7 +192,7 @@ async def _inner_search( if field.distance_function == DistanceFunction.DEFAULT: reverse_func = DISTANCE_FUNCTION_DIRECTION_HELPER[DistanceFunction.COSINE_DISTANCE] else: - reverse_func = DISTANCE_FUNCTION_DIRECTION_HELPER[field.distance_function] + reverse_func = DISTANCE_FUNCTION_DIRECTION_HELPER[field.distance_function] # type: ignore[assignment] sorted_records = dict( sorted( return_records.items(), diff --git a/python/semantic_kernel/connectors/memory/mongodb.py b/python/semantic_kernel/connectors/memory/mongodb.py index 0f4bf81de124..102e96008282 100644 --- a/python/semantic_kernel/connectors/memory/mongodb.py +++ b/python/semantic_kernel/connectors/memory/mongodb.py @@ -16,7 +16,7 @@ from semantic_kernel.connectors.ai.embedding_generator_base import EmbeddingGeneratorBase from semantic_kernel.data.const import DistanceFunction -from semantic_kernel.data.definitions import VectorStoreCollectionDefinition, VectorStoreVectorField +from semantic_kernel.data.definitions import VectorStoreCollectionDefinition, VectorStoreField from semantic_kernel.data.search import KernelSearchResults from semantic_kernel.data.vectors import ( GetFilteredRecordOptions, @@ -87,7 +87,7 @@ class MongoDBAtlasSettings(KernelBaseSettings): index_name: str = DEFAULT_SEARCH_INDEX_NAME -def _create_vector_field(field: VectorStoreVectorField) -> dict: +def _create_vector_field(field: VectorStoreField) -> dict: """Create a vector 
field. Args: diff --git a/python/semantic_kernel/connectors/memory/pinecone.py b/python/semantic_kernel/connectors/memory/pinecone.py index 1779ed723b48..d0387b65bf02 100644 --- a/python/semantic_kernel/connectors/memory/pinecone.py +++ b/python/semantic_kernel/connectors/memory/pinecone.py @@ -14,7 +14,7 @@ from semantic_kernel.connectors.ai.embedding_generator_base import EmbeddingGeneratorBase from semantic_kernel.data.const import DistanceFunction -from semantic_kernel.data.definitions import VectorStoreCollectionDefinition, VectorStoreVectorField +from semantic_kernel.data.definitions import VectorStoreCollectionDefinition, VectorStoreField from semantic_kernel.data.search import KernelSearchResults from semantic_kernel.data.vectors import ( GetFilteredRecordOptions, @@ -208,7 +208,7 @@ async def create_collection(self, **kwargs: Any) -> None: ) async def _create_index_with_integrated_embeddings( - self, vector_field: VectorStoreVectorField | None, **kwargs: Any + self, vector_field: VectorStoreField | None, **kwargs: Any ) -> None: """Create the Pinecone index with the embed parameter.""" if isinstance(self.client, PineconeGRPC): @@ -243,7 +243,7 @@ async def _create_index_with_integrated_embeddings( self.index = await self.client.create_index_for_model(**index_creation_args) await self._load_index_client() - async def _create_regular_index(self, vector_field: VectorStoreVectorField | None, **kwargs: Any) -> None: + async def _create_regular_index(self, vector_field: VectorStoreField | None, **kwargs: Any) -> None: """Create the Pinecone index with the embed parameter.""" if not vector_field: raise VectorStoreOperationException( diff --git a/python/semantic_kernel/connectors/memory/postgres.py b/python/semantic_kernel/connectors/memory/postgres.py index 5c1d68367a0d..4e1bf55ce356 100644 --- a/python/semantic_kernel/connectors/memory/postgres.py +++ b/python/semantic_kernel/connectors/memory/postgres.py @@ -18,13 +18,7 @@ from 
semantic_kernel.connectors.ai.embedding_generator_base import EmbeddingGeneratorBase from semantic_kernel.data.const import DistanceFunction, IndexKind -from semantic_kernel.data.definitions import ( - VectorStoreCollectionDefinition, - VectorStoreDataField, - VectorStoreFieldBase, - VectorStoreKeyField, - VectorStoreVectorField, -) +from semantic_kernel.data.definitions import VectorStoreCollectionDefinition, VectorStoreField from semantic_kernel.data.search import KernelSearchResults from semantic_kernel.data.vectors import ( GetFilteredRecordOptions, @@ -142,9 +136,7 @@ def _python_type_to_postgres(python_type_str: str) -> str | None: return None -def _convert_row_to_dict( - row: tuple[Any, ...], fields: Sequence[tuple[str, VectorStoreFieldBase | None]] -) -> dict[str, Any]: +def _convert_row_to_dict(row: tuple[Any, ...], fields: Sequence[tuple[str, VectorStoreField | None]]) -> dict[str, Any]: """Convert a row from a PostgreSQL query to a dictionary. Uses the field information to map the row values to the corresponding field names. @@ -157,10 +149,10 @@ def _convert_row_to_dict( A dictionary representation of the row. """ - def _convert(v: Any | None, field: VectorStoreFieldBase | None) -> Any | None: + def _convert(v: Any | None, field: VectorStoreField | None) -> Any | None: if v is None: return None - if isinstance(field, VectorStoreVectorField) and isinstance(v, str): + if field and field.field_type == "vector" and isinstance(v, str): # psycopg returns vector as a string if pgvector is not loaded. # If pgvector is registered with the connection, no conversion is required. return json.loads(v) @@ -171,7 +163,7 @@ def _convert(v: Any | None, field: VectorStoreFieldBase | None) -> Any | None: def _convert_dict_to_row( record: dict[str, Any], - fields: list[VectorStoreKeyField | VectorStoreVectorField | VectorStoreDataField], + fields: list[VectorStoreField], ) -> tuple[Any, ...]: """Convert a dictionary to a row for a PostgreSQL query. 
@@ -587,14 +579,14 @@ async def create_collection(self, **kwargs: Any) -> None: # For Vector fields with dimensions, use pgvector's VECTOR type # Note that other vector types are supported in pgvector (e.g. halfvec), # but would need to be created outside of this method. - if isinstance(field, VectorStoreVectorField): + if field.field_type == "vector": column_definitions.append( sql.SQL("{name} VECTOR({dimensions})").format( name=sql.Identifier(field.storage_name or field.name), dimensions=sql.Literal(field.dimensions), ) ) - elif isinstance(field, VectorStoreKeyField): + elif field.field_type == "key": # Use the property_type directly for key fields column_definitions.append( sql.SQL("{name} {col_type} PRIMARY KEY").format( @@ -661,7 +653,7 @@ async def ensure_collection_deleted(self, **kwargs: Any) -> None: ) await conn.commit() - async def _create_index(self, table_name: str, vector_field: VectorStoreVectorField) -> None: + async def _create_index(self, table_name: str, vector_field: VectorStoreField) -> None: """Create an index on a column in the table. Args: @@ -762,7 +754,7 @@ def _construct_vector_query( vector: Sequence[float | int], options: VectorSearchOptions, **kwargs: Any, - ) -> tuple[sql.Composed, list[Any], list[tuple[str, VectorStoreFieldBase | None]]]: + ) -> tuple[sql.Composed, list[Any], list[tuple[str, VectorStoreField | None]]]: """Construct a vector search query. 
Args: diff --git a/python/semantic_kernel/connectors/memory/redis.py b/python/semantic_kernel/connectors/memory/redis.py index 678117669334..e17d6d7a01b7 100644 --- a/python/semantic_kernel/connectors/memory/redis.py +++ b/python/semantic_kernel/connectors/memory/redis.py @@ -25,12 +25,7 @@ from semantic_kernel.connectors.ai.embedding_generator_base import EmbeddingGeneratorBase from semantic_kernel.data.const import DistanceFunction, IndexKind -from semantic_kernel.data.definitions import ( - VectorStoreCollectionDefinition, - VectorStoreDataField, - VectorStoreKeyField, - VectorStoreVectorField, -) +from semantic_kernel.data.definitions import VectorStoreCollectionDefinition, VectorStoreField from semantic_kernel.data.search import KernelSearchResults from semantic_kernel.data.vectors import ( GetFilteredRecordOptions, @@ -98,8 +93,8 @@ class RedisCollectionTypes(str, Enum): } -def _field_to_redis_field_hashset(name: str, field: VectorStoreVectorField | VectorStoreDataField) -> RedisField: - if isinstance(field, VectorStoreVectorField): +def _field_to_redis_field_hashset(name: str, field: VectorStoreField) -> RedisField: + if field.field_type == "vector": if field.distance_function not in DISTANCE_FUNCTION_MAP: raise VectorStoreOperationException( f"Distance function {field.distance_function} is not supported. " @@ -125,8 +120,8 @@ def _field_to_redis_field_hashset(name: str, field: VectorStoreVectorField | Vec return TagField(name=name) -def _field_to_redis_field_json(name: str, field: VectorStoreVectorField | VectorStoreDataField) -> RedisField: - if isinstance(field, VectorStoreVectorField): +def _field_to_redis_field_json(name: str, field: VectorStoreField) -> RedisField: + if field.field_type == "vector": if field.distance_function not in DISTANCE_FUNCTION_MAP: raise VectorStoreOperationException( f"Distance function {field.distance_function} is not supported. 
" @@ -159,7 +154,7 @@ def _definition_to_redis_fields( """Create a list of fields for Redis from a definition.""" fields: list[RedisField] = [] for field in definition.fields: - if isinstance(field, VectorStoreKeyField): + if field.field_type == "key": continue if collection_type == RedisCollectionTypes.HASHSET: fields.append(_field_to_redis_field_hashset(field.storage_name or field.name, field)) # type: ignore @@ -376,13 +371,13 @@ def get_field_expr(field_name): ) if field is None: raise VectorStoreOperationException(f"Field '{field_name}' not found in data model.") - if isinstance(field, VectorStoreDataField): + if field.field_type == "data": if field.is_full_text_indexed: return lambda: Text(field_name) if field.type_ in ("int", "float"): return lambda: Num(field_name) return lambda: Tag(field_name) - if isinstance(field, VectorStoreVectorField): + if field.field_type == "vector": raise VectorStoreOperationException(f"Cannot filter on vector field '{field_name}'.") return lambda: Tag(field_name) @@ -593,11 +588,11 @@ def _serialize_dicts_to_store_models( for record in records: result: dict[str, Any] = {"mapping": {}} for field in self.definition.fields: - if isinstance(field, VectorStoreVectorField): + if field.field_type == "vector": dtype = DATATYPE_MAP_VECTOR[field.type_ or "default"].lower() result["mapping"][field.storage_name or field.name] = array_to_buffer(record[field.name], dtype) continue - if isinstance(field, VectorStoreKeyField): + if field.field_type == "key": result["name"] = self._get_redis_key(record[field.name]) continue result["mapping"][field.storage_name or field.name] = record[field.name] @@ -614,10 +609,10 @@ def _deserialize_store_models_to_dicts( for record in records: rec = record.copy() for field in self.definition.fields: - match field: - case VectorStoreKeyField(): + match field.field_type: + case "key": rec[field.name] = self._unget_redis_key(rec[field.name]) - case VectorStoreVectorField(): + case "vector": dtype = 
DATATYPE_MAP_VECTOR[field.type_ or "default"] rec[field.name] = buffer_to_array(rec[field.name], dtype) results.append(rec) @@ -631,8 +626,8 @@ def _add_return_fields(self, query: TQuery, include_vectors: bool) -> TQuery: """ for field in self.definition.fields: - match field: - case VectorStoreVectorField(): + match field.field_type: + case "vector": if include_vectors: query.return_field(field.name, decode_field=False) case _: @@ -721,10 +716,10 @@ def _serialize_dicts_to_store_models( for record in records: result: dict[str, Any] = {"value": {}} for field in self.definition.fields: - if isinstance(field, VectorStoreKeyField): + if field.field_type == "key": result["name"] = self._get_redis_key(record[field.name]) continue - if isinstance(field, VectorStoreVectorField): + if field.field_type == "vector": result["value"][field.storage_name or field.name] = record[field.name] result["value"][field.storage_name or field.name] = record[field.name] results.append(result) @@ -747,8 +742,8 @@ def _deserialize_store_models_to_dicts( def _add_return_fields(self, query: TQuery, include_vectors: bool) -> TQuery: """Add the return fields to the query.""" for field in self.definition.fields: - match field: - case VectorStoreVectorField(): + match field.field_type: + case "vector": if include_vectors: query.return_field(field.name) case _: diff --git a/python/semantic_kernel/connectors/memory/sql_server.py b/python/semantic_kernel/connectors/memory/sql_server.py index 0231466d4c4c..79de3cd3d6a4 100644 --- a/python/semantic_kernel/connectors/memory/sql_server.py +++ b/python/semantic_kernel/connectors/memory/sql_server.py @@ -18,12 +18,7 @@ from semantic_kernel.connectors.ai.embedding_generator_base import EmbeddingGeneratorBase from semantic_kernel.data.const import DISTANCE_FUNCTION_DIRECTION_HELPER, DistanceFunction, IndexKind -from semantic_kernel.data.definitions import ( - VectorStoreCollectionDefinition, - VectorStoreDataField, - VectorStoreKeyField, - 
VectorStoreVectorField, -) +from semantic_kernel.data.definitions import VectorStoreCollectionDefinition, VectorStoreField from semantic_kernel.data.search import KernelSearchResults from semantic_kernel.data.vectors import ( GetFilteredRecordOptions, @@ -34,10 +29,10 @@ VectorStore, VectorStoreRecordCollection, ) -from semantic_kernel.exceptions import VectorStoreOperationException -from semantic_kernel.exceptions.vector_store_exceptions import ( +from semantic_kernel.exceptions import ( VectorSearchExecutionException, VectorStoreInitializationException, + VectorStoreOperationException, ) from semantic_kernel.kernel_pydantic import KernelBaseSettings from semantic_kernel.kernel_types import OneOrMany @@ -845,9 +840,9 @@ def _add_cast_check(placeholder: str, value: Any) -> str: def _build_create_table_query( schema: str, table: str, - key_field: VectorStoreKeyField, - data_fields: list[VectorStoreDataField], - vector_fields: list[VectorStoreVectorField], + key_field: VectorStoreField, + data_fields: list[VectorStoreField], + vector_fields: list[VectorStoreField], if_not_exists: bool = False, ) -> SqlCommand: """Build the CREATE TABLE query based on the data model.""" @@ -928,9 +923,9 @@ def _build_select_table_name_query( def _add_field_names( command: SqlCommand, - key_field: VectorStoreKeyField, - data_fields: list[VectorStoreDataField], - vector_fields: list[VectorStoreVectorField] | None, + key_field: VectorStoreField, + data_fields: list[VectorStoreField], + vector_fields: list[VectorStoreField] | None, table_identifier: str | None = None, ) -> None: """Add the field names to the query builder. 
@@ -956,9 +951,9 @@ def _add_field_names( def _build_merge_query( schema: str, table: str, - key_field: VectorStoreKeyField, - data_fields: list[VectorStoreDataField], - vector_fields: list[VectorStoreVectorField], + key_field: VectorStoreField, + data_fields: list[VectorStoreField], + vector_fields: list[VectorStoreField], records: Sequence[dict[str, Any]], ) -> SqlCommand: """Build the MERGE TABLE query based on the data model.""" @@ -1019,9 +1014,9 @@ def _build_merge_query( def _build_select_query( schema: str, table: str, - key_field: VectorStoreKeyField, - data_fields: list[VectorStoreDataField], - vector_fields: list[VectorStoreVectorField] | None, + key_field: VectorStoreField, + data_fields: list[VectorStoreField], + vector_fields: list[VectorStoreField] | None, keys: Sequence[Any], ) -> SqlCommand: """Build the SELECT query based on the data model.""" @@ -1046,7 +1041,7 @@ def _build_select_query( def _build_delete_query( schema: str, table: str, - key_field: VectorStoreKeyField, + key_field: VectorStoreField, keys: Sequence[Any], ) -> SqlCommand: """Build the DELETE query based on the data model.""" @@ -1066,9 +1061,9 @@ def _build_delete_query( def _build_search_query( schema: str, table: str, - key_field: VectorStoreKeyField, - data_fields: list[VectorStoreDataField], - vector_fields: list[VectorStoreVectorField], + key_field: VectorStoreField, + data_fields: list[VectorStoreField], + vector_fields: list[VectorStoreField], vector: Sequence[float | int], options: VectorSearchOptions, filter: SqlCommand | list[SqlCommand] | None = None, @@ -1079,10 +1074,14 @@ def _build_search_query( # add the data and vector fields _add_field_names(command, key_field, data_fields, vector_fields if options.include_vectors else None) # add the vector search clause - vector_field: VectorStoreVectorField | None = None + vector_field: VectorStoreField | None = None if options.vector_property_name: vector_field = next( - (field for field in vector_fields if field.name == 
options.vector_property_name), + ( + field + for field in vector_fields + if field.name == options.vector_property_name or field.storage_name == options.vector_property_name + ), None, ) elif len(vector_fields) == 1: diff --git a/python/semantic_kernel/connectors/memory/weaviate.py b/python/semantic_kernel/connectors/memory/weaviate.py index 4a670eda6948..e86bdfa75437 100644 --- a/python/semantic_kernel/connectors/memory/weaviate.py +++ b/python/semantic_kernel/connectors/memory/weaviate.py @@ -21,7 +21,7 @@ from semantic_kernel.connectors.ai.embedding_generator_base import EmbeddingGeneratorBase from semantic_kernel.data.const import DistanceFunction, IndexKind -from semantic_kernel.data.definitions import VectorStoreCollectionDefinition, VectorStoreVectorField +from semantic_kernel.data.definitions import VectorStoreCollectionDefinition, VectorStoreField from semantic_kernel.data.search import KernelSearchResults from semantic_kernel.data.vectors import ( GetFilteredRecordOptions, @@ -86,15 +86,6 @@ } -def _check_field(vector_field: VectorStoreVectorField): - if vector_field.distance_function not in DISTANCE_FUNCTION_MAP: - raise VectorStoreModelValidationError( - f"Distance function {vector_field.distance_function} is not supported by Weaviate." - ) - if vector_field.index_kind not in INDEX_KIND_MAP: - raise VectorStoreModelValidationError(f"Index kind {vector_field.index_kind} is not supported by Weaviate.") - - def _definition_to_weaviate_named_vectors( definition: VectorStoreCollectionDefinition, ) -> list[_NamedVectorConfigCreate]: @@ -109,7 +100,12 @@ def _definition_to_weaviate_named_vectors( vector_list: list[_NamedVectorConfigCreate] = [] for field in definition.vector_fields: - _check_field(field) + if field.distance_function is None or field.distance_function not in DISTANCE_FUNCTION_MAP: + raise VectorStoreModelValidationError( + f"Distance function {field.distance_function} is not supported by Weaviate." 
+ ) + if field.index_kind is None or field.index_kind not in INDEX_KIND_MAP: + raise VectorStoreModelValidationError(f"Index kind {field.index_kind} is not supported by Weaviate.") vector_list.append( Configure.NamedVectors.none( name=field.storage_name or field.name, # type: ignore @@ -478,7 +474,7 @@ async def _inner_vectorized_search( self, collection: CollectionAsync, vector: list[float | int], - vector_field: VectorStoreVectorField | None, + vector_field: VectorStoreField | None, args: dict[str, Any], ) -> Any: if self.named_vectors and not vector_field: @@ -580,7 +576,17 @@ async def create_collection(self, **kwargs) -> None: vectorizer_config = _definition_to_weaviate_named_vectors(self.definition) else: vector_field = self.definition.vector_fields[0] - _check_field(vector_field) + if ( + vector_field.distance_function is None + or vector_field.distance_function not in DISTANCE_FUNCTION_MAP + ): + raise VectorStoreModelValidationError( + f"Distance function {vector_field.distance_function} is not supported by Weaviate." + ) + if vector_field.index_kind is None or vector_field.index_kind not in INDEX_KIND_MAP: + raise VectorStoreModelValidationError( + f"Index kind {vector_field.index_kind} is not supported by Weaviate." 
+ ) vector_index_config = INDEX_KIND_MAP[vector_field.index_kind]( distance_metric=DISTANCE_FUNCTION_MAP[vector_field.distance_function] ) diff --git a/python/semantic_kernel/data/__init__.py b/python/semantic_kernel/data/__init__.py index 2b7c78374632..46a0c476bc6e 100644 --- a/python/semantic_kernel/data/__init__.py +++ b/python/semantic_kernel/data/__init__.py @@ -8,13 +8,7 @@ DistanceFunction, IndexKind, ) -from semantic_kernel.data.definitions import ( - VectorStoreCollectionDefinition, - VectorStoreDataField, - VectorStoreKeyField, - VectorStoreVectorField, - vectorstoremodel, -) +from semantic_kernel.data.definitions import VectorStoreCollectionDefinition, VectorStoreField, vectorstoremodel from semantic_kernel.data.search import ( DynamicFilterFunction, KernelSearchResults, @@ -39,10 +33,8 @@ "VectorSearchResult", "VectorStore", "VectorStoreCollectionDefinition", - "VectorStoreDataField", - "VectorStoreKeyField", + "VectorStoreField", "VectorStoreRecordCollection", - "VectorStoreVectorField", "create_options", "default_dynamic_filter_function", "vectorstoremodel", diff --git a/python/semantic_kernel/data/definitions.py b/python/semantic_kernel/data/definitions.py index 1933a8924140..7ff1102fbb0b 100644 --- a/python/semantic_kernel/data/definitions.py +++ b/python/semantic_kernel/data/definitions.py @@ -2,13 +2,12 @@ import logging from collections.abc import Sequence +from dataclasses import dataclass from inspect import Parameter, _empty, signature from types import MappingProxyType, NoneType -from typing import Annotated, Any, Protocol, TypeVar, runtime_checkable -from warnings import warn +from typing import Annotated, Any, Literal, Protocol, TypeVar, overload, runtime_checkable -from pydantic import ConfigDict, Field, model_validator -from pydantic.dataclasses import dataclass +from pydantic import Field, ValidationError from semantic_kernel.connectors.ai.embedding_generator_base import EmbeddingGeneratorBase from semantic_kernel.data.const import 
DistanceFunction, IndexKind @@ -23,128 +22,146 @@ @release_candidate -@dataclass(kw_only=True, config=ConfigDict(extra="allow")) -class VectorStoreFieldBase: - """Vector store fields. +@dataclass +class VectorStoreField: + """Vector store fields.""" - Args: - name: The name of the field. - type: The type of the field. - storage_name: The name of the field in the store, uses the field name by default. - - """ - - name: str = Field(default="") - type_: str | None = Field(default=None, alias="type") + field_type: Literal["key", "data", "vector"] = "data" + name: str = "" storage_name: str | None = None - - @model_validator(mode="before") - @classmethod - def check_deprecated_fields(cls, data: dict[str, Any]) -> dict[str, Any]: - """Check for deprecated fields. - - Args: - data: The data to check. - - Returns: - The data with the deprecated fields removed. - """ - if isinstance(data, dict): - if "is_filterable" in data: - warn( - "The is_filterable field is deprecated. Please use the is_indexed field instead.", - DeprecationWarning, - ) - data["is_indexed"] = data.pop("is_filterable") - if "is_full_text_searchable" in data: - warn( - "The is_full_text_searchable field is deprecated. " - "Please use the is_full_text_indexed field instead.", - DeprecationWarning, - ) - data["is_full_text_indexed"] = data.pop("is_full_text_searchable") - if "local_embedding" in data: - warn( - "The local_embedding field is deprecated. " - "Please use the has_embedding and has_local_embedding field on the data field instead.", - DeprecationWarning, - ) - data.pop("local_embedding") - return data - - -@release_candidate -@dataclass(kw_only=True) -class VectorStoreKeyField(VectorStoreFieldBase): - """Memory record key field. - - When the key will be auto-generated by the store, make sure it has a default, usually None. - - Args: - name: The name of the field. - storage_name: The name of the field in the store, uses the field name by default. - type: The type of the field. 
- """ - - -@release_candidate -@dataclass(kw_only=True) -class VectorStoreDataField(VectorStoreFieldBase): - """Memory record data field. - - Args: - name: The name of the field. - storage_name: The name of the field in the store, uses the field name by default. - type: The type of the field. - is_indexed: Whether the field is indexed. - is_full_text_indexed: Whether the field is full text indexed. - """ - + type_: str | None = None + # data specific fields (all optional) is_indexed: bool | None = None is_full_text_indexed: bool | None = None + # vector specific fields (dimensions is mandatory) + dimensions: int | None = None + embedding_generator: EmbeddingGeneratorBase | None = None + # defaults for these fields are not set here, because they are not relevant for data and key types + index_kind: IndexKind | None = None + distance_function: DistanceFunction | None = None + + @overload + def __init__( + self, + field_type: Literal["key"] = "key", + *, + name: str | None = None, + type: str | None = None, + storage_name: str | None = None, + ): + """Key field of the record. + + When the key will be auto-generated by the store, make sure it has a default, usually None. + Args: + field_type: always "key". + name: The name of the field. + storage_name: The name of the field in the store, uses the field name by default. + type: The type of the field. + """ + ... + + @overload + def __init__( + self, + field_type: Literal["data"] = "data", + *, + name: str | None = None, + type: str | None = None, + storage_name: str | None = None, + is_indexed: bool | None = None, + is_full_text_indexed: bool | None = None, + ): + """Data field in the record. -@release_candidate -@dataclass(kw_only=True) -class VectorStoreVectorField(VectorStoreFieldBase): - """Memory record vector field. - - This field should contain the value you want to use for the vector. - When passing in the embedding generator, the embedding will be - generated locally before upserting. 
- If this is not set, the store should support generating the embedding for you. - If you want to retrieve the original content of the vector, - make sure to set this field twice, - once with the VectorStoreRecordDataField and once with the VectorStoreRecordVectorField. - - If you want to be able to get the vectors back, make sure the type allows this, especially for pydantic models. - For instance, if the input is a string, then the type annotation should be `str | list[float] | None`. - - If you want to cast the vector that is returned, you need to set the deserialize_function, - for instance: `deserialize_function=np.array`, (with `import numpy as np` at the top of your file). - If you want to set it up with more specific options, use a lambda, a custom function or a partial. - - Args: - name: The name of the field. - storage_name: The name of the field in the store, uses the field name by default. - type: Property type. - For vectors this should be the inner type of the vector. - By default the vector will be a list of numbers. - If you want to use a numpy array or some other optimized format, - set the cast_function with a function - that takes a list of floats and returns a numpy array. - dimensions: The number of dimensions of the vector. - index_kind: The index kind to use. - distance_function: The distance function to use. - embedding_generator: The embedding generator to use. - If this is set, the embedding will be generated locally before upserting. - """ + Args: + field_type: always "data". + name: The name of the field. + storage_name: The name of the field in the store, uses the field name by default. + type: The type of the field. + is_indexed: Whether the field is indexed. + is_full_text_indexed: Whether the field is full text indexed. + """ + ... 
+ + @overload + def __init__( + self, + field_type: Literal["vector"] = "vector", + *, + name: str | None = None, + type: str | None = None, + dimensions: Annotated[int, Field(gt=0)], + storage_name: str | None = None, + index_kind: IndexKind | None = None, + distance_function: DistanceFunction | None = None, + embedding_generator: EmbeddingGeneratorBase | None = None, + ): + """Vector field in the record. + + This field should contain the value you want to use for the vector. + When passing in the embedding generator, the embedding will be + generated locally before upserting. + If this is not set, the store should support generating the embedding for you. + If you want to retrieve the original content of the vector, + make sure to set this field twice, + once with the VectorStoreRecordDataField and once with the VectorStoreRecordVectorField. + + If you want to be able to get the vectors back, make sure the type allows this, especially for pydantic models. + For instance, if the input is a string, then the type annotation should be `str | list[float] | None`. + + If you want to cast the vector that is returned, you need to set the deserialize_function, + for instance: `deserialize_function=np.array`, (with `import numpy as np` at the top of your file). + If you want to set it up with more specific options, use a lambda, a custom function or a partial. - dimensions: Annotated[int, Field(gt=0)] - index_kind: IndexKind = IndexKind.DEFAULT - distance_function: DistanceFunction = DistanceFunction.DEFAULT - embedding_generator: EmbeddingGeneratorBase | None = None - embedding_property_type: str | None = None + Args: + field_type: always "vector". + name: The name of the field. + storage_name: The name of the field in the store, uses the field name by default. + type: Property type. + For vectors this should be the inner type of the vector. + By default the vector will be a list of numbers. 
+ If you want to use a numpy array or some other optimized format, + set the cast_function with a function + that takes a list of floats and returns a numpy array. + dimensions: The number of dimensions of the vector, mandatory. + index_kind: The index kind to use, uses a default index kind when None. + distance_function: The distance function to use, uses a default distance function when None. + embedding_generator: The embedding generator to use. + If this is set, the embedding will be generated locally before upserting. + """ + ... + + def __init__( + self, + field_type="data", + *, + name=None, + type=None, + storage_name=None, + is_indexed=None, + is_full_text_indexed=None, + dimensions=None, + index_kind=None, + distance_function=None, + embedding_generator=None, + ): + """Vector store field.""" + self.field_type = field_type + # when a field is created, the name can be empty, + # when a field get's added to a definition, the name needs to be there. + self.name = name + self.storage_name = storage_name + self.type_ = type + self.is_indexed = is_indexed + self.is_full_text_indexed = is_full_text_indexed + if field_type == "vector": + if dimensions is None: + raise ValidationError("Vector fields must specify 'dimensions'") + self.dimensions = dimensions + self.index_kind = index_kind or IndexKind.DEFAULT + self.distance_function = distance_function or DistanceFunction.DEFAULT + self.embedding_generator = embedding_generator # region: Protocols @@ -236,8 +253,6 @@ def to_dict(self, *args: Any, **kwargs: Any) -> dict[str, Any]: # region: VectorStoreRecordDefinition -VectorStoreRecordFields = VectorStoreKeyField | VectorStoreDataField | VectorStoreVectorField - @release_candidate class VectorStoreCollectionDefinition(KernelBaseModel): @@ -252,7 +267,7 @@ class VectorStoreCollectionDefinition(KernelBaseModel): """ - fields: list[VectorStoreRecordFields] + fields: list[VectorStoreField] key_name: str = Field(default="", init=False) container_mode: bool = False 
collection_name: str | None = None @@ -272,7 +287,7 @@ def storage_names(self) -> list[str]: return [field.storage_name or field.name for field in self.fields] @property - def key_field(self) -> "VectorStoreKeyField": + def key_field(self) -> VectorStoreField: """Get the key field.""" return next((field for field in self.fields if field.name == self.key_name), None) # type: ignore @@ -282,26 +297,26 @@ def key_field_storage_name(self) -> str: return self.key_field.storage_name or self.key_field.name @property - def vector_fields(self) -> list["VectorStoreVectorField"]: + def vector_fields(self) -> list[VectorStoreField]: """Get the names of the vector fields.""" - return [field for field in self.fields if isinstance(field, VectorStoreVectorField)] + return [field for field in self.fields if field.field_type == "vector"] @property - def data_fields(self) -> list["VectorStoreDataField"]: - """Get the names of the vector fields.""" - return [field for field in self.fields if isinstance(field, VectorStoreDataField)] + def data_fields(self) -> list[VectorStoreField]: + """Get the names of the data fields.""" + return [field for field in self.fields if field.field_type == "data"] @property def vector_field_names(self) -> list[str]: """Get the names of the vector fields.""" - return [field.name for field in self.fields if isinstance(field, VectorStoreVectorField)] + return [field.name for field in self.fields if field.field_type == "vector"] @property def data_field_names(self) -> list[str]: """Get the names of all the data fields.""" - return [field.name for field in self.fields if isinstance(field, VectorStoreDataField)] + return [field.name for field in self.fields if field.field_type == "data"] - def try_get_vector_field(self, field_name: str | None = None) -> VectorStoreVectorField | None: + def try_get_vector_field(self, field_name: str | None = None) -> VectorStoreField | None: """Try to get the vector field. 
If the field_name is None, then the first vector field is returned. @@ -319,7 +334,7 @@ def try_get_vector_field(self, field_name: str | None = None) -> VectorStoreVect return self.vector_fields[0] for field in self.fields: if field.name == field_name or field.storage_name == field_name: - if isinstance(field, VectorStoreVectorField): + if field.field_type == "vector": return field raise VectorStoreModelException( f"Field {field_name} is not a vector field, it is of type {type(field).__name__}." @@ -339,8 +354,9 @@ def get_storage_names(self, include_vector_fields: bool = True, include_key_fiel return [ field.storage_name or field.name for field in self.fields - if (include_vector_fields or not isinstance(field, VectorStoreVectorField)) - and (include_key_field or field.name != self.key_name) + if field.field_type == "data" + or (field.field_type == "vector" and include_vector_fields) + or (field.field_type == "key" and include_key_field) ] def get_names(self, include_vector_fields: bool = True, include_key_field: bool = True) -> list[str]: @@ -356,8 +372,9 @@ def get_names(self, include_vector_fields: bool = True, include_key_field: bool return [ field.name for field in self.fields - if (include_vector_fields or not isinstance(field, VectorStoreVectorField)) - and (include_key_field or field.name != self.key_name) + if field.field_type == "data" + or (field.field_type == "vector" and include_vector_fields) + or (field.field_type == "key" and include_key_field) ] def model_post_init(self, _: Any): @@ -373,7 +390,9 @@ def model_post_init(self, _: Any): "There must be at least one field with a VectorStoreRecordField annotation." 
) for field in self.fields: - if isinstance(field, VectorStoreKeyField): + if field.name == "": + raise VectorStoreModelException("Field names must not be empty.") + if field.field_type == "key": if self.key_name != "": raise VectorStoreModelException("Memory record definition must have exactly one key field.") self.key_name = field.name @@ -384,24 +403,12 @@ def model_post_init(self, _: Any): # region: Signature parsing functions -def _parse_vector_store_record_field_class( - field_type: type[VectorStoreFieldBase], field: Parameter -) -> VectorStoreFieldBase: - property_type = field.annotation.__origin__ - if (args := getattr(property_type, "__args__", None)) and NoneType in args and len(args) == 2: - property_type = args[0] - property_type_name = str(property_type) if hasattr(property_type, "__args__") else property_type.__name__ - return field_type(name=field.name, type=property_type_name) - - -def _parse_vector_store_record_field_instance( - record_field: VectorStoreFieldBase, field: Parameter -) -> VectorStoreFieldBase: +def _parse_vector_store_record_field_instance(record_field: VectorStoreField, field: Parameter) -> VectorStoreField: if not record_field.name or record_field.name != field.name: record_field.name = field.name if not record_field.type_ and hasattr(field.annotation, "__origin__"): property_type = field.annotation.__origin__ - if isinstance(record_field, VectorStoreVectorField): + if record_field.field_type == "vector": if args := getattr(property_type, "__args__", None): if NoneType in args and len(args) > 1: for arg in args: @@ -429,24 +436,20 @@ def _parse_vector_store_record_field_instance( return record_field -def _parse_parameter_to_field(field: Parameter) -> VectorStoreFieldBase | None: +def _parse_parameter_to_field(field: Parameter) -> VectorStoreField | None: # first check if there are any annotations if field.annotation is not _empty and hasattr(field.annotation, "__metadata__"): for field_annotation in field.annotation.__metadata__: - 
if isinstance(field_annotation, VectorStoreFieldBase): + if isinstance(field_annotation, VectorStoreField): return _parse_vector_store_record_field_instance(field_annotation, field) - if isinstance(field_annotation, type(VectorStoreFieldBase)): - return _parse_vector_store_record_field_class(field_annotation, field) # This means there are no annotations or that all annotations are of other types. # we will check if there is a default, otherwise this will cause a runtime error. # because it will not be stored, and retrieving this object will fail without a default for this field. if field.default is _empty: raise VectorStoreModelException( - "Fields that do not have a VectorStoreRecordField annotation must have a default value." + "Fields that do not have a VectorStoreField annotation must have a default value." ) - logger.debug( - f'Field "{field.name}" does not have a VectorStoreRecordField annotation, will not be part of the record.' - ) + logger.debug(f'Field "{field.name}" does not have a VectorStoreField annotation, will not be part of the record.') return None @@ -486,8 +489,8 @@ def vectorstoremodel( This decorator makes a class a vector store model. There are three things being checked: - The class must have at least one field with a annotation, - of type VectorStoreRecordKeyField, VectorStoreRecordDataField or VectorStoreRecordVectorField. - - The class must have exactly one field with the VectorStoreRecordKeyField annotation. + of type VectorStoreField. + - The class must have exactly one field with the field_type `key`. - When creating a Vector Field, either supply the property type directly, or make sure to set the property that you want the index to use first. @@ -495,10 +498,10 @@ def vectorstoremodel( Args: cls: The class to be decorated. collection_name: The name of the collection to be used. - This is used to set the collection name in the VectorStoreRecordDefinition. + This is used to set the collection name in the VectorStoreCollectionDefinition. 
Raises: - VectorStoreModelException: If there are no fields with a VectorStoreRecordField annotation. + VectorStoreModelException: If there are no fields with a VectorStoreField annotation. VectorStoreModelException: If there are fields with no name. VectorStoreModelException: If there is no key field. """ diff --git a/python/semantic_kernel/data/vectors.py b/python/semantic_kernel/data/vectors.py index 8d24c390af06..fa0bfbc2797f 100644 --- a/python/semantic_kernel/data/vectors.py +++ b/python/semantic_kernel/data/vectors.py @@ -17,12 +17,7 @@ from semantic_kernel.connectors.ai.embedding_generator_base import EmbeddingGeneratorBase from semantic_kernel.connectors.ai.prompt_execution_settings import PromptExecutionSettings from semantic_kernel.data.const import DEFAULT_DESCRIPTION, DEFAULT_FUNCTION_NAME -from semantic_kernel.data.definitions import ( - SerializeMethodProtocol, - VectorStoreCollectionDefinition, - VectorStoreKeyField, - VectorStoreVectorField, -) +from semantic_kernel.data.definitions import SerializeMethodProtocol, VectorStoreCollectionDefinition, VectorStoreField from semantic_kernel.data.search import ( DynamicFilterFunction, KernelSearchResults, @@ -47,7 +42,7 @@ from semantic_kernel.functions.kernel_function_from_method import KernelFunctionFromMethod from semantic_kernel.functions.kernel_parameter_metadata import KernelParameterMetadata from semantic_kernel.kernel_pydantic import KernelBaseModel -from semantic_kernel.kernel_types import OneOrMany, OptionalOneOrList, OptionalOneOrMany +from semantic_kernel.kernel_types import OneOrList, OneOrMany, OptionalOneOrList, OptionalOneOrMany from semantic_kernel.utils.feature_stage_decorator import release_candidate from semantic_kernel.utils.list_handler import desync_list @@ -233,7 +228,7 @@ def _deserialize_store_models_to_dicts(self, records: Sequence[Any], **kwargs: A # region Serialization methods - def serialize(self, records: OneOrMany[TModel], **kwargs: Any) -> OneOrMany[Any]: + async def 
serialize(self, records: OneOrMany[TModel], **kwargs: Any) -> OneOrMany[Any]: """Serialize the data model to the store model. This method follows the following steps: @@ -253,19 +248,38 @@ def serialize(self, records: OneOrMany[TModel], **kwargs: Any) -> OneOrMany[Any] try: if serialized := self._serialize_data_model_to_store_model(records): return serialized + except VectorStoreModelSerializationException: + raise # pragma: no cover + except Exception as exc: + raise VectorStoreModelSerializationException(f"Error serializing records: {exc}") from exc - if isinstance(records, Sequence): - dict_records = [self._serialize_data_model_to_dict(rec) for rec in records] - return self._serialize_dicts_to_store_models(dict_records, **kwargs) # type: ignore - - dict_records = self._serialize_data_model_to_dict(records) # type: ignore - if isinstance(dict_records, Sequence): - # most likely this is a container, so we return all records as a list - # can also be a single record, but the to_dict returns a list - # hence we will treat it as a container. 
- return self._serialize_dicts_to_store_models(dict_records, **kwargs) # type: ignore - # this case is single record in, single record out - return self._serialize_dicts_to_store_models([dict_records], **kwargs)[0] + try: + dict_records: list[dict[str, Any]] = [] + if not isinstance(records, list): + records = [records] # type: ignore + for rec in records: + dict_rec = self._serialize_data_model_to_dict(rec) + if isinstance(dict_rec, list): + dict_records.extend(dict_rec) + else: + dict_records.append(dict_rec) + except VectorStoreModelSerializationException: + raise # pragma: no cover + except Exception as exc: + raise VectorStoreModelSerializationException(f"Error serializing records: {exc}") from exc + + # add vectors + try: + dict_records = await self._add_vectors_to_records(dict_records) # type: ignore + except (VectorStoreModelException, VectorStoreOperationException): + raise + except Exception as exc: + raise VectorStoreOperationException( + "Exception occurred while trying to add the vectors to the records." + ) from exc + + try: + return self._serialize_dicts_to_store_models(dict_records, **kwargs) # type: ignore except VectorStoreModelSerializationException: raise # pragma: no cover except Exception as exc: @@ -290,7 +304,7 @@ def _serialize_data_model_to_store_model(self, record: OneOrMany[TModel], **kwar return record.serialize(**kwargs) return None - def _serialize_data_model_to_dict(self, record: TModel, **kwargs: Any) -> OneOrMany[dict[str, Any]]: + def _serialize_data_model_to_dict(self, record: TModel, **kwargs: Any) -> OneOrList[dict[str, Any]]: """This function is used if no serialize method is found on the data model. This will generally serialize the data model to a dict, should not be overridden by child classes. @@ -298,7 +312,7 @@ def _serialize_data_model_to_dict(self, record: TModel, **kwargs: Any) -> OneOrM The output of this should be passed to the serialize_dict_to_store_model method. 
""" if self.definition.to_dict: - return self.definition.to_dict(record, **kwargs) + return self.definition.to_dict(record, **kwargs) # type: ignore if isinstance(record, BaseModel): return record.model_dump() @@ -392,7 +406,7 @@ def _deserialize_dict_to_data_model(self, record: OneOrMany[dict[str, Any]], **k data_model_dict: dict[str, Any] = {} for field in self.definition.fields: value = record.get(field.storage_name or field.name, None) - if isinstance(field, VectorStoreVectorField) and not kwargs.get("include_vectors"): + if field.field_type == "vector" and not kwargs.get("include_vectors"): continue data_model_dict[field.name] = value if self.record_type is dict: @@ -425,6 +439,10 @@ async def _add_vectors_to_records( embedding_generator = field.embedding_generator or self.embedding_generator if not embedding_generator: continue + if field.dimensions is None: + raise VectorStoreModelException( + f"Field {field.name} has no dimensions, cannot create embedding for field." + ) embeddings_to_make.append(( field.storage_name or field.name, field.dimensions, @@ -455,9 +473,7 @@ async def _add_embedding_to_object( contents: list[Any] = [] dict_like = (getter := getattr(inputs, "get", False)) and callable(getter) list_of_dicts: bool = False - if container_mode: - contents = inputs[field_name].tolist() # type: ignore - elif isinstance(inputs, list): + if isinstance(inputs, list): list_of_dicts = (getter := getattr(inputs[0], "get", False)) and callable(getter) for record in inputs: if list_of_dicts: @@ -475,9 +491,6 @@ async def _add_embedding_to_object( ) # type: ignore if vectors is None: raise VectorStoreOperationException("No vectors were generated.") - if container_mode: - inputs[field_name] = vectors # type: ignore - return if isinstance(inputs, list): for record, vector in zip(inputs, vectors): if list_of_dicts: @@ -698,20 +711,11 @@ async def upsert( raise VectorStoreOperationException("Either record or records must be provided.") try: - data = 
self.serialize(records) + data = await self.serialize(records) # the serialize method will parse any exception into a VectorStoreModelSerializationException except VectorStoreModelSerializationException: raise - try: - # fix this! - data = await self._add_vectors_to_records(data) - except (VectorStoreModelException, VectorStoreOperationException): - raise - except Exception as exc: - raise VectorStoreOperationException( - "Exception occurred while trying to add the vectors to the records." - ) from exc try: results = await self._inner_upsert(data if isinstance(data, list) else [data], **kwargs) # type: ignore except Exception as exc: @@ -952,7 +956,7 @@ async def does_collection_exist(self, collection_name: str) -> bool: to check if the collection exists. """ try: - data_model = VectorStoreCollectionDefinition(fields=[VectorStoreKeyField(name="id")]) + data_model = VectorStoreCollectionDefinition(fields=[VectorStoreField("key", name="id")]) collection = self.get_collection(record_type=dict, definition=data_model, collection_name=collection_name) return await collection.does_collection_exist() except VectorStoreOperationException: @@ -965,7 +969,7 @@ async def ensure_collection_deleted(self, collection_name: str) -> None: to delete the collection. 
""" try: - data_model = VectorStoreCollectionDefinition(fields=[VectorStoreKeyField(name="id")]) + data_model = VectorStoreCollectionDefinition(fields=[VectorStoreField("key", name="id")]) collection = self.get_collection(record_type=dict, definition=data_model, collection_name=collection_name) await collection.ensure_collection_deleted() except VectorStoreOperationException: diff --git a/python/tests/conftest.py b/python/tests/conftest.py index c7b39dcfe0c1..30e20e578c2a 100644 --- a/python/tests/conftest.py +++ b/python/tests/conftest.py @@ -3,7 +3,7 @@ import logging from collections.abc import Callable from dataclasses import dataclass, field -from typing import TYPE_CHECKING, Annotated +from typing import TYPE_CHECKING, Annotated, Any from unittest.mock import MagicMock from uuid import uuid4 @@ -12,13 +12,7 @@ from pytest import fixture from semantic_kernel.agents import Agent, DeclarativeSpecMixin, register_agent_type -from semantic_kernel.data.definitions import ( - VectorStoreCollectionDefinition, - VectorStoreDataField, - VectorStoreKeyField, - VectorStoreVectorField, - vectorstoremodel, -) +from semantic_kernel.data.definitions import VectorStoreCollectionDefinition, VectorStoreField, vectorstoremodel if TYPE_CHECKING: from semantic_kernel import Kernel @@ -325,15 +319,16 @@ def dataclass_vector_data_model( class MyDataModel: vector: Annotated[ str | list[float] | None, - VectorStoreVectorField( + VectorStoreField( + "vector", index_kind=index_kind, dimensions=dimensions, distance_function=distance_function, type=vector_property_type, ), ] = None - id: Annotated[str, VectorStoreKeyField(type="str")] = field(default_factory=lambda: str(uuid4())) - content: Annotated[str, VectorStoreDataField(type="str")] = "content1" + id: Annotated[str, VectorStoreField("key", type="str")] = field(default_factory=lambda: str(uuid4())) + content: Annotated[str, VectorStoreField("data", type="str")] = "content1" return MyDataModel @@ -344,9 +339,10 @@ def definition( ) -> 
VectorStoreCollectionDefinition: return VectorStoreCollectionDefinition( fields=[ - VectorStoreKeyField(name="id", type="str"), - VectorStoreDataField(name="content", type="str", is_full_text_indexed=True), - VectorStoreVectorField( + VectorStoreField("key", name="id", type="str"), + VectorStoreField("data", name="content", type="str", is_full_text_indexed=True), + VectorStoreField( + "vector", name="vector", dimensions=dimensions, index_kind=index_kind, @@ -361,15 +357,16 @@ def definition( def definition_pandas(index_kind: str, distance_function: str, vector_property_type: str, dimensions: int) -> object: return VectorStoreCollectionDefinition( fields=[ - VectorStoreVectorField( + VectorStoreField( + "vector", name="vector", index_kind=index_kind, dimensions=dimensions, distance_function=distance_function, type=vector_property_type, ), - VectorStoreKeyField(name="id"), - VectorStoreDataField(name="content", type="str"), + VectorStoreField("key", name="id"), + VectorStoreField("data", name="content", type="str"), ], container_mode=True, to_dict=lambda x: x.to_dict(orient="records"), @@ -381,17 +378,22 @@ def definition_pandas(index_kind: str, distance_function: str, vector_property_t def record_type(index_kind: str, distance_function: str, vector_property_type: str, dimensions: int) -> object: @vectorstoremodel class DataModelClass(BaseModel): - content: Annotated[str, VectorStoreDataField()] + content: Annotated[str, VectorStoreField("data")] vector: Annotated[ str | list[float] | None, - VectorStoreVectorField( - index_kind=index_kind, - distance_function=distance_function, + VectorStoreField( + "vector", type=vector_property_type, dimensions=dimensions, + index_kind=index_kind, + distance_function=distance_function, ), - ] - id: Annotated[str, VectorStoreKeyField()] + ] = None + id: Annotated[str, VectorStoreField("key")] + + def model_post_init(self, context: Any) -> None: + if self.vector is None: + self.vector = self.content return DataModelClass @@ -404,17 
+406,18 @@ def record_type_with_key_as_key_field( @vectorstoremodel class DataModelClass(BaseModel): - content: Annotated[str, VectorStoreDataField()] + content: Annotated[str, VectorStoreField("data")] vector: Annotated[ str | list[float] | None, - VectorStoreVectorField( + VectorStoreField( + "vector", index_kind=index_kind, distance_function=distance_function, type=vector_property_type, dimensions=dimensions, ), ] - key: Annotated[str, VectorStoreKeyField()] + key: Annotated[str, VectorStoreField("key")] return DataModelClass diff --git a/python/tests/integration/memory/azure_cosmos_db/conftest.py b/python/tests/integration/memory/azure_cosmos_db/conftest.py index 089e674f7a55..9276453fdcc1 100644 --- a/python/tests/integration/memory/azure_cosmos_db/conftest.py +++ b/python/tests/integration/memory/azure_cosmos_db/conftest.py @@ -8,12 +8,7 @@ from pydantic import BaseModel from pytest import fixture -from semantic_kernel.data.definitions import ( - VectorStoreDataField, - VectorStoreKeyField, - VectorStoreVectorField, - vectorstoremodel, -) +from semantic_kernel.data.definitions import VectorStoreField, vectorstoremodel @fixture @@ -32,17 +27,18 @@ def record_type() -> type: class TestDataModelType(BaseModel): vector: Annotated[ list[float] | None, - VectorStoreVectorField( + VectorStoreField( + "vector", index_kind="flat", dimensions=5, distance_function="cosine_similarity", type="float", ), ] = None - id: Annotated[str, VectorStoreKeyField()] = field(default_factory=lambda: str(uuid4())) - product_type: Annotated[str, VectorStoreDataField()] = "N/A" + id: Annotated[str, VectorStoreField("key")] = field(default_factory=lambda: str(uuid4())) + product_type: Annotated[str, VectorStoreField("data")] = "N/A" description: Annotated[ - str, VectorStoreDataField(has_embedding=True, embedding_property_name="vector", type="str") + str, VectorStoreField("data", has_embedding=True, embedding_property_name="vector", type="str") ] = "N/A" return TestDataModelType @@ -64,17 
+60,18 @@ def record_type_with_key_as_key_field() -> type: class TestDataModelType(BaseModel): vector: Annotated[ list[float] | None, - VectorStoreVectorField( + VectorStoreField( + "vector", index_kind="flat", dimensions=5, distance_function="cosine_similarity", type="float", ), ] = None - key: Annotated[str, VectorStoreKeyField()] = field(default_factory=lambda: str(uuid4())) - product_type: Annotated[str, VectorStoreDataField()] = "N/A" + key: Annotated[str, VectorStoreField("key")] = field(default_factory=lambda: str(uuid4())) + product_type: Annotated[str, VectorStoreField("data")] = "N/A" description: Annotated[ - str, VectorStoreDataField(has_embedding=True, embedding_property_name="vector", type="str") + str, VectorStoreField("data", has_embedding=True, embedding_property_name="vector", type="str") ] = "N/A" return TestDataModelType diff --git a/python/tests/integration/memory/postgres/test_postgres_int.py b/python/tests/integration/memory/postgres/test_postgres_int.py index 8a570c04e68d..97ef8971bcab 100644 --- a/python/tests/integration/memory/postgres/test_postgres_int.py +++ b/python/tests/integration/memory/postgres/test_postgres_int.py @@ -11,12 +11,7 @@ from pydantic import BaseModel from semantic_kernel.connectors.memory.postgres import PostgresCollection, PostgresSettings, PostgresStore -from semantic_kernel.data import ( - VectorStoreCollectionDefinition, - VectorStoreDataField, - VectorStoreKeyField, - VectorStoreVectorField, -) +from semantic_kernel.data import VectorStoreCollectionDefinition, VectorStoreField from semantic_kernel.data.const import DistanceFunction, IndexKind from semantic_kernel.data.definitions import vectorstoremodel from semantic_kernel.data.vectors import VectorSearchOptions @@ -47,10 +42,11 @@ @vectorstoremodel class SimpleDataModel(BaseModel): - id: Annotated[int, VectorStoreKeyField()] + id: Annotated[int, VectorStoreField("key")] embedding: Annotated[ - list[float] | None, - VectorStoreVectorField( + list[float] | str | 
None, + VectorStoreField( + "vector", index_kind=IndexKind.HNSW, dimensions=3, distance_function=DistanceFunction.COSINE_SIMILARITY, @@ -58,25 +54,28 @@ class SimpleDataModel(BaseModel): ] = None data: Annotated[ dict[str, Any], - VectorStoreDataField(has_embedding=True, embedding_property_name="embedding", type="JSONB"), + VectorStoreField("data", type="JSONB"), ] + def model_post_init(self, context: Any) -> None: + if self.embedding is None: + self.embedding = self.data + def DataModelPandas(record) -> tuple: definition = VectorStoreCollectionDefinition( - fields={ - "embedding": VectorStoreVectorField( + fields=[ + VectorStoreField( + "vector", name="embedding", index_kind="hnsw", dimensions=3, distance_function="cosine_similarity", type="float", ), - "id": VectorStoreKeyField(name="id", type="int"), - "data": VectorStoreDataField( - name="data", has_embedding=True, embedding_property_name="embedding", type="dict" - ), - }, + VectorStoreField("key", name="id", type="int"), + VectorStoreField("data", name="data", type="dict"), + ], container_mode=True, to_dict=lambda x: x.to_dict(orient="records"), from_dict=lambda x, **_: pd.DataFrame(x), @@ -203,18 +202,18 @@ async def test_upsert_get_and_delete_pandas(vector_store): await collection.ensure_collection_deleted() -async def test_upsert_get_and_delete_batch(vector_store: PostgresStore): +async def test_upsert_get_and_delete_multiple(vector_store: PostgresStore): async with create_simple_collection(vector_store) as simple_collection: record1 = SimpleDataModel(id=1, embedding=[1.1, 2.2, 3.3], data={"key": "value"}) record2 = SimpleDataModel(id=2, embedding=[4.4, 5.5, 6.6], data={"key": "value"}) - result_before_upsert = await simple_collection.get_batch([1, 2]) + result_before_upsert = await simple_collection.get([1, 2]) assert result_before_upsert is None - await simple_collection.upsert_batch([record1, record2]) - # Test get_batch for the two existing keys and one non-existing key; + await 
simple_collection.upsert([record1, record2]) + # Test get for the two existing keys and one non-existing key; # this should return only the two existing records. - result = await simple_collection.get_batch([1, 2, 3]) + result = await simple_collection.get([1, 2, 3]) assert result is not None assert isinstance(result, Sequence) assert len(result) == 2 @@ -227,8 +226,8 @@ async def test_upsert_get_and_delete_batch(vector_store: PostgresStore): assert result[1].embedding == record2.embedding assert result[1].data == record2.data - await simple_collection.delete_batch([1, 2]) - result_after_delete = await simple_collection.get_batch([1, 2]) + await simple_collection.delete([1, 2]) + result_after_delete = await simple_collection.get([1, 2]) assert result_after_delete is None @@ -243,7 +242,7 @@ async def test_search(vector_store: PostgresStore): SimpleDataModel(id=6, embedding=[1.0, 0.0, 1.0], data={"key": "value6"}), ] - await simple_collection.upsert_batch(records) + await simple_collection.upsert(records) try: search_results = await simple_collection.vectorized_search( @@ -254,4 +253,4 @@ async def test_search(vector_store: PostgresStore): assert {result.record.id async for result in search_results.results} == {1, 2, 3} finally: - await simple_collection.delete_batch([r.id for r in records]) + await simple_collection.delete([r.id for r in records]) diff --git a/python/tests/unit/connectors/memory/azure_cosmos_db/test_azure_cosmos_db_mongodb_collection.py b/python/tests/unit/connectors/memory/azure_cosmos_db/test_azure_cosmos_db_mongodb_collection.py index d7bf66d7e1fb..04d8b412d979 100644 --- a/python/tests/unit/connectors/memory/azure_cosmos_db/test_azure_cosmos_db_mongodb_collection.py +++ b/python/tests/unit/connectors/memory/azure_cosmos_db/test_azure_cosmos_db_mongodb_collection.py @@ -6,12 +6,7 @@ from pymongo import AsyncMongoClient from semantic_kernel.connectors.memory.azure_cosmos_db import CosmosMongoCollection -from semantic_kernel.data.definitions 
import ( - VectorStoreCollectionDefinition, - VectorStoreDataField, - VectorStoreKeyField, - VectorStoreVectorField, -) +from semantic_kernel.data import VectorStoreCollectionDefinition, VectorStoreField from semantic_kernel.exceptions import VectorStoreInitializationException @@ -19,9 +14,9 @@ def mock_model() -> VectorStoreCollectionDefinition: return VectorStoreCollectionDefinition( fields=[ - VectorStoreKeyField(name="id"), - VectorStoreDataField(name="content"), - VectorStoreVectorField(name="vector", dimensions=5), + VectorStoreField("key", name="id"), + VectorStoreField("data", name="content"), + VectorStoreField("vector", name="vector", dimensions=5), ] ) diff --git a/python/tests/unit/connectors/memory/azure_cosmos_db/test_azure_cosmos_db_no_sql_collection.py b/python/tests/unit/connectors/memory/azure_cosmos_db/test_azure_cosmos_db_no_sql_collection.py index 2da7dce7ae30..6242e212eca0 100644 --- a/python/tests/unit/connectors/memory/azure_cosmos_db/test_azure_cosmos_db_no_sql_collection.py +++ b/python/tests/unit/connectors/memory/azure_cosmos_db/test_azure_cosmos_db_no_sql_collection.py @@ -433,13 +433,12 @@ async def test_azure_cosmos_db_no_sql_get( vector_collection._get_container_proxy = AsyncMock(return_value=mock_container_proxy) get_results = MagicMock(spec=AsyncGenerator) - get_results.__aiter__.return_value = [{"content": "test_content", "vector": [1.0, 2.0, 3.0], "id": "test_id"}] + get_results.__aiter__.return_value = [{"content": "test_content", "id": "test_id"}] mock_container_proxy.query_items.return_value = get_results record = await vector_collection.get("test_id") assert isinstance(record, record_type) assert record.content == "test_content" - assert record.vector == [1.0, 2.0, 3.0] assert record.id == "test_id" diff --git a/python/tests/unit/connectors/memory/test_faiss.py b/python/tests/unit/connectors/memory/test_faiss.py index ef7664d60ff0..2db5919e6cd6 100644 --- a/python/tests/unit/connectors/memory/test_faiss.py +++ 
b/python/tests/unit/connectors/memory/test_faiss.py @@ -4,13 +4,7 @@ from pytest import fixture, mark, raises from semantic_kernel.connectors.memory.faiss import FaissCollection, FaissStore -from semantic_kernel.data import ( - VectorStoreCollectionDefinition, - VectorStoreDataField, - VectorStoreKeyField, - VectorStoreVectorField, -) -from semantic_kernel.data.const import DistanceFunction +from semantic_kernel.data import DistanceFunction, VectorStoreCollectionDefinition, VectorStoreField from semantic_kernel.exceptions import VectorStoreInitializationException @@ -18,9 +12,10 @@ def data_model_def() -> VectorStoreCollectionDefinition: return VectorStoreCollectionDefinition( fields=[ - VectorStoreKeyField(name="id"), - VectorStoreDataField(name="content"), - VectorStoreVectorField( + VectorStoreField("key", name="id"), + VectorStoreField("data", name="content"), + VectorStoreField( + "vector", name="vector", dimensions=5, index_kind="flat", diff --git a/python/tests/unit/connectors/memory/test_postgres_store.py b/python/tests/unit/connectors/memory/test_postgres_store.py index 21742fd9670a..d7e129c74b0f 100644 --- a/python/tests/unit/connectors/memory/test_postgres_store.py +++ b/python/tests/unit/connectors/memory/test_postgres_store.py @@ -18,12 +18,7 @@ PostgresStore, ) from semantic_kernel.data.const import DistanceFunction, IndexKind -from semantic_kernel.data.definitions import ( - VectorStoreDataField, - VectorStoreKeyField, - VectorStoreVectorField, - vectorstoremodel, -) +from semantic_kernel.data.definitions import VectorStoreField, vectorstoremodel @fixture(scope="function") @@ -61,14 +56,15 @@ async def vector_store(postgres_unit_test_env) -> AsyncGenerator[PostgresStore, @vectorstoremodel @dataclass class SimpleDataModel: - id: Annotated[int, VectorStoreKeyField()] + id: Annotated[int, VectorStoreField("key")] data: Annotated[ list[float] | str | None, - VectorStoreVectorField( - index_kind=IndexKind.HNSW, + VectorStoreField( + "vector", + 
type="float", dimensions=1536, + index_kind=IndexKind.HNSW, distance_function=DistanceFunction.COSINE_SIMILARITY, - type="float", ), ] = None @@ -159,12 +155,12 @@ async def test_create_collection_model_with_python_types(vector_store: PostgresS @vectorstoremodel @dataclass class ModelWithImplicitTypes: - name: Annotated[str, VectorStoreKeyField()] - age: Annotated[int, VectorStoreDataField()] - data: Annotated[dict[str, Any], VectorStoreDataField()] - embedding: Annotated[list[float], VectorStoreVectorField(dimensions=20)] - scores: Annotated[list[float], VectorStoreDataField()] - tags: Annotated[list[str], VectorStoreDataField()] + name: Annotated[str, VectorStoreField("key")] + age: Annotated[int, VectorStoreField("data")] + data: Annotated[dict[str, Any], VectorStoreField("data")] + embedding: Annotated[list[float], VectorStoreField("vector", dimensions=20)] + scores: Annotated[list[float], VectorStoreField("data")] + tags: Annotated[list[str], VectorStoreField("data")] collection = vector_store.get_collection(collection_name="test_collection", record_type=ModelWithImplicitTypes) @@ -260,10 +256,11 @@ async def test_vector_search( @vectorstoremodel @dataclass class SimpleDataModel: - id: Annotated[int, VectorStoreKeyField()] + id: Annotated[int, VectorStoreField("key")] embedding: Annotated[ - list[float], - VectorStoreVectorField( + list[float] | str | None, + VectorStoreField( + "vector", index_kind=IndexKind.HNSW, dimensions=1536, distance_function=distance_function, @@ -272,9 +269,13 @@ class SimpleDataModel: ] data: Annotated[ dict[str, Any], - VectorStoreDataField(type="JSONB"), + VectorStoreField("data", type="JSONB"), ] + def model_post_init(self, context: Any) -> None: + if self.embedding is None: + self.embedding = self.data + collection = vector_store.get_collection(collection_name="test_collection", record_type=SimpleDataModel) assert isinstance(collection, PostgresCollection) @@ -326,14 +327,15 @@ async def 
test_model_post_init_conflicting_distance_column_name(vector_store: Po @vectorstoremodel @dataclass class ConflictingDataModel: - id: Annotated[int, VectorStoreKeyField()] + id: Annotated[int, VectorStoreField("key")] sk_pg_distance: Annotated[ - float, VectorStoreDataField() + float, VectorStoreField("data") ] # Note: test depends on value of DISTANCE_COLUMN_NAME constant embedding: Annotated[ list[float], - VectorStoreVectorField( + VectorStoreField( + "vector", index_kind=IndexKind.HNSW, dimensions=1536, distance_function=DistanceFunction.COSINE_SIMILARITY, @@ -342,7 +344,7 @@ class ConflictingDataModel: ] data: Annotated[ dict[str, Any], - VectorStoreDataField(type="JSONB"), + VectorStoreField("data", type="JSONB"), ] collection = vector_store.get_collection(collection_name="test_collection", record_type=ConflictingDataModel) diff --git a/python/tests/unit/connectors/memory/test_qdrant.py b/python/tests/unit/connectors/memory/test_qdrant.py index 83751a1d48df..db278c8756fb 100644 --- a/python/tests/unit/connectors/memory/test_qdrant.py +++ b/python/tests/unit/connectors/memory/test_qdrant.py @@ -8,7 +8,7 @@ from semantic_kernel.connectors.memory.qdrant import QdrantCollection, QdrantStore from semantic_kernel.data.const import DistanceFunction -from semantic_kernel.data.definitions import VectorStoreVectorField +from semantic_kernel.data.definitions import VectorStoreField from semantic_kernel.exceptions import ( VectorSearchExecutionException, VectorStoreInitializationException, @@ -197,7 +197,7 @@ def test_collection_init_fail(definition): with raises( VectorStoreModelValidationError, match="Only one vector field is allowed when not using named vectors." 
): - definition.fields.append(VectorStoreVectorField(name="vector2", dimensions=3)) + definition.fields.append(VectorStoreField("vector", name="vector2", dimensions=3)) QdrantCollection( record_type=dict, collection_name="test", diff --git a/python/tests/unit/connectors/memory/test_sql_server.py b/python/tests/unit/connectors/memory/test_sql_server.py index 3a8c9e083c83..b287064c4e54 100644 --- a/python/tests/unit/connectors/memory/test_sql_server.py +++ b/python/tests/unit/connectors/memory/test_sql_server.py @@ -22,7 +22,7 @@ _build_select_table_names_query, ) from semantic_kernel.data.const import DistanceFunction, IndexKind -from semantic_kernel.data.definitions import VectorStoreDataField, VectorStoreKeyField, VectorStoreVectorField +from semantic_kernel.data.definitions import VectorStoreField from semantic_kernel.data.vectors import VectorSearchOptions from semantic_kernel.exceptions.vector_store_exceptions import ( VectorStoreInitializationException, @@ -110,13 +110,13 @@ class TestQueryBuildFunctions: def test_build_create_table_query(self): schema = "dbo" table = "Test" - key_field = VectorStoreKeyField(name="id", type="str") + key_field = VectorStoreField("key", name="id", type="str") data_fields = [ - VectorStoreDataField(name="name", type="str"), - VectorStoreDataField(name="age", type="int"), + VectorStoreField("data", name="name", type="str"), + VectorStoreField("data", name="age", type="int"), ] vector_fields = [ - VectorStoreVectorField(name="embedding", type="float", dimensions=1536), + VectorStoreField("vector", name="embedding", type="float", dimensions=1536), ] cmd = _build_create_table_query(schema, table, key_field, data_fields, vector_fields) assert not cmd.parameters @@ -149,13 +149,13 @@ def test_build_select_table_names_query(self, schema): def test_build_merge_query(self): schema = "dbo" table = "Test" - key_field = VectorStoreKeyField(name="id", type="str") + key_field = VectorStoreField("key", name="id", type="str") data_fields = [ - 
VectorStoreDataField(name="name", type="str"), - VectorStoreDataField(name="age", type="int"), + VectorStoreField("data", name="name", type="str"), + VectorStoreField("data", name="age", type="int"), ] vector_fields = [ - VectorStoreVectorField(name="embedding", type="float", dimensions=5), + VectorStoreField("vector", name="embedding", type="float", dimensions=5), ] records = [ { @@ -182,13 +182,13 @@ def test_build_merge_query(self): def test_build_select_query(self): schema = "dbo" table = "Test" - key_field = VectorStoreKeyField(name="id", type="str") + key_field = VectorStoreField("key", name="id", type="str") data_fields = [ - VectorStoreDataField(name="name", type="str"), - VectorStoreDataField(name="age", type="int"), + VectorStoreField("data", name="name", type="str"), + VectorStoreField("data", name="age", type="int"), ] vector_fields = [ - VectorStoreVectorField(name="embedding", type="float", dimensions=5), + VectorStoreField("vector", name="embedding", type="float", dimensions=5), ] keys = ["test"] cmd = _build_select_query(schema, table, key_field, data_fields, vector_fields, keys) @@ -199,7 +199,7 @@ def test_build_select_query(self): def test_build_delete_query(self): schema = "dbo" table = "Test" - key_field = VectorStoreKeyField(name="id", type="str") + key_field = VectorStoreField("key", name="id", type="str") keys = ["test"] cmd = _build_delete_query(schema, table, key_field, keys) str_cmd = str(cmd) @@ -209,13 +209,14 @@ def test_build_delete_query(self): def test_build_search_query(self): schema = "dbo" table = "Test" - key_field = VectorStoreKeyField(name="id", type="str") + key_field = VectorStoreField("key", name="id", type="str") data_fields = [ - VectorStoreDataField(name="name", type="str"), - VectorStoreDataField(name="age", type="int"), + VectorStoreField("data", name="name", type="str"), + VectorStoreField("data", name="age", type="int"), ] vector_fields = [ - VectorStoreVectorField( + VectorStoreField( + "vector", name="embedding", 
type="float", dimensions=5, diff --git a/python/tests/unit/data/conftest.py b/python/tests/unit/data/conftest.py index 24d9e90ac75c..3533483f4610 100644 --- a/python/tests/unit/data/conftest.py +++ b/python/tests/unit/data/conftest.py @@ -13,12 +13,10 @@ from semantic_kernel.data import ( KernelSearchResults, VectorStoreCollectionDefinition, - VectorStoreDataField, - VectorStoreKeyField, VectorStoreRecordCollection, - VectorStoreVectorField, vectorstoremodel, ) +from semantic_kernel.data.definitions import VectorStoreField from semantic_kernel.data.vectors import VectorSearch, VectorSearchResult from semantic_kernel.kernel_types import OptionalOneOrMany @@ -100,9 +98,9 @@ def _lambda_parser(self, node: ast.AST) -> str: def definition() -> object: return VectorStoreCollectionDefinition( fields=[ - VectorStoreKeyField(name="id"), - VectorStoreDataField(name="content"), - VectorStoreVectorField(dimensions=5, name="vector"), + VectorStoreField("key", name="id"), + VectorStoreField("data", name="content"), + VectorStoreField("vector", dimensions=5, name="vector"), ] ) @@ -117,9 +115,9 @@ def deserialize(records, **kwargs): return VectorStoreCollectionDefinition( fields=[ - VectorStoreKeyField(name="id"), - VectorStoreDataField(name="content"), - VectorStoreVectorField(dimensions=5, name="vector"), + VectorStoreField("key", name="id"), + VectorStoreField("data", name="content"), + VectorStoreField("vector", dimensions=5, name="vector"), ], serialize=serialize, deserialize=deserialize, @@ -136,9 +134,9 @@ def from_dict(records, **kwargs): return VectorStoreCollectionDefinition( fields=[ - VectorStoreKeyField(name="id"), - VectorStoreDataField(name="content"), - VectorStoreVectorField(dimensions=5, name="vector"), + VectorStoreField("key", name="id"), + VectorStoreField("data", name="content"), + VectorStoreField("vector", dimensions=5, name="vector"), ], to_dict=to_dict, from_dict=from_dict, @@ -159,9 +157,9 @@ def from_dict(records: list[dict[str, Any]], **kwargs) -> 
dict[str, dict[str, An return VectorStoreCollectionDefinition( fields=[ - VectorStoreKeyField(name="id"), - VectorStoreDataField(name="content"), - VectorStoreVectorField(dimensions=5, name="vector"), + VectorStoreField("key", name="id"), + VectorStoreField("data", name="content"), + VectorStoreField("vector", dimensions=5, name="vector"), ], container_mode=True, to_dict=to_dict, @@ -183,9 +181,9 @@ def deserialize(records: list[dict[str, Any]], **kwargs) -> dict[str, dict[str, return VectorStoreCollectionDefinition( fields=[ - VectorStoreKeyField(name="id"), - VectorStoreDataField(name="content"), - VectorStoreVectorField(dimensions=5, name="vector"), + VectorStoreField("key", name="id"), + VectorStoreField("data", name="content"), + VectorStoreField("vector", dimensions=5, name="vector"), ], container_mode=True, serialize=serialize, @@ -199,15 +197,17 @@ def data_model_pandas_definition() -> object: return VectorStoreCollectionDefinition( fields=[ - VectorStoreVectorField( + VectorStoreField( + "vector", name="vector", index_kind="hnsw", dimensions=5, distance_function="cosine_similarity", type="float", ), - VectorStoreKeyField(name="id"), - VectorStoreDataField( + VectorStoreField("key", name="id"), + VectorStoreField( + "data", name="content", type="str", ), @@ -224,9 +224,9 @@ def record_type_vanilla(): class DataModelClass: def __init__( self, - content: Annotated[str, VectorStoreDataField()], - id: Annotated[str, VectorStoreKeyField()], - vector: Annotated[list[float] | str | None, VectorStoreVectorField(dimensions=5)] = None, + content: Annotated[str, VectorStoreField("data")], + id: Annotated[str, VectorStoreField("key")], + vector: Annotated[list[float] | str | None, VectorStoreField("vector", dimensions=5)] = None, ): self.content = content self.vector = vector @@ -244,11 +244,12 @@ def record_type_vector_array(): class DataModelClass: def __init__( self, - id: Annotated[str, VectorStoreKeyField()], - content: Annotated[str, VectorStoreDataField()], + 
id: Annotated[str, VectorStoreField("key")], + content: Annotated[str, VectorStoreField("data")], vector: Annotated[ list[float] | str | None, - VectorStoreVectorField( + VectorStoreField( + "vector", dimensions=5, ), ] = None, @@ -269,9 +270,9 @@ def record_type_vanilla_serialize(): class DataModelClass: def __init__( self, - id: Annotated[str, VectorStoreKeyField()], - content: Annotated[str, VectorStoreDataField()], - vector: Annotated[list[float] | str | None, VectorStoreVectorField(dimensions=5)] = None, + id: Annotated[str, VectorStoreField("key")], + content: Annotated[str, VectorStoreField("data")], + vector: Annotated[list[float] | str | None, VectorStoreField("vector", dimensions=5)] = None, ): self.content = content self.vector = vector @@ -298,9 +299,9 @@ def record_type_vanilla_to_from_dict(): class DataModelClass: def __init__( self, - id: Annotated[str, VectorStoreKeyField()], - content: Annotated[str, VectorStoreDataField()], - vector: Annotated[str | list[float] | None, VectorStoreVectorField(dimensions=5)] = None, + id: Annotated[str, VectorStoreField("key")], + content: Annotated[str, VectorStoreField("data")], + vector: Annotated[str | list[float] | None, VectorStoreField("vector", dimensions=5)] = None, ): self.content = content self.vector = vector @@ -325,9 +326,9 @@ def __eq__(self, other) -> bool: def record_type_pydantic(): @vectorstoremodel class DataModelClass(BaseModel): - content: Annotated[str, VectorStoreDataField()] - id: Annotated[str, VectorStoreKeyField()] - vector: Annotated[str | list[float] | None, VectorStoreVectorField(dimensions=5)] = None + content: Annotated[str, VectorStoreField("data")] + id: Annotated[str, VectorStoreField("key")] + vector: Annotated[str | list[float] | None, VectorStoreField("vector", dimensions=5)] = None return DataModelClass @@ -337,9 +338,9 @@ def record_type_dataclass(): @vectorstoremodel @dataclass class DataModelClass: - content: Annotated[str, VectorStoreDataField()] - id: Annotated[str, 
VectorStoreKeyField()] - vector: Annotated[list[float] | str | None, VectorStoreVectorField(dimensions=5)] = None + content: Annotated[str, VectorStoreField("data")] + id: Annotated[str, VectorStoreField("key")] + vector: Annotated[list[float] | str | None, VectorStoreField("vector", dimensions=5)] = None return DataModelClass diff --git a/python/tests/unit/data/test_vector_store_model_decorator.py b/python/tests/unit/data/test_vector_store_model_decorator.py index f621c84797b1..3e6cbdccf3e7 100644 --- a/python/tests/unit/data/test_vector_store_model_decorator.py +++ b/python/tests/unit/data/test_vector_store_model_decorator.py @@ -9,12 +9,7 @@ from pydantic.dataclasses import dataclass as pydantic_dataclass from pytest import raises -from semantic_kernel.data import ( - VectorStoreCollectionDefinition, - VectorStoreDataField, - VectorStoreKeyField, - VectorStoreVectorField, -) +from semantic_kernel.data import VectorStoreCollectionDefinition, VectorStoreField from semantic_kernel.data.definitions import vectorstoremodel from semantic_kernel.exceptions import VectorStoreModelException @@ -28,12 +23,12 @@ def test_vanilla(): class DataModelClassVanilla: def __init__( self, - content: Annotated[str, VectorStoreDataField()], - content2: Annotated[str, VectorStoreDataField], - vector: Annotated[list[float], VectorStoreVectorField(dimensions=5)], - id: Annotated[str, VectorStoreKeyField()], + content: Annotated[str, VectorStoreField("data")], + content2: Annotated[str, VectorStoreField("data")], + vector: Annotated[list[float], VectorStoreField("vector", dimensions=5)], + id: Annotated[str, VectorStoreField("key")], non_vector_store_content: str | None = None, - optional_content: Annotated[str | None, VectorStoreDataField()] = None, + optional_content: Annotated[str | None, VectorStoreField("data")] = None, annotated_content: Annotated[str | None, "description"] = None, ): self.content = content @@ -66,8 +61,8 @@ def test_vanilla_2(): class DataModelClassVanilla2: def 
__init__( self, - content: Annotated[str, VectorStoreDataField()], - id: Annotated[str, VectorStoreKeyField()], + content: Annotated[str, VectorStoreField("data")], + id: Annotated[str, VectorStoreField("key")], ): self.content = content self.id = id @@ -82,12 +77,12 @@ def test_dataclass(): @vectorstoremodel @dataclass class DataModelClassDataclass: - content: Annotated[str, VectorStoreDataField()] - content2: Annotated[str, VectorStoreDataField] - vector: Annotated[list[float], VectorStoreVectorField(dimensions=5)] - id: Annotated[str, VectorStoreKeyField()] + content: Annotated[str, VectorStoreField("data")] + content2: Annotated[str, VectorStoreField("data")] + vector: Annotated[list[float], VectorStoreField("vector", dimensions=5)] + id: Annotated[str, VectorStoreField("key")] non_vector_store_content: str | None = None - optional_content: Annotated[str | None, VectorStoreDataField()] = None + optional_content: Annotated[str | None, VectorStoreField("data")] = None annotated_content: Annotated[str | None, "description"] = None assert hasattr(DataModelClassDataclass, "__kernel_vectorstoremodel__") @@ -113,19 +108,19 @@ def test_dataclass_inverse_fail(): @dataclass @vectorstoremodel class DataModelClass: - id: Annotated[str, VectorStoreKeyField()] - content: Annotated[str, VectorStoreDataField()] + id: Annotated[str, VectorStoreField("key")] + content: Annotated[str, VectorStoreField("data")] def test_pydantic_base_model(): @vectorstoremodel class DataModelClassPydantic(BaseModel): - content: Annotated[str, VectorStoreDataField()] - content2: Annotated[str, VectorStoreDataField] - vector: Annotated[list[float], VectorStoreVectorField(dimensions=5)] - id: Annotated[str, VectorStoreKeyField()] + content: Annotated[str, VectorStoreField("data")] + content2: Annotated[str, VectorStoreField("data")] + vector: Annotated[list[float], VectorStoreField("vector", dimensions=5)] + id: Annotated[str, VectorStoreField("key")] non_vector_store_content: str | None = None - 
optional_content: Annotated[str | None, VectorStoreDataField()] = None + optional_content: Annotated[str | None, VectorStoreField("data")] = None annotated_content: Annotated[str | None, "description"] = None assert hasattr(DataModelClassPydantic, "__kernel_vectorstoremodel__") @@ -149,12 +144,12 @@ def test_pydantic_dataclass(): @vectorstoremodel @pydantic_dataclass class DataModelClassPydanticDataclass: - content: Annotated[str, VectorStoreDataField()] - content2: Annotated[str, VectorStoreDataField] - vector: Annotated[list[float], VectorStoreVectorField(dimensions=5)] - id: Annotated[str, VectorStoreKeyField()] + content: Annotated[str, VectorStoreField("data")] + content2: Annotated[str, VectorStoreField("data")] + vector: Annotated[list[float], VectorStoreField("vector", dimensions=5)] + id: Annotated[str, VectorStoreField("key")] non_vector_store_content: str | None = None - optional_content: Annotated[str | None, VectorStoreDataField()] = None + optional_content: Annotated[str | None, VectorStoreField("data")] = None annotated_content: Annotated[str | None, "description"] = None assert hasattr(DataModelClassPydanticDataclass, "__kernel_vectorstoremodel__") @@ -208,13 +203,13 @@ def test_non_vector_list_and_dict(): @vectorstoremodel @dataclass class DataModelClassListDict: - key: Annotated[str, VectorStoreKeyField()] - list1: Annotated[list[int], VectorStoreDataField()] - list2: Annotated[list[str], VectorStoreDataField] - list3: Annotated[list[str] | None, VectorStoreDataField] - dict1: Annotated[dict[str, int], VectorStoreDataField()] - dict2: Annotated[dict[str, str], VectorStoreDataField] - dict3: Annotated[dict[str, str] | None, VectorStoreDataField] + key: Annotated[str, VectorStoreField("key")] + list1: Annotated[list[int], VectorStoreField("data")] + list2: Annotated[list[str], VectorStoreField("data")] + list3: Annotated[list[str] | None, VectorStoreField("data")] + dict1: Annotated[dict[str, int], VectorStoreField("data")] + dict2: 
Annotated[dict[str, str], VectorStoreField("data")] + dict3: Annotated[dict[str, str] | None, VectorStoreField("data")] assert hasattr(DataModelClassListDict, "__kernel_vectorstoremodel__") assert hasattr(DataModelClassListDict, "__kernel_vectorstoremodel_definition__") @@ -239,12 +234,12 @@ def test_vector_fields_checks(): @vectorstoremodel class DataModelClassVectorFields(BaseModel): model_config = ConfigDict(arbitrary_types_allowed=True) - id: Annotated[str, VectorStoreKeyField()] - vector_str: Annotated[str, VectorStoreVectorField(dimensions=5)] - vector_list: Annotated[list[float], VectorStoreVectorField(dimensions=5)] + id: Annotated[str, VectorStoreField("key")] + vector_str: Annotated[str, VectorStoreField("vector", dimensions=5)] + vector_list: Annotated[list[float], VectorStoreField("vector", dimensions=5)] vector_array: Annotated[ ndarray, - VectorStoreVectorField(dimensions=5), + VectorStoreField("vector", dimensions=5), ] assert hasattr(DataModelClassVectorFields, "__kernel_vectorstoremodel__") diff --git a/python/tests/unit/data/test_vector_store_record_definition.py b/python/tests/unit/data/test_vector_store_record_definition.py index 60bb4b64d5a7..7f88bcfffa4c 100644 --- a/python/tests/unit/data/test_vector_store_record_definition.py +++ b/python/tests/unit/data/test_vector_store_record_definition.py @@ -1,17 +1,15 @@ # Copyright (c) Microsoft. All rights reserved. 
-from pydantic import ValidationError from pytest import raises -from semantic_kernel.data import VectorStoreCollectionDefinition, VectorStoreDataField, VectorStoreKeyField -from semantic_kernel.data.definitions import VectorStoreVectorField +from semantic_kernel.data import VectorStoreCollectionDefinition, VectorStoreField from semantic_kernel.exceptions import VectorStoreModelException def test_vector_store_record_definition(): - id_field = VectorStoreKeyField(name="id") + id_field = VectorStoreField("key", name="id") vsrd = VectorStoreCollectionDefinition(fields=[id_field]) - assert vsrd.fields == [VectorStoreKeyField(name="id")] + assert vsrd.fields == [VectorStoreField("key", name="id")] assert vsrd.key_name == "id" assert vsrd.key_field == id_field assert vsrd.names == ["id"] @@ -29,28 +27,30 @@ def test_no_fields_fail(): def test_no_name_fields_fail(): - with raises(ValidationError): - VectorStoreCollectionDefinition(fields=[VectorStoreKeyField(name=None)]) # type: ignore with raises(VectorStoreModelException): - VectorStoreCollectionDefinition(fields=[VectorStoreKeyField(name="")]) + VectorStoreCollectionDefinition(fields=[VectorStoreField("key", name=None)]) + with raises(VectorStoreModelException): + VectorStoreCollectionDefinition(fields=[VectorStoreField("key", name="")]) def test_no_key_field_fail(): with raises(VectorStoreModelException): - VectorStoreCollectionDefinition(fields=[VectorStoreDataField(name="content")]) + VectorStoreCollectionDefinition(fields=[VectorStoreField("data", name="content")]) def test_multiple_key_field_fail(): with raises(VectorStoreModelException): - VectorStoreCollectionDefinition(fields=[VectorStoreKeyField(name="key1"), VectorStoreKeyField(name="key2")]) + VectorStoreCollectionDefinition( + fields=[VectorStoreField("key", name="key1"), VectorStoreField("key", name="key2")] + ) def test_vector_and_non_vector_field_names(): definition = VectorStoreCollectionDefinition( fields=[ - VectorStoreKeyField(name="id"), - 
VectorStoreDataField(name="content"), - VectorStoreVectorField(name="vector", dimensions=5), + VectorStoreField("key", name="id"), + VectorStoreField("data", name="content"), + VectorStoreField("vector", name="vector", dimensions=5), ] ) assert definition.vector_field_names == ["vector"] @@ -60,9 +60,9 @@ def test_vector_and_non_vector_field_names(): def test_try_get_vector_field(): definition = VectorStoreCollectionDefinition( fields=[ - VectorStoreKeyField(name="id"), - VectorStoreDataField(name="content"), - VectorStoreVectorField(name="vector", dimensions=5), + VectorStoreField("key", name="id"), + VectorStoreField("data", name="content"), + VectorStoreField("vector", name="vector", dimensions=5), ] ) assert definition.try_get_vector_field() == definition.fields[2] @@ -72,8 +72,8 @@ def test_try_get_vector_field(): def test_try_get_vector_field_none(): definition = VectorStoreCollectionDefinition( fields=[ - VectorStoreKeyField(name="id"), - VectorStoreDataField(name="content"), + VectorStoreField("key", name="id"), + VectorStoreField("data", name="content"), ] ) assert definition.try_get_vector_field() is None @@ -84,8 +84,8 @@ def test_try_get_vector_field_none(): def test_try_get_vector_field_wrong_name_fail(): definition = VectorStoreCollectionDefinition( fields=[ - VectorStoreKeyField(name="id"), - VectorStoreDataField(name="content"), + VectorStoreField("key", name="id"), + VectorStoreField("data", name="content"), ] ) with raises(VectorStoreModelException, match="Field content is not a vector field."): @@ -95,9 +95,9 @@ def test_try_get_vector_field_wrong_name_fail(): def test_get_field_names(): definition = VectorStoreCollectionDefinition( fields=[ - VectorStoreKeyField(name="id"), - VectorStoreDataField(name="content"), - VectorStoreVectorField(name="vector", dimensions=5), + VectorStoreField("key", name="id"), + VectorStoreField("data", name="content"), + VectorStoreField("vector", name="vector", dimensions=5), ] ) assert definition.get_names() == 
["id", "content", "vector"] From 7304908320f3ffdfcdfe967f7655f27bbf435b00 Mon Sep 17 00:00:00 2001 From: eavanvalkenburg Date: Fri, 23 May 2025 15:18:34 +0200 Subject: [PATCH 2/3] fixed tests --- python/semantic_kernel/data/definitions.py | 2 +- .../unit/data/test_vector_store_record_collection.py | 12 ++++++------ 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/python/semantic_kernel/data/definitions.py b/python/semantic_kernel/data/definitions.py index 7ff1102fbb0b..af8ac66d7fcc 100644 --- a/python/semantic_kernel/data/definitions.py +++ b/python/semantic_kernel/data/definitions.py @@ -390,7 +390,7 @@ def model_post_init(self, _: Any): "There must be at least one field with a VectorStoreRecordField annotation." ) for field in self.fields: - if field.name == "": + if not field.name or field.name == "": raise VectorStoreModelException("Field names must not be empty.") if field.field_type == "key": if self.key_name != "": diff --git a/python/tests/unit/data/test_vector_store_record_collection.py b/python/tests/unit/data/test_vector_store_record_collection.py index 656008766de9..12bc3868faeb 100644 --- a/python/tests/unit/data/test_vector_store_record_collection.py +++ b/python/tests/unit/data/test_vector_store_record_collection.py @@ -322,7 +322,7 @@ async def test_serialize_in_upsert_fail(DictVectorStoreRecordCollection, definit await vector_store_record_collection.upsert([record]) -def test_serialize_record_type_serialize_fail(DictVectorStoreRecordCollection, record_type_vanilla_serialize): +async def test_serialize_record_type_serialize_fail(DictVectorStoreRecordCollection, record_type_vanilla_serialize): vector_store_record_collection = DictVectorStoreRecordCollection( collection_name="test", record_type=record_type_vanilla_serialize, @@ -330,7 +330,7 @@ def test_serialize_record_type_serialize_fail(DictVectorStoreRecordCollection, r record = MagicMock(spec=SerializeMethodProtocol) record.serialize = MagicMock(side_effect=Exception) with 
raises(VectorStoreModelSerializationException, match="Error serializing record"): - vector_store_record_collection.serialize(record) + await vector_store_record_collection.serialize(record) def test_serialize_data_model_to_dict_fail_object(DictVectorStoreRecordCollection, record_type_vanilla): @@ -344,7 +344,7 @@ def test_serialize_data_model_to_dict_fail_object(DictVectorStoreRecordCollectio @mark.parametrize("vector_store_record_collection", ["type_pydantic"], indirect=True) -def test_pydantic_serialize_fail(vector_store_record_collection): +async def test_pydantic_serialize_fail(vector_store_record_collection): id = "test_id" model = deepcopy(vector_store_record_collection.record_type) model.model_dump = MagicMock(side_effect=Exception) @@ -352,15 +352,15 @@ def test_pydantic_serialize_fail(vector_store_record_collection): dict_record = {"id": id, "content": "test_content", "vector": [1.0, 2.0, 3.0]} record = model(**dict_record) with raises(VectorStoreModelSerializationException, match="Error serializing record"): - vector_store_record_collection.serialize(record) + await vector_store_record_collection.serialize(record) @mark.parametrize("vector_store_record_collection", ["type_vanilla_with_to_from_dict"], indirect=True) -def test_to_dict_fail(vector_store_record_collection): +async def test_to_dict_fail(vector_store_record_collection): record = MagicMock(spec=ToDictMethodProtocol) record.to_dict = MagicMock(side_effect=Exception) with raises(VectorStoreModelSerializationException, match="Error serializing record"): - vector_store_record_collection.serialize(record) + await vector_store_record_collection.serialize(record) # region Deserialize From 01e70b5d6b203b269f51dd17235cd247b616cce2 Mon Sep 17 00:00:00 2001 From: eavanvalkenburg Date: Mon, 26 May 2025 09:41:21 +0200 Subject: [PATCH 3/3] added enum --- .../connectors/memory/azure_ai_search.py | 10 ++-- .../connectors/memory/azure_cosmos_db.py | 12 ++--- .../connectors/memory/postgres.py | 8 +-- 
.../connectors/memory/redis.py | 22 ++++---- python/semantic_kernel/data/__init__.py | 8 ++- python/semantic_kernel/data/definitions.py | 52 ++++++++++++------- python/semantic_kernel/data/vectors.py | 10 +++- 7 files changed, 74 insertions(+), 48 deletions(-) diff --git a/python/semantic_kernel/connectors/memory/azure_ai_search.py b/python/semantic_kernel/connectors/memory/azure_ai_search.py index ea25c749069d..de041b5c502c 100644 --- a/python/semantic_kernel/connectors/memory/azure_ai_search.py +++ b/python/semantic_kernel/connectors/memory/azure_ai_search.py @@ -30,7 +30,7 @@ from semantic_kernel.connectors.ai.embedding_generator_base import EmbeddingGeneratorBase from semantic_kernel.data.const import DistanceFunction, IndexKind -from semantic_kernel.data.definitions import VectorStoreCollectionDefinition +from semantic_kernel.data.definitions import FieldTypes, VectorStoreCollectionDefinition from semantic_kernel.data.search import KernelSearchResults from semantic_kernel.data.vectors import ( GetFilteredRecordOptions, @@ -201,7 +201,7 @@ def _definition_to_azure_ai_search_index( search_algos = [] for field in definition.fields: - if field.field_type == "data": + if field.field_type == FieldTypes.DATA: if not field.type_: logger.debug(f"Field {field.name} has not specified type, defaulting to Edm.String.") if field.type_ and field.type_ not in TYPE_MAP_DATA: @@ -227,7 +227,7 @@ def _definition_to_azure_ai_search_index( hidden=False, ) ) - elif field.field_type == "key": + elif field.field_type == FieldTypes.KEY: fields.append( SimpleField( name=field.storage_name or field.name, @@ -237,7 +237,7 @@ def _definition_to_azure_ai_search_index( searchable=True, ) ) - elif field.field_type == "vector": + elif field.field_type == FieldTypes.VECTOR: if not field.type_: logger.debug(f"Field {field.name} has not specified type, defaulting to Collection(Edm.Single).") if field.index_kind not in INDEX_ALGORITHM_MAP: @@ -558,7 +558,7 @@ async def _inner_search( else [ 
field.name for field in self.definition.fields - if field.field_type == "data" and field.is_full_text_indexed + if field.field_type == FieldTypes.DATA and field.is_full_text_indexed ] ) if not search_args["search_fields"]: diff --git a/python/semantic_kernel/connectors/memory/azure_cosmos_db.py b/python/semantic_kernel/connectors/memory/azure_cosmos_db.py index 32f46a6a1f8c..6ce200dc3b4f 100644 --- a/python/semantic_kernel/connectors/memory/azure_cosmos_db.py +++ b/python/semantic_kernel/connectors/memory/azure_cosmos_db.py @@ -24,7 +24,7 @@ MongoDBAtlasStore, ) from semantic_kernel.data.const import DistanceFunction, IndexKind -from semantic_kernel.data.definitions import VectorStoreCollectionDefinition +from semantic_kernel.data.definitions import FieldTypes, VectorStoreCollectionDefinition from semantic_kernel.data.search import KernelSearchResults from semantic_kernel.data.vectors import ( GetFilteredRecordOptions, @@ -137,10 +137,10 @@ def _create_default_indexing_policy_nosql(definition: VectorStoreCollectionDefin } for field in definition.fields: - if field.field_type == "data" and (not field.is_full_text_indexed and not field.is_indexed): + if field.field_type == FieldTypes.DATA and (not field.is_full_text_indexed and not field.is_indexed): indexing_policy["excludedPaths"].append({"path": f'/"{field.storage_name or field.name}"/*'}) - if field.field_type == "vector": + if field.field_type == FieldTypes.VECTOR: if field.index_kind not in INDEX_KIND_MAP_NOSQL: raise VectorStoreModelException( f"Index kind '{field.index_kind}' is not supported by Azure Cosmos DB NoSQL container." 
@@ -173,7 +173,7 @@ def _create_default_vector_embedding_policy(definition: VectorStoreCollectionDef vector_embedding_policy: dict[str, Any] = {"vectorEmbeddings": []} for field in definition.fields: - if field.field_type == "vector": + if field.field_type == FieldTypes.VECTOR: if field.distance_function not in DISTANCE_FUNCTION_MAP_NOSQL: raise VectorStoreModelException( f"Distance function '{field.distance_function}' is not supported by Azure Cosmos DB NoSQL." @@ -384,7 +384,7 @@ def _get_index_definitions(self, **kwargs: Any) -> dict[str, Any]: indexes = [ { "name": f"{field.storage_name or field.name}_", - "key": {field.storage_name or field.name: 1}, + "key": {field.storage_name or field.name: 1}, } for field in self.definition.data_fields if field.is_indexed or field.is_full_text_indexed @@ -402,7 +402,7 @@ def _get_index_definitions(self, **kwargs: Any) -> dict[str, Any]: index_kind = DISTANCE_FUNCTION_MAP_MONGODB[field.distance_function] index: dict[str, Any] = { "name": index_name, - "key": {field.storage_name or field.name: "cosmosSearch"}, + "key": {field.storage_name or field.name: "cosmosSearch"}, "cosmosSearchOptions": { "kind": index_kind, "similarity": DISTANCE_FUNCTION_MAP_MONGODB[field.distance_function], diff --git a/python/semantic_kernel/connectors/memory/postgres.py b/python/semantic_kernel/connectors/memory/postgres.py index 4e1bf55ce356..2e2edb9187be 100644 --- a/python/semantic_kernel/connectors/memory/postgres.py +++ b/python/semantic_kernel/connectors/memory/postgres.py @@ -18,7 +18,7 @@ from semantic_kernel.connectors.ai.embedding_generator_base import EmbeddingGeneratorBase from semantic_kernel.data.const import DistanceFunction, IndexKind -from semantic_kernel.data.definitions import VectorStoreCollectionDefinition, VectorStoreField +from semantic_kernel.data.definitions import FieldTypes, VectorStoreCollectionDefinition, VectorStoreField from semantic_kernel.data.search import KernelSearchResults from 
semantic_kernel.data.vectors import ( GetFilteredRecordOptions, @@ -152,7 +152,7 @@ def _convert_row_to_dict(row: tuple[Any, ...], fields: Sequence[tuple[str, Vecto def _convert(v: Any | None, field: VectorStoreField | None) -> Any | None: if v is None: return None - if field and field.field_type == "vector" and isinstance(v, str): + if field and field.field_type == FieldTypes.VECTOR and isinstance(v, str): # psycopg returns vector as a string if pgvector is not loaded. # If pgvector is registered with the connection, no conversion is required. return json.loads(v) @@ -579,14 +579,14 @@ async def create_collection(self, **kwargs: Any) -> None: # For Vector fields with dimensions, use pgvector's VECTOR type # Note that other vector types are supported in pgvector (e.g. halfvec), # but would need to be created outside of this method. - if field.field_type == "vector": + if field.field_type == FieldTypes.VECTOR: column_definitions.append( sql.SQL("{name} VECTOR({dimensions})").format( name=sql.Identifier(field.storage_name or field.name), dimensions=sql.Literal(field.dimensions), ) ) - elif field.field_type == "key": + elif field.field_type == FieldTypes.KEY: # Use the property_type directly for key fields column_definitions.append( sql.SQL("{name} {col_type} PRIMARY KEY").format( diff --git a/python/semantic_kernel/connectors/memory/redis.py b/python/semantic_kernel/connectors/memory/redis.py index e17d6d7a01b7..61a7472529ff 100644 --- a/python/semantic_kernel/connectors/memory/redis.py +++ b/python/semantic_kernel/connectors/memory/redis.py @@ -25,7 +25,7 @@ from semantic_kernel.connectors.ai.embedding_generator_base import EmbeddingGeneratorBase from semantic_kernel.data.const import DistanceFunction, IndexKind -from semantic_kernel.data.definitions import VectorStoreCollectionDefinition, VectorStoreField +from semantic_kernel.data.definitions import FieldTypes, VectorStoreCollectionDefinition, VectorStoreField from semantic_kernel.data.search import 
KernelSearchResults from semantic_kernel.data.vectors import ( GetFilteredRecordOptions, @@ -94,7 +94,7 @@ class RedisCollectionTypes(str, Enum): def _field_to_redis_field_hashset(name: str, field: VectorStoreField) -> RedisField: - if field.field_type == "vector": + if field.field_type == FieldTypes.VECTOR: if field.distance_function not in DISTANCE_FUNCTION_MAP: raise VectorStoreOperationException( f"Distance function {field.distance_function} is not supported. " @@ -121,7 +121,7 @@ def _field_to_redis_field_hashset(name: str, field: VectorStoreField) -> RedisFi def _field_to_redis_field_json(name: str, field: VectorStoreField) -> RedisField: - if field.field_type == "vector": + if field.field_type == FieldTypes.VECTOR: if field.distance_function not in DISTANCE_FUNCTION_MAP: raise VectorStoreOperationException( f"Distance function {field.distance_function} is not supported. " @@ -154,7 +154,7 @@ def _definition_to_redis_fields( """Create a list of fields for Redis from a definition.""" fields: list[RedisField] = [] for field in definition.fields: - if field.field_type == "key": + if field.field_type == FieldTypes.KEY: continue if collection_type == RedisCollectionTypes.HASHSET: fields.append(_field_to_redis_field_hashset(field.storage_name or field.name, field)) # type: ignore @@ -371,13 +371,13 @@ def get_field_expr(field_name): ) if field is None: raise VectorStoreOperationException(f"Field '{field_name}' not found in data model.") - if field.field_type == "data": + if field.field_type == FieldTypes.DATA: if field.is_full_text_indexed: return lambda: Text(field_name) if field.type_ in ("int", "float"): return lambda: Num(field_name) return lambda: Tag(field_name) - if field.field_type == "vector": + if field.field_type == FieldTypes.VECTOR: raise VectorStoreOperationException(f"Cannot filter on vector field '{field_name}'.") return lambda: Tag(field_name) @@ -588,11 +588,11 @@ def _serialize_dicts_to_store_models( for record in records: result: dict[str, Any] 
= {"mapping": {}} for field in self.definition.fields: - if field.field_type == "vector": + if field.field_type == FieldTypes.VECTOR: dtype = DATATYPE_MAP_VECTOR[field.type_ or "default"].lower() result["mapping"][field.storage_name or field.name] = array_to_buffer(record[field.name], dtype) continue - if field.field_type == "key": + if field.field_type == FieldTypes.KEY: result["name"] = self._get_redis_key(record[field.name]) continue result["mapping"][field.storage_name or field.name] = record[field.name] @@ -610,7 +610,7 @@ def _deserialize_store_models_to_dicts( rec = record.copy() for field in self.definition.fields: match field.field_type: - case "key": + case FieldTypes.KEY: rec[field.name] = self._unget_redis_key(rec[field.name]) case "vector": dtype = DATATYPE_MAP_VECTOR[field.type_ or "default"] @@ -716,7 +716,7 @@ def _serialize_dicts_to_store_models( for record in records: result: dict[str, Any] = {"value": {}} for field in self.definition.fields: - if field.field_type == "key": + if field.field_type == FieldTypes.KEY: result["name"] = self._get_redis_key(record[field.name]) continue if field.field_type == "vector": @@ -743,7 +743,7 @@ def _add_return_fields(self, query: TQuery, include_vectors: bool) -> TQuery: """Add the return fields to the query.""" for field in self.definition.fields: match field.field_type: - case "vector": + case FieldTypes.VECTOR: if include_vectors: query.return_field(field.name) case _: diff --git a/python/semantic_kernel/data/__init__.py b/python/semantic_kernel/data/__init__.py index 46a0c476bc6e..f7c5854f66e0 100644 --- a/python/semantic_kernel/data/__init__.py +++ b/python/semantic_kernel/data/__init__.py @@ -8,7 +8,12 @@ DistanceFunction, IndexKind, ) -from semantic_kernel.data.definitions import VectorStoreCollectionDefinition, VectorStoreField, vectorstoremodel +from semantic_kernel.data.definitions import ( + FieldTypes, + VectorStoreCollectionDefinition, + VectorStoreField, + vectorstoremodel, +) from 
semantic_kernel.data.search import ( DynamicFilterFunction, KernelSearchResults, @@ -25,6 +30,7 @@ "DISTANCE_FUNCTION_DIRECTION_HELPER", "DistanceFunction", "DynamicFilterFunction", + "FieldTypes", "IndexKind", "KernelSearchResults", "TextSearch", diff --git a/python/semantic_kernel/data/definitions.py b/python/semantic_kernel/data/definitions.py index af8ac66d7fcc..4d12401684a7 100644 --- a/python/semantic_kernel/data/definitions.py +++ b/python/semantic_kernel/data/definitions.py @@ -3,6 +3,7 @@ import logging from collections.abc import Sequence from dataclasses import dataclass +from enum import Enum from inspect import Parameter, _empty, signature from types import MappingProxyType, NoneType from typing import Annotated, Any, Literal, Protocol, TypeVar, overload, runtime_checkable @@ -21,12 +22,25 @@ # region: Fields +@release_candidate +class FieldTypes(str, Enum): + """Enumeration for field types in vector store models.""" + + KEY = "key" + VECTOR = "vector" + DATA = "data" + + def __str__(self) -> str: + """Return the string representation of the enum.""" + return self.value + + @release_candidate @dataclass class VectorStoreField: """Vector store fields.""" - field_type: Literal["key", "data", "vector"] = "data" + field_type: Literal[FieldTypes.DATA, FieldTypes.KEY, FieldTypes.VECTOR] = FieldTypes.DATA name: str = "" storage_name: str | None = None type_: str | None = None @@ -43,7 +57,7 @@ class VectorStoreField: @overload def __init__( self, - field_type: Literal["key"] = "key", + field_type: Literal[FieldTypes.KEY, "key"] = FieldTypes.KEY, # type: ignore[assignment] *, name: str | None = None, type: str | None = None, @@ -64,7 +78,7 @@ def __init__( @overload def __init__( self, - field_type: Literal["data"] = "data", + field_type: Literal[FieldTypes.DATA, "data"] = FieldTypes.DATA, # type: ignore[assignment] *, name: str | None = None, type: str | None = None, @@ -87,7 +101,7 @@ def __init__( @overload def __init__( self, - field_type: 
Literal["vector"] = "vector", + field_type: Literal[FieldTypes.VECTOR, "vector"] = FieldTypes.VECTOR, # type: ignore[assignment] *, name: str | None = None, type: str | None = None, @@ -134,7 +148,7 @@ def __init__( def __init__( self, - field_type="data", + field_type=FieldTypes.DATA, *, name=None, type=None, @@ -147,7 +161,7 @@ def __init__( embedding_generator=None, ): """Vector store field.""" - self.field_type = field_type + self.field_type = field_type if isinstance(field_type, FieldTypes) else FieldTypes(field_type) # when a field is created, the name can be empty, # when a field get's added to a definition, the name needs to be there. self.name = name @@ -299,22 +313,22 @@ def key_field_storage_name(self) -> str: @property def vector_fields(self) -> list[VectorStoreField]: """Get the names of the vector fields.""" - return [field for field in self.fields if field.field_type == "vector"] + return [field for field in self.fields if field.field_type == FieldTypes.VECTOR] @property def data_fields(self) -> list[VectorStoreField]: """Get the names of the data fields.""" - return [field for field in self.fields if field.field_type == "data"] + return [field for field in self.fields if field.field_type == FieldTypes.DATA] @property def vector_field_names(self) -> list[str]: """Get the names of the vector fields.""" - return [field.name for field in self.fields if field.field_type == "vector"] + return [field.name for field in self.fields if field.field_type == FieldTypes.VECTOR] @property def data_field_names(self) -> list[str]: """Get the names of all the data fields.""" - return [field.name for field in self.fields if field.field_type == "data"] + return [field.name for field in self.fields if field.field_type == FieldTypes.DATA] def try_get_vector_field(self, field_name: str | None = None) -> VectorStoreField | None: """Try to get the vector field. 
@@ -334,7 +348,7 @@ def try_get_vector_field(self, field_name: str | None = None) -> VectorStoreFiel return self.vector_fields[0] for field in self.fields: if field.name == field_name or field.storage_name == field_name: - if field.field_type == "vector": + if field.field_type == FieldTypes.VECTOR: return field raise VectorStoreModelException( f"Field {field_name} is not a vector field, it is of type {type(field).__name__}." @@ -354,9 +368,9 @@ def get_storage_names(self, include_vector_fields: bool = True, include_key_fiel return [ field.storage_name or field.name for field in self.fields - if field.field_type == "data" - or (field.field_type == "vector" and include_vector_fields) - or (field.field_type == "key" and include_key_field) + if field.field_type == FieldTypes.DATA + or (field.field_type == FieldTypes.VECTOR and include_vector_fields) + or (field.field_type == FieldTypes.KEY and include_key_field) ] def get_names(self, include_vector_fields: bool = True, include_key_field: bool = True) -> list[str]: @@ -372,9 +386,9 @@ def get_names(self, include_vector_fields: bool = True, include_key_field: bool return [ field.name for field in self.fields - if field.field_type == "data" - or (field.field_type == "vector" and include_vector_fields) - or (field.field_type == "key" and include_key_field) + if field.field_type == FieldTypes.DATA + or (field.field_type == FieldTypes.VECTOR and include_vector_fields) + or (field.field_type == FieldTypes.KEY and include_key_field) ] def model_post_init(self, _: Any): @@ -392,7 +406,7 @@ def model_post_init(self, _: Any): for field in self.fields: if not field.name or field.name == "": raise VectorStoreModelException("Field names must not be empty.") - if field.field_type == "key": + if field.field_type == FieldTypes.KEY: if self.key_name != "": raise VectorStoreModelException("Memory record definition must have exactly one key field.") self.key_name = field.name @@ -408,7 +422,7 @@ def 
_parse_vector_store_record_field_instance(record_field: VectorStoreField, fi record_field.name = field.name if not record_field.type_ and hasattr(field.annotation, "__origin__"): property_type = field.annotation.__origin__ - if record_field.field_type == "vector": + if record_field.field_type == FieldTypes.VECTOR: if args := getattr(property_type, "__args__", None): if NoneType in args and len(args) > 1: for arg in args: diff --git a/python/semantic_kernel/data/vectors.py b/python/semantic_kernel/data/vectors.py index fa0bfbc2797f..ab9b7ea975f8 100644 --- a/python/semantic_kernel/data/vectors.py +++ b/python/semantic_kernel/data/vectors.py @@ -17,7 +17,12 @@ from semantic_kernel.connectors.ai.embedding_generator_base import EmbeddingGeneratorBase from semantic_kernel.connectors.ai.prompt_execution_settings import PromptExecutionSettings from semantic_kernel.data.const import DEFAULT_DESCRIPTION, DEFAULT_FUNCTION_NAME -from semantic_kernel.data.definitions import SerializeMethodProtocol, VectorStoreCollectionDefinition, VectorStoreField +from semantic_kernel.data.definitions import ( + FieldTypes, + SerializeMethodProtocol, + VectorStoreCollectionDefinition, + VectorStoreField, +) from semantic_kernel.data.search import ( DynamicFilterFunction, KernelSearchResults, @@ -60,6 +65,7 @@ TSearchOptions = TypeVar("TSearchOptions", bound=SearchOptions) TFilters = TypeVar("TFilters") + # region: Helpers @@ -406,7 +412,7 @@ def _deserialize_dict_to_data_model(self, record: OneOrMany[dict[str, Any]], **k data_model_dict: dict[str, Any] = {} for field in self.definition.fields: value = record.get(field.storage_name or field.name, None) - if field.field_type == "vector" and not kwargs.get("include_vectors"): + if field.field_type == FieldTypes.VECTOR and not kwargs.get("include_vectors"): continue data_model_dict[field.name] = value if self.record_type is dict: