Python: moved vector field to new setup #12256

Merged
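In short: the separate VectorStoreKeyField, VectorStoreDataField, and VectorStoreVectorField annotations are replaced by a single VectorStoreField whose first argument names the field role ("key", "data", or "vector"), and search options such as the vector property name and result count move from a VectorSearchOptions object to keyword arguments on collection.search(). A minimal before/after sketch of the new declaration style, distilled from the updated samples below (the record class, field names, and collection name here are illustrative, not part of this PR):

# Old style (removed in this PR):
#   id:     Annotated[str, VectorStoreKeyField]
#   text:   Annotated[str, VectorStoreDataField(is_full_text_indexed=True)]
#   vector: Annotated[str | None, VectorStoreVectorField(dimensions=1536)] = None
#
# New style (single VectorStoreField with a role argument):
from dataclasses import dataclass, field
from typing import Annotated
from uuid import uuid4

from semantic_kernel.data import VectorStoreField, vectorstoremodel


@vectorstoremodel(collection_name="example-collection")  # illustrative name
@dataclass
class ExampleRecord:
    text: Annotated[str, VectorStoreField("data", is_full_text_indexed=True)]
    vector: Annotated[str | None, VectorStoreField("vector", dimensions=1536)] = None
    id: Annotated[str, VectorStoreField("key")] = field(default_factory=lambda: str(uuid4()))


# Search options become keyword arguments:
#   old: await collection.search(query, options=VectorSearchOptions(vector_property_name="vector", top=1))
#   new: await collection.search(query, vector_property_name="vector", top=1)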
20 changes: 5 additions & 15 deletions python/samples/concepts/caching/semantic_caching.py
@@ -10,15 +10,7 @@
from semantic_kernel import Kernel
from semantic_kernel.connectors.ai.open_ai import OpenAIChatCompletion, OpenAITextEmbedding
from semantic_kernel.connectors.memory.in_memory import InMemoryStore
from semantic_kernel.data import (
VectorSearchOptions,
VectorStore,
VectorStoreDataField,
VectorStoreKeyField,
VectorStoreRecordCollection,
VectorStoreVectorField,
vectorstoremodel,
)
from semantic_kernel.data import VectorStore, VectorStoreField, VectorStoreRecordCollection, vectorstoremodel
from semantic_kernel.filters import FilterTypes, FunctionInvocationContext, PromptRenderContext
from semantic_kernel.functions import FunctionResult

@@ -32,9 +24,9 @@
@vectorstoremodel(collection_name=COLLECTION_NAME)
@dataclass
class CacheRecord:
result: Annotated[str, VectorStoreDataField(is_full_text_indexed=True)]
prompt: Annotated[str | None, VectorStoreVectorField(dimensions=1536)] = None
id: Annotated[str, VectorStoreKeyField] = field(default_factory=lambda: str(uuid4()))
result: Annotated[str, VectorStoreField("data", is_full_text_indexed=True)]
prompt: Annotated[str | None, VectorStoreField("vector", dimensions=1536)] = None
id: Annotated[str, VectorStoreField("key")] = field(default_factory=lambda: str(uuid4()))


# Define the filters, one for caching the results and one for using the cache.
@@ -66,9 +58,7 @@ async def on_prompt_render(
"""
await next(context)
await self.collection.ensure_collection_exists()
results = await self.collection.search(
context.rendered_prompt, options=VectorSearchOptions(vector_property_name="prompt", top=1)
)
results = await self.collection.search(context.rendered_prompt, vector_property_name="prompt", top=1)
async for result in results.results:
if result.score and result.score < self.score_threshold:
context.function_result = FunctionResult(
@@ -11,13 +11,7 @@
from semantic_kernel.contents import ChatHistory, ChatMessageContent
from semantic_kernel.core_plugins.math_plugin import MathPlugin
from semantic_kernel.core_plugins.time_plugin import TimePlugin
from semantic_kernel.data import (
VectorStore,
VectorStoreDataField,
VectorStoreKeyField,
VectorStoreRecordCollection,
vectorstoremodel,
)
from semantic_kernel.data import VectorStore, VectorStoreField, VectorStoreRecordCollection, vectorstoremodel

"""
This sample demonstrates how to build a conversational chatbot
@@ -39,9 +33,9 @@
@vectorstoremodel
@dataclass
class ChatHistoryModel:
session_id: Annotated[str, VectorStoreKeyField]
user_id: Annotated[str, VectorStoreDataField(is_indexed=True)]
messages: Annotated[list[dict[str, str]], VectorStoreDataField(is_indexed=True)]
session_id: Annotated[str, VectorStoreField("key")]
user_id: Annotated[str, VectorStoreField("data", is_indexed=True)]
messages: Annotated[list[dict[str, str]], VectorStoreField("data", is_indexed=True)]


# 2. We then create a class that extends the ChatHistory class
@@ -15,7 +15,7 @@
)
from pydantic import BaseModel, ConfigDict

from semantic_kernel.data import VectorStoreDataField, VectorStoreKeyField, VectorStoreVectorField, vectorstoremodel
from semantic_kernel.data import VectorStoreField, vectorstoremodel

"""
The data model used for this sample is based on the hotel data model from the Azure AI Search samples.
@@ -55,29 +55,20 @@ class Address(BaseModel):

@vectorstoremodel(collection_name="hotel-index")
class HotelSampleClass(BaseModel):
HotelId: Annotated[str, VectorStoreKeyField]
HotelName: Annotated[str | None, VectorStoreDataField()] = None
Description: Annotated[
str,
VectorStoreDataField(is_full_text_indexed=True),
]
DescriptionVector: Annotated[
list[float] | str | None,
VectorStoreVectorField(dimensions=1536),
] = None
Description_fr: Annotated[str, VectorStoreDataField(is_full_text_indexed=True)]
DescriptionFrVector: Annotated[
list[float] | str | None,
VectorStoreVectorField(dimensions=1536),
] = None
Category: Annotated[str, VectorStoreDataField()]
Tags: Annotated[list[str], VectorStoreDataField(is_indexed=True)]
ParkingIncluded: Annotated[bool | None, VectorStoreDataField()] = None
LastRenovationDate: Annotated[str | None, VectorStoreDataField(type=SearchFieldDataType.DateTimeOffset)] = None
Rating: Annotated[float, VectorStoreDataField()]
Location: Annotated[dict[str, Any], VectorStoreDataField(type=SearchFieldDataType.GeographyPoint)]
Address: Annotated[Address, VectorStoreDataField()]
Rooms: Annotated[list[Rooms], VectorStoreDataField()]
HotelId: Annotated[str, VectorStoreField("key")]
HotelName: Annotated[str | None, VectorStoreField("data")] = None
Description: Annotated[str, VectorStoreField("data", is_full_text_indexed=True)]
DescriptionVector: Annotated[list[float] | str | None, VectorStoreField("vector", dimensions=1536)] = None
Description_fr: Annotated[str, VectorStoreField("data", is_full_text_indexed=True)]
DescriptionFrVector: Annotated[list[float] | str | None, VectorStoreField("vector", dimensions=1536)] = None
Category: Annotated[str, VectorStoreField("data")]
Tags: Annotated[list[str], VectorStoreField("data", is_indexed=True)]
ParkingIncluded: Annotated[bool | None, VectorStoreField("data")] = None
LastRenovationDate: Annotated[str | None, VectorStoreField("data", type=SearchFieldDataType.DateTimeOffset)] = None
Rating: Annotated[float, VectorStoreField("data")]
Location: Annotated[dict[str, Any], VectorStoreField("data", type=SearchFieldDataType.GeographyPoint)]
Address: Annotated[Address, VectorStoreField("data")]
Rooms: Annotated[list[Rooms], VectorStoreField("data")]

model_config = ConfigDict(extra="ignore")

24 changes: 12 additions & 12 deletions python/samples/concepts/memory/complex_memory.py
@@ -26,13 +26,8 @@
SqlServerCollection,
WeaviateCollection,
)
from semantic_kernel.data import (
VectorStoreDataField,
VectorStoreKeyField,
VectorStoreRecordCollection,
VectorStoreVectorField,
vectorstoremodel,
)
from semantic_kernel.data import VectorStoreRecordCollection, vectorstoremodel
from semantic_kernel.data.definitions import VectorStoreField
from semantic_kernel.data.vectors import SearchType, VectorSearch

# This is a rather complex sample, showing how to use the vector store
@@ -48,14 +43,19 @@
@vectorstoremodel(collection_name="test")
@dataclass
class DataModel:
title: Annotated[str, VectorStoreDataField(is_full_text_indexed=True)]
content: Annotated[str, VectorStoreDataField(is_full_text_indexed=True)]
title: Annotated[str, VectorStoreField("data", is_full_text_indexed=True)]
content: Annotated[str, VectorStoreField("data", is_full_text_indexed=True)]
embedding: Annotated[
str | None,
VectorStoreVectorField(dimensions=1536, type_="float"),
VectorStoreField("vector", dimensions=1536, type_="float"),
] = None
id: Annotated[str, VectorStoreKeyField()] = field(default_factory=lambda: str(uuid4()))
tag: Annotated[str | None, VectorStoreDataField(type_="str", is_indexed=True)] = None
id: Annotated[
str,
VectorStoreField(
"key",
),
] = field(default_factory=lambda: str(uuid4()))
tag: Annotated[str | None, VectorStoreField("data", type_="str", is_indexed=True)] = None

def __post_init__(self, **kwargs):
if self.embedding is None:
45 changes: 18 additions & 27 deletions python/samples/concepts/memory/data_models.py
@@ -7,22 +7,15 @@
from pandas import DataFrame
from pydantic import BaseModel, Field

from semantic_kernel.data import (
VectorStoreCollectionDefinition,
VectorStoreDataField,
VectorStoreKeyField,
VectorStoreVectorField,
vectorstoremodel,
)
from semantic_kernel.data import VectorStoreCollectionDefinition, VectorStoreField, vectorstoremodel

# This concept shows the different ways you can create a vector store data model
# using dataclasses, Pydantic, and Python classes.
# As well as using types like Pandas Dataframes.

# There are a number of universal things about these data models:
# they must specify the type of field through the annotation (or the definition).
# there must be at least one field of type VectorStoreRecordKeyField.
# If you set the embedding_property_name in the VectorStoreRecordDataField, that field must exist and be a vector field.
# there must be at least one field of type `key`.
# An unannotated field is allowed but must have a default value.

# The purpose of these models is to be what you pass to and get back from a vector store.
@@ -32,7 +25,7 @@
# so defining the key with an int might make some stores unusable.

# The decorator takes the class and pulls out the fields and annotations to create a definition,
# of type VectorStoreRecordDefinition.
# of type VectorStoreCollectionDefinition.
# This definition is used for the vector store to know how to handle the data model.

# You can also create the definition yourself, and pass it to the vector stores together with a standard type,
@@ -44,18 +37,18 @@
@vectorstoremodel
@dataclass
class DataModelDataclass:
vector: Annotated[list[float], VectorStoreVectorField]
key: Annotated[str, VectorStoreKeyField()] = field(default_factory=lambda: str(uuid4()))
content: Annotated[str, VectorStoreDataField(has_embedding=True, embedding_property_name="vector")] = "content1"
vector: Annotated[list[float] | None, VectorStoreField("vector", dimensions=3)] = None
key: Annotated[str, VectorStoreField("key")] = field(default_factory=lambda: str(uuid4()))
content: Annotated[str, VectorStoreField("data")] = "content1"
other: str | None = None


# Data model using Pydantic BaseModels
@vectorstoremodel
class DataModelPydantic(BaseModel):
id: Annotated[str, VectorStoreKeyField()] = Field(default_factory=lambda: str(uuid4()))
content: Annotated[str, VectorStoreDataField(has_embedding=True, embedding_property_name="vector")] = "content1"
vector: Annotated[list[float], VectorStoreVectorField]
id: Annotated[str, VectorStoreField("key")] = Field(default_factory=lambda: str(uuid4()))
content: Annotated[str, VectorStoreField("data")] = "content1"
vector: Annotated[list[float] | None, VectorStoreField("vector", dimensions=3)] = None
other: str | None = None


@@ -65,11 +58,9 @@ class DataModelPydantic(BaseModel):
class DataModelPython:
def __init__(
self,
vector: Annotated[list[float], VectorStoreVectorField],
key: Annotated[str, VectorStoreKeyField] = None,
content: Annotated[
str, VectorStoreDataField(has_embedding=True, embedding_property_name="vector")
] = "content1",
key: Annotated[str | None, VectorStoreField("key")] = None,
vector: Annotated[list[float] | None, VectorStoreField("vector", dimensions=3)] = None,
content: Annotated[str, VectorStoreField("data")] = "content1",
other: str | None = None,
):
self.vector = vector
Expand All @@ -88,7 +79,7 @@ def serialize(self) -> dict[str, Any]:
}

@classmethod
def deserialize(cls, obj: dict[str, Any]) -> "DataModelDataclass":
def deserialize(cls, obj: dict[str, Any]) -> "DataModelPython":
return cls(
vector=obj["vector"],
key=obj["key"],
Expand All @@ -102,11 +93,11 @@ def deserialize(cls, obj: dict[str, Any]) -> "DataModelDataclass":
# There is also a to_dict and from_dict method, which are used to convert the data model to and from a dict,
# these should be specific to the type used, if using dict as type then these can be left off.
definition_pandas = VectorStoreCollectionDefinition(
fields={
"vector": VectorStoreVectorField(type_="list[float]"),
"key": VectorStoreKeyField(type_="str"),
"content": VectorStoreDataField(type_="str", has_embedding=True, embedding_property_name="vector"),
},
fields=[
VectorStoreField("vector", name="vector", type="float", dimensions=3),
VectorStoreField("key", name="key", type="str"),
VectorStoreField("data", name="content", type="str"),
],
container_mode=True,
to_dict=lambda record, **_: record.to_dict(orient="records"),
from_dict=lambda records, **_: DataFrame(records),
79 changes: 42 additions & 37 deletions python/samples/concepts/memory/memory_with_pandas.py
@@ -5,62 +5,67 @@

import pandas as pd

from semantic_kernel import Kernel
from semantic_kernel.connectors.ai.open_ai import OpenAIEmbeddingPromptExecutionSettings, OpenAITextEmbedding
from semantic_kernel.connectors.ai.open_ai import OpenAITextEmbedding
from semantic_kernel.connectors.memory.azure_ai_search import AzureAISearchCollection
from semantic_kernel.data import (
VectorStoreCollectionDefinition,
VectorStoreDataField,
VectorStoreKeyField,
VectorStoreVectorField,
)
from semantic_kernel.data.vectors import add_vector_to_records
from semantic_kernel.data import VectorStoreCollectionDefinition, VectorStoreField

model_fields = VectorStoreCollectionDefinition(
container_mode=True,
fields={
"content": VectorStoreDataField(has_embedding=True, embedding_property_name="vector"),
"id": VectorStoreKeyField(),
"vector": VectorStoreVectorField(
embedding_settings={"embedding": OpenAIEmbeddingPromptExecutionSettings(dimensions=1536)}
definition = VectorStoreCollectionDefinition(
collection_name="pandas_test_index",
fields=[
VectorStoreField("key", name="id", type="str"),
VectorStoreField("data", name="title", type="str"),
VectorStoreField("data", name="content", type="str", is_full_text_indexed=True),
VectorStoreField(
"vector",
name="vector",
type="float",
dimensions=1536,
embedding_generator=OpenAITextEmbedding(ai_model_id="text-embedding-3-small"),
),
},
],
to_dict=lambda record, **_: record.to_dict(orient="records"),
from_dict=lambda records, **_: pd.DataFrame(records),
container_mode=True,
)


async def main():
# setup the kernel
kernel = Kernel()
kernel.add_service(OpenAITextEmbedding(service_id="embedding", ai_model_id="text-embedding-3-small"))

# create the record collection
async with AzureAISearchCollection[pd.DataFrame](
async with AzureAISearchCollection[str, pd.DataFrame](
record_type=pd.DataFrame,
definition=model_fields,
) as record_collection:
definition=definition,
) as collection:
await collection.ensure_collection_exists()
# create some records
records = [
{"id": str(uuid4()), "content": "my dict text", "vector": None},
{"id": str(uuid4()), "content": "my second text", "vector": None},
{
"id": str(uuid4()),
"title": "Document about Semantic Kernel.",
"content": "Semantic Kernel is a framework for building AI applications.",
},
{
"id": str(uuid4()),
"title": "Document about Python",
"content": "Python is a programming language that lets you work quickly.",
},
]

# create the dataframe and add the embeddings
# create the dataframe and add the content you want to embed to a new column
df = pd.DataFrame(records)
df = await add_vector_to_records(kernel, df, None, definition=model_fields)
print("Records with embeddings:")
print(df.shape)
print(df.head(5))

df["vector"] = df.apply(lambda row: f"title: {row['title']}, content: {row['content']}", axis=1)
print(df.head(1))
# upsert the records (for a container, upsert and upsert_batch are equivalent)
await record_collection.upsert_batch(df)
await collection.upsert(df)

# retrieve a record
result = await record_collection.get(records[0]["id"])
print("Retrieved records:")
print(result.shape)
print(result.head(5))
result = await collection.get(top=2)
if result is None:
print("No records found, this is sometimes because the get is too fast and the index is not ready yet.")
else:
print("Retrieved records:")
print(result.to_string())

await collection.ensure_collection_deleted()


if __name__ == "__main__":