Python: Feature new memory stores and collections #7614

Merged: 9 commits, Aug 6, 2024
2 changes: 2 additions & 0 deletions .github/workflows/python-integration-tests.yml
@@ -131,6 +131,7 @@ jobs:
VERTEX_AI_PROJECT_ID: ${{ vars.VERTEX_AI_PROJECT_ID }}
VERTEX_AI_GEMINI_MODEL_ID: ${{ vars.VERTEX_AI_GEMINI_MODEL_ID }}
VERTEX_AI_EMBEDDING_MODEL_ID: ${{ vars.VERTEX_AI_EMBEDDING_MODEL_ID }}
REDIS_CONNECTION_STRING: ${{ vars.REDIS_CONNECTION_STRING }}
run: |
cd python
poetry run pytest ./tests/integration ./tests/samples -v --junitxml=pytest.xml
@@ -242,6 +243,7 @@ jobs:
VERTEX_AI_PROJECT_ID: ${{ vars.VERTEX_AI_PROJECT_ID }}
VERTEX_AI_GEMINI_MODEL_ID: ${{ vars.VERTEX_AI_GEMINI_MODEL_ID }}
VERTEX_AI_EMBEDDING_MODEL_ID: ${{ vars.VERTEX_AI_EMBEDDING_MODEL_ID }}
REDIS_CONNECTION_STRING: ${{ vars.REDIS_CONNECTION_STRING }}
run: |
if ${{ matrix.os == 'ubuntu-latest' }}; then
docker run -d --name redis-stack-server -p 6379:6379 redis/redis-stack-server:latest
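
The REDIS_CONNECTION_STRING variable added above is how the integration tests reach the redis-stack-server container started in the run step. As a rough illustration (not code from this PR; the fallback value and fixture shape are assumptions), a test could connect like this with redis-py:

# Hypothetical sketch: connect to the Redis instance the workflow starts.
import os

import redis

# Assumed fallback matching the docker run above (default port 6379).
connection_string = os.environ.get("REDIS_CONNECTION_STRING", "redis://localhost:6379")
client = redis.Redis.from_url(connection_string)
client.ping()  # raises redis.ConnectionError if the server is unreachable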
6 changes: 3 additions & 3 deletions python/.coveragerc
@@ -10,8 +10,8 @@ omit =
semantic_kernel/connectors/memory/mongodb_atlas/*
semantic_kernel/connectors/memory/pinecone/*
semantic_kernel/connectors/memory/postgres/*
semantic_kernel/connectors/memory/qdrant/*
semantic_kernel/connectors/memory/redis/*
semantic_kernel/connectors/memory/qdrant/qdrant_memory_store.py
semantic_kernel/connectors/memory/redis/redis_memory_store.py
semantic_kernel/connectors/memory/usearch/*
semantic_kernel/connectors/memory/weaviate/*
semantic_kernel/reliability/*
@@ -33,4 +33,4 @@ exclude_lines =
# TYPE_CHECKING and @overload blocks are never executed during pytest run
if TYPE_CHECKING:
@overload
@abstractmethod
@abstractmethod
6 changes: 5 additions & 1 deletion python/.cspell.json
@@ -47,6 +47,10 @@
"protos",
"endregion",
"vertexai",
"aiplatform"
"aiplatform",
"serde",
"datamodel",
"vectorstoremodel",
"qdrant"
]
}
6 changes: 6 additions & 0 deletions python/mypy.ini
@@ -26,6 +26,8 @@ ignore_errors = true
[mypy-semantic_kernel.connectors.memory.astradb.*]
ignore_errors = true

[mypy-semantic_kernel.connectors.memory.azure_ai_search.*]
ignore_errors = false
[mypy-semantic_kernel.connectors.memory.azure_cognitive_search.*]
ignore_errors = true

@@ -50,9 +52,13 @@ ignore_errors = true
[mypy-semantic_kernel.connectors.memory.postgres.*]
ignore_errors = true

[mypy-semantic_kernel.connectors.memory.qdrant.qdrant_vector_record_store.*]
ignore_errors = true
[mypy-semantic_kernel.connectors.memory.qdrant.*]
ignore_errors = true

[mypy-semantic_kernel.connectors.memory.redis.redis_vector_record_store.*]
ignore_errors = true
[mypy-semantic_kernel.connectors.memory.redis.*]
ignore_errors = true

327 changes: 229 additions & 98 deletions python/poetry.lock

Large diffs are not rendered by default.

37 changes: 24 additions & 13 deletions python/pyproject.toml
@@ -57,8 +57,9 @@ chromadb = { version = ">=0.4.13,<0.6.0", optional = true}
google-cloud-aiplatform = { version = "^1.60.0", optional = true}
google-generativeai = { version = "^0.7.2", optional = true}
# hugging face
transformers = { version = "^4.28.1", extras=["torch"], optional = true}
transformers = { version = "^4.28.1", extras=['torch'], optional = true}
sentence-transformers = { version = "^2.2.2", optional = true}
torch = {version = "2.2.2", optional = true}
# mongo
motor = { version = "^3.3.2", optional = true }
# notebooks
@@ -73,20 +74,20 @@ ollama = { version = "^0.2.1", optional = true}
# pinecone
pinecone-client = { version = ">=3.0.0", optional = true}
# postgres
psycopg = { version="^3.1.9", extras=["binary","pool"], optional = true}
psycopg = { version="^3.2.1", extras=["binary","pool"], optional = true}
# qdrant
qdrant-client = { version = '^1.9', optional = true}
# redis
redis = { version = "^4.6.0", optional = true}
redis = { version = "^5.0.7", extras=['hiredis'], optional = true}
types-redis = { version="^4.6.0.20240425", optional = true }
# usearch
usearch = { version = "^2.9", optional = true}
pyarrow = { version = ">=12.0.1,<18.0.0", optional = true}
weaviate-client = { version = ">=3.18,<5.0", optional = true}
ruff = "0.5.2"
pandas = {version = "^2.2.2", optional = true}

[tool.poetry.group.dev.dependencies]
pre-commit = ">=3.7.1"
ruff = ">=0.5"
ipykernel = "^6.29.4"
nbconvert = "^7.16.4"
pytest = "^8.2.1"
@@ -96,6 +97,7 @@ pytest-asyncio = "^0.23.7"
snoop = "^0.4.3"
mypy = ">=1.10.0"
types-PyYAML = "^6.0.12.20240311"
ruff = "^0.5.2"

[tool.poetry.group.unit-tests]
optional = true
@@ -109,8 +111,14 @@ mistralai = "^0.4.1"
ollama = "^0.2.1"
google-cloud-aiplatform = "^1.60.0"
google-generativeai = "^0.7.2"
transformers = { version = "^4.28.1", extras=["torch"]}
sentence-transformers = "^2.2.2"
transformers = { version = "^4.28.1", extras=['torch']}
sentence-transformers = { version = "^2.2.2"}
torch = {version = "2.2.2"}
# qdrant
qdrant-client = '^1.9'
# redis
redis = { version = "^5.0.7", extras=['hiredis']}
pandas = {version = "^2.2.2"}

[tool.poetry.group.tests]
optional = true
@@ -129,8 +137,9 @@ chromadb = ">=0.4.13,<0.6.0"
google-cloud-aiplatform = "^1.60.0"
google-generativeai = "^0.7.2"
# hugging face
transformers = { version = "^4.28.1", extras=["torch"]}
sentence-transformers = "^2.2.2"
transformers = { version = "^4.28.1", extras=['torch']}
sentence-transformers = { version = "^2.2.2"}
torch = {version = "2.2.2"}
# milvus
pymilvus = ">=2.3,<2.4.4"
milvus = { version = ">=2.3,<2.3.8", markers = 'sys_platform != "win32"'}
@@ -147,21 +156,23 @@ psycopg = { version="^3.1.9", extras=["binary","pool"]}
# qdrant
qdrant-client = '^1.9'
# redis
redis = "^4.6.0"
redis = { version="^5.0.7", extras=['hiredis']}
types-redis = { version="^4.6.0.20240425" }
# usearch
usearch = "^2.9"
pyarrow = ">=12.0.1,<18.0.0"
# weaviate
weaviate-client = ">=3.18,<5.0"
pandas = {version = "^2.2.2"}

# Extras are exposed to pip, this allows a user to easily add the right dependencies to their environment
[tool.poetry.extras]
all = ["transformers", "sentence-transformers", "qdrant-client", "chromadb", "pymilvus", "milvus", "mistralai", "ollama", "google", "weaviate-client", "pinecone-client", "psycopg", "redis", "azure-ai-inference", "azure-search-documents", "azure-core", "azure-identity", "azure-cosmos", "usearch", "pyarrow", "ipykernel", "motor"]
all = ["transformers", "sentence-transformers", "torch", "qdrant-client", "chromadb", "pymilvus", "milvus", "mistralai", "ollama", "google", "weaviate-client", "pinecone-client", "psycopg", "redis", "azure-ai-inference", "azure-search-documents", "azure-core", "azure-identity", "azure-cosmos", "usearch", "pyarrow", "ipykernel", "motor"]

azure = ["azure-ai-inference", "azure-search-documents", "azure-core", "azure-identity", "azure-cosmos", "msgraph-sdk"]
chromadb = ["chromadb"]
google = ["google-cloud-aiplatform", "google-generativeai"]
hugging_face = ["transformers", "sentence-transformers"]
hugging_face = ["transformers", "sentence-transformers", "torch"]
milvus = ["pymilvus", "milvus"]
mistralai = ["mistralai"]
ollama = ["ollama"]
@@ -170,7 +181,7 @@ notebooks = ["ipykernel"]
pinecone = ["pinecone-client"]
postgres = ["psycopg"]
qdrant = ["qdrant-client"]
redis = ["redis"]
redis = ["redis", "types-redis"]
usearch = ["usearch", "pyarrow"]
weaviate = ["weaviate-client"]

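
Because the extras above are exposed to pip, a user can pull in a store's dependencies directly; for example, the updated redis extra (which now includes types-redis and the hiredis flavor of redis) installs with:

pip install semantic-kernel[redis]

The same pattern applies to the other extras listed, such as qdrant or hugging_face.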
160 changes: 160 additions & 0 deletions python/samples/concepts/memory/data_models.py
@@ -0,0 +1,160 @@
# Copyright (c) Microsoft. All rights reserved.

from dataclasses import dataclass, field
from typing import Annotated, Any
from uuid import uuid4

from pandas import DataFrame
from pydantic import Field

from semantic_kernel.data.vector_store_model_decorator import vectorstoremodel
from semantic_kernel.data.vector_store_model_definition import VectorStoreRecordDefinition
from semantic_kernel.data.vector_store_record_fields import (
VectorStoreRecordDataField,
VectorStoreRecordKeyField,
VectorStoreRecordVectorField,
)
from semantic_kernel.kernel_pydantic import KernelBaseModel

# This concept shows the different ways you can create a vector store data model:
# using dataclasses, Pydantic, and plain Python classes,
# as well as container types like Pandas DataFrames.

# A few rules apply to all of these data models:
# each field must specify its type through the annotation (or the definition),
# there must be at least one field of type VectorStoreRecordKeyField,
# if you set embedding_property_name on a VectorStoreRecordDataField, that field must exist and be a vector field,
# and an unannotated field is allowed but must have a default value.

# The purpose of these models is to be what you pass to and get back from a vector store.
# There may be limitations on the data types a given vector store can handle,
# so not every store will be able to handle exactly the same model.
# For instance, some stores only allow a string as the key field, while others allow both str and int,
# so defining the key as an int might make some stores unusable.

# The decorator takes the class and pulls out the fields and annotations to create a definition
# of type VectorStoreRecordDefinition.
# The vector store uses this definition to know how to handle the data model.

# You can also create the definition yourself and pass it to the vector store together with a standard type,
# such as a dict or list,
# or use the definition in container mode with something like a Pandas DataFrame.


# Data model using built-in Python dataclasses
@vectorstoremodel
@dataclass
class DataModelDataclass:
vector: Annotated[list[float], VectorStoreRecordVectorField]
key: Annotated[str, VectorStoreRecordKeyField()] = field(default_factory=lambda: str(uuid4()))
content: Annotated[str, VectorStoreRecordDataField(has_embedding=True, embedding_property_name="vector")] = (
"content1"
)
other: str | None = None


# Data model using Pydantic BaseModels
@vectorstoremodel
class DataModelPydantic(KernelBaseModel):
vector: Annotated[list[float], VectorStoreRecordVectorField]
key: Annotated[str, VectorStoreRecordKeyField()] = Field(default_factory=lambda: str(uuid4()))
content: Annotated[str, VectorStoreRecordDataField(has_embedding=True, embedding_property_name="vector")] = (
"content1"
)
other: str | None = None


# Data model using Pydantic BaseModels with mixed annotations (from pydantic and SK)
@vectorstoremodel
class DataModelPydanticComplex(KernelBaseModel):
vector: Annotated[list[float], VectorStoreRecordVectorField]
key: Annotated[str, Field(default_factory=lambda: str(uuid4())), VectorStoreRecordKeyField()]
content: Annotated[str, VectorStoreRecordDataField(has_embedding=True, embedding_property_name="vector")] = (
"content1"
)
other: str | None = None


# Data model using Python classes
# This one includes a custom serialize and deserialize method
@vectorstoremodel
class DataModelPython:
def __init__(
self,
vector: Annotated[list[float], VectorStoreRecordVectorField],
key: Annotated[str, VectorStoreRecordKeyField] = None,
content: Annotated[
str, VectorStoreRecordDataField(has_embedding=True, embedding_property_name="vector")
] = "content1",
other: str | None = None,
):
self.vector = vector
self.other = other
self.key = key or str(uuid4())
self.content = content

def __str__(self) -> str:
return f"DataModelPython(vector={self.vector}, key={self.key}, content={self.content}, other={self.other})"

def serialize(self) -> dict[str, Any]:
return {
"vector": self.vector,
"key": self.key,
"content": self.content,
}

@classmethod
def deserialize(cls, obj: dict[str, Any]) -> "DataModelPython":
return cls(
vector=obj["vector"],
key=obj["key"],
content=obj["content"],
)


# Data model definition for use with Pandas.
# Note the container_mode flag, which ensures that returned records are wrapped in a container
# even when a batch of records is requested.
# There are also to_dict and from_dict methods, used to convert the data model to and from a dict;
# these should be specific to the type used. When dict itself is the type, they can be left off.
data_model_definition_pandas = VectorStoreRecordDefinition(
fields={
"vector": VectorStoreRecordVectorField(property_type="list[float]"),
"key": VectorStoreRecordKeyField(property_type="str"),
"content": VectorStoreRecordDataField(
property_type="str", has_embedding=True, embedding_property_name="vector"
),
},
container_mode=True,
to_dict=lambda record, **_: record.to_dict(orient="records"),
from_dict=lambda records, **_: DataFrame(records),
)


if __name__ == "__main__":
data_item1 = DataModelDataclass(content="Hello, world!", vector=[1.0, 2.0, 3.0], other=None)
data_item2 = DataModelPydantic(content="Hello, world!", vector=[1.0, 2.0, 3.0], other=None)
data_item3 = DataModelPydanticComplex(content="Hello, world!", vector=[1.0, 2.0, 3.0], other=None)
data_item4 = DataModelPython(content="Hello, world!", vector=[1.0, 2.0, 3.0], other=None)
print("Example records:")
print(f"DataClass:\n {data_item1}", end="\n\n")
print(f"Pydantic:\n {data_item2}", end="\n\n")
print(f"Pydantic with annotations:\n {data_item3}", end="\n\n")
print(f"Python:\n {data_item4}", end="\n\n")

print("Item definitions:")
print(f"DataClass:\n {data_item1.__kernel_vectorstoremodel_definition__}", end="\n\n")
print(f"Pydantic:\n {data_item2.__kernel_vectorstoremodel_definition__}", end="\n\n")
print(f"Pydantic with annotations:\n {data_item3.__kernel_vectorstoremodel_definition__}", end="\n\n")
print(f"Python:\n {data_item4.__kernel_vectorstoremodel_definition__}", end="\n\n")
print(f"Definition for use with Pandas:\n {data_model_definition_pandas}", end="\n\n")
if (
data_item1.__kernel_vectorstoremodel_definition__.fields
== data_item2.__kernel_vectorstoremodel_definition__.fields
== data_item3.__kernel_vectorstoremodel_definition__.fields
== data_item4.__kernel_vectorstoremodel_definition__.fields
== data_model_definition_pandas.fields
):
print("All data models are the same")
else:
print("Data models are not the same")