From b7ac7b6eb22e5109d1752428fc9cc8b9190cca98 Mon Sep 17 00:00:00 2001 From: Kevin Pilch Date: Wed, 1 May 2024 11:55:25 -0700 Subject: [PATCH 01/27] WIP - add Cosmos DB NoSQL Provider --- dotnet/Directory.Packages.props | 2 +- dotnet/SK-dotnet.sln | 10 + .../AssemblyInfo.cs | 6 + .../AzureCosmosDBNoSQLConfig.cs | 86 +++++ .../AzureCosmosDBNoSQLMemoryRecord.cs | 89 +++++ .../AzureCosmosDBNoSQLMemoryRecordMetadata.cs | 82 +++++ .../AzureCosmosDBNoSQLMemoryStore.cs | 318 ++++++++++++++++++ .../AzureCosmosDBSimilarityType.cs | 30 ++ ...onnectors.Memory.AzureCosmosDBNoSQL.csproj | 30 ++ 9 files changed, 652 insertions(+), 1 deletion(-) create mode 100644 dotnet/src/Connectors/Connectors.Memory.AzureCosmosDBNoSQL/AssemblyInfo.cs create mode 100644 dotnet/src/Connectors/Connectors.Memory.AzureCosmosDBNoSQL/AzureCosmosDBNoSQLConfig.cs create mode 100644 dotnet/src/Connectors/Connectors.Memory.AzureCosmosDBNoSQL/AzureCosmosDBNoSQLMemoryRecord.cs create mode 100644 dotnet/src/Connectors/Connectors.Memory.AzureCosmosDBNoSQL/AzureCosmosDBNoSQLMemoryRecordMetadata.cs create mode 100644 dotnet/src/Connectors/Connectors.Memory.AzureCosmosDBNoSQL/AzureCosmosDBNoSQLMemoryStore.cs create mode 100644 dotnet/src/Connectors/Connectors.Memory.AzureCosmosDBNoSQL/AzureCosmosDBSimilarityType.cs create mode 100644 dotnet/src/Connectors/Connectors.Memory.AzureCosmosDBNoSQL/Connectors.Memory.AzureCosmosDBNoSQL.csproj diff --git a/dotnet/Directory.Packages.props b/dotnet/Directory.Packages.props index 0f45264e4068..fbb2de10129d 100644 --- a/dotnet/Directory.Packages.props +++ b/dotnet/Directory.Packages.props @@ -75,7 +75,6 @@ - @@ -87,6 +86,7 @@ + diff --git a/dotnet/SK-dotnet.sln b/dotnet/SK-dotnet.sln index 8b58bb93f4aa..6320eeb19832 100644 --- a/dotnet/SK-dotnet.sln +++ b/dotnet/SK-dotnet.sln @@ -310,6 +310,7 @@ EndProject Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "QualityCheckWithFilters", "samples\Demos\QualityCheck\QualityCheckWithFilters\QualityCheckWithFilters.csproj", "{1D3EEB5B-0E06-4700-80D5-164956E43D0A}" EndProject Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "TimePlugin", "samples\Demos\TimePlugin\TimePlugin.csproj", "{F312FCE1-12D7-4DEF-BC29-2FF6618509F3}" +Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Connectors.Memory.AzureCosmosDBNoSQL", "src\Connectors\Connectors.Memory.AzureCosmosDBNoSQL\Connectors.Memory.AzureCosmosDBNoSQL.csproj", "{B0B3901E-AF56-432B-8FAA-858468E5D0DF}" EndProject Global GlobalSection(SolutionConfigurationPlatforms) = preSolution @@ -762,6 +763,12 @@ Global {F312FCE1-12D7-4DEF-BC29-2FF6618509F3}.Publish|Any CPU.Build.0 = Debug|Any CPU {F312FCE1-12D7-4DEF-BC29-2FF6618509F3}.Release|Any CPU.ActiveCfg = Release|Any CPU {F312FCE1-12D7-4DEF-BC29-2FF6618509F3}.Release|Any CPU.Build.0 = Release|Any CPU + {B0B3901E-AF56-432B-8FAA-858468E5D0DF}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {B0B3901E-AF56-432B-8FAA-858468E5D0DF}.Debug|Any CPU.Build.0 = Debug|Any CPU + {B0B3901E-AF56-432B-8FAA-858468E5D0DF}.Publish|Any CPU.ActiveCfg = Publish|Any CPU + {B0B3901E-AF56-432B-8FAA-858468E5D0DF}.Publish|Any CPU.Build.0 = Publish|Any CPU + {B0B3901E-AF56-432B-8FAA-858468E5D0DF}.Release|Any CPU.ActiveCfg = Release|Any CPU + {B0B3901E-AF56-432B-8FAA-858468E5D0DF}.Release|Any CPU.Build.0 = Release|Any CPU EndGlobalSection GlobalSection(SolutionProperties) = preSolution HideSolutionNode = FALSE @@ -867,6 +874,9 @@ Global {3ED53702-0E53-473A-A0F4-645DB33541C2} = {5D4C0700-BBB5-418F-A7B2-F392B9A18263} {1D3EEB5B-0E06-4700-80D5-164956E43D0A} = {5D4C0700-BBB5-418F-A7B2-F392B9A18263} {F312FCE1-12D7-4DEF-BC29-2FF6618509F3} = {5D4C0700-BBB5-418F-A7B2-F392B9A18263} + {6EF9663D-976C-4A27-B8D3-8B1E63BA3BF2} = {5D4C0700-BBB5-418F-A7B2-F392B9A18263} + {925B1185-8B58-4E2D-95C9-4CA0BA9364E5} = {FA3720F1-C99A-49B2-9577-A940257098BF} + {B0B3901E-AF56-432B-8FAA-858468E5D0DF} = {24503383-A8C4-4255-9998-28D70FE8E99A} EndGlobalSection GlobalSection(ExtensibilityGlobals) = postSolution SolutionGuid = {FBDC56A3-86AD-4323-AA0F-201E59123B83} diff --git a/dotnet/src/Connectors/Connectors.Memory.AzureCosmosDBNoSQL/AssemblyInfo.cs b/dotnet/src/Connectors/Connectors.Memory.AzureCosmosDBNoSQL/AssemblyInfo.cs new file mode 100644 index 000000000000..d174fc92303c --- /dev/null +++ b/dotnet/src/Connectors/Connectors.Memory.AzureCosmosDBNoSQL/AssemblyInfo.cs @@ -0,0 +1,6 @@ +// Copyright (c) Microsoft. All rights reserved. + +using System.Diagnostics.CodeAnalysis; + +// This assembly is currently experimental. +[assembly: Experimental("SKEXP0020")] diff --git a/dotnet/src/Connectors/Connectors.Memory.AzureCosmosDBNoSQL/AzureCosmosDBNoSQLConfig.cs b/dotnet/src/Connectors/Connectors.Memory.AzureCosmosDBNoSQL/AzureCosmosDBNoSQLConfig.cs new file mode 100644 index 000000000000..2db29adf968f --- /dev/null +++ b/dotnet/src/Connectors/Connectors.Memory.AzureCosmosDBNoSQL/AzureCosmosDBNoSQLConfig.cs @@ -0,0 +1,86 @@ +// Copyright (c) Microsoft. All rights reserved. + +using Microsoft.SemanticKernel.Http; + +namespace Microsoft.SemanticKernel.Connectors.AzureCosmosDBNoSQL; + +/// +/// Get more details about Azure Cosmos DB and these configs https://learn.microsoft.com/azure/cosmos-db/ +/// +public class AzureCosmosDBNoSQLConfig +{ + /// + /// Application name for the client for tracking and logging + /// + public string ApplicationName { get; set; } + + /// + /// Index name for the embedding + /// + public string IndexName { get; set; } + + /// + /// Kind: Type of vector index to create. + /// Possible options are: + /// - vector-ivf + /// - vector-hnsw: available as a preview feature only, + /// to enable visit https://learn.microsoft.com/azure/azure-resource-manager/management/preview-features + /// + public AzureCosmosDBVectorSearchType Kind { get; set; } + + /// + /// NumLists: This integer is the number of clusters that the inverted file (IVF) index uses to group the vector data. + /// We recommend that numLists is set to documentCount/1000 for up to 1 million documents and to sqrt(documentCount) + /// for more than 1 million documents. Using a numLists value of 1 is akin to performing brute-force search, which has + /// limited performance. + /// + public int NumLists { get; set; } + + /// + /// Number of dimensions for vector similarity. The maximum number of supported dimensions is 2000. + /// + public int Dimensions { get; set; } + + /// + /// Similarity: Similarity metric to use with the IVF index. + /// Possible options are: + /// - COS (cosine distance), + /// - L2 (Euclidean distance), and + /// - IP (inner product). + /// + public AzureCosmosDBSimilarityType Similarity { get; set; } + + /// + /// NumberOfConnections: The max number of connections per layer (16 by default, minimum value is 2, maximum value is + /// 100). Higher m is suitable for datasets with high dimensionality and/or high accuracy requirements. + /// + public int NumberOfConnections { get; set; } + + /// + /// EfConstruction: the size of the dynamic candidate list for constructing the graph (64 by default, minimum value is 4, + /// maximum value is 1000). Higher ef_construction will result in better index quality and higher accuracy, but it will + /// also increase the time required to build the index. EfConstruction has to be at least 2 * m + /// + public int EfConstruction { get; set; } + + /// + /// EfSearch: The size of the dynamic candidate list for search (40 by default). A higher value provides better recall at + /// the cost of speed. + /// + public int EfSearch { get; set; } + + /// + /// Initialize the AzureCosmosDBNoSQLConfig with default values + /// + public AzureCosmosDBNoSQLConfig() + { + this.ApplicationName = HttpHeaderConstant.Values.UserAgent; + this.IndexName = "default_index"; + this.Kind = AzureCosmosDBVectorSearchType.VectorHNSW; + this.NumLists = 1; + this.Similarity = AzureCosmosDBSimilarityType.Cosine; + this.NumberOfConnections = 16; + this.EfConstruction = 64; + this.EfSearch = 40; + } +} diff --git a/dotnet/src/Connectors/Connectors.Memory.AzureCosmosDBNoSQL/AzureCosmosDBNoSQLMemoryRecord.cs b/dotnet/src/Connectors/Connectors.Memory.AzureCosmosDBNoSQL/AzureCosmosDBNoSQLMemoryRecord.cs new file mode 100644 index 000000000000..29caf1ccd5f8 --- /dev/null +++ b/dotnet/src/Connectors/Connectors.Memory.AzureCosmosDBNoSQL/AzureCosmosDBNoSQLMemoryRecord.cs @@ -0,0 +1,89 @@ +// Copyright (c) Microsoft. All rights reserved. + +using System; +using System.Linq; +using Microsoft.SemanticKernel.Memory; +using NoSQL.Bson; +using NoSQL.Bson.Serialization; +using NoSQL.Bson.Serialization.Attributes; + +namespace Microsoft.SemanticKernel.Connectors.AzureCosmosDBNoSQL; + +/// +/// A NoSQL memory record. +/// +internal sealed class AzureCosmosDBNoSQLMemoryRecord +{ + /// + /// Unique identifier of the memory entry. + /// + [BsonId] + public string Id { get; set; } + + /// + /// Metadata associated with memory entity. + /// + [BsonElement("metadata")] + public AzureCosmosDBNoSQLMemoryRecordMetadata Metadata { get; set; } + + /// + /// Source content embedding. + /// +#pragma warning disable CA1819 // Properties should not return arrays + [BsonElement("embedding")] + public float[] Embedding { get; set; } +#pragma warning restore CA1819 // Properties should not return arrays + + /// + /// Optional timestamp. + /// + [BsonElement("timestamp")] + [BsonDateTimeOptions(Kind = DateTimeKind.Utc, Representation = BsonType.DateTime)] + public DateTime? Timestamp { get; set; } + + /// + /// Initializes a new instance of the class. + /// + /// Instance to copy values from. + public AzureCosmosDBNoSQLMemoryRecord(MemoryRecord memoryRecord) + { + this.Id = memoryRecord.Key; + this.Metadata = new AzureCosmosDBNoSQLMemoryRecordMetadata(memoryRecord.Metadata); + this.Embedding = memoryRecord.Embedding.ToArray(); + this.Timestamp = memoryRecord.Timestamp?.UtcDateTime; + } + + /// + /// Returns mapped . + /// + public static MemoryRecord ToMemoryRecord(BsonDocument doc, bool withEmbedding) + { + return new( + BsonSerializer + .Deserialize( + doc["metadata"].AsBsonDocument + ) + .ToMemoryRecordMetadata(), + withEmbedding + ? doc["embedding"].AsBsonArray.Select(x => (float)x.AsDouble).ToArray() + : null, + doc["_id"].AsString, + doc["timestamp"]?.ToUniversalTime() + ); + + // return result; + } + + /// + /// Returns mapped . + /// + public MemoryRecord ToMemoryRecord(bool withEmbedding) + { + return new( + this.Metadata.ToMemoryRecordMetadata(), + withEmbedding ? this.Embedding : null, + this.Id, + this.Timestamp?.ToLocalTime() + ); + } +} diff --git a/dotnet/src/Connectors/Connectors.Memory.AzureCosmosDBNoSQL/AzureCosmosDBNoSQLMemoryRecordMetadata.cs b/dotnet/src/Connectors/Connectors.Memory.AzureCosmosDBNoSQL/AzureCosmosDBNoSQLMemoryRecordMetadata.cs new file mode 100644 index 000000000000..13f9874c144f --- /dev/null +++ b/dotnet/src/Connectors/Connectors.Memory.AzureCosmosDBNoSQL/AzureCosmosDBNoSQLMemoryRecordMetadata.cs @@ -0,0 +1,82 @@ +// Copyright (c) Microsoft. All rights reserved. + +using Microsoft.SemanticKernel.Memory; +using NoSQL.Bson.Serialization.Attributes; + +namespace Microsoft.SemanticKernel.Connectors.AzureCosmosDBNoSQL; + +/// +/// A NoSQL memory record metadata. +/// +#pragma warning disable CA1815 // Override equals and operator equals on value types +internal struct AzureCosmosDBNoSQLMemoryRecordMetadata +#pragma warning restore CA1815 // Override equals and operator equals on value types +{ + /// + /// Whether the source data used to calculate embeddings are stored in the local + /// storage provider or is available through and external service, such as web site, MS Graph, etc. + /// + [BsonElement("isReference")] + public bool IsReference { get; set; } + + /// + /// A value used to understand which external service owns the data, to avoid storing the information + /// inside the URI. E.g. this could be "MSTeams", "WebSite", "GitHub", etc. + /// + [BsonElement("externalSourceName")] + [BsonIgnoreIfDefault] + public string ExternalSourceName { get; set; } + + /// + /// Unique identifier. The format of the value is domain specific, so it can be a URL, a GUID, etc. + /// + [BsonId] + public string Id { get; set; } + + /// + /// Optional title describing the content. Note: the title is not indexed. + /// + [BsonElement("description")] + [BsonIgnoreIfDefault] + public string Description { get; set; } + + /// + /// Source text, available only when the memory is not an external source. + /// + [BsonElement("text")] + [BsonIgnoreIfDefault] + public string Text { get; set; } + + /// + /// Field for saving custom metadata with a memory. + /// + [BsonElement("additionalMetadata")] + [BsonIgnoreIfDefault] + public string AdditionalMetadata { get; set; } + + /// + /// Initializes a new instance of structure. + /// + public AzureCosmosDBNoSQLMemoryRecordMetadata(MemoryRecordMetadata memoryRecordMetadata) + { + this.IsReference = memoryRecordMetadata.IsReference; + this.ExternalSourceName = memoryRecordMetadata.ExternalSourceName; + this.Id = memoryRecordMetadata.Id; + this.Description = memoryRecordMetadata.Description; + this.Text = memoryRecordMetadata.Text; + this.AdditionalMetadata = memoryRecordMetadata.AdditionalMetadata; + } + + /// + /// Returns mapped . + /// + public MemoryRecordMetadata ToMemoryRecordMetadata() => + new( + this.IsReference, + this.ExternalSourceName, + this.Id, + this.Description, + this.Text, + this.AdditionalMetadata + ); +} diff --git a/dotnet/src/Connectors/Connectors.Memory.AzureCosmosDBNoSQL/AzureCosmosDBNoSQLMemoryStore.cs b/dotnet/src/Connectors/Connectors.Memory.AzureCosmosDBNoSQL/AzureCosmosDBNoSQLMemoryStore.cs new file mode 100644 index 000000000000..6afea189c9ba --- /dev/null +++ b/dotnet/src/Connectors/Connectors.Memory.AzureCosmosDBNoSQL/AzureCosmosDBNoSQLMemoryStore.cs @@ -0,0 +1,318 @@ +// Copyright (c) Microsoft. All rights reserved. + +using System; +using System.Collections.Generic; +using System.Linq; +using System.Runtime.CompilerServices; +using System.Threading; +using System.Threading.Tasks; +using Microsoft.Azure.Cosmos; +using Microsoft.SemanticKernel.Memory; + +namespace Microsoft.SemanticKernel.Connectors.AzureCosmosDBNoSQL; + +/// +/// An implementation of backed by a Azure Cosmos DB database. +/// Get more details about Azure Cosmos DB vector search https://learn.microsoft.com/en-us/azure/cosmos-db/ +/// +public class AzureCosmosDBNoSQLMemoryStore : IMemoryStore, IDisposable +{ + private readonly CosmosClient _cosmosClient; + private readonly AzureCosmosDBNoSQLConfig _config; + + /// + /// Initiates a AzureCosmosDBNoSQLMemoryStore instance using a Azure Cosmos DB connection string + /// and other properties required for vector search. + /// + /// Connection string required to connect to Azure Cosmos DB. + /// The database name to connect to. + /// Azure CosmosDB NoSQL Config containing specific parameters for vector search. + public AzureCosmosDBNoSQLMemoryStore( + string connectionString, + string databaseName, + AzureCosmosDBNoSQLConfig config + ) + { + MongoClientSettings settings = MongoClientSettings.FromConnectionString(connectionString); + this._config = config; + settings.ApplicationName = this._config.ApplicationName; + this._mongoClient = new MongoClient(settings); + this._mongoDatabase = this._mongoClient.GetDatabase(databaseName); + } + + /// + /// Initiates a AzureCosmosDBNoSQLMemoryStore instance using a instance + /// and other properties required for vector search. + /// + public AzureCosmosDBNoSQLMemoryStore( + CosmosClient cosmosClient, + string databaseName, + AzureCosmosDBNoSQLonfig config) + { + MongoClientSettings settings = cosmosClient.Settings; + this._config = config; + settings.ApplicationName = this._config.ApplicationName; + this._cosmosClient = new MongoClient(settings); + this._mongoDatabase = this._mongoClient.GetDatabase(databaseName); + } + + /// + public async Task CreateCollectionAsync( + string collectionName, + CancellationToken cancellationToken = default + ) + { + await this + ._mongoDatabase.CreateCollectionAsync( + collectionName, + cancellationToken: cancellationToken + ) + .ConfigureAwait(false); + var indexes = await this.GetCollection(collectionName) + .Indexes.ListAsync(cancellationToken: cancellationToken) + .ConfigureAwait(false); + + if (!indexes.ToList(cancellationToken: cancellationToken).Any(index => index["name"] == this._config.IndexName)) + { + var command = new BsonDocument(); + switch (this._config.Kind) + { + case AzureCosmosDBVectorSearchType.VectorIVF: + command = this.GetIndexDefinitionVectorIVF(collectionName); + break; + case AzureCosmosDBVectorSearchType.VectorHNSW: + command = this.GetIndexDefinitionVectorHNSW(collectionName); + break; + } + await this + ._mongoDatabase.RunCommandAsync( + command, + cancellationToken: cancellationToken + ) + .ConfigureAwait(false); + } + } + + /// + public async IAsyncEnumerable GetCollectionsAsync( + [EnumeratorCancellation] CancellationToken cancellationToken = default + ) + { + using var cursor = await this + ._mongoDatabase.ListCollectionNamesAsync(cancellationToken: cancellationToken) + .ConfigureAwait(false); + + while (await cursor.MoveNextAsync(cancellationToken).ConfigureAwait(false)) + { + foreach (var name in cursor.Current) + { + yield return name; + } + } + } + + /// + public async Task DoesCollectionExistAsync( + string collectionName, + CancellationToken cancellationToken = default + ) + { + await foreach ( + var existingCollectionName in this.GetCollectionsAsync(cancellationToken) + .ConfigureAwait(false) + ) + { + if (existingCollectionName == collectionName) + { + return true; + } + } + return false; + } + + /// + public Task DeleteCollectionAsync( + string collectionName, + CancellationToken cancellationToken = default + ) => this._mongoDatabase.DropCollectionAsync(collectionName, cancellationToken); + + /// + public async Task UpsertAsync( + string collectionName, + MemoryRecord record, + CancellationToken cancellationToken = default + ) + { + var replaceOptions = new ReplaceOptions() { IsUpsert = true }; + + var result = await this.GetCollection(collectionName) + .ReplaceOneAsync( + GetFilterById(record.Metadata.Id), + new AzureCosmosDBNoSQLryRecord(record), + replaceOptions, + cancellationToken + ) + .ConfigureAwait(false); + + return record.Key; + } + + /// + public async IAsyncEnumerable UpsertBatchAsync( + string collectionName, + IEnumerable records, + [EnumeratorCancellation] CancellationToken cancellationToken = default + ) + { + foreach (var record in records) + { + yield return await this.UpsertAsync(collectionName, record, cancellationToken) + .ConfigureAwait(false); + } + } + + /// + public async Task GetAsync( + string collectionName, + string key, + bool withEmbedding = false, + CancellationToken cancellationToken = default + ) + { + using var cursor = await this.GetCollection(collectionName) + .FindAsync(GetFilterById(key), null, cancellationToken) + .ConfigureAwait(false); + + var cosmosRecord = await cursor + .SingleOrDefaultAsync(cancellationToken) + .ConfigureAwait(false); + var result = cosmosRecord?.ToMemoryRecord(withEmbedding); + + return result; + } + + /// + public async IAsyncEnumerable GetBatchAsync( + string collectionName, + IEnumerable keys, + bool withEmbeddings = false, + [EnumeratorCancellation] CancellationToken cancellationToken = default + ) + { + using var cursor = await this.GetCollection(collectionName) + .FindAsync(GetFilterByIds(keys), null, cancellationToken) + .ConfigureAwait(false); + + while (await cursor.MoveNextAsync(cancellationToken).ConfigureAwait(false)) + { + foreach (var cosmosRecord in cursor.Current) + { + yield return cosmosRecord.ToMemoryRecord(withEmbeddings); + } + } + } + + /// + public Task RemoveAsync( + string collectionName, + string key, + CancellationToken cancellationToken = default + ) => this.GetCollection(collectionName).DeleteOneAsync(GetFilterById(key), cancellationToken); + + /// + public Task RemoveBatchAsync( + string collectionName, + IEnumerable keys, + CancellationToken cancellationToken = default + ) => + this.GetCollection(collectionName).DeleteManyAsync(GetFilterByIds(keys), cancellationToken); + + /// + public async Task<(MemoryRecord, double)?> GetNearestMatchAsync( + string collectionName, + ReadOnlyMemory embedding, + double minRelevanceScore = 0, + bool withEmbedding = false, + CancellationToken cancellationToken = default + ) + { + using var cursor = await this.VectorSearchAsync( + 1, + embedding, + collectionName, + cancellationToken + ) + .ConfigureAwait(false); + var result = await cursor.FirstOrDefaultAsync(cancellationToken).ConfigureAwait(false); + // Access the similarityScore from the BSON document + double similarityScore = result.GetValue("similarityScore").AsDouble; + if (similarityScore < minRelevanceScore) + { + return null; + } + + MemoryRecord memoryRecord = AzureCosmosDBNoSQLmoryRecord.ToMemoryRecord( + result["document"].AsBsonDocument, + withEmbedding + ); + return (memoryRecord, similarityScore); + } + + /// + public async IAsyncEnumerable<(MemoryRecord, double)> GetNearestMatchesAsync( + string collectionName, + ReadOnlyMemory embedding, + int limit, + double minRelevanceScore = 0, + bool withEmbeddings = false, + [EnumeratorCancellation] CancellationToken cancellationToken = default + ) + { + using var cursor = await this.VectorSearchAsync( + limit, + embedding, + collectionName, + cancellationToken + ) + .ConfigureAwait(false); + while (await cursor.MoveNextAsync(cancellationToken).ConfigureAwait(false)) + { + foreach (var doc in cursor.Current) + { + // Access the similarityScore from the BSON document + var similarityScore = doc.GetValue("similarityScore").AsDouble; + if (similarityScore < minRelevanceScore) + { + continue; + } + + MemoryRecord memoryRecord = AzureCosmosDBNoSQLMemoryRecord.ToMemoryRecord( + doc["document"].AsBsonDocument, + withEmbeddings + ); + yield return (memoryRecord, similarityScore); + } + } + } + + /// + /// Disposes the instance. + /// + public void Dispose() + { + this.Dispose(true); + GC.SuppressFinalize(this); + } + + /// + /// Disposes the resources used by the instance. + /// + /// True to release both managed and unmanaged resources; false to release only unmanaged resources. + protected virtual void Dispose(bool disposing) + { + if (disposing) + { + this._cosmosClient.Dispose(); + } + } +} diff --git a/dotnet/src/Connectors/Connectors.Memory.AzureCosmosDBNoSQL/AzureCosmosDBSimilarityType.cs b/dotnet/src/Connectors/Connectors.Memory.AzureCosmosDBNoSQL/AzureCosmosDBSimilarityType.cs new file mode 100644 index 000000000000..f51c5b626fee --- /dev/null +++ b/dotnet/src/Connectors/Connectors.Memory.AzureCosmosDBNoSQL/AzureCosmosDBSimilarityType.cs @@ -0,0 +1,30 @@ +// Copyright (c) Microsoft. All rights reserved. + +using System.Text.Json.Serialization; + +// ReSharper disable InconsistentNaming +namespace Microsoft.SemanticKernel.Connectors.AzureCosmosDBNoSQL; + +/// +/// Similarity metric to use with the index. Possible options are COS (cosine distance), L2 (Euclidean distance), and IP (inner product). +/// +public enum AzureCosmosDBSimilarityType +{ + /// + /// Cosine similarity + /// + [JsonPropertyName("COS")] + Cosine, + + /// + /// Inner Product similarity + /// + [JsonPropertyName("IP")] + InnerProduct, + + /// + /// Euclidian similarity + /// + [JsonPropertyName("L2")] + Euclidian +} diff --git a/dotnet/src/Connectors/Connectors.Memory.AzureCosmosDBNoSQL/Connectors.Memory.AzureCosmosDBNoSQL.csproj b/dotnet/src/Connectors/Connectors.Memory.AzureCosmosDBNoSQL/Connectors.Memory.AzureCosmosDBNoSQL.csproj new file mode 100644 index 000000000000..3c5a610f3c0d --- /dev/null +++ b/dotnet/src/Connectors/Connectors.Memory.AzureCosmosDBNoSQL/Connectors.Memory.AzureCosmosDBNoSQL.csproj @@ -0,0 +1,30 @@ + + + + + Microsoft.SemanticKernel.Connectors.AzureCosmosDBNoSQL + $(AssemblyName) + netstandard2.0 + $(NoWarn);NU5104;SKEXP0001,SKEXP0010 + alpha + + + + + + + + + Semantic Kernel - Azure CosmosDB NoSQL Connector + Azure CosmosDB NoSQL connector for Semantic Kernel plugins and semantic memory + + + + + + + + + + + From 40a28f21515a62780848ba660af4fc0fc865cf91 Mon Sep 17 00:00:00 2001 From: Kevin Pilch Date: Wed, 1 May 2024 12:52:42 -0700 Subject: [PATCH 02/27] more WIP --- dotnet/Directory.Packages.props | 2 +- dotnet/nuget.config | 4 + .../AzureCosmosDBNoSQLConfig.cs | 52 ++--------- .../AzureCosmosDBNoSQLMemoryRecord.cs | 89 ------------------- .../AzureCosmosDBNoSQLMemoryRecordMetadata.cs | 82 ----------------- .../AzureCosmosDBSimilarityType.cs | 30 ------- 6 files changed, 11 insertions(+), 248 deletions(-) delete mode 100644 dotnet/src/Connectors/Connectors.Memory.AzureCosmosDBNoSQL/AzureCosmosDBNoSQLMemoryRecord.cs delete mode 100644 dotnet/src/Connectors/Connectors.Memory.AzureCosmosDBNoSQL/AzureCosmosDBNoSQLMemoryRecordMetadata.cs delete mode 100644 dotnet/src/Connectors/Connectors.Memory.AzureCosmosDBNoSQL/AzureCosmosDBSimilarityType.cs diff --git a/dotnet/Directory.Packages.props b/dotnet/Directory.Packages.props index fbb2de10129d..84c664e5e648 100644 --- a/dotnet/Directory.Packages.props +++ b/dotnet/Directory.Packages.props @@ -86,7 +86,7 @@ - + diff --git a/dotnet/nuget.config b/dotnet/nuget.config index 7159fcd04c36..9ad18d1ddc85 100644 --- a/dotnet/nuget.config +++ b/dotnet/nuget.config @@ -4,9 +4,13 @@ + + + + diff --git a/dotnet/src/Connectors/Connectors.Memory.AzureCosmosDBNoSQL/AzureCosmosDBNoSQLConfig.cs b/dotnet/src/Connectors/Connectors.Memory.AzureCosmosDBNoSQL/AzureCosmosDBNoSQLConfig.cs index 2db29adf968f..8c43049fce31 100644 --- a/dotnet/src/Connectors/Connectors.Memory.AzureCosmosDBNoSQL/AzureCosmosDBNoSQLConfig.cs +++ b/dotnet/src/Connectors/Connectors.Memory.AzureCosmosDBNoSQL/AzureCosmosDBNoSQLConfig.cs @@ -1,5 +1,6 @@ // Copyright (c) Microsoft. All rights reserved. +using Microsoft.Azure.Cosmos; using Microsoft.SemanticKernel.Http; namespace Microsoft.SemanticKernel.Connectors.AzureCosmosDBNoSQL; @@ -14,11 +15,6 @@ public class AzureCosmosDBNoSQLConfig /// public string ApplicationName { get; set; } - /// - /// Index name for the embedding - /// - public string IndexName { get; set; } - /// /// Kind: Type of vector index to create. /// Possible options are: @@ -26,15 +22,7 @@ public class AzureCosmosDBNoSQLConfig /// - vector-hnsw: available as a preview feature only, /// to enable visit https://learn.microsoft.com/azure/azure-resource-manager/management/preview-features /// - public AzureCosmosDBVectorSearchType Kind { get; set; } - - /// - /// NumLists: This integer is the number of clusters that the inverted file (IVF) index uses to group the vector data. - /// We recommend that numLists is set to documentCount/1000 for up to 1 million documents and to sqrt(documentCount) - /// for more than 1 million documents. Using a numLists value of 1 is akin to performing brute-force search, which has - /// limited performance. - /// - public int NumLists { get; set; } + public VectorIndexType Kind { get; set; } /// /// Number of dimensions for vector similarity. The maximum number of supported dimensions is 2000. @@ -42,32 +30,9 @@ public class AzureCosmosDBNoSQLConfig public int Dimensions { get; set; } /// - /// Similarity: Similarity metric to use with the IVF index. - /// Possible options are: - /// - COS (cosine distance), - /// - L2 (Euclidean distance), and - /// - IP (inner product). - /// - public AzureCosmosDBSimilarityType Similarity { get; set; } - - /// - /// NumberOfConnections: The max number of connections per layer (16 by default, minimum value is 2, maximum value is - /// 100). Higher m is suitable for datasets with high dimensionality and/or high accuracy requirements. - /// - public int NumberOfConnections { get; set; } - - /// - /// EfConstruction: the size of the dynamic candidate list for constructing the graph (64 by default, minimum value is 4, - /// maximum value is 1000). Higher ef_construction will result in better index quality and higher accuracy, but it will - /// also increase the time required to build the index. EfConstruction has to be at least 2 * m - /// - public int EfConstruction { get; set; } - - /// - /// EfSearch: The size of the dynamic candidate list for search (40 by default). A higher value provides better recall at - /// the cost of speed. + /// Similarity: Distance function to use for the index. /// - public int EfSearch { get; set; } + public DistanceFunction DistanceFunction { get; set; } /// /// Initialize the AzureCosmosDBNoSQLConfig with default values @@ -75,12 +40,7 @@ public class AzureCosmosDBNoSQLConfig public AzureCosmosDBNoSQLConfig() { this.ApplicationName = HttpHeaderConstant.Values.UserAgent; - this.IndexName = "default_index"; - this.Kind = AzureCosmosDBVectorSearchType.VectorHNSW; - this.NumLists = 1; - this.Similarity = AzureCosmosDBSimilarityType.Cosine; - this.NumberOfConnections = 16; - this.EfConstruction = 64; - this.EfSearch = 40; + this.Kind = VectorIndexType.QuantizedFlat; + this.DistanceFunction = DistanceFunction.Cosine; } } diff --git a/dotnet/src/Connectors/Connectors.Memory.AzureCosmosDBNoSQL/AzureCosmosDBNoSQLMemoryRecord.cs b/dotnet/src/Connectors/Connectors.Memory.AzureCosmosDBNoSQL/AzureCosmosDBNoSQLMemoryRecord.cs deleted file mode 100644 index 29caf1ccd5f8..000000000000 --- a/dotnet/src/Connectors/Connectors.Memory.AzureCosmosDBNoSQL/AzureCosmosDBNoSQLMemoryRecord.cs +++ /dev/null @@ -1,89 +0,0 @@ -// Copyright (c) Microsoft. All rights reserved. - -using System; -using System.Linq; -using Microsoft.SemanticKernel.Memory; -using NoSQL.Bson; -using NoSQL.Bson.Serialization; -using NoSQL.Bson.Serialization.Attributes; - -namespace Microsoft.SemanticKernel.Connectors.AzureCosmosDBNoSQL; - -/// -/// A NoSQL memory record. -/// -internal sealed class AzureCosmosDBNoSQLMemoryRecord -{ - /// - /// Unique identifier of the memory entry. - /// - [BsonId] - public string Id { get; set; } - - /// - /// Metadata associated with memory entity. - /// - [BsonElement("metadata")] - public AzureCosmosDBNoSQLMemoryRecordMetadata Metadata { get; set; } - - /// - /// Source content embedding. - /// -#pragma warning disable CA1819 // Properties should not return arrays - [BsonElement("embedding")] - public float[] Embedding { get; set; } -#pragma warning restore CA1819 // Properties should not return arrays - - /// - /// Optional timestamp. - /// - [BsonElement("timestamp")] - [BsonDateTimeOptions(Kind = DateTimeKind.Utc, Representation = BsonType.DateTime)] - public DateTime? Timestamp { get; set; } - - /// - /// Initializes a new instance of the class. - /// - /// Instance to copy values from. - public AzureCosmosDBNoSQLMemoryRecord(MemoryRecord memoryRecord) - { - this.Id = memoryRecord.Key; - this.Metadata = new AzureCosmosDBNoSQLMemoryRecordMetadata(memoryRecord.Metadata); - this.Embedding = memoryRecord.Embedding.ToArray(); - this.Timestamp = memoryRecord.Timestamp?.UtcDateTime; - } - - /// - /// Returns mapped . - /// - public static MemoryRecord ToMemoryRecord(BsonDocument doc, bool withEmbedding) - { - return new( - BsonSerializer - .Deserialize( - doc["metadata"].AsBsonDocument - ) - .ToMemoryRecordMetadata(), - withEmbedding - ? doc["embedding"].AsBsonArray.Select(x => (float)x.AsDouble).ToArray() - : null, - doc["_id"].AsString, - doc["timestamp"]?.ToUniversalTime() - ); - - // return result; - } - - /// - /// Returns mapped . - /// - public MemoryRecord ToMemoryRecord(bool withEmbedding) - { - return new( - this.Metadata.ToMemoryRecordMetadata(), - withEmbedding ? this.Embedding : null, - this.Id, - this.Timestamp?.ToLocalTime() - ); - } -} diff --git a/dotnet/src/Connectors/Connectors.Memory.AzureCosmosDBNoSQL/AzureCosmosDBNoSQLMemoryRecordMetadata.cs b/dotnet/src/Connectors/Connectors.Memory.AzureCosmosDBNoSQL/AzureCosmosDBNoSQLMemoryRecordMetadata.cs deleted file mode 100644 index 13f9874c144f..000000000000 --- a/dotnet/src/Connectors/Connectors.Memory.AzureCosmosDBNoSQL/AzureCosmosDBNoSQLMemoryRecordMetadata.cs +++ /dev/null @@ -1,82 +0,0 @@ -// Copyright (c) Microsoft. All rights reserved. - -using Microsoft.SemanticKernel.Memory; -using NoSQL.Bson.Serialization.Attributes; - -namespace Microsoft.SemanticKernel.Connectors.AzureCosmosDBNoSQL; - -/// -/// A NoSQL memory record metadata. -/// -#pragma warning disable CA1815 // Override equals and operator equals on value types -internal struct AzureCosmosDBNoSQLMemoryRecordMetadata -#pragma warning restore CA1815 // Override equals and operator equals on value types -{ - /// - /// Whether the source data used to calculate embeddings are stored in the local - /// storage provider or is available through and external service, such as web site, MS Graph, etc. - /// - [BsonElement("isReference")] - public bool IsReference { get; set; } - - /// - /// A value used to understand which external service owns the data, to avoid storing the information - /// inside the URI. E.g. this could be "MSTeams", "WebSite", "GitHub", etc. - /// - [BsonElement("externalSourceName")] - [BsonIgnoreIfDefault] - public string ExternalSourceName { get; set; } - - /// - /// Unique identifier. The format of the value is domain specific, so it can be a URL, a GUID, etc. - /// - [BsonId] - public string Id { get; set; } - - /// - /// Optional title describing the content. Note: the title is not indexed. - /// - [BsonElement("description")] - [BsonIgnoreIfDefault] - public string Description { get; set; } - - /// - /// Source text, available only when the memory is not an external source. - /// - [BsonElement("text")] - [BsonIgnoreIfDefault] - public string Text { get; set; } - - /// - /// Field for saving custom metadata with a memory. - /// - [BsonElement("additionalMetadata")] - [BsonIgnoreIfDefault] - public string AdditionalMetadata { get; set; } - - /// - /// Initializes a new instance of structure. - /// - public AzureCosmosDBNoSQLMemoryRecordMetadata(MemoryRecordMetadata memoryRecordMetadata) - { - this.IsReference = memoryRecordMetadata.IsReference; - this.ExternalSourceName = memoryRecordMetadata.ExternalSourceName; - this.Id = memoryRecordMetadata.Id; - this.Description = memoryRecordMetadata.Description; - this.Text = memoryRecordMetadata.Text; - this.AdditionalMetadata = memoryRecordMetadata.AdditionalMetadata; - } - - /// - /// Returns mapped . - /// - public MemoryRecordMetadata ToMemoryRecordMetadata() => - new( - this.IsReference, - this.ExternalSourceName, - this.Id, - this.Description, - this.Text, - this.AdditionalMetadata - ); -} diff --git a/dotnet/src/Connectors/Connectors.Memory.AzureCosmosDBNoSQL/AzureCosmosDBSimilarityType.cs b/dotnet/src/Connectors/Connectors.Memory.AzureCosmosDBNoSQL/AzureCosmosDBSimilarityType.cs deleted file mode 100644 index f51c5b626fee..000000000000 --- a/dotnet/src/Connectors/Connectors.Memory.AzureCosmosDBNoSQL/AzureCosmosDBSimilarityType.cs +++ /dev/null @@ -1,30 +0,0 @@ -// Copyright (c) Microsoft. All rights reserved. - -using System.Text.Json.Serialization; - -// ReSharper disable InconsistentNaming -namespace Microsoft.SemanticKernel.Connectors.AzureCosmosDBNoSQL; - -/// -/// Similarity metric to use with the index. Possible options are COS (cosine distance), L2 (Euclidean distance), and IP (inner product). -/// -public enum AzureCosmosDBSimilarityType -{ - /// - /// Cosine similarity - /// - [JsonPropertyName("COS")] - Cosine, - - /// - /// Inner Product similarity - /// - [JsonPropertyName("IP")] - InnerProduct, - - /// - /// Euclidian similarity - /// - [JsonPropertyName("L2")] - Euclidian -} From 19ebe03824ee8c77898141fb6453a1be329af6ec Mon Sep 17 00:00:00 2001 From: Kevin Pilch Date: Wed, 1 May 2024 13:04:15 -0700 Subject: [PATCH 03/27] More WIP --- .../AzureCosmosDBNoSQLConfig.cs | 46 ------------------- .../AzureCosmosDBNoSQLMemoryStore.cs | 14 +++--- 2 files changed, 7 insertions(+), 53 deletions(-) delete mode 100644 dotnet/src/Connectors/Connectors.Memory.AzureCosmosDBNoSQL/AzureCosmosDBNoSQLConfig.cs diff --git a/dotnet/src/Connectors/Connectors.Memory.AzureCosmosDBNoSQL/AzureCosmosDBNoSQLConfig.cs b/dotnet/src/Connectors/Connectors.Memory.AzureCosmosDBNoSQL/AzureCosmosDBNoSQLConfig.cs deleted file mode 100644 index 8c43049fce31..000000000000 --- a/dotnet/src/Connectors/Connectors.Memory.AzureCosmosDBNoSQL/AzureCosmosDBNoSQLConfig.cs +++ /dev/null @@ -1,46 +0,0 @@ -// Copyright (c) Microsoft. All rights reserved. - -using Microsoft.Azure.Cosmos; -using Microsoft.SemanticKernel.Http; - -namespace Microsoft.SemanticKernel.Connectors.AzureCosmosDBNoSQL; - -/// -/// Get more details about Azure Cosmos DB and these configs https://learn.microsoft.com/azure/cosmos-db/ -/// -public class AzureCosmosDBNoSQLConfig -{ - /// - /// Application name for the client for tracking and logging - /// - public string ApplicationName { get; set; } - - /// - /// Kind: Type of vector index to create. - /// Possible options are: - /// - vector-ivf - /// - vector-hnsw: available as a preview feature only, - /// to enable visit https://learn.microsoft.com/azure/azure-resource-manager/management/preview-features - /// - public VectorIndexType Kind { get; set; } - - /// - /// Number of dimensions for vector similarity. The maximum number of supported dimensions is 2000. - /// - public int Dimensions { get; set; } - - /// - /// Similarity: Distance function to use for the index. - /// - public DistanceFunction DistanceFunction { get; set; } - - /// - /// Initialize the AzureCosmosDBNoSQLConfig with default values - /// - public AzureCosmosDBNoSQLConfig() - { - this.ApplicationName = HttpHeaderConstant.Values.UserAgent; - this.Kind = VectorIndexType.QuantizedFlat; - this.DistanceFunction = DistanceFunction.Cosine; - } -} diff --git a/dotnet/src/Connectors/Connectors.Memory.AzureCosmosDBNoSQL/AzureCosmosDBNoSQLMemoryStore.cs b/dotnet/src/Connectors/Connectors.Memory.AzureCosmosDBNoSQL/AzureCosmosDBNoSQLMemoryStore.cs index 6afea189c9ba..1013f8bec996 100644 --- a/dotnet/src/Connectors/Connectors.Memory.AzureCosmosDBNoSQL/AzureCosmosDBNoSQLMemoryStore.cs +++ b/dotnet/src/Connectors/Connectors.Memory.AzureCosmosDBNoSQL/AzureCosmosDBNoSQLMemoryStore.cs @@ -18,7 +18,8 @@ namespace Microsoft.SemanticKernel.Connectors.AzureCosmosDBNoSQL; public class AzureCosmosDBNoSQLMemoryStore : IMemoryStore, IDisposable { private readonly CosmosClient _cosmosClient; - private readonly AzureCosmosDBNoSQLConfig _config; + private readonly Embedding _embedding; + private readonly IndexingPolicy _indexingPolicy; /// /// Initiates a AzureCosmosDBNoSQLMemoryStore instance using a Azure Cosmos DB connection string @@ -30,14 +31,13 @@ public class AzureCosmosDBNoSQLMemoryStore : IMemoryStore, IDisposable public AzureCosmosDBNoSQLMemoryStore( string connectionString, string databaseName, - AzureCosmosDBNoSQLConfig config + Embedding embedding, + IndexingPolicy indexingPolicy, + string applicationName, ) { - MongoClientSettings settings = MongoClientSettings.FromConnectionString(connectionString); - this._config = config; - settings.ApplicationName = this._config.ApplicationName; - this._mongoClient = new MongoClient(settings); - this._mongoDatabase = this._mongoClient.GetDatabase(databaseName); + this._cosmosClient = new CosmosClient(connectionString); + this._databaseName = databaseName; } /// From 1c09ecc990840ae721c0e8f5769180d92ae79955 Mon Sep 17 00:00:00 2001 From: Kevin Pilch Date: Wed, 1 May 2024 15:02:53 -0700 Subject: [PATCH 04/27] Something that compiles --- .../AzureCosmosDBNoSQLMemoryStore.cs | 252 ++++++++---------- 1 file changed, 117 insertions(+), 135 deletions(-) diff --git a/dotnet/src/Connectors/Connectors.Memory.AzureCosmosDBNoSQL/AzureCosmosDBNoSQLMemoryStore.cs b/dotnet/src/Connectors/Connectors.Memory.AzureCosmosDBNoSQL/AzureCosmosDBNoSQLMemoryStore.cs index 1013f8bec996..9a3d7cbe6e4d 100644 --- a/dotnet/src/Connectors/Connectors.Memory.AzureCosmosDBNoSQL/AzureCosmosDBNoSQLMemoryStore.cs +++ b/dotnet/src/Connectors/Connectors.Memory.AzureCosmosDBNoSQL/AzureCosmosDBNoSQLMemoryStore.cs @@ -7,6 +7,7 @@ using System.Threading; using System.Threading.Tasks; using Microsoft.Azure.Cosmos; +using Microsoft.SemanticKernel.Http; using Microsoft.SemanticKernel.Memory; namespace Microsoft.SemanticKernel.Connectors.AzureCosmosDBNoSQL; @@ -18,8 +19,9 @@ namespace Microsoft.SemanticKernel.Connectors.AzureCosmosDBNoSQL; public class AzureCosmosDBNoSQLMemoryStore : IMemoryStore, IDisposable { private readonly CosmosClient _cosmosClient; - private readonly Embedding _embedding; + private readonly VectorEmbeddingPolicy _vectorEmbeddingPolicy; private readonly IndexingPolicy _indexingPolicy; + private readonly string _databaseName; /// /// Initiates a AzureCosmosDBNoSQLMemoryStore instance using a Azure Cosmos DB connection string @@ -27,17 +29,24 @@ public class AzureCosmosDBNoSQLMemoryStore : IMemoryStore, IDisposable /// /// Connection string required to connect to Azure Cosmos DB. /// The database name to connect to. - /// Azure CosmosDB NoSQL Config containing specific parameters for vector search. + /// Details about the to use. + /// The to use. + /// The application name to use in requests. public AzureCosmosDBNoSQLMemoryStore( string connectionString, string databaseName, - Embedding embedding, + VectorEmbeddingPolicy vectorEmbeddingPolicy, IndexingPolicy indexingPolicy, - string applicationName, - ) + string? applicationName = null) { - this._cosmosClient = new CosmosClient(connectionString); + var options = new CosmosClientOptions + { + ApplicationName = applicationName ?? HttpHeaderConstant.Values.UserAgent, + }; + this._cosmosClient = new CosmosClient(connectionString, options); this._databaseName = databaseName; + this._vectorEmbeddingPolicy = vectorEmbeddingPolicy; + this._indexingPolicy = indexingPolicy; } /// @@ -47,66 +56,50 @@ public AzureCosmosDBNoSQLMemoryStore( public AzureCosmosDBNoSQLMemoryStore( CosmosClient cosmosClient, string databaseName, - AzureCosmosDBNoSQLonfig config) + VectorEmbeddingPolicy vectorEmbeddingPolicy, + IndexingPolicy indexingPolicy, + string? applicationName = null) { - MongoClientSettings settings = cosmosClient.Settings; - this._config = config; - settings.ApplicationName = this._config.ApplicationName; - this._cosmosClient = new MongoClient(settings); - this._mongoDatabase = this._mongoClient.GetDatabase(databaseName); + this._cosmosClient = cosmosClient; + this._cosmosClient.ClientOptions.ApplicationName = applicationName; + this._databaseName = databaseName; + this._vectorEmbeddingPolicy = vectorEmbeddingPolicy; + this._indexingPolicy = indexingPolicy; } /// public async Task CreateCollectionAsync( string collectionName, - CancellationToken cancellationToken = default - ) + CancellationToken cancellationToken = default) { - await this - ._mongoDatabase.CreateCollectionAsync( - collectionName, - cancellationToken: cancellationToken - ) - .ConfigureAwait(false); - var indexes = await this.GetCollection(collectionName) - .Indexes.ListAsync(cancellationToken: cancellationToken) - .ConfigureAwait(false); + var databaseResponse = await this._cosmosClient.CreateDatabaseAsync( + this._databaseName, cancellationToken: cancellationToken).ConfigureAwait(false); - if (!indexes.ToList(cancellationToken: cancellationToken).Any(index => index["name"] == this._config.IndexName)) + var containerProperties = new ContainerProperties(collectionName, "id") { - var command = new BsonDocument(); - switch (this._config.Kind) - { - case AzureCosmosDBVectorSearchType.VectorIVF: - command = this.GetIndexDefinitionVectorIVF(collectionName); - break; - case AzureCosmosDBVectorSearchType.VectorHNSW: - command = this.GetIndexDefinitionVectorHNSW(collectionName); - break; - } - await this - ._mongoDatabase.RunCommandAsync( - command, - cancellationToken: cancellationToken - ) - .ConfigureAwait(false); - } + VectorEmbeddingPolicy = this._vectorEmbeddingPolicy, + IndexingPolicy = this._indexingPolicy, + }; + var containerResponse = databaseResponse.Database.CreateContainerAsync( + containerProperties, + cancellationToken: cancellationToken).ConfigureAwait(false); } /// public async IAsyncEnumerable GetCollectionsAsync( - [EnumeratorCancellation] CancellationToken cancellationToken = default - ) + [EnumeratorCancellation] CancellationToken cancellationToken = default) { - using var cursor = await this - ._mongoDatabase.ListCollectionNamesAsync(cancellationToken: cancellationToken) - .ConfigureAwait(false); + using var feedIterator = this. + _cosmosClient + .GetDatabase(this._databaseName) + .GetContainerQueryIterator("SELECT c.Id FROM c"); - while (await cursor.MoveNextAsync(cancellationToken).ConfigureAwait(false)) + while (feedIterator.HasMoreResults) { - foreach (var name in cursor.Current) + var next = await feedIterator.ReadNextAsync(cancellationToken).ConfigureAwait(false); + foreach (var containerName in next.Resource) { - yield return name; + yield return containerName; } } } @@ -114,8 +107,7 @@ public async IAsyncEnumerable GetCollectionsAsync( /// public async Task DoesCollectionExistAsync( string collectionName, - CancellationToken cancellationToken = default - ) + CancellationToken cancellationToken = default) { await foreach ( var existingCollectionName in this.GetCollectionsAsync(cancellationToken) @@ -133,25 +125,24 @@ var existingCollectionName in this.GetCollectionsAsync(cancellationToken) /// public Task DeleteCollectionAsync( string collectionName, - CancellationToken cancellationToken = default - ) => this._mongoDatabase.DropCollectionAsync(collectionName, cancellationToken); + CancellationToken cancellationToken = default) + { + return this._cosmosClient + .GetDatabase(this._databaseName) + .GetContainer(collectionName) + .DeleteContainerAsync(cancellationToken: cancellationToken); + } /// public async Task UpsertAsync( string collectionName, MemoryRecord record, - CancellationToken cancellationToken = default - ) + CancellationToken cancellationToken = default) { - var replaceOptions = new ReplaceOptions() { IsUpsert = true }; - - var result = await this.GetCollection(collectionName) - .ReplaceOneAsync( - GetFilterById(record.Metadata.Id), - new AzureCosmosDBNoSQLryRecord(record), - replaceOptions, - cancellationToken - ) + var result = await this._cosmosClient + .GetDatabase(this._databaseName) + .GetContainer(collectionName) + .UpsertItemAsync(record, cancellationToken: cancellationToken) .ConfigureAwait(false); return record.Key; @@ -161,8 +152,7 @@ public async Task UpsertAsync( public async IAsyncEnumerable UpsertBatchAsync( string collectionName, IEnumerable records, - [EnumeratorCancellation] CancellationToken cancellationToken = default - ) + [EnumeratorCancellation] CancellationToken cancellationToken = default) { foreach (var record in records) { @@ -176,19 +166,15 @@ public async IAsyncEnumerable UpsertBatchAsync( string collectionName, string key, bool withEmbedding = false, - CancellationToken cancellationToken = default - ) + CancellationToken cancellationToken = default) { - using var cursor = await this.GetCollection(collectionName) - .FindAsync(GetFilterById(key), null, cancellationToken) - .ConfigureAwait(false); - - var cosmosRecord = await cursor - .SingleOrDefaultAsync(cancellationToken) - .ConfigureAwait(false); - var result = cosmosRecord?.ToMemoryRecord(withEmbedding); + var result = await this._cosmosClient + .GetDatabase(this._databaseName) + .GetContainer(collectionName) + .ReadItemAsync(key, new PartitionKey(key), cancellationToken: cancellationToken) + .ConfigureAwait(false); - return result; + return result.Resource; } /// @@ -196,36 +182,49 @@ public async IAsyncEnumerable GetBatchAsync( string collectionName, IEnumerable keys, bool withEmbeddings = false, - [EnumeratorCancellation] CancellationToken cancellationToken = default - ) + [EnumeratorCancellation] CancellationToken cancellationToken = default) { - using var cursor = await this.GetCollection(collectionName) - .FindAsync(GetFilterByIds(keys), null, cancellationToken) + var items = keys.Select(k => (k, new PartitionKey(k))).ToList(); + var feedResponse = await this._cosmosClient + .GetDatabase(this._databaseName) + .GetContainer(collectionName) + .ReadManyItemsAsync(items, cancellationToken: cancellationToken) .ConfigureAwait(false); - while (await cursor.MoveNextAsync(cancellationToken).ConfigureAwait(false)) + foreach (var item in feedResponse.Resource) { - foreach (var cosmosRecord in cursor.Current) - { - yield return cosmosRecord.ToMemoryRecord(withEmbeddings); - } + yield return item; } } /// - public Task RemoveAsync( + public async Task RemoveAsync( string collectionName, string key, - CancellationToken cancellationToken = default - ) => this.GetCollection(collectionName).DeleteOneAsync(GetFilterById(key), cancellationToken); + CancellationToken cancellationToken = default) + { + var response = await this._cosmosClient + .GetDatabase(this._databaseName) + .GetContainer(collectionName) + .DeleteItemAsync(key, new PartitionKey(key), cancellationToken: cancellationToken) + .ConfigureAwait(false); + } /// - public Task RemoveBatchAsync( + public async Task RemoveBatchAsync( string collectionName, IEnumerable keys, - CancellationToken cancellationToken = default - ) => - this.GetCollection(collectionName).DeleteManyAsync(GetFilterByIds(keys), cancellationToken); + CancellationToken cancellationToken = default) + { + foreach (var key in keys) + { + var response = await this._cosmosClient + .GetDatabase(this._databaseName) + .GetContainer(collectionName) + .DeleteItemAsync(key, new PartitionKey(key), cancellationToken: cancellationToken) + .ConfigureAwait(false); + } + } /// public async Task<(MemoryRecord, double)?> GetNearestMatchAsync( @@ -233,29 +232,14 @@ public Task RemoveBatchAsync( ReadOnlyMemory embedding, double minRelevanceScore = 0, bool withEmbedding = false, - CancellationToken cancellationToken = default - ) + CancellationToken cancellationToken = default) { - using var cursor = await this.VectorSearchAsync( - 1, - embedding, - collectionName, - cancellationToken - ) - .ConfigureAwait(false); - var result = await cursor.FirstOrDefaultAsync(cancellationToken).ConfigureAwait(false); - // Access the similarityScore from the BSON document - double similarityScore = result.GetValue("similarityScore").AsDouble; - if (similarityScore < minRelevanceScore) + await foreach (var item in this.GetNearestMatchesAsync(collectionName, embedding, limit: 1, minRelevanceScore, withEmbedding, cancellationToken).ConfigureAwait(false)) { - return null; + return item; } - MemoryRecord memoryRecord = AzureCosmosDBNoSQLmoryRecord.ToMemoryRecord( - result["document"].AsBsonDocument, - withEmbedding - ); - return (memoryRecord, similarityScore); + return null; } /// @@ -265,32 +249,30 @@ public Task RemoveBatchAsync( int limit, double minRelevanceScore = 0, bool withEmbeddings = false, - [EnumeratorCancellation] CancellationToken cancellationToken = default - ) + [EnumeratorCancellation] CancellationToken cancellationToken = default) { - using var cursor = await this.VectorSearchAsync( - limit, - embedding, - collectionName, - cancellationToken - ) - .ConfigureAwait(false); - while (await cursor.MoveNextAsync(cancellationToken).ConfigureAwait(false)) + var queryDefinition = new QueryDefinition(""" + SELECT x, VectorSimilarity(x.Embedding, @embedding) AS SimilarityScore + FROM x + ORDER BY VectorSimilarity(x.Embedding, @embedding) + WHERE SimilarityScore >= @minRelevanceScore + TOP @limit + """); + queryDefinition.WithParameter("embedding", embedding); + queryDefinition.WithParameter("limit", limit); + queryDefinition.WithParameter("minRelevanceScore", minRelevanceScore); + + var feedIterator = this._cosmosClient + .GetDatabase(this._databaseName) + .GetContainer(collectionName) + .GetItemQueryIterator(queryDefinition); + + while (feedIterator.HasMoreResults) { - foreach (var doc in cursor.Current) + foreach (var memoryRecord in await feedIterator.ReadNextAsync(cancellationToken).ConfigureAwait(false)) { - // Access the similarityScore from the BSON document - var similarityScore = doc.GetValue("similarityScore").AsDouble; - if (similarityScore < minRelevanceScore) - { - continue; - } - - MemoryRecord memoryRecord = AzureCosmosDBNoSQLMemoryRecord.ToMemoryRecord( - doc["document"].AsBsonDocument, - withEmbeddings - ); - yield return (memoryRecord, similarityScore); + // TODO: Get the similarity score out too. + yield return (memoryRecord, 0.0); } } } From 5f7ad7ff57078df4477d4c6662ef8020b57fcc3b Mon Sep 17 00:00:00 2001 From: Kevin Pilch Date: Thu, 2 May 2024 08:30:59 -0700 Subject: [PATCH 05/27] Something sort of working --- .../AzureCosmosDBNoSQLMemoryStore.cs | 62 +++++--- .../CosmosSystemTextJSonSerializer.cs | 131 ++++++++++++++++ .../AzureCosmosDBNoSQLMemoryStoreTests.cs | 142 ++++++++++++++++++ ...ureCosmosDBNoSQLMemoryStoreTestsFixture.cs | 95 ++++++++++++ .../Memory/AzureCosmosDBNoSQL/DataHelper.cs | 36 +++++ .../IntegrationTests/IntegrationTests.csproj | 5 +- .../Memory/MemoryRecord.cs | 8 + 7 files changed, 457 insertions(+), 22 deletions(-) create mode 100644 dotnet/src/Connectors/Connectors.Memory.AzureCosmosDBNoSQL/CosmosSystemTextJSonSerializer.cs create mode 100644 dotnet/src/IntegrationTests/Connectors/Memory/AzureCosmosDBNoSQL/AzureCosmosDBNoSQLMemoryStoreTests.cs create mode 100644 dotnet/src/IntegrationTests/Connectors/Memory/AzureCosmosDBNoSQL/AzureCosmosDBNoSQLMemoryStoreTestsFixture.cs create mode 100644 dotnet/src/IntegrationTests/Connectors/Memory/AzureCosmosDBNoSQL/DataHelper.cs diff --git a/dotnet/src/Connectors/Connectors.Memory.AzureCosmosDBNoSQL/AzureCosmosDBNoSQLMemoryStore.cs b/dotnet/src/Connectors/Connectors.Memory.AzureCosmosDBNoSQL/AzureCosmosDBNoSQLMemoryStore.cs index 9a3d7cbe6e4d..318f9a87b08d 100644 --- a/dotnet/src/Connectors/Connectors.Memory.AzureCosmosDBNoSQL/AzureCosmosDBNoSQLMemoryStore.cs +++ b/dotnet/src/Connectors/Connectors.Memory.AzureCosmosDBNoSQL/AzureCosmosDBNoSQLMemoryStore.cs @@ -4,6 +4,7 @@ using System.Collections.Generic; using System.Linq; using System.Runtime.CompilerServices; +using System.Text.Json; using System.Threading; using System.Threading.Tasks; using Microsoft.Azure.Cosmos; @@ -42,6 +43,7 @@ public AzureCosmosDBNoSQLMemoryStore( var options = new CosmosClientOptions { ApplicationName = applicationName ?? HttpHeaderConstant.Values.UserAgent, + Serializer = new CosmosSystemTextJsonSerializer(JsonSerializerOptions.Default), }; this._cosmosClient = new CosmosClient(connectionString, options); this._databaseName = databaseName; @@ -72,15 +74,15 @@ public async Task CreateCollectionAsync( string collectionName, CancellationToken cancellationToken = default) { - var databaseResponse = await this._cosmosClient.CreateDatabaseAsync( + var databaseResponse = await this._cosmosClient.CreateDatabaseIfNotExistsAsync( this._databaseName, cancellationToken: cancellationToken).ConfigureAwait(false); - var containerProperties = new ContainerProperties(collectionName, "id") + var containerProperties = new ContainerProperties(collectionName, "/key") { VectorEmbeddingPolicy = this._vectorEmbeddingPolicy, IndexingPolicy = this._indexingPolicy, }; - var containerResponse = databaseResponse.Database.CreateContainerAsync( + var containerResponse = await databaseResponse.Database.CreateContainerIfNotExistsAsync( containerProperties, cancellationToken: cancellationToken).ConfigureAwait(false); } @@ -92,14 +94,14 @@ public async IAsyncEnumerable GetCollectionsAsync( using var feedIterator = this. _cosmosClient .GetDatabase(this._databaseName) - .GetContainerQueryIterator("SELECT c.Id FROM c"); + .GetContainerQueryIterator("SELECT VALUE(c.id) FROM c"); while (feedIterator.HasMoreResults) { var next = await feedIterator.ReadNextAsync(cancellationToken).ConfigureAwait(false); - foreach (var containerName in next.Resource) + foreach (var container in next.Resource) { - yield return containerName; + yield return container; } } } @@ -123,14 +125,15 @@ var existingCollectionName in this.GetCollectionsAsync(cancellationToken) } /// - public Task DeleteCollectionAsync( + public async Task DeleteCollectionAsync( string collectionName, CancellationToken cancellationToken = default) { - return this._cosmosClient + await this._cosmosClient .GetDatabase(this._databaseName) .GetContainer(collectionName) - .DeleteContainerAsync(cancellationToken: cancellationToken); + .DeleteContainerAsync(cancellationToken: cancellationToken) + .ConfigureAwait(false); } /// @@ -142,7 +145,7 @@ public async Task UpsertAsync( var result = await this._cosmosClient .GetDatabase(this._databaseName) .GetContainer(collectionName) - .UpsertItemAsync(record, cancellationToken: cancellationToken) + .UpsertItemAsync(record, new PartitionKey(record.Key), cancellationToken: cancellationToken) .ConfigureAwait(false); return record.Key; @@ -251,28 +254,28 @@ public async Task RemoveBatchAsync( bool withEmbeddings = false, [EnumeratorCancellation] CancellationToken cancellationToken = default) { + // It would be nice to "WHERE" on the similarity score, but alas. var queryDefinition = new QueryDefinition(""" - SELECT x, VectorSimilarity(x.Embedding, @embedding) AS SimilarityScore + SELECT TOP @limit x.id,x.key,x.metadata,x.timestamp,VectorDistance(x.embedding, @embedding) AS SimilarityScore FROM x - ORDER BY VectorSimilarity(x.Embedding, @embedding) - WHERE SimilarityScore >= @minRelevanceScore - TOP @limit + ORDER BY VectorDistance(x.embedding, @embedding) """); - queryDefinition.WithParameter("embedding", embedding); - queryDefinition.WithParameter("limit", limit); - queryDefinition.WithParameter("minRelevanceScore", minRelevanceScore); + queryDefinition.WithParameter("@limit", limit); + queryDefinition.WithParameter("@embedding", embedding); var feedIterator = this._cosmosClient .GetDatabase(this._databaseName) .GetContainer(collectionName) - .GetItemQueryIterator(queryDefinition); + .GetItemQueryIterator(queryDefinition); while (feedIterator.HasMoreResults) { foreach (var memoryRecord in await feedIterator.ReadNextAsync(cancellationToken).ConfigureAwait(false)) { - // TODO: Get the similarity score out too. - yield return (memoryRecord, 0.0); + if (memoryRecord.SimilarityScore >= minRelevanceScore) + { + yield return (memoryRecord, memoryRecord.SimilarityScore); + } } } } @@ -298,3 +301,22 @@ protected virtual void Dispose(bool disposing) } } } + +/// +/// Creates a new record with a similarity score. +/// +/// +/// +/// +/// +public class MemoryRecordWithSimilarityScore( + MemoryRecordMetadata metadata, + ReadOnlyMemory embedding, + string? key, + DateTimeOffset? timestamp = null) : MemoryRecord(metadata, embedding, key, timestamp) +{ + /// + /// The similarity score returned. + /// + public double SimilarityScore { get; set; } +} diff --git a/dotnet/src/Connectors/Connectors.Memory.AzureCosmosDBNoSQL/CosmosSystemTextJSonSerializer.cs b/dotnet/src/Connectors/Connectors.Memory.AzureCosmosDBNoSQL/CosmosSystemTextJSonSerializer.cs new file mode 100644 index 000000000000..eac8f93befe9 --- /dev/null +++ b/dotnet/src/Connectors/Connectors.Memory.AzureCosmosDBNoSQL/CosmosSystemTextJSonSerializer.cs @@ -0,0 +1,131 @@ +// Copyright (c) Microsoft. All rights reserved. + +// Taken from https://github.com/Azure/azure-cosmos-dotnet-v3/pull/4332 + +using System; +using System.IO; +using System.Reflection; +using System.Text.Json; +using System.Text.Json.Serialization; + +namespace Microsoft.Azure.Cosmos; + +/// +/// This class provides a default implementation of System.Text.Json Cosmos Linq Serializer. +/// +public class CosmosSystemTextJsonSerializer : CosmosLinqSerializer +{ + /// + /// A read-only instance of . + /// + private readonly JsonSerializerOptions _jsonSerializerOptions; + + /// + /// Creates an instance of + /// with the default values for the Cosmos SDK + /// + /// An instance of containing the json serialization options. + public CosmosSystemTextJsonSerializer( + JsonSerializerOptions jsonSerializerOptions) + { + this._jsonSerializerOptions = jsonSerializerOptions; + } + + /// + public override T FromStream(Stream stream) + { + if (stream == null) + { + throw new ArgumentNullException(nameof(stream)); + } + + if (stream.CanSeek && stream.Length == 0) + { + return default; + } + + if (typeof(Stream).IsAssignableFrom(typeof(T))) + { + return (T)(object)stream; + } + + using (stream) + { + using StreamReader reader = new(stream); + return JsonSerializer.Deserialize(reader.ReadToEnd(), this._jsonSerializerOptions); + } + } + + /// + public override Stream ToStream(T input) + { + MemoryStream streamPayload = new(); + using Utf8JsonWriter writer = new(streamPayload); + + JsonSerializer.Serialize( + writer: writer, + value: input, + options: this._jsonSerializerOptions); + + streamPayload.Position = 0; + return streamPayload; + } + + /// + /// Convert a MemberInfo to a string for use in LINQ query translation. + /// + /// Any MemberInfo used in the query. + /// A serialized representation of the member. + /// + /// Note that this is just a default implementation which handles the basic scenarios. Any passed in + /// here are not going to be reflected in SerializeMemberName(). For example, if customers passed in a JsonSerializerOption such as below + /// + /// + /// + /// This would not be honored by SerializeMemberName() unless it included special handling for this, for example. + /// + /// (true); + /// if (jsonExtensionDataAttribute != null) + /// { + /// return null; + /// } + /// JsonPropertyNameAttribute jsonPropertyNameAttribute = memberInfo.GetCustomAttribute(true); + /// if (!string.IsNullOrEmpty(jsonPropertyNameAttribute?.Name)) + /// { + /// return jsonPropertyNameAttribute.Name; + /// } + /// return System.Text.Json.JsonNamingPolicy.CamelCase.ConvertName(memberInfo.Name); + /// } + /// ]]> + /// + /// To handle such scenarios, please create a custom serializer which inherits from the and overrides the + /// SerializeMemberName to add any special handling. + /// + public override string SerializeMemberName(MemberInfo memberInfo) + { + JsonExtensionDataAttribute jsonExtensionDataAttribute = + memberInfo.GetCustomAttribute(true); + + if (jsonExtensionDataAttribute != null) + { + return null; + } + + JsonPropertyNameAttribute jsonPropertyNameAttribute = memberInfo.GetCustomAttribute(true); + + string memberName = !string.IsNullOrEmpty(jsonPropertyNameAttribute?.Name) + ? jsonPropertyNameAttribute.Name + : memberInfo.Name; + + return memberName; + } +} diff --git a/dotnet/src/IntegrationTests/Connectors/Memory/AzureCosmosDBNoSQL/AzureCosmosDBNoSQLMemoryStoreTests.cs b/dotnet/src/IntegrationTests/Connectors/Memory/AzureCosmosDBNoSQL/AzureCosmosDBNoSQLMemoryStoreTests.cs new file mode 100644 index 000000000000..ebe001119454 --- /dev/null +++ b/dotnet/src/IntegrationTests/Connectors/Memory/AzureCosmosDBNoSQL/AzureCosmosDBNoSQLMemoryStoreTests.cs @@ -0,0 +1,142 @@ +// Copyright (c) Microsoft. All rights reserved. + +using System; +using System.Linq; +using System.Threading.Tasks; +using Microsoft.SemanticKernel.Connectors.AzureCosmosDBNoSQL; +using Microsoft.SemanticKernel.Memory; +using MongoDB.Driver; +using Xunit; + +namespace SemanticKernel.IntegrationTests.Connectors.AzureCosmosDBNoSQL; + +/// +/// Integration tests of . +/// +public class AzureCosmosDBNoSQLMemoryStoreTests : IClassFixture +{ + private const string? SkipReason = null;//"Azure CosmosDB Account with Vector indexing enabled required"; + + private readonly AzureCosmosDBNoSQLMemoryStoreTestsFixture _fixture; + + public AzureCosmosDBNoSQLMemoryStoreTests(AzureCosmosDBNoSQLMemoryStoreTestsFixture fixture) + { + this._fixture = fixture; + } + + [Fact(Skip = SkipReason)] + public async Task ItCanCreateGetCheckAndDeleteCollectionAsync() + { + var collectionName = this._fixture.CollectionName; + var memoryStore = this._fixture.MemoryStore; + + await memoryStore.CreateCollectionAsync(collectionName); + var collectionNames = memoryStore.GetCollectionsAsync(); + + Assert.True(await collectionNames.ContainsAsync(collectionName)); + Assert.True(await memoryStore.DoesCollectionExistAsync(collectionName)); + + await memoryStore.DeleteCollectionAsync(collectionName); + Assert.False(await memoryStore.DoesCollectionExistAsync(collectionName)); + } + + [Theory(Skip = SkipReason)] + [InlineData(true)] + [InlineData(false)] + public async Task ItCanBatchUpsertGetRemoveAsync(bool withEmbeddings) + { + const int Count = 10; + var collectionName = this._fixture.CollectionName; + var memoryStore = this._fixture.MemoryStore; + var records = DataHelper.CreateBatchRecords(Count); + + await memoryStore.CreateCollectionAsync(collectionName); + var keys = await memoryStore.UpsertBatchAsync(collectionName, records).ToListAsync(); + var actualRecords = await memoryStore + .GetBatchAsync(collectionName, keys, withEmbeddings: withEmbeddings) + .ToListAsync(); + + Assert.NotNull(keys); + Assert.NotNull(actualRecords); + Assert.Equal(keys, actualRecords.Select(obj => obj.Key).ToList()); + Console.WriteLine(actualRecords); + + var actualRecordsOrdered = actualRecords.OrderBy(r => r.Key).ToArray(); + for (int i = 0; i < Count; i++) + { + AssertMemoryRecordEqual( + records[i], + actualRecordsOrdered[i], + assertEmbeddingEqual: withEmbeddings + ); + } + + await memoryStore.RemoveBatchAsync(collectionName, keys); + var ids = await memoryStore.GetBatchAsync(collectionName, keys).ToListAsync(); + Assert.Empty(ids); + } + + [Theory(Skip = SkipReason)] + [InlineData(1, false)] + [InlineData(1, true)] + [InlineData(5, false)] + [InlineData(8, false)] + public async Task ItCanGetNearestMatchesAsync(int limit, bool withEmbeddings) + { + var collectionName = this._fixture.CollectionName; + var memoryStore = this._fixture.MemoryStore; + var searchEmbedding = DataHelper.VectorSearchTestEmbedding; + var nearestMatchesExpected = DataHelper.VectorSearchExpectedResults; + + var nearestMatchesActual = await memoryStore + .GetNearestMatchesAsync( + collectionName, + searchEmbedding, + limit, + withEmbeddings: withEmbeddings + ) + .ToListAsync(); + + Assert.NotNull(nearestMatchesActual); + + for (int i = 0; i < limit; i++) + { + AssertMemoryRecordEqual( + nearestMatchesExpected[i], + nearestMatchesActual[i].Item1, + withEmbeddings + ); + } + } + + private static void AssertMemoryRecordEqual( + MemoryRecord expectedRecord, + MemoryRecord actualRecord, + bool assertEmbeddingEqual = true + ) + { + Assert.Equal(expectedRecord.Key, actualRecord.Key); + Assert.Equal(expectedRecord.Timestamp, actualRecord.Timestamp); + Assert.Equal(expectedRecord.Metadata.Id, actualRecord.Metadata.Id); + Assert.Equal(expectedRecord.Metadata.Text, actualRecord.Metadata.Text); + Assert.Equal(expectedRecord.Metadata.Description, actualRecord.Metadata.Description); + Assert.Equal( + expectedRecord.Metadata.AdditionalMetadata, + actualRecord.Metadata.AdditionalMetadata + ); + Assert.Equal(expectedRecord.Metadata.IsReference, actualRecord.Metadata.IsReference); + Assert.Equal( + expectedRecord.Metadata.ExternalSourceName, + actualRecord.Metadata.ExternalSourceName + ); + + if (assertEmbeddingEqual) + { + Assert.True(expectedRecord.Embedding.Span.SequenceEqual(actualRecord.Embedding.Span)); + } + else + { + Assert.True(actualRecord.Embedding.Span.IsEmpty); + } + } +} diff --git a/dotnet/src/IntegrationTests/Connectors/Memory/AzureCosmosDBNoSQL/AzureCosmosDBNoSQLMemoryStoreTestsFixture.cs b/dotnet/src/IntegrationTests/Connectors/Memory/AzureCosmosDBNoSQL/AzureCosmosDBNoSQLMemoryStoreTestsFixture.cs new file mode 100644 index 000000000000..976316384fdc --- /dev/null +++ b/dotnet/src/IntegrationTests/Connectors/Memory/AzureCosmosDBNoSQL/AzureCosmosDBNoSQLMemoryStoreTestsFixture.cs @@ -0,0 +1,95 @@ +// Copyright (c) Microsoft. All rights reserved. + +using System; +using System.Collections.ObjectModel; +using System.Linq; +using System.Threading.Tasks; +using Microsoft.Azure.Cosmos; +using Microsoft.Extensions.Configuration; +using Microsoft.SemanticKernel.Connectors.AzureCosmosDBNoSQL; +using MongoDB.Driver; +using Xunit; + +namespace SemanticKernel.IntegrationTests.Connectors.AzureCosmosDBNoSQL; + +public class AzureCosmosDBNoSQLMemoryStoreTestsFixture : IAsyncLifetime +{ + public AzureCosmosDBNoSQLMemoryStore MemoryStore { get; } + public string DatabaseName { get; } + public string CollectionName { get; } + + public AzureCosmosDBNoSQLMemoryStoreTestsFixture() + { + // Load Configuration + var configuration = new ConfigurationBuilder() + .AddJsonFile(path: "testsettings.json", optional: false, reloadOnChange: true) + .AddJsonFile( + path: "testsettings.development.json", + optional: false, + reloadOnChange: true + ) + .AddEnvironmentVariables() + .Build(); + + var connectionString = GetSetting(configuration, "ConnectionString"); + this.DatabaseName = "DotNetSKTestDB"; + this.CollectionName = "DotNetSKTestCollection"; + this.MemoryStore = new AzureCosmosDBNoSQLMemoryStore( + connectionString, + this.DatabaseName, + new VectorEmbeddingPolicy( + new Collection + { + new() + { + DataType = VectorDataType.Float32, + Dimensions = 3, + DistanceFunction = DistanceFunction.Cosine, + Path = "/embedding" + } + }), + new() + { + VectorIndexes = new Collection { + new() + { + Path = "/embedding", + Type = VectorIndexType.Flat, + }, + }, + } + ); + } + + public async Task InitializeAsync() + { + await this.MemoryStore.CreateCollectionAsync(this.CollectionName); + await this + .MemoryStore.UpsertBatchAsync(this.CollectionName, DataHelper.VectorSearchTestRecords) + .ToListAsync(); + } + + public async Task DisposeAsync() + { + try + { + await this.MemoryStore.DeleteCollectionAsync(this.CollectionName); + } + catch (CosmosException ex) when (ex.StatusCode == System.Net.HttpStatusCode.NotFound) + { + // It's okay if a test already deleted the collection. + } + this.MemoryStore.Dispose(); + } + + private static string GetSetting(IConfigurationRoot configuration, string settingName) + { + var settingValue = configuration[$"AzureCosmosDB:{settingName}"]; + if (string.IsNullOrWhiteSpace(settingValue)) + { + throw new ArgumentNullException($"{settingValue} string is not configured"); + } + + return settingValue; + } +} diff --git a/dotnet/src/IntegrationTests/Connectors/Memory/AzureCosmosDBNoSQL/DataHelper.cs b/dotnet/src/IntegrationTests/Connectors/Memory/AzureCosmosDBNoSQL/DataHelper.cs new file mode 100644 index 000000000000..30c3cef7ad35 --- /dev/null +++ b/dotnet/src/IntegrationTests/Connectors/Memory/AzureCosmosDBNoSQL/DataHelper.cs @@ -0,0 +1,36 @@ +// Copyright (c) Microsoft. All rights reserved. + +using System; +using System.Linq; +using System.Numerics.Tensors; +using Microsoft.SemanticKernel.Memory; + +namespace SemanticKernel.IntegrationTests.Connectors.AzureCosmosDBNoSQL; + +internal static class DataHelper +{ + public static MemoryRecord[] VectorSearchExpectedResults { get; } + public static MemoryRecord[] VectorSearchTestRecords { get; } + public static float[] VectorSearchTestEmbedding { get; } + + static DataHelper() + { + VectorSearchTestRecords = CreateBatchRecords(8); + VectorSearchTestEmbedding = new[] { 1, 0.699f, 0.701f }; + VectorSearchExpectedResults = VectorSearchTestRecords + .OrderByDescending(r => TensorPrimitives.CosineSimilarity(r.Embedding.Span, VectorSearchTestEmbedding)) + .ToArray(); + } + + public static MemoryRecord[] CreateBatchRecords(int count) => + Enumerable + .Range(0, count) + .Select(i => MemoryRecord.LocalRecord( + id: $"test_{i}", + text: $"text_{i}", + description: $"description_{i}", + embedding: new[] { 1, (float)Math.Cos(Math.PI * i / count), (float)Math.Sin(Math.PI * i / count) }, + key: $"test_{i}", + timestamp: DateTime.Now)) + .ToArray(); +} diff --git a/dotnet/src/IntegrationTests/IntegrationTests.csproj b/dotnet/src/IntegrationTests/IntegrationTests.csproj index ac04125bc9fa..8f6e3a652d43 100644 --- a/dotnet/src/IntegrationTests/IntegrationTests.csproj +++ b/dotnet/src/IntegrationTests/IntegrationTests.csproj @@ -53,16 +53,17 @@ - + + + - diff --git a/dotnet/src/SemanticKernel.Abstractions/Memory/MemoryRecord.cs b/dotnet/src/SemanticKernel.Abstractions/Memory/MemoryRecord.cs index 1a95ee13dbe0..a5a78d3b59b8 100644 --- a/dotnet/src/SemanticKernel.Abstractions/Memory/MemoryRecord.cs +++ b/dotnet/src/SemanticKernel.Abstractions/Memory/MemoryRecord.cs @@ -27,6 +27,13 @@ public class MemoryRecord : DataEntryBase [JsonPropertyName("metadata")] public MemoryRecordMetadata Metadata { get; } + /// + /// Unique identifier. The format of the value is domain specific, so it can be a URL, a GUID, etc. + /// + [JsonInclude] + [JsonPropertyName("id")] + public string Id { get; } + /// /// Constructor, use or /// @@ -38,6 +45,7 @@ public MemoryRecord( DateTimeOffset? timestamp = null) : base(key, timestamp) { this.Metadata = metadata; + this.Id = metadata.Id; this.Embedding = embedding; } From 7f6bef3dcdcf46556dedd0f1291e79e9e94986bb Mon Sep 17 00:00:00 2001 From: Kevin Pilch Date: Thu, 2 May 2024 11:04:23 -0700 Subject: [PATCH 06/27] Fix flaky DateTime, and conditionally include embeddings --- .../AzureCosmosDBNoSQLMemoryStore.cs | 19 ++++++++++++++----- .../Memory/AzureCosmosDBNoSQL/DataHelper.cs | 2 +- 2 files changed, 15 insertions(+), 6 deletions(-) diff --git a/dotnet/src/Connectors/Connectors.Memory.AzureCosmosDBNoSQL/AzureCosmosDBNoSQLMemoryStore.cs b/dotnet/src/Connectors/Connectors.Memory.AzureCosmosDBNoSQL/AzureCosmosDBNoSQLMemoryStore.cs index 318f9a87b08d..34ab60c7a107 100644 --- a/dotnet/src/Connectors/Connectors.Memory.AzureCosmosDBNoSQL/AzureCosmosDBNoSQLMemoryStore.cs +++ b/dotnet/src/Connectors/Connectors.Memory.AzureCosmosDBNoSQL/AzureCosmosDBNoSQLMemoryStore.cs @@ -99,9 +99,9 @@ public async IAsyncEnumerable GetCollectionsAsync( while (feedIterator.HasMoreResults) { var next = await feedIterator.ReadNextAsync(cancellationToken).ConfigureAwait(false); - foreach (var container in next.Resource) + foreach (var containerName in next.Resource) { - yield return container; + yield return containerName; } } } @@ -196,7 +196,16 @@ public async IAsyncEnumerable GetBatchAsync( foreach (var item in feedResponse.Resource) { - yield return item; + if (withEmbeddings) + { + yield return item; + } + else + { + // TODO: Consider changing this into a select that doesn't return the embeddings. + // Is that actually better? RU consumption of query, vs ReadMany and transmission of larger docs. + yield return new MemoryRecord(item.Metadata, null, item.Key, item.Timestamp); + } } } @@ -255,8 +264,8 @@ public async Task RemoveBatchAsync( [EnumeratorCancellation] CancellationToken cancellationToken = default) { // It would be nice to "WHERE" on the similarity score, but alas. - var queryDefinition = new QueryDefinition(""" - SELECT TOP @limit x.id,x.key,x.metadata,x.timestamp,VectorDistance(x.embedding, @embedding) AS SimilarityScore + var queryDefinition = new QueryDefinition($""" + SELECT TOP @limit x.id,x.key,x.metadata,x.timestamp{(withEmbeddings ? ",x.embedding" : "")},VectorDistance(x.embedding, @embedding) AS SimilarityScore FROM x ORDER BY VectorDistance(x.embedding, @embedding) """); diff --git a/dotnet/src/IntegrationTests/Connectors/Memory/AzureCosmosDBNoSQL/DataHelper.cs b/dotnet/src/IntegrationTests/Connectors/Memory/AzureCosmosDBNoSQL/DataHelper.cs index 30c3cef7ad35..476142430d6a 100644 --- a/dotnet/src/IntegrationTests/Connectors/Memory/AzureCosmosDBNoSQL/DataHelper.cs +++ b/dotnet/src/IntegrationTests/Connectors/Memory/AzureCosmosDBNoSQL/DataHelper.cs @@ -31,6 +31,6 @@ public static MemoryRecord[] CreateBatchRecords(int count) => description: $"description_{i}", embedding: new[] { 1, (float)Math.Cos(Math.PI * i / count), (float)Math.Sin(Math.PI * i / count) }, key: $"test_{i}", - timestamp: DateTime.Now)) + timestamp: DateTimeOffset.Now)) .ToArray(); } From 4b449fdc93bd349f1f6a2fc9bd34f1fabe8cfee9 Mon Sep 17 00:00:00 2001 From: Kevin Pilch Date: Thu, 2 May 2024 12:58:41 -0700 Subject: [PATCH 07/27] Working with client side TOP --- .../AzureCosmosDBNoSQLMemoryStore.cs | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/dotnet/src/Connectors/Connectors.Memory.AzureCosmosDBNoSQL/AzureCosmosDBNoSQLMemoryStore.cs b/dotnet/src/Connectors/Connectors.Memory.AzureCosmosDBNoSQL/AzureCosmosDBNoSQLMemoryStore.cs index 34ab60c7a107..7c3b47972981 100644 --- a/dotnet/src/Connectors/Connectors.Memory.AzureCosmosDBNoSQL/AzureCosmosDBNoSQLMemoryStore.cs +++ b/dotnet/src/Connectors/Connectors.Memory.AzureCosmosDBNoSQL/AzureCosmosDBNoSQLMemoryStore.cs @@ -2,14 +2,17 @@ using System; using System.Collections.Generic; +using System.IO; using System.Linq; using System.Runtime.CompilerServices; +using System.Text; using System.Text.Json; using System.Threading; using System.Threading.Tasks; using Microsoft.Azure.Cosmos; using Microsoft.SemanticKernel.Http; using Microsoft.SemanticKernel.Memory; +using Microsoft.SemanticKernel.Text; namespace Microsoft.SemanticKernel.Connectors.AzureCosmosDBNoSQL; @@ -263,20 +266,23 @@ public async Task RemoveBatchAsync( bool withEmbeddings = false, [EnumeratorCancellation] CancellationToken cancellationToken = default) { - // It would be nice to "WHERE" on the similarity score, but alas. + // It would be nice to "WHERE" on the similarity score to stay above the `minRelevanceScore`, but alas + // queries don't support that. + // TODO: Change this to use a `TOP @limit` instead of stopping client side. var queryDefinition = new QueryDefinition($""" - SELECT TOP @limit x.id,x.key,x.metadata,x.timestamp{(withEmbeddings ? ",x.embedding" : "")},VectorDistance(x.embedding, @embedding) AS SimilarityScore + SELECT x.id,x.key,x.metadata,x.timestamp{(withEmbeddings ? ",x.embedding" : "")},VectorDistance(x.embedding, @embedding) AS SimilarityScore FROM x ORDER BY VectorDistance(x.embedding, @embedding) """); - queryDefinition.WithParameter("@limit", limit); queryDefinition.WithParameter("@embedding", embedding); + queryDefinition.WithParameter("@limit", limit); var feedIterator = this._cosmosClient .GetDatabase(this._databaseName) .GetContainer(collectionName) .GetItemQueryIterator(queryDefinition); + var count = 0; while (feedIterator.HasMoreResults) { foreach (var memoryRecord in await feedIterator.ReadNextAsync(cancellationToken).ConfigureAwait(false)) @@ -284,6 +290,11 @@ ORDER BY VectorDistance(x.embedding, @embedding) if (memoryRecord.SimilarityScore >= minRelevanceScore) { yield return (memoryRecord, memoryRecord.SimilarityScore); + count++; + if (count == limit) + { + yield break; + } } } } From 6b58ac47eb23cda731d7e017f0c96153d9dea054 Mon Sep 17 00:00:00 2001 From: Kevin Pilch Date: Thu, 2 May 2024 15:43:55 -0700 Subject: [PATCH 08/27] Back to query --- .../AzureCosmosDBNoSQLMemoryStore.cs | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/dotnet/src/Connectors/Connectors.Memory.AzureCosmosDBNoSQL/AzureCosmosDBNoSQLMemoryStore.cs b/dotnet/src/Connectors/Connectors.Memory.AzureCosmosDBNoSQL/AzureCosmosDBNoSQLMemoryStore.cs index 7c3b47972981..5cafd3e2e56b 100644 --- a/dotnet/src/Connectors/Connectors.Memory.AzureCosmosDBNoSQL/AzureCosmosDBNoSQLMemoryStore.cs +++ b/dotnet/src/Connectors/Connectors.Memory.AzureCosmosDBNoSQL/AzureCosmosDBNoSQLMemoryStore.cs @@ -2,6 +2,7 @@ using System; using System.Collections.Generic; +using System.Diagnostics; using System.IO; using System.Linq; using System.Runtime.CompilerServices; @@ -268,9 +269,8 @@ public async Task RemoveBatchAsync( { // It would be nice to "WHERE" on the similarity score to stay above the `minRelevanceScore`, but alas // queries don't support that. - // TODO: Change this to use a `TOP @limit` instead of stopping client side. var queryDefinition = new QueryDefinition($""" - SELECT x.id,x.key,x.metadata,x.timestamp{(withEmbeddings ? ",x.embedding" : "")},VectorDistance(x.embedding, @embedding) AS SimilarityScore + SELECT TOP @limit x.id,x.key,x.metadata,x.timestamp,{(withEmbeddings ? "x.embedding," : "")}VectorDistance(x.embedding, @embedding) AS SimilarityScore FROM x ORDER BY VectorDistance(x.embedding, @embedding) """); @@ -282,7 +282,6 @@ ORDER BY VectorDistance(x.embedding, @embedding) .GetContainer(collectionName) .GetItemQueryIterator(queryDefinition); - var count = 0; while (feedIterator.HasMoreResults) { foreach (var memoryRecord in await feedIterator.ReadNextAsync(cancellationToken).ConfigureAwait(false)) @@ -290,11 +289,6 @@ ORDER BY VectorDistance(x.embedding, @embedding) if (memoryRecord.SimilarityScore >= minRelevanceScore) { yield return (memoryRecord, memoryRecord.SimilarityScore); - count++; - if (count == limit) - { - yield break; - } } } } @@ -329,6 +323,7 @@ protected virtual void Dispose(bool disposing) /// /// /// +[DebuggerDisplay("{GetDebuggerDisplay()}")] public class MemoryRecordWithSimilarityScore( MemoryRecordMetadata metadata, ReadOnlyMemory embedding, @@ -339,4 +334,9 @@ public class MemoryRecordWithSimilarityScore( /// The similarity score returned. /// public double SimilarityScore { get; set; } + + private string GetDebuggerDisplay() + { + return $"{this.Key} - {this.SimilarityScore}"; + } } From 0a1a1dabb352534478061aab0dfb4e6a35455e4a Mon Sep 17 00:00:00 2001 From: Kevin Pilch Date: Thu, 2 May 2024 15:54:12 -0700 Subject: [PATCH 09/27] Check length too --- .../AzureCosmosDBNoSQL/AzureCosmosDBNoSQLMemoryStoreTests.cs | 1 + 1 file changed, 1 insertion(+) diff --git a/dotnet/src/IntegrationTests/Connectors/Memory/AzureCosmosDBNoSQL/AzureCosmosDBNoSQLMemoryStoreTests.cs b/dotnet/src/IntegrationTests/Connectors/Memory/AzureCosmosDBNoSQL/AzureCosmosDBNoSQLMemoryStoreTests.cs index ebe001119454..e3a170e439dc 100644 --- a/dotnet/src/IntegrationTests/Connectors/Memory/AzureCosmosDBNoSQL/AzureCosmosDBNoSQLMemoryStoreTests.cs +++ b/dotnet/src/IntegrationTests/Connectors/Memory/AzureCosmosDBNoSQL/AzureCosmosDBNoSQLMemoryStoreTests.cs @@ -98,6 +98,7 @@ public async Task ItCanGetNearestMatchesAsync(int limit, bool withEmbeddings) .ToListAsync(); Assert.NotNull(nearestMatchesActual); + Assert.Equal(nearestMatchesExpected.Length, nearestMatchesActual.Count); for (int i = 0; i < limit; i++) { From ab72061be65b9f009e39ce0fa19cc6bd8f7aaedd Mon Sep 17 00:00:00 2001 From: Kevin Pilch Date: Fri, 3 May 2024 12:43:09 -0700 Subject: [PATCH 10/27] Serialize MemoryRecord with a derived type --- .../AzureCosmosDBNoSQLMemoryStore.cs | 55 ++++++++++++------- .../Memory/MemoryRecord.cs | 8 --- 2 files changed, 36 insertions(+), 27 deletions(-) diff --git a/dotnet/src/Connectors/Connectors.Memory.AzureCosmosDBNoSQL/AzureCosmosDBNoSQLMemoryStore.cs b/dotnet/src/Connectors/Connectors.Memory.AzureCosmosDBNoSQL/AzureCosmosDBNoSQLMemoryStore.cs index 5cafd3e2e56b..c09a755e886e 100644 --- a/dotnet/src/Connectors/Connectors.Memory.AzureCosmosDBNoSQL/AzureCosmosDBNoSQLMemoryStore.cs +++ b/dotnet/src/Connectors/Connectors.Memory.AzureCosmosDBNoSQL/AzureCosmosDBNoSQLMemoryStore.cs @@ -3,17 +3,15 @@ using System; using System.Collections.Generic; using System.Diagnostics; -using System.IO; using System.Linq; using System.Runtime.CompilerServices; -using System.Text; using System.Text.Json; +using System.Text.Json.Serialization; using System.Threading; using System.Threading.Tasks; using Microsoft.Azure.Cosmos; using Microsoft.SemanticKernel.Http; using Microsoft.SemanticKernel.Memory; -using Microsoft.SemanticKernel.Text; namespace Microsoft.SemanticKernel.Connectors.AzureCosmosDBNoSQL; @@ -149,7 +147,7 @@ public async Task UpsertAsync( var result = await this._cosmosClient .GetDatabase(this._databaseName) .GetContainer(collectionName) - .UpsertItemAsync(record, new PartitionKey(record.Key), cancellationToken: cancellationToken) + .UpsertItemAsync(new MemoryRecordWithId(record), new PartitionKey(record.Key), cancellationToken: cancellationToken) .ConfigureAwait(false); return record.Key; @@ -191,24 +189,23 @@ public async IAsyncEnumerable GetBatchAsync( bool withEmbeddings = false, [EnumeratorCancellation] CancellationToken cancellationToken = default) { - var items = keys.Select(k => (k, new PartitionKey(k))).ToList(); - var feedResponse = await this._cosmosClient - .GetDatabase(this._databaseName) - .GetContainer(collectionName) - .ReadManyItemsAsync(items, cancellationToken: cancellationToken) - .ConfigureAwait(false); + var queryDefinition = new QueryDefinition($""" + SELECT x.id,x.key,x.metadata,x.timestamp,{(withEmbeddings ? "x.embedding," : "")} + FROM x + where x.id in @keys + """); + queryDefinition.WithParameter("@keys", keys.Select(k => (k, k))); + + var feedIterator = this._cosmosClient + .GetDatabase(this._databaseName) + .GetContainer(collectionName) + .GetItemQueryIterator(queryDefinition); - foreach (var item in feedResponse.Resource) + while (feedIterator.HasMoreResults) { - if (withEmbeddings) - { - yield return item; - } - else + foreach (var memoryRecord in await feedIterator.ReadNextAsync(cancellationToken).ConfigureAwait(false)) { - // TODO: Consider changing this into a select that doesn't return the embeddings. - // Is that actually better? RU consumption of query, vs ReadMany and transmission of larger docs. - yield return new MemoryRecord(item.Metadata, null, item.Key, item.Timestamp); + yield return memoryRecord; } } } @@ -340,3 +337,23 @@ private string GetDebuggerDisplay() return $"{this.Key} - {this.SimilarityScore}"; } } + +/// +/// Creates a new record that also serializes an "id" property. +/// +[DebuggerDisplay("{GetDebuggerDisplay()}")] +public class MemoryRecordWithId(MemoryRecord source) + : MemoryRecord(source.Metadata, source.Embedding, source.Key, source.Timestamp) +{ + /// + /// The similarity score returned. + /// + [JsonInclude] + [JsonPropertyName("id")] + public string Id => this.Key; + + private string GetDebuggerDisplay() + { + return this.Key; + } +} diff --git a/dotnet/src/SemanticKernel.Abstractions/Memory/MemoryRecord.cs b/dotnet/src/SemanticKernel.Abstractions/Memory/MemoryRecord.cs index a5a78d3b59b8..1a95ee13dbe0 100644 --- a/dotnet/src/SemanticKernel.Abstractions/Memory/MemoryRecord.cs +++ b/dotnet/src/SemanticKernel.Abstractions/Memory/MemoryRecord.cs @@ -27,13 +27,6 @@ public class MemoryRecord : DataEntryBase [JsonPropertyName("metadata")] public MemoryRecordMetadata Metadata { get; } - /// - /// Unique identifier. The format of the value is domain specific, so it can be a URL, a GUID, etc. - /// - [JsonInclude] - [JsonPropertyName("id")] - public string Id { get; } - /// /// Constructor, use or /// @@ -45,7 +38,6 @@ public MemoryRecord( DateTimeOffset? timestamp = null) : base(key, timestamp) { this.Metadata = metadata; - this.Id = metadata.Id; this.Embedding = embedding; } From bcfa39332b2db5cdd6aa2c3c3ff3e83f40569801 Mon Sep 17 00:00:00 2001 From: Kevin Pilch Date: Fri, 3 May 2024 15:29:04 -0700 Subject: [PATCH 11/27] Update query for 'GetBatchAsync' --- .../AzureCosmosDBNoSQLMemoryStore.cs | 31 ++++++++++++++++--- .../AzureCosmosDBNoSQLMemoryStoreTests.cs | 2 +- 2 files changed, 27 insertions(+), 6 deletions(-) diff --git a/dotnet/src/Connectors/Connectors.Memory.AzureCosmosDBNoSQL/AzureCosmosDBNoSQLMemoryStore.cs b/dotnet/src/Connectors/Connectors.Memory.AzureCosmosDBNoSQL/AzureCosmosDBNoSQLMemoryStore.cs index c09a755e886e..3019ad87fe1c 100644 --- a/dotnet/src/Connectors/Connectors.Memory.AzureCosmosDBNoSQL/AzureCosmosDBNoSQLMemoryStore.cs +++ b/dotnet/src/Connectors/Connectors.Memory.AzureCosmosDBNoSQL/AzureCosmosDBNoSQLMemoryStore.cs @@ -189,12 +189,13 @@ public async IAsyncEnumerable GetBatchAsync( bool withEmbeddings = false, [EnumeratorCancellation] CancellationToken cancellationToken = default) { + // TODO: Need to split this into multiple queries if the query string is larger than 512kB + var whereClause = string.Join(" OR ", keys.Select(k => $"(x.id = \"{k}\" AND x.key = \"{k}\")")); var queryDefinition = new QueryDefinition($""" - SELECT x.id,x.key,x.metadata,x.timestamp,{(withEmbeddings ? "x.embedding," : "")} + SELECT x.id,x.key,x.metadata,x.timestamp{(withEmbeddings ? ",x.embedding" : "")} FROM x - where x.id in @keys + where {whereClause} """); - queryDefinition.WithParameter("@keys", keys.Select(k => (k, k))); var feedIterator = this._cosmosClient .GetDatabase(this._databaseName) @@ -342,9 +343,29 @@ private string GetDebuggerDisplay() /// Creates a new record that also serializes an "id" property. /// [DebuggerDisplay("{GetDebuggerDisplay()}")] -public class MemoryRecordWithId(MemoryRecord source) - : MemoryRecord(source.Metadata, source.Embedding, source.Key, source.Timestamp) +public class MemoryRecordWithId : MemoryRecord { + /// + /// Creates a new record that also serializes an "id" property. + /// + public MemoryRecordWithId(MemoryRecord source) + : base(source.Metadata, source.Embedding, source.Key, source.Timestamp) + { + } + + /// + /// Creates a new record that also serializes an "id" property. + /// + [JsonConstructor] + public MemoryRecordWithId( + MemoryRecordMetadata metadata, + ReadOnlyMemory embedding, + string? key, + DateTimeOffset? timestamp = null) + : base(metadata, embedding, key, timestamp) + { + } + /// /// The similarity score returned. /// diff --git a/dotnet/src/IntegrationTests/Connectors/Memory/AzureCosmosDBNoSQL/AzureCosmosDBNoSQLMemoryStoreTests.cs b/dotnet/src/IntegrationTests/Connectors/Memory/AzureCosmosDBNoSQL/AzureCosmosDBNoSQLMemoryStoreTests.cs index e3a170e439dc..469cf4a36601 100644 --- a/dotnet/src/IntegrationTests/Connectors/Memory/AzureCosmosDBNoSQL/AzureCosmosDBNoSQLMemoryStoreTests.cs +++ b/dotnet/src/IntegrationTests/Connectors/Memory/AzureCosmosDBNoSQL/AzureCosmosDBNoSQLMemoryStoreTests.cs @@ -98,7 +98,7 @@ public async Task ItCanGetNearestMatchesAsync(int limit, bool withEmbeddings) .ToListAsync(); Assert.NotNull(nearestMatchesActual); - Assert.Equal(nearestMatchesExpected.Length, nearestMatchesActual.Count); + Assert.Equal(limit, nearestMatchesActual.Count); for (int i = 0; i < limit; i++) { From e175661eea619fe98bbb9c86967ae4f9807a7e92 Mon Sep 17 00:00:00 2001 From: Kevin Pilch Date: Tue, 7 May 2024 10:33:41 -0700 Subject: [PATCH 12/27] Cleanup Tests --- .../AzureCosmosDBNoSQLMemoryStoreTests.cs | 7 ++++++ ...ureCosmosDBNoSQLMemoryStoreTestsFixture.cs | 23 ++++--------------- 2 files changed, 11 insertions(+), 19 deletions(-) diff --git a/dotnet/src/IntegrationTests/Connectors/Memory/AzureCosmosDBNoSQL/AzureCosmosDBNoSQLMemoryStoreTests.cs b/dotnet/src/IntegrationTests/Connectors/Memory/AzureCosmosDBNoSQL/AzureCosmosDBNoSQLMemoryStoreTests.cs index 469cf4a36601..7b95989e5f18 100644 --- a/dotnet/src/IntegrationTests/Connectors/Memory/AzureCosmosDBNoSQL/AzureCosmosDBNoSQLMemoryStoreTests.cs +++ b/dotnet/src/IntegrationTests/Connectors/Memory/AzureCosmosDBNoSQL/AzureCosmosDBNoSQLMemoryStoreTests.cs @@ -74,6 +74,8 @@ public async Task ItCanBatchUpsertGetRemoveAsync(bool withEmbeddings) await memoryStore.RemoveBatchAsync(collectionName, keys); var ids = await memoryStore.GetBatchAsync(collectionName, keys).ToListAsync(); Assert.Empty(ids); + + await memoryStore.DeleteCollectionAsync(collectionName); } [Theory(Skip = SkipReason)] @@ -88,6 +90,9 @@ public async Task ItCanGetNearestMatchesAsync(int limit, bool withEmbeddings) var searchEmbedding = DataHelper.VectorSearchTestEmbedding; var nearestMatchesExpected = DataHelper.VectorSearchExpectedResults; + await memoryStore.CreateCollectionAsync(collectionName); + var keys = await memoryStore.UpsertBatchAsync(collectionName, DataHelper.VectorSearchTestRecords).ToListAsync(); + var nearestMatchesActual = await memoryStore .GetNearestMatchesAsync( collectionName, @@ -108,6 +113,8 @@ public async Task ItCanGetNearestMatchesAsync(int limit, bool withEmbeddings) withEmbeddings ); } + + await memoryStore.DeleteCollectionAsync(collectionName); } private static void AssertMemoryRecordEqual( diff --git a/dotnet/src/IntegrationTests/Connectors/Memory/AzureCosmosDBNoSQL/AzureCosmosDBNoSQLMemoryStoreTestsFixture.cs b/dotnet/src/IntegrationTests/Connectors/Memory/AzureCosmosDBNoSQL/AzureCosmosDBNoSQLMemoryStoreTestsFixture.cs index 976316384fdc..90f0e659644c 100644 --- a/dotnet/src/IntegrationTests/Connectors/Memory/AzureCosmosDBNoSQL/AzureCosmosDBNoSQLMemoryStoreTestsFixture.cs +++ b/dotnet/src/IntegrationTests/Connectors/Memory/AzureCosmosDBNoSQL/AzureCosmosDBNoSQLMemoryStoreTestsFixture.cs @@ -61,26 +61,11 @@ public AzureCosmosDBNoSQLMemoryStoreTestsFixture() ); } - public async Task InitializeAsync() - { - await this.MemoryStore.CreateCollectionAsync(this.CollectionName); - await this - .MemoryStore.UpsertBatchAsync(this.CollectionName, DataHelper.VectorSearchTestRecords) - .ToListAsync(); - } + public Task InitializeAsync() + => Task.CompletedTask; - public async Task DisposeAsync() - { - try - { - await this.MemoryStore.DeleteCollectionAsync(this.CollectionName); - } - catch (CosmosException ex) when (ex.StatusCode == System.Net.HttpStatusCode.NotFound) - { - // It's okay if a test already deleted the collection. - } - this.MemoryStore.Dispose(); - } + public Task DisposeAsync() + => Task.CompletedTask; private static string GetSetting(IConfigurationRoot configuration, string settingName) { From 86684f6a1746e9dad03f6a61c9efd0d6c357eae4 Mon Sep 17 00:00:00 2001 From: Kevin Pilch Date: Tue, 7 May 2024 10:33:52 -0700 Subject: [PATCH 13/27] Handle large batch queries --- .../AzureCosmosDBNoSQLMemoryStore.cs | 70 ++++++++++++++++--- 1 file changed, 61 insertions(+), 9 deletions(-) diff --git a/dotnet/src/Connectors/Connectors.Memory.AzureCosmosDBNoSQL/AzureCosmosDBNoSQLMemoryStore.cs b/dotnet/src/Connectors/Connectors.Memory.AzureCosmosDBNoSQL/AzureCosmosDBNoSQLMemoryStore.cs index 3019ad87fe1c..b51f13066e20 100644 --- a/dotnet/src/Connectors/Connectors.Memory.AzureCosmosDBNoSQL/AzureCosmosDBNoSQLMemoryStore.cs +++ b/dotnet/src/Connectors/Connectors.Memory.AzureCosmosDBNoSQL/AzureCosmosDBNoSQLMemoryStore.cs @@ -5,6 +5,7 @@ using System.Diagnostics; using System.Linq; using System.Runtime.CompilerServices; +using System.Text; using System.Text.Json; using System.Text.Json.Serialization; using System.Threading; @@ -189,24 +190,75 @@ public async IAsyncEnumerable GetBatchAsync( bool withEmbeddings = false, [EnumeratorCancellation] CancellationToken cancellationToken = default) { - // TODO: Need to split this into multiple queries if the query string is larger than 512kB + // Optimistically create the entire query string. var whereClause = string.Join(" OR ", keys.Select(k => $"(x.id = \"{k}\" AND x.key = \"{k}\")")); var queryDefinition = new QueryDefinition($""" SELECT x.id,x.key,x.metadata,x.timestamp{(withEmbeddings ? ",x.embedding" : "")} FROM x - where {whereClause} + WHERE {whereClause} """); - var feedIterator = this._cosmosClient - .GetDatabase(this._databaseName) - .GetContainer(collectionName) - .GetItemQueryIterator(queryDefinition); + // NOTE: Cosmos DB queries are limited to 512kB, so if this is larger than that, break it into segments. + var byteCount = Encoding.UTF8.GetByteCount(whereClause); + var ratio = byteCount / ((float)(512 * 1024)); + if (ratio < 1) + { + var feedIterator = this._cosmosClient + .GetDatabase(this._databaseName) + .GetContainer(collectionName) + .GetItemQueryIterator(queryDefinition); - while (feedIterator.HasMoreResults) + while (feedIterator.HasMoreResults) + { + foreach (var memoryRecord in await feedIterator.ReadNextAsync(cancellationToken).ConfigureAwait(false)) + { + yield return memoryRecord; + } + } + } + else { - foreach (var memoryRecord in await feedIterator.ReadNextAsync(cancellationToken).ConfigureAwait(false)) + // We're in the very large case, we'll need to split this into multiple queries. + // We add one to catch any fractional piece left in the last segment + var segments = (int)(ratio + 1); + var keyList = keys.ToList(); + var keysPerQuery = keyList.Count / segments; + // Make a guess as to how long this query will be. We need at least 26 chars for each "OR" block, so + // put a few extra for the values of the keys. + var estimatedWhereLength = 30 * keysPerQuery; + var localWhere = new StringBuilder(estimatedWhereLength); + const string OR = " OR "; + for (var i = 0; i < segments; i++) { - yield return memoryRecord; + localWhere.Clear(); + for (var q = i * keysPerQuery; q < (i + 1) * keysPerQuery && q < keyList.Count; q++) + { + var k = keyList[q]; + localWhere.Append($"(x.id = \"{k}\" AND x.key = \"{k}\")").Append(OR); + } + + if (localWhere.Length >= OR.Length) + { + localWhere.Length -= OR.Length; + + var localQueryDefinition = new QueryDefinition($""" + SELECT x.id,x.key,x.metadata,x.timestamp{(withEmbeddings ? ",x.embedding" : "")} + FROM x + WHERE {localWhere} + """); + var feedIterator = this._cosmosClient + .GetDatabase(this._databaseName) + .GetContainer(collectionName) + .GetItemQueryIterator(localQueryDefinition); + + while (feedIterator.HasMoreResults) + { + foreach (var memoryRecord in await feedIterator.ReadNextAsync(cancellationToken).ConfigureAwait(false)) + { + yield return memoryRecord; + } + } + } } } } From 3fa098d7adfdf436a4265a627ec9aa22021021c8 Mon Sep 17 00:00:00 2001 From: Kevin Pilch Date: Tue, 7 May 2024 11:23:17 -0700 Subject: [PATCH 14/27] Formatting fixes --- .../AzureCosmosDBNoSQLMemoryStore.cs | 10 ++++++++-- .../AzureCosmosDBNoSQLMemoryStoreTestsFixture.cs | 2 -- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/dotnet/src/Connectors/Connectors.Memory.AzureCosmosDBNoSQL/AzureCosmosDBNoSQLMemoryStore.cs b/dotnet/src/Connectors/Connectors.Memory.AzureCosmosDBNoSQL/AzureCosmosDBNoSQLMemoryStore.cs index b51f13066e20..bcad3349c981 100644 --- a/dotnet/src/Connectors/Connectors.Memory.AzureCosmosDBNoSQL/AzureCosmosDBNoSQLMemoryStore.cs +++ b/dotnet/src/Connectors/Connectors.Memory.AzureCosmosDBNoSQL/AzureCosmosDBNoSQLMemoryStore.cs @@ -33,8 +33,8 @@ public class AzureCosmosDBNoSQLMemoryStore : IMemoryStore, IDisposable /// /// Connection string required to connect to Azure Cosmos DB. /// The database name to connect to. - /// Details about the to use. - /// The to use. + /// The to use if a collection is created. NOTE that embeddings will be stored in a property named 'embedding'. + /// The to use if a collection is created. NOTE that embeddings will be stored in a property named 'embedding'. /// The application name to use in requests. public AzureCosmosDBNoSQLMemoryStore( string connectionString, @@ -58,6 +58,12 @@ public AzureCosmosDBNoSQLMemoryStore( /// Initiates a AzureCosmosDBNoSQLMemoryStore instance using a instance /// and other properties required for vector search. /// + /// An existing to use. NOTE: This must support serializing with + /// System.Text.Json, not the default Cosmos serializer. + /// The database name to operate against. + /// The to use if a collection is created. NOTE that embeddings will be stored in a property named 'embedding'. + /// The to use if a collection is created. NOTE that embeddings will be stored in a property named 'embedding'. + /// The application name to use in requests. public AzureCosmosDBNoSQLMemoryStore( CosmosClient cosmosClient, string databaseName, diff --git a/dotnet/src/IntegrationTests/Connectors/Memory/AzureCosmosDBNoSQL/AzureCosmosDBNoSQLMemoryStoreTestsFixture.cs b/dotnet/src/IntegrationTests/Connectors/Memory/AzureCosmosDBNoSQL/AzureCosmosDBNoSQLMemoryStoreTestsFixture.cs index 90f0e659644c..93cbea170f40 100644 --- a/dotnet/src/IntegrationTests/Connectors/Memory/AzureCosmosDBNoSQL/AzureCosmosDBNoSQLMemoryStoreTestsFixture.cs +++ b/dotnet/src/IntegrationTests/Connectors/Memory/AzureCosmosDBNoSQL/AzureCosmosDBNoSQLMemoryStoreTestsFixture.cs @@ -2,12 +2,10 @@ using System; using System.Collections.ObjectModel; -using System.Linq; using System.Threading.Tasks; using Microsoft.Azure.Cosmos; using Microsoft.Extensions.Configuration; using Microsoft.SemanticKernel.Connectors.AzureCosmosDBNoSQL; -using MongoDB.Driver; using Xunit; namespace SemanticKernel.IntegrationTests.Connectors.AzureCosmosDBNoSQL; From 2f5eee9d3dd8747e89f3a299ef44cd609e752089 Mon Sep 17 00:00:00 2001 From: Kevin Pilch Date: Tue, 7 May 2024 11:28:03 -0700 Subject: [PATCH 15/27] Skip Azure Cosmos DB integration tests --- .../AzureCosmosDBNoSQL/AzureCosmosDBNoSQLMemoryStoreTests.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dotnet/src/IntegrationTests/Connectors/Memory/AzureCosmosDBNoSQL/AzureCosmosDBNoSQLMemoryStoreTests.cs b/dotnet/src/IntegrationTests/Connectors/Memory/AzureCosmosDBNoSQL/AzureCosmosDBNoSQLMemoryStoreTests.cs index 7b95989e5f18..0e8aee320856 100644 --- a/dotnet/src/IntegrationTests/Connectors/Memory/AzureCosmosDBNoSQL/AzureCosmosDBNoSQLMemoryStoreTests.cs +++ b/dotnet/src/IntegrationTests/Connectors/Memory/AzureCosmosDBNoSQL/AzureCosmosDBNoSQLMemoryStoreTests.cs @@ -15,7 +15,7 @@ namespace SemanticKernel.IntegrationTests.Connectors.AzureCosmosDBNoSQL; /// public class AzureCosmosDBNoSQLMemoryStoreTests : IClassFixture { - private const string? SkipReason = null;//"Azure CosmosDB Account with Vector indexing enabled required"; + private const string? SkipReason = "Azure Cosmos DB Account with Vector indexing enabled required"; private readonly AzureCosmosDBNoSQLMemoryStoreTestsFixture _fixture; From e8baf682142169ef273b648fc6c1dbabba7269d5 Mon Sep 17 00:00:00 2001 From: Kevin Pilch Date: Fri, 17 May 2024 10:36:03 -0700 Subject: [PATCH 16/27] Review feedback * Make helper types internal * Ensure there is an appropriate embedding policy * Fix doc comment copy/paste mistake * Re-use constant in another place --- .../AzureCosmosDBNoSQLMemoryStore.cs | 48 +++++++++++++------ 1 file changed, 33 insertions(+), 15 deletions(-) diff --git a/dotnet/src/Connectors/Connectors.Memory.AzureCosmosDBNoSQL/AzureCosmosDBNoSQLMemoryStore.cs b/dotnet/src/Connectors/Connectors.Memory.AzureCosmosDBNoSQL/AzureCosmosDBNoSQLMemoryStore.cs index bcad3349c981..4a66274f8062 100644 --- a/dotnet/src/Connectors/Connectors.Memory.AzureCosmosDBNoSQL/AzureCosmosDBNoSQLMemoryStore.cs +++ b/dotnet/src/Connectors/Connectors.Memory.AzureCosmosDBNoSQL/AzureCosmosDBNoSQLMemoryStore.cs @@ -42,16 +42,19 @@ public AzureCosmosDBNoSQLMemoryStore( VectorEmbeddingPolicy vectorEmbeddingPolicy, IndexingPolicy indexingPolicy, string? applicationName = null) + : this( + new CosmosClient( + connectionString, + new CosmosClientOptions + { + ApplicationName = applicationName ?? HttpHeaderConstant.Values.UserAgent, + Serializer = new CosmosSystemTextJsonSerializer(JsonSerializerOptions.Default), + }), + databaseName, + vectorEmbeddingPolicy, + indexingPolicy, + applicationName) { - var options = new CosmosClientOptions - { - ApplicationName = applicationName ?? HttpHeaderConstant.Values.UserAgent, - Serializer = new CosmosSystemTextJsonSerializer(JsonSerializerOptions.Default), - }; - this._cosmosClient = new CosmosClient(connectionString, options); - this._databaseName = databaseName; - this._vectorEmbeddingPolicy = vectorEmbeddingPolicy; - this._indexingPolicy = indexingPolicy; } /// @@ -71,11 +74,23 @@ public AzureCosmosDBNoSQLMemoryStore( IndexingPolicy indexingPolicy, string? applicationName = null) { + if (!vectorEmbeddingPolicy.Embeddings.Any(e => e.Path == "/embedding")) + { + throw new InvalidOperationException($""" + In order for {nameof(GetNearestMatchAsync)} to function, {nameof(vectorEmbeddingPolicy)} should + contain an embedding path at /embedding. It's also recommended to include a that path in the + {nameof(indexingPolicy)} to improve performance and reduce cost for searches. + """); + } this._cosmosClient = cosmosClient; - this._cosmosClient.ClientOptions.ApplicationName = applicationName; this._databaseName = databaseName; this._vectorEmbeddingPolicy = vectorEmbeddingPolicy; this._indexingPolicy = indexingPolicy; + + if (!string.IsNullOrWhiteSpace(applicationName)) + { + this._cosmosClient.ClientOptions.ApplicationName = applicationName; + } } /// @@ -196,8 +211,10 @@ public async IAsyncEnumerable GetBatchAsync( bool withEmbeddings = false, [EnumeratorCancellation] CancellationToken cancellationToken = default) { + const string OR = " OR "; + // Optimistically create the entire query string. - var whereClause = string.Join(" OR ", keys.Select(k => $"(x.id = \"{k}\" AND x.key = \"{k}\")")); + var whereClause = string.Join(OR, keys.Select(k => $"(x.id = \"{k}\" AND x.key = \"{k}\")")); var queryDefinition = new QueryDefinition($""" SELECT x.id,x.key,x.metadata,x.timestamp{(withEmbeddings ? ",x.embedding" : "")} FROM x @@ -233,7 +250,6 @@ FROM x // put a few extra for the values of the keys. var estimatedWhereLength = 30 * keysPerQuery; var localWhere = new StringBuilder(estimatedWhereLength); - const string OR = " OR "; for (var i = 0; i < segments; i++) { localWhere.Clear(); @@ -380,7 +396,7 @@ protected virtual void Dispose(bool disposing) /// /// [DebuggerDisplay("{GetDebuggerDisplay()}")] -public class MemoryRecordWithSimilarityScore( +internal class MemoryRecordWithSimilarityScore( MemoryRecordMetadata metadata, ReadOnlyMemory embedding, string? key, @@ -401,7 +417,7 @@ private string GetDebuggerDisplay() /// Creates a new record that also serializes an "id" property. /// [DebuggerDisplay("{GetDebuggerDisplay()}")] -public class MemoryRecordWithId : MemoryRecord +internal class MemoryRecordWithId : MemoryRecord { /// /// Creates a new record that also serializes an "id" property. @@ -425,7 +441,9 @@ public MemoryRecordWithId( } /// - /// The similarity score returned. + /// Serializes the the property as "id". + /// We do this because Azure Cosmos DB requires a property named "id" for + /// each item. /// [JsonInclude] [JsonPropertyName("id")] From 1a92fabab929ef80bb61ddc923ac161dd960c6b8 Mon Sep 17 00:00:00 2001 From: Kevin Pilch Date: Fri, 17 May 2024 10:38:49 -0700 Subject: [PATCH 17/27] Make CosmosSystemTextJsonSerializer internal --- .../CosmosSystemTextJSonSerializer.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dotnet/src/Connectors/Connectors.Memory.AzureCosmosDBNoSQL/CosmosSystemTextJSonSerializer.cs b/dotnet/src/Connectors/Connectors.Memory.AzureCosmosDBNoSQL/CosmosSystemTextJSonSerializer.cs index eac8f93befe9..c656f0daacb3 100644 --- a/dotnet/src/Connectors/Connectors.Memory.AzureCosmosDBNoSQL/CosmosSystemTextJSonSerializer.cs +++ b/dotnet/src/Connectors/Connectors.Memory.AzureCosmosDBNoSQL/CosmosSystemTextJSonSerializer.cs @@ -13,7 +13,7 @@ namespace Microsoft.Azure.Cosmos; /// /// This class provides a default implementation of System.Text.Json Cosmos Linq Serializer. /// -public class CosmosSystemTextJsonSerializer : CosmosLinqSerializer +internal class CosmosSystemTextJsonSerializer : CosmosLinqSerializer { /// /// A read-only instance of . From e580918583d57a387faffadd3876c768c7fc60d6 Mon Sep 17 00:00:00 2001 From: Kevin Pilch Date: Fri, 17 May 2024 10:48:29 -0700 Subject: [PATCH 18/27] Apply indent fix Co-authored-by: Stephen Toub --- .../AzureCosmosDBNoSQLMemoryStore.cs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/dotnet/src/Connectors/Connectors.Memory.AzureCosmosDBNoSQL/AzureCosmosDBNoSQLMemoryStore.cs b/dotnet/src/Connectors/Connectors.Memory.AzureCosmosDBNoSQL/AzureCosmosDBNoSQLMemoryStore.cs index 4a66274f8062..58ff8c622fae 100644 --- a/dotnet/src/Connectors/Connectors.Memory.AzureCosmosDBNoSQL/AzureCosmosDBNoSQLMemoryStore.cs +++ b/dotnet/src/Connectors/Connectors.Memory.AzureCosmosDBNoSQL/AzureCosmosDBNoSQLMemoryStore.cs @@ -264,10 +264,10 @@ FROM x localWhere.Length -= OR.Length; var localQueryDefinition = new QueryDefinition($""" - SELECT x.id,x.key,x.metadata,x.timestamp{(withEmbeddings ? ",x.embedding" : "")} - FROM x - WHERE {localWhere} - """); + SELECT x.id,x.key,x.metadata,x.timestamp{(withEmbeddings ? ",x.embedding" : "")} + FROM x + WHERE {localWhere} + """); var feedIterator = this._cosmosClient .GetDatabase(this._databaseName) .GetContainer(collectionName) From 7d061de556761a21fee7dbba13b69c4fb6110f65 Mon Sep 17 00:00:00 2001 From: Kevin Pilch Date: Fri, 17 May 2024 11:06:14 -0700 Subject: [PATCH 19/27] Multi-target to .net8.0 and netstandard2.0 --- .../AzureCosmosDBNoSQLMemoryStore.cs | 8 ++++++++ .../Connectors.Memory.AzureCosmosDBNoSQL.csproj | 2 +- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/dotnet/src/Connectors/Connectors.Memory.AzureCosmosDBNoSQL/AzureCosmosDBNoSQLMemoryStore.cs b/dotnet/src/Connectors/Connectors.Memory.AzureCosmosDBNoSQL/AzureCosmosDBNoSQLMemoryStore.cs index 58ff8c622fae..251fb46026c5 100644 --- a/dotnet/src/Connectors/Connectors.Memory.AzureCosmosDBNoSQL/AzureCosmosDBNoSQLMemoryStore.cs +++ b/dotnet/src/Connectors/Connectors.Memory.AzureCosmosDBNoSQL/AzureCosmosDBNoSQLMemoryStore.cs @@ -14,6 +14,10 @@ using Microsoft.SemanticKernel.Http; using Microsoft.SemanticKernel.Memory; +#if NET6_0_OR_GREATER +using System.Globalization; +#endif + namespace Microsoft.SemanticKernel.Connectors.AzureCosmosDBNoSQL; /// @@ -256,7 +260,11 @@ FROM x for (var q = i * keysPerQuery; q < (i + 1) * keysPerQuery && q < keyList.Count; q++) { var k = keyList[q]; +#if NET6_0_OR_GREATER + localWhere.Append(CultureInfo.InvariantCulture, $"(x.id = \"{k}\" AND x.key = \"{k}\")").Append(OR); +#else localWhere.Append($"(x.id = \"{k}\" AND x.key = \"{k}\")").Append(OR); +#endif } if (localWhere.Length >= OR.Length) diff --git a/dotnet/src/Connectors/Connectors.Memory.AzureCosmosDBNoSQL/Connectors.Memory.AzureCosmosDBNoSQL.csproj b/dotnet/src/Connectors/Connectors.Memory.AzureCosmosDBNoSQL/Connectors.Memory.AzureCosmosDBNoSQL.csproj index 3c5a610f3c0d..0ffb5b602e05 100644 --- a/dotnet/src/Connectors/Connectors.Memory.AzureCosmosDBNoSQL/Connectors.Memory.AzureCosmosDBNoSQL.csproj +++ b/dotnet/src/Connectors/Connectors.Memory.AzureCosmosDBNoSQL/Connectors.Memory.AzureCosmosDBNoSQL.csproj @@ -4,7 +4,7 @@ Microsoft.SemanticKernel.Connectors.AzureCosmosDBNoSQL $(AssemblyName) - netstandard2.0 + net8.0;netstandard2.0 $(NoWarn);NU5104;SKEXP0001,SKEXP0010 alpha From eae8d20a99b01a7582e787a9b98641a45779c19f Mon Sep 17 00:00:00 2001 From: Kevin Pilch Date: Fri, 17 May 2024 11:07:34 -0700 Subject: [PATCH 20/27] Fix stutter --- .../AzureCosmosDBNoSQLMemoryStore.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dotnet/src/Connectors/Connectors.Memory.AzureCosmosDBNoSQL/AzureCosmosDBNoSQLMemoryStore.cs b/dotnet/src/Connectors/Connectors.Memory.AzureCosmosDBNoSQL/AzureCosmosDBNoSQLMemoryStore.cs index 251fb46026c5..df97c3ade337 100644 --- a/dotnet/src/Connectors/Connectors.Memory.AzureCosmosDBNoSQL/AzureCosmosDBNoSQLMemoryStore.cs +++ b/dotnet/src/Connectors/Connectors.Memory.AzureCosmosDBNoSQL/AzureCosmosDBNoSQLMemoryStore.cs @@ -449,7 +449,7 @@ public MemoryRecordWithId( } /// - /// Serializes the the property as "id". + /// Serializes the property as "id". /// We do this because Azure Cosmos DB requires a property named "id" for /// each item. /// From 4f16261c5080972aaea1443cd126bbae0bc96674 Mon Sep 17 00:00:00 2001 From: Kevin Pilch Date: Fri, 17 May 2024 11:21:19 -0700 Subject: [PATCH 21/27] Avoid reading stream into string when deserializing --- .../CosmosSystemTextJSonSerializer.cs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/dotnet/src/Connectors/Connectors.Memory.AzureCosmosDBNoSQL/CosmosSystemTextJSonSerializer.cs b/dotnet/src/Connectors/Connectors.Memory.AzureCosmosDBNoSQL/CosmosSystemTextJSonSerializer.cs index c656f0daacb3..a4abae5bf266 100644 --- a/dotnet/src/Connectors/Connectors.Memory.AzureCosmosDBNoSQL/CosmosSystemTextJSonSerializer.cs +++ b/dotnet/src/Connectors/Connectors.Memory.AzureCosmosDBNoSQL/CosmosSystemTextJSonSerializer.cs @@ -51,8 +51,7 @@ public override T FromStream(Stream stream) using (stream) { - using StreamReader reader = new(stream); - return JsonSerializer.Deserialize(reader.ReadToEnd(), this._jsonSerializerOptions); + return JsonSerializer.Deserialize(stream, this._jsonSerializerOptions); } } From 0807f3ae79cbf003fe2b350c96c5d9d5676902ea Mon Sep 17 00:00:00 2001 From: Kevin Pilch Date: Fri, 17 May 2024 11:32:14 -0700 Subject: [PATCH 22/27] Optimize single collection lookup --- .../AzureCosmosDBNoSQLMemoryStore.cs | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/dotnet/src/Connectors/Connectors.Memory.AzureCosmosDBNoSQL/AzureCosmosDBNoSQLMemoryStore.cs b/dotnet/src/Connectors/Connectors.Memory.AzureCosmosDBNoSQL/AzureCosmosDBNoSQLMemoryStore.cs index df97c3ade337..a141f9532068 100644 --- a/dotnet/src/Connectors/Connectors.Memory.AzureCosmosDBNoSQL/AzureCosmosDBNoSQLMemoryStore.cs +++ b/dotnet/src/Connectors/Connectors.Memory.AzureCosmosDBNoSQL/AzureCosmosDBNoSQLMemoryStore.cs @@ -139,16 +139,20 @@ public async Task DoesCollectionExistAsync( string collectionName, CancellationToken cancellationToken = default) { - await foreach ( - var existingCollectionName in this.GetCollectionsAsync(cancellationToken) - .ConfigureAwait(false) - ) + using var feedIterator = this. + _cosmosClient + .GetDatabase(this._databaseName) + .GetContainerQueryIterator($"SELECT VALUE(c.id) FROM c WHERE c.id = '{collectionName}'"); + + while (feedIterator.HasMoreResults) { - if (existingCollectionName == collectionName) + var next = await feedIterator.ReadNextAsync(cancellationToken).ConfigureAwait(false); + foreach (var containerName in next.Resource) { return true; } } + return false; } From 47a347aa2b631c90cb961cb62f47e15feed901cd Mon Sep 17 00:00:00 2001 From: Kevin Pilch Date: Fri, 17 May 2024 11:39:28 -0700 Subject: [PATCH 23/27] Avoid text writer and serialize directly to stream --- .../CosmosSystemTextJSonSerializer.cs | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/dotnet/src/Connectors/Connectors.Memory.AzureCosmosDBNoSQL/CosmosSystemTextJSonSerializer.cs b/dotnet/src/Connectors/Connectors.Memory.AzureCosmosDBNoSQL/CosmosSystemTextJSonSerializer.cs index a4abae5bf266..7d99004b6879 100644 --- a/dotnet/src/Connectors/Connectors.Memory.AzureCosmosDBNoSQL/CosmosSystemTextJSonSerializer.cs +++ b/dotnet/src/Connectors/Connectors.Memory.AzureCosmosDBNoSQL/CosmosSystemTextJSonSerializer.cs @@ -59,10 +59,8 @@ public override T FromStream(Stream stream) public override Stream ToStream(T input) { MemoryStream streamPayload = new(); - using Utf8JsonWriter writer = new(streamPayload); - JsonSerializer.Serialize( - writer: writer, + utf8Json: streamPayload, value: input, options: this._jsonSerializerOptions); From cb27f3a08c43e037e83a7302ffb6c1671f91ab9d Mon Sep 17 00:00:00 2001 From: Kevin Pilch Date: Fri, 17 May 2024 11:58:12 -0700 Subject: [PATCH 24/27] Remove `applicationName` parameter. Setting this after the `CosmosClient` can't be relied upon to work (it won't if a connection has already been established in the current implementation). --- .../AzureCosmosDBNoSQLMemoryStore.cs | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/dotnet/src/Connectors/Connectors.Memory.AzureCosmosDBNoSQL/AzureCosmosDBNoSQLMemoryStore.cs b/dotnet/src/Connectors/Connectors.Memory.AzureCosmosDBNoSQL/AzureCosmosDBNoSQLMemoryStore.cs index a141f9532068..acd93973eeb6 100644 --- a/dotnet/src/Connectors/Connectors.Memory.AzureCosmosDBNoSQL/AzureCosmosDBNoSQLMemoryStore.cs +++ b/dotnet/src/Connectors/Connectors.Memory.AzureCosmosDBNoSQL/AzureCosmosDBNoSQLMemoryStore.cs @@ -56,8 +56,7 @@ public AzureCosmosDBNoSQLMemoryStore( }), databaseName, vectorEmbeddingPolicy, - indexingPolicy, - applicationName) + indexingPolicy) { } @@ -70,13 +69,11 @@ public AzureCosmosDBNoSQLMemoryStore( /// The database name to operate against. /// The to use if a collection is created. NOTE that embeddings will be stored in a property named 'embedding'. /// The to use if a collection is created. NOTE that embeddings will be stored in a property named 'embedding'. - /// The application name to use in requests. public AzureCosmosDBNoSQLMemoryStore( CosmosClient cosmosClient, string databaseName, VectorEmbeddingPolicy vectorEmbeddingPolicy, - IndexingPolicy indexingPolicy, - string? applicationName = null) + IndexingPolicy indexingPolicy) { if (!vectorEmbeddingPolicy.Embeddings.Any(e => e.Path == "/embedding")) { @@ -90,11 +87,6 @@ contain an embedding path at /embedding. It's also recommended to include a that this._databaseName = databaseName; this._vectorEmbeddingPolicy = vectorEmbeddingPolicy; this._indexingPolicy = indexingPolicy; - - if (!string.IsNullOrWhiteSpace(applicationName)) - { - this._cosmosClient.ClientOptions.ApplicationName = applicationName; - } } /// From 38fd7d626764fbc51a525138c92345964eef32b8 Mon Sep 17 00:00:00 2001 From: Kevin Pilch Date: Fri, 17 May 2024 20:05:23 -0700 Subject: [PATCH 25/27] Use published package for Cosmos DB --- dotnet/Directory.Packages.props | 2 +- dotnet/nuget.config | 4 ---- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/dotnet/Directory.Packages.props b/dotnet/Directory.Packages.props index 84c664e5e648..0a78b2c0332f 100644 --- a/dotnet/Directory.Packages.props +++ b/dotnet/Directory.Packages.props @@ -86,7 +86,7 @@ - + diff --git a/dotnet/nuget.config b/dotnet/nuget.config index 9ad18d1ddc85..7159fcd04c36 100644 --- a/dotnet/nuget.config +++ b/dotnet/nuget.config @@ -4,13 +4,9 @@ - - - - From d246bf4662d4284d8683756df9ca939e42207c6e Mon Sep 17 00:00:00 2001 From: Kevin Pilch Date: Fri, 17 May 2024 20:40:07 -0700 Subject: [PATCH 26/27] Fix warnings --- .../AzureCosmosDBNoSQLMemoryStore.cs | 6 ++++-- .../CosmosSystemTextJSonSerializer.cs | 20 ++++++++++--------- 2 files changed, 15 insertions(+), 11 deletions(-) diff --git a/dotnet/src/Connectors/Connectors.Memory.AzureCosmosDBNoSQL/AzureCosmosDBNoSQLMemoryStore.cs b/dotnet/src/Connectors/Connectors.Memory.AzureCosmosDBNoSQL/AzureCosmosDBNoSQLMemoryStore.cs index acd93973eeb6..d3df09f8350c 100644 --- a/dotnet/src/Connectors/Connectors.Memory.AzureCosmosDBNoSQL/AzureCosmosDBNoSQLMemoryStore.cs +++ b/dotnet/src/Connectors/Connectors.Memory.AzureCosmosDBNoSQL/AzureCosmosDBNoSQLMemoryStore.cs @@ -400,7 +400,9 @@ protected virtual void Dispose(bool disposing) /// /// [DebuggerDisplay("{GetDebuggerDisplay()}")] -internal class MemoryRecordWithSimilarityScore( +#pragma warning disable CA1812 // 'MemoryRecordWithSimilarityScore' is an internal class that is apparently never instantiated. If so, remove the code from the assembly. If this class is intended to contain only static members, make it 'static' (Module in Visual Basic). (https://learn.microsoft.com/dotnet/fundamentals/code-analysis/quality-rules/ca1812) +internal sealed class MemoryRecordWithSimilarityScore( +#pragma warning restore CA1812 MemoryRecordMetadata metadata, ReadOnlyMemory embedding, string? key, @@ -421,7 +423,7 @@ private string GetDebuggerDisplay() /// Creates a new record that also serializes an "id" property. /// [DebuggerDisplay("{GetDebuggerDisplay()}")] -internal class MemoryRecordWithId : MemoryRecord +internal sealed class MemoryRecordWithId : MemoryRecord { /// /// Creates a new record that also serializes an "id" property. diff --git a/dotnet/src/Connectors/Connectors.Memory.AzureCosmosDBNoSQL/CosmosSystemTextJSonSerializer.cs b/dotnet/src/Connectors/Connectors.Memory.AzureCosmosDBNoSQL/CosmosSystemTextJSonSerializer.cs index 7d99004b6879..0737ce09c120 100644 --- a/dotnet/src/Connectors/Connectors.Memory.AzureCosmosDBNoSQL/CosmosSystemTextJSonSerializer.cs +++ b/dotnet/src/Connectors/Connectors.Memory.AzureCosmosDBNoSQL/CosmosSystemTextJSonSerializer.cs @@ -3,6 +3,7 @@ // Taken from https://github.com/Azure/azure-cosmos-dotnet-v3/pull/4332 using System; +using System.Diagnostics.CodeAnalysis; using System.IO; using System.Reflection; using System.Text.Json; @@ -13,7 +14,7 @@ namespace Microsoft.Azure.Cosmos; /// /// This class provides a default implementation of System.Text.Json Cosmos Linq Serializer. /// -internal class CosmosSystemTextJsonSerializer : CosmosLinqSerializer +internal sealed class CosmosSystemTextJsonSerializer : CosmosLinqSerializer { /// /// A read-only instance of . @@ -32,6 +33,7 @@ public CosmosSystemTextJsonSerializer( } /// + [return: MaybeNull] public override T FromStream(Stream stream) { if (stream == null) @@ -107,9 +109,9 @@ public override Stream ToStream(T input) /// To handle such scenarios, please create a custom serializer which inherits from the and overrides the /// SerializeMemberName to add any special handling. /// - public override string SerializeMemberName(MemberInfo memberInfo) + public override string? SerializeMemberName(MemberInfo memberInfo) { - JsonExtensionDataAttribute jsonExtensionDataAttribute = + JsonExtensionDataAttribute? jsonExtensionDataAttribute = memberInfo.GetCustomAttribute(true); if (jsonExtensionDataAttribute != null) @@ -117,12 +119,12 @@ public override string SerializeMemberName(MemberInfo memberInfo) return null; } - JsonPropertyNameAttribute jsonPropertyNameAttribute = memberInfo.GetCustomAttribute(true); - - string memberName = !string.IsNullOrEmpty(jsonPropertyNameAttribute?.Name) - ? jsonPropertyNameAttribute.Name - : memberInfo.Name; + JsonPropertyNameAttribute? jsonPropertyNameAttribute = memberInfo.GetCustomAttribute(true); + if (jsonPropertyNameAttribute is { } && !string.IsNullOrEmpty(jsonPropertyNameAttribute.Name)) + { + return jsonPropertyNameAttribute.Name; + } - return memberName; + return memberInfo.Name; } } From 5cf5f8799cd510a40d1e674c4751bd941ea4a1a2 Mon Sep 17 00:00:00 2001 From: Kevin Pilch Date: Sun, 19 May 2024 14:10:16 -0700 Subject: [PATCH 27/27] Use QueryDefinition to avoid injection --- .../AzureCosmosDBNoSQLMemoryStore.cs | 94 ++++++------------- 1 file changed, 31 insertions(+), 63 deletions(-) diff --git a/dotnet/src/Connectors/Connectors.Memory.AzureCosmosDBNoSQL/AzureCosmosDBNoSQLMemoryStore.cs b/dotnet/src/Connectors/Connectors.Memory.AzureCosmosDBNoSQL/AzureCosmosDBNoSQLMemoryStore.cs index d3df09f8350c..70d6210fc355 100644 --- a/dotnet/src/Connectors/Connectors.Memory.AzureCosmosDBNoSQL/AzureCosmosDBNoSQLMemoryStore.cs +++ b/dotnet/src/Connectors/Connectors.Memory.AzureCosmosDBNoSQL/AzureCosmosDBNoSQLMemoryStore.cs @@ -14,10 +14,6 @@ using Microsoft.SemanticKernel.Http; using Microsoft.SemanticKernel.Memory; -#if NET6_0_OR_GREATER -using System.Globalization; -#endif - namespace Microsoft.SemanticKernel.Connectors.AzureCosmosDBNoSQL; /// @@ -131,10 +127,12 @@ public async Task DoesCollectionExistAsync( string collectionName, CancellationToken cancellationToken = default) { + var queryDefinition = new QueryDefinition("SELECT VALUE(c.id) FROM c WHERE c.id = @collectionName"); + queryDefinition.WithParameter("@collectionName", collectionName); using var feedIterator = this. _cosmosClient .GetDatabase(this._databaseName) - .GetContainerQueryIterator($"SELECT VALUE(c.id) FROM c WHERE c.id = '{collectionName}'"); + .GetContainerQueryIterator(queryDefinition); while (feedIterator.HasMoreResults) { @@ -212,20 +210,36 @@ public async IAsyncEnumerable GetBatchAsync( [EnumeratorCancellation] CancellationToken cancellationToken = default) { const string OR = " OR "; - - // Optimistically create the entire query string. - var whereClause = string.Join(OR, keys.Select(k => $"(x.id = \"{k}\" AND x.key = \"{k}\")")); - var queryDefinition = new QueryDefinition($""" + var queryStart = $""" SELECT x.id,x.key,x.metadata,x.timestamp{(withEmbeddings ? ",x.embedding" : "")} FROM x - WHERE {whereClause} - """); - - // NOTE: Cosmos DB queries are limited to 512kB, so if this is larger than that, break it into segments. - var byteCount = Encoding.UTF8.GetByteCount(whereClause); - var ratio = byteCount / ((float)(512 * 1024)); - if (ratio < 1) + WHERE + """; + // NOTE: Cosmos DB queries are limited to 512kB, so we'll break this into chunks + // of around 500kB. We don't go all the way to 512kB so that we don't have to + // remove the last clause we added once we go over. + int keyIndex = 0; + var keyList = keys.ToList(); + while (keyIndex < keyList.Count) { + var length = queryStart.Length; + var countThisBatch = 0; + var whereClauses = new StringBuilder(); + for (int i = keyIndex; i < keyList.Count && length <= 500 * 1024; i++, countThisBatch++) + { + string keyId = $"@key{i:D}"; + var clause = $"(x.id = {keyId} AND x.key = {keyId})"; + whereClauses.Append(clause).Append(OR); + length += clause.Length + OR.Length + 4 + keyId.Length + Encoding.UTF8.GetByteCount(keyList[keyIndex]); + } + whereClauses.Length -= OR.Length; + + var queryDefinition = new QueryDefinition(queryStart + whereClauses); + for (int i = keyIndex; i < keyIndex + countThisBatch; i++) + { + queryDefinition.WithParameter($"@key{i:D}", keyList[i]); + } + var feedIterator = this._cosmosClient .GetDatabase(this._databaseName) .GetContainer(collectionName) @@ -238,54 +252,8 @@ FROM x yield return memoryRecord; } } - } - else - { - // We're in the very large case, we'll need to split this into multiple queries. - // We add one to catch any fractional piece left in the last segment - var segments = (int)(ratio + 1); - var keyList = keys.ToList(); - var keysPerQuery = keyList.Count / segments; - // Make a guess as to how long this query will be. We need at least 26 chars for each "OR" block, so - // put a few extra for the values of the keys. - var estimatedWhereLength = 30 * keysPerQuery; - var localWhere = new StringBuilder(estimatedWhereLength); - for (var i = 0; i < segments; i++) - { - localWhere.Clear(); - for (var q = i * keysPerQuery; q < (i + 1) * keysPerQuery && q < keyList.Count; q++) - { - var k = keyList[q]; -#if NET6_0_OR_GREATER - localWhere.Append(CultureInfo.InvariantCulture, $"(x.id = \"{k}\" AND x.key = \"{k}\")").Append(OR); -#else - localWhere.Append($"(x.id = \"{k}\" AND x.key = \"{k}\")").Append(OR); -#endif - } - if (localWhere.Length >= OR.Length) - { - localWhere.Length -= OR.Length; - - var localQueryDefinition = new QueryDefinition($""" - SELECT x.id,x.key,x.metadata,x.timestamp{(withEmbeddings ? ",x.embedding" : "")} - FROM x - WHERE {localWhere} - """); - var feedIterator = this._cosmosClient - .GetDatabase(this._databaseName) - .GetContainer(collectionName) - .GetItemQueryIterator(localQueryDefinition); - - while (feedIterator.HasMoreResults) - { - foreach (var memoryRecord in await feedIterator.ReadNextAsync(cancellationToken).ConfigureAwait(false)) - { - yield return memoryRecord; - } - } - } - } + keyIndex += countThisBatch; } }