Skip to content

Commit 5e58b94

Browse files
authored
[BUG] Raise Error when can't deserialize configuration json from server, lazily load ef on CollectionModel, warn on api_key (#4471)
## Description of changes This PR fixes a bug where the client only warned users when the configuration was not deserializable instead of raising an error, leaving them with an empty config. Now, it raises an error if the configuration is not deserializable. This will only happen when the client is ahead of the server, which we do not need to support. EFs from the config are now lazily loaded: the EF is built only when `._embed` is invoked on the model, and it will take precedence over the provided EF. This also fixes a bug with `list_collections`, where if the user tries to load collections without the proper embedding function parameters set up (e.g., an API key env var), it would throw an "hnsw and spann not configured" error, which is not the root cause. Instead, it will now load only the `configuration_json` and try to load the EF at embed time, so errors will only propagate at that point, and with much clearer error messages. Now, using `api_key` directly in embedding_functions will give a deprecation warning saying it is not persisted. ## Test plan _How are these changes tested?_ - [ ] Tests pass locally with `pytest` for python, `yarn test` for js, `cargo test` for rust ## Documentation Changes _Are all docstrings for user-facing APIs updated if required? Do we need to make documentation changes in the [docs section](https://github.com/chroma-core/chroma/tree/main/docs/docs.trychroma.com)?_
1 parent e46b4a2 commit 5e58b94

20 files changed

+436
-118
lines changed

chromadb/api/collection_configuration.py

Lines changed: 15 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -82,12 +82,24 @@ def load_collection_configuration_from_json(
8282
ef = None
8383
else:
8484
try:
85-
ef = known_embedding_functions[ef_config["name"]]
86-
ef = ef.build_from_config(ef_config["config"]) # type: ignore
85+
ef_name = ef_config["name"]
86+
except KeyError:
87+
raise ValueError(
88+
f"Embedding function name not found in config: {ef_config}"
89+
)
90+
try:
91+
ef = known_embedding_functions[ef_name]
8792
except KeyError:
8893
raise ValueError(
89-
f"Embedding function {ef_config['name']} not found. Add @register_embedding_function decorator to the class definition."
94+
f"Embedding function {ef_name} not found. Add @register_embedding_function decorator to the class definition."
9095
)
96+
try:
97+
ef = ef.build_from_config(ef_config["config"]) # type: ignore
98+
except Exception as e:
99+
raise ValueError(
100+
f"Could not build embedding function {ef_config['name']} from config {ef_config['config']}: {e}"
101+
)
102+
91103
else:
92104
ef = None
93105

chromadb/api/models/CollectionCommon.py

Lines changed: 8 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -129,18 +129,7 @@ def __init__(
129129
if embedding_function is not None:
130130
validate_embedding_function(embedding_function)
131131

132-
config_ef = self.configuration.get("embedding_function")
133-
if config_ef is not None:
134-
if embedding_function is not None and not isinstance(
135-
embedding_function, ef.DefaultEmbeddingFunction
136-
):
137-
if embedding_function.name() is not config_ef.name():
138-
raise ValueError(
139-
f"Embedding function name mismatch: {embedding_function.name()} != {config_ef.name()}"
140-
)
141-
self._embedding_function = config_ef
142-
else:
143-
self._embedding_function = embedding_function
132+
self._embedding_function = embedding_function
144133
self._data_loader = data_loader
145134

146135
# Expose the model properties as read-only properties on the Collection class
@@ -567,6 +556,13 @@ def _embed_record_set(
567556
)
568557

569558
def _embed(self, input: Any) -> Embeddings:
559+
if self._embedding_function is not None and not isinstance(
560+
self._embedding_function, ef.DefaultEmbeddingFunction
561+
):
562+
return self._embedding_function(input=input)
563+
config_ef = self.configuration.get("embedding_function")
564+
if config_ef is not None:
565+
return config_ef(input=input)
570566
if self._embedding_function is None:
571567
raise ValueError(
572568
"You must provide an embedding function to compute embeddings."

chromadb/api/rust.py

Lines changed: 3 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,6 @@
1717
UpdateCollectionConfiguration,
1818
create_collection_configuration_to_json_str,
1919
update_collection_configuration_to_json_str,
20-
load_collection_configuration_from_json,
2120
)
2221
from chromadb.auth import UserIdentity
2322
from chromadb.config import DEFAULT_DATABASE, DEFAULT_TENANT, Settings, System
@@ -191,9 +190,7 @@ def list_collections(
191190
CollectionModel(
192191
id=collection.id,
193192
name=collection.name,
194-
configuration=load_collection_configuration_from_json(
195-
collection.configuration
196-
),
193+
configuration_json=collection.configuration,
197194
metadata=collection.metadata,
198195
dimension=collection.dimension,
199196
tenant=collection.tenant,
@@ -233,9 +230,7 @@ def create_collection(
233230
collection_model = CollectionModel(
234231
id=collection.id,
235232
name=collection.name,
236-
configuration=load_collection_configuration_from_json(
237-
collection.configuration
238-
),
233+
configuration_json=collection.configuration,
239234
metadata=collection.metadata,
240235
dimension=collection.dimension,
241236
tenant=collection.tenant,
@@ -254,9 +249,7 @@ def get_collection(
254249
return CollectionModel(
255250
id=collection.id,
256251
name=collection.name,
257-
configuration=load_collection_configuration_from_json(
258-
collection.configuration
259-
),
252+
configuration_json=collection.configuration,
260253
metadata=collection.metadata,
261254
dimension=collection.dimension,
262255
tenant=collection.tenant,

chromadb/api/segment.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
from chromadb.api.collection_configuration import (
44
CreateCollectionConfiguration,
55
UpdateCollectionConfiguration,
6-
load_collection_configuration_from_create_collection_configuration,
6+
create_collection_configuration_to_json,
77
)
88
from chromadb.auth import UserIdentity
99
from chromadb.config import DEFAULT_DATABASE, DEFAULT_TENANT, Settings, System
@@ -235,7 +235,7 @@ def create_collection(
235235
id=id,
236236
name=name,
237237
metadata=metadata,
238-
configuration=load_collection_configuration_from_create_collection_configuration(
238+
configuration_json=create_collection_configuration_to_json(
239239
configuration or CreateCollectionConfiguration()
240240
),
241241
tenant=tenant,
@@ -413,7 +413,9 @@ def _fork(
413413
tenant: str = DEFAULT_TENANT,
414414
database: str = DEFAULT_DATABASE,
415415
) -> CollectionModel:
416-
raise NotImplementedError("Collection forking is not implemented for SegmentAPI")
416+
raise NotImplementedError(
417+
"Collection forking is not implemented for SegmentAPI"
418+
)
417419

418420
@trace_method("SegmentAPI.delete_collection", OpenTelemetryGranularity.OPERATION)
419421
@override

chromadb/db/impl/grpc/server.py

Lines changed: 4 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,8 @@
22
from typing import Any, Dict, List, cast
33
from uuid import UUID
44
from overrides import overrides
5-
from chromadb.api.collection_configuration import (
6-
load_collection_configuration_from_json_str,
7-
)
5+
import json
6+
87
from chromadb.config import DEFAULT_DATABASE, DEFAULT_TENANT, Component, System
98
from chromadb.proto.convert import (
109
from_proto_metadata,
@@ -286,15 +285,13 @@ def CreateCollection(
286285
f"Collection {collection_name} already exists",
287286
)
288287

289-
configuration = load_collection_configuration_from_json_str(
290-
request.configuration_json_str
291-
)
288+
configuration_json = json.loads(request.configuration_json_str)
292289

293290
id = UUID(hex=request.id)
294291
new_collection = Collection(
295292
id=id,
296293
name=request.name,
297-
configuration=configuration,
294+
configuration_json=configuration_json,
298295
metadata=from_proto_metadata(request.metadata),
299296
dimension=request.dimension,
300297
database=database,

chromadb/db/mixins/sysdb.py

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,8 @@
3838
create_collection_configuration_to_json_str,
3939
load_collection_configuration_from_json_str,
4040
CollectionConfiguration,
41-
load_collection_configuration_from_create_collection_configuration,
41+
create_collection_configuration_to_json,
42+
collection_configuration_to_json,
4243
collection_configuration_to_json_str,
4344
overwrite_collection_configuration,
4445
update_collection_configuration_from_legacy_update_metadata,
@@ -310,9 +311,7 @@ def create_collection(
310311
collection = Collection(
311312
id=id,
312313
name=name,
313-
configuration=load_collection_configuration_from_create_collection_configuration(
314-
configuration
315-
),
314+
configuration_json=create_collection_configuration_to_json(configuration),
316315
metadata=metadata,
317316
dimension=dimension,
318317
tenant=tenant,
@@ -541,7 +540,9 @@ def get_collections(
541540
Collection(
542541
id=cast(UUID, id),
543542
name=name,
544-
configuration=configuration,
543+
configuration_json=collection_configuration_to_json(
544+
configuration
545+
),
545546
metadata=metadata,
546547
dimension=dimension,
547548
tenant=str(rows[0][5]),

chromadb/proto/convert.py

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,13 @@
11
from typing import Dict, Optional, Sequence, Tuple, TypedDict, Union, cast
22
from uuid import UUID
3+
import json
34

45
import numpy as np
56
from numpy.typing import NDArray
67

78
import chromadb.proto.chroma_pb2 as chroma_pb
89
import chromadb.proto.query_executor_pb2 as query_pb
910
from chromadb.api.collection_configuration import (
10-
load_collection_configuration_from_json_str,
1111
collection_configuration_to_json_str,
1212
)
1313
from chromadb.api.types import Embedding, Where, WhereDocument
@@ -239,9 +239,7 @@ def from_proto_collection(collection: chroma_pb.Collection) -> Collection:
239239
return Collection(
240240
id=UUID(hex=collection.id),
241241
name=collection.name,
242-
configuration=load_collection_configuration_from_json_str(
243-
collection.configuration_json_str
244-
),
242+
configuration_json=json.loads(collection.configuration_json_str),
245243
metadata=from_proto_metadata(collection.metadata)
246244
if collection.HasField("metadata")
247245
else None,

0 commit comments

Comments
 (0)