Skip to content

Commit 17da04d

Browse files
authored
Fixed chunk size limit to be measured in tokens instead of characters. (#394)
1 parent 926da15 commit 17da04d

File tree

4 files changed

+42
-6
lines changed

4 files changed

+42
-6
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -14,6 +14,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
1414
### Fixed
1515

1616
- Fixed connection closed or lost on `ConnectionPool` by using `check_connection` to verify if the connection is still working.
17+
- Fixed chunks length check to use the correct number of tokens instead of the number of characters.
1718

1819
## [0.1.4] - 2025-05-22
1920

daiv/codebase/chunking/loaders.py

Lines changed: 6 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -134,15 +134,17 @@ def _split(self, document: dict[str | None, list[Document]]) -> list[Document]:
134134
text_splitter = self._get_text_splitter(language)
135135

136136
for doc in split_documents(text_splitter, documents):
137-
# skip chunks that are too large
138-
if len(doc.page_content) > settings.CHUNK_SIZE * 2:
137+
# Skip chunks that are too large.
138+
# We multiply by 5 to try to convert (approximately) the number of tokens to the number of characters.
139+
content_length = len(doc.page_content)
140+
if content_length > settings.EMBEDDINGS_MAX_INPUT_TOKENS * 5:
139141
logger.warning(
140142
"Chunk is too large, skipping: %s. "
141143
"Consider excluding it from being indexed on .daiv.yml. "
142144
"Chunk size: %d, max allowed: %d",
143145
doc.metadata["source"],
144-
len(doc.page_content),
145-
settings.CHUNK_SIZE * 2,
146+
content_length,
147+
settings.EMBEDDINGS_MAX_INPUT_TOKENS * 5,
146148
)
147149
continue
148150

daiv/codebase/conf.py

Lines changed: 7 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -48,6 +48,13 @@ class CodebaseSettings(BaseSettings):
4848
EMBEDDINGS_BATCH_SIZE: int = Field(
4949
default=500, description="Batch size for the embeddings. Only used for OpenAI models."
5050
)
51+
EMBEDDINGS_MAX_INPUT_TOKENS: int = Field(
52+
default=8192,
53+
description=(
54+
"Maximum number of tokens to embed. "
55+
"A warning will be raised if the number of tokens is higher than this value and the chunk will be skipped."
56+
),
57+
)
5158

5259
# Chunking
5360
CHUNK_SIZE: int = Field(

daiv/codebase/search_engines/semantic.py

Lines changed: 28 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -2,6 +2,7 @@
22
import logging
33
from textwrap import dedent
44

5+
import tiktoken
56
from daiv.settings.components import DATA_DIR
67
from langchain_core.documents import Document
78
from langchain_core.embeddings import Embeddings
@@ -129,21 +130,34 @@ def _build_content_to_embed(self, document: Document, description: str) -> str:
129130
str: Content to embed
130131
"""
131132
if not description:
132-
return dedent(f"""\
133+
content = dedent(f"""\
133134
Repository: {document.metadata.get("repo_id", "")}
134135
File Path: {document.metadata.get("source", "")}
135136
136137
{document.page_content}
137138
""")
138139
else:
139-
return dedent(f"""\
140+
content = dedent(f"""\
140141
Repository: {document.metadata.get("repo_id", "")}
141142
File Path: {document.metadata.get("source", "")}
142143
Description: {description}
143144
144145
{document.page_content}
145146
""")
146147

148+
count = self._embeddings_count_tokens(content)
149+
150+
if count > settings.EMBEDDINGS_MAX_INPUT_TOKENS:
151+
logger.warning(
152+
"Chunk is too large, truncating: %s. Chunk tokens: %d, max allowed: %d",
153+
document.metadata["source"],
154+
self._embeddings_count_tokens(content),
155+
settings.EMBEDDINGS_MAX_INPUT_TOKENS,
156+
)
157+
return content[: settings.EMBEDDINGS_MAX_INPUT_TOKENS]
158+
159+
return content
160+
147161
def delete_documents(self, namespace: CodebaseNamespace, source: str | list[str]):
148162
"""
149163
Deletes documents from the namespace matching the given source(s).
@@ -196,3 +210,15 @@ def as_retriever(self, namespace: CodebaseNamespace | None = None, **kwargs) ->
196210
if namespace is None:
197211
return PostgresRetriever(embeddings=self.embeddings, **kwargs)
198212
return ScopedPostgresRetriever(namespace=namespace, embeddings=self.embeddings, **kwargs)
213+
214+
def _embeddings_count_tokens(self, text: str) -> int:
    """
    Count the number of tokens in ``text`` for the configured embeddings model.

    The provider is derived from ``settings.EMBEDDINGS_MODEL_NAME``, which is
    expected to be of the form ``"<provider>/<model_name>"`` (split on the
    first ``/`` only, so model names may themselves contain slashes).

    Args:
        text: The text to tokenize.

    Returns:
        int: The number of tokens in ``text``.
    """
    provider, model_name = settings.EMBEDDINGS_MODEL_NAME.split("/", 1)

    if provider == "voyageai":
        # The Voyage AI client counts tokens for a batch of texts, so wrap
        # the single text in a list.
        # NOTE(review): relies on the private `_client` attribute of the
        # embeddings wrapper — confirm this is a stable access path.
        return self.embeddings._client.count_tokens([text], model=model_name)
    if provider == "openai":
        try:
            encoding = tiktoken.encoding_for_model(model_name)
        except KeyError:
            # Model names not yet registered in tiktoken (e.g. newly released
            # OpenAI models) would otherwise raise; fall back to the encoding
            # used by current OpenAI embedding models.
            encoding = tiktoken.get_encoding("cl100k_base")
        return len(encoding.encode(text))
    # Unknown provider: approximate with a widely used default encoding.
    return len(tiktoken.get_encoding("cl100k_base").encode(text))

0 commit comments

Comments
 (0)