Skip to content

Commit 17da04d

Browse files
authored
Fixed chunk size limit to be measured in tokens instead of characters. (#394)
1 parent 926da15 commit 17da04d

File tree

4 files changed

+42
-6
lines changed

4 files changed

+42
-6
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -14,6 +14,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
1414
### Fixed
1515

1616
- Fixed connection closed or lost on `ConnectionPool` by using `check_connection` to verify if the connection is still working.
17+
- Fixed chunks length check to use the correct number of tokens instead of the number of characters.
1718

1819
## [0.1.4] - 2025-05-22
1920

daiv/codebase/chunking/loaders.py

Lines changed: 6 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -134,15 +134,17 @@ def _split(self, document: dict[str | None, list[Document]]) -> list[Document]:
134134
text_splitter = self._get_text_splitter(language)
135135

136136
for doc in split_documents(text_splitter, documents):
137-
# skip chunks that are too large
138-
if len(doc.page_content) > settings.CHUNK_SIZE * 2:
137+
# Skip chunks that are too large.
138+
# We multiply by 5 to try to convert (approximately) the number of tokens to the number of characters.
139+
content_length = len(doc.page_content)
140+
if content_length > settings.EMBEDDINGS_MAX_INPUT_TOKENS * 5:
139141
logger.warning(
140142
"Chunk is too large, skipping: %s. "
141143
"Consider excluding it from being indexed on .daiv.yml. "
142144
"Chunk size: %d, max allowed: %d",
143145
doc.metadata["source"],
144-
len(doc.page_content),
145-
settings.CHUNK_SIZE * 2,
146+
content_length,
147+
settings.EMBEDDINGS_MAX_INPUT_TOKENS * 5,
146148
)
147149
continue
148150

daiv/codebase/conf.py

Lines changed: 7 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -48,6 +48,13 @@ class CodebaseSettings(BaseSettings):
4848
EMBEDDINGS_BATCH_SIZE: int = Field(
4949
default=500, description="Batch size for the embeddings. Only used for OpenAI models."
5050
)
51+
EMBEDDINGS_MAX_INPUT_TOKENS: int = Field(
52+
default=8192,
53+
description=(
54+
"Maximum number of tokens to embed. "
55+
"A warning will be raised if the number of tokens is higher than this value and the chunk will be skipped."
56+
),
57+
)
5158

5259
# Chunking
5360
CHUNK_SIZE: int = Field(

daiv/codebase/search_engines/semantic.py

Lines changed: 28 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -2,6 +2,7 @@
22
import logging
33
from textwrap import dedent
44

5+
import tiktoken
56
from daiv.settings.components import DATA_DIR
67
from langchain_core.documents import Document
78
from langchain_core.embeddings import Embeddings
@@ -129,21 +130,34 @@ def _build_content_to_embed(self, document: Document, description: str) -> str:
129130
str: Content to embed
130131
"""
131132
if not description:
132-
return dedent(f"""\
133+
content = dedent(f"""\
133134
Repository: {document.metadata.get("repo_id", "")}
134135
File Path: {document.metadata.get("source", "")}
135136
136137
{document.page_content}
137138
""")
138139
else:
139-
return dedent(f"""\
140+
content = dedent(f"""\
140141
Repository: {document.metadata.get("repo_id", "")}
141142
File Path: {document.metadata.get("source", "")}
142143
Description: {description}
143144
144145
{document.page_content}
145146
""")
146147

148+
count = self._embeddings_count_tokens(content)
149+
150+
if count > settings.EMBEDDINGS_MAX_INPUT_TOKENS:
151+
logger.warning(
152+
"Chunk is too large, truncating: %s. Chunk tokens: %d, max allowed: %d",
153+
document.metadata["source"],
154+
self._embeddings_count_tokens(content),
155+
settings.EMBEDDINGS_MAX_INPUT_TOKENS,
156+
)
157+
return content[: settings.EMBEDDINGS_MAX_INPUT_TOKENS]
158+
159+
return content
160+
147161
def delete_documents(self, namespace: CodebaseNamespace, source: str | list[str]):
148162
"""
149163
Deletes documents from the namespace matching the given source(s).
@@ -196,3 +210,15 @@ def as_retriever(self, namespace: CodebaseNamespace | None = None, **kwargs) ->
196210
if namespace is None:
197211
return PostgresRetriever(embeddings=self.embeddings, **kwargs)
198212
return ScopedPostgresRetriever(namespace=namespace, embeddings=self.embeddings, **kwargs)
213+
214+
def _embeddings_count_tokens(self, text: str) -> int:
    """
    Count the number of tokens in ``text`` for the configured embeddings model.

    The provider is derived from ``settings.EMBEDDINGS_MODEL_NAME``, which is
    expected to be of the form ``"<provider>/<model_name>"`` (split on the
    first ``/`` only, so model names may themselves contain slashes).

    Args:
        text: The text to tokenize.

    Returns:
        int: The number of tokens in ``text``.
    """
    provider, model_name = settings.EMBEDDINGS_MODEL_NAME.split("/", 1)

    if provider == "voyageai":
        # The Voyage AI client counts tokens for a batch of texts, so wrap
        # the single text in a list.
        # NOTE(review): relies on the private `_client` attribute of the
        # embeddings wrapper — confirm this is a stable access path.
        return self.embeddings._client.count_tokens([text], model=model_name)
    if provider == "openai":
        try:
            encoding = tiktoken.encoding_for_model(model_name)
        except KeyError:
            # Model names not yet registered in tiktoken (e.g. newly released
            # OpenAI models) would otherwise raise; fall back to the encoding
            # used by current OpenAI embedding models.
            encoding = tiktoken.get_encoding("cl100k_base")
        return len(encoding.encode(text))
    # Unknown provider: approximate with a widely used default encoding.
    return len(tiktoken.get_encoding("cl100k_base").encode(text))

0 commit comments

Comments
 (0)