Skip to content

A few tweaks, fix a test, and a couple of bugs #29

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Nov 21, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 4 additions & 7 deletions llm-service/app/ai/indexing/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,13 +57,7 @@
".txt": NopReader,
".md": NopReader,
}
CHUNKABLE_FILE_EXTENSIONS = set(
[
".pdf",
".txt",
".md",
]
)
CHUNKABLE_FILE_EXTENSIONS = {".pdf", ".txt", ".md"}

@dataclass
class NotSupportedFileExtensionError(Exception):
Expand Down Expand Up @@ -101,6 +95,7 @@ def index_file(self, file_path: str, file_id: str):

for chunk, embedding in zip(chunks, embeddings):
chunk.embedding = embedding
chunk.metadata["file_name"] = os.path.basename(file_path)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not sure this is true unless we save with the original file name to S3. I thought we saved with the UUID.


logger.debug(f"Adding {len(chunks)} chunks to vector store")
chunks_vector_store = self.chunks_vector_store.access_vector_store()
Expand All @@ -113,6 +108,7 @@ def _documents_in_file(self, reader: BaseReader, file_path: str, file_id: str) -

for i, document in enumerate(documents):
# Update the document metadata
document.id_ = file_id
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No need to couple them, right? Is there a reason why we need them to match?

document.metadata["file_id"] = file_id
document.metadata["document_part_number"] = i
document.metadata["data_source_id"] = self.data_source_id
Expand All @@ -124,6 +120,7 @@ def _chunks_in_document(self, document: Document) -> List[BaseNode]:

for j, chunk in enumerate(chunks):
chunk.metadata["file_id"] = document.metadata["file_id"]
chunk.metadata["document_id"] = document.metadata["file_id"]
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not sure why? Maybe document_id on both sides?

chunk.metadata["document_part_number"] = document.metadata["document_part_number"]
chunk.metadata["chunk_number"] = j
chunk.metadata["data_source_id"] = document.metadata["data_source_id"]
Expand Down
9 changes: 4 additions & 5 deletions llm-service/app/routers/index/data_source/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,6 @@
# DATA.
# ##############################################################################

import http
import logging
import os
import tempfile
Expand All @@ -39,8 +38,8 @@
from .... import exceptions
from ....services import doc_summaries, qdrant, s3
from ....ai.indexing.index import Indexer
from ....services.rag_vector_store import create_rag_vector_store
from ....services.models import get_embedding_model
from ....services import rag_vector_store
from ....services import models
from llama_index.core.node_parser import SentenceSplitter

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -143,7 +142,7 @@ def download_and_index(
chunk_size=request.configuration.chunk_size,
chunk_overlap=int(request.configuration.chunk_overlap * 0.01 * request.configuration.chunk_size),
),
embedding_model=get_embedding_model(),
chunks_vector_store=create_rag_vector_store(data_source_id)
embedding_model=models.get_embedding_model(),
chunks_vector_store=rag_vector_store.create_rag_vector_store(data_source_id)
)
indexer.index_file(file_path, request.document_id)
3 changes: 2 additions & 1 deletion llm-service/app/services/CaiiModel.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,8 @@ def __init__(
api_base=api_base,
messages_to_prompt=messages_to_prompt,
completion_to_prompt=completion_to_prompt,
default_headers=default_headers)
default_headers=default_headers,
context=context)
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this fix is actually on main, I think

self.context = context

@property
Expand Down
4 changes: 2 additions & 2 deletions llm-service/app/tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@
from app.main import app
from app.services import models, rag_vector_store
from app.services.rag_qdrant_vector_store import RagQdrantVectorStore

from app.services.utils import get_last_segment

@pytest.fixture
def aws_region() -> str:
Expand Down Expand Up @@ -92,7 +92,7 @@ def data_source_id() -> int:
@pytest.fixture
def index_document_request_body(data_source_id, s3_object) -> dict[str, Any]:
return {
"document_id": s3_object.key,
"document_id": get_last_segment(s3_object.key),
"data_source_id": data_source_id,
"s3_bucket_name": s3_object.bucket_name,
"s3_document_key": s3_object.key,
Expand Down
2 changes: 0 additions & 2 deletions llm-service/app/tests/routers/index/test_data_source.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,6 @@

from typing import Any

import pytest
from llama_index.core import VectorStoreIndex
from llama_index.core.vector_stores import VectorStoreQuery

Expand All @@ -53,7 +52,6 @@ def get_vector_store_index(data_source_id) -> VectorStoreIndex:
index = VectorStoreIndex.from_vector_store(vector_store, embed_model=models.get_embedding_model())
return index

@pytest.mark.skip(reason="The test and the http handler are getting different vector stores and I'm not sure how they were getting the same one before. Re-enabling this test requires dependencies to be defined more explicitly.")
class TestDocumentIndexing:
@staticmethod
def test_create_document(
Expand Down
1 change: 0 additions & 1 deletion llm-service/app/tests/routers/index/test_doc_summaries.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,6 @@
import pytest
from typing import Any

@pytest.mark.skip(reason="The test and the http handler are getting different vector stores and I'm not sure how they were getting the same one before. Re-enabling this test requires dependencies to be defined more explicitly.")
class TestDocumentSummaries:
@staticmethod
def test_generate_summary(client, index_document_request_body: dict[str, Any], data_source_id, document_id, s3_object) -> None:
Expand Down
Loading