diff --git a/llm-service/app/ai/indexing/index.py b/llm-service/app/ai/indexing/index.py
index 5661d5e05..5d59dc154 100644
--- a/llm-service/app/ai/indexing/index.py
+++ b/llm-service/app/ai/indexing/index.py
@@ -57,13 +57,7 @@
     ".txt": NopReader,
     ".md": NopReader,
 }
-CHUNKABLE_FILE_EXTENSIONS = set(
-    [
-        ".pdf",
-        ".txt",
-        ".md",
-    ]
-)
+CHUNKABLE_FILE_EXTENSIONS = {".pdf", ".txt", ".md"}
 
 @dataclass
 class NotSupportedFileExtensionError(Exception):
@@ -101,6 +95,7 @@ def index_file(self, file_path: str, file_id: str):
 
         for chunk, embedding in zip(chunks, embeddings):
             chunk.embedding = embedding
+            chunk.metadata["file_name"] = os.path.basename(file_path)
 
         logger.debug(f"Adding {len(chunks)} chunks to vector store")
         chunks_vector_store = self.chunks_vector_store.access_vector_store()
@@ -113,6 +108,7 @@ def _documents_in_file(self, reader: BaseReader, file_path: str, file_id: str) -
 
         for i, document in enumerate(documents):
             # Update the document metadata
+            document.id_ = file_id
             document.metadata["file_id"] = file_id
             document.metadata["document_part_number"] = i
             document.metadata["data_source_id"] = self.data_source_id
@@ -124,6 +120,7 @@ def _chunks_in_document(self, document: Document) -> List[BaseNode]:
 
         for j, chunk in enumerate(chunks):
             chunk.metadata["file_id"] = document.metadata["file_id"]
+            chunk.metadata["document_id"] = document.metadata["file_id"]
             chunk.metadata["document_part_number"] = document.metadata["document_part_number"]
             chunk.metadata["chunk_number"] = j
             chunk.metadata["data_source_id"] = document.metadata["data_source_id"]
diff --git a/llm-service/app/routers/index/data_source/__init__.py b/llm-service/app/routers/index/data_source/__init__.py
index ca073d772..bb5e2bfae 100644
--- a/llm-service/app/routers/index/data_source/__init__.py
+++ b/llm-service/app/routers/index/data_source/__init__.py
@@ -28,7 +28,6 @@
 # DATA.
 #
 ##############################################################################
-import http
 import logging
 import os
 import tempfile
@@ -39,8 +38,8 @@
 from .... import exceptions
 from ....services import doc_summaries, qdrant, s3
 from ....ai.indexing.index import Indexer
-from ....services.rag_vector_store import create_rag_vector_store
-from ....services.models import get_embedding_model
+from ....services import rag_vector_store
+from ....services import models
 from llama_index.core.node_parser import SentenceSplitter
 
 logger = logging.getLogger(__name__)
@@ -143,7 +142,7 @@ def download_and_index(
             chunk_size=request.configuration.chunk_size,
             chunk_overlap=int(request.configuration.chunk_overlap * 0.01 * request.configuration.chunk_size),
         ),
-        embedding_model=get_embedding_model(),
-        chunks_vector_store=create_rag_vector_store(data_source_id)
+        embedding_model=models.get_embedding_model(),
+        chunks_vector_store=rag_vector_store.create_rag_vector_store(data_source_id)
     )
     indexer.index_file(file_path, request.document_id)
diff --git a/llm-service/app/services/CaiiModel.py b/llm-service/app/services/CaiiModel.py
index f1897edbc..e71b5cf4f 100644
--- a/llm-service/app/services/CaiiModel.py
+++ b/llm-service/app/services/CaiiModel.py
@@ -60,7 +60,8 @@ def __init__(
             api_base=api_base,
             messages_to_prompt=messages_to_prompt,
             completion_to_prompt=completion_to_prompt,
-            default_headers=default_headers)
+            default_headers=default_headers,
+            context=context)
         self.context = context
 
     @property
diff --git a/llm-service/app/tests/conftest.py b/llm-service/app/tests/conftest.py
index 40530c7df..7fb6154fb 100644
--- a/llm-service/app/tests/conftest.py
+++ b/llm-service/app/tests/conftest.py
@@ -55,7 +55,7 @@
 from app.main import app
 from app.services import models, rag_vector_store
 from app.services.rag_qdrant_vector_store import RagQdrantVectorStore
-
+from app.services.utils import get_last_segment
 
 @pytest.fixture
 def aws_region() -> str:
@@ -92,7 +92,7 @@ def data_source_id() -> int:
 @pytest.fixture
 def index_document_request_body(data_source_id, s3_object) -> dict[str, Any]:
     return {
-        "document_id": s3_object.key,
+        "document_id": get_last_segment(s3_object.key),
         "data_source_id": data_source_id,
         "s3_bucket_name": s3_object.bucket_name,
         "s3_document_key": s3_object.key,
diff --git a/llm-service/app/tests/routers/index/test_data_source.py b/llm-service/app/tests/routers/index/test_data_source.py
index 698bfbde8..b88e4ccdc 100644
--- a/llm-service/app/tests/routers/index/test_data_source.py
+++ b/llm-service/app/tests/routers/index/test_data_source.py
@@ -40,7 +40,6 @@
 
 from typing import Any
 
-import pytest
 from llama_index.core import VectorStoreIndex
 from llama_index.core.vector_stores import VectorStoreQuery
 
@@ -53,7 +52,6 @@ def get_vector_store_index(data_source_id) -> VectorStoreIndex:
     index = VectorStoreIndex.from_vector_store(vector_store, embed_model=models.get_embedding_model())
     return index
 
-@pytest.mark.skip(reason="The test and the http handler are getting different vector stores and I'm not sure how they were getting the same one before. Re-enabling this test requires dependencies to be defined more explicitly.")
 class TestDocumentIndexing:
     @staticmethod
     def test_create_document(
diff --git a/llm-service/app/tests/routers/index/test_doc_summaries.py b/llm-service/app/tests/routers/index/test_doc_summaries.py
index aa80fac50..92ffda5b1 100644
--- a/llm-service/app/tests/routers/index/test_doc_summaries.py
+++ b/llm-service/app/tests/routers/index/test_doc_summaries.py
@@ -39,7 +39,6 @@
 import pytest
 from typing import Any
 
-@pytest.mark.skip(reason="The test and the http handler are getting different vector stores and I'm not sure how they were getting the same one before. Re-enabling this test requires dependencies to be defined more explicitly.")
 class TestDocumentSummaries:
     @staticmethod
     def test_generate_summary(client, index_document_request_body: dict[str, Any], data_source_id, document_id, s3_object) -> None:
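Note on the conftest.py change: `get_last_segment` is imported from `app.services.utils`, but its implementation is not part of this diff. A minimal sketch of what such a helper might look like, assuming S3 object keys are `/`-delimited (the body below is an assumption for illustration, not the actual implementation):

```python
# Hypothetical sketch of app/services/utils.get_last_segment (not shown in this diff).
# Assumes S3 object keys look like "<prefix>/<file_name>".
def get_last_segment(path: str) -> str:
    # Keep only the final "/"-delimited segment of the key.
    return path.split("/")[-1]


# Usage matching the conftest.py fixture above:
# get_last_segment("123/report.pdf") -> "report.pdf"
```

If the helper works this way, the fixture's `document_id` is now the tail of the S3 key rather than the full key, which lines up with the indexer changes that stamp `file_id`/`document_id` into document and chunk metadata.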