diff --git a/backend/src/main/java/com/cloudera/cai/rag/external/RagBackendClient.java b/backend/src/main/java/com/cloudera/cai/rag/external/RagBackendClient.java index cbdc33d38..b1e2cd797 100644 --- a/backend/src/main/java/com/cloudera/cai/rag/external/RagBackendClient.java +++ b/backend/src/main/java/com/cloudera/cai/rag/external/RagBackendClient.java @@ -68,7 +68,8 @@ public void indexFile( + "/data_sources/" + ragDocument.dataSourceId() + "/documents/download-and-index", - new IndexRequest(bucketName, ragDocument.s3Path(), configuration)); + new IndexRequest( + ragDocument.documentId(), bucketName, ragDocument.s3Path(), configuration)); } catch (IOException e) { throw new RuntimeException(e); } @@ -97,6 +98,7 @@ public void deleteSession(Long sessionId) { } record IndexRequest( + @JsonProperty("document_id") String documentId, @JsonProperty("s3_bucket_name") String s3BucketName, @JsonProperty("s3_document_key") String s3DocumentKey, IndexConfiguration configuration) {} diff --git a/backend/src/test/java/com/cloudera/cai/rag/external/RagBackendClientTest.java b/backend/src/test/java/com/cloudera/cai/rag/external/RagBackendClientTest.java index 60f6393ff..e1f8760d7 100644 --- a/backend/src/test/java/com/cloudera/cai/rag/external/RagBackendClientTest.java +++ b/backend/src/test/java/com/cloudera/cai/rag/external/RagBackendClientTest.java @@ -57,7 +57,7 @@ void indexFile() { Tracker> tracker = new Tracker<>(); RagBackendClient client = new RagBackendClient(SimpleHttpClient.createNull(tracker)); IndexConfiguration indexConfiguration = new IndexConfiguration(123, 2); - RagDocument document = indexRequest("s3Path", 1234L); + RagDocument document = indexRequest("documentId", "s3Path", 1234L); client.indexFile(document, "bucketName", indexConfiguration); @@ -68,14 +68,15 @@ void indexFile() { new TrackedHttpRequest<>( HttpMethod.POST, "http://rag-backend:8000/data_sources/" + 1234L + "/documents/download-and-index", - new RagBackendClient.IndexRequest("bucketName", "s3Path", indexConfiguration))); + new RagBackendClient.IndexRequest( + "documentId", "bucketName", "s3Path", indexConfiguration))); } @Test void createSummary() { Tracker> tracker = new Tracker<>(); RagBackendClient client = new RagBackendClient(SimpleHttpClient.createNull(tracker)); - RagDocument document = indexRequest("s3Path", 1234L); + RagDocument document = indexRequest("documentId", "s3Path", 1234L); client.createSummary(document, "bucketName"); @@ -134,13 +135,25 @@ void deleteSession() { void null_handlesThrowable() { RagBackendClient client = RagBackendClient.createNull(new Tracker<>(), new NotFound("not found")); - RagDocument document = indexRequest("s3Path", 1234L); + RagDocument document = indexRequest("documentId", "s3Path", 1234L); assertThatThrownBy(() -> client.indexFile(document, "fakeit", null)) .isInstanceOf(NotFound.class); } - private static RagDocument indexRequest(String s3Path, Long dataSourceId) { + private static RagDocument indexRequest(String documentId, String s3Path, Long dataSourceId) { return new RagDocument( - null, null, dataSourceId, null, s3Path, null, null, null, null, null, null, null, null); + null, + null, + dataSourceId, + documentId, + s3Path, + null, + null, + null, + null, + null, + null, + null, + null); } } diff --git a/llm-service/app/ai/__init__.py b/llm-service/app/ai/__init__.py new file mode 100644 index 000000000..e2b4ac6c2 --- /dev/null +++ b/llm-service/app/ai/__init__.py @@ -0,0 +1,37 @@ +# +# CLOUDERA APPLIED MACHINE LEARNING PROTOTYPE (AMP) +# (C) Cloudera, Inc. 2024 +# All rights reserved. +# +# Applicable Open Source License: Apache 2.0 +# +# NOTE: Cloudera open source products are modular software products +# made up of hundreds of individual components, each of which was +# individually copyrighted. Each Cloudera open source product is a +# collective work under U.S. Copyright Law. Your license to use the +# collective work is as provided in your written agreement with +# Cloudera. Used apart from the collective work, this file is +# licensed for your use pursuant to the open source license +# identified above. +# +# This code is provided to you pursuant a written agreement with +# (i) Cloudera, Inc. or (ii) a third-party authorized to distribute +# this code. If you do not have a written agreement with Cloudera nor +# with an authorized and properly licensed third party, you do not +# have any rights to access nor to use this code. +# +# Absent a written agreement with Cloudera, Inc. ("Cloudera") to the +# contrary, A) CLOUDERA PROVIDES THIS CODE TO YOU WITHOUT WARRANTIES OF ANY +# KIND; (B) CLOUDERA DISCLAIMS ANY AND ALL EXPRESS AND IMPLIED +# WARRANTIES WITH RESPECT TO THIS CODE, INCLUDING BUT NOT LIMITED TO +# IMPLIED WARRANTIES OF TITLE, NON-INFRINGEMENT, MERCHANTABILITY AND +# FITNESS FOR A PARTICULAR PURPOSE; (C) CLOUDERA IS NOT LIABLE TO YOU, +# AND WILL NOT DEFEND, INDEMNIFY, NOR HOLD YOU HARMLESS FOR ANY CLAIMS +# ARISING FROM OR RELATED TO THE CODE; AND (D)WITH RESPECT TO YOUR EXERCISE +# OF ANY RIGHTS GRANTED TO YOU FOR THE CODE, CLOUDERA IS NOT LIABLE FOR ANY +# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, PUNITIVE OR +# CONSEQUENTIAL DAMAGES INCLUDING, BUT NOT LIMITED TO, DAMAGES +# RELATED TO LOST REVENUE, LOST PROFITS, LOSS OF INCOME, LOSS OF +# BUSINESS ADVANTAGE OR UNAVAILABILITY, OR LOSS OR CORRUPTION OF +# DATA. +# diff --git a/llm-service/app/ai/indexing/__init__.py b/llm-service/app/ai/indexing/__init__.py new file mode 100644 index 000000000..e2b4ac6c2 --- /dev/null +++ b/llm-service/app/ai/indexing/__init__.py @@ -0,0 +1,37 @@ +# +# CLOUDERA APPLIED MACHINE LEARNING PROTOTYPE (AMP) +# (C) Cloudera, Inc. 2024 +# All rights reserved. +# +# Applicable Open Source License: Apache 2.0 +# +# NOTE: Cloudera open source products are modular software products +# made up of hundreds of individual components, each of which was +# individually copyrighted. Each Cloudera open source product is a +# collective work under U.S. Copyright Law. Your license to use the +# collective work is as provided in your written agreement with +# Cloudera. Used apart from the collective work, this file is +# licensed for your use pursuant to the open source license +# identified above. +# +# This code is provided to you pursuant a written agreement with +# (i) Cloudera, Inc. or (ii) a third-party authorized to distribute +# this code. If you do not have a written agreement with Cloudera nor +# with an authorized and properly licensed third party, you do not +# have any rights to access nor to use this code. +# +# Absent a written agreement with Cloudera, Inc. ("Cloudera") to the +# contrary, A) CLOUDERA PROVIDES THIS CODE TO YOU WITHOUT WARRANTIES OF ANY +# KIND; (B) CLOUDERA DISCLAIMS ANY AND ALL EXPRESS AND IMPLIED +# WARRANTIES WITH RESPECT TO THIS CODE, INCLUDING BUT NOT LIMITED TO +# IMPLIED WARRANTIES OF TITLE, NON-INFRINGEMENT, MERCHANTABILITY AND +# FITNESS FOR A PARTICULAR PURPOSE; (C) CLOUDERA IS NOT LIABLE TO YOU, +# AND WILL NOT DEFEND, INDEMNIFY, NOR HOLD YOU HARMLESS FOR ANY CLAIMS +# ARISING FROM OR RELATED TO THE CODE; AND (D)WITH RESPECT TO YOUR EXERCISE +# OF ANY RIGHTS GRANTED TO YOU FOR THE CODE, CLOUDERA IS NOT LIABLE FOR ANY +# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, PUNITIVE OR +# CONSEQUENTIAL DAMAGES INCLUDING, BUT NOT LIMITED TO, DAMAGES +# RELATED TO LOST REVENUE, LOST PROFITS, LOSS OF INCOME, LOSS OF +# BUSINESS ADVANTAGE OR UNAVAILABILITY, OR LOSS OR CORRUPTION OF +# DATA. +# diff --git a/llm-service/app/ai/indexing/index.py b/llm-service/app/ai/indexing/index.py new file mode 100644 index 000000000..85ac69930 --- /dev/null +++ b/llm-service/app/ai/indexing/index.py @@ -0,0 +1,157 @@ +# +# CLOUDERA APPLIED MACHINE LEARNING PROTOTYPE (AMP) +# (C) Cloudera, Inc. 2024 +# All rights reserved. +# +# Applicable Open Source License: Apache 2.0 +# +# NOTE: Cloudera open source products are modular software products +# made up of hundreds of individual components, each of which was +# individually copyrighted. Each Cloudera open source product is a +# collective work under U.S. Copyright Law. Your license to use the +# collective work is as provided in your written agreement with +# Cloudera. Used apart from the collective work, this file is +# licensed for your use pursuant to the open source license +# identified above. +# +# This code is provided to you pursuant a written agreement with +# (i) Cloudera, Inc. or (ii) a third-party authorized to distribute +# this code. If you do not have a written agreement with Cloudera nor +# with an authorized and properly licensed third party, you do not +# have any rights to access nor to use this code. +# +# Absent a written agreement with Cloudera, Inc. ("Cloudera") to the +# contrary, A) CLOUDERA PROVIDES THIS CODE TO YOU WITHOUT WARRANTIES OF ANY +# KIND; (B) CLOUDERA DISCLAIMS ANY AND ALL EXPRESS AND IMPLIED +# WARRANTIES WITH RESPECT TO THIS CODE, INCLUDING BUT NOT LIMITED TO +# IMPLIED WARRANTIES OF TITLE, NON-INFRINGEMENT, MERCHANTABILITY AND +# FITNESS FOR A PARTICULAR PURPOSE; (C) CLOUDERA IS NOT LIABLE TO YOU, +# AND WILL NOT DEFEND, INDEMNIFY, NOR HOLD YOU HARMLESS FOR ANY CLAIMS +# ARISING FROM OR RELATED TO THE CODE; AND (D)WITH RESPECT TO YOUR EXERCISE +# OF ANY RIGHTS GRANTED TO YOU FOR THE CODE, CLOUDERA IS NOT LIABLE FOR ANY +# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, PUNITIVE OR +# CONSEQUENTIAL DAMAGES INCLUDING, BUT NOT LIMITED TO, DAMAGES +# RELATED TO LOST REVENUE, LOST PROFITS, LOSS OF INCOME, LOSS OF +# BUSINESS ADVANTAGE OR UNAVAILABILITY, OR LOSS OR CORRUPTION OF +# DATA. +# + +import logging +import os +from dataclasses import dataclass +from pathlib import Path +from typing import Dict, List, Type + +from llama_index.core.base.embeddings.base import BaseEmbedding +from llama_index.core.node_parser import SentenceSplitter +from llama_index.core.readers.base import BaseReader +from llama_index.core.schema import BaseNode, Document, TextNode +from llama_index.readers.file import DocxReader + +from ...services.vector_store import VectorStore +from .readers.nop import NopReader +from .readers.pdf import PDFReader + +logger = logging.getLogger(__name__) + +READERS: Dict[str, Type[BaseReader]] = { + ".pdf": PDFReader, + ".txt": NopReader, + ".md": NopReader, + ".docx": DocxReader, +} +CHUNKABLE_FILE_EXTENSIONS = {".pdf", ".txt", ".md", ".docx"} + + +@dataclass +class NotSupportedFileExtensionError(Exception): + file_extension: str + + +class Indexer: + def __init__( + self, + data_source_id: int, + splitter: SentenceSplitter, + embedding_model: BaseEmbedding, + chunks_vector_store: VectorStore, + ): + self.data_source_id = data_source_id + self.splitter = splitter + self.embedding_model = embedding_model + self.chunks_vector_store = chunks_vector_store + + def index_file(self, file_path: Path, file_id: str) -> None: + logger.debug(f"Indexing file: {file_path}") + + file_extension = os.path.splitext(file_path)[1] + reader_cls = READERS.get(file_extension) + if not reader_cls: + raise NotSupportedFileExtensionError(file_extension) + + reader = reader_cls() + + logger.debug(f"Parsing file: {file_path}") + + documents = self._documents_in_file(reader, file_path, file_id) + if file_extension in CHUNKABLE_FILE_EXTENSIONS: + logger.debug(f"Chunking file: {file_path}") + chunks = [ + chunk + for document in documents + for chunk in self._chunks_in_document(document) + ] + else: + chunks = documents + + texts = [chunk.text for chunk in chunks] + logger.debug(f"Embedding {len(texts)} chunks") + embeddings = self.embedding_model.get_text_embedding_batch(texts) + + for chunk, embedding in zip(chunks, embeddings): + chunk.embedding = embedding + + logger.debug(f"Adding {len(chunks)} chunks to vector store") + chunks_vector_store = self.chunks_vector_store.access_vector_store() + + # We have to explicitly convert here even though the types are compatible (TextNode inherits from BaseNode) + # because the "add" annotation uses List instead of Sequence. We need to use TextNode explicitly because + # we're capturing "text". + converted_chunks: List[BaseNode] = [chunk for chunk in chunks] + chunks_vector_store.add(converted_chunks) + + logger.debug(f"Indexing file: {file_path} completed") + + def _documents_in_file( + self, reader: BaseReader, file_path: Path, file_id: str + ) -> List[Document]: + documents = reader.load_data(file_path) + + for i, document in enumerate(documents): + # Update the document metadata + document.id_ = file_id + document.metadata["file_name"] = os.path.basename(file_path) + document.metadata["document_id"] = file_id + document.metadata["document_part_number"] = i + document.metadata["data_source_id"] = self.data_source_id + + return documents + + def _chunks_in_document(self, document: Document) -> List[TextNode]: + chunks = self.splitter.get_nodes_from_documents([document]) + + for j, chunk in enumerate(chunks): + chunk.metadata["file_name"] = document.metadata["file_name"] + chunk.metadata["document_id"] = document.metadata["document_id"] + chunk.metadata["document_part_number"] = document.metadata[ + "document_part_number" + ] + chunk.metadata["chunk_number"] = j + chunk.metadata["data_source_id"] = document.metadata["data_source_id"] + + converted_chunks: List[TextNode] = [] + for chunk in chunks: + assert isinstance(chunk, TextNode) + converted_chunks.append(chunk) + + return converted_chunks diff --git a/llm-service/app/ai/indexing/readers/__init__.py b/llm-service/app/ai/indexing/readers/__init__.py new file mode 100644 index 000000000..e2b4ac6c2 --- /dev/null +++ b/llm-service/app/ai/indexing/readers/__init__.py @@ -0,0 +1,37 @@ +# +# CLOUDERA APPLIED MACHINE LEARNING PROTOTYPE (AMP) +# (C) Cloudera, Inc. 2024 +# All rights reserved. +# +# Applicable Open Source License: Apache 2.0 +# +# NOTE: Cloudera open source products are modular software products +# made up of hundreds of individual components, each of which was +# individually copyrighted. Each Cloudera open source product is a +# collective work under U.S. Copyright Law. Your license to use the +# collective work is as provided in your written agreement with +# Cloudera. Used apart from the collective work, this file is +# licensed for your use pursuant to the open source license +# identified above. +# +# This code is provided to you pursuant a written agreement with +# (i) Cloudera, Inc. or (ii) a third-party authorized to distribute +# this code. If you do not have a written agreement with Cloudera nor +# with an authorized and properly licensed third party, you do not +# have any rights to access nor to use this code. +# +# Absent a written agreement with Cloudera, Inc. ("Cloudera") to the +# contrary, A) CLOUDERA PROVIDES THIS CODE TO YOU WITHOUT WARRANTIES OF ANY +# KIND; (B) CLOUDERA DISCLAIMS ANY AND ALL EXPRESS AND IMPLIED +# WARRANTIES WITH RESPECT TO THIS CODE, INCLUDING BUT NOT LIMITED TO +# IMPLIED WARRANTIES OF TITLE, NON-INFRINGEMENT, MERCHANTABILITY AND +# FITNESS FOR A PARTICULAR PURPOSE; (C) CLOUDERA IS NOT LIABLE TO YOU, +# AND WILL NOT DEFEND, INDEMNIFY, NOR HOLD YOU HARMLESS FOR ANY CLAIMS +# ARISING FROM OR RELATED TO THE CODE; AND (D)WITH RESPECT TO YOUR EXERCISE +# OF ANY RIGHTS GRANTED TO YOU FOR THE CODE, CLOUDERA IS NOT LIABLE FOR ANY +# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, PUNITIVE OR +# CONSEQUENTIAL DAMAGES INCLUDING, BUT NOT LIMITED TO, DAMAGES +# RELATED TO LOST REVENUE, LOST PROFITS, LOSS OF INCOME, LOSS OF +# BUSINESS ADVANTAGE OR UNAVAILABILITY, OR LOSS OR CORRUPTION OF +# DATA. +# diff --git a/llm-service/app/ai/indexing/readers/nop.py b/llm-service/app/ai/indexing/readers/nop.py new file mode 100644 index 000000000..3320e79fa --- /dev/null +++ b/llm-service/app/ai/indexing/readers/nop.py @@ -0,0 +1,47 @@ +# +# CLOUDERA APPLIED MACHINE LEARNING PROTOTYPE (AMP) +# (C) Cloudera, Inc. 2024 +# All rights reserved. +# +# Applicable Open Source License: Apache 2.0 +# +# NOTE: Cloudera open source products are modular software products +# made up of hundreds of individual components, each of which was +# individually copyrighted. Each Cloudera open source product is a +# collective work under U.S. Copyright Law. Your license to use the +# collective work is as provided in your written agreement with +# Cloudera. Used apart from the collective work, this file is +# licensed for your use pursuant to the open source license +# identified above. +# +# This code is provided to you pursuant a written agreement with +# (i) Cloudera, Inc. or (ii) a third-party authorized to distribute +# this code. If you do not have a written agreement with Cloudera nor +# with an authorized and properly licensed third party, you do not +# have any rights to access nor to use this code. +# +# Absent a written agreement with Cloudera, Inc. ("Cloudera") to the +# contrary, A) CLOUDERA PROVIDES THIS CODE TO YOU WITHOUT WARRANTIES OF ANY +# KIND; (B) CLOUDERA DISCLAIMS ANY AND ALL EXPRESS AND IMPLIED +# WARRANTIES WITH RESPECT TO THIS CODE, INCLUDING BUT NOT LIMITED TO +# IMPLIED WARRANTIES OF TITLE, NON-INFRINGEMENT, MERCHANTABILITY AND +# FITNESS FOR A PARTICULAR PURPOSE; (C) CLOUDERA IS NOT LIABLE TO YOU, +# AND WILL NOT DEFEND, INDEMNIFY, NOR HOLD YOU HARMLESS FOR ANY CLAIMS +# ARISING FROM OR RELATED TO THE CODE; AND (D)WITH RESPECT TO YOUR EXERCISE +# OF ANY RIGHTS GRANTED TO YOU FOR THE CODE, CLOUDERA IS NOT LIABLE FOR ANY +# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, PUNITIVE OR +# CONSEQUENTIAL DAMAGES INCLUDING, BUT NOT LIMITED TO, DAMAGES +# RELATED TO LOST REVENUE, LOST PROFITS, LOSS OF INCOME, LOSS OF +# BUSINESS ADVANTAGE OR UNAVAILABILITY, OR LOSS OR CORRUPTION OF +# DATA. +# + +from typing import List +from llama_index.core.readers.base import BaseReader +from llama_index.core.schema import Document + + +class NopReader(BaseReader): + def load_data(self, file_path: str) -> List[Document]: + with open(file_path, "r") as f: + return [Document(text=f.read())] diff --git a/llm-service/app/ai/indexing/readers/pdf.py b/llm-service/app/ai/indexing/readers/pdf.py new file mode 100644 index 000000000..697721417 --- /dev/null +++ b/llm-service/app/ai/indexing/readers/pdf.py @@ -0,0 +1,52 @@ +# +# CLOUDERA APPLIED MACHINE LEARNING PROTOTYPE (AMP) +# (C) Cloudera, Inc. 2024 +# All rights reserved. +# +# Applicable Open Source License: Apache 2.0 +# +# NOTE: Cloudera open source products are modular software products +# made up of hundreds of individual components, each of which was +# individually copyrighted. Each Cloudera open source product is a +# collective work under U.S. Copyright Law. Your license to use the +# collective work is as provided in your written agreement with +# Cloudera. Used apart from the collective work, this file is +# licensed for your use pursuant to the open source license +# identified above. +# +# This code is provided to you pursuant a written agreement with +# (i) Cloudera, Inc. or (ii) a third-party authorized to distribute +# this code. If you do not have a written agreement with Cloudera nor +# with an authorized and properly licensed third party, you do not +# have any rights to access nor to use this code. +# +# Absent a written agreement with Cloudera, Inc. ("Cloudera") to the +# contrary, A) CLOUDERA PROVIDES THIS CODE TO YOU WITHOUT WARRANTIES OF ANY +# KIND; (B) CLOUDERA DISCLAIMS ANY AND ALL EXPRESS AND IMPLIED +# WARRANTIES WITH RESPECT TO THIS CODE, INCLUDING BUT NOT LIMITED TO +# IMPLIED WARRANTIES OF TITLE, NON-INFRINGEMENT, MERCHANTABILITY AND +# FITNESS FOR A PARTICULAR PURPOSE; (C) CLOUDERA IS NOT LIABLE TO YOU, +# AND WILL NOT DEFEND, INDEMNIFY, NOR HOLD YOU HARMLESS FOR ANY CLAIMS +# ARISING FROM OR RELATED TO THE CODE; AND (D)WITH RESPECT TO YOUR EXERCISE +# OF ANY RIGHTS GRANTED TO YOU FOR THE CODE, CLOUDERA IS NOT LIABLE FOR ANY +# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, PUNITIVE OR +# CONSEQUENTIAL DAMAGES INCLUDING, BUT NOT LIMITED TO, DAMAGES +# RELATED TO LOST REVENUE, LOST PROFITS, LOSS OF INCOME, LOSS OF +# BUSINESS ADVANTAGE OR UNAVAILABILITY, OR LOSS OR CORRUPTION OF +# DATA. +# + +from pathlib import Path +from typing import List + +from llama_index.core.readers.base import BaseReader +from llama_index.core.schema import Document +from llama_index.readers.file import PDFReader as LlamaIndexPDFReader + + +class PDFReader(BaseReader): + def __init__(self) -> None: + self.inner = LlamaIndexPDFReader(return_full_document=True) + + def load_data(self, file_path: Path) -> List[Document]: + return self.inner.load_data(file_path) diff --git a/llm-service/app/routers/index/data_source/__init__.py b/llm-service/app/routers/index/data_source/__init__.py index a3d544bf3..dedf8c5d1 100644 --- a/llm-service/app/routers/index/data_source/__init__.py +++ b/llm-service/app/routers/index/data_source/__init__.py @@ -28,15 +28,18 @@ # DATA. # ############################################################################## -import http import logging +import os import tempfile +from pathlib import Path from fastapi import APIRouter +from llama_index.core.node_parser import SentenceSplitter from pydantic import BaseModel from .... import exceptions -from ....services import doc_summaries, qdrant, s3 +from ....ai.indexing.index import Indexer +from ....services import doc_summaries, models, qdrant, rag_vector_store, s3 logger = logging.getLogger(__name__) @@ -100,12 +103,17 @@ def delete_document(data_source_id: int, doc_id: str) -> None: doc_summaries.delete_document(data_source_id, doc_id) +class RagIndexDocumentConfiguration(BaseModel): + # TODO: Add more params + chunk_size: int = 512 # this is llama-index's default + chunk_overlap: int = 10 # percentage of tokens in a chunk (chunk_size) + + class RagIndexDocumentRequest(BaseModel): + document_id: str s3_bucket_name: str s3_document_key: str - configuration: qdrant.RagIndexDocumentConfiguration = ( - qdrant.RagIndexDocumentConfiguration() - ) + configuration: RagIndexDocumentConfiguration = RagIndexDocumentConfiguration() @router.post( @@ -117,11 +125,29 @@ class RagIndexDocumentRequest(BaseModel): def download_and_index( data_source_id: int, request: RagIndexDocumentRequest, -) -> str: +) -> None: with tempfile.TemporaryDirectory() as tmpdirname: logger.debug("created temporary directory %s", tmpdirname) s3.download(tmpdirname, request.s3_bucket_name, request.s3_document_key) - qdrant.download_and_index( - tmpdirname, data_source_id, request.configuration, request.s3_document_key + # Get the single file in the directory + files = os.listdir(tmpdirname) + if len(files) != 1: + raise ValueError("Expected a single file in the temporary directory") + file_path = Path(os.path.join(tmpdirname, files[0])) + + indexer = Indexer( + data_source_id, + splitter=SentenceSplitter( + chunk_size=request.configuration.chunk_size, + chunk_overlap=int( + request.configuration.chunk_overlap + * 0.01 + * request.configuration.chunk_size + ), + ), + embedding_model=models.get_embedding_model(), + chunks_vector_store=rag_vector_store.create_rag_vector_store( + data_source_id + ), ) - return http.HTTPStatus.OK.phrase + indexer.index_file(file_path, request.document_id) diff --git a/llm-service/app/services/qdrant.py b/llm-service/app/services/qdrant.py index efeeff25d..314264359 100644 --- a/llm-service/app/services/qdrant.py +++ b/llm-service/app/services/qdrant.py @@ -44,77 +44,16 @@ from llama_index.core.chat_engine.types import AgentChatResponse from llama_index.core.indices import VectorStoreIndex from llama_index.core.indices.vector_store import VectorIndexRetriever -from llama_index.core.node_parser import SentenceSplitter from llama_index.core.query_engine import RetrieverQueryEngine -from llama_index.core.readers import SimpleDirectoryReader from llama_index.core.response_synthesizers import get_response_synthesizer -from llama_index.core.storage import StorageContext -from pydantic import BaseModel from ..rag_types import RagPredictConfiguration from . import models, rag_vector_store from .chat_store import RagContext -from .utils import get_last_segment logger = logging.getLogger(__name__) -class RagIndexDocumentConfiguration(BaseModel): - # TODO: Add more params - chunk_size: int = 512 # this is llama-index's default - chunk_overlap: int = 10 # percentage of tokens in a chunk (chunk_size) - - -def download_and_index( - tmpdirname: str, - data_source_id: int, - configuration: RagIndexDocumentConfiguration, - s3_document_key: str, -) -> None: - try: - documents = SimpleDirectoryReader(tmpdirname).load_data() - document_id = get_last_segment(s3_document_key) - for document in documents: - document.id_ = document_id # this is a terrible way to assign the doc id... - document.metadata["document_id"] = document_id - except Exception as e: - logger.error( - "error loading document from temporary directory %s", - tmpdirname, - ) - raise HTTPException( - status_code=422, - detail=f"error loading document from temporary directory {tmpdirname}", - ) from e - - logger.info("instantiating vector store") - vector_store = rag_vector_store.create_rag_vector_store( - data_source_id - ).access_vector_store() - logger.info("instantiated vector store") - - storage_context = StorageContext.from_defaults(vector_store=vector_store) - - chunk_overlap_tokens = int( - configuration.chunk_overlap * 0.01 * configuration.chunk_size - ) - - logger.info("indexing document") - VectorStoreIndex.from_documents( - documents, - storage_context=storage_context, - embed_model=models.get_embedding_model(), - show_progress=False, - transformations=[ - SentenceSplitter( - chunk_size=configuration.chunk_size, - chunk_overlap=chunk_overlap_tokens, - ), - ], - ) - logger.info("indexed document") - - def check_data_source_exists(data_source_size: int) -> None: if data_source_size == -1: raise HTTPException(status_code=404, detail="Knowledge base not found.") diff --git a/llm-service/app/tests/conftest.py b/llm-service/app/tests/conftest.py index 5c6201fa6..092df5048 100644 --- a/llm-service/app/tests/conftest.py +++ b/llm-service/app/tests/conftest.py @@ -65,7 +65,7 @@ from app.main import app from app.services import models, rag_vector_store from app.services.rag_qdrant_vector_store import RagQdrantVectorStore - +from app.services.utils import get_last_segment @dataclass class BotoObject: @@ -110,6 +110,7 @@ def index_document_request_body( data_source_id: int, s3_object: BotoObject ) -> Dict[str, Any]: return { + "document_id": get_last_segment(s3_object.key), "data_source_id": data_source_id, "s3_bucket_name": s3_object.bucket_name, "s3_document_key": s3_object.key, diff --git a/llm-service/pdm.lock b/llm-service/pdm.lock index 3a3ec66ed..cddb261f3 100644 --- a/llm-service/pdm.lock +++ b/llm-service/pdm.lock @@ -5,7 +5,7 @@ groups = ["default", "dev"] strategy = ["inherit_metadata"] lock_version = "4.5.0" -content_hash = "sha256:cf38cbf44250032e4b248c90dbc34037bb94662cdfe512d50fcb3e271309fd84" +content_hash = "sha256:2a9b3e86ee90d639241d72fdeee20e779c2e7c42e90ab2e43e335c18454e0858" [[metadata.targets]] requires_python = "==3.10.*" @@ -382,6 +382,15 @@ files = [ {file = "dnspython-2.7.0.tar.gz", hash = "sha256:ce9c432eda0dc91cf618a5cedf1a4e142651196bbcd2c80e89ed5a907e5cfaf1"}, ] +[[package]] +name = "docx2txt" +version = "0.8" +summary = "A pure python-based utility to extract text and images from docx files." +groups = ["default"] +files = [ + {file = "docx2txt-0.8.tar.gz", hash = "sha256:2c06d98d7cfe2d3947e5760a57d924e3ff07745b379c8737723922e7009236e5"}, +] + [[package]] name = "email-validator" version = "2.2.0" diff --git a/llm-service/pyproject.toml b/llm-service/pyproject.toml index 830db6091..d22188469 100644 --- a/llm-service/pyproject.toml +++ b/llm-service/pyproject.toml @@ -5,7 +5,7 @@ description = "Default template for PDM package" authors = [ {name = "Conrado Silva Miranda", email = "csilvamiranda@cloudera.com"}, ] -dependencies = ["llama-index-core==0.10.68", "llama-index-readers-file==0.1.33", "fastapi==0.111.0", "pydantic==2.8.2", "pydantic-settings==2.3.4", "boto3>=1.35.66", "llama-index-embeddings-bedrock==0.2.1", "llama-index-llms-bedrock==0.1.13", "llama-index-llms-openai==0.1.31", "llama-index-llms-mistralai==0.1.20", "llama-index-embeddings-openai==0.1.11", "llama-index-vector-stores-qdrant==0.2.17"] +dependencies = ["llama-index-core==0.10.68", "llama-index-readers-file==0.1.33", "fastapi==0.111.0", "pydantic==2.8.2", "pydantic-settings==2.3.4", "boto3>=1.35.66", "llama-index-embeddings-bedrock==0.2.1", "llama-index-llms-bedrock==0.1.13", "llama-index-llms-openai==0.1.31", "llama-index-llms-mistralai==0.1.20", "llama-index-embeddings-openai==0.1.11", "llama-index-vector-stores-qdrant==0.2.17", "docx2txt>=0.8"] requires-python = "==3.10.*" readme = "README.md" license = {text = "APACHE"}