From cb095310598b92c9a755358c83352b68085fedb6 Mon Sep 17 00:00:00 2001 From: Conrado Silva Miranda Date: Wed, 20 Nov 2024 13:24:26 -0800 Subject: [PATCH 01/10] Provide Indexer to index files --- .../cai/rag/external/RagBackendClient.java | 36 ++-- .../rag/external/RagBackendClientTest.java | 190 +++++++++--------- llm-service/app/ai/__init__.py | 37 ++++ llm-service/app/ai/indexing/__init__.py | 37 ++++ llm-service/app/ai/indexing/index.py | 124 ++++++++++++ .../app/ai/indexing/readers/__init__.py | 37 ++++ llm-service/app/ai/indexing/readers/pdf.py | 50 +++++ .../app/routers/index/data_source/__init__.py | 39 +++- llm-service/app/services/qdrant.py | 55 ----- 9 files changed, 433 insertions(+), 172 deletions(-) create mode 100644 llm-service/app/ai/__init__.py create mode 100644 llm-service/app/ai/indexing/__init__.py create mode 100644 llm-service/app/ai/indexing/index.py create mode 100644 llm-service/app/ai/indexing/readers/__init__.py create mode 100644 llm-service/app/ai/indexing/readers/pdf.py diff --git a/backend/src/main/java/com/cloudera/cai/rag/external/RagBackendClient.java b/backend/src/main/java/com/cloudera/cai/rag/external/RagBackendClient.java index cbdc33d38..427fd3b27 100644 --- a/backend/src/main/java/com/cloudera/cai/rag/external/RagBackendClient.java +++ b/backend/src/main/java/com/cloudera/cai/rag/external/RagBackendClient.java @@ -38,15 +38,18 @@ package com.cloudera.cai.rag.external; +import java.io.IOException; + +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.stereotype.Component; + import com.cloudera.cai.rag.Types; import com.cloudera.cai.rag.configuration.AppConfiguration; import com.cloudera.cai.util.SimpleHttpClient; import com.cloudera.cai.util.Tracker; import com.fasterxml.jackson.annotation.JsonProperty; + import io.opentelemetry.instrumentation.annotations.WithSpan; -import java.io.IOException; -import org.springframework.beans.factory.annotation.Autowired; -import org.springframework.stereotype.Component; @Component public class RagBackendClient { @@ -68,7 +71,7 @@ public void indexFile( + "/data_sources/" + ragDocument.dataSourceId() + "/documents/download-and-index", - new IndexRequest(bucketName, ragDocument.s3Path(), configuration)); + new IndexRequest(ragDocument.documentId(), bucketName, ragDocument.s3Path(), configuration)); } catch (IOException e) { throw new RuntimeException(e); } @@ -97,17 +100,21 @@ public void deleteSession(Long sessionId) { } record IndexRequest( + @JsonProperty("document_id") String documentId, @JsonProperty("s3_bucket_name") String s3BucketName, @JsonProperty("s3_document_key") String s3DocumentKey, - IndexConfiguration configuration) {} + IndexConfiguration configuration) { + } public record SummaryRequest( @JsonProperty("s3_bucket_name") String s3BucketName, - @JsonProperty("s3_document_key") String s3DocumentKey) {} + @JsonProperty("s3_document_key") String s3DocumentKey) { + } public record IndexConfiguration( @JsonProperty("chunk_size") int chunkSize, - @JsonProperty("chunk_overlap") int chunkOverlapPercentage) {} + @JsonProperty("chunk_overlap") int chunkOverlapPercentage) { + } // nullables below here @@ -171,13 +178,18 @@ public void deleteDocument(long dataSourceId, String documentId) { } public record TrackedIndexRequest( - String bucketName, String s3Path, long dataSourceId, IndexConfiguration configuration) {} + String bucketName, String s3Path, long dataSourceId, IndexConfiguration configuration) { + } - public record TrackedDeleteSessionRequest(Long sessionId) {} + 
public record TrackedDeleteSessionRequest(Long sessionId) { + } - public record TrackedDeleteDataSourceRequest(long dataSourceId) {} + public record TrackedDeleteDataSourceRequest(long dataSourceId) { + } - public record TrackedRequest(T detail) {} + public record TrackedRequest(T detail) { + } - public record TrackedDeleteDocumentRequest(long dataSourceId, String documentId) {} + public record TrackedDeleteDocumentRequest(long dataSourceId, String documentId) { + } } diff --git a/backend/src/test/java/com/cloudera/cai/rag/external/RagBackendClientTest.java b/backend/src/test/java/com/cloudera/cai/rag/external/RagBackendClientTest.java index 60f6393ff..b3d07f85f 100644 --- a/backend/src/test/java/com/cloudera/cai/rag/external/RagBackendClientTest.java +++ b/backend/src/test/java/com/cloudera/cai/rag/external/RagBackendClientTest.java @@ -41,106 +41,108 @@ import static org.assertj.core.api.Assertions.assertThat; import static org.assertj.core.api.Assertions.assertThatThrownBy; +import java.util.List; + +import org.junit.jupiter.api.Test; +import org.springframework.http.HttpMethod; + import com.cloudera.cai.rag.Types.RagDocument; import com.cloudera.cai.rag.external.RagBackendClient.IndexConfiguration; import com.cloudera.cai.util.SimpleHttpClient; import com.cloudera.cai.util.SimpleHttpClient.TrackedHttpRequest; import com.cloudera.cai.util.Tracker; import com.cloudera.cai.util.exceptions.NotFound; -import java.util.List; -import org.junit.jupiter.api.Test; -import org.springframework.http.HttpMethod; class RagBackendClientTest { - @Test - void indexFile() { - Tracker> tracker = new Tracker<>(); - RagBackendClient client = new RagBackendClient(SimpleHttpClient.createNull(tracker)); - IndexConfiguration indexConfiguration = new IndexConfiguration(123, 2); - RagDocument document = indexRequest("s3Path", 1234L); - - client.indexFile(document, "bucketName", indexConfiguration); - - List> values = tracker.getValues(); - assertThat(values) - .hasSize(1) - .contains( - new TrackedHttpRequest<>( - HttpMethod.POST, - "http://rag-backend:8000/data_sources/" + 1234L + "/documents/download-and-index", - new RagBackendClient.IndexRequest("bucketName", "s3Path", indexConfiguration))); - } - - @Test - void createSummary() { - Tracker> tracker = new Tracker<>(); - RagBackendClient client = new RagBackendClient(SimpleHttpClient.createNull(tracker)); - RagDocument document = indexRequest("s3Path", 1234L); - - client.createSummary(document, "bucketName"); - - List> values = tracker.getValues(); - assertThat(values) - .hasSize(1) - .contains( - new TrackedHttpRequest<>( - HttpMethod.POST, - "http://rag-backend:8000/data_sources/1234/summarize-document", - new RagBackendClient.SummaryRequest("bucketName", "s3Path"))); - } - - @Test - void deleteDataSource() { - Tracker> tracker = new Tracker<>(); - RagBackendClient client = new RagBackendClient(SimpleHttpClient.createNull(tracker)); - client.deleteDataSource(1234L); - List> values = tracker.getValues(); - assertThat(values) - .hasSize(1) - .contains( - new TrackedHttpRequest<>( - HttpMethod.DELETE, "http://rag-backend:8000/data_sources/1234", null)); - } - - @Test - void deleteDocument() { - Tracker> tracker = new Tracker<>(); - RagBackendClient client = new RagBackendClient(SimpleHttpClient.createNull(tracker)); - client.deleteDocument(1234L, "documentId"); - List> values = tracker.getValues(); - assertThat(values) - .hasSize(1) - .contains( - new TrackedHttpRequest<>( - HttpMethod.DELETE, - "http://rag-backend:8000/data_sources/1234/documents/documentId", - 
null)); - } - - @Test - void deleteSession() { - Tracker> tracker = new Tracker<>(); - RagBackendClient client = new RagBackendClient(SimpleHttpClient.createNull(tracker)); - client.deleteSession(1234L); - List> values = tracker.getValues(); - assertThat(values) - .hasSize(1) - .contains( - new TrackedHttpRequest<>( - HttpMethod.DELETE, "http://rag-backend:8000/sessions/1234", null)); - } - - @Test - void null_handlesThrowable() { - RagBackendClient client = - RagBackendClient.createNull(new Tracker<>(), new NotFound("not found")); - RagDocument document = indexRequest("s3Path", 1234L); - assertThatThrownBy(() -> client.indexFile(document, "fakeit", null)) - .isInstanceOf(NotFound.class); - } - - private static RagDocument indexRequest(String s3Path, Long dataSourceId) { - return new RagDocument( - null, null, dataSourceId, null, s3Path, null, null, null, null, null, null, null, null); - } + @Test + void indexFile() { + Tracker> tracker = new Tracker<>(); + RagBackendClient client = new RagBackendClient(SimpleHttpClient.createNull(tracker)); + IndexConfiguration indexConfiguration = new IndexConfiguration(123, 2); + RagDocument document = indexRequest("s3Path", 1234L); + + client.indexFile(document, "bucketName", indexConfiguration); + + List> values = tracker.getValues(); + assertThat(values) + .hasSize(1) + .contains( + new TrackedHttpRequest<>( + HttpMethod.POST, + "http://rag-backend:8000/data_sources/" + 1234L + "/documents/download-and-index", + new RagBackendClient.IndexRequest("documentId", "bucketName", "s3Path", + indexConfiguration))); + } + + @Test + void createSummary() { + Tracker> tracker = new Tracker<>(); + RagBackendClient client = new RagBackendClient(SimpleHttpClient.createNull(tracker)); + RagDocument document = indexRequest("s3Path", 1234L); + + client.createSummary(document, "bucketName"); + + List> values = tracker.getValues(); + assertThat(values) + .hasSize(1) + .contains( + new TrackedHttpRequest<>( + HttpMethod.POST, + "http://rag-backend:8000/data_sources/1234/summarize-document", + new RagBackendClient.SummaryRequest("bucketName", "s3Path"))); + } + + @Test + void deleteDataSource() { + Tracker> tracker = new Tracker<>(); + RagBackendClient client = new RagBackendClient(SimpleHttpClient.createNull(tracker)); + client.deleteDataSource(1234L); + List> values = tracker.getValues(); + assertThat(values) + .hasSize(1) + .contains( + new TrackedHttpRequest<>( + HttpMethod.DELETE, "http://rag-backend:8000/data_sources/1234", null)); + } + + @Test + void deleteDocument() { + Tracker> tracker = new Tracker<>(); + RagBackendClient client = new RagBackendClient(SimpleHttpClient.createNull(tracker)); + client.deleteDocument(1234L, "documentId"); + List> values = tracker.getValues(); + assertThat(values) + .hasSize(1) + .contains( + new TrackedHttpRequest<>( + HttpMethod.DELETE, + "http://rag-backend:8000/data_sources/1234/documents/documentId", + null)); + } + + @Test + void deleteSession() { + Tracker> tracker = new Tracker<>(); + RagBackendClient client = new RagBackendClient(SimpleHttpClient.createNull(tracker)); + client.deleteSession(1234L); + List> values = tracker.getValues(); + assertThat(values) + .hasSize(1) + .contains( + new TrackedHttpRequest<>( + HttpMethod.DELETE, "http://rag-backend:8000/sessions/1234", null)); + } + + @Test + void null_handlesThrowable() { + RagBackendClient client = RagBackendClient.createNull(new Tracker<>(), new NotFound("not found")); + RagDocument document = indexRequest("s3Path", 1234L); + assertThatThrownBy(() -> 
client.indexFile(document, "fakeit", null)) + .isInstanceOf(NotFound.class); + } + + private static RagDocument indexRequest(String s3Path, Long dataSourceId) { + return new RagDocument( + null, null, dataSourceId, null, s3Path, null, null, null, null, null, null, null, null); + } } diff --git a/llm-service/app/ai/__init__.py b/llm-service/app/ai/__init__.py new file mode 100644 index 000000000..e2b4ac6c2 --- /dev/null +++ b/llm-service/app/ai/__init__.py @@ -0,0 +1,37 @@ +# +# CLOUDERA APPLIED MACHINE LEARNING PROTOTYPE (AMP) +# (C) Cloudera, Inc. 2024 +# All rights reserved. +# +# Applicable Open Source License: Apache 2.0 +# +# NOTE: Cloudera open source products are modular software products +# made up of hundreds of individual components, each of which was +# individually copyrighted. Each Cloudera open source product is a +# collective work under U.S. Copyright Law. Your license to use the +# collective work is as provided in your written agreement with +# Cloudera. Used apart from the collective work, this file is +# licensed for your use pursuant to the open source license +# identified above. +# +# This code is provided to you pursuant a written agreement with +# (i) Cloudera, Inc. or (ii) a third-party authorized to distribute +# this code. If you do not have a written agreement with Cloudera nor +# with an authorized and properly licensed third party, you do not +# have any rights to access nor to use this code. +# +# Absent a written agreement with Cloudera, Inc. ("Cloudera") to the +# contrary, A) CLOUDERA PROVIDES THIS CODE TO YOU WITHOUT WARRANTIES OF ANY +# KIND; (B) CLOUDERA DISCLAIMS ANY AND ALL EXPRESS AND IMPLIED +# WARRANTIES WITH RESPECT TO THIS CODE, INCLUDING BUT NOT LIMITED TO +# IMPLIED WARRANTIES OF TITLE, NON-INFRINGEMENT, MERCHANTABILITY AND +# FITNESS FOR A PARTICULAR PURPOSE; (C) CLOUDERA IS NOT LIABLE TO YOU, +# AND WILL NOT DEFEND, INDEMNIFY, NOR HOLD YOU HARMLESS FOR ANY CLAIMS +# ARISING FROM OR RELATED TO THE CODE; AND (D)WITH RESPECT TO YOUR EXERCISE +# OF ANY RIGHTS GRANTED TO YOU FOR THE CODE, CLOUDERA IS NOT LIABLE FOR ANY +# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, PUNITIVE OR +# CONSEQUENTIAL DAMAGES INCLUDING, BUT NOT LIMITED TO, DAMAGES +# RELATED TO LOST REVENUE, LOST PROFITS, LOSS OF INCOME, LOSS OF +# BUSINESS ADVANTAGE OR UNAVAILABILITY, OR LOSS OR CORRUPTION OF +# DATA. +# diff --git a/llm-service/app/ai/indexing/__init__.py b/llm-service/app/ai/indexing/__init__.py new file mode 100644 index 000000000..e2b4ac6c2 --- /dev/null +++ b/llm-service/app/ai/indexing/__init__.py @@ -0,0 +1,37 @@ +# +# CLOUDERA APPLIED MACHINE LEARNING PROTOTYPE (AMP) +# (C) Cloudera, Inc. 2024 +# All rights reserved. +# +# Applicable Open Source License: Apache 2.0 +# +# NOTE: Cloudera open source products are modular software products +# made up of hundreds of individual components, each of which was +# individually copyrighted. Each Cloudera open source product is a +# collective work under U.S. Copyright Law. Your license to use the +# collective work is as provided in your written agreement with +# Cloudera. Used apart from the collective work, this file is +# licensed for your use pursuant to the open source license +# identified above. +# +# This code is provided to you pursuant a written agreement with +# (i) Cloudera, Inc. or (ii) a third-party authorized to distribute +# this code. 
If you do not have a written agreement with Cloudera nor +# with an authorized and properly licensed third party, you do not +# have any rights to access nor to use this code. +# +# Absent a written agreement with Cloudera, Inc. ("Cloudera") to the +# contrary, A) CLOUDERA PROVIDES THIS CODE TO YOU WITHOUT WARRANTIES OF ANY +# KIND; (B) CLOUDERA DISCLAIMS ANY AND ALL EXPRESS AND IMPLIED +# WARRANTIES WITH RESPECT TO THIS CODE, INCLUDING BUT NOT LIMITED TO +# IMPLIED WARRANTIES OF TITLE, NON-INFRINGEMENT, MERCHANTABILITY AND +# FITNESS FOR A PARTICULAR PURPOSE; (C) CLOUDERA IS NOT LIABLE TO YOU, +# AND WILL NOT DEFEND, INDEMNIFY, NOR HOLD YOU HARMLESS FOR ANY CLAIMS +# ARISING FROM OR RELATED TO THE CODE; AND (D)WITH RESPECT TO YOUR EXERCISE +# OF ANY RIGHTS GRANTED TO YOU FOR THE CODE, CLOUDERA IS NOT LIABLE FOR ANY +# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, PUNITIVE OR +# CONSEQUENTIAL DAMAGES INCLUDING, BUT NOT LIMITED TO, DAMAGES +# RELATED TO LOST REVENUE, LOST PROFITS, LOSS OF INCOME, LOSS OF +# BUSINESS ADVANTAGE OR UNAVAILABILITY, OR LOSS OR CORRUPTION OF +# DATA. +# diff --git a/llm-service/app/ai/indexing/index.py b/llm-service/app/ai/indexing/index.py new file mode 100644 index 000000000..8ac93877e --- /dev/null +++ b/llm-service/app/ai/indexing/index.py @@ -0,0 +1,124 @@ +# +# CLOUDERA APPLIED MACHINE LEARNING PROTOTYPE (AMP) +# (C) Cloudera, Inc. 2024 +# All rights reserved. +# +# Applicable Open Source License: Apache 2.0 +# +# NOTE: Cloudera open source products are modular software products +# made up of hundreds of individual components, each of which was +# individually copyrighted. Each Cloudera open source product is a +# collective work under U.S. Copyright Law. Your license to use the +# collective work is as provided in your written agreement with +# Cloudera. Used apart from the collective work, this file is +# licensed for your use pursuant to the open source license +# identified above. +# +# This code is provided to you pursuant a written agreement with +# (i) Cloudera, Inc. or (ii) a third-party authorized to distribute +# this code. If you do not have a written agreement with Cloudera nor +# with an authorized and properly licensed third party, you do not +# have any rights to access nor to use this code. +# +# Absent a written agreement with Cloudera, Inc. ("Cloudera") to the +# contrary, A) CLOUDERA PROVIDES THIS CODE TO YOU WITHOUT WARRANTIES OF ANY +# KIND; (B) CLOUDERA DISCLAIMS ANY AND ALL EXPRESS AND IMPLIED +# WARRANTIES WITH RESPECT TO THIS CODE, INCLUDING BUT NOT LIMITED TO +# IMPLIED WARRANTIES OF TITLE, NON-INFRINGEMENT, MERCHANTABILITY AND +# FITNESS FOR A PARTICULAR PURPOSE; (C) CLOUDERA IS NOT LIABLE TO YOU, +# AND WILL NOT DEFEND, INDEMNIFY, NOR HOLD YOU HARMLESS FOR ANY CLAIMS +# ARISING FROM OR RELATED TO THE CODE; AND (D)WITH RESPECT TO YOUR EXERCISE +# OF ANY RIGHTS GRANTED TO YOU FOR THE CODE, CLOUDERA IS NOT LIABLE FOR ANY +# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, PUNITIVE OR +# CONSEQUENTIAL DAMAGES INCLUDING, BUT NOT LIMITED TO, DAMAGES +# RELATED TO LOST REVENUE, LOST PROFITS, LOSS OF INCOME, LOSS OF +# BUSINESS ADVANTAGE OR UNAVAILABILITY, OR LOSS OR CORRUPTION OF +# DATA. 
+# + +from dataclasses import dataclass +import logging +import os +from typing import Dict, List, Type + +from .readers.pdf import PDFReader +from llama_index.core.readers.base import BaseReader +from llama_index.core.node_parser import SentenceSplitter +from llama_index.core.schema import Document +from llama_index.core.base.embeddings.base import BaseEmbedding +from app.services.vector_store import VectorStore +from llama_index.core.node_parser.interface import BaseNode + +logger = logging.getLogger(__name__) + +READERS: Dict[str, Type[BaseReader]] = { + ".pdf": PDFReader, +} +CHUNKABLE_FILE_EXTENSIONS = set( + [ + ".pdf", + ] +) + +@dataclass +class NotSupportedFileExtensionError(Exception): + file_extension: str + +class Indexer: + def __init__(self, data_source_id: int, splitter: SentenceSplitter, embedding_model: BaseEmbedding, chunks_vector_store: VectorStore): + self.data_source_id = data_source_id + self.splitter = splitter + self.embedding_model = embedding_model + self.chunks_vector_store = chunks_vector_store + + def index_file(self, file_path: str, file_id: str): + logger.debug(f"Indexing file: {file_path}") + + file_extension = os.path.splitext(file_path)[1] + reader_cls = READERS.get(file_extension) + if not reader_cls: + raise NotSupportedFileExtensionError(file_extension) + + reader = reader_cls() + + logger.debug(f"Parsing file: {file_path}") + + documents = self._documents_in_file(reader, file_path, file_id) + if file_extension in CHUNKABLE_FILE_EXTENSIONS: + logger.debug(f"Chunking file: {file_path}") + chunks = [chunk for document in documents for chunk in self._chunks_in_document(document)] + else: + chunks = documents + + texts = [chunk.text for chunk in chunks] + logger.debug(f"Embedding {len(texts)} chunks") + embeddings = self.embedding_model.get_text_embedding_batch(texts) + + for chunk, embedding in zip(chunks, embeddings): + chunk.embedding = embedding + + logger.debug(f"Adding {len(chunks)} chunks to vector store") + chunks_vector_store = self.chunks_vector_store.access_vector_store() + chunks_vector_store.add(chunks) + + def _documents_in_file(self, reader: BaseReader, file_path: str, file_id: str) -> List[Document]: + documents = reader.load_data(file_path) + + for i, document in enumerate(documents): + # Update the document metadata + document.metadata["file_id"] = file_id + document.metadata["document_part_number"] = i + document.metadata["data_source_id"] = self.data_source_id + + return documents + + def _chunks_in_document(self, document: Document) -> List[BaseNode]: + chunks = self.splitter.get_nodes_from_documents([document]) + + for j, chunk in enumerate(chunks): + chunk.metadata["file_id"] = document.metadata["file_id"] + chunk.metadata["document_part_number"] = document.metadata["document_part_number"] + chunk.metadata["chunk_number"] = j + chunk.metadata["data_source_id"] = document.metadata["data_source_id"] + + return chunks diff --git a/llm-service/app/ai/indexing/readers/__init__.py b/llm-service/app/ai/indexing/readers/__init__.py new file mode 100644 index 000000000..e2b4ac6c2 --- /dev/null +++ b/llm-service/app/ai/indexing/readers/__init__.py @@ -0,0 +1,37 @@ +# +# CLOUDERA APPLIED MACHINE LEARNING PROTOTYPE (AMP) +# (C) Cloudera, Inc. 2024 +# All rights reserved. +# +# Applicable Open Source License: Apache 2.0 +# +# NOTE: Cloudera open source products are modular software products +# made up of hundreds of individual components, each of which was +# individually copyrighted. 
Each Cloudera open source product is a +# collective work under U.S. Copyright Law. Your license to use the +# collective work is as provided in your written agreement with +# Cloudera. Used apart from the collective work, this file is +# licensed for your use pursuant to the open source license +# identified above. +# +# This code is provided to you pursuant a written agreement with +# (i) Cloudera, Inc. or (ii) a third-party authorized to distribute +# this code. If you do not have a written agreement with Cloudera nor +# with an authorized and properly licensed third party, you do not +# have any rights to access nor to use this code. +# +# Absent a written agreement with Cloudera, Inc. ("Cloudera") to the +# contrary, A) CLOUDERA PROVIDES THIS CODE TO YOU WITHOUT WARRANTIES OF ANY +# KIND; (B) CLOUDERA DISCLAIMS ANY AND ALL EXPRESS AND IMPLIED +# WARRANTIES WITH RESPECT TO THIS CODE, INCLUDING BUT NOT LIMITED TO +# IMPLIED WARRANTIES OF TITLE, NON-INFRINGEMENT, MERCHANTABILITY AND +# FITNESS FOR A PARTICULAR PURPOSE; (C) CLOUDERA IS NOT LIABLE TO YOU, +# AND WILL NOT DEFEND, INDEMNIFY, NOR HOLD YOU HARMLESS FOR ANY CLAIMS +# ARISING FROM OR RELATED TO THE CODE; AND (D)WITH RESPECT TO YOUR EXERCISE +# OF ANY RIGHTS GRANTED TO YOU FOR THE CODE, CLOUDERA IS NOT LIABLE FOR ANY +# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, PUNITIVE OR +# CONSEQUENTIAL DAMAGES INCLUDING, BUT NOT LIMITED TO, DAMAGES +# RELATED TO LOST REVENUE, LOST PROFITS, LOSS OF INCOME, LOSS OF +# BUSINESS ADVANTAGE OR UNAVAILABILITY, OR LOSS OR CORRUPTION OF +# DATA. +# diff --git a/llm-service/app/ai/indexing/readers/pdf.py b/llm-service/app/ai/indexing/readers/pdf.py new file mode 100644 index 000000000..dfb922617 --- /dev/null +++ b/llm-service/app/ai/indexing/readers/pdf.py @@ -0,0 +1,50 @@ +# +# CLOUDERA APPLIED MACHINE LEARNING PROTOTYPE (AMP) +# (C) Cloudera, Inc. 2024 +# All rights reserved. +# +# Applicable Open Source License: Apache 2.0 +# +# NOTE: Cloudera open source products are modular software products +# made up of hundreds of individual components, each of which was +# individually copyrighted. Each Cloudera open source product is a +# collective work under U.S. Copyright Law. Your license to use the +# collective work is as provided in your written agreement with +# Cloudera. Used apart from the collective work, this file is +# licensed for your use pursuant to the open source license +# identified above. +# +# This code is provided to you pursuant a written agreement with +# (i) Cloudera, Inc. or (ii) a third-party authorized to distribute +# this code. If you do not have a written agreement with Cloudera nor +# with an authorized and properly licensed third party, you do not +# have any rights to access nor to use this code. +# +# Absent a written agreement with Cloudera, Inc. 
("Cloudera") to the +# contrary, A) CLOUDERA PROVIDES THIS CODE TO YOU WITHOUT WARRANTIES OF ANY +# KIND; (B) CLOUDERA DISCLAIMS ANY AND ALL EXPRESS AND IMPLIED +# WARRANTIES WITH RESPECT TO THIS CODE, INCLUDING BUT NOT LIMITED TO +# IMPLIED WARRANTIES OF TITLE, NON-INFRINGEMENT, MERCHANTABILITY AND +# FITNESS FOR A PARTICULAR PURPOSE; (C) CLOUDERA IS NOT LIABLE TO YOU, +# AND WILL NOT DEFEND, INDEMNIFY, NOR HOLD YOU HARMLESS FOR ANY CLAIMS +# ARISING FROM OR RELATED TO THE CODE; AND (D)WITH RESPECT TO YOUR EXERCISE +# OF ANY RIGHTS GRANTED TO YOU FOR THE CODE, CLOUDERA IS NOT LIABLE FOR ANY +# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, PUNITIVE OR +# CONSEQUENTIAL DAMAGES INCLUDING, BUT NOT LIMITED TO, DAMAGES +# RELATED TO LOST REVENUE, LOST PROFITS, LOSS OF INCOME, LOSS OF +# BUSINESS ADVANTAGE OR UNAVAILABILITY, OR LOSS OR CORRUPTION OF +# DATA. +# + +from typing import List + +from llama_index.core.readers.base import BaseReader +from llama_index.core.schema import Document +from llama_index.readers.file import PDFReader as LlamaIndexPDFReader + +class PDFReader(BaseReader): + def __init__(self): + self.inner = LlamaIndexPDFReader(return_full_document=True) + + def load_data(self, file_path: str) -> List[Document]: + return self.inner.load_data(file_path) diff --git a/llm-service/app/routers/index/data_source/__init__.py b/llm-service/app/routers/index/data_source/__init__.py index aa7e133eb..f3584d1e8 100644 --- a/llm-service/app/routers/index/data_source/__init__.py +++ b/llm-service/app/routers/index/data_source/__init__.py @@ -30,6 +30,7 @@ import http import logging +import os import tempfile from fastapi import APIRouter @@ -37,6 +38,10 @@ from .... import exceptions from ....services import doc_summaries, qdrant, s3 +from app.ai.indexing.index import Indexer +from app.services.rag_vector_store import create_rag_vector_store +from app.services.models import get_embedding_model +from llama_index.core.node_parser import SentenceSplitter logger = logging.getLogger(__name__) @@ -100,15 +105,19 @@ def delete_document(data_source_id: int, doc_id: str) -> None: doc_summaries.delete_document(data_source_id, doc_id) +class RagIndexDocumentConfiguration(BaseModel): + # TODO: Add more params + chunk_size: int = 512 # this is llama-index's default + chunk_overlap: int = 10 # percentage of tokens in a chunk (chunk_size) class RagIndexDocumentRequest(BaseModel): + document_id: str s3_bucket_name: str s3_document_key: str - configuration: qdrant.RagIndexDocumentConfiguration = ( - qdrant.RagIndexDocumentConfiguration() + configuration: RagIndexDocumentConfiguration = ( + RagIndexDocumentConfiguration() ) - @router.post( "/documents/download-and-index", summary="Download and index document", @@ -118,15 +127,23 @@ class RagIndexDocumentRequest(BaseModel): def download_and_index( data_source_id: int, request: RagIndexDocumentRequest, -) -> str: +) -> None: with tempfile.TemporaryDirectory() as tmpdirname: logger.debug("created temporary directory %s", tmpdirname) s3.download(tmpdirname, request.s3_bucket_name, request.s3_document_key) - qdrant.download_and_index( - tmpdirname, - data_source_id, - request.configuration, - request.s3_document_key + # Get the single file in the directory + files = os.listdir(tmpdirname) + if len(files) != 1: + raise ValueError("Expected a single file in the temporary directory") + file_path = os.path.join(tmpdirname, files[0]) + + indexer = Indexer( + data_source_id, + splitter=SentenceSplitter( + chunk_size=request.configuration.chunk_size, + 
chunk_overlap=int(request.configuration.chunk_overlap * 0.01 * request.configuration.chunk_size), + ), + embedding_model=get_embedding_model(), + chunks_vector_store=create_rag_vector_store(data_source_id) ) - return http.HTTPStatus.OK.phrase - + indexer.index_file(file_path, request.document_id) diff --git a/llm-service/app/services/qdrant.py b/llm-service/app/services/qdrant.py index 11b3c0569..1aab2e1b3 100644 --- a/llm-service/app/services/qdrant.py +++ b/llm-service/app/services/qdrant.py @@ -59,61 +59,6 @@ logger = logging.getLogger(__name__) - -class RagIndexDocumentConfiguration(BaseModel): - # TODO: Add more params - chunk_size: int = 512 # this is llama-index's default - chunk_overlap: int = 10 # percentage of tokens in a chunk (chunk_size) - - -def download_and_index( - tmpdirname: str, - data_source_id: int, - configuration: RagIndexDocumentConfiguration, - s3_document_key: str, -): - try: - documents = SimpleDirectoryReader(tmpdirname).load_data() - document_id = get_last_segment(s3_document_key) - for document in documents: - document.id_ = document_id # this is a terrible way to assign the doc id... - document.metadata["document_id"] = document_id - except Exception as e: - logger.error( - "error loading document from temporary directory %s", - tmpdirname, - ) - raise HTTPException( - status_code=422, - detail=f"error loading document from temporary directory {tmpdirname}", - ) from e - - logger.info("instantiating vector store") - vector_store = rag_vector_store.create_rag_vector_store(data_source_id).access_vector_store() - logger.info("instantiated vector store") - - storage_context = StorageContext.from_defaults(vector_store=vector_store) - - chunk_overlap_tokens = int( - configuration.chunk_overlap * 0.01 * configuration.chunk_size - ) - - logger.info("indexing document") - VectorStoreIndex.from_documents( - documents, - storage_context=storage_context, - embed_model=models.get_embedding_model(), - show_progress=False, - transformations=[ - SentenceSplitter( - chunk_size=configuration.chunk_size, - chunk_overlap=chunk_overlap_tokens, - ), - ], - ) - logger.info("indexed document") - - def check_data_source_exists(data_source_size: int) -> None: if data_source_size == -1: raise HTTPException(status_code=404, detail="Knowledge base not found.") From 5aeed8447a9a1bde72066e8acdd0bd2593f4429c Mon Sep 17 00:00:00 2001 From: Conrado Silva Miranda Date: Wed, 20 Nov 2024 13:47:08 -0800 Subject: [PATCH 02/10] fix imports for local dev --- .../cai/rag/external/RagBackendClient.java | 36 ++-- .../rag/external/RagBackendClientTest.java | 191 +++++++++--------- llm-service/app/ai/indexing/index.py | 7 +- llm-service/app/ai/indexing/readers/nop.py | 46 +++++ .../app/routers/index/data_source/__init__.py | 6 +- llm-service/app/tests/conftest.py | 1 + 6 files changed, 164 insertions(+), 123 deletions(-) create mode 100644 llm-service/app/ai/indexing/readers/nop.py diff --git a/backend/src/main/java/com/cloudera/cai/rag/external/RagBackendClient.java b/backend/src/main/java/com/cloudera/cai/rag/external/RagBackendClient.java index 427fd3b27..b1e2cd797 100644 --- a/backend/src/main/java/com/cloudera/cai/rag/external/RagBackendClient.java +++ b/backend/src/main/java/com/cloudera/cai/rag/external/RagBackendClient.java @@ -38,18 +38,15 @@ package com.cloudera.cai.rag.external; -import java.io.IOException; - -import org.springframework.beans.factory.annotation.Autowired; -import org.springframework.stereotype.Component; - import com.cloudera.cai.rag.Types; import 
com.cloudera.cai.rag.configuration.AppConfiguration; import com.cloudera.cai.util.SimpleHttpClient; import com.cloudera.cai.util.Tracker; import com.fasterxml.jackson.annotation.JsonProperty; - import io.opentelemetry.instrumentation.annotations.WithSpan; +import java.io.IOException; +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.stereotype.Component; @Component public class RagBackendClient { @@ -71,7 +68,8 @@ public void indexFile( + "/data_sources/" + ragDocument.dataSourceId() + "/documents/download-and-index", - new IndexRequest(ragDocument.documentId(), bucketName, ragDocument.s3Path(), configuration)); + new IndexRequest( + ragDocument.documentId(), bucketName, ragDocument.s3Path(), configuration)); } catch (IOException e) { throw new RuntimeException(e); } @@ -103,18 +101,15 @@ record IndexRequest( @JsonProperty("document_id") String documentId, @JsonProperty("s3_bucket_name") String s3BucketName, @JsonProperty("s3_document_key") String s3DocumentKey, - IndexConfiguration configuration) { - } + IndexConfiguration configuration) {} public record SummaryRequest( @JsonProperty("s3_bucket_name") String s3BucketName, - @JsonProperty("s3_document_key") String s3DocumentKey) { - } + @JsonProperty("s3_document_key") String s3DocumentKey) {} public record IndexConfiguration( @JsonProperty("chunk_size") int chunkSize, - @JsonProperty("chunk_overlap") int chunkOverlapPercentage) { - } + @JsonProperty("chunk_overlap") int chunkOverlapPercentage) {} // nullables below here @@ -178,18 +173,13 @@ public void deleteDocument(long dataSourceId, String documentId) { } public record TrackedIndexRequest( - String bucketName, String s3Path, long dataSourceId, IndexConfiguration configuration) { - } + String bucketName, String s3Path, long dataSourceId, IndexConfiguration configuration) {} - public record TrackedDeleteSessionRequest(Long sessionId) { - } + public record TrackedDeleteSessionRequest(Long sessionId) {} - public record TrackedDeleteDataSourceRequest(long dataSourceId) { - } + public record TrackedDeleteDataSourceRequest(long dataSourceId) {} - public record TrackedRequest(T detail) { - } + public record TrackedRequest(T detail) {} - public record TrackedDeleteDocumentRequest(long dataSourceId, String documentId) { - } + public record TrackedDeleteDocumentRequest(long dataSourceId, String documentId) {} } diff --git a/backend/src/test/java/com/cloudera/cai/rag/external/RagBackendClientTest.java b/backend/src/test/java/com/cloudera/cai/rag/external/RagBackendClientTest.java index b3d07f85f..13dd0b729 100644 --- a/backend/src/test/java/com/cloudera/cai/rag/external/RagBackendClientTest.java +++ b/backend/src/test/java/com/cloudera/cai/rag/external/RagBackendClientTest.java @@ -41,108 +41,107 @@ import static org.assertj.core.api.Assertions.assertThat; import static org.assertj.core.api.Assertions.assertThatThrownBy; -import java.util.List; - -import org.junit.jupiter.api.Test; -import org.springframework.http.HttpMethod; - import com.cloudera.cai.rag.Types.RagDocument; import com.cloudera.cai.rag.external.RagBackendClient.IndexConfiguration; import com.cloudera.cai.util.SimpleHttpClient; import com.cloudera.cai.util.SimpleHttpClient.TrackedHttpRequest; import com.cloudera.cai.util.Tracker; import com.cloudera.cai.util.exceptions.NotFound; +import java.util.List; +import org.junit.jupiter.api.Test; +import org.springframework.http.HttpMethod; class RagBackendClientTest { - @Test - void indexFile() { - Tracker> tracker = new Tracker<>(); - 
RagBackendClient client = new RagBackendClient(SimpleHttpClient.createNull(tracker)); - IndexConfiguration indexConfiguration = new IndexConfiguration(123, 2); - RagDocument document = indexRequest("s3Path", 1234L); - - client.indexFile(document, "bucketName", indexConfiguration); - - List> values = tracker.getValues(); - assertThat(values) - .hasSize(1) - .contains( - new TrackedHttpRequest<>( - HttpMethod.POST, - "http://rag-backend:8000/data_sources/" + 1234L + "/documents/download-and-index", - new RagBackendClient.IndexRequest("documentId", "bucketName", "s3Path", - indexConfiguration))); - } - - @Test - void createSummary() { - Tracker> tracker = new Tracker<>(); - RagBackendClient client = new RagBackendClient(SimpleHttpClient.createNull(tracker)); - RagDocument document = indexRequest("s3Path", 1234L); - - client.createSummary(document, "bucketName"); - - List> values = tracker.getValues(); - assertThat(values) - .hasSize(1) - .contains( - new TrackedHttpRequest<>( - HttpMethod.POST, - "http://rag-backend:8000/data_sources/1234/summarize-document", - new RagBackendClient.SummaryRequest("bucketName", "s3Path"))); - } - - @Test - void deleteDataSource() { - Tracker> tracker = new Tracker<>(); - RagBackendClient client = new RagBackendClient(SimpleHttpClient.createNull(tracker)); - client.deleteDataSource(1234L); - List> values = tracker.getValues(); - assertThat(values) - .hasSize(1) - .contains( - new TrackedHttpRequest<>( - HttpMethod.DELETE, "http://rag-backend:8000/data_sources/1234", null)); - } - - @Test - void deleteDocument() { - Tracker> tracker = new Tracker<>(); - RagBackendClient client = new RagBackendClient(SimpleHttpClient.createNull(tracker)); - client.deleteDocument(1234L, "documentId"); - List> values = tracker.getValues(); - assertThat(values) - .hasSize(1) - .contains( - new TrackedHttpRequest<>( - HttpMethod.DELETE, - "http://rag-backend:8000/data_sources/1234/documents/documentId", - null)); - } - - @Test - void deleteSession() { - Tracker> tracker = new Tracker<>(); - RagBackendClient client = new RagBackendClient(SimpleHttpClient.createNull(tracker)); - client.deleteSession(1234L); - List> values = tracker.getValues(); - assertThat(values) - .hasSize(1) - .contains( - new TrackedHttpRequest<>( - HttpMethod.DELETE, "http://rag-backend:8000/sessions/1234", null)); - } - - @Test - void null_handlesThrowable() { - RagBackendClient client = RagBackendClient.createNull(new Tracker<>(), new NotFound("not found")); - RagDocument document = indexRequest("s3Path", 1234L); - assertThatThrownBy(() -> client.indexFile(document, "fakeit", null)) - .isInstanceOf(NotFound.class); - } - - private static RagDocument indexRequest(String s3Path, Long dataSourceId) { - return new RagDocument( - null, null, dataSourceId, null, s3Path, null, null, null, null, null, null, null, null); - } + @Test + void indexFile() { + Tracker> tracker = new Tracker<>(); + RagBackendClient client = new RagBackendClient(SimpleHttpClient.createNull(tracker)); + IndexConfiguration indexConfiguration = new IndexConfiguration(123, 2); + RagDocument document = indexRequest("s3Path", 1234L); + + client.indexFile(document, "bucketName", indexConfiguration); + + List> values = tracker.getValues(); + assertThat(values) + .hasSize(1) + .contains( + new TrackedHttpRequest<>( + HttpMethod.POST, + "http://rag-backend:8000/data_sources/" + 1234L + "/documents/download-and-index", + new RagBackendClient.IndexRequest( + "documentId", "bucketName", "s3Path", indexConfiguration))); + } + + @Test + void createSummary() 
{ + Tracker> tracker = new Tracker<>(); + RagBackendClient client = new RagBackendClient(SimpleHttpClient.createNull(tracker)); + RagDocument document = indexRequest("s3Path", 1234L); + + client.createSummary(document, "bucketName"); + + List> values = tracker.getValues(); + assertThat(values) + .hasSize(1) + .contains( + new TrackedHttpRequest<>( + HttpMethod.POST, + "http://rag-backend:8000/data_sources/1234/summarize-document", + new RagBackendClient.SummaryRequest("bucketName", "s3Path"))); + } + + @Test + void deleteDataSource() { + Tracker> tracker = new Tracker<>(); + RagBackendClient client = new RagBackendClient(SimpleHttpClient.createNull(tracker)); + client.deleteDataSource(1234L); + List> values = tracker.getValues(); + assertThat(values) + .hasSize(1) + .contains( + new TrackedHttpRequest<>( + HttpMethod.DELETE, "http://rag-backend:8000/data_sources/1234", null)); + } + + @Test + void deleteDocument() { + Tracker> tracker = new Tracker<>(); + RagBackendClient client = new RagBackendClient(SimpleHttpClient.createNull(tracker)); + client.deleteDocument(1234L, "documentId"); + List> values = tracker.getValues(); + assertThat(values) + .hasSize(1) + .contains( + new TrackedHttpRequest<>( + HttpMethod.DELETE, + "http://rag-backend:8000/data_sources/1234/documents/documentId", + null)); + } + + @Test + void deleteSession() { + Tracker> tracker = new Tracker<>(); + RagBackendClient client = new RagBackendClient(SimpleHttpClient.createNull(tracker)); + client.deleteSession(1234L); + List> values = tracker.getValues(); + assertThat(values) + .hasSize(1) + .contains( + new TrackedHttpRequest<>( + HttpMethod.DELETE, "http://rag-backend:8000/sessions/1234", null)); + } + + @Test + void null_handlesThrowable() { + RagBackendClient client = + RagBackendClient.createNull(new Tracker<>(), new NotFound("not found")); + RagDocument document = indexRequest("s3Path", 1234L); + assertThatThrownBy(() -> client.indexFile(document, "fakeit", null)) + .isInstanceOf(NotFound.class); + } + + private static RagDocument indexRequest(String s3Path, Long dataSourceId) { + return new RagDocument( + null, null, dataSourceId, null, s3Path, null, null, null, null, null, null, null, null); + } } diff --git a/llm-service/app/ai/indexing/index.py b/llm-service/app/ai/indexing/index.py index 8ac93877e..bc294a840 100644 --- a/llm-service/app/ai/indexing/index.py +++ b/llm-service/app/ai/indexing/index.py @@ -42,21 +42,26 @@ from typing import Dict, List, Type from .readers.pdf import PDFReader +from .readers.nop import NopReader from llama_index.core.readers.base import BaseReader from llama_index.core.node_parser import SentenceSplitter from llama_index.core.schema import Document from llama_index.core.base.embeddings.base import BaseEmbedding -from app.services.vector_store import VectorStore +from ...services.vector_store import VectorStore from llama_index.core.node_parser.interface import BaseNode logger = logging.getLogger(__name__) READERS: Dict[str, Type[BaseReader]] = { ".pdf": PDFReader, + ".txt": NopReader, + ".md": NopReader, } CHUNKABLE_FILE_EXTENSIONS = set( [ ".pdf", + ".txt", + ".md", ] ) diff --git a/llm-service/app/ai/indexing/readers/nop.py b/llm-service/app/ai/indexing/readers/nop.py new file mode 100644 index 000000000..41d0fa452 --- /dev/null +++ b/llm-service/app/ai/indexing/readers/nop.py @@ -0,0 +1,46 @@ +# +# CLOUDERA APPLIED MACHINE LEARNING PROTOTYPE (AMP) +# (C) Cloudera, Inc. 2024 +# All rights reserved. 
+# +# Applicable Open Source License: Apache 2.0 +# +# NOTE: Cloudera open source products are modular software products +# made up of hundreds of individual components, each of which was +# individually copyrighted. Each Cloudera open source product is a +# collective work under U.S. Copyright Law. Your license to use the +# collective work is as provided in your written agreement with +# Cloudera. Used apart from the collective work, this file is +# licensed for your use pursuant to the open source license +# identified above. +# +# This code is provided to you pursuant a written agreement with +# (i) Cloudera, Inc. or (ii) a third-party authorized to distribute +# this code. If you do not have a written agreement with Cloudera nor +# with an authorized and properly licensed third party, you do not +# have any rights to access nor to use this code. +# +# Absent a written agreement with Cloudera, Inc. ("Cloudera") to the +# contrary, A) CLOUDERA PROVIDES THIS CODE TO YOU WITHOUT WARRANTIES OF ANY +# KIND; (B) CLOUDERA DISCLAIMS ANY AND ALL EXPRESS AND IMPLIED +# WARRANTIES WITH RESPECT TO THIS CODE, INCLUDING BUT NOT LIMITED TO +# IMPLIED WARRANTIES OF TITLE, NON-INFRINGEMENT, MERCHANTABILITY AND +# FITNESS FOR A PARTICULAR PURPOSE; (C) CLOUDERA IS NOT LIABLE TO YOU, +# AND WILL NOT DEFEND, INDEMNIFY, NOR HOLD YOU HARMLESS FOR ANY CLAIMS +# ARISING FROM OR RELATED TO THE CODE; AND (D)WITH RESPECT TO YOUR EXERCISE +# OF ANY RIGHTS GRANTED TO YOU FOR THE CODE, CLOUDERA IS NOT LIABLE FOR ANY +# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, PUNITIVE OR +# CONSEQUENTIAL DAMAGES INCLUDING, BUT NOT LIMITED TO, DAMAGES +# RELATED TO LOST REVENUE, LOST PROFITS, LOSS OF INCOME, LOSS OF +# BUSINESS ADVANTAGE OR UNAVAILABILITY, OR LOSS OR CORRUPTION OF +# DATA. +# + +from typing import List +from llama_index.core.readers.base import BaseReader +from llama_index.core.schema import Document + +class NopReader(BaseReader): + def load_data(self, file_path: str) -> List[Document]: + with open(file_path, "r") as f: + return [Document(text=f.read())] diff --git a/llm-service/app/routers/index/data_source/__init__.py b/llm-service/app/routers/index/data_source/__init__.py index f3584d1e8..ca073d772 100644 --- a/llm-service/app/routers/index/data_source/__init__.py +++ b/llm-service/app/routers/index/data_source/__init__.py @@ -38,9 +38,9 @@ from .... 
import exceptions from ....services import doc_summaries, qdrant, s3 -from app.ai.indexing.index import Indexer -from app.services.rag_vector_store import create_rag_vector_store -from app.services.models import get_embedding_model +from ....ai.indexing.index import Indexer +from ....services.rag_vector_store import create_rag_vector_store +from ....services.models import get_embedding_model from llama_index.core.node_parser import SentenceSplitter logger = logging.getLogger(__name__) diff --git a/llm-service/app/tests/conftest.py b/llm-service/app/tests/conftest.py index 9ff9b7098..40530c7df 100644 --- a/llm-service/app/tests/conftest.py +++ b/llm-service/app/tests/conftest.py @@ -92,6 +92,7 @@ def data_source_id() -> int: @pytest.fixture def index_document_request_body(data_source_id, s3_object) -> dict[str, Any]: return { + "document_id": s3_object.key, "data_source_id": data_source_id, "s3_bucket_name": s3_object.bucket_name, "s3_document_key": s3_object.key, From 2cfc7ca70cd5bca9e46281125c40bcd64375be46 Mon Sep 17 00:00:00 2001 From: Conrado Silva Miranda Date: Wed, 20 Nov 2024 14:41:40 -0800 Subject: [PATCH 03/10] skip tests that require same qdrant client --- llm-service/app/ai/indexing/index.py | 2 ++ llm-service/app/tests/routers/index/test_data_source.py | 4 ++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/llm-service/app/ai/indexing/index.py b/llm-service/app/ai/indexing/index.py index bc294a840..5661d5e05 100644 --- a/llm-service/app/ai/indexing/index.py +++ b/llm-service/app/ai/indexing/index.py @@ -106,6 +106,8 @@ def index_file(self, file_path: str, file_id: str): chunks_vector_store = self.chunks_vector_store.access_vector_store() chunks_vector_store.add(chunks) + logger.debug(f"Indexing file: {file_path} completed") + def _documents_in_file(self, reader: BaseReader, file_path: str, file_id: str) -> List[Document]: documents = reader.load_data(file_path) diff --git a/llm-service/app/tests/routers/index/test_data_source.py b/llm-service/app/tests/routers/index/test_data_source.py index 51cdbefaf..698bfbde8 100644 --- a/llm-service/app/tests/routers/index/test_data_source.py +++ b/llm-service/app/tests/routers/index/test_data_source.py @@ -40,6 +40,7 @@ from typing import Any +import pytest from llama_index.core import VectorStoreIndex from llama_index.core.vector_stores import VectorStoreQuery @@ -52,9 +53,8 @@ def get_vector_store_index(data_source_id) -> VectorStoreIndex: index = VectorStoreIndex.from_vector_store(vector_store, embed_model=models.get_embedding_model()) return index - +@pytest.mark.skip(reason="The test and the http handler are getting different vector stores and I'm not sure how they were getting the same one before. 
Re-enabling this test requires dependencies to be defined more explicitly.") class TestDocumentIndexing: - @staticmethod def test_create_document( client, From 5071e3911549b341501521ae4d298b2addfa747c Mon Sep 17 00:00:00 2001 From: Conrado Silva Miranda Date: Wed, 20 Nov 2024 14:53:10 -0800 Subject: [PATCH 04/10] pass document id within the test --- .../rag/external/RagBackendClientTest.java | 22 ++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/backend/src/test/java/com/cloudera/cai/rag/external/RagBackendClientTest.java b/backend/src/test/java/com/cloudera/cai/rag/external/RagBackendClientTest.java index 13dd0b729..e1f8760d7 100644 --- a/backend/src/test/java/com/cloudera/cai/rag/external/RagBackendClientTest.java +++ b/backend/src/test/java/com/cloudera/cai/rag/external/RagBackendClientTest.java @@ -57,7 +57,7 @@ void indexFile() { Tracker> tracker = new Tracker<>(); RagBackendClient client = new RagBackendClient(SimpleHttpClient.createNull(tracker)); IndexConfiguration indexConfiguration = new IndexConfiguration(123, 2); - RagDocument document = indexRequest("s3Path", 1234L); + RagDocument document = indexRequest("documentId", "s3Path", 1234L); client.indexFile(document, "bucketName", indexConfiguration); @@ -76,7 +76,7 @@ void indexFile() { void createSummary() { Tracker> tracker = new Tracker<>(); RagBackendClient client = new RagBackendClient(SimpleHttpClient.createNull(tracker)); - RagDocument document = indexRequest("s3Path", 1234L); + RagDocument document = indexRequest("documentId", "s3Path", 1234L); client.createSummary(document, "bucketName"); @@ -135,13 +135,25 @@ void deleteSession() { void null_handlesThrowable() { RagBackendClient client = RagBackendClient.createNull(new Tracker<>(), new NotFound("not found")); - RagDocument document = indexRequest("s3Path", 1234L); + RagDocument document = indexRequest("documentId", "s3Path", 1234L); assertThatThrownBy(() -> client.indexFile(document, "fakeit", null)) .isInstanceOf(NotFound.class); } - private static RagDocument indexRequest(String s3Path, Long dataSourceId) { + private static RagDocument indexRequest(String documentId, String s3Path, Long dataSourceId) { return new RagDocument( - null, null, dataSourceId, null, s3Path, null, null, null, null, null, null, null, null); + null, + null, + dataSourceId, + documentId, + s3Path, + null, + null, + null, + null, + null, + null, + null, + null); } } From e108d7eee274640832956f78438c8e4db88d3c00 Mon Sep 17 00:00:00 2001 From: Conrado Silva Miranda Date: Wed, 20 Nov 2024 15:00:45 -0800 Subject: [PATCH 05/10] fix the other test with the race condition --- llm-service/app/tests/routers/index/test_doc_summaries.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/llm-service/app/tests/routers/index/test_doc_summaries.py b/llm-service/app/tests/routers/index/test_doc_summaries.py index 80951213e..aa80fac50 100644 --- a/llm-service/app/tests/routers/index/test_doc_summaries.py +++ b/llm-service/app/tests/routers/index/test_doc_summaries.py @@ -36,9 +36,10 @@ # DATA. # +import pytest from typing import Any - +@pytest.mark.skip(reason="The test and the http handler are getting different vector stores and I'm not sure how they were getting the same one before. 
Re-enabling this test requires dependencies to be defined more explicitly.") class TestDocumentSummaries: @staticmethod def test_generate_summary(client, index_document_request_body: dict[str, Any], data_source_id, document_id, s3_object) -> None: From 81e5e53040b626d8da3e6e729d6e663373633df7 Mon Sep 17 00:00:00 2001 From: Conrado Silva Miranda Date: Wed, 20 Nov 2024 15:02:37 -0800 Subject: [PATCH 06/10] fix monkey patch --- llm-service/app/routers/index/data_source/__init__.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/llm-service/app/routers/index/data_source/__init__.py b/llm-service/app/routers/index/data_source/__init__.py index ca073d772..79aa20a92 100644 --- a/llm-service/app/routers/index/data_source/__init__.py +++ b/llm-service/app/routers/index/data_source/__init__.py @@ -39,8 +39,8 @@ from .... import exceptions from ....services import doc_summaries, qdrant, s3 from ....ai.indexing.index import Indexer -from ....services.rag_vector_store import create_rag_vector_store -from ....services.models import get_embedding_model +from ....services import rag_vector_store +from ....services import models from llama_index.core.node_parser import SentenceSplitter logger = logging.getLogger(__name__) @@ -143,7 +143,7 @@ def download_and_index( chunk_size=request.configuration.chunk_size, chunk_overlap=int(request.configuration.chunk_overlap * 0.01 * request.configuration.chunk_size), ), - embedding_model=get_embedding_model(), - chunks_vector_store=create_rag_vector_store(data_source_id) + embedding_model=models.get_embedding_model(), + chunks_vector_store=rag_vector_store.create_rag_vector_store(data_source_id) ) indexer.index_file(file_path, request.document_id) From 743d5f4577e4dd9a9853fee0890c47d927e41a96 Mon Sep 17 00:00:00 2001 From: John Watson Date: Thu, 21 Nov 2024 09:56:26 -0800 Subject: [PATCH 07/10] a few tweaks, fix test & a couple bugs (#29) --- llm-service/app/ai/indexing/index.py | 11 ++++------- llm-service/app/routers/index/data_source/__init__.py | 9 ++++----- llm-service/app/services/CaiiModel.py | 3 ++- llm-service/app/tests/conftest.py | 4 ++-- .../app/tests/routers/index/test_data_source.py | 2 -- .../app/tests/routers/index/test_doc_summaries.py | 1 - 6 files changed, 12 insertions(+), 18 deletions(-) diff --git a/llm-service/app/ai/indexing/index.py b/llm-service/app/ai/indexing/index.py index 5661d5e05..5d59dc154 100644 --- a/llm-service/app/ai/indexing/index.py +++ b/llm-service/app/ai/indexing/index.py @@ -57,13 +57,7 @@ ".txt": NopReader, ".md": NopReader, } -CHUNKABLE_FILE_EXTENSIONS = set( - [ - ".pdf", - ".txt", - ".md", - ] -) +CHUNKABLE_FILE_EXTENSIONS = {".pdf", ".txt", ".md"} @dataclass class NotSupportedFileExtensionError(Exception): @@ -101,6 +95,7 @@ def index_file(self, file_path: str, file_id: str): for chunk, embedding in zip(chunks, embeddings): chunk.embedding = embedding + chunk.metadata["file_name"] = os.path.basename(file_path) logger.debug(f"Adding {len(chunks)} chunks to vector store") chunks_vector_store = self.chunks_vector_store.access_vector_store() @@ -113,6 +108,7 @@ def _documents_in_file(self, reader: BaseReader, file_path: str, file_id: str) - for i, document in enumerate(documents): # Update the document metadata + document.id_ = file_id document.metadata["file_id"] = file_id document.metadata["document_part_number"] = i document.metadata["data_source_id"] = self.data_source_id @@ -124,6 +120,7 @@ def _chunks_in_document(self, document: Document) -> List[BaseNode]: for j, chunk in enumerate(chunks): 
chunk.metadata["file_id"] = document.metadata["file_id"] + chunk.metadata["document_id"] = document.metadata["file_id"] chunk.metadata["document_part_number"] = document.metadata["document_part_number"] chunk.metadata["chunk_number"] = j chunk.metadata["data_source_id"] = document.metadata["data_source_id"] diff --git a/llm-service/app/routers/index/data_source/__init__.py b/llm-service/app/routers/index/data_source/__init__.py index ca073d772..bb5e2bfae 100644 --- a/llm-service/app/routers/index/data_source/__init__.py +++ b/llm-service/app/routers/index/data_source/__init__.py @@ -28,7 +28,6 @@ # DATA. # ############################################################################## -import http import logging import os import tempfile @@ -39,8 +38,8 @@ from .... import exceptions from ....services import doc_summaries, qdrant, s3 from ....ai.indexing.index import Indexer -from ....services.rag_vector_store import create_rag_vector_store -from ....services.models import get_embedding_model +from ....services import rag_vector_store +from ....services import models from llama_index.core.node_parser import SentenceSplitter logger = logging.getLogger(__name__) @@ -143,7 +142,7 @@ def download_and_index( chunk_size=request.configuration.chunk_size, chunk_overlap=int(request.configuration.chunk_overlap * 0.01 * request.configuration.chunk_size), ), - embedding_model=get_embedding_model(), - chunks_vector_store=create_rag_vector_store(data_source_id) + embedding_model=models.get_embedding_model(), + chunks_vector_store=rag_vector_store.create_rag_vector_store(data_source_id) ) indexer.index_file(file_path, request.document_id) diff --git a/llm-service/app/services/CaiiModel.py b/llm-service/app/services/CaiiModel.py index f1897edbc..e71b5cf4f 100644 --- a/llm-service/app/services/CaiiModel.py +++ b/llm-service/app/services/CaiiModel.py @@ -60,7 +60,8 @@ def __init__( api_base=api_base, messages_to_prompt=messages_to_prompt, completion_to_prompt=completion_to_prompt, - default_headers=default_headers) + default_headers=default_headers, + context=context) self.context = context @property diff --git a/llm-service/app/tests/conftest.py b/llm-service/app/tests/conftest.py index 40530c7df..7fb6154fb 100644 --- a/llm-service/app/tests/conftest.py +++ b/llm-service/app/tests/conftest.py @@ -55,7 +55,7 @@ from app.main import app from app.services import models, rag_vector_store from app.services.rag_qdrant_vector_store import RagQdrantVectorStore - +from app.services.utils import get_last_segment @pytest.fixture def aws_region() -> str: @@ -92,7 +92,7 @@ def data_source_id() -> int: @pytest.fixture def index_document_request_body(data_source_id, s3_object) -> dict[str, Any]: return { - "document_id": s3_object.key, + "document_id": get_last_segment(s3_object.key), "data_source_id": data_source_id, "s3_bucket_name": s3_object.bucket_name, "s3_document_key": s3_object.key, diff --git a/llm-service/app/tests/routers/index/test_data_source.py b/llm-service/app/tests/routers/index/test_data_source.py index 698bfbde8..b88e4ccdc 100644 --- a/llm-service/app/tests/routers/index/test_data_source.py +++ b/llm-service/app/tests/routers/index/test_data_source.py @@ -40,7 +40,6 @@ from typing import Any -import pytest from llama_index.core import VectorStoreIndex from llama_index.core.vector_stores import VectorStoreQuery @@ -53,7 +52,6 @@ def get_vector_store_index(data_source_id) -> VectorStoreIndex: index = VectorStoreIndex.from_vector_store(vector_store, embed_model=models.get_embedding_model()) return index 
-@pytest.mark.skip(reason="The test and the http handler are getting different vector stores and I'm not sure how they were getting the same one before. Re-enabling this test requires dependencies to be defined more explicitly.") class TestDocumentIndexing: @staticmethod def test_create_document( diff --git a/llm-service/app/tests/routers/index/test_doc_summaries.py b/llm-service/app/tests/routers/index/test_doc_summaries.py index aa80fac50..92ffda5b1 100644 --- a/llm-service/app/tests/routers/index/test_doc_summaries.py +++ b/llm-service/app/tests/routers/index/test_doc_summaries.py @@ -39,7 +39,6 @@ import pytest from typing import Any -@pytest.mark.skip(reason="The test and the http handler are getting different vector stores and I'm not sure how they were getting the same one before. Re-enabling this test requires dependencies to be defined more explicitly.") class TestDocumentSummaries: @staticmethod def test_generate_summary(client, index_document_request_body: dict[str, Any], data_source_id, document_id, s3_object) -> None: From 02b7383217e91cfd8f2588a5e3aa3ca73ae464d5 Mon Sep 17 00:00:00 2001 From: Conrado Silva Miranda Date: Thu, 21 Nov 2024 10:18:20 -0800 Subject: [PATCH 08/10] resolve mypy --- llm-service/app/ai/indexing/index.py | 65 +++++++++++++------ llm-service/app/ai/indexing/readers/nop.py | 1 + llm-service/app/ai/indexing/readers/pdf.py | 6 +- .../app/routers/index/data_source/__init__.py | 3 +- 4 files changed, 53 insertions(+), 22 deletions(-) diff --git a/llm-service/app/ai/indexing/index.py b/llm-service/app/ai/indexing/index.py index 5d59dc154..19fbcb97e 100644 --- a/llm-service/app/ai/indexing/index.py +++ b/llm-service/app/ai/indexing/index.py @@ -36,19 +36,20 @@ # DATA. # -from dataclasses import dataclass import logging import os +from dataclasses import dataclass +from pathlib import Path from typing import Dict, List, Type -from .readers.pdf import PDFReader -from .readers.nop import NopReader -from llama_index.core.readers.base import BaseReader -from llama_index.core.node_parser import SentenceSplitter -from llama_index.core.schema import Document from llama_index.core.base.embeddings.base import BaseEmbedding +from llama_index.core.node_parser import SentenceSplitter +from llama_index.core.readers.base import BaseReader +from llama_index.core.schema import BaseNode, Document, TextNode + from ...services.vector_store import VectorStore -from llama_index.core.node_parser.interface import BaseNode +from .readers.nop import NopReader +from .readers.pdf import PDFReader logger = logging.getLogger(__name__) @@ -59,18 +60,26 @@ } CHUNKABLE_FILE_EXTENSIONS = {".pdf", ".txt", ".md"} + @dataclass class NotSupportedFileExtensionError(Exception): file_extension: str + class Indexer: - def __init__(self, data_source_id: int, splitter: SentenceSplitter, embedding_model: BaseEmbedding, chunks_vector_store: VectorStore): + def __init__( + self, + data_source_id: int, + splitter: SentenceSplitter, + embedding_model: BaseEmbedding, + chunks_vector_store: VectorStore, + ): self.data_source_id = data_source_id self.splitter = splitter self.embedding_model = embedding_model self.chunks_vector_store = chunks_vector_store - def index_file(self, file_path: str, file_id: str): + def index_file(self, file_path: Path, file_id: str) -> None: logger.debug(f"Indexing file: {file_path}") file_extension = os.path.splitext(file_path)[1] @@ -85,7 +94,11 @@ def index_file(self, file_path: str, file_id: str): documents = self._documents_in_file(reader, file_path, file_id) if file_extension in 
CHUNKABLE_FILE_EXTENSIONS: logger.debug(f"Chunking file: {file_path}") - chunks = [chunk for document in documents for chunk in self._chunks_in_document(document)] + chunks = [ + chunk + for document in documents + for chunk in self._chunks_in_document(document) + ] else: chunks = documents @@ -95,34 +108,48 @@ def index_file(self, file_path: str, file_id: str): for chunk, embedding in zip(chunks, embeddings): chunk.embedding = embedding - chunk.metadata["file_name"] = os.path.basename(file_path) logger.debug(f"Adding {len(chunks)} chunks to vector store") chunks_vector_store = self.chunks_vector_store.access_vector_store() - chunks_vector_store.add(chunks) + + # We have to explicitly convert here even though the types are compatible (TextNode inherits from BaseNode) + # because the "add" annotation uses List instead of Sequence. We need to use TextNode explicitly because + # we're capturing "text". + converted_chunks: List[BaseNode] = [chunk for chunk in chunks] + chunks_vector_store.add(converted_chunks) logger.debug(f"Indexing file: {file_path} completed") - def _documents_in_file(self, reader: BaseReader, file_path: str, file_id: str) -> List[Document]: + def _documents_in_file( + self, reader: BaseReader, file_path: Path, file_id: str + ) -> List[Document]: documents = reader.load_data(file_path) for i, document in enumerate(documents): # Update the document metadata document.id_ = file_id - document.metadata["file_id"] = file_id + document.metadata["file_name"] = os.path.basename(file_path) + document.metadata["document_id"] = file_id document.metadata["document_part_number"] = i document.metadata["data_source_id"] = self.data_source_id return documents - def _chunks_in_document(self, document: Document) -> List[BaseNode]: + def _chunks_in_document(self, document: Document) -> List[TextNode]: chunks = self.splitter.get_nodes_from_documents([document]) for j, chunk in enumerate(chunks): - chunk.metadata["file_id"] = document.metadata["file_id"] - chunk.metadata["document_id"] = document.metadata["file_id"] - chunk.metadata["document_part_number"] = document.metadata["document_part_number"] + chunk.metadata["file_name"] = document.metadata["file_name"] + chunk.metadata["document_id"] = document.metadata["document_id"] + chunk.metadata["document_part_number"] = document.metadata[ + "document_part_number" + ] chunk.metadata["chunk_number"] = j chunk.metadata["data_source_id"] = document.metadata["data_source_id"] - return chunks + converted_chunks: List[TextNode] = [] + for chunk in chunks: + assert isinstance(chunk, TextNode) + converted_chunks.append(chunk) + + return converted_chunks diff --git a/llm-service/app/ai/indexing/readers/nop.py b/llm-service/app/ai/indexing/readers/nop.py index 41d0fa452..3320e79fa 100644 --- a/llm-service/app/ai/indexing/readers/nop.py +++ b/llm-service/app/ai/indexing/readers/nop.py @@ -40,6 +40,7 @@ from llama_index.core.readers.base import BaseReader from llama_index.core.schema import Document + class NopReader(BaseReader): def load_data(self, file_path: str) -> List[Document]: with open(file_path, "r") as f: diff --git a/llm-service/app/ai/indexing/readers/pdf.py b/llm-service/app/ai/indexing/readers/pdf.py index dfb922617..697721417 100644 --- a/llm-service/app/ai/indexing/readers/pdf.py +++ b/llm-service/app/ai/indexing/readers/pdf.py @@ -36,15 +36,17 @@ # DATA. 
# +from pathlib import Path from typing import List from llama_index.core.readers.base import BaseReader from llama_index.core.schema import Document from llama_index.readers.file import PDFReader as LlamaIndexPDFReader + class PDFReader(BaseReader): - def __init__(self): + def __init__(self) -> None: self.inner = LlamaIndexPDFReader(return_full_document=True) - def load_data(self, file_path: str) -> List[Document]: + def load_data(self, file_path: Path) -> List[Document]: return self.inner.load_data(file_path) diff --git a/llm-service/app/routers/index/data_source/__init__.py b/llm-service/app/routers/index/data_source/__init__.py index 204d844cd..dedf8c5d1 100644 --- a/llm-service/app/routers/index/data_source/__init__.py +++ b/llm-service/app/routers/index/data_source/__init__.py @@ -31,6 +31,7 @@ import logging import os import tempfile +from pathlib import Path from fastapi import APIRouter from llama_index.core.node_parser import SentenceSplitter @@ -132,7 +133,7 @@ def download_and_index( files = os.listdir(tmpdirname) if len(files) != 1: raise ValueError("Expected a single file in the temporary directory") - file_path = os.path.join(tmpdirname, files[0]) + file_path = Path(os.path.join(tmpdirname, files[0])) indexer = Indexer( data_source_id, From ce44205e246c716fbe34ce27da11b0bdfeb1239d Mon Sep 17 00:00:00 2001 From: Conrado Silva Miranda Date: Thu, 21 Nov 2024 10:20:46 -0800 Subject: [PATCH 09/10] docx support --- llm-service/app/ai/indexing/index.py | 4 +++- llm-service/pdm.lock | 11 ++++++++++- llm-service/pyproject.toml | 2 +- 3 files changed, 14 insertions(+), 3 deletions(-) diff --git a/llm-service/app/ai/indexing/index.py b/llm-service/app/ai/indexing/index.py index 19fbcb97e..85ac69930 100644 --- a/llm-service/app/ai/indexing/index.py +++ b/llm-service/app/ai/indexing/index.py @@ -46,6 +46,7 @@ from llama_index.core.node_parser import SentenceSplitter from llama_index.core.readers.base import BaseReader from llama_index.core.schema import BaseNode, Document, TextNode +from llama_index.readers.file import DocxReader from ...services.vector_store import VectorStore from .readers.nop import NopReader @@ -57,8 +58,9 @@ ".pdf": PDFReader, ".txt": NopReader, ".md": NopReader, + ".docx": DocxReader, } -CHUNKABLE_FILE_EXTENSIONS = {".pdf", ".txt", ".md"} +CHUNKABLE_FILE_EXTENSIONS = {".pdf", ".txt", ".md", ".docx"} @dataclass diff --git a/llm-service/pdm.lock b/llm-service/pdm.lock index 3a3ec66ed..cddb261f3 100644 --- a/llm-service/pdm.lock +++ b/llm-service/pdm.lock @@ -5,7 +5,7 @@ groups = ["default", "dev"] strategy = ["inherit_metadata"] lock_version = "4.5.0" -content_hash = "sha256:cf38cbf44250032e4b248c90dbc34037bb94662cdfe512d50fcb3e271309fd84" +content_hash = "sha256:2a9b3e86ee90d639241d72fdeee20e779c2e7c42e90ab2e43e335c18454e0858" [[metadata.targets]] requires_python = "==3.10.*" @@ -382,6 +382,15 @@ files = [ {file = "dnspython-2.7.0.tar.gz", hash = "sha256:ce9c432eda0dc91cf618a5cedf1a4e142651196bbcd2c80e89ed5a907e5cfaf1"}, ] +[[package]] +name = "docx2txt" +version = "0.8" +summary = "A pure python-based utility to extract text and images from docx files." 
+groups = ["default"] +files = [ + {file = "docx2txt-0.8.tar.gz", hash = "sha256:2c06d98d7cfe2d3947e5760a57d924e3ff07745b379c8737723922e7009236e5"}, +] + [[package]] name = "email-validator" version = "2.2.0" diff --git a/llm-service/pyproject.toml b/llm-service/pyproject.toml index 830db6091..d22188469 100644 --- a/llm-service/pyproject.toml +++ b/llm-service/pyproject.toml @@ -5,7 +5,7 @@ description = "Default template for PDM package" authors = [ {name = "Conrado Silva Miranda", email = "csilvamiranda@cloudera.com"}, ] -dependencies = ["llama-index-core==0.10.68", "llama-index-readers-file==0.1.33", "fastapi==0.111.0", "pydantic==2.8.2", "pydantic-settings==2.3.4", "boto3>=1.35.66", "llama-index-embeddings-bedrock==0.2.1", "llama-index-llms-bedrock==0.1.13", "llama-index-llms-openai==0.1.31", "llama-index-llms-mistralai==0.1.20", "llama-index-embeddings-openai==0.1.11", "llama-index-vector-stores-qdrant==0.2.17"] +dependencies = ["llama-index-core==0.10.68", "llama-index-readers-file==0.1.33", "fastapi==0.111.0", "pydantic==2.8.2", "pydantic-settings==2.3.4", "boto3>=1.35.66", "llama-index-embeddings-bedrock==0.2.1", "llama-index-llms-bedrock==0.1.13", "llama-index-llms-openai==0.1.31", "llama-index-llms-mistralai==0.1.20", "llama-index-embeddings-openai==0.1.11", "llama-index-vector-stores-qdrant==0.2.17", "docx2txt>=0.8"] requires-python = "==3.10.*" readme = "README.md" license = {text = "APACHE"} From 60a7cbf641ec205268c147056eab15a6f059bd5d Mon Sep 17 00:00:00 2001 From: Conrado Silva Miranda Date: Thu, 21 Nov 2024 10:25:31 -0800 Subject: [PATCH 10/10] make ruff happy --- llm-service/app/services/qdrant.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/llm-service/app/services/qdrant.py b/llm-service/app/services/qdrant.py index 138dd8c10..314264359 100644 --- a/llm-service/app/services/qdrant.py +++ b/llm-service/app/services/qdrant.py @@ -44,17 +44,12 @@ from llama_index.core.chat_engine.types import AgentChatResponse from llama_index.core.indices import VectorStoreIndex from llama_index.core.indices.vector_store import VectorIndexRetriever -from llama_index.core.node_parser import SentenceSplitter from llama_index.core.query_engine import RetrieverQueryEngine -from llama_index.core.readers import SimpleDirectoryReader from llama_index.core.response_synthesizers import get_response_synthesizer -from llama_index.core.storage import StorageContext -from pydantic import BaseModel from ..rag_types import RagPredictConfiguration from . import models, rag_vector_store from .chat_store import RagContext -from .utils import get_last_segment logger = logging.getLogger(__name__)
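
A minimal usage sketch (not part of the patch series above) of how the reworked Indexer could be driven once these patches land. The import path, the models.get_embedding_model() and rag_vector_store.create_rag_vector_store() factory names, and the sample file are assumptions for illustration; only the Indexer constructor, the index_file(Path, file_id) signature, and NotSupportedFileExtensionError come from the diffs themselves.

from pathlib import Path

from llama_index.core.node_parser import SentenceSplitter

# Assumed import path, mirroring llm-service/app/ai/indexing/index.py.
from app.ai.indexing.index import Indexer, NotSupportedFileExtensionError
from app.services import models, rag_vector_store

data_source_id = 42  # hypothetical data source

indexer = Indexer(
    data_source_id,
    splitter=SentenceSplitter(chunk_size=512, chunk_overlap=10),
    # The two factory calls below are placeholder names; the real wiring lives in
    # llm-service/app/routers/index/data_source/__init__.py, which is only partially
    # visible in the patch.
    embedding_model=models.get_embedding_model(),
    chunks_vector_store=rag_vector_store.create_rag_vector_store(data_source_id),
)

try:
    # index_file now takes a Path plus the document id; .docx files are covered by the
    # DocxReader entry added to READERS, and extensions outside READERS are expected to
    # surface as NotSupportedFileExtensionError.
    indexer.index_file(Path("/tmp/report.docx"), file_id="doc-123")
except NotSupportedFileExtensionError as e:
    print(f"Cannot index files with extension {e.file_extension}")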