From cb095310598b92c9a755358c83352b68085fedb6 Mon Sep 17 00:00:00 2001 From: Conrado Silva Miranda Date: Wed, 20 Nov 2024 13:24:26 -0800 Subject: [PATCH 01/10] Provide Indexer to index files --- .../cai/rag/external/RagBackendClient.java | 36 ++-- .../rag/external/RagBackendClientTest.java | 190 +++++++++--------- llm-service/app/ai/__init__.py | 37 ++++ llm-service/app/ai/indexing/__init__.py | 37 ++++ llm-service/app/ai/indexing/index.py | 124 ++++++++++++ .../app/ai/indexing/readers/__init__.py | 37 ++++ llm-service/app/ai/indexing/readers/pdf.py | 50 +++++ .../app/routers/index/data_source/__init__.py | 39 +++- llm-service/app/services/qdrant.py | 55 ----- 9 files changed, 433 insertions(+), 172 deletions(-) create mode 100644 llm-service/app/ai/__init__.py create mode 100644 llm-service/app/ai/indexing/__init__.py create mode 100644 llm-service/app/ai/indexing/index.py create mode 100644 llm-service/app/ai/indexing/readers/__init__.py create mode 100644 llm-service/app/ai/indexing/readers/pdf.py diff --git a/backend/src/main/java/com/cloudera/cai/rag/external/RagBackendClient.java b/backend/src/main/java/com/cloudera/cai/rag/external/RagBackendClient.java index cbdc33d38..427fd3b27 100644 --- a/backend/src/main/java/com/cloudera/cai/rag/external/RagBackendClient.java +++ b/backend/src/main/java/com/cloudera/cai/rag/external/RagBackendClient.java @@ -38,15 +38,18 @@ package com.cloudera.cai.rag.external; +import java.io.IOException; + +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.stereotype.Component; + import com.cloudera.cai.rag.Types; import com.cloudera.cai.rag.configuration.AppConfiguration; import com.cloudera.cai.util.SimpleHttpClient; import com.cloudera.cai.util.Tracker; import com.fasterxml.jackson.annotation.JsonProperty; + import io.opentelemetry.instrumentation.annotations.WithSpan; -import java.io.IOException; -import org.springframework.beans.factory.annotation.Autowired; -import org.springframework.stereotype.Component; @Component public class RagBackendClient { @@ -68,7 +71,7 @@ public void indexFile( + "/data_sources/" + ragDocument.dataSourceId() + "/documents/download-and-index", - new IndexRequest(bucketName, ragDocument.s3Path(), configuration)); + new IndexRequest(ragDocument.documentId(), bucketName, ragDocument.s3Path(), configuration)); } catch (IOException e) { throw new RuntimeException(e); } @@ -97,17 +100,21 @@ public void deleteSession(Long sessionId) { } record IndexRequest( + @JsonProperty("document_id") String documentId, @JsonProperty("s3_bucket_name") String s3BucketName, @JsonProperty("s3_document_key") String s3DocumentKey, - IndexConfiguration configuration) {} + IndexConfiguration configuration) { + } public record SummaryRequest( @JsonProperty("s3_bucket_name") String s3BucketName, - @JsonProperty("s3_document_key") String s3DocumentKey) {} + @JsonProperty("s3_document_key") String s3DocumentKey) { + } public record IndexConfiguration( @JsonProperty("chunk_size") int chunkSize, - @JsonProperty("chunk_overlap") int chunkOverlapPercentage) {} + @JsonProperty("chunk_overlap") int chunkOverlapPercentage) { + } // nullables below here @@ -171,13 +178,18 @@ public void deleteDocument(long dataSourceId, String documentId) { } public record TrackedIndexRequest( - String bucketName, String s3Path, long dataSourceId, IndexConfiguration configuration) {} + String bucketName, String s3Path, long dataSourceId, IndexConfiguration configuration) { + } - public record TrackedDeleteSessionRequest(Long sessionId) {} + 
public record TrackedDeleteSessionRequest(Long sessionId) { + } - public record TrackedDeleteDataSourceRequest(long dataSourceId) {} + public record TrackedDeleteDataSourceRequest(long dataSourceId) { + } - public record TrackedRequest(T detail) {} + public record TrackedRequest(T detail) { + } - public record TrackedDeleteDocumentRequest(long dataSourceId, String documentId) {} + public record TrackedDeleteDocumentRequest(long dataSourceId, String documentId) { + } } diff --git a/backend/src/test/java/com/cloudera/cai/rag/external/RagBackendClientTest.java b/backend/src/test/java/com/cloudera/cai/rag/external/RagBackendClientTest.java index 60f6393ff..b3d07f85f 100644 --- a/backend/src/test/java/com/cloudera/cai/rag/external/RagBackendClientTest.java +++ b/backend/src/test/java/com/cloudera/cai/rag/external/RagBackendClientTest.java @@ -41,106 +41,108 @@ import static org.assertj.core.api.Assertions.assertThat; import static org.assertj.core.api.Assertions.assertThatThrownBy; +import java.util.List; + +import org.junit.jupiter.api.Test; +import org.springframework.http.HttpMethod; + import com.cloudera.cai.rag.Types.RagDocument; import com.cloudera.cai.rag.external.RagBackendClient.IndexConfiguration; import com.cloudera.cai.util.SimpleHttpClient; import com.cloudera.cai.util.SimpleHttpClient.TrackedHttpRequest; import com.cloudera.cai.util.Tracker; import com.cloudera.cai.util.exceptions.NotFound; -import java.util.List; -import org.junit.jupiter.api.Test; -import org.springframework.http.HttpMethod; class RagBackendClientTest { - @Test - void indexFile() { - Tracker> tracker = new Tracker<>(); - RagBackendClient client = new RagBackendClient(SimpleHttpClient.createNull(tracker)); - IndexConfiguration indexConfiguration = new IndexConfiguration(123, 2); - RagDocument document = indexRequest("s3Path", 1234L); - - client.indexFile(document, "bucketName", indexConfiguration); - - List> values = tracker.getValues(); - assertThat(values) - .hasSize(1) - .contains( - new TrackedHttpRequest<>( - HttpMethod.POST, - "http://rag-backend:8000/data_sources/" + 1234L + "/documents/download-and-index", - new RagBackendClient.IndexRequest("bucketName", "s3Path", indexConfiguration))); - } - - @Test - void createSummary() { - Tracker> tracker = new Tracker<>(); - RagBackendClient client = new RagBackendClient(SimpleHttpClient.createNull(tracker)); - RagDocument document = indexRequest("s3Path", 1234L); - - client.createSummary(document, "bucketName"); - - List> values = tracker.getValues(); - assertThat(values) - .hasSize(1) - .contains( - new TrackedHttpRequest<>( - HttpMethod.POST, - "http://rag-backend:8000/data_sources/1234/summarize-document", - new RagBackendClient.SummaryRequest("bucketName", "s3Path"))); - } - - @Test - void deleteDataSource() { - Tracker> tracker = new Tracker<>(); - RagBackendClient client = new RagBackendClient(SimpleHttpClient.createNull(tracker)); - client.deleteDataSource(1234L); - List> values = tracker.getValues(); - assertThat(values) - .hasSize(1) - .contains( - new TrackedHttpRequest<>( - HttpMethod.DELETE, "http://rag-backend:8000/data_sources/1234", null)); - } - - @Test - void deleteDocument() { - Tracker> tracker = new Tracker<>(); - RagBackendClient client = new RagBackendClient(SimpleHttpClient.createNull(tracker)); - client.deleteDocument(1234L, "documentId"); - List> values = tracker.getValues(); - assertThat(values) - .hasSize(1) - .contains( - new TrackedHttpRequest<>( - HttpMethod.DELETE, - "http://rag-backend:8000/data_sources/1234/documents/documentId", - 
null)); - } - - @Test - void deleteSession() { - Tracker> tracker = new Tracker<>(); - RagBackendClient client = new RagBackendClient(SimpleHttpClient.createNull(tracker)); - client.deleteSession(1234L); - List> values = tracker.getValues(); - assertThat(values) - .hasSize(1) - .contains( - new TrackedHttpRequest<>( - HttpMethod.DELETE, "http://rag-backend:8000/sessions/1234", null)); - } - - @Test - void null_handlesThrowable() { - RagBackendClient client = - RagBackendClient.createNull(new Tracker<>(), new NotFound("not found")); - RagDocument document = indexRequest("s3Path", 1234L); - assertThatThrownBy(() -> client.indexFile(document, "fakeit", null)) - .isInstanceOf(NotFound.class); - } - - private static RagDocument indexRequest(String s3Path, Long dataSourceId) { - return new RagDocument( - null, null, dataSourceId, null, s3Path, null, null, null, null, null, null, null, null); - } + @Test + void indexFile() { + Tracker> tracker = new Tracker<>(); + RagBackendClient client = new RagBackendClient(SimpleHttpClient.createNull(tracker)); + IndexConfiguration indexConfiguration = new IndexConfiguration(123, 2); + RagDocument document = indexRequest("s3Path", 1234L); + + client.indexFile(document, "bucketName", indexConfiguration); + + List> values = tracker.getValues(); + assertThat(values) + .hasSize(1) + .contains( + new TrackedHttpRequest<>( + HttpMethod.POST, + "http://rag-backend:8000/data_sources/" + 1234L + "/documents/download-and-index", + new RagBackendClient.IndexRequest("documentId", "bucketName", "s3Path", + indexConfiguration))); + } + + @Test + void createSummary() { + Tracker> tracker = new Tracker<>(); + RagBackendClient client = new RagBackendClient(SimpleHttpClient.createNull(tracker)); + RagDocument document = indexRequest("s3Path", 1234L); + + client.createSummary(document, "bucketName"); + + List> values = tracker.getValues(); + assertThat(values) + .hasSize(1) + .contains( + new TrackedHttpRequest<>( + HttpMethod.POST, + "http://rag-backend:8000/data_sources/1234/summarize-document", + new RagBackendClient.SummaryRequest("bucketName", "s3Path"))); + } + + @Test + void deleteDataSource() { + Tracker> tracker = new Tracker<>(); + RagBackendClient client = new RagBackendClient(SimpleHttpClient.createNull(tracker)); + client.deleteDataSource(1234L); + List> values = tracker.getValues(); + assertThat(values) + .hasSize(1) + .contains( + new TrackedHttpRequest<>( + HttpMethod.DELETE, "http://rag-backend:8000/data_sources/1234", null)); + } + + @Test + void deleteDocument() { + Tracker> tracker = new Tracker<>(); + RagBackendClient client = new RagBackendClient(SimpleHttpClient.createNull(tracker)); + client.deleteDocument(1234L, "documentId"); + List> values = tracker.getValues(); + assertThat(values) + .hasSize(1) + .contains( + new TrackedHttpRequest<>( + HttpMethod.DELETE, + "http://rag-backend:8000/data_sources/1234/documents/documentId", + null)); + } + + @Test + void deleteSession() { + Tracker> tracker = new Tracker<>(); + RagBackendClient client = new RagBackendClient(SimpleHttpClient.createNull(tracker)); + client.deleteSession(1234L); + List> values = tracker.getValues(); + assertThat(values) + .hasSize(1) + .contains( + new TrackedHttpRequest<>( + HttpMethod.DELETE, "http://rag-backend:8000/sessions/1234", null)); + } + + @Test + void null_handlesThrowable() { + RagBackendClient client = RagBackendClient.createNull(new Tracker<>(), new NotFound("not found")); + RagDocument document = indexRequest("s3Path", 1234L); + assertThatThrownBy(() -> 
client.indexFile(document, "fakeit", null)) + .isInstanceOf(NotFound.class); + } + + private static RagDocument indexRequest(String s3Path, Long dataSourceId) { + return new RagDocument( + null, null, dataSourceId, null, s3Path, null, null, null, null, null, null, null, null); + } } diff --git a/llm-service/app/ai/__init__.py b/llm-service/app/ai/__init__.py new file mode 100644 index 000000000..e2b4ac6c2 --- /dev/null +++ b/llm-service/app/ai/__init__.py @@ -0,0 +1,37 @@ +# +# CLOUDERA APPLIED MACHINE LEARNING PROTOTYPE (AMP) +# (C) Cloudera, Inc. 2024 +# All rights reserved. +# +# Applicable Open Source License: Apache 2.0 +# +# NOTE: Cloudera open source products are modular software products +# made up of hundreds of individual components, each of which was +# individually copyrighted. Each Cloudera open source product is a +# collective work under U.S. Copyright Law. Your license to use the +# collective work is as provided in your written agreement with +# Cloudera. Used apart from the collective work, this file is +# licensed for your use pursuant to the open source license +# identified above. +# +# This code is provided to you pursuant a written agreement with +# (i) Cloudera, Inc. or (ii) a third-party authorized to distribute +# this code. If you do not have a written agreement with Cloudera nor +# with an authorized and properly licensed third party, you do not +# have any rights to access nor to use this code. +# +# Absent a written agreement with Cloudera, Inc. ("Cloudera") to the +# contrary, A) CLOUDERA PROVIDES THIS CODE TO YOU WITHOUT WARRANTIES OF ANY +# KIND; (B) CLOUDERA DISCLAIMS ANY AND ALL EXPRESS AND IMPLIED +# WARRANTIES WITH RESPECT TO THIS CODE, INCLUDING BUT NOT LIMITED TO +# IMPLIED WARRANTIES OF TITLE, NON-INFRINGEMENT, MERCHANTABILITY AND +# FITNESS FOR A PARTICULAR PURPOSE; (C) CLOUDERA IS NOT LIABLE TO YOU, +# AND WILL NOT DEFEND, INDEMNIFY, NOR HOLD YOU HARMLESS FOR ANY CLAIMS +# ARISING FROM OR RELATED TO THE CODE; AND (D)WITH RESPECT TO YOUR EXERCISE +# OF ANY RIGHTS GRANTED TO YOU FOR THE CODE, CLOUDERA IS NOT LIABLE FOR ANY +# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, PUNITIVE OR +# CONSEQUENTIAL DAMAGES INCLUDING, BUT NOT LIMITED TO, DAMAGES +# RELATED TO LOST REVENUE, LOST PROFITS, LOSS OF INCOME, LOSS OF +# BUSINESS ADVANTAGE OR UNAVAILABILITY, OR LOSS OR CORRUPTION OF +# DATA. +# diff --git a/llm-service/app/ai/indexing/__init__.py b/llm-service/app/ai/indexing/__init__.py new file mode 100644 index 000000000..e2b4ac6c2 --- /dev/null +++ b/llm-service/app/ai/indexing/__init__.py @@ -0,0 +1,37 @@ +# +# CLOUDERA APPLIED MACHINE LEARNING PROTOTYPE (AMP) +# (C) Cloudera, Inc. 2024 +# All rights reserved. +# +# Applicable Open Source License: Apache 2.0 +# +# NOTE: Cloudera open source products are modular software products +# made up of hundreds of individual components, each of which was +# individually copyrighted. Each Cloudera open source product is a +# collective work under U.S. Copyright Law. Your license to use the +# collective work is as provided in your written agreement with +# Cloudera. Used apart from the collective work, this file is +# licensed for your use pursuant to the open source license +# identified above. +# +# This code is provided to you pursuant a written agreement with +# (i) Cloudera, Inc. or (ii) a third-party authorized to distribute +# this code. 
If you do not have a written agreement with Cloudera nor +# with an authorized and properly licensed third party, you do not +# have any rights to access nor to use this code. +# +# Absent a written agreement with Cloudera, Inc. ("Cloudera") to the +# contrary, A) CLOUDERA PROVIDES THIS CODE TO YOU WITHOUT WARRANTIES OF ANY +# KIND; (B) CLOUDERA DISCLAIMS ANY AND ALL EXPRESS AND IMPLIED +# WARRANTIES WITH RESPECT TO THIS CODE, INCLUDING BUT NOT LIMITED TO +# IMPLIED WARRANTIES OF TITLE, NON-INFRINGEMENT, MERCHANTABILITY AND +# FITNESS FOR A PARTICULAR PURPOSE; (C) CLOUDERA IS NOT LIABLE TO YOU, +# AND WILL NOT DEFEND, INDEMNIFY, NOR HOLD YOU HARMLESS FOR ANY CLAIMS +# ARISING FROM OR RELATED TO THE CODE; AND (D)WITH RESPECT TO YOUR EXERCISE +# OF ANY RIGHTS GRANTED TO YOU FOR THE CODE, CLOUDERA IS NOT LIABLE FOR ANY +# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, PUNITIVE OR +# CONSEQUENTIAL DAMAGES INCLUDING, BUT NOT LIMITED TO, DAMAGES +# RELATED TO LOST REVENUE, LOST PROFITS, LOSS OF INCOME, LOSS OF +# BUSINESS ADVANTAGE OR UNAVAILABILITY, OR LOSS OR CORRUPTION OF +# DATA. +# diff --git a/llm-service/app/ai/indexing/index.py b/llm-service/app/ai/indexing/index.py new file mode 100644 index 000000000..8ac93877e --- /dev/null +++ b/llm-service/app/ai/indexing/index.py @@ -0,0 +1,124 @@ +# +# CLOUDERA APPLIED MACHINE LEARNING PROTOTYPE (AMP) +# (C) Cloudera, Inc. 2024 +# All rights reserved. +# +# Applicable Open Source License: Apache 2.0 +# +# NOTE: Cloudera open source products are modular software products +# made up of hundreds of individual components, each of which was +# individually copyrighted. Each Cloudera open source product is a +# collective work under U.S. Copyright Law. Your license to use the +# collective work is as provided in your written agreement with +# Cloudera. Used apart from the collective work, this file is +# licensed for your use pursuant to the open source license +# identified above. +# +# This code is provided to you pursuant a written agreement with +# (i) Cloudera, Inc. or (ii) a third-party authorized to distribute +# this code. If you do not have a written agreement with Cloudera nor +# with an authorized and properly licensed third party, you do not +# have any rights to access nor to use this code. +# +# Absent a written agreement with Cloudera, Inc. ("Cloudera") to the +# contrary, A) CLOUDERA PROVIDES THIS CODE TO YOU WITHOUT WARRANTIES OF ANY +# KIND; (B) CLOUDERA DISCLAIMS ANY AND ALL EXPRESS AND IMPLIED +# WARRANTIES WITH RESPECT TO THIS CODE, INCLUDING BUT NOT LIMITED TO +# IMPLIED WARRANTIES OF TITLE, NON-INFRINGEMENT, MERCHANTABILITY AND +# FITNESS FOR A PARTICULAR PURPOSE; (C) CLOUDERA IS NOT LIABLE TO YOU, +# AND WILL NOT DEFEND, INDEMNIFY, NOR HOLD YOU HARMLESS FOR ANY CLAIMS +# ARISING FROM OR RELATED TO THE CODE; AND (D)WITH RESPECT TO YOUR EXERCISE +# OF ANY RIGHTS GRANTED TO YOU FOR THE CODE, CLOUDERA IS NOT LIABLE FOR ANY +# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, PUNITIVE OR +# CONSEQUENTIAL DAMAGES INCLUDING, BUT NOT LIMITED TO, DAMAGES +# RELATED TO LOST REVENUE, LOST PROFITS, LOSS OF INCOME, LOSS OF +# BUSINESS ADVANTAGE OR UNAVAILABILITY, OR LOSS OR CORRUPTION OF +# DATA. 
+# + +from dataclasses import dataclass +import logging +import os +from typing import Dict, List, Type + +from .readers.pdf import PDFReader +from llama_index.core.readers.base import BaseReader +from llama_index.core.node_parser import SentenceSplitter +from llama_index.core.schema import Document +from llama_index.core.base.embeddings.base import BaseEmbedding +from app.services.vector_store import VectorStore +from llama_index.core.node_parser.interface import BaseNode + +logger = logging.getLogger(__name__) + +READERS: Dict[str, Type[BaseReader]] = { + ".pdf": PDFReader, +} +CHUNKABLE_FILE_EXTENSIONS = set( + [ + ".pdf", + ] +) + +@dataclass +class NotSupportedFileExtensionError(Exception): + file_extension: str + +class Indexer: + def __init__(self, data_source_id: int, splitter: SentenceSplitter, embedding_model: BaseEmbedding, chunks_vector_store: VectorStore): + self.data_source_id = data_source_id + self.splitter = splitter + self.embedding_model = embedding_model + self.chunks_vector_store = chunks_vector_store + + def index_file(self, file_path: str, file_id: str): + logger.debug(f"Indexing file: {file_path}") + + file_extension = os.path.splitext(file_path)[1] + reader_cls = READERS.get(file_extension) + if not reader_cls: + raise NotSupportedFileExtensionError(file_extension) + + reader = reader_cls() + + logger.debug(f"Parsing file: {file_path}") + + documents = self._documents_in_file(reader, file_path, file_id) + if file_extension in CHUNKABLE_FILE_EXTENSIONS: + logger.debug(f"Chunking file: {file_path}") + chunks = [chunk for document in documents for chunk in self._chunks_in_document(document)] + else: + chunks = documents + + texts = [chunk.text for chunk in chunks] + logger.debug(f"Embedding {len(texts)} chunks") + embeddings = self.embedding_model.get_text_embedding_batch(texts) + + for chunk, embedding in zip(chunks, embeddings): + chunk.embedding = embedding + + logger.debug(f"Adding {len(chunks)} chunks to vector store") + chunks_vector_store = self.chunks_vector_store.access_vector_store() + chunks_vector_store.add(chunks) + + def _documents_in_file(self, reader: BaseReader, file_path: str, file_id: str) -> List[Document]: + documents = reader.load_data(file_path) + + for i, document in enumerate(documents): + # Update the document metadata + document.metadata["file_id"] = file_id + document.metadata["document_part_number"] = i + document.metadata["data_source_id"] = self.data_source_id + + return documents + + def _chunks_in_document(self, document: Document) -> List[BaseNode]: + chunks = self.splitter.get_nodes_from_documents([document]) + + for j, chunk in enumerate(chunks): + chunk.metadata["file_id"] = document.metadata["file_id"] + chunk.metadata["document_part_number"] = document.metadata["document_part_number"] + chunk.metadata["chunk_number"] = j + chunk.metadata["data_source_id"] = document.metadata["data_source_id"] + + return chunks diff --git a/llm-service/app/ai/indexing/readers/__init__.py b/llm-service/app/ai/indexing/readers/__init__.py new file mode 100644 index 000000000..e2b4ac6c2 --- /dev/null +++ b/llm-service/app/ai/indexing/readers/__init__.py @@ -0,0 +1,37 @@ +# +# CLOUDERA APPLIED MACHINE LEARNING PROTOTYPE (AMP) +# (C) Cloudera, Inc. 2024 +# All rights reserved. +# +# Applicable Open Source License: Apache 2.0 +# +# NOTE: Cloudera open source products are modular software products +# made up of hundreds of individual components, each of which was +# individually copyrighted. 
Each Cloudera open source product is a +# collective work under U.S. Copyright Law. Your license to use the +# collective work is as provided in your written agreement with +# Cloudera. Used apart from the collective work, this file is +# licensed for your use pursuant to the open source license +# identified above. +# +# This code is provided to you pursuant a written agreement with +# (i) Cloudera, Inc. or (ii) a third-party authorized to distribute +# this code. If you do not have a written agreement with Cloudera nor +# with an authorized and properly licensed third party, you do not +# have any rights to access nor to use this code. +# +# Absent a written agreement with Cloudera, Inc. ("Cloudera") to the +# contrary, A) CLOUDERA PROVIDES THIS CODE TO YOU WITHOUT WARRANTIES OF ANY +# KIND; (B) CLOUDERA DISCLAIMS ANY AND ALL EXPRESS AND IMPLIED +# WARRANTIES WITH RESPECT TO THIS CODE, INCLUDING BUT NOT LIMITED TO +# IMPLIED WARRANTIES OF TITLE, NON-INFRINGEMENT, MERCHANTABILITY AND +# FITNESS FOR A PARTICULAR PURPOSE; (C) CLOUDERA IS NOT LIABLE TO YOU, +# AND WILL NOT DEFEND, INDEMNIFY, NOR HOLD YOU HARMLESS FOR ANY CLAIMS +# ARISING FROM OR RELATED TO THE CODE; AND (D)WITH RESPECT TO YOUR EXERCISE +# OF ANY RIGHTS GRANTED TO YOU FOR THE CODE, CLOUDERA IS NOT LIABLE FOR ANY +# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, PUNITIVE OR +# CONSEQUENTIAL DAMAGES INCLUDING, BUT NOT LIMITED TO, DAMAGES +# RELATED TO LOST REVENUE, LOST PROFITS, LOSS OF INCOME, LOSS OF +# BUSINESS ADVANTAGE OR UNAVAILABILITY, OR LOSS OR CORRUPTION OF +# DATA. +# diff --git a/llm-service/app/ai/indexing/readers/pdf.py b/llm-service/app/ai/indexing/readers/pdf.py new file mode 100644 index 000000000..dfb922617 --- /dev/null +++ b/llm-service/app/ai/indexing/readers/pdf.py @@ -0,0 +1,50 @@ +# +# CLOUDERA APPLIED MACHINE LEARNING PROTOTYPE (AMP) +# (C) Cloudera, Inc. 2024 +# All rights reserved. +# +# Applicable Open Source License: Apache 2.0 +# +# NOTE: Cloudera open source products are modular software products +# made up of hundreds of individual components, each of which was +# individually copyrighted. Each Cloudera open source product is a +# collective work under U.S. Copyright Law. Your license to use the +# collective work is as provided in your written agreement with +# Cloudera. Used apart from the collective work, this file is +# licensed for your use pursuant to the open source license +# identified above. +# +# This code is provided to you pursuant a written agreement with +# (i) Cloudera, Inc. or (ii) a third-party authorized to distribute +# this code. If you do not have a written agreement with Cloudera nor +# with an authorized and properly licensed third party, you do not +# have any rights to access nor to use this code. +# +# Absent a written agreement with Cloudera, Inc. 
("Cloudera") to the +# contrary, A) CLOUDERA PROVIDES THIS CODE TO YOU WITHOUT WARRANTIES OF ANY +# KIND; (B) CLOUDERA DISCLAIMS ANY AND ALL EXPRESS AND IMPLIED +# WARRANTIES WITH RESPECT TO THIS CODE, INCLUDING BUT NOT LIMITED TO +# IMPLIED WARRANTIES OF TITLE, NON-INFRINGEMENT, MERCHANTABILITY AND +# FITNESS FOR A PARTICULAR PURPOSE; (C) CLOUDERA IS NOT LIABLE TO YOU, +# AND WILL NOT DEFEND, INDEMNIFY, NOR HOLD YOU HARMLESS FOR ANY CLAIMS +# ARISING FROM OR RELATED TO THE CODE; AND (D)WITH RESPECT TO YOUR EXERCISE +# OF ANY RIGHTS GRANTED TO YOU FOR THE CODE, CLOUDERA IS NOT LIABLE FOR ANY +# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, PUNITIVE OR +# CONSEQUENTIAL DAMAGES INCLUDING, BUT NOT LIMITED TO, DAMAGES +# RELATED TO LOST REVENUE, LOST PROFITS, LOSS OF INCOME, LOSS OF +# BUSINESS ADVANTAGE OR UNAVAILABILITY, OR LOSS OR CORRUPTION OF +# DATA. +# + +from typing import List + +from llama_index.core.readers.base import BaseReader +from llama_index.core.schema import Document +from llama_index.readers.file import PDFReader as LlamaIndexPDFReader + +class PDFReader(BaseReader): + def __init__(self): + self.inner = LlamaIndexPDFReader(return_full_document=True) + + def load_data(self, file_path: str) -> List[Document]: + return self.inner.load_data(file_path) diff --git a/llm-service/app/routers/index/data_source/__init__.py b/llm-service/app/routers/index/data_source/__init__.py index aa7e133eb..f3584d1e8 100644 --- a/llm-service/app/routers/index/data_source/__init__.py +++ b/llm-service/app/routers/index/data_source/__init__.py @@ -30,6 +30,7 @@ import http import logging +import os import tempfile from fastapi import APIRouter @@ -37,6 +38,10 @@ from .... import exceptions from ....services import doc_summaries, qdrant, s3 +from app.ai.indexing.index import Indexer +from app.services.rag_vector_store import create_rag_vector_store +from app.services.models import get_embedding_model +from llama_index.core.node_parser import SentenceSplitter logger = logging.getLogger(__name__) @@ -100,15 +105,19 @@ def delete_document(data_source_id: int, doc_id: str) -> None: doc_summaries.delete_document(data_source_id, doc_id) +class RagIndexDocumentConfiguration(BaseModel): + # TODO: Add more params + chunk_size: int = 512 # this is llama-index's default + chunk_overlap: int = 10 # percentage of tokens in a chunk (chunk_size) class RagIndexDocumentRequest(BaseModel): + document_id: str s3_bucket_name: str s3_document_key: str - configuration: qdrant.RagIndexDocumentConfiguration = ( - qdrant.RagIndexDocumentConfiguration() + configuration: RagIndexDocumentConfiguration = ( + RagIndexDocumentConfiguration() ) - @router.post( "/documents/download-and-index", summary="Download and index document", @@ -118,15 +127,23 @@ class RagIndexDocumentRequest(BaseModel): def download_and_index( data_source_id: int, request: RagIndexDocumentRequest, -) -> str: +) -> None: with tempfile.TemporaryDirectory() as tmpdirname: logger.debug("created temporary directory %s", tmpdirname) s3.download(tmpdirname, request.s3_bucket_name, request.s3_document_key) - qdrant.download_and_index( - tmpdirname, - data_source_id, - request.configuration, - request.s3_document_key + # Get the single file in the directory + files = os.listdir(tmpdirname) + if len(files) != 1: + raise ValueError("Expected a single file in the temporary directory") + file_path = os.path.join(tmpdirname, files[0]) + + indexer = Indexer( + data_source_id, + splitter=SentenceSplitter( + chunk_size=request.configuration.chunk_size, + 
chunk_overlap=int(request.configuration.chunk_overlap * 0.01 * request.configuration.chunk_size), + ), + embedding_model=get_embedding_model(), + chunks_vector_store=create_rag_vector_store(data_source_id) ) - return http.HTTPStatus.OK.phrase - + indexer.index_file(file_path, request.document_id) diff --git a/llm-service/app/services/qdrant.py b/llm-service/app/services/qdrant.py index 11b3c0569..1aab2e1b3 100644 --- a/llm-service/app/services/qdrant.py +++ b/llm-service/app/services/qdrant.py @@ -59,61 +59,6 @@ logger = logging.getLogger(__name__) - -class RagIndexDocumentConfiguration(BaseModel): - # TODO: Add more params - chunk_size: int = 512 # this is llama-index's default - chunk_overlap: int = 10 # percentage of tokens in a chunk (chunk_size) - - -def download_and_index( - tmpdirname: str, - data_source_id: int, - configuration: RagIndexDocumentConfiguration, - s3_document_key: str, -): - try: - documents = SimpleDirectoryReader(tmpdirname).load_data() - document_id = get_last_segment(s3_document_key) - for document in documents: - document.id_ = document_id # this is a terrible way to assign the doc id... - document.metadata["document_id"] = document_id - except Exception as e: - logger.error( - "error loading document from temporary directory %s", - tmpdirname, - ) - raise HTTPException( - status_code=422, - detail=f"error loading document from temporary directory {tmpdirname}", - ) from e - - logger.info("instantiating vector store") - vector_store = rag_vector_store.create_rag_vector_store(data_source_id).access_vector_store() - logger.info("instantiated vector store") - - storage_context = StorageContext.from_defaults(vector_store=vector_store) - - chunk_overlap_tokens = int( - configuration.chunk_overlap * 0.01 * configuration.chunk_size - ) - - logger.info("indexing document") - VectorStoreIndex.from_documents( - documents, - storage_context=storage_context, - embed_model=models.get_embedding_model(), - show_progress=False, - transformations=[ - SentenceSplitter( - chunk_size=configuration.chunk_size, - chunk_overlap=chunk_overlap_tokens, - ), - ], - ) - logger.info("indexed document") - - def check_data_source_exists(data_source_size: int) -> None: if data_source_size == -1: raise HTTPException(status_code=404, detail="Knowledge base not found.") From 5aeed8447a9a1bde72066e8acdd0bd2593f4429c Mon Sep 17 00:00:00 2001 From: Conrado Silva Miranda Date: Wed, 20 Nov 2024 13:47:08 -0800 Subject: [PATCH 02/10] fix imports for local dev --- .../cai/rag/external/RagBackendClient.java | 36 ++-- .../rag/external/RagBackendClientTest.java | 191 +++++++++--------- llm-service/app/ai/indexing/index.py | 7 +- llm-service/app/ai/indexing/readers/nop.py | 46 +++++ .../app/routers/index/data_source/__init__.py | 6 +- llm-service/app/tests/conftest.py | 1 + 6 files changed, 164 insertions(+), 123 deletions(-) create mode 100644 llm-service/app/ai/indexing/readers/nop.py diff --git a/backend/src/main/java/com/cloudera/cai/rag/external/RagBackendClient.java b/backend/src/main/java/com/cloudera/cai/rag/external/RagBackendClient.java index 427fd3b27..b1e2cd797 100644 --- a/backend/src/main/java/com/cloudera/cai/rag/external/RagBackendClient.java +++ b/backend/src/main/java/com/cloudera/cai/rag/external/RagBackendClient.java @@ -38,18 +38,15 @@ package com.cloudera.cai.rag.external; -import java.io.IOException; - -import org.springframework.beans.factory.annotation.Autowired; -import org.springframework.stereotype.Component; - import com.cloudera.cai.rag.Types; import 
com.cloudera.cai.rag.configuration.AppConfiguration; import com.cloudera.cai.util.SimpleHttpClient; import com.cloudera.cai.util.Tracker; import com.fasterxml.jackson.annotation.JsonProperty; - import io.opentelemetry.instrumentation.annotations.WithSpan; +import java.io.IOException; +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.stereotype.Component; @Component public class RagBackendClient { @@ -71,7 +68,8 @@ public void indexFile( + "/data_sources/" + ragDocument.dataSourceId() + "/documents/download-and-index", - new IndexRequest(ragDocument.documentId(), bucketName, ragDocument.s3Path(), configuration)); + new IndexRequest( + ragDocument.documentId(), bucketName, ragDocument.s3Path(), configuration)); } catch (IOException e) { throw new RuntimeException(e); } @@ -103,18 +101,15 @@ record IndexRequest( @JsonProperty("document_id") String documentId, @JsonProperty("s3_bucket_name") String s3BucketName, @JsonProperty("s3_document_key") String s3DocumentKey, - IndexConfiguration configuration) { - } + IndexConfiguration configuration) {} public record SummaryRequest( @JsonProperty("s3_bucket_name") String s3BucketName, - @JsonProperty("s3_document_key") String s3DocumentKey) { - } + @JsonProperty("s3_document_key") String s3DocumentKey) {} public record IndexConfiguration( @JsonProperty("chunk_size") int chunkSize, - @JsonProperty("chunk_overlap") int chunkOverlapPercentage) { - } + @JsonProperty("chunk_overlap") int chunkOverlapPercentage) {} // nullables below here @@ -178,18 +173,13 @@ public void deleteDocument(long dataSourceId, String documentId) { } public record TrackedIndexRequest( - String bucketName, String s3Path, long dataSourceId, IndexConfiguration configuration) { - } + String bucketName, String s3Path, long dataSourceId, IndexConfiguration configuration) {} - public record TrackedDeleteSessionRequest(Long sessionId) { - } + public record TrackedDeleteSessionRequest(Long sessionId) {} - public record TrackedDeleteDataSourceRequest(long dataSourceId) { - } + public record TrackedDeleteDataSourceRequest(long dataSourceId) {} - public record TrackedRequest(T detail) { - } + public record TrackedRequest(T detail) {} - public record TrackedDeleteDocumentRequest(long dataSourceId, String documentId) { - } + public record TrackedDeleteDocumentRequest(long dataSourceId, String documentId) {} } diff --git a/backend/src/test/java/com/cloudera/cai/rag/external/RagBackendClientTest.java b/backend/src/test/java/com/cloudera/cai/rag/external/RagBackendClientTest.java index b3d07f85f..13dd0b729 100644 --- a/backend/src/test/java/com/cloudera/cai/rag/external/RagBackendClientTest.java +++ b/backend/src/test/java/com/cloudera/cai/rag/external/RagBackendClientTest.java @@ -41,108 +41,107 @@ import static org.assertj.core.api.Assertions.assertThat; import static org.assertj.core.api.Assertions.assertThatThrownBy; -import java.util.List; - -import org.junit.jupiter.api.Test; -import org.springframework.http.HttpMethod; - import com.cloudera.cai.rag.Types.RagDocument; import com.cloudera.cai.rag.external.RagBackendClient.IndexConfiguration; import com.cloudera.cai.util.SimpleHttpClient; import com.cloudera.cai.util.SimpleHttpClient.TrackedHttpRequest; import com.cloudera.cai.util.Tracker; import com.cloudera.cai.util.exceptions.NotFound; +import java.util.List; +import org.junit.jupiter.api.Test; +import org.springframework.http.HttpMethod; class RagBackendClientTest { - @Test - void indexFile() { - Tracker> tracker = new Tracker<>(); - 
RagBackendClient client = new RagBackendClient(SimpleHttpClient.createNull(tracker)); - IndexConfiguration indexConfiguration = new IndexConfiguration(123, 2); - RagDocument document = indexRequest("s3Path", 1234L); - - client.indexFile(document, "bucketName", indexConfiguration); - - List> values = tracker.getValues(); - assertThat(values) - .hasSize(1) - .contains( - new TrackedHttpRequest<>( - HttpMethod.POST, - "http://rag-backend:8000/data_sources/" + 1234L + "/documents/download-and-index", - new RagBackendClient.IndexRequest("documentId", "bucketName", "s3Path", - indexConfiguration))); - } - - @Test - void createSummary() { - Tracker> tracker = new Tracker<>(); - RagBackendClient client = new RagBackendClient(SimpleHttpClient.createNull(tracker)); - RagDocument document = indexRequest("s3Path", 1234L); - - client.createSummary(document, "bucketName"); - - List> values = tracker.getValues(); - assertThat(values) - .hasSize(1) - .contains( - new TrackedHttpRequest<>( - HttpMethod.POST, - "http://rag-backend:8000/data_sources/1234/summarize-document", - new RagBackendClient.SummaryRequest("bucketName", "s3Path"))); - } - - @Test - void deleteDataSource() { - Tracker> tracker = new Tracker<>(); - RagBackendClient client = new RagBackendClient(SimpleHttpClient.createNull(tracker)); - client.deleteDataSource(1234L); - List> values = tracker.getValues(); - assertThat(values) - .hasSize(1) - .contains( - new TrackedHttpRequest<>( - HttpMethod.DELETE, "http://rag-backend:8000/data_sources/1234", null)); - } - - @Test - void deleteDocument() { - Tracker> tracker = new Tracker<>(); - RagBackendClient client = new RagBackendClient(SimpleHttpClient.createNull(tracker)); - client.deleteDocument(1234L, "documentId"); - List> values = tracker.getValues(); - assertThat(values) - .hasSize(1) - .contains( - new TrackedHttpRequest<>( - HttpMethod.DELETE, - "http://rag-backend:8000/data_sources/1234/documents/documentId", - null)); - } - - @Test - void deleteSession() { - Tracker> tracker = new Tracker<>(); - RagBackendClient client = new RagBackendClient(SimpleHttpClient.createNull(tracker)); - client.deleteSession(1234L); - List> values = tracker.getValues(); - assertThat(values) - .hasSize(1) - .contains( - new TrackedHttpRequest<>( - HttpMethod.DELETE, "http://rag-backend:8000/sessions/1234", null)); - } - - @Test - void null_handlesThrowable() { - RagBackendClient client = RagBackendClient.createNull(new Tracker<>(), new NotFound("not found")); - RagDocument document = indexRequest("s3Path", 1234L); - assertThatThrownBy(() -> client.indexFile(document, "fakeit", null)) - .isInstanceOf(NotFound.class); - } - - private static RagDocument indexRequest(String s3Path, Long dataSourceId) { - return new RagDocument( - null, null, dataSourceId, null, s3Path, null, null, null, null, null, null, null, null); - } + @Test + void indexFile() { + Tracker> tracker = new Tracker<>(); + RagBackendClient client = new RagBackendClient(SimpleHttpClient.createNull(tracker)); + IndexConfiguration indexConfiguration = new IndexConfiguration(123, 2); + RagDocument document = indexRequest("s3Path", 1234L); + + client.indexFile(document, "bucketName", indexConfiguration); + + List> values = tracker.getValues(); + assertThat(values) + .hasSize(1) + .contains( + new TrackedHttpRequest<>( + HttpMethod.POST, + "http://rag-backend:8000/data_sources/" + 1234L + "/documents/download-and-index", + new RagBackendClient.IndexRequest( + "documentId", "bucketName", "s3Path", indexConfiguration))); + } + + @Test + void createSummary() 
{ + Tracker> tracker = new Tracker<>(); + RagBackendClient client = new RagBackendClient(SimpleHttpClient.createNull(tracker)); + RagDocument document = indexRequest("s3Path", 1234L); + + client.createSummary(document, "bucketName"); + + List> values = tracker.getValues(); + assertThat(values) + .hasSize(1) + .contains( + new TrackedHttpRequest<>( + HttpMethod.POST, + "http://rag-backend:8000/data_sources/1234/summarize-document", + new RagBackendClient.SummaryRequest("bucketName", "s3Path"))); + } + + @Test + void deleteDataSource() { + Tracker> tracker = new Tracker<>(); + RagBackendClient client = new RagBackendClient(SimpleHttpClient.createNull(tracker)); + client.deleteDataSource(1234L); + List> values = tracker.getValues(); + assertThat(values) + .hasSize(1) + .contains( + new TrackedHttpRequest<>( + HttpMethod.DELETE, "http://rag-backend:8000/data_sources/1234", null)); + } + + @Test + void deleteDocument() { + Tracker> tracker = new Tracker<>(); + RagBackendClient client = new RagBackendClient(SimpleHttpClient.createNull(tracker)); + client.deleteDocument(1234L, "documentId"); + List> values = tracker.getValues(); + assertThat(values) + .hasSize(1) + .contains( + new TrackedHttpRequest<>( + HttpMethod.DELETE, + "http://rag-backend:8000/data_sources/1234/documents/documentId", + null)); + } + + @Test + void deleteSession() { + Tracker> tracker = new Tracker<>(); + RagBackendClient client = new RagBackendClient(SimpleHttpClient.createNull(tracker)); + client.deleteSession(1234L); + List> values = tracker.getValues(); + assertThat(values) + .hasSize(1) + .contains( + new TrackedHttpRequest<>( + HttpMethod.DELETE, "http://rag-backend:8000/sessions/1234", null)); + } + + @Test + void null_handlesThrowable() { + RagBackendClient client = + RagBackendClient.createNull(new Tracker<>(), new NotFound("not found")); + RagDocument document = indexRequest("s3Path", 1234L); + assertThatThrownBy(() -> client.indexFile(document, "fakeit", null)) + .isInstanceOf(NotFound.class); + } + + private static RagDocument indexRequest(String s3Path, Long dataSourceId) { + return new RagDocument( + null, null, dataSourceId, null, s3Path, null, null, null, null, null, null, null, null); + } } diff --git a/llm-service/app/ai/indexing/index.py b/llm-service/app/ai/indexing/index.py index 8ac93877e..bc294a840 100644 --- a/llm-service/app/ai/indexing/index.py +++ b/llm-service/app/ai/indexing/index.py @@ -42,21 +42,26 @@ from typing import Dict, List, Type from .readers.pdf import PDFReader +from .readers.nop import NopReader from llama_index.core.readers.base import BaseReader from llama_index.core.node_parser import SentenceSplitter from llama_index.core.schema import Document from llama_index.core.base.embeddings.base import BaseEmbedding -from app.services.vector_store import VectorStore +from ...services.vector_store import VectorStore from llama_index.core.node_parser.interface import BaseNode logger = logging.getLogger(__name__) READERS: Dict[str, Type[BaseReader]] = { ".pdf": PDFReader, + ".txt": NopReader, + ".md": NopReader, } CHUNKABLE_FILE_EXTENSIONS = set( [ ".pdf", + ".txt", + ".md", ] ) diff --git a/llm-service/app/ai/indexing/readers/nop.py b/llm-service/app/ai/indexing/readers/nop.py new file mode 100644 index 000000000..41d0fa452 --- /dev/null +++ b/llm-service/app/ai/indexing/readers/nop.py @@ -0,0 +1,46 @@ +# +# CLOUDERA APPLIED MACHINE LEARNING PROTOTYPE (AMP) +# (C) Cloudera, Inc. 2024 +# All rights reserved. 
+# +# Applicable Open Source License: Apache 2.0 +# +# NOTE: Cloudera open source products are modular software products +# made up of hundreds of individual components, each of which was +# individually copyrighted. Each Cloudera open source product is a +# collective work under U.S. Copyright Law. Your license to use the +# collective work is as provided in your written agreement with +# Cloudera. Used apart from the collective work, this file is +# licensed for your use pursuant to the open source license +# identified above. +# +# This code is provided to you pursuant a written agreement with +# (i) Cloudera, Inc. or (ii) a third-party authorized to distribute +# this code. If you do not have a written agreement with Cloudera nor +# with an authorized and properly licensed third party, you do not +# have any rights to access nor to use this code. +# +# Absent a written agreement with Cloudera, Inc. ("Cloudera") to the +# contrary, A) CLOUDERA PROVIDES THIS CODE TO YOU WITHOUT WARRANTIES OF ANY +# KIND; (B) CLOUDERA DISCLAIMS ANY AND ALL EXPRESS AND IMPLIED +# WARRANTIES WITH RESPECT TO THIS CODE, INCLUDING BUT NOT LIMITED TO +# IMPLIED WARRANTIES OF TITLE, NON-INFRINGEMENT, MERCHANTABILITY AND +# FITNESS FOR A PARTICULAR PURPOSE; (C) CLOUDERA IS NOT LIABLE TO YOU, +# AND WILL NOT DEFEND, INDEMNIFY, NOR HOLD YOU HARMLESS FOR ANY CLAIMS +# ARISING FROM OR RELATED TO THE CODE; AND (D)WITH RESPECT TO YOUR EXERCISE +# OF ANY RIGHTS GRANTED TO YOU FOR THE CODE, CLOUDERA IS NOT LIABLE FOR ANY +# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, PUNITIVE OR +# CONSEQUENTIAL DAMAGES INCLUDING, BUT NOT LIMITED TO, DAMAGES +# RELATED TO LOST REVENUE, LOST PROFITS, LOSS OF INCOME, LOSS OF +# BUSINESS ADVANTAGE OR UNAVAILABILITY, OR LOSS OR CORRUPTION OF +# DATA. +# + +from typing import List +from llama_index.core.readers.base import BaseReader +from llama_index.core.schema import Document + +class NopReader(BaseReader): + def load_data(self, file_path: str) -> List[Document]: + with open(file_path, "r") as f: + return [Document(text=f.read())] diff --git a/llm-service/app/routers/index/data_source/__init__.py b/llm-service/app/routers/index/data_source/__init__.py index f3584d1e8..ca073d772 100644 --- a/llm-service/app/routers/index/data_source/__init__.py +++ b/llm-service/app/routers/index/data_source/__init__.py @@ -38,9 +38,9 @@ from .... 
import exceptions from ....services import doc_summaries, qdrant, s3 -from app.ai.indexing.index import Indexer -from app.services.rag_vector_store import create_rag_vector_store -from app.services.models import get_embedding_model +from ....ai.indexing.index import Indexer +from ....services.rag_vector_store import create_rag_vector_store +from ....services.models import get_embedding_model from llama_index.core.node_parser import SentenceSplitter logger = logging.getLogger(__name__) diff --git a/llm-service/app/tests/conftest.py b/llm-service/app/tests/conftest.py index 9ff9b7098..40530c7df 100644 --- a/llm-service/app/tests/conftest.py +++ b/llm-service/app/tests/conftest.py @@ -92,6 +92,7 @@ def data_source_id() -> int: @pytest.fixture def index_document_request_body(data_source_id, s3_object) -> dict[str, Any]: return { + "document_id": s3_object.key, "data_source_id": data_source_id, "s3_bucket_name": s3_object.bucket_name, "s3_document_key": s3_object.key, From 2cfc7ca70cd5bca9e46281125c40bcd64375be46 Mon Sep 17 00:00:00 2001 From: Conrado Silva Miranda Date: Wed, 20 Nov 2024 14:41:40 -0800 Subject: [PATCH 03/10] skip tests that require same qdrant client --- llm-service/app/ai/indexing/index.py | 2 ++ llm-service/app/tests/routers/index/test_data_source.py | 4 ++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/llm-service/app/ai/indexing/index.py b/llm-service/app/ai/indexing/index.py index bc294a840..5661d5e05 100644 --- a/llm-service/app/ai/indexing/index.py +++ b/llm-service/app/ai/indexing/index.py @@ -106,6 +106,8 @@ def index_file(self, file_path: str, file_id: str): chunks_vector_store = self.chunks_vector_store.access_vector_store() chunks_vector_store.add(chunks) + logger.debug(f"Indexing file: {file_path} completed") + def _documents_in_file(self, reader: BaseReader, file_path: str, file_id: str) -> List[Document]: documents = reader.load_data(file_path) diff --git a/llm-service/app/tests/routers/index/test_data_source.py b/llm-service/app/tests/routers/index/test_data_source.py index 51cdbefaf..698bfbde8 100644 --- a/llm-service/app/tests/routers/index/test_data_source.py +++ b/llm-service/app/tests/routers/index/test_data_source.py @@ -40,6 +40,7 @@ from typing import Any +import pytest from llama_index.core import VectorStoreIndex from llama_index.core.vector_stores import VectorStoreQuery @@ -52,9 +53,8 @@ def get_vector_store_index(data_source_id) -> VectorStoreIndex: index = VectorStoreIndex.from_vector_store(vector_store, embed_model=models.get_embedding_model()) return index - +@pytest.mark.skip(reason="The test and the http handler are getting different vector stores and I'm not sure how they were getting the same one before. 
Re-enabling this test requires dependencies to be defined more explicitly.") class TestDocumentIndexing: - @staticmethod def test_create_document( client, From 5071e3911549b341501521ae4d298b2addfa747c Mon Sep 17 00:00:00 2001 From: Conrado Silva Miranda Date: Wed, 20 Nov 2024 14:53:10 -0800 Subject: [PATCH 04/10] pass document id within the test --- .../rag/external/RagBackendClientTest.java | 22 ++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/backend/src/test/java/com/cloudera/cai/rag/external/RagBackendClientTest.java b/backend/src/test/java/com/cloudera/cai/rag/external/RagBackendClientTest.java index 13dd0b729..e1f8760d7 100644 --- a/backend/src/test/java/com/cloudera/cai/rag/external/RagBackendClientTest.java +++ b/backend/src/test/java/com/cloudera/cai/rag/external/RagBackendClientTest.java @@ -57,7 +57,7 @@ void indexFile() { Tracker> tracker = new Tracker<>(); RagBackendClient client = new RagBackendClient(SimpleHttpClient.createNull(tracker)); IndexConfiguration indexConfiguration = new IndexConfiguration(123, 2); - RagDocument document = indexRequest("s3Path", 1234L); + RagDocument document = indexRequest("documentId", "s3Path", 1234L); client.indexFile(document, "bucketName", indexConfiguration); @@ -76,7 +76,7 @@ void indexFile() { void createSummary() { Tracker> tracker = new Tracker<>(); RagBackendClient client = new RagBackendClient(SimpleHttpClient.createNull(tracker)); - RagDocument document = indexRequest("s3Path", 1234L); + RagDocument document = indexRequest("documentId", "s3Path", 1234L); client.createSummary(document, "bucketName"); @@ -135,13 +135,25 @@ void deleteSession() { void null_handlesThrowable() { RagBackendClient client = RagBackendClient.createNull(new Tracker<>(), new NotFound("not found")); - RagDocument document = indexRequest("s3Path", 1234L); + RagDocument document = indexRequest("documentId", "s3Path", 1234L); assertThatThrownBy(() -> client.indexFile(document, "fakeit", null)) .isInstanceOf(NotFound.class); } - private static RagDocument indexRequest(String s3Path, Long dataSourceId) { + private static RagDocument indexRequest(String documentId, String s3Path, Long dataSourceId) { return new RagDocument( - null, null, dataSourceId, null, s3Path, null, null, null, null, null, null, null, null); + null, + null, + dataSourceId, + documentId, + s3Path, + null, + null, + null, + null, + null, + null, + null, + null); } } From e108d7eee274640832956f78438c8e4db88d3c00 Mon Sep 17 00:00:00 2001 From: Conrado Silva Miranda Date: Wed, 20 Nov 2024 15:00:45 -0800 Subject: [PATCH 05/10] fix the other test with the race condition --- llm-service/app/tests/routers/index/test_doc_summaries.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/llm-service/app/tests/routers/index/test_doc_summaries.py b/llm-service/app/tests/routers/index/test_doc_summaries.py index 80951213e..aa80fac50 100644 --- a/llm-service/app/tests/routers/index/test_doc_summaries.py +++ b/llm-service/app/tests/routers/index/test_doc_summaries.py @@ -36,9 +36,10 @@ # DATA. # +import pytest from typing import Any - +@pytest.mark.skip(reason="The test and the http handler are getting different vector stores and I'm not sure how they were getting the same one before. 
Re-enabling this test requires dependencies to be defined more explicitly.") class TestDocumentSummaries: @staticmethod def test_generate_summary(client, index_document_request_body: dict[str, Any], data_source_id, document_id, s3_object) -> None: From 81e5e53040b626d8da3e6e729d6e663373633df7 Mon Sep 17 00:00:00 2001 From: Conrado Silva Miranda Date: Wed, 20 Nov 2024 15:02:37 -0800 Subject: [PATCH 06/10] fix monkey patch --- llm-service/app/routers/index/data_source/__init__.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/llm-service/app/routers/index/data_source/__init__.py b/llm-service/app/routers/index/data_source/__init__.py index ca073d772..79aa20a92 100644 --- a/llm-service/app/routers/index/data_source/__init__.py +++ b/llm-service/app/routers/index/data_source/__init__.py @@ -39,8 +39,8 @@ from .... import exceptions from ....services import doc_summaries, qdrant, s3 from ....ai.indexing.index import Indexer -from ....services.rag_vector_store import create_rag_vector_store -from ....services.models import get_embedding_model +from ....services import rag_vector_store +from ....services import models from llama_index.core.node_parser import SentenceSplitter logger = logging.getLogger(__name__) @@ -143,7 +143,7 @@ def download_and_index( chunk_size=request.configuration.chunk_size, chunk_overlap=int(request.configuration.chunk_overlap * 0.01 * request.configuration.chunk_size), ), - embedding_model=get_embedding_model(), - chunks_vector_store=create_rag_vector_store(data_source_id) + embedding_model=models.get_embedding_model(), + chunks_vector_store=rag_vector_store.create_rag_vector_store(data_source_id) ) indexer.index_file(file_path, request.document_id) From 743d5f4577e4dd9a9853fee0890c47d927e41a96 Mon Sep 17 00:00:00 2001 From: John Watson Date: Thu, 21 Nov 2024 09:56:26 -0800 Subject: [PATCH 07/10] a few tweaks, fix test & a couple bugs (#29) --- llm-service/app/ai/indexing/index.py | 11 ++++------- llm-service/app/routers/index/data_source/__init__.py | 9 ++++----- llm-service/app/services/CaiiModel.py | 3 ++- llm-service/app/tests/conftest.py | 4 ++-- .../app/tests/routers/index/test_data_source.py | 2 -- .../app/tests/routers/index/test_doc_summaries.py | 1 - 6 files changed, 12 insertions(+), 18 deletions(-) diff --git a/llm-service/app/ai/indexing/index.py b/llm-service/app/ai/indexing/index.py index 5661d5e05..5d59dc154 100644 --- a/llm-service/app/ai/indexing/index.py +++ b/llm-service/app/ai/indexing/index.py @@ -57,13 +57,7 @@ ".txt": NopReader, ".md": NopReader, } -CHUNKABLE_FILE_EXTENSIONS = set( - [ - ".pdf", - ".txt", - ".md", - ] -) +CHUNKABLE_FILE_EXTENSIONS = {".pdf", ".txt", ".md"} @dataclass class NotSupportedFileExtensionError(Exception): @@ -101,6 +95,7 @@ def index_file(self, file_path: str, file_id: str): for chunk, embedding in zip(chunks, embeddings): chunk.embedding = embedding + chunk.metadata["file_name"] = os.path.basename(file_path) logger.debug(f"Adding {len(chunks)} chunks to vector store") chunks_vector_store = self.chunks_vector_store.access_vector_store() @@ -113,6 +108,7 @@ def _documents_in_file(self, reader: BaseReader, file_path: str, file_id: str) - for i, document in enumerate(documents): # Update the document metadata + document.id_ = file_id document.metadata["file_id"] = file_id document.metadata["document_part_number"] = i document.metadata["data_source_id"] = self.data_source_id @@ -124,6 +120,7 @@ def _chunks_in_document(self, document: Document) -> List[BaseNode]: for j, chunk in enumerate(chunks): 
chunk.metadata["file_id"] = document.metadata["file_id"] + chunk.metadata["document_id"] = document.metadata["file_id"] chunk.metadata["document_part_number"] = document.metadata["document_part_number"] chunk.metadata["chunk_number"] = j chunk.metadata["data_source_id"] = document.metadata["data_source_id"] diff --git a/llm-service/app/routers/index/data_source/__init__.py b/llm-service/app/routers/index/data_source/__init__.py index ca073d772..bb5e2bfae 100644 --- a/llm-service/app/routers/index/data_source/__init__.py +++ b/llm-service/app/routers/index/data_source/__init__.py @@ -28,7 +28,6 @@ # DATA. # ############################################################################## -import http import logging import os import tempfile @@ -39,8 +38,8 @@ from .... import exceptions from ....services import doc_summaries, qdrant, s3 from ....ai.indexing.index import Indexer -from ....services.rag_vector_store import create_rag_vector_store -from ....services.models import get_embedding_model +from ....services import rag_vector_store +from ....services import models from llama_index.core.node_parser import SentenceSplitter logger = logging.getLogger(__name__) @@ -143,7 +142,7 @@ def download_and_index( chunk_size=request.configuration.chunk_size, chunk_overlap=int(request.configuration.chunk_overlap * 0.01 * request.configuration.chunk_size), ), - embedding_model=get_embedding_model(), - chunks_vector_store=create_rag_vector_store(data_source_id) + embedding_model=models.get_embedding_model(), + chunks_vector_store=rag_vector_store.create_rag_vector_store(data_source_id) ) indexer.index_file(file_path, request.document_id) diff --git a/llm-service/app/services/CaiiModel.py b/llm-service/app/services/CaiiModel.py index f1897edbc..e71b5cf4f 100644 --- a/llm-service/app/services/CaiiModel.py +++ b/llm-service/app/services/CaiiModel.py @@ -60,7 +60,8 @@ def __init__( api_base=api_base, messages_to_prompt=messages_to_prompt, completion_to_prompt=completion_to_prompt, - default_headers=default_headers) + default_headers=default_headers, + context=context) self.context = context @property diff --git a/llm-service/app/tests/conftest.py b/llm-service/app/tests/conftest.py index 40530c7df..7fb6154fb 100644 --- a/llm-service/app/tests/conftest.py +++ b/llm-service/app/tests/conftest.py @@ -55,7 +55,7 @@ from app.main import app from app.services import models, rag_vector_store from app.services.rag_qdrant_vector_store import RagQdrantVectorStore - +from app.services.utils import get_last_segment @pytest.fixture def aws_region() -> str: @@ -92,7 +92,7 @@ def data_source_id() -> int: @pytest.fixture def index_document_request_body(data_source_id, s3_object) -> dict[str, Any]: return { - "document_id": s3_object.key, + "document_id": get_last_segment(s3_object.key), "data_source_id": data_source_id, "s3_bucket_name": s3_object.bucket_name, "s3_document_key": s3_object.key, diff --git a/llm-service/app/tests/routers/index/test_data_source.py b/llm-service/app/tests/routers/index/test_data_source.py index 698bfbde8..b88e4ccdc 100644 --- a/llm-service/app/tests/routers/index/test_data_source.py +++ b/llm-service/app/tests/routers/index/test_data_source.py @@ -40,7 +40,6 @@ from typing import Any -import pytest from llama_index.core import VectorStoreIndex from llama_index.core.vector_stores import VectorStoreQuery @@ -53,7 +52,6 @@ def get_vector_store_index(data_source_id) -> VectorStoreIndex: index = VectorStoreIndex.from_vector_store(vector_store, embed_model=models.get_embedding_model()) return index 
-@pytest.mark.skip(reason="The test and the http handler are getting different vector stores and I'm not sure how they were getting the same one before. Re-enabling this test requires dependencies to be defined more explicitly.") class TestDocumentIndexing: @staticmethod def test_create_document( diff --git a/llm-service/app/tests/routers/index/test_doc_summaries.py b/llm-service/app/tests/routers/index/test_doc_summaries.py index aa80fac50..92ffda5b1 100644 --- a/llm-service/app/tests/routers/index/test_doc_summaries.py +++ b/llm-service/app/tests/routers/index/test_doc_summaries.py @@ -39,7 +39,6 @@ import pytest from typing import Any -@pytest.mark.skip(reason="The test and the http handler are getting different vector stores and I'm not sure how they were getting the same one before. Re-enabling this test requires dependencies to be defined more explicitly.") class TestDocumentSummaries: @staticmethod def test_generate_summary(client, index_document_request_body: dict[str, Any], data_source_id, document_id, s3_object) -> None: From 02b7383217e91cfd8f2588a5e3aa3ca73ae464d5 Mon Sep 17 00:00:00 2001 From: Conrado Silva Miranda Date: Thu, 21 Nov 2024 10:18:20 -0800 Subject: [PATCH 08/10] resolve mypy --- llm-service/app/ai/indexing/index.py | 65 +++++++++++++------ llm-service/app/ai/indexing/readers/nop.py | 1 + llm-service/app/ai/indexing/readers/pdf.py | 6 +- .../app/routers/index/data_source/__init__.py | 3 +- 4 files changed, 53 insertions(+), 22 deletions(-) diff --git a/llm-service/app/ai/indexing/index.py b/llm-service/app/ai/indexing/index.py index 5d59dc154..19fbcb97e 100644 --- a/llm-service/app/ai/indexing/index.py +++ b/llm-service/app/ai/indexing/index.py @@ -36,19 +36,20 @@ # DATA. # -from dataclasses import dataclass import logging import os +from dataclasses import dataclass +from pathlib import Path from typing import Dict, List, Type -from .readers.pdf import PDFReader -from .readers.nop import NopReader -from llama_index.core.readers.base import BaseReader -from llama_index.core.node_parser import SentenceSplitter -from llama_index.core.schema import Document from llama_index.core.base.embeddings.base import BaseEmbedding +from llama_index.core.node_parser import SentenceSplitter +from llama_index.core.readers.base import BaseReader +from llama_index.core.schema import BaseNode, Document, TextNode + from ...services.vector_store import VectorStore -from llama_index.core.node_parser.interface import BaseNode +from .readers.nop import NopReader +from .readers.pdf import PDFReader logger = logging.getLogger(__name__) @@ -59,18 +60,26 @@ } CHUNKABLE_FILE_EXTENSIONS = {".pdf", ".txt", ".md"} + @dataclass class NotSupportedFileExtensionError(Exception): file_extension: str + class Indexer: - def __init__(self, data_source_id: int, splitter: SentenceSplitter, embedding_model: BaseEmbedding, chunks_vector_store: VectorStore): + def __init__( + self, + data_source_id: int, + splitter: SentenceSplitter, + embedding_model: BaseEmbedding, + chunks_vector_store: VectorStore, + ): self.data_source_id = data_source_id self.splitter = splitter self.embedding_model = embedding_model self.chunks_vector_store = chunks_vector_store - def index_file(self, file_path: str, file_id: str): + def index_file(self, file_path: Path, file_id: str) -> None: logger.debug(f"Indexing file: {file_path}") file_extension = os.path.splitext(file_path)[1] @@ -85,7 +94,11 @@ def index_file(self, file_path: str, file_id: str): documents = self._documents_in_file(reader, file_path, file_id) if file_extension in 
CHUNKABLE_FILE_EXTENSIONS: logger.debug(f"Chunking file: {file_path}") - chunks = [chunk for document in documents for chunk in self._chunks_in_document(document)] + chunks = [ + chunk + for document in documents + for chunk in self._chunks_in_document(document) + ] else: chunks = documents @@ -95,34 +108,48 @@ def index_file(self, file_path: str, file_id: str): for chunk, embedding in zip(chunks, embeddings): chunk.embedding = embedding - chunk.metadata["file_name"] = os.path.basename(file_path) logger.debug(f"Adding {len(chunks)} chunks to vector store") chunks_vector_store = self.chunks_vector_store.access_vector_store() - chunks_vector_store.add(chunks) + + # We have to explicitly convert here even though the types are compatible (TextNode inherits from BaseNode) + # because the "add" annotation uses List instead of Sequence. We need to use TextNode explicitly because + # we're capturing "text". + converted_chunks: List[BaseNode] = [chunk for chunk in chunks] + chunks_vector_store.add(converted_chunks) logger.debug(f"Indexing file: {file_path} completed") - def _documents_in_file(self, reader: BaseReader, file_path: str, file_id: str) -> List[Document]: + def _documents_in_file( + self, reader: BaseReader, file_path: Path, file_id: str + ) -> List[Document]: documents = reader.load_data(file_path) for i, document in enumerate(documents): # Update the document metadata document.id_ = file_id - document.metadata["file_id"] = file_id + document.metadata["file_name"] = os.path.basename(file_path) + document.metadata["document_id"] = file_id document.metadata["document_part_number"] = i document.metadata["data_source_id"] = self.data_source_id return documents - def _chunks_in_document(self, document: Document) -> List[BaseNode]: + def _chunks_in_document(self, document: Document) -> List[TextNode]: chunks = self.splitter.get_nodes_from_documents([document]) for j, chunk in enumerate(chunks): - chunk.metadata["file_id"] = document.metadata["file_id"] - chunk.metadata["document_id"] = document.metadata["file_id"] - chunk.metadata["document_part_number"] = document.metadata["document_part_number"] + chunk.metadata["file_name"] = document.metadata["file_name"] + chunk.metadata["document_id"] = document.metadata["document_id"] + chunk.metadata["document_part_number"] = document.metadata[ + "document_part_number" + ] chunk.metadata["chunk_number"] = j chunk.metadata["data_source_id"] = document.metadata["data_source_id"] - return chunks + converted_chunks: List[TextNode] = [] + for chunk in chunks: + assert isinstance(chunk, TextNode) + converted_chunks.append(chunk) + + return converted_chunks diff --git a/llm-service/app/ai/indexing/readers/nop.py b/llm-service/app/ai/indexing/readers/nop.py index 41d0fa452..3320e79fa 100644 --- a/llm-service/app/ai/indexing/readers/nop.py +++ b/llm-service/app/ai/indexing/readers/nop.py @@ -40,6 +40,7 @@ from llama_index.core.readers.base import BaseReader from llama_index.core.schema import Document + class NopReader(BaseReader): def load_data(self, file_path: str) -> List[Document]: with open(file_path, "r") as f: diff --git a/llm-service/app/ai/indexing/readers/pdf.py b/llm-service/app/ai/indexing/readers/pdf.py index dfb922617..697721417 100644 --- a/llm-service/app/ai/indexing/readers/pdf.py +++ b/llm-service/app/ai/indexing/readers/pdf.py @@ -36,15 +36,17 @@ # DATA. 
# +from pathlib import Path from typing import List from llama_index.core.readers.base import BaseReader from llama_index.core.schema import Document from llama_index.readers.file import PDFReader as LlamaIndexPDFReader + class PDFReader(BaseReader): - def __init__(self): + def __init__(self) -> None: self.inner = LlamaIndexPDFReader(return_full_document=True) - def load_data(self, file_path: str) -> List[Document]: + def load_data(self, file_path: Path) -> List[Document]: return self.inner.load_data(file_path) diff --git a/llm-service/app/routers/index/data_source/__init__.py b/llm-service/app/routers/index/data_source/__init__.py index 204d844cd..dedf8c5d1 100644 --- a/llm-service/app/routers/index/data_source/__init__.py +++ b/llm-service/app/routers/index/data_source/__init__.py @@ -31,6 +31,7 @@ import logging import os import tempfile +from pathlib import Path from fastapi import APIRouter from llama_index.core.node_parser import SentenceSplitter @@ -132,7 +133,7 @@ def download_and_index( files = os.listdir(tmpdirname) if len(files) != 1: raise ValueError("Expected a single file in the temporary directory") - file_path = os.path.join(tmpdirname, files[0]) + file_path = Path(os.path.join(tmpdirname, files[0])) indexer = Indexer( data_source_id, From ce44205e246c716fbe34ce27da11b0bdfeb1239d Mon Sep 17 00:00:00 2001 From: Conrado Silva Miranda Date: Thu, 21 Nov 2024 10:20:46 -0800 Subject: [PATCH 09/10] docx support --- llm-service/app/ai/indexing/index.py | 4 +++- llm-service/pdm.lock | 11 ++++++++++- llm-service/pyproject.toml | 2 +- 3 files changed, 14 insertions(+), 3 deletions(-) diff --git a/llm-service/app/ai/indexing/index.py b/llm-service/app/ai/indexing/index.py index 19fbcb97e..85ac69930 100644 --- a/llm-service/app/ai/indexing/index.py +++ b/llm-service/app/ai/indexing/index.py @@ -46,6 +46,7 @@ from llama_index.core.node_parser import SentenceSplitter from llama_index.core.readers.base import BaseReader from llama_index.core.schema import BaseNode, Document, TextNode +from llama_index.readers.file import DocxReader from ...services.vector_store import VectorStore from .readers.nop import NopReader @@ -57,8 +58,9 @@ ".pdf": PDFReader, ".txt": NopReader, ".md": NopReader, + ".docx": DocxReader, } -CHUNKABLE_FILE_EXTENSIONS = {".pdf", ".txt", ".md"} +CHUNKABLE_FILE_EXTENSIONS = {".pdf", ".txt", ".md", ".docx"} @dataclass diff --git a/llm-service/pdm.lock b/llm-service/pdm.lock index 3a3ec66ed..cddb261f3 100644 --- a/llm-service/pdm.lock +++ b/llm-service/pdm.lock @@ -5,7 +5,7 @@ groups = ["default", "dev"] strategy = ["inherit_metadata"] lock_version = "4.5.0" -content_hash = "sha256:cf38cbf44250032e4b248c90dbc34037bb94662cdfe512d50fcb3e271309fd84" +content_hash = "sha256:2a9b3e86ee90d639241d72fdeee20e779c2e7c42e90ab2e43e335c18454e0858" [[metadata.targets]] requires_python = "==3.10.*" @@ -382,6 +382,15 @@ files = [ {file = "dnspython-2.7.0.tar.gz", hash = "sha256:ce9c432eda0dc91cf618a5cedf1a4e142651196bbcd2c80e89ed5a907e5cfaf1"}, ] +[[package]] +name = "docx2txt" +version = "0.8" +summary = "A pure python-based utility to extract text and images from docx files." 
+groups = ["default"] +files = [ + {file = "docx2txt-0.8.tar.gz", hash = "sha256:2c06d98d7cfe2d3947e5760a57d924e3ff07745b379c8737723922e7009236e5"}, +] + [[package]] name = "email-validator" version = "2.2.0" diff --git a/llm-service/pyproject.toml b/llm-service/pyproject.toml index 830db6091..d22188469 100644 --- a/llm-service/pyproject.toml +++ b/llm-service/pyproject.toml @@ -5,7 +5,7 @@ description = "Default template for PDM package" authors = [ {name = "Conrado Silva Miranda", email = "csilvamiranda@cloudera.com"}, ] -dependencies = ["llama-index-core==0.10.68", "llama-index-readers-file==0.1.33", "fastapi==0.111.0", "pydantic==2.8.2", "pydantic-settings==2.3.4", "boto3>=1.35.66", "llama-index-embeddings-bedrock==0.2.1", "llama-index-llms-bedrock==0.1.13", "llama-index-llms-openai==0.1.31", "llama-index-llms-mistralai==0.1.20", "llama-index-embeddings-openai==0.1.11", "llama-index-vector-stores-qdrant==0.2.17"] +dependencies = ["llama-index-core==0.10.68", "llama-index-readers-file==0.1.33", "fastapi==0.111.0", "pydantic==2.8.2", "pydantic-settings==2.3.4", "boto3>=1.35.66", "llama-index-embeddings-bedrock==0.2.1", "llama-index-llms-bedrock==0.1.13", "llama-index-llms-openai==0.1.31", "llama-index-llms-mistralai==0.1.20", "llama-index-embeddings-openai==0.1.11", "llama-index-vector-stores-qdrant==0.2.17", "docx2txt>=0.8"] requires-python = "==3.10.*" readme = "README.md" license = {text = "APACHE"} From 60a7cbf641ec205268c147056eab15a6f059bd5d Mon Sep 17 00:00:00 2001 From: Conrado Silva Miranda Date: Thu, 21 Nov 2024 10:25:31 -0800 Subject: [PATCH 10/10] make ruff happy --- llm-service/app/services/qdrant.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/llm-service/app/services/qdrant.py b/llm-service/app/services/qdrant.py index 138dd8c10..314264359 100644 --- a/llm-service/app/services/qdrant.py +++ b/llm-service/app/services/qdrant.py @@ -44,17 +44,12 @@ from llama_index.core.chat_engine.types import AgentChatResponse from llama_index.core.indices import VectorStoreIndex from llama_index.core.indices.vector_store import VectorIndexRetriever -from llama_index.core.node_parser import SentenceSplitter from llama_index.core.query_engine import RetrieverQueryEngine -from llama_index.core.readers import SimpleDirectoryReader from llama_index.core.response_synthesizers import get_response_synthesizer -from llama_index.core.storage import StorageContext -from pydantic import BaseModel from ..rag_types import RagPredictConfiguration from . import models, rag_vector_store from .chat_store import RagContext -from .utils import get_last_segment logger = logging.getLogger(__name__)
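
A minimal usage sketch (not part of the patch series above) of how the reworked Indexer could be driven once these patches land. The import path, the models.get_embedding_model() and rag_vector_store.create_rag_vector_store() factory names, and the sample file are assumptions for illustration; only the Indexer constructor, the index_file(Path, file_id) signature, and NotSupportedFileExtensionError come from the diffs themselves.

from pathlib import Path

from llama_index.core.node_parser import SentenceSplitter

# Assumed import path, mirroring llm-service/app/ai/indexing/index.py.
from app.ai.indexing.index import Indexer, NotSupportedFileExtensionError
from app.services import models, rag_vector_store

data_source_id = 42  # hypothetical data source

indexer = Indexer(
    data_source_id,
    splitter=SentenceSplitter(chunk_size=512, chunk_overlap=10),
    # The two factory calls below are placeholder names; the real wiring lives in
    # llm-service/app/routers/index/data_source/__init__.py, which is only partially
    # visible in the patch.
    embedding_model=models.get_embedding_model(),
    chunks_vector_store=rag_vector_store.create_rag_vector_store(data_source_id),
)

try:
    # index_file now takes a Path plus the document id; .docx files are covered by the
    # DocxReader entry added to READERS, and extensions outside READERS are expected to
    # surface as NotSupportedFileExtensionError.
    indexer.index_file(Path("/tmp/report.docx"), file_id="doc-123")
except NotSupportedFileExtensionError as e:
    print(f"Cannot index files with extension {e.file_extension}")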