diff --git a/backend/src/main/java/com/cloudera/cai/rag/external/RagBackendClient.java b/backend/src/main/java/com/cloudera/cai/rag/external/RagBackendClient.java
index cbdc33d38..b1e2cd797 100644
--- a/backend/src/main/java/com/cloudera/cai/rag/external/RagBackendClient.java
+++ b/backend/src/main/java/com/cloudera/cai/rag/external/RagBackendClient.java
@@ -68,7 +68,8 @@ public void indexFile(
               + "/data_sources/"
               + ragDocument.dataSourceId()
               + "/documents/download-and-index",
-          new IndexRequest(bucketName, ragDocument.s3Path(), configuration));
+          new IndexRequest(
+              ragDocument.documentId(), bucketName, ragDocument.s3Path(), configuration));
     } catch (IOException e) {
       throw new RuntimeException(e);
     }
@@ -97,6 +98,7 @@ public void deleteSession(Long sessionId) {
   }
 
   record IndexRequest(
+      @JsonProperty("document_id") String documentId,
       @JsonProperty("s3_bucket_name") String s3BucketName,
       @JsonProperty("s3_document_key") String s3DocumentKey,
       IndexConfiguration configuration) {}
diff --git a/backend/src/test/java/com/cloudera/cai/rag/external/RagBackendClientTest.java b/backend/src/test/java/com/cloudera/cai/rag/external/RagBackendClientTest.java
index 60f6393ff..e1f8760d7 100644
--- a/backend/src/test/java/com/cloudera/cai/rag/external/RagBackendClientTest.java
+++ b/backend/src/test/java/com/cloudera/cai/rag/external/RagBackendClientTest.java
@@ -57,7 +57,7 @@ void indexFile() {
     Tracker<TrackedHttpRequest<?>> tracker = new Tracker<>();
     RagBackendClient client = new RagBackendClient(SimpleHttpClient.createNull(tracker));
     IndexConfiguration indexConfiguration = new IndexConfiguration(123, 2);
-    RagDocument document = indexRequest("s3Path", 1234L);
+    RagDocument document = indexRequest("documentId", "s3Path", 1234L);
 
     client.indexFile(document, "bucketName", indexConfiguration);
 
@@ -68,14 +68,15 @@ void indexFile() {
             new TrackedHttpRequest<>(
                 HttpMethod.POST,
                 "http://rag-backend:8000/data_sources/" + 1234L + "/documents/download-and-index",
-                new RagBackendClient.IndexRequest("bucketName", "s3Path", indexConfiguration)));
+                new RagBackendClient.IndexRequest(
+                    "documentId", "bucketName", "s3Path", indexConfiguration)));
   }
 
   @Test
   void createSummary() {
     Tracker<TrackedHttpRequest<?>> tracker = new Tracker<>();
     RagBackendClient client = new RagBackendClient(SimpleHttpClient.createNull(tracker));
-    RagDocument document = indexRequest("s3Path", 1234L);
+    RagDocument document = indexRequest("documentId", "s3Path", 1234L);
 
     client.createSummary(document, "bucketName");
 
@@ -134,13 +135,25 @@ void deleteSession() {
   void null_handlesThrowable() {
     RagBackendClient client =
         RagBackendClient.createNull(new Tracker<>(), new NotFound("not found"));
-    RagDocument document = indexRequest("s3Path", 1234L);
+    RagDocument document = indexRequest("documentId", "s3Path", 1234L);
     assertThatThrownBy(() -> client.indexFile(document, "fakeit", null))
         .isInstanceOf(NotFound.class);
   }
 
-  private static RagDocument indexRequest(String s3Path, Long dataSourceId) {
+  private static RagDocument indexRequest(String documentId, String s3Path, Long dataSourceId) {
     return new RagDocument(
-        null, null, dataSourceId, null, s3Path, null, null, null, null, null, null, null, null);
+        null,
+        null,
+        dataSourceId,
+        documentId,
+        s3Path,
+        null,
+        null,
+        null,
+        null,
+        null,
+        null,
+        null,
+        null);
   }
 }
diff --git a/llm-service/app/ai/__init__.py b/llm-service/app/ai/__init__.py
new file mode 100644
index 000000000..e2b4ac6c2
--- /dev/null
+++ b/llm-service/app/ai/__init__.py
@@ -0,0 +1,37 @@
+#
+#  CLOUDERA APPLIED MACHINE LEARNING PROTOTYPE (AMP)
+#  (C) Cloudera, Inc. 2024
+#  All rights reserved.
+#
+#  Applicable Open Source License: Apache 2.0
+#
+#  NOTE: Cloudera open source products are modular software products
+#  made up of hundreds of individual components, each of which was
+#  individually copyrighted.  Each Cloudera open source product is a
+#  collective work under U.S. Copyright Law. Your license to use the
+#  collective work is as provided in your written agreement with
+#  Cloudera.  Used apart from the collective work, this file is
+#  licensed for your use pursuant to the open source license
+#  identified above.
+#
+#  This code is provided to you pursuant a written agreement with
+#  (i) Cloudera, Inc. or (ii) a third-party authorized to distribute
+#  this code. If you do not have a written agreement with Cloudera nor
+#  with an authorized and properly licensed third party, you do not
+#  have any rights to access nor to use this code.
+#
+#  Absent a written agreement with Cloudera, Inc. ("Cloudera") to the
+#  contrary, A) CLOUDERA PROVIDES THIS CODE TO YOU WITHOUT WARRANTIES OF ANY
+#  KIND; (B) CLOUDERA DISCLAIMS ANY AND ALL EXPRESS AND IMPLIED
+#  WARRANTIES WITH RESPECT TO THIS CODE, INCLUDING BUT NOT LIMITED TO
+#  IMPLIED WARRANTIES OF TITLE, NON-INFRINGEMENT, MERCHANTABILITY AND
+#  FITNESS FOR A PARTICULAR PURPOSE; (C) CLOUDERA IS NOT LIABLE TO YOU,
+#  AND WILL NOT DEFEND, INDEMNIFY, NOR HOLD YOU HARMLESS FOR ANY CLAIMS
+#  ARISING FROM OR RELATED TO THE CODE; AND (D)WITH RESPECT TO YOUR EXERCISE
+#  OF ANY RIGHTS GRANTED TO YOU FOR THE CODE, CLOUDERA IS NOT LIABLE FOR ANY
+#  DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, PUNITIVE OR
+#  CONSEQUENTIAL DAMAGES INCLUDING, BUT NOT LIMITED TO, DAMAGES
+#  RELATED TO LOST REVENUE, LOST PROFITS, LOSS OF INCOME, LOSS OF
+#  BUSINESS ADVANTAGE OR UNAVAILABILITY, OR LOSS OR CORRUPTION OF
+#  DATA.
+#
diff --git a/llm-service/app/ai/indexing/__init__.py b/llm-service/app/ai/indexing/__init__.py
new file mode 100644
index 000000000..e2b4ac6c2
--- /dev/null
+++ b/llm-service/app/ai/indexing/__init__.py
@@ -0,0 +1,37 @@
+#
+#  CLOUDERA APPLIED MACHINE LEARNING PROTOTYPE (AMP)
+#  (C) Cloudera, Inc. 2024
+#  All rights reserved.
+#
+#  Applicable Open Source License: Apache 2.0
+#
+#  NOTE: Cloudera open source products are modular software products
+#  made up of hundreds of individual components, each of which was
+#  individually copyrighted.  Each Cloudera open source product is a
+#  collective work under U.S. Copyright Law. Your license to use the
+#  collective work is as provided in your written agreement with
+#  Cloudera.  Used apart from the collective work, this file is
+#  licensed for your use pursuant to the open source license
+#  identified above.
+#
+#  This code is provided to you pursuant a written agreement with
+#  (i) Cloudera, Inc. or (ii) a third-party authorized to distribute
+#  this code. If you do not have a written agreement with Cloudera nor
+#  with an authorized and properly licensed third party, you do not
+#  have any rights to access nor to use this code.
+#
+#  Absent a written agreement with Cloudera, Inc. ("Cloudera") to the
+#  contrary, A) CLOUDERA PROVIDES THIS CODE TO YOU WITHOUT WARRANTIES OF ANY
+#  KIND; (B) CLOUDERA DISCLAIMS ANY AND ALL EXPRESS AND IMPLIED
+#  WARRANTIES WITH RESPECT TO THIS CODE, INCLUDING BUT NOT LIMITED TO
+#  IMPLIED WARRANTIES OF TITLE, NON-INFRINGEMENT, MERCHANTABILITY AND
+#  FITNESS FOR A PARTICULAR PURPOSE; (C) CLOUDERA IS NOT LIABLE TO YOU,
+#  AND WILL NOT DEFEND, INDEMNIFY, NOR HOLD YOU HARMLESS FOR ANY CLAIMS
+#  ARISING FROM OR RELATED TO THE CODE; AND (D)WITH RESPECT TO YOUR EXERCISE
+#  OF ANY RIGHTS GRANTED TO YOU FOR THE CODE, CLOUDERA IS NOT LIABLE FOR ANY
+#  DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, PUNITIVE OR
+#  CONSEQUENTIAL DAMAGES INCLUDING, BUT NOT LIMITED TO, DAMAGES
+#  RELATED TO LOST REVENUE, LOST PROFITS, LOSS OF INCOME, LOSS OF
+#  BUSINESS ADVANTAGE OR UNAVAILABILITY, OR LOSS OR CORRUPTION OF
+#  DATA.
+#
diff --git a/llm-service/app/ai/indexing/index.py b/llm-service/app/ai/indexing/index.py
new file mode 100644
index 000000000..85ac69930
--- /dev/null
+++ b/llm-service/app/ai/indexing/index.py
@@ -0,0 +1,157 @@
+#
+#  CLOUDERA APPLIED MACHINE LEARNING PROTOTYPE (AMP)
+#  (C) Cloudera, Inc. 2024
+#  All rights reserved.
+#
+#  Applicable Open Source License: Apache 2.0
+#
+#  NOTE: Cloudera open source products are modular software products
+#  made up of hundreds of individual components, each of which was
+#  individually copyrighted.  Each Cloudera open source product is a
+#  collective work under U.S. Copyright Law. Your license to use the
+#  collective work is as provided in your written agreement with
+#  Cloudera.  Used apart from the collective work, this file is
+#  licensed for your use pursuant to the open source license
+#  identified above.
+#
+#  This code is provided to you pursuant a written agreement with
+#  (i) Cloudera, Inc. or (ii) a third-party authorized to distribute
+#  this code. If you do not have a written agreement with Cloudera nor
+#  with an authorized and properly licensed third party, you do not
+#  have any rights to access nor to use this code.
+#
+#  Absent a written agreement with Cloudera, Inc. ("Cloudera") to the
+#  contrary, A) CLOUDERA PROVIDES THIS CODE TO YOU WITHOUT WARRANTIES OF ANY
+#  KIND; (B) CLOUDERA DISCLAIMS ANY AND ALL EXPRESS AND IMPLIED
+#  WARRANTIES WITH RESPECT TO THIS CODE, INCLUDING BUT NOT LIMITED TO
+#  IMPLIED WARRANTIES OF TITLE, NON-INFRINGEMENT, MERCHANTABILITY AND
+#  FITNESS FOR A PARTICULAR PURPOSE; (C) CLOUDERA IS NOT LIABLE TO YOU,
+#  AND WILL NOT DEFEND, INDEMNIFY, NOR HOLD YOU HARMLESS FOR ANY CLAIMS
+#  ARISING FROM OR RELATED TO THE CODE; AND (D)WITH RESPECT TO YOUR EXERCISE
+#  OF ANY RIGHTS GRANTED TO YOU FOR THE CODE, CLOUDERA IS NOT LIABLE FOR ANY
+#  DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, PUNITIVE OR
+#  CONSEQUENTIAL DAMAGES INCLUDING, BUT NOT LIMITED TO, DAMAGES
+#  RELATED TO LOST REVENUE, LOST PROFITS, LOSS OF INCOME, LOSS OF
+#  BUSINESS ADVANTAGE OR UNAVAILABILITY, OR LOSS OR CORRUPTION OF
+#  DATA.
+#
+
+import logging
+import os
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Dict, List, Type
+
+from llama_index.core.base.embeddings.base import BaseEmbedding
+from llama_index.core.node_parser import SentenceSplitter
+from llama_index.core.readers.base import BaseReader
+from llama_index.core.schema import BaseNode, Document, TextNode
+from llama_index.readers.file import DocxReader
+
+from ...services.vector_store import VectorStore
+from .readers.nop import NopReader
+from .readers.pdf import PDFReader
+
+logger = logging.getLogger(__name__)
+
+READERS: Dict[str, Type[BaseReader]] = {
+    ".pdf": PDFReader,
+    ".txt": NopReader,
+    ".md": NopReader,
+    ".docx": DocxReader,
+}
+CHUNKABLE_FILE_EXTENSIONS = {".pdf", ".txt", ".md", ".docx"}
+
+
+@dataclass
+class NotSupportedFileExtensionError(Exception):
+    file_extension: str
+
+
+class Indexer:
+    def __init__(
+        self,
+        data_source_id: int,
+        splitter: SentenceSplitter,
+        embedding_model: BaseEmbedding,
+        chunks_vector_store: VectorStore,
+    ):
+        self.data_source_id = data_source_id
+        self.splitter = splitter
+        self.embedding_model = embedding_model
+        self.chunks_vector_store = chunks_vector_store
+
+    def index_file(self, file_path: Path, file_id: str) -> None:
+        logger.debug(f"Indexing file: {file_path}")
+
+        file_extension = os.path.splitext(file_path)[1]
+        reader_cls = READERS.get(file_extension)
+        if not reader_cls:
+            raise NotSupportedFileExtensionError(file_extension)
+
+        reader = reader_cls()
+
+        logger.debug(f"Parsing file: {file_path}")
+
+        documents = self._documents_in_file(reader, file_path, file_id)
+        if file_extension in CHUNKABLE_FILE_EXTENSIONS:
+            logger.debug(f"Chunking file: {file_path}")
+            chunks = [
+                chunk
+                for document in documents
+                for chunk in self._chunks_in_document(document)
+            ]
+        else:
+            chunks = documents
+
+        texts = [chunk.text for chunk in chunks]
+        logger.debug(f"Embedding {len(texts)} chunks")
+        embeddings = self.embedding_model.get_text_embedding_batch(texts)
+
+        for chunk, embedding in zip(chunks, embeddings):
+            chunk.embedding = embedding
+
+        logger.debug(f"Adding {len(chunks)} chunks to vector store")
+        chunks_vector_store = self.chunks_vector_store.access_vector_store()
+
+        # We have to explicitly convert here even though the types are compatible (TextNode inherits from BaseNode)
+        # because the "add" annotation uses List instead of Sequence. We need to use TextNode explicitly because
+        # we're capturing "text".
+        converted_chunks: List[BaseNode] = [chunk for chunk in chunks]
+        chunks_vector_store.add(converted_chunks)
+
+        logger.debug(f"Indexing file: {file_path} completed")
+
+    def _documents_in_file(
+        self, reader: BaseReader, file_path: Path, file_id: str
+    ) -> List[Document]:
+        documents = reader.load_data(file_path)
+
+        for i, document in enumerate(documents):
+            # Update the document metadata
+            document.id_ = file_id
+            document.metadata["file_name"] = os.path.basename(file_path)
+            document.metadata["document_id"] = file_id
+            document.metadata["document_part_number"] = i
+            document.metadata["data_source_id"] = self.data_source_id
+
+        return documents
+
+    def _chunks_in_document(self, document: Document) -> List[TextNode]:
+        chunks = self.splitter.get_nodes_from_documents([document])
+
+        for j, chunk in enumerate(chunks):
+            chunk.metadata["file_name"] = document.metadata["file_name"]
+            chunk.metadata["document_id"] = document.metadata["document_id"]
+            chunk.metadata["document_part_number"] = document.metadata[
+                "document_part_number"
+            ]
+            chunk.metadata["chunk_number"] = j
+            chunk.metadata["data_source_id"] = document.metadata["data_source_id"]
+
+        converted_chunks: List[TextNode] = []
+        for chunk in chunks:
+            assert isinstance(chunk, TextNode)
+            converted_chunks.append(chunk)
+
+        return converted_chunks
diff --git a/llm-service/app/ai/indexing/readers/__init__.py b/llm-service/app/ai/indexing/readers/__init__.py
new file mode 100644
index 000000000..e2b4ac6c2
--- /dev/null
+++ b/llm-service/app/ai/indexing/readers/__init__.py
@@ -0,0 +1,37 @@
+#
+#  CLOUDERA APPLIED MACHINE LEARNING PROTOTYPE (AMP)
+#  (C) Cloudera, Inc. 2024
+#  All rights reserved.
+#
+#  Applicable Open Source License: Apache 2.0
+#
+#  NOTE: Cloudera open source products are modular software products
+#  made up of hundreds of individual components, each of which was
+#  individually copyrighted.  Each Cloudera open source product is a
+#  collective work under U.S. Copyright Law. Your license to use the
+#  collective work is as provided in your written agreement with
+#  Cloudera.  Used apart from the collective work, this file is
+#  licensed for your use pursuant to the open source license
+#  identified above.
+#
+#  This code is provided to you pursuant a written agreement with
+#  (i) Cloudera, Inc. or (ii) a third-party authorized to distribute
+#  this code. If you do not have a written agreement with Cloudera nor
+#  with an authorized and properly licensed third party, you do not
+#  have any rights to access nor to use this code.
+#
+#  Absent a written agreement with Cloudera, Inc. ("Cloudera") to the
+#  contrary, A) CLOUDERA PROVIDES THIS CODE TO YOU WITHOUT WARRANTIES OF ANY
+#  KIND; (B) CLOUDERA DISCLAIMS ANY AND ALL EXPRESS AND IMPLIED
+#  WARRANTIES WITH RESPECT TO THIS CODE, INCLUDING BUT NOT LIMITED TO
+#  IMPLIED WARRANTIES OF TITLE, NON-INFRINGEMENT, MERCHANTABILITY AND
+#  FITNESS FOR A PARTICULAR PURPOSE; (C) CLOUDERA IS NOT LIABLE TO YOU,
+#  AND WILL NOT DEFEND, INDEMNIFY, NOR HOLD YOU HARMLESS FOR ANY CLAIMS
+#  ARISING FROM OR RELATED TO THE CODE; AND (D)WITH RESPECT TO YOUR EXERCISE
+#  OF ANY RIGHTS GRANTED TO YOU FOR THE CODE, CLOUDERA IS NOT LIABLE FOR ANY
+#  DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, PUNITIVE OR
+#  CONSEQUENTIAL DAMAGES INCLUDING, BUT NOT LIMITED TO, DAMAGES
+#  RELATED TO LOST REVENUE, LOST PROFITS, LOSS OF INCOME, LOSS OF
+#  BUSINESS ADVANTAGE OR UNAVAILABILITY, OR LOSS OR CORRUPTION OF
+#  DATA.
+#
diff --git a/llm-service/app/ai/indexing/readers/nop.py b/llm-service/app/ai/indexing/readers/nop.py
new file mode 100644
index 000000000..3320e79fa
--- /dev/null
+++ b/llm-service/app/ai/indexing/readers/nop.py
@@ -0,0 +1,47 @@
+#
+#  CLOUDERA APPLIED MACHINE LEARNING PROTOTYPE (AMP)
+#  (C) Cloudera, Inc. 2024
+#  All rights reserved.
+#
+#  Applicable Open Source License: Apache 2.0
+#
+#  NOTE: Cloudera open source products are modular software products
+#  made up of hundreds of individual components, each of which was
+#  individually copyrighted.  Each Cloudera open source product is a
+#  collective work under U.S. Copyright Law. Your license to use the
+#  collective work is as provided in your written agreement with
+#  Cloudera.  Used apart from the collective work, this file is
+#  licensed for your use pursuant to the open source license
+#  identified above.
+#
+#  This code is provided to you pursuant a written agreement with
+#  (i) Cloudera, Inc. or (ii) a third-party authorized to distribute
+#  this code. If you do not have a written agreement with Cloudera nor
+#  with an authorized and properly licensed third party, you do not
+#  have any rights to access nor to use this code.
+#
+#  Absent a written agreement with Cloudera, Inc. ("Cloudera") to the
+#  contrary, A) CLOUDERA PROVIDES THIS CODE TO YOU WITHOUT WARRANTIES OF ANY
+#  KIND; (B) CLOUDERA DISCLAIMS ANY AND ALL EXPRESS AND IMPLIED
+#  WARRANTIES WITH RESPECT TO THIS CODE, INCLUDING BUT NOT LIMITED TO
+#  IMPLIED WARRANTIES OF TITLE, NON-INFRINGEMENT, MERCHANTABILITY AND
+#  FITNESS FOR A PARTICULAR PURPOSE; (C) CLOUDERA IS NOT LIABLE TO YOU,
+#  AND WILL NOT DEFEND, INDEMNIFY, NOR HOLD YOU HARMLESS FOR ANY CLAIMS
+#  ARISING FROM OR RELATED TO THE CODE; AND (D)WITH RESPECT TO YOUR EXERCISE
+#  OF ANY RIGHTS GRANTED TO YOU FOR THE CODE, CLOUDERA IS NOT LIABLE FOR ANY
+#  DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, PUNITIVE OR
+#  CONSEQUENTIAL DAMAGES INCLUDING, BUT NOT LIMITED TO, DAMAGES
+#  RELATED TO LOST REVENUE, LOST PROFITS, LOSS OF INCOME, LOSS OF
+#  BUSINESS ADVANTAGE OR UNAVAILABILITY, OR LOSS OR CORRUPTION OF
+#  DATA.
+#
+
+from typing import List
+from llama_index.core.readers.base import BaseReader
+from llama_index.core.schema import Document
+
+
+class NopReader(BaseReader):
+    def load_data(self, file_path: str) -> List[Document]:
+        with open(file_path, "r") as f:
+            return [Document(text=f.read())]
diff --git a/llm-service/app/ai/indexing/readers/pdf.py b/llm-service/app/ai/indexing/readers/pdf.py
new file mode 100644
index 000000000..697721417
--- /dev/null
+++ b/llm-service/app/ai/indexing/readers/pdf.py
@@ -0,0 +1,52 @@
+#
+#  CLOUDERA APPLIED MACHINE LEARNING PROTOTYPE (AMP)
+#  (C) Cloudera, Inc. 2024
+#  All rights reserved.
+#
+#  Applicable Open Source License: Apache 2.0
+#
+#  NOTE: Cloudera open source products are modular software products
+#  made up of hundreds of individual components, each of which was
+#  individually copyrighted.  Each Cloudera open source product is a
+#  collective work under U.S. Copyright Law. Your license to use the
+#  collective work is as provided in your written agreement with
+#  Cloudera.  Used apart from the collective work, this file is
+#  licensed for your use pursuant to the open source license
+#  identified above.
+#
+#  This code is provided to you pursuant a written agreement with
+#  (i) Cloudera, Inc. or (ii) a third-party authorized to distribute
+#  this code. If you do not have a written agreement with Cloudera nor
+#  with an authorized and properly licensed third party, you do not
+#  have any rights to access nor to use this code.
+#
+#  Absent a written agreement with Cloudera, Inc. ("Cloudera") to the
+#  contrary, A) CLOUDERA PROVIDES THIS CODE TO YOU WITHOUT WARRANTIES OF ANY
+#  KIND; (B) CLOUDERA DISCLAIMS ANY AND ALL EXPRESS AND IMPLIED
+#  WARRANTIES WITH RESPECT TO THIS CODE, INCLUDING BUT NOT LIMITED TO
+#  IMPLIED WARRANTIES OF TITLE, NON-INFRINGEMENT, MERCHANTABILITY AND
+#  FITNESS FOR A PARTICULAR PURPOSE; (C) CLOUDERA IS NOT LIABLE TO YOU,
+#  AND WILL NOT DEFEND, INDEMNIFY, NOR HOLD YOU HARMLESS FOR ANY CLAIMS
+#  ARISING FROM OR RELATED TO THE CODE; AND (D)WITH RESPECT TO YOUR EXERCISE
+#  OF ANY RIGHTS GRANTED TO YOU FOR THE CODE, CLOUDERA IS NOT LIABLE FOR ANY
+#  DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, PUNITIVE OR
+#  CONSEQUENTIAL DAMAGES INCLUDING, BUT NOT LIMITED TO, DAMAGES
+#  RELATED TO LOST REVENUE, LOST PROFITS, LOSS OF INCOME, LOSS OF
+#  BUSINESS ADVANTAGE OR UNAVAILABILITY, OR LOSS OR CORRUPTION OF
+#  DATA.
+#
+
+from pathlib import Path
+from typing import List
+
+from llama_index.core.readers.base import BaseReader
+from llama_index.core.schema import Document
+from llama_index.readers.file import PDFReader as LlamaIndexPDFReader
+
+
+class PDFReader(BaseReader):
+    def __init__(self) -> None:
+        self.inner = LlamaIndexPDFReader(return_full_document=True)
+
+    def load_data(self, file_path: Path) -> List[Document]:
+        return self.inner.load_data(file_path)
diff --git a/llm-service/app/routers/index/data_source/__init__.py b/llm-service/app/routers/index/data_source/__init__.py
index a3d544bf3..dedf8c5d1 100644
--- a/llm-service/app/routers/index/data_source/__init__.py
+++ b/llm-service/app/routers/index/data_source/__init__.py
@@ -28,15 +28,18 @@
 #  DATA.
 # ##############################################################################
 
-import http
 import logging
+import os
 import tempfile
+from pathlib import Path
 
 from fastapi import APIRouter
+from llama_index.core.node_parser import SentenceSplitter
 from pydantic import BaseModel
 
 from .... import exceptions
-from ....services import doc_summaries, qdrant, s3
+from ....ai.indexing.index import Indexer
+from ....services import doc_summaries, models, qdrant, rag_vector_store, s3
 
 logger = logging.getLogger(__name__)
 
@@ -100,12 +103,17 @@ def delete_document(data_source_id: int, doc_id: str) -> None:
     doc_summaries.delete_document(data_source_id, doc_id)
 
 
+class RagIndexDocumentConfiguration(BaseModel):
+    # TODO: Add more params
+    chunk_size: int = 512  # this is llama-index's default
+    chunk_overlap: int = 10  # percentage of tokens in a chunk (chunk_size)
+
+
 class RagIndexDocumentRequest(BaseModel):
+    document_id: str
     s3_bucket_name: str
     s3_document_key: str
-    configuration: qdrant.RagIndexDocumentConfiguration = (
-        qdrant.RagIndexDocumentConfiguration()
-    )
+    configuration: RagIndexDocumentConfiguration = RagIndexDocumentConfiguration()
 
 
 @router.post(
@@ -117,11 +125,29 @@ class RagIndexDocumentRequest(BaseModel):
 def download_and_index(
     data_source_id: int,
     request: RagIndexDocumentRequest,
-) -> str:
+) -> None:
     with tempfile.TemporaryDirectory() as tmpdirname:
         logger.debug("created temporary directory %s", tmpdirname)
         s3.download(tmpdirname, request.s3_bucket_name, request.s3_document_key)
-        qdrant.download_and_index(
-            tmpdirname, data_source_id, request.configuration, request.s3_document_key
+        # Get the single file in the directory
+        files = os.listdir(tmpdirname)
+        if len(files) != 1:
+            raise ValueError("Expected a single file in the temporary directory")
+        file_path = Path(os.path.join(tmpdirname, files[0]))
+
+        indexer = Indexer(
+            data_source_id,
+            splitter=SentenceSplitter(
+                chunk_size=request.configuration.chunk_size,
+                chunk_overlap=int(
+                    request.configuration.chunk_overlap
+                    * 0.01
+                    * request.configuration.chunk_size
+                ),
+            ),
+            embedding_model=models.get_embedding_model(),
+            chunks_vector_store=rag_vector_store.create_rag_vector_store(
+                data_source_id
+            ),
         )
-        return http.HTTPStatus.OK.phrase
+        indexer.index_file(file_path, request.document_id)
diff --git a/llm-service/app/services/qdrant.py b/llm-service/app/services/qdrant.py
index efeeff25d..314264359 100644
--- a/llm-service/app/services/qdrant.py
+++ b/llm-service/app/services/qdrant.py
@@ -44,77 +44,16 @@
 from llama_index.core.chat_engine.types import AgentChatResponse
 from llama_index.core.indices import VectorStoreIndex
 from llama_index.core.indices.vector_store import VectorIndexRetriever
-from llama_index.core.node_parser import SentenceSplitter
 from llama_index.core.query_engine import RetrieverQueryEngine
-from llama_index.core.readers import SimpleDirectoryReader
 from llama_index.core.response_synthesizers import get_response_synthesizer
-from llama_index.core.storage import StorageContext
-from pydantic import BaseModel
 
 from ..rag_types import RagPredictConfiguration
 from . import models, rag_vector_store
 from .chat_store import RagContext
-from .utils import get_last_segment
 
 logger = logging.getLogger(__name__)
 
 
-class RagIndexDocumentConfiguration(BaseModel):
-    # TODO: Add more params
-    chunk_size: int = 512  # this is llama-index's default
-    chunk_overlap: int = 10  # percentage of tokens in a chunk (chunk_size)
-
-
-def download_and_index(
-    tmpdirname: str,
-    data_source_id: int,
-    configuration: RagIndexDocumentConfiguration,
-    s3_document_key: str,
-) -> None:
-    try:
-        documents = SimpleDirectoryReader(tmpdirname).load_data()
-        document_id = get_last_segment(s3_document_key)
-        for document in documents:
-            document.id_ = document_id  # this is a terrible way to assign the doc id...
-            document.metadata["document_id"] = document_id
-    except Exception as e:
-        logger.error(
-            "error loading document from temporary directory %s",
-            tmpdirname,
-        )
-        raise HTTPException(
-            status_code=422,
-            detail=f"error loading document from temporary directory {tmpdirname}",
-        ) from e
-
-    logger.info("instantiating vector store")
-    vector_store = rag_vector_store.create_rag_vector_store(
-        data_source_id
-    ).access_vector_store()
-    logger.info("instantiated vector store")
-
-    storage_context = StorageContext.from_defaults(vector_store=vector_store)
-
-    chunk_overlap_tokens = int(
-        configuration.chunk_overlap * 0.01 * configuration.chunk_size
-    )
-
-    logger.info("indexing document")
-    VectorStoreIndex.from_documents(
-        documents,
-        storage_context=storage_context,
-        embed_model=models.get_embedding_model(),
-        show_progress=False,
-        transformations=[
-            SentenceSplitter(
-                chunk_size=configuration.chunk_size,
-                chunk_overlap=chunk_overlap_tokens,
-            ),
-        ],
-    )
-    logger.info("indexed document")
-
-
 def check_data_source_exists(data_source_size: int) -> None:
     if data_source_size == -1:
         raise HTTPException(status_code=404, detail="Knowledge base not found.")
diff --git a/llm-service/app/tests/conftest.py b/llm-service/app/tests/conftest.py
index 5c6201fa6..092df5048 100644
--- a/llm-service/app/tests/conftest.py
+++ b/llm-service/app/tests/conftest.py
@@ -65,7 +65,7 @@
 from app.main import app
 from app.services import models, rag_vector_store
 from app.services.rag_qdrant_vector_store import RagQdrantVectorStore
-
+from app.services.utils import get_last_segment
 
 @dataclass
 class BotoObject:
@@ -110,6 +110,7 @@ def index_document_request_body(
     data_source_id: int, s3_object: BotoObject
 ) -> Dict[str, Any]:
     return {
+        "document_id": get_last_segment(s3_object.key),
         "data_source_id": data_source_id,
         "s3_bucket_name": s3_object.bucket_name,
         "s3_document_key": s3_object.key,
diff --git a/llm-service/pdm.lock b/llm-service/pdm.lock
index 3a3ec66ed..cddb261f3 100644
--- a/llm-service/pdm.lock
+++ b/llm-service/pdm.lock
@@ -5,7 +5,7 @@
 groups = ["default", "dev"]
 strategy = ["inherit_metadata"]
 lock_version = "4.5.0"
-content_hash = "sha256:cf38cbf44250032e4b248c90dbc34037bb94662cdfe512d50fcb3e271309fd84"
+content_hash = "sha256:2a9b3e86ee90d639241d72fdeee20e779c2e7c42e90ab2e43e335c18454e0858"
 
 [[metadata.targets]]
 requires_python = "==3.10.*"
@@ -382,6 +382,15 @@ files = [
     {file = "dnspython-2.7.0.tar.gz", hash = "sha256:ce9c432eda0dc91cf618a5cedf1a4e142651196bbcd2c80e89ed5a907e5cfaf1"},
 ]
 
+[[package]]
+name = "docx2txt"
+version = "0.8"
+summary = "A pure python-based utility to extract text and images from docx files."
+groups = ["default"]
+files = [
+    {file = "docx2txt-0.8.tar.gz", hash = "sha256:2c06d98d7cfe2d3947e5760a57d924e3ff07745b379c8737723922e7009236e5"},
+]
+
 [[package]]
 name = "email-validator"
 version = "2.2.0"
diff --git a/llm-service/pyproject.toml b/llm-service/pyproject.toml
index 830db6091..d22188469 100644
--- a/llm-service/pyproject.toml
+++ b/llm-service/pyproject.toml
@@ -5,7 +5,7 @@ description = "Default template for PDM package"
 authors = [
     {name = "Conrado Silva Miranda", email = "csilvamiranda@cloudera.com"},
 ]
-dependencies = ["llama-index-core==0.10.68", "llama-index-readers-file==0.1.33", "fastapi==0.111.0", "pydantic==2.8.2", "pydantic-settings==2.3.4", "boto3>=1.35.66", "llama-index-embeddings-bedrock==0.2.1", "llama-index-llms-bedrock==0.1.13", "llama-index-llms-openai==0.1.31", "llama-index-llms-mistralai==0.1.20", "llama-index-embeddings-openai==0.1.11", "llama-index-vector-stores-qdrant==0.2.17"]
+dependencies = ["llama-index-core==0.10.68", "llama-index-readers-file==0.1.33", "fastapi==0.111.0", "pydantic==2.8.2", "pydantic-settings==2.3.4", "boto3>=1.35.66", "llama-index-embeddings-bedrock==0.2.1", "llama-index-llms-bedrock==0.1.13", "llama-index-llms-openai==0.1.31", "llama-index-llms-mistralai==0.1.20", "llama-index-embeddings-openai==0.1.11", "llama-index-vector-stores-qdrant==0.2.17", "docx2txt>=0.8"]
 requires-python = "==3.10.*"
 readme = "README.md"
 license = {text = "APACHE"}