From cf10494ddad4bbf6f6ca37743193f821af9b3348 Mon Sep 17 00:00:00 2001
From: Conrado Silva Miranda <csilvamiranda@cloudera.com>
Date: Thu, 21 Nov 2024 10:35:36 -0800
Subject: [PATCH 1/7] Support parsing json and csv

---
 llm-service/app/ai/indexing/index.py        |  4 ++
 llm-service/app/ai/indexing/readers/csv.py  | 56 +++++++++++++++++++++
 llm-service/app/ai/indexing/readers/json.py | 51 +++++++++++++++++++
 llm-service/app/ai/indexing/readers/nop.py  |  4 +-
 llm-service/pdm.lock                        |  2 +-
 llm-service/pyproject.toml                  |  2 +-
 6 files changed, 116 insertions(+), 3 deletions(-)
 create mode 100644 llm-service/app/ai/indexing/readers/csv.py
 create mode 100644 llm-service/app/ai/indexing/readers/json.py

diff --git a/llm-service/app/ai/indexing/index.py b/llm-service/app/ai/indexing/index.py
index 85ac69930..08e823739 100644
--- a/llm-service/app/ai/indexing/index.py
+++ b/llm-service/app/ai/indexing/index.py
@@ -49,6 +49,8 @@
 from llama_index.readers.file import DocxReader
 
 from ...services.vector_store import VectorStore
+from .readers.csv import CSVReader
+from .readers.json import JSONReader
 from .readers.nop import NopReader
 from .readers.pdf import PDFReader
 
@@ -59,6 +61,8 @@
     ".txt": NopReader,
     ".md": NopReader,
     ".docx": DocxReader,
+    ".csv": CSVReader,
+    ".json": JSONReader,
 }
 CHUNKABLE_FILE_EXTENSIONS = {".pdf", ".txt", ".md", ".docx"}
 
diff --git a/llm-service/app/ai/indexing/readers/csv.py b/llm-service/app/ai/indexing/readers/csv.py
new file mode 100644
index 000000000..5061627cb
--- /dev/null
+++ b/llm-service/app/ai/indexing/readers/csv.py
@@ -0,0 +1,56 @@
+#
+#  CLOUDERA APPLIED MACHINE LEARNING PROTOTYPE (AMP)
+#  (C) Cloudera, Inc. 2024
+#  All rights reserved.
+#
+#  Applicable Open Source License: Apache 2.0
+#
+#  NOTE: Cloudera open source products are modular software products
+#  made up of hundreds of individual components, each of which was
+#  individually copyrighted.  Each Cloudera open source product is a
+#  collective work under U.S. Copyright Law. Your license to use the
+#  collective work is as provided in your written agreement with
+#  Cloudera.  Used apart from the collective work, this file is
+#  licensed for your use pursuant to the open source license
+#  identified above.
+#
+#  This code is provided to you pursuant a written agreement with
+#  (i) Cloudera, Inc. or (ii) a third-party authorized to distribute
+#  this code. If you do not have a written agreement with Cloudera nor
+#  with an authorized and properly licensed third party, you do not
+#  have any rights to access nor to use this code.
+#
+#  Absent a written agreement with Cloudera, Inc. ("Cloudera") to the
+#  contrary, A) CLOUDERA PROVIDES THIS CODE TO YOU WITHOUT WARRANTIES OF ANY
+#  KIND; (B) CLOUDERA DISCLAIMS ANY AND ALL EXPRESS AND IMPLIED
+#  WARRANTIES WITH RESPECT TO THIS CODE, INCLUDING BUT NOT LIMITED TO
+#  IMPLIED WARRANTIES OF TITLE, NON-INFRINGEMENT, MERCHANTABILITY AND
+#  FITNESS FOR A PARTICULAR PURPOSE; (C) CLOUDERA IS NOT LIABLE TO YOU,
+#  AND WILL NOT DEFEND, INDEMNIFY, NOR HOLD YOU HARMLESS FOR ANY CLAIMS
+#  ARISING FROM OR RELATED TO THE CODE; AND (D)WITH RESPECT TO YOUR EXERCISE
+#  OF ANY RIGHTS GRANTED TO YOU FOR THE CODE, CLOUDERA IS NOT LIABLE FOR ANY
+#  DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, PUNITIVE OR
+#  CONSEQUENTIAL DAMAGES INCLUDING, BUT NOT LIMITED TO, DAMAGES
+#  RELATED TO LOST REVENUE, LOST PROFITS, LOSS OF INCOME, LOSS OF
+#  BUSINESS ADVANTAGE OR UNAVAILABILITY, OR LOSS OR CORRUPTION OF
+#  DATA.
+#
+
+import json
+from pathlib import Path
+from typing import List
+
+import pandas as pd
+from llama_index.core.readers.base import BaseReader
+from llama_index.core.schema import Document
+
+
+class CSVReader(BaseReader):
+    def load_data(self, file_path: Path) -> List[Document]:
+        # Read the CSV file into a pandas DataFrame
+        df = pd.read_csv(file_path)
+        # Convert the dataframe into a list of dictionaries, one per row
+        rows = df.to_dict(orient="records")
+        # Convert each dictionary into a Document
+        documents = [Document(text=json.dumps(row, sort_keys=True)) for row in rows]
+        return documents
diff --git a/llm-service/app/ai/indexing/readers/json.py b/llm-service/app/ai/indexing/readers/json.py
new file mode 100644
index 000000000..e122a1e71
--- /dev/null
+++ b/llm-service/app/ai/indexing/readers/json.py
@@ -0,0 +1,51 @@
+#
+#  CLOUDERA APPLIED MACHINE LEARNING PROTOTYPE (AMP)
+#  (C) Cloudera, Inc. 2024
+#  All rights reserved.
+#
+#  Applicable Open Source License: Apache 2.0
+#
+#  NOTE: Cloudera open source products are modular software products
+#  made up of hundreds of individual components, each of which was
+#  individually copyrighted.  Each Cloudera open source product is a
+#  collective work under U.S. Copyright Law. Your license to use the
+#  collective work is as provided in your written agreement with
+#  Cloudera.  Used apart from the collective work, this file is
+#  licensed for your use pursuant to the open source license
+#  identified above.
+#
+#  This code is provided to you pursuant a written agreement with
+#  (i) Cloudera, Inc. or (ii) a third-party authorized to distribute
+#  this code. If you do not have a written agreement with Cloudera nor
+#  with an authorized and properly licensed third party, you do not
+#  have any rights to access nor to use this code.
+#
+#  Absent a written agreement with Cloudera, Inc. ("Cloudera") to the
+#  contrary, A) CLOUDERA PROVIDES THIS CODE TO YOU WITHOUT WARRANTIES OF ANY
+#  KIND; (B) CLOUDERA DISCLAIMS ANY AND ALL EXPRESS AND IMPLIED
+#  WARRANTIES WITH RESPECT TO THIS CODE, INCLUDING BUT NOT LIMITED TO
+#  IMPLIED WARRANTIES OF TITLE, NON-INFRINGEMENT, MERCHANTABILITY AND
+#  FITNESS FOR A PARTICULAR PURPOSE; (C) CLOUDERA IS NOT LIABLE TO YOU,
+#  AND WILL NOT DEFEND, INDEMNIFY, NOR HOLD YOU HARMLESS FOR ANY CLAIMS
+#  ARISING FROM OR RELATED TO THE CODE; AND (D)WITH RESPECT TO YOUR EXERCISE
+#  OF ANY RIGHTS GRANTED TO YOU FOR THE CODE, CLOUDERA IS NOT LIABLE FOR ANY
+#  DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, PUNITIVE OR
+#  CONSEQUENTIAL DAMAGES INCLUDING, BUT NOT LIMITED TO, DAMAGES
+#  RELATED TO LOST REVENUE, LOST PROFITS, LOSS OF INCOME, LOSS OF
+#  BUSINESS ADVANTAGE OR UNAVAILABILITY, OR LOSS OR CORRUPTION OF
+#  DATA.
+#
+
+import json
+from pathlib import Path
+from typing import List
+
+from llama_index.core.readers.base import BaseReader
+from llama_index.core.schema import Document
+
+
+class JSONReader(BaseReader):
+    def load_data(self, file_path: Path) -> List[Document]:
+        with open(file_path, "r") as f:
+            content = json.load(f)
+        return [Document(text=json.dumps(content, sort_keys=True))]
diff --git a/llm-service/app/ai/indexing/readers/nop.py b/llm-service/app/ai/indexing/readers/nop.py
index 3320e79fa..aaa01e209 100644
--- a/llm-service/app/ai/indexing/readers/nop.py
+++ b/llm-service/app/ai/indexing/readers/nop.py
@@ -36,12 +36,14 @@
 #  DATA.
 #
 
+from pathlib import Path
 from typing import List
+
 from llama_index.core.readers.base import BaseReader
 from llama_index.core.schema import Document
 
 
 class NopReader(BaseReader):
-    def load_data(self, file_path: str) -> List[Document]:
+    def load_data(self, file_path: Path) -> List[Document]:
         with open(file_path, "r") as f:
             return [Document(text=f.read())]
diff --git a/llm-service/pdm.lock b/llm-service/pdm.lock
index cddb261f3..f7cc56681 100644
--- a/llm-service/pdm.lock
+++ b/llm-service/pdm.lock
@@ -5,7 +5,7 @@
 groups = ["default", "dev"]
 strategy = ["inherit_metadata"]
 lock_version = "4.5.0"
-content_hash = "sha256:2a9b3e86ee90d639241d72fdeee20e779c2e7c42e90ab2e43e335c18454e0858"
+content_hash = "sha256:37f4e048b91c6850dc6133d0ab73273fa296454ed6776ad177856e619ff421d3"
 
 [[metadata.targets]]
 requires_python = "==3.10.*"
diff --git a/llm-service/pyproject.toml b/llm-service/pyproject.toml
index d22188469..eeff2848d 100644
--- a/llm-service/pyproject.toml
+++ b/llm-service/pyproject.toml
@@ -5,7 +5,7 @@ description = "Default template for PDM package"
 authors = [
     {name = "Conrado Silva Miranda", email = "csilvamiranda@cloudera.com"},
 ]
-dependencies = ["llama-index-core==0.10.68", "llama-index-readers-file==0.1.33", "fastapi==0.111.0", "pydantic==2.8.2", "pydantic-settings==2.3.4", "boto3>=1.35.66", "llama-index-embeddings-bedrock==0.2.1", "llama-index-llms-bedrock==0.1.13", "llama-index-llms-openai==0.1.31", "llama-index-llms-mistralai==0.1.20", "llama-index-embeddings-openai==0.1.11", "llama-index-vector-stores-qdrant==0.2.17", "docx2txt>=0.8"]
+dependencies = ["llama-index-core==0.10.68", "llama-index-readers-file==0.1.33", "fastapi==0.111.0", "pydantic==2.8.2", "pydantic-settings==2.3.4", "boto3>=1.35.66", "llama-index-embeddings-bedrock==0.2.1", "llama-index-llms-bedrock==0.1.13", "llama-index-llms-openai==0.1.31", "llama-index-llms-mistralai==0.1.20", "llama-index-embeddings-openai==0.1.11", "llama-index-vector-stores-qdrant==0.2.17", "docx2txt>=0.8", "pandas>=2.2.3"]
 requires-python = "==3.10.*"
 readme = "README.md"
 license = {text = "APACHE"}

From c773dfb4ba8e28e19ac9a1474375f7e73bd9b287 Mon Sep 17 00:00:00 2001
From: Conrado Silva Miranda <csilvamiranda@cloudera.com>
Date: Thu, 21 Nov 2024 13:11:45 -0800
Subject: [PATCH 2/7] parallelize requests

---
 llm-service/app/ai/indexing/index.py | 52 ++++++++++++++++++++--------
 llm-service/app/services/utils.py    | 27 +++++++++++++--
 2 files changed, 62 insertions(+), 17 deletions(-)

diff --git a/llm-service/app/ai/indexing/index.py b/llm-service/app/ai/indexing/index.py
index 08e823739..47f39d01d 100644
--- a/llm-service/app/ai/indexing/index.py
+++ b/llm-service/app/ai/indexing/index.py
@@ -38,9 +38,10 @@
 
 import logging
 import os
+from concurrent.futures import ThreadPoolExecutor, as_completed
 from dataclasses import dataclass
 from pathlib import Path
-from typing import Dict, List, Type
+from typing import Dict, Generator, List, Type
 
 from llama_index.core.base.embeddings.base import BaseEmbedding
 from llama_index.core.node_parser import SentenceSplitter
@@ -48,6 +49,7 @@
 from llama_index.core.schema import BaseNode, Document, TextNode
 from llama_index.readers.file import DocxReader
 
+from ...services.utils import batch_sequence, flatten_sequence
 from ...services.vector_store import VectorStore
 from .readers.csv import CSVReader
 from .readers.json import JSONReader
@@ -108,21 +110,22 @@ def index_file(self, file_path: Path, file_id: str) -> None:
         else:
             chunks = documents
 
-        texts = [chunk.text for chunk in chunks]
-        logger.debug(f"Embedding {len(texts)} chunks")
-        embeddings = self.embedding_model.get_text_embedding_batch(texts)
+        logger.debug(f"Embedding {len(chunks)} chunks")
 
-        for chunk, embedding in zip(chunks, embeddings):
-            chunk.embedding = embedding
+        chunks_with_embeddings = flatten_sequence(self._compute_embeddings(chunks))
 
-        logger.debug(f"Adding {len(chunks)} chunks to vector store")
-        chunks_vector_store = self.chunks_vector_store.access_vector_store()
+        acc = 0
+        for chunk_batch in batch_sequence(chunks_with_embeddings, 1000):
+            acc += len(chunk_batch)
+            logger.debug(f"Adding {acc}/{len(chunks)} chunks to vector store")
 
-        # We have to explicitly convert here even though the types are compatible (TextNode inherits from BaseNode)
-        # because the "add" annotation uses List instead of Sequence. We need to use TextNode explicitly because
-        # we're capturing "text".
-        converted_chunks: List[BaseNode] = [chunk for chunk in chunks]
-        chunks_vector_store.add(converted_chunks)
+            # We have to explicitly convert here even though the types are compatible (TextNode inherits from BaseNode)
+            # because the "add" annotation uses List instead of Sequence. We need to use TextNode explicitly because
+            # we're capturing "text".
+            converted_chunks: List[BaseNode] = [chunk for chunk in chunk_batch]
+
+            chunks_vector_store = self.chunks_vector_store.access_vector_store()
+            chunks_vector_store.add(converted_chunks)
 
         logger.debug(f"Indexing file: {file_path} completed")
 
@@ -132,8 +135,6 @@ def _documents_in_file(
         documents = reader.load_data(file_path)
 
         for i, document in enumerate(documents):
-            # Update the document metadata
-            document.id_ = file_id
             document.metadata["file_name"] = os.path.basename(file_path)
             document.metadata["document_id"] = file_id
             document.metadata["document_part_number"] = i
@@ -159,3 +160,24 @@ def _chunks_in_document(self, document: Document) -> List[TextNode]:
             converted_chunks.append(chunk)
 
         return converted_chunks
+
+    def _compute_embeddings(
+        self, chunks: List[TextNode]
+    ) -> Generator[List[TextNode], None, None]:
+        batched_chunks = list(batch_sequence(chunks, 100))
+        batched_texts = [[chunk.text for chunk in batch] for batch in batched_chunks]
+
+        with ThreadPoolExecutor(max_workers=20) as executor:
+            futures = [
+                executor.submit(
+                    lambda b: (i, self.embedding_model.get_text_embedding_batch(b)),
+                    batch,
+                )
+                for i, batch in enumerate(batched_texts)
+            ]
+            logger.debug(f"Waiting for {len(futures)} futures")
+            for future in as_completed(futures):
+                i, batch_embeddings = future.result()
+                for chunk, embedding in zip(batched_chunks[i], batch_embeddings):
+                    chunk.embedding = embedding
+                yield batched_chunks[i]
diff --git a/llm-service/app/services/utils.py b/llm-service/app/services/utils.py
index 6ac9893cf..6aeee03f0 100644
--- a/llm-service/app/services/utils.py
+++ b/llm-service/app/services/utils.py
@@ -37,8 +37,7 @@
 # ##############################################################################
 
 import re
-from typing import List, Tuple
-
+from typing import Generator, List, Sequence, Tuple, TypeVar, Union
 
 # TODO delete this if it's not being used
 
@@ -82,3 +81,27 @@ def parse_choice_select_answer_fn(
 
 def get_last_segment(path: str) -> str:
     return path.split("/")[-1]
+
+
+T = TypeVar("T")
+
+
+def batch_sequence(
+    sequence: Union[Sequence[T], Generator[T, None, None]], batch_size: int
+) -> Generator[List[T], None, None]:
+    batch = []
+    for val in sequence:
+        batch.append(val)
+        if len(batch) == batch_size:
+            yield batch
+            batch = []
+    if batch:
+        yield batch
+
+
+def flatten_sequence(
+    sequence: Union[Sequence[Sequence[T]], Generator[Sequence[T], None, None]],
+) -> Generator[T, None, None]:
+    for sublist in sequence:
+        for item in sublist:
+            yield item

From e6d8cfba6e20e2955fcc1b4e62d9faaed7b3e267 Mon Sep 17 00:00:00 2001
From: Conrado Silva Miranda <csilvamiranda@cloudera.com>
Date: Thu, 21 Nov 2024 14:02:48 -0800
Subject: [PATCH 3/7] try to fix deletion

---
 llm-service/app/ai/indexing/index.py             |  4 ++--
 llm-service/app/services/qdrant.py               | 15 ++++++++-------
 .../app/services/rag_qdrant_vector_store.py      | 16 +++++++++++++++-
 llm-service/app/services/vector_store.py         |  4 ++++
 .../app/tests/routers/index/test_data_source.py  | 10 ++++++----
 5 files changed, 35 insertions(+), 14 deletions(-)

diff --git a/llm-service/app/ai/indexing/index.py b/llm-service/app/ai/indexing/index.py
index 47f39d01d..1fd2c5c57 100644
--- a/llm-service/app/ai/indexing/index.py
+++ b/llm-service/app/ai/indexing/index.py
@@ -87,7 +87,7 @@ def __init__(
         self.embedding_model = embedding_model
         self.chunks_vector_store = chunks_vector_store
 
-    def index_file(self, file_path: Path, file_id: str) -> None:
+    def index_file(self, file_path: Path, document_id: str) -> None:
         logger.debug(f"Indexing file: {file_path}")
 
         file_extension = os.path.splitext(file_path)[1]
@@ -99,7 +99,7 @@ def index_file(self, file_path: Path, file_id: str) -> None:
 
         logger.debug(f"Parsing file: {file_path}")
 
-        documents = self._documents_in_file(reader, file_path, file_id)
+        documents = self._documents_in_file(reader, file_path, document_id)
         if file_extension in CHUNKABLE_FILE_EXTENSIONS:
             logger.debug(f"Chunking file: {file_path}")
             chunks = [
diff --git a/llm-service/app/services/qdrant.py b/llm-service/app/services/qdrant.py
index 314264359..2950ed5e1 100644
--- a/llm-service/app/services/qdrant.py
+++ b/llm-service/app/services/qdrant.py
@@ -46,6 +46,12 @@
 from llama_index.core.indices.vector_store import VectorIndexRetriever
 from llama_index.core.query_engine import RetrieverQueryEngine
 from llama_index.core.response_synthesizers import get_response_synthesizer
+from llama_index.core.vector_stores.types import (
+    FilterOperator,
+    MetadataFilter,
+    MetadataFilters,
+    VectorStoreQuery,
+)
 
 from ..rag_types import RagPredictConfiguration
 from . import models, rag_vector_store
@@ -78,14 +84,9 @@ def delete(data_source_id: int) -> None:
 
 
 def delete_document(data_source_id: int, document_id: str) -> None:
-    vector_store = rag_vector_store.create_rag_vector_store(
-        data_source_id
-    ).access_vector_store()
-    index = VectorStoreIndex.from_vector_store(
-        vector_store=vector_store,
-        embed_model=models.get_embedding_model(),
+    rag_vector_store.create_rag_vector_store(data_source_id).delete_document(
+        document_id
     )
-    index.delete_ref_doc(document_id)
 
 
 def query(
diff --git a/llm-service/app/services/rag_qdrant_vector_store.py b/llm-service/app/services/rag_qdrant_vector_store.py
index 18fd1cf55..ae3d4c26f 100644
--- a/llm-service/app/services/rag_qdrant_vector_store.py
+++ b/llm-service/app/services/rag_qdrant_vector_store.py
@@ -39,9 +39,10 @@
 import os
 
 import qdrant_client
+import qdrant_client.http.models as rest
 from llama_index.core.vector_stores.types import BasePydanticVectorStore
 from llama_index.vector_stores.qdrant import QdrantVectorStore
-from qdrant_client.http.models import CountResult
+from qdrant_client.http.models import CountResult, FieldCondition, MatchValue
 
 from .vector_store import VectorStore
 
@@ -87,3 +88,16 @@ def _create_qdrant_clients(
     def access_vector_store(self) -> BasePydanticVectorStore:
         vector_store = QdrantVectorStore(self.table_name, self.client, self.aclient)
         return vector_store
+
+    def delete_document(self, document_id: str) -> None:
+        result = self.client.delete(
+            self.table_name,
+            points_selector=rest.Filter(
+                must=[
+                    FieldCondition(
+                        key="metadata.document_id", match=MatchValue(value=document_id)
+                    )
+                ]
+            ),
+        )
+        print(result)
diff --git a/llm-service/app/services/vector_store.py b/llm-service/app/services/vector_store.py
index 77fd31f29..515b5950e 100644
--- a/llm-service/app/services/vector_store.py
+++ b/llm-service/app/services/vector_store.py
@@ -61,3 +61,7 @@ def access_vector_store(self) -> BasePydanticVectorStore:
     @abstractmethod
     def exists(self) -> bool:
         """Does the vector store exist?"""
+
+    @abstractmethod
+    def delete_document(self, document_id: str) -> None:
+        """Delete a document from the vector store"""
diff --git a/llm-service/app/tests/routers/index/test_data_source.py b/llm-service/app/tests/routers/index/test_data_source.py
index b6450f46f..5498d6237 100644
--- a/llm-service/app/tests/routers/index/test_data_source.py
+++ b/llm-service/app/tests/routers/index/test_data_source.py
@@ -74,7 +74,7 @@ def test_create_document(
         assert document_id is not None
         index = get_vector_store_index(data_source_id)
         vectors = index.vector_store.query(
-            VectorStoreQuery(query_embedding=[0.66] * 1024, doc_ids=[document_id])
+            VectorStoreQuery(query_embedding=[0.66] * 1024)
         )
         assert len(vectors.nodes or []) == 1
 
@@ -93,7 +93,7 @@ def test_delete_data_source(
 
         index = get_vector_store_index(data_source_id)
         vectors = index.vector_store.query(
-            VectorStoreQuery(query_embedding=[0.66] * 1024, doc_ids=[document_id])
+            VectorStoreQuery(query_embedding=[0.66] * 1024)
         )
         assert len(vectors.nodes or []) == 1
 
@@ -122,9 +122,11 @@ def test_delete_document(
 
         index = get_vector_store_index(data_source_id)
         vectors = index.vector_store.query(
-            VectorStoreQuery(query_embedding=[0.2] * 1024, doc_ids=[document_id])
+            VectorStoreQuery(query_embedding=[0.2] * 1024)
         )
         assert len(vectors.nodes or []) == 1
+        print(document_id)
+        print("\n" * 10)
 
         response = client.delete(
             f"/data_sources/{data_source_id}/documents/{document_id}"
@@ -133,7 +135,7 @@ def test_delete_document(
 
         index = get_vector_store_index(data_source_id)
         vectors = index.vector_store.query(
-            VectorStoreQuery(query_embedding=[0.2] * 1024, doc_ids=[document_id])
+            VectorStoreQuery(query_embedding=[0.2] * 1024)
         )
         assert len(vectors.nodes or []) == 0
 

From d287d35d48574ed7d9c14afc49d153869a72a010 Mon Sep 17 00:00:00 2001
From: Conrado Silva Miranda <csilvamiranda@cloudera.com>
Date: Thu, 21 Nov 2024 14:11:06 -0800
Subject: [PATCH 4/7] keep document id

---
 llm-service/app/ai/indexing/index.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/llm-service/app/ai/indexing/index.py b/llm-service/app/ai/indexing/index.py
index 1fd2c5c57..fcbc822ff 100644
--- a/llm-service/app/ai/indexing/index.py
+++ b/llm-service/app/ai/indexing/index.py
@@ -130,13 +130,14 @@ def index_file(self, file_path: Path, document_id: str) -> None:
         logger.debug(f"Indexing file: {file_path} completed")
 
     def _documents_in_file(
-        self, reader: BaseReader, file_path: Path, file_id: str
+        self, reader: BaseReader, file_path: Path, document_id: str
     ) -> List[Document]:
         documents = reader.load_data(file_path)
 
         for i, document in enumerate(documents):
+            document.id_ = document_id
             document.metadata["file_name"] = os.path.basename(file_path)
-            document.metadata["document_id"] = file_id
+            document.metadata["document_id"] = document_id
             document.metadata["document_part_number"] = i
             document.metadata["data_source_id"] = self.data_source_id
 

From 4acff7cb4d3b5da6799c549f6cb170625b82567b Mon Sep 17 00:00:00 2001
From: Conrado Silva Miranda <csilvamiranda@cloudera.com>
Date: Thu, 21 Nov 2024 14:12:16 -0800
Subject: [PATCH 5/7] revert delete changes

---
 llm-service/app/services/qdrant.py | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/llm-service/app/services/qdrant.py b/llm-service/app/services/qdrant.py
index 2950ed5e1..314264359 100644
--- a/llm-service/app/services/qdrant.py
+++ b/llm-service/app/services/qdrant.py
@@ -46,12 +46,6 @@
 from llama_index.core.indices.vector_store import VectorIndexRetriever
 from llama_index.core.query_engine import RetrieverQueryEngine
 from llama_index.core.response_synthesizers import get_response_synthesizer
-from llama_index.core.vector_stores.types import (
-    FilterOperator,
-    MetadataFilter,
-    MetadataFilters,
-    VectorStoreQuery,
-)
 
 from ..rag_types import RagPredictConfiguration
 from . import models, rag_vector_store
@@ -84,9 +78,14 @@ def delete(data_source_id: int) -> None:
 
 
 def delete_document(data_source_id: int, document_id: str) -> None:
-    rag_vector_store.create_rag_vector_store(data_source_id).delete_document(
-        document_id
+    vector_store = rag_vector_store.create_rag_vector_store(
+        data_source_id
+    ).access_vector_store()
+    index = VectorStoreIndex.from_vector_store(
+        vector_store=vector_store,
+        embed_model=models.get_embedding_model(),
     )
+    index.delete_ref_doc(document_id)
 
 
 def query(

From 0582a03dcfec4d506039fe77a0a6e0c725fe2191 Mon Sep 17 00:00:00 2001
From: Conrado Silva Miranda <csilvamiranda@cloudera.com>
Date: Thu, 21 Nov 2024 14:12:43 -0800
Subject: [PATCH 6/7] remove delete_document

---
 llm-service/app/services/rag_qdrant_vector_store.py | 13 -------------
 llm-service/app/services/vector_store.py            |  4 ----
 2 files changed, 17 deletions(-)

diff --git a/llm-service/app/services/rag_qdrant_vector_store.py b/llm-service/app/services/rag_qdrant_vector_store.py
index ae3d4c26f..b9b934ff1 100644
--- a/llm-service/app/services/rag_qdrant_vector_store.py
+++ b/llm-service/app/services/rag_qdrant_vector_store.py
@@ -88,16 +88,3 @@ def _create_qdrant_clients(
     def access_vector_store(self) -> BasePydanticVectorStore:
         vector_store = QdrantVectorStore(self.table_name, self.client, self.aclient)
         return vector_store
-
-    def delete_document(self, document_id: str) -> None:
-        result = self.client.delete(
-            self.table_name,
-            points_selector=rest.Filter(
-                must=[
-                    FieldCondition(
-                        key="metadata.document_id", match=MatchValue(value=document_id)
-                    )
-                ]
-            ),
-        )
-        print(result)
diff --git a/llm-service/app/services/vector_store.py b/llm-service/app/services/vector_store.py
index 515b5950e..77fd31f29 100644
--- a/llm-service/app/services/vector_store.py
+++ b/llm-service/app/services/vector_store.py
@@ -61,7 +61,3 @@ def access_vector_store(self) -> BasePydanticVectorStore:
     @abstractmethod
     def exists(self) -> bool:
         """Does the vector store exist?"""
-
-    @abstractmethod
-    def delete_document(self, document_id: str) -> None:
-        """Delete a document from the vector store"""

From 3acfc15b171de53eb41ca70900ba0867be1fd2a9 Mon Sep 17 00:00:00 2001
From: Conrado Silva Miranda <csilvamiranda@cloudera.com>
Date: Thu, 21 Nov 2024 14:14:11 -0800
Subject: [PATCH 7/7] fix lint

---
 llm-service/app/services/rag_qdrant_vector_store.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/llm-service/app/services/rag_qdrant_vector_store.py b/llm-service/app/services/rag_qdrant_vector_store.py
index 827f3daa2..f7b78f681 100644
--- a/llm-service/app/services/rag_qdrant_vector_store.py
+++ b/llm-service/app/services/rag_qdrant_vector_store.py
@@ -39,10 +39,9 @@
 import os
 
 import qdrant_client
-import qdrant_client.http.models as rest
 from llama_index.core.vector_stores.types import BasePydanticVectorStore
 from llama_index.vector_stores.qdrant import QdrantVectorStore
-from qdrant_client.http.models import CountResult, FieldCondition, MatchValue
+from qdrant_client.http.models import CountResult
 
 from .vector_store import VectorStore