Skip to content

A few tweaks, fix a test, and a couple of bugs #29

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Nov 21, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 4 additions & 7 deletions llm-service/app/ai/indexing/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,13 +57,7 @@
".txt": NopReader,
".md": NopReader,
}
CHUNKABLE_FILE_EXTENSIONS = set(
[
".pdf",
".txt",
".md",
]
)
CHUNKABLE_FILE_EXTENSIONS = {".pdf", ".txt", ".md"}

@dataclass
class NotSupportedFileExtensionError(Exception):
Expand Down Expand Up @@ -101,6 +95,7 @@ def index_file(self, file_path: str, file_id: str):

for chunk, embedding in zip(chunks, embeddings):
chunk.embedding = embedding
chunk.metadata["file_name"] = os.path.basename(file_path)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not sure this is true unless we save with the original file name to S3. I thought we saved with the UUID.


logger.debug(f"Adding {len(chunks)} chunks to vector store")
chunks_vector_store = self.chunks_vector_store.access_vector_store()
Expand All @@ -113,6 +108,7 @@ def _documents_in_file(self, reader: BaseReader, file_path: str, file_id: str) -

for i, document in enumerate(documents):
# Update the document metadata
document.id_ = file_id
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No need to couple them, right? Is there a reason why we need them to match?

document.metadata["file_id"] = file_id
document.metadata["document_part_number"] = i
document.metadata["data_source_id"] = self.data_source_id
Expand All @@ -124,6 +120,7 @@ def _chunks_in_document(self, document: Document) -> List[BaseNode]:

for j, chunk in enumerate(chunks):
chunk.metadata["file_id"] = document.metadata["file_id"]
chunk.metadata["document_id"] = document.metadata["file_id"]
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not sure why? Maybe document_id on both sides?

chunk.metadata["document_part_number"] = document.metadata["document_part_number"]
chunk.metadata["chunk_number"] = j
chunk.metadata["data_source_id"] = document.metadata["data_source_id"]
Expand Down
9 changes: 4 additions & 5 deletions llm-service/app/routers/index/data_source/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,6 @@
# DATA.
# ##############################################################################

import http
import logging
import os
import tempfile
Expand All @@ -39,8 +38,8 @@
from .... import exceptions
from ....services import doc_summaries, qdrant, s3
from ....ai.indexing.index import Indexer
from ....services.rag_vector_store import create_rag_vector_store
from ....services.models import get_embedding_model
from ....services import rag_vector_store
from ....services import models
from llama_index.core.node_parser import SentenceSplitter

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -143,7 +142,7 @@ def download_and_index(
chunk_size=request.configuration.chunk_size,
chunk_overlap=int(request.configuration.chunk_overlap * 0.01 * request.configuration.chunk_size),
),
embedding_model=get_embedding_model(),
chunks_vector_store=create_rag_vector_store(data_source_id)
embedding_model=models.get_embedding_model(),
chunks_vector_store=rag_vector_store.create_rag_vector_store(data_source_id)
)
indexer.index_file(file_path, request.document_id)
3 changes: 2 additions & 1 deletion llm-service/app/services/CaiiModel.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,8 @@ def __init__(
api_base=api_base,
messages_to_prompt=messages_to_prompt,
completion_to_prompt=completion_to_prompt,
default_headers=default_headers)
default_headers=default_headers,
context=context)
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this fix is actually on main, I think

self.context = context

@property
Expand Down
4 changes: 2 additions & 2 deletions llm-service/app/tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@
from app.main import app
from app.services import models, rag_vector_store
from app.services.rag_qdrant_vector_store import RagQdrantVectorStore

from app.services.utils import get_last_segment

@pytest.fixture
def aws_region() -> str:
Expand Down Expand Up @@ -92,7 +92,7 @@ def data_source_id() -> int:
@pytest.fixture
def index_document_request_body(data_source_id, s3_object) -> dict[str, Any]:
return {
"document_id": s3_object.key,
"document_id": get_last_segment(s3_object.key),
"data_source_id": data_source_id,
"s3_bucket_name": s3_object.bucket_name,
"s3_document_key": s3_object.key,
Expand Down
2 changes: 0 additions & 2 deletions llm-service/app/tests/routers/index/test_data_source.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,6 @@

from typing import Any

import pytest
from llama_index.core import VectorStoreIndex
from llama_index.core.vector_stores import VectorStoreQuery

Expand All @@ -53,7 +52,6 @@ def get_vector_store_index(data_source_id) -> VectorStoreIndex:
index = VectorStoreIndex.from_vector_store(vector_store, embed_model=models.get_embedding_model())
return index

@pytest.mark.skip(reason="The test and the http handler are getting different vector stores and I'm not sure how they were getting the same one before. Re-enabling this test requires dependencies to be defined more explicitly.")
class TestDocumentIndexing:
@staticmethod
def test_create_document(
Expand Down
1 change: 0 additions & 1 deletion llm-service/app/tests/routers/index/test_doc_summaries.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,6 @@
import pytest
from typing import Any

@pytest.mark.skip(reason="The test and the http handler are getting different vector stores and I'm not sure how they were getting the same one before. Re-enabling this test requires dependencies to be defined more explicitly.")
class TestDocumentSummaries:
@staticmethod
def test_generate_summary(client, index_document_request_body: dict[str, Any], data_source_id, document_id, s3_object) -> None:
Expand Down
Loading