-
Notifications
You must be signed in to change notification settings - Fork 6
a few tweaks, fix test & a couple bugs #29
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -57,13 +57,7 @@ | |
".txt": NopReader, | ||
".md": NopReader, | ||
} | ||
CHUNKABLE_FILE_EXTENSIONS = set( | ||
[ | ||
".pdf", | ||
".txt", | ||
".md", | ||
] | ||
) | ||
CHUNKABLE_FILE_EXTENSIONS = {".pdf", ".txt", ".md"} | ||
|
||
@dataclass | ||
class NotSupportedFileExtensionError(Exception): | ||
|
@@ -101,6 +95,7 @@ def index_file(self, file_path: str, file_id: str): | |
|
||
for chunk, embedding in zip(chunks, embeddings): | ||
chunk.embedding = embedding | ||
chunk.metadata["file_name"] = os.path.basename(file_path) | ||
|
||
logger.debug(f"Adding {len(chunks)} chunks to vector store") | ||
chunks_vector_store = self.chunks_vector_store.access_vector_store() | ||
|
@@ -113,6 +108,7 @@ def _documents_in_file(self, reader: BaseReader, file_path: str, file_id: str) - | |
|
||
for i, document in enumerate(documents): | ||
# Update the document metadata | ||
document.id_ = file_id | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. No need to couple them, right? Is there a reason why we need them to match? |
||
document.metadata["file_id"] = file_id | ||
document.metadata["document_part_number"] = i | ||
document.metadata["data_source_id"] = self.data_source_id | ||
|
@@ -124,6 +120,7 @@ def _chunks_in_document(self, document: Document) -> List[BaseNode]: | |
|
||
for j, chunk in enumerate(chunks): | ||
chunk.metadata["file_id"] = document.metadata["file_id"] | ||
chunk.metadata["document_id"] = document.metadata["file_id"] | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Not sure why? Maybe |
||
chunk.metadata["document_part_number"] = document.metadata["document_part_number"] | ||
chunk.metadata["chunk_number"] = j | ||
chunk.metadata["data_source_id"] = document.metadata["data_source_id"] | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -60,7 +60,8 @@ def __init__( | |
api_base=api_base, | ||
messages_to_prompt=messages_to_prompt, | ||
completion_to_prompt=completion_to_prompt, | ||
default_headers=default_headers) | ||
default_headers=default_headers, | ||
context=context) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This fix is actually on |
||
self.context = context | ||
|
||
@property | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'm not sure this is true unless we save the file to S3 under its original file name. I thought we saved it under the UUID.