Skip to content

Commit c18df5f

Browse files
feat: Add support to handle bytes for vector retrieval (#1003)
Co-authored-by: Wendong <w3ndong.fan@gmail.com>
Co-authored-by: Wendong-Fan <133094783+Wendong-Fan@users.noreply.github.com>
1 parent 8a2f0b7 commit c18df5f

File tree

2 files changed

+67
-17
lines changed

2 files changed

+67
-17
lines changed

camel/loaders/unstructured_io.py

Lines changed: 37 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
import uuid
1515
import warnings
1616
from typing import (
17+
IO,
1718
Any,
1819
Dict,
1920
List,
@@ -108,7 +109,7 @@ def parse_file_or_url(
108109
specified.
109110
110111
Notes:
111-
Available document types:
112+
Supported file types:
112113
"csv", "doc", "docx", "epub", "image", "md", "msg", "odt",
113114
"org", "pdf", "ppt", "pptx", "rtf", "rst", "tsv", "xlsx".
114115
@@ -152,6 +153,41 @@ def parse_file_or_url(
152153
warnings.warn(f"Failed to partition the file: {input_path}")
153154
return None
154155

156+
@staticmethod
def parse_bytes(
    file: IO[bytes], **kwargs: Any
) -> Union[List[Element], None]:
    r"""Parses a bytes stream and converts its contents into elements.

    Args:
        file (IO[bytes]): The file in bytes format to be parsed.
        **kwargs: Extra kwargs passed to the partition function.

    Returns:
        Union[List[Element], None]: List of elements after parsing the file
            if successful, otherwise `None`.

    Notes:
        Supported file types:
            "csv", "doc", "docx", "epub", "image", "md", "msg", "odt",
            "org", "pdf", "ppt", "pptx", "rtf", "rst", "tsv", "xlsx".

    References:
        https://docs.unstructured.io/open-source/core-functionality/partitioning
    """
    # Imported lazily so the heavy `unstructured` dependency is only
    # loaded when parsing is actually requested (matches the pattern
    # used by `parse_file_or_url` in this module).
    from unstructured.partition.auto import partition

    try:
        # Let `partition` auto-detect the file type from the byte stream.
        elements = partition(file=file, **kwargs)
        return elements
    except Exception as e:
        # Broad catch is deliberate: parsing is best-effort and failures
        # degrade to `None`, consistent with `parse_file_or_url`.
        # `warnings` is already imported at module level — no local
        # re-import needed.
        warnings.warn(f"Failed to partition the file stream: {e}")
        return None
190+
155191
@staticmethod
156192
def clean_text_data(
157193
text: str,

camel/retrievers/vector_retriever.py

Lines changed: 30 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313
# =========== Copyright 2023 @ CAMEL-AI.org. All Rights Reserved. ===========
1414
import os
1515
import warnings
16-
from typing import Any, Dict, List, Optional, Union
16+
from typing import IO, Any, Dict, List, Optional, Union
1717
from urllib.parse import urlparse
1818

1919
from camel.embeddings import BaseEmbedding, OpenAIEmbedding
@@ -72,26 +72,34 @@ def __init__(
7272

7373
def process(
7474
self,
75-
content: Union[str, Element],
75+
content: Union[str, Element, IO[bytes]],
7676
chunk_type: str = "chunk_by_title",
7777
max_characters: int = 500,
78+
embed_batch: int = 50,
79+
should_chunk: bool = True,
7880
**kwargs: Any,
7981
) -> None:
80-
r"""Processes content from a file or URL, divides it into chunks by
81-
using `Unstructured IO`, and stores their embeddings in the specified
82-
vector storage.
82+
r"""Processes content from local file path, remote URL, string
83+
content, Element object, or a binary file object, divides it into
84+
chunks by using `Unstructured IO`, and stores their embeddings in the
85+
specified vector storage.
8386
8487
Args:
85-
content (Union[str, Element]): Local file path, remote URL,
86-
string content or Element object.
88+
content (Union[str, Element, IO[bytes]]): Local file path, remote
89+
URL, string content, Element object, or a binary file object.
8790
chunk_type (str): Type of chunking going to apply. Defaults to
8891
"chunk_by_title".
8992
max_characters (int): Max number of characters in each chunk.
9093
Defaults to `500`.
94+
embed_batch (int): Size of batch for embeddings. Defaults to `50`.
95+
should_chunk (bool): If True, divide the content into chunks,
96+
otherwise skip chunking. Defaults to True.
9197
**kwargs (Any): Additional keyword arguments for content parsing.
9298
"""
9399
if isinstance(content, Element):
94100
elements = [content]
101+
elif isinstance(content, IO):
102+
elements = self.uio.parse_bytes(file=content, **kwargs) or []
95103
else:
96104
# Check if the content is URL
97105
parsed_url = urlparse(content)
@@ -100,20 +108,26 @@ def process(
100108
elements = self.uio.parse_file_or_url(content, **kwargs) or []
101109
else:
102110
elements = [self.uio.create_element_from_text(text=content)]
103-
if elements:
104-
chunks = self.uio.chunk_elements(
105-
chunk_type=chunk_type,
106-
elements=elements,
107-
max_characters=max_characters,
108-
)
109111
if not elements:
110112
warnings.warn(
111113
f"No elements were extracted from the content: {content}"
112114
)
113115
return
114-
# Iterate to process and store embeddings, set batch of 50
115-
for i in range(0, len(chunks), 50):
116-
batch_chunks = chunks[i : i + 50]
116+
117+
# Chunk the content if required
118+
chunks = (
119+
self.uio.chunk_elements(
120+
chunk_type=chunk_type,
121+
elements=elements,
122+
max_characters=max_characters,
123+
)
124+
if should_chunk
125+
else elements
126+
)
127+
128+
# Process chunks in batches and store embeddings
129+
for i in range(0, len(chunks), embed_batch):
130+
batch_chunks = chunks[i : i + embed_batch]
117131
batch_vectors = self.embedding_model.embed_list(
118132
objs=[str(chunk) for chunk in batch_chunks]
119133
)

0 commit comments

Comments
 (0)