Skip to content

Commit c18df5f

Browse files
feat: Add support to handle bytes for vector retrieval (#1003)
Co-authored-by: Wendong <w3ndong.fan@gmail.com>
Co-authored-by: Wendong-Fan <133094783+Wendong-Fan@users.noreply.github.com>
1 parent 8a2f0b7 commit c18df5f

File tree

2 files changed

+67
-17
lines changed

2 files changed

+67
-17
lines changed

camel/loaders/unstructured_io.py

Lines changed: 37 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
import uuid
1515
import warnings
1616
from typing import (
17+
IO,
1718
Any,
1819
Dict,
1920
List,
@@ -108,7 +109,7 @@ def parse_file_or_url(
108109
specified.
109110
110111
Notes:
111-
Available document types:
112+
Supported file types:
112113
"csv", "doc", "docx", "epub", "image", "md", "msg", "odt",
113114
"org", "pdf", "ppt", "pptx", "rtf", "rst", "tsv", "xlsx".
114115
@@ -152,6 +153,41 @@ def parse_file_or_url(
152153
warnings.warn(f"Failed to partition the file: {input_path}")
153154
return None
154155

156+
@staticmethod
def parse_bytes(
    file: IO[bytes], **kwargs: Any
) -> Union[List[Element], None]:
    r"""Parses a bytes stream and converts its contents into elements.

    Args:
        file (IO[bytes]): The file in bytes format to be parsed.
        **kwargs: Extra kwargs passed to the partition function.

    Returns:
        Union[List[Element], None]: List of elements after parsing the file
            if successful, otherwise `None`.

    Notes:
        Supported file types:
            "csv", "doc", "docx", "epub", "image", "md", "msg", "odt",
            "org", "pdf", "ppt", "pptx", "rtf", "rst", "tsv", "xlsx".

    References:
        https://docs.unstructured.io/open-source/core-functionality/partitioning
    """
    # Imported lazily so the heavy `unstructured` dependency is only
    # loaded when parsing is actually requested (matches the pattern
    # used by `parse_file_or_url` in this module).
    from unstructured.partition.auto import partition

    try:
        # Let `partition` auto-detect the file type from the byte stream.
        elements = partition(file=file, **kwargs)
        return elements
    except Exception as e:
        # Broad catch is deliberate: parsing is best-effort and failures
        # degrade to `None`, consistent with `parse_file_or_url`.
        # `warnings` is already imported at module level — no local
        # re-import needed.
        warnings.warn(f"Failed to partition the file stream: {e}")
        return None
190+
155191
@staticmethod
156192
def clean_text_data(
157193
text: str,

camel/retrievers/vector_retriever.py

Lines changed: 30 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313
# =========== Copyright 2023 @ CAMEL-AI.org. All Rights Reserved. ===========
1414
import os
1515
import warnings
16-
from typing import Any, Dict, List, Optional, Union
16+
from typing import IO, Any, Dict, List, Optional, Union
1717
from urllib.parse import urlparse
1818

1919
from camel.embeddings import BaseEmbedding, OpenAIEmbedding
@@ -72,26 +72,34 @@ def __init__(
7272

7373
def process(
7474
self,
75-
content: Union[str, Element],
75+
content: Union[str, Element, IO[bytes]],
7676
chunk_type: str = "chunk_by_title",
7777
max_characters: int = 500,
78+
embed_batch: int = 50,
79+
should_chunk: bool = True,
7880
**kwargs: Any,
7981
) -> None:
80-
r"""Processes content from a file or URL, divides it into chunks by
81-
using `Unstructured IO`, and stores their embeddings in the specified
82-
vector storage.
82+
r"""Processes content from local file path, remote URL, string
83+
content, Element object, or a binary file object, divides it into
84+
chunks by using `Unstructured IO`, and stores their embeddings in the
85+
specified vector storage.
8386
8487
Args:
85-
content (Union[str, Element]): Local file path, remote URL,
86-
string content or Element object.
88+
content (Union[str, Element, IO[bytes]]): Local file path, remote
89+
URL, string content, Element object, or a binary file object.
8790
chunk_type (str): Type of chunking going to apply. Defaults to
8891
"chunk_by_title".
8992
max_characters (int): Max number of characters in each chunk.
9093
Defaults to `500`.
94+
embed_batch (int): Size of batch for embeddings. Defaults to `50`.
95+
should_chunk (bool): If True, divide the content into chunks,
96+
otherwise skip chunking. Defaults to True.
9197
**kwargs (Any): Additional keyword arguments for content parsing.
9298
"""
9399
if isinstance(content, Element):
94100
elements = [content]
101+
elif isinstance(content, IO):
102+
elements = self.uio.parse_bytes(file=content, **kwargs) or []
95103
else:
96104
# Check if the content is URL
97105
parsed_url = urlparse(content)
@@ -100,20 +108,26 @@ def process(
100108
elements = self.uio.parse_file_or_url(content, **kwargs) or []
101109
else:
102110
elements = [self.uio.create_element_from_text(text=content)]
103-
if elements:
104-
chunks = self.uio.chunk_elements(
105-
chunk_type=chunk_type,
106-
elements=elements,
107-
max_characters=max_characters,
108-
)
109111
if not elements:
110112
warnings.warn(
111113
f"No elements were extracted from the content: {content}"
112114
)
113115
return
114-
# Iterate to process and store embeddings, set batch of 50
115-
for i in range(0, len(chunks), 50):
116-
batch_chunks = chunks[i : i + 50]
116+
117+
# Chunk the content if required
118+
chunks = (
119+
self.uio.chunk_elements(
120+
chunk_type=chunk_type,
121+
elements=elements,
122+
max_characters=max_characters,
123+
)
124+
if should_chunk
125+
else elements
126+
)
127+
128+
# Process chunks in batches and store embeddings
129+
for i in range(0, len(chunks), embed_batch):
130+
batch_chunks = chunks[i : i + embed_batch]
117131
batch_vectors = self.embedding_model.embed_list(
118132
objs=[str(chunk) for chunk in batch_chunks]
119133
)

0 commit comments

Comments
 (0)