1313# =========== Copyright 2023 @ CAMEL-AI.org. All Rights Reserved. ===========
1414import os
1515import warnings
16- from typing import Any , Dict , List , Optional , Union
16+ from typing import IO , Any , Dict , List , Optional , Union
1717from urllib .parse import urlparse
1818
1919from camel .embeddings import BaseEmbedding , OpenAIEmbedding
@@ -72,26 +72,34 @@ def __init__(
7272
7373 def process (
7474 self ,
75- content : Union [str , Element ],
75+ content : Union [str , Element , IO [ bytes ] ],
7676 chunk_type : str = "chunk_by_title" ,
7777 max_characters : int = 500 ,
78+ embed_batch : int = 50 ,
79+ should_chunk : bool = True ,
7880 ** kwargs : Any ,
7981 ) -> None :
80- r"""Processes content from a file or URL, divides it into chunks by
81- using `Unstructured IO`, and stores their embeddings in the specified
82- vector storage.
82+ r"""Processes content from local file path, remote URL, string
83+ content, Element object, or a binary file object, divides it into
84+ chunks by using `Unstructured IO`, and stores their embeddings in the
85+ specified vector storage.
8386
8487 Args:
85- content (Union[str, Element] ): Local file path, remote URL,
86- string content or Element object.
88+ content (Union[str, Element, IO[bytes]] ): Local file path, remote
89+ URL, string content, Element object, or a binary file object.
8790 chunk_type (str): Type of chunking going to apply. Defaults to
8891 "chunk_by_title".
8992 max_characters (int): Max number of characters in each chunk.
9093 Defaults to `500`.
94+ embed_batch (int): Size of batch for embeddings. Defaults to `50`.
95+ should_chunk (bool): If True, divide the content into chunks,
96+ otherwise skip chunking. Defaults to True.
9197 **kwargs (Any): Additional keyword arguments for content parsing.
9298 """
9399 if isinstance (content , Element ):
94100 elements = [content ]
101+ elif isinstance (content , IO ):
102+ elements = self .uio .parse_bytes (file = content , ** kwargs ) or []
95103 else :
96104 # Check if the content is URL
97105 parsed_url = urlparse (content )
@@ -100,20 +108,26 @@ def process(
100108 elements = self .uio .parse_file_or_url (content , ** kwargs ) or []
101109 else :
102110 elements = [self .uio .create_element_from_text (text = content )]
103- if elements :
104- chunks = self .uio .chunk_elements (
105- chunk_type = chunk_type ,
106- elements = elements ,
107- max_characters = max_characters ,
108- )
109111 if not elements :
110112 warnings .warn (
111113 f"No elements were extracted from the content: { content } "
112114 )
113115 return
114- # Iterate to process and store embeddings, set batch of 50
115- for i in range (0 , len (chunks ), 50 ):
116- batch_chunks = chunks [i : i + 50 ]
116+
117+ # Chunk the content if required
118+ chunks = (
119+ self .uio .chunk_elements (
120+ chunk_type = chunk_type ,
121+ elements = elements ,
122+ max_characters = max_characters ,
123+ )
124+ if should_chunk
125+ else elements
126+ )
127+
128+ # Process chunks in batches and store embeddings
129+ for i in range (0 , len (chunks ), embed_batch ):
130+ batch_chunks = chunks [i : i + embed_batch ]
117131 batch_vectors = self .embedding_model .embed_list (
118132 objs = [str (chunk ) for chunk in batch_chunks ]
119133 )
0 commit comments