From 7a13a6819ff35a6f6197ee837d0eb8ea65e31776 Mon Sep 17 00:00:00 2001 From: Marco Vinciguerra Date: Tue, 4 Jun 2024 12:01:21 +0200 Subject: [PATCH 1/6] feat: refactoring of rag node --- .gitignore | 4 ++++ scrapegraphai/nodes/rag_node.py | 16 ++++++++++++++-- 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/.gitignore b/.gitignore index c1750078..aa84820c 100644 --- a/.gitignore +++ b/.gitignore @@ -23,6 +23,7 @@ docs/source/_static/ venv/ .venv/ .vscode/ +.conda/ # exclude pdf, mp3 *.pdf @@ -38,3 +39,6 @@ lib/ *.html .idea +# extras +cache/ +run_smart_scraper.py diff --git a/scrapegraphai/nodes/rag_node.py b/scrapegraphai/nodes/rag_node.py index 6d26bd1c..e9834693 100644 --- a/scrapegraphai/nodes/rag_node.py +++ b/scrapegraphai/nodes/rag_node.py @@ -3,6 +3,7 @@ """ from typing import List, Optional +import os from langchain.docstore.document import Document from langchain.retrievers import ContextualCompressionRetriever @@ -98,7 +99,18 @@ def execute(self, state: dict) -> dict: ) embeddings = self.embedder_model - retriever = FAISS.from_documents(chunked_docs, embeddings).as_retriever() + #------ + index = FAISS.from_documents(chunked_docs, embeddings) + # Define the folder name + folder_name = "cache" + # Check if the folder exists, if not, create it + if not os.path.exists(folder_name): + os.makedirs(folder_name) + # Save the index to the folder + index.save_local(folder_name) + + retriever = index.as_retriever() + #------ redundant_filter = EmbeddingsRedundantFilter(embeddings=embeddings) # similarity_threshold could be set, now k=20 @@ -121,4 +133,4 @@ def execute(self, state: dict) -> dict: self.logger.info("--- (tokens compressed and vector stored) ---") state.update({self.output[0]: compressed_docs}) - return state + return state \ No newline at end of file From 7ed2fe8ef0d16fd93cb2ff88840bcaa643349e33 Mon Sep 17 00:00:00 2001 From: Marco Vinciguerra Date: Tue, 4 Jun 2024 14:27:46 +0200 Subject: [PATCH 2/6] feat: add dynamic caching --- scrapegraphai/nodes/rag_node.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/scrapegraphai/nodes/rag_node.py b/scrapegraphai/nodes/rag_node.py index e9834693..bc239ebb 100644 --- a/scrapegraphai/nodes/rag_node.py +++ b/scrapegraphai/nodes/rag_node.py @@ -99,18 +99,18 @@ def execute(self, state: dict) -> dict: ) embeddings = self.embedder_model - #------ - index = FAISS.from_documents(chunked_docs, embeddings) - # Define the folder name - folder_name = "cache" - # Check if the folder exists, if not, create it - if not os.path.exists(folder_name): - os.makedirs(folder_name) - # Save the index to the folder - index.save_local(folder_name) + if self.node_config.get("cache", False): + index = FAISS.from_documents(chunked_docs, embeddings) + folder_name = "cache" + + if not os.path.exists(folder_name): + os.makedirs(folder_name) + + index.save_local(folder_name) + else: + index = FAISS.from_documents(chunked_docs, embeddings) retriever = index.as_retriever() - #------ redundant_filter = EmbeddingsRedundantFilter(embeddings=embeddings) # similarity_threshold could be set, now k=20 @@ -133,4 +133,4 @@ def execute(self, state: dict) -> dict: self.logger.info("--- (tokens compressed and vector stored) ---") state.update({self.output[0]: compressed_docs}) - return state \ No newline at end of file + return state From d79036149a3197a385b73553f29df66d36480c38 Mon Sep 17 00:00:00 2001 From: Marco Vinciguerra Date: Thu, 6 Jun 2024 21:35:52 +0200 Subject: [PATCH 3/6] feat: add caching --- scrapegraphai/nodes/rag_node.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/scrapegraphai/nodes/rag_node.py b/scrapegraphai/nodes/rag_node.py index bc239ebb..9c4dc164 100644 --- a/scrapegraphai/nodes/rag_node.py +++ b/scrapegraphai/nodes/rag_node.py @@ -99,14 +99,15 @@ def execute(self, state: dict) -> dict: ) embeddings = self.embedder_model - if self.node_config.get("cache", False): - index = FAISS.from_documents(chunked_docs, embeddings) - folder_name = "cache" + folder_name = "cache" - if not os.path.exists(folder_name): - os.makedirs(folder_name) + if self.node_config.get("cache", False) and not os.path.exists(folder_name): + index = FAISS.from_documents(chunked_docs, embeddings) + os.makedirs(folder_name) index.save_local(folder_name) + if self.node_config.get("cache", False) and os.path.exists(folder_name): + index = FAISS.load_local(folder_path=folder_name, embeddings=embeddings) else: index = FAISS.from_documents(chunked_docs, embeddings) From 543b48764a2923a444df55511d45f51030787ec5 Mon Sep 17 00:00:00 2001 From: Marco Vinciguerra Date: Fri, 7 Jun 2024 09:47:21 +0200 Subject: [PATCH 4/6] add default folder for the cache --- scrapegraphai/nodes/rag_node.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scrapegraphai/nodes/rag_node.py b/scrapegraphai/nodes/rag_node.py index 9c4dc164..23e7cbb8 100644 --- a/scrapegraphai/nodes/rag_node.py +++ b/scrapegraphai/nodes/rag_node.py @@ -99,7 +99,7 @@ def execute(self, state: dict) -> dict: ) embeddings = self.embedder_model - folder_name = "cache" + folder_name = self.node_config.get("cache", "cache") if self.node_config.get("cache", False) and not os.path.exists(folder_name): index = FAISS.from_documents(chunked_docs, embeddings) From c881f64209a86a69ddd3105f5d0360d9ed183490 Mon Sep 17 00:00:00 2001 From: Marco Perini Date: Tue, 11 Jun 2024 22:56:09 +0200 Subject: [PATCH 5/6] fix(cache): correctly pass the node arguments and logging --- requirements-dev.txt | 2 +- scrapegraphai/graphs/abstract_graph.py | 7 +++---- scrapegraphai/nodes/rag_node.py | 16 +++++++++++----- 3 files changed, 15 insertions(+), 10 deletions(-) diff --git a/requirements-dev.txt b/requirements-dev.txt index 13f2257f..d33296d5 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -1,4 +1,4 @@ sphinx==7.1.2 furo==2024.5.6 pytest==8.0.0 -burr[start]==0.19.1 \ No newline at end of file +burr[start]==0.22.1 \ No newline at end of file diff --git a/scrapegraphai/graphs/abstract_graph.py b/scrapegraphai/graphs/abstract_graph.py index 7814efa8..70a81401 100644 --- a/scrapegraphai/graphs/abstract_graph.py +++ b/scrapegraphai/graphs/abstract_graph.py @@ -76,6 +76,7 @@ def __init__(self, prompt: str, config: dict, self.headless = True if config is None else config.get( "headless", True) self.loader_kwargs = config.get("loader_kwargs", {}) + self.cache_path = config.get("cache_path", False) # Create the graph self.graph = self._create_graph() @@ -91,15 +92,13 @@ def __init__(self, prompt: str, config: dict, else: set_verbosity_warning() - self.headless = True if config is None else config.get("headless", True) - self.loader_kwargs = config.get("loader_kwargs", {}) - common_params = { "headless": self.headless, "verbose": self.verbose, "loader_kwargs": self.loader_kwargs, "llm_model": self.llm_model, - "embedder_model": self.embedder_model + "embedder_model": self.embedder_model, + "cache_path": self.cache_path, } self.set_common_params(common_params, overwrite=False) diff --git a/scrapegraphai/nodes/rag_node.py b/scrapegraphai/nodes/rag_node.py index 23e7cbb8..a4f58191 100644 --- a/scrapegraphai/nodes/rag_node.py +++ b/scrapegraphai/nodes/rag_node.py @@ -51,6 +51,7 @@ def __init__( self.verbose = ( False if node_config is None else node_config.get("verbose", False) ) + self.cache_path = node_config.get("cache_path", False) def execute(self, state: dict) -> dict: """ @@ -99,15 +100,20 @@ def execute(self, state: dict) -> dict: ) embeddings = self.embedder_model - folder_name = self.node_config.get("cache", "cache") + folder_name = self.node_config.get("cache_path", "cache") - if self.node_config.get("cache", False) and not os.path.exists(folder_name): + if self.node_config.get("cache_path", False) and not os.path.exists(folder_name): index = FAISS.from_documents(chunked_docs, embeddings) os.makedirs(folder_name) - index.save_local(folder_name) - if self.node_config.get("cache", False) and os.path.exists(folder_name): - index = FAISS.load_local(folder_path=folder_name, embeddings=embeddings) + self.logger.info("--- (indexes saved to cache) ---") + + elif self.node_config.get("cache_path", False) and os.path.exists(folder_name): + index = FAISS.load_local(folder_path=folder_name, + embeddings=embeddings, + allow_dangerous_deserialization=True) + self.logger.info("--- (indexes loaded from cache) ---") + else: index = FAISS.from_documents(chunked_docs, embeddings) From edddb682d06262088885e340b7b73cc70adf9583 Mon Sep 17 00:00:00 2001 From: Marco Perini Date: Tue, 11 Jun 2024 23:01:31 +0200 Subject: [PATCH 6/6] docs(cache): added cache_path param --- docs/source/scrapers/graph_config.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/source/scrapers/graph_config.rst b/docs/source/scrapers/graph_config.rst index 6b046d5b..9e1d49e0 100644 --- a/docs/source/scrapers/graph_config.rst +++ b/docs/source/scrapers/graph_config.rst @@ -13,6 +13,7 @@ Some interesting ones are: - `loader_kwargs`: A dictionary with additional parameters to be passed to the `Loader` class, such as `proxy`. - `burr_kwargs`: A dictionary with additional parameters to enable `Burr` graphical user interface. - `max_images`: The maximum number of images to be analyzed. Useful in `OmniScraperGraph` and `OmniSearchGraph`. +- `cache_path`: The path where the cache files will be saved. If already exists, the cache will be loaded from this path. .. _Burr: