feat: add explore graph

VinciGit00 · VinciGit00 · commit 3d8c3a3508e4 · 2024-06-14T22:00:03.000+02:00
diff --git a/README.md b/README.md
@@ -9,7 +9,7 @@
 [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg?style=for-the-badge)](https://opensource.org/licenses/MIT)
 [![](https://dcbadge.vercel.app/api/server/gkxQDAjfeX)](https://discord.gg/gkxQDAjfeX)
 
-ScrapeGraphAI is a *web scraping* python library that uses LLM and direct graph logic to create scraping pipelines for websites and local documents (XML, HTML, JSON, etc.).
+ScrapeGraphAI is a *web scraping* python library that uses LLM and direct graph logic to create scraping pipelines for websites and local documents (XML, HTML, JSON, CSV, etc.).
 
 Just say which information you want to extract and the library will do it for you!
 
diff --git a/scrapegraphai/graphs/explore_graph.py b/scrapegraphai/graphs/explore_graph.py
@@ -19,40 +19,37 @@
 
 class ExploreGraph(AbstractGraph):
     """
-    SmartScraper is a scraping pipeline that automates the process of 
-    extracting information from web pages
-    using a natural language model to interpret and answer prompts.
+    ExploreGraph is a web scraping pipeline that automates the extraction of information
+    from web pages using natural language models to interpret and respond to prompts.
 
     Attributes:
         prompt (str): The prompt for the graph.
-        source (str): The source of the graph.
+        source (str): The source URL or local directory for the graph.
         config (dict): Configuration parameters for the graph.
         schema (str): The schema for the graph output.
-        llm_model: An instance of a language model client, configured for generating answers.
-        embedder_model: An instance of an embedding model client, 
-        configured for generating embeddings.
+        llm_model: An instance of a language model client for generating answers.
+        embedder_model: An instance of an embedding model client for generating embeddings.
         verbose (bool): A flag indicating whether to show print statements during execution.
         headless (bool): A flag indicating whether to run the graph in headless mode.
 
     Args:
         prompt (str): The prompt for the graph.
-        source (str): The source of the graph.
+        source (str): The source URL or local directory for the graph.
         config (dict): Configuration parameters for the graph.
-        schema (str): The schema for the graph output.
+        schema (Optional[BaseModel]): The schema for the graph output.
 
     Example:
-        >>> smart_scraper = ExploreGraph(
+        >>> explore_graph = ExploreGraph(
         ...     "List me all the attractions in Chioggia.",
         ...     "https://en.wikipedia.org/wiki/Chioggia",
         ...     {"llm": {"model": "gpt-3.5-turbo"}}
         ... )
-        >>> result = smart_scraper.run()
-        )
+        >>> result = explore_graph.run()
+        >>> print(result)
     """
 
     def __init__(self, prompt: str, source: str, config: dict, schema: Optional[BaseModel] = None):
         super().__init__(prompt, config, source, schema)
-
         self.input_key = "url" if source.startswith("http") else "local_dir"
 
     def _create_graph(self) -> BaseGraph:
@@ -109,7 +106,6 @@ def _create_graph(self) -> BaseGraph:
                 search_link_node,
                 generate_answer_node,
             ],
-           
             edges=[
                 (fetch_node, parse_node),
                 (parse_node, rag_node),
@@ -126,7 +122,6 @@ def run(self) -> str:
         Returns:
             str: The answer to the prompt.
         """
-
         inputs = {"user_prompt": self.prompt, self.input_key: self.source}
         self.final_state, self.execution_info = self.graph.execute(inputs)
 
diff --git a/scrapegraphai/graphs/parallel_search_graph.py b/scrapegraphai/graphs/parallel_search_graph.py
@@ -0,0 +1,129 @@
+"""
+ParallelSearchGraph Module
+"""
+from copy import copy, deepcopy
+from typing import Optional
+from pydantic import BaseModel
+
+from .base_graph import BaseGraph
+from .abstract_graph import AbstractGraph
+
+from ..nodes import (
+    GraphIteratorNode,
+    ReRankNode,
+    MergeExploreGraphsNode
+)
+
+from ..graphs.explore_graph import ExploreGraph
+
+
+class ParallelSearchGraph(AbstractGraph):
+    """
+    ParallelSearchGraph is a scraping pipeline that chains a ReRankNode, a
+    GraphIteratorNode and a MergeExploreGraphsNode, handing an ExploreGraph
+    instance to the first two nodes.
+
+    Attributes:
+        prompt (str): The prompt for the graph.
+        source (str): The source URL or local directory for the graph.
+        config (dict): Configuration parameters for the graph.
+        schema (Optional[BaseModel]): The schema for the graph output.
+        copy_config (dict): Copy of config passed to the inner ExploreGraph.
+        llm_model: An instance of a language model client for generating answers.
+        embedder_model: An instance of an embedding model client for generating embeddings.
+        verbose (bool): A flag indicating whether to show print statements during execution.
+        headless (bool): A flag indicating whether to run the graph in headless mode.
+
+    Args:
+        prompt (str): The prompt for the graph.
+        source (str): The source URL or local directory for the graph.
+        config (dict): Configuration parameters for the graph.
+        schema (Optional[BaseModel]): The schema for the graph output.
+
+    Example:
+        >>> parallel_search_graph = ParallelSearchGraph(
+        ...     "List me all the attractions in Chioggia.",
+        ...     "https://en.wikipedia.org/wiki/Chioggia",
+        ...     {"llm": {"model": "gpt-3.5-turbo"}}
+        ... )
+        >>> result = parallel_search_graph.run()
+        >>> print(result)
+    """
+
+    def __init__(self, prompt: str, source: str, config: dict, schema: Optional[BaseModel] = None):
+        # Copies are made before the base initializer runs, as in SearchGraph.
+        if all(isinstance(value, str) for value in config.values()):
+            self.copy_config = copy(config)
+        else:
+            self.copy_config = deepcopy(config)
+        self.copy_schema = deepcopy(schema)
+
+        # Single base-class call: the former second call passed schema as source.
+        super().__init__(prompt, config, source, schema)
+
+        self.input_key = "url" if source.startswith("http") else "local_dir"
+
+    def _create_graph(self) -> BaseGraph:
+        """
+        Creates the graph of nodes representing the workflow for web scraping.
+
+        Returns:
+            BaseGraph: ReRankNode -> GraphIteratorNode -> MergeExploreGraphsNode.
+        """
+        explore_graph_instance = ExploreGraph(
+            prompt="",
+            source="",
+            config=self.copy_config,
+        )
+
+        rerank_link_node = ReRankNode(
+            input="user_prompt & urls",
+            output=["results"],
+            node_config={
+                "llm_model": self.llm_model,  # read by ReRankNode.__init__
+                "graph_instance": explore_graph_instance,
+            }
+        )
+
+        graph_iterator_node = GraphIteratorNode(
+            input="user_prompt & urls",
+            output=["results"],
+            node_config={
+                "graph_instance": explore_graph_instance,
+            }
+        )
+
+        merge_explore_graphs_node = MergeExploreGraphsNode(
+            input="user_prompt & results",
+            output=["answer"],
+            node_config={
+                "llm_model": self.llm_model,
+                "schema": self.schema
+            }
+        )
+
+        return BaseGraph(
+            nodes=[
+                rerank_link_node,
+                graph_iterator_node,
+                merge_explore_graphs_node,
+            ],
+            edges=[
+                (rerank_link_node, graph_iterator_node),
+                (graph_iterator_node, merge_explore_graphs_node),
+            ],
+            entry_point=rerank_link_node
+        )
+
+    def run(self) -> str:
+        """
+        Executes the scraping process and returns the answer to the prompt.
+
+        Returns:
+            str: The answer to the prompt, or "No answer found." if the state has none.
+        """
+        # NOTE(review): the nodes declare "urls" as input but this passes "url"/"local_dir" - confirm.
+        inputs = {"user_prompt": self.prompt, self.input_key: self.source}
+        self.final_state, self.execution_info = self.graph.execute(inputs)
+
+        return self.final_state.get("answer", "No answer found.")
diff --git a/scrapegraphai/graphs/search_graph.py b/scrapegraphai/graphs/search_graph.py
@@ -51,7 +51,6 @@ def __init__(self, prompt: str, config: dict, schema: Optional[BaseModel] = None
             self.copy_config = copy(config)
         else:
             self.copy_config = deepcopy(config)
-        
         self.copy_schema = deepcopy(schema)
 
         super().__init__(prompt, config, schema)
diff --git a/scrapegraphai/nodes/__init__.py b/scrapegraphai/nodes/__init__.py
@@ -20,3 +20,5 @@
 from .graph_iterator_node import GraphIteratorNode
 from .merge_answers_node import MergeAnswersNode
 from .generate_answer_omni_node import GenerateAnswerOmniNode
+from .merge_explore_graphs_node import MergeExploreGraphsNode
+from .rerank_node import ReRankNode
diff --git a/scrapegraphai/nodes/graph_iterator_node.py b/scrapegraphai/nodes/graph_iterator_node.py
@@ -5,7 +5,7 @@
 import asyncio
 import copy
 from typing import List, Optional
-
+from ..graphs.explore_graph import ExploreGraph
 from tqdm.asyncio import tqdm
 
 from ..utils.logging import get_logger
diff --git a/scrapegraphai/nodes/merge_explore_graphs_node.py b/scrapegraphai/nodes/merge_explore_graphs_node.py
@@ -0,0 +1,79 @@
+"""
+MergeExploreGraphsNode Module
+"""
+
+# Imports from standard library
+from typing import List, Optional
+
+# Imports from Langchain
+from langchain.prompts import PromptTemplate
+from langchain_core.output_parsers import JsonOutputParser
+from langchain_core.runnables import RunnableParallel
+from tqdm import tqdm
+
+
+from ..utils.logging import get_logger
+from ..models import Ollama
+# Imports from the library
+from .base_node import BaseNode
+from ..helpers import template_chunks, template_no_chunks, template_merge
+
+
+class MergeExploreGraphsNode(BaseNode):
+    """
+    A node that writes the final answer into the state. The merging of the
+    explored graphs' results is not implemented yet: execute() stores a fixed
+    placeholder string under the first output key and does not read the rest
+    of the state.
+
+    Attributes:
+        llm_model: The language model client from node_config, or None without a config.
+        verbose (bool): A flag indicating whether to show print statements during execution.
+
+    Args:
+        input (str): Boolean expression defining the input keys needed from the state.
+        output (List[str]): List of output keys to be updated in the state.
+        node_config (dict): Additional configuration for the node.
+        node_name (str): The unique identifier name for the node, defaulting to "GenerateAnswer".
+    """
+
+    def __init__(
+        self,
+        input: str,
+        output: List[str],
+        node_config: Optional[dict] = None,
+        node_name: str = "GenerateAnswer",
+    ):
+        super().__init__(node_name, "node", input, output, 2, node_config)
+
+        # node_config defaults to None, so guard before indexing it.
+        self.llm_model = None if node_config is None else node_config["llm_model"]
+        if isinstance(self.llm_model, Ollama):
+            self.llm_model.format = "json"
+
+        self.verbose = (
+            True if node_config is None else node_config.get("verbose", False)
+        )
+
+    def execute(self, state: dict) -> dict:
+        """
+        Placeholder implementation of the merge step.
+
+        It logs the node name and stores a fixed string under the first output
+        key. No prompt is built and the language model is not queried yet.
+
+        Args:
+            state (dict): The current state of the graph. It is updated in place
+                            and its existing keys are not read.
+
+        Returns:
+            dict: The same state object, with self.output[0] set to the
+                  placeholder string.
+        """
+
+        self.logger.info(f"--- Executing {self.node_name} Node ---")
+
+        # TODO: replace the placeholder with the real merge of the explored graphs' results.
+        # NOTE(review): "answaer" looks like a typo for "answer"; kept as-is until the real logic lands.
+        state.update({self.output[0]: "answaer"})
+        return state
diff --git a/scrapegraphai/nodes/rerank_node.py b/scrapegraphai/nodes/rerank_node.py
@@ -0,0 +1,69 @@
+"""
+ReRankNode Module
+"""
+
+
+from typing import List, Optional
+from ..utils.logging import get_logger
+from .base_node import BaseNode
+
+
+class ReRankNode(BaseNode):
+    """
+    A node that, judging by its name, is meant to re-rank candidate links before
+    they are explored. The ranking is not implemented yet: execute() only logs
+    two messages and stores a fixed placeholder string under the first output
+    key.
+
+    Attributes:
+        llm_model: The language model client from node_config, or None if absent.
+        embedder_model: The embedding model client from node_config, or None if absent.
+        verbose (bool): A flag indicating whether to show print statements during execution.
+
+    Args:
+        input (str): Boolean expression defining the input keys needed from the state.
+        output (List[str]): List of output keys to be updated in the state.
+        node_config (dict): Additional configuration for the node.
+        node_name (str): The unique identifier name for the node, defaulting to "RAG".
+    """
+
+    def __init__(
+        self,
+        input: str,
+        output: List[str],
+        node_config: Optional[dict] = None,
+        node_name: str = "RAG",
+    ):
+        super().__init__(node_name, "node", input, output, 2, node_config)
+
+        # node_config may be None or lack "llm_model"; read every entry with .get().
+        config = {} if node_config is None else node_config
+        self.llm_model = config.get("llm_model", None)
+        self.embedder_model = config.get("embedder_model", None)
+        self.verbose = config.get("verbose", False)
+
+    def execute(self, state: dict) -> dict:
+        """
+        Placeholder implementation of the re-ranking step.
+
+        It logs two messages and stores a fixed string under the first output
+        key. No documents are compressed and nothing is written to a vector
+        store yet, despite the wording of the second log message.
+
+        Args:
+            state (dict): The current state of the graph. It is updated in place
+                            and its existing keys are not read.
+
+        Returns:
+            dict: The same state object, with self.output[0] set to the
+                  placeholder string "compressed_docs".
+        """
+
+        self.logger.info(f"--- Executing {self.node_name} Node ---")
+
+        # TODO: implement the ranking. The message below describes work that
+        # this method does not perform yet.
+        self.logger.info("--- (tokens compressed and vector stored) ---")
+
+        state.update({self.output[0]: "compressed_docs"})
+        return state