Commit 300fc05

add explore graph
1 parent 0441657 commit 300fc05

3 files changed (+92 -57 lines changed)
Lines changed: 45 additions & 0 deletions
@@ -0,0 +1,45 @@
"""
Basic example of a scraping pipeline using ExploreGraph
"""
from scrapegraphai.graphs import ExploreGraph
from scrapegraphai.utils import prettify_exec_info
# ************************************************
# Define the configuration for the graph
# ************************************************

graph_config = {
    "llm": {
        "model": "ollama/mistral",
        "temperature": 0,
        "format": "json",  # Ollama needs the format to be specified explicitly
        # "base_url": "http://localhost:11434",  # set ollama URL arbitrarily
    },
    "embeddings": {
        "model": "ollama/nomic-embed-text",
        "temperature": 0,
        # "base_url": "http://localhost:11434",  # set ollama URL arbitrarily
    },
    "verbose": True,
    "headless": False
}

# ************************************************
# Create the ExploreGraph instance and run it
# ************************************************

explore_graph = ExploreGraph(
    prompt="List me all the titles",
    # also accepts a string with the already downloaded HTML code
    source="https://www.wired.com/",
    config=graph_config
)

result = explore_graph.run()
print(result)

# ************************************************
# Get graph execution info
# ************************************************

graph_exec_info = explore_graph.get_execution_info()
print(prettify_exec_info(graph_exec_info))
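The inline comment above notes that source also accepts a string with already downloaded HTML instead of a URL. A minimal sketch of that variant, reusing the graph_config defined in the example; the HTML snippet is a made-up placeholder, not part of the commit:

from scrapegraphai.graphs import ExploreGraph  # same import as in the example above

# Made-up HTML standing in for a page that was already downloaded
html_snippet = """
<html><body>
  <h1>First title</h1>
  <h2>Second title</h2>
</body></html>
"""

local_graph = ExploreGraph(
    prompt="List me all the titles",
    source=html_snippet,   # raw HTML instead of a URL
    config=graph_config    # reuses the Ollama config defined above
)
print(local_graph.run())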

scrapegraphai/graphs/explore_graph.py

Lines changed: 35 additions & 40 deletions
@@ -1,14 +1,12 @@
-"""
+"""
 ExploreGraph Module
 """

-from copy import copy, deepcopy
 from typing import Optional
 from pydantic import BaseModel

 from .base_graph import BaseGraph
 from .abstract_graph import AbstractGraph
-from .smart_scraper_graph import SmartScraperGraph

 from ..nodes import (
     FetchNode,
@@ -20,56 +18,50 @@


 class ExploreGraph(AbstractGraph):
-    """
-    ExploreGraph is a scraping pipeline that searches the internet for answers to a given prompt.
-    It only requires a user prompt to search the internet and generate an answer.
+    """
+    SmartScraper is a scraping pipeline that automates the process of
+    extracting information from web pages
+    using a natural language model to interpret and answer prompts.

     Attributes:
-        prompt (str): The user prompt to search the internet.
-        llm_model (dict): The configuration for the language model.
-        embedder_model (dict): The configuration for the embedder model.
-        headless (bool): A flag to run the browser in headless mode.
-        verbose (bool): A flag to display the execution information.
-        model_token (int): The token limit for the language model.
+        prompt (str): The prompt for the graph.
+        source (str): The source of the graph.
+        config (dict): Configuration parameters for the graph.
+        schema (str): The schema for the graph output.
+        llm_model: An instance of a language model client, configured for generating answers.
+        embedder_model: An instance of an embedding model client,
+        configured for generating embeddings.
+        verbose (bool): A flag indicating whether to show print statements during execution.
+        headless (bool): A flag indicating whether to run the graph in headless mode.

     Args:
-        prompt (str): The user prompt to search the internet.
+        prompt (str): The prompt for the graph.
+        source (str): The source of the graph.
         config (dict): Configuration parameters for the graph.
-        schema (Optional[str]): The schema for the graph output.
+        schema (str): The schema for the graph output.

     Example:
-        >>> search_graph = ExploreGraph(
-        ...     "What is Chioggia famous for?",
+        >>> smart_scraper = ExploreGraph(
+        ...     "List me all the attractions in Chioggia.",
+        ...     "https://en.wikipedia.org/wiki/Chioggia",
         ...     {"llm": {"model": "gpt-3.5-turbo"}}
         ... )
-        >>> result = search_graph.run()
+        >>> result = smart_scraper.run()
+        )
     """

-    def __init__(self, prompt: str, config: dict, schema: Optional[BaseModel] = None):
-
-        self.max_results = config.get("max_results", 3)
+    def __init__(self, prompt: str, source: str, config: dict, schema: Optional[BaseModel] = None):
+        super().__init__(prompt, config, source, schema)

-        if all(isinstance(value, str) for value in config.values()):
-            self.copy_config = copy(config)
-        else:
-            self.copy_config = deepcopy(config)
-
-        self.copy_schema = deepcopy(schema)
-
-        super().__init__(prompt, config, schema)
+        self.input_key = "url" if source.startswith("http") else "local_dir"

     def _create_graph(self) -> BaseGraph:
         """
-        Creates the graph of nodes representing the workflow for web scraping and searching.
+        Creates the graph of nodes representing the workflow for web scraping.

         Returns:
-            BaseGraph: A graph instance representing the web scraping and searching workflow.
+            BaseGraph: A graph instance representing the web scraping workflow.
         """
-
-        # ************************************************
-        # Create a SmartScraperGraph instance
-        # ************************************************
-
         fetch_node = FetchNode(
             input="url | local_dir",
             output=["doc", "link_urls", "img_urls"],
@@ -100,7 +92,7 @@ def _create_graph(self) -> BaseGraph:
                 "schema": self.schema,
             }
         )
-
+
         search_link_node = SearchLinkNode(
             input="doc",
             output=[{"link": "description"}],
@@ -114,25 +106,28 @@ def _create_graph(self) -> BaseGraph:
                 fetch_node,
                 parse_node,
                 rag_node,
+                search_link_node,
                 generate_answer_node,
             ],
+
             edges=[
                 (fetch_node, parse_node),
                 (parse_node, rag_node),
-                (rag_node, generate_answer_node),
-                (generate_answer_node, search_link_node)
+                (rag_node, search_link_node),
+                (search_link_node, generate_answer_node)
             ],
             entry_point=fetch_node
         )

     def run(self) -> str:
         """
-        Executes the web scraping and searching process.
+        Executes the scraping process and returns the answer to the prompt.

         Returns:
             str: The answer to the prompt.
         """
-        inputs = {"user_prompt": self.prompt}
+
+        inputs = {"user_prompt": self.prompt, self.input_key: self.source}
         self.final_state, self.execution_info = self.graph.execute(inputs)

         return self.final_state.get("answer", "No answer found.")
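With this change ExploreGraph is constructed like SmartScraperGraph: it takes a source alongside the prompt and dispatches on it to set input_key ("url" for http(s) sources, "local_dir" otherwise), and the node pipeline now runs fetch, parse, rag, search_link, then generate_answer. A minimal usage sketch of the new signature; the gpt-3.5-turbo config and the ArticleTitles Pydantic model are illustrative assumptions, and passing the model class as schema follows the convention used by the library's other graphs rather than anything shown in this commit:

from typing import List
from pydantic import BaseModel
from scrapegraphai.graphs import ExploreGraph

# Hypothetical output schema, only to illustrate the optional schema argument
class ArticleTitles(BaseModel):
    titles: List[str]

graph = ExploreGraph(
    prompt="List me all the titles",
    source="https://www.wired.com/",             # startswith("http"), so input_key becomes "url"
    config={"llm": {"model": "gpt-3.5-turbo"}},  # illustrative LLM config
    schema=ArticleTitles                         # assumption: model class passed as output schema
)
result = graph.run()
print(result)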

scrapegraphai/nodes/search_link_node.py

Lines changed: 12 additions & 17 deletions
@@ -67,17 +67,15 @@ def execute(self, state: dict) -> dict:

         self.logger.info(f"--- Executing {self.node_name} Node ---")

-        # Interpret input keys based on the provided input expression
-        input_keys = self.get_input_keys(state)

-        user_prompt = state[input_keys[0]]
-        parsed_content_chunks = state[input_keys[1]]
+        user_prompt = state.get("user_prompt")
+        links = state.get("link_urls")
+        parsed_content_chunks = state.get("parsed_doc")
         output_parser = JsonOutputParser()

         prompt_relevant_links = """
         You are a website scraper and you have just scraped the following content from a website.
-        Content: {content}
-
+
         You are now tasked with identifying all hyper links within the content that are potentially
         relevant to the user task: {user_prompt}

@@ -87,19 +85,15 @@ def execute(self, state: dict) -> dict:
         Please list only valid URLs and make sure to err on the side of inclusion if it's uncertain
         whether the content at the link is directly relevant.

+        This is the list of links: {links}
+
+        Content: {content}
+
         The output should be a dictionary whose key is the link and whose value is a short description or a slug relevant
         for the link; if no such description or slug can be learnt from the scraped content, just leave it null

-        Output only a list of relevant links in the format:
-        {
-            "link1": "description link 1",
-            "link2": "description link 2",
-            "link3": "description link 3",
-            .
-            .
-            .
-        }
         """
+
         relevant_links = []

         for i, chunk in enumerate(
@@ -111,12 +105,13 @@ def execute(self, state: dict) -> dict:
         ):
             merge_prompt = PromptTemplate(
                 template=prompt_relevant_links,
-                input_variables=["content", "user_prompt"],
+                input_variables=["content", "user_prompt", "links"],
             )
             merge_chain = merge_prompt | self.llm_model | output_parser
             # merge_chain = merge_prompt | self.llm_model
             answer = merge_chain.invoke(
-                {"content": chunk.page_content, "user_prompt": user_prompt}
+                {"content": chunk, "links": links,
+                 "user_prompt": user_prompt}
             )
             relevant_links += answer
         state.update({self.output[0]: relevant_links})
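The prompt used by SearchLinkNode now interpolates three variables: the chunked page content, the list of links taken from the graph state, and the user task. A standalone sketch of that pattern with LangChain's PromptTemplate; the sample content, links, and task are made-up values and the template text is abbreviated, not the node's actual prompt:

from langchain_core.prompts import PromptTemplate

# Made-up stand-ins for the values SearchLinkNode reads from the graph state
chunk_text = "<h2>Chioggia</h2> <a href='/attractions'>Attractions</a> ..."
links = ["https://example.com/attractions", "https://example.com/history"]
user_prompt = "List me all the attractions in Chioggia."

# Abbreviated version of the node's prompt, with the same three input variables
relevant_links_prompt = PromptTemplate(
    template=(
        "You have just scraped the following content: {content}\n"
        "This is the list of links: {links}\n"
        "List only the links relevant to the user task: {user_prompt}"
    ),
    input_variables=["content", "user_prompt", "links"],
)

# The node pipes this prompt into self.llm_model | JsonOutputParser();
# here we just render the filled-in prompt text.
print(relevant_links_prompt.format(
    content=chunk_text, links=links, user_prompt=user_prompt
))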
