From cacd9cde004dace1a7dcc27981245632a78b95f3 Mon Sep 17 00:00:00 2001 From: ekinsenler Date: Tue, 15 Oct 2024 14:23:02 +0300 Subject: [PATCH 1/3] feat: add conditional node structure to the smart_scraper_graph and implemented a structured way to check condition --- requirements.txt | 3 +- scrapegraphai/graphs/smart_scraper_graph.py | 138 +++++++++++------- scrapegraphai/prompts/__init__.py | 2 +- .../prompts/generate_answer_node_prompts.py | 4 + 4 files changed, 90 insertions(+), 57 deletions(-) diff --git a/requirements.txt b/requirements.txt index c72ad1bb..6c7a0326 100644 --- a/requirements.txt +++ b/requirements.txt @@ -18,4 +18,5 @@ undetected-playwright>=0.3.0 semchunk>=1.0.1 langchain-ollama>=0.1.3 simpleeval>=0.9.13 -googlesearch-python>=1.2.5 \ No newline at end of file +googlesearch-python>=1.2.5 +async_timeout>=4.0.3 \ No newline at end of file diff --git a/scrapegraphai/graphs/smart_scraper_graph.py b/scrapegraphai/graphs/smart_scraper_graph.py index 60407624..478f6634 100644 --- a/scrapegraphai/graphs/smart_scraper_graph.py +++ b/scrapegraphai/graphs/smart_scraper_graph.py @@ -2,7 +2,6 @@ SmartScraperGraph Module """ from typing import Optional -import logging from pydantic import BaseModel from .base_graph import BaseGraph from .abstract_graph import AbstractGraph @@ -10,8 +9,10 @@ FetchNode, ParseNode, ReasoningNode, - GenerateAnswerNode + GenerateAnswerNode, + ConditionalNode ) +from ..prompts import REGEN_ADDITIONAL_INFO class SmartScraperGraph(AbstractGraph): """ @@ -89,6 +90,28 @@ def _create_graph(self) -> BaseGraph: } ) + cond_node = None + regen_node = None + if self.config.get("reattempt") is True: + cond_node = ConditionalNode( + input="results", + output=["results"], + node_name="ConditionalNode", + node_config={ + "key_name": "results", + "condition": 'results and results!="NA"', + } + ) + regen_node = GenerateAnswerNode( + input="user_prompt & results", + output=["answer"], + node_config={ + "llm_model": self.llm_model, + "additional_info": REGEN_ADDITIONAL_INFO, + "schema": self.schema, + } + ) + if self.config.get("html_mode") is False: parse_node = ParseNode( input="doc", @@ -99,6 +122,7 @@ def _create_graph(self) -> BaseGraph: } ) + reasoning_node = None if self.config.get("reasoning"): reasoning_node = ReasoningNode( input="user_prompt & (relevant_chunks | parsed_doc | doc)", @@ -109,68 +133,72 @@ def _create_graph(self) -> BaseGraph: "schema": self.schema, } ) + + # Define the graph variation configurations + # (html_mode, reasoning, reattempt) + graph_variation_config = { + (False, True, False): { + "nodes": [fetch_node, parse_node, reasoning_node, generate_answer_node], + "edges": [(fetch_node, parse_node), (parse_node, reasoning_node), (reasoning_node, generate_answer_node)] + }, + (True, True, False): { + "nodes": [fetch_node, reasoning_node, generate_answer_node], + "edges": [(fetch_node, reasoning_node), (reasoning_node, generate_answer_node)] + }, + (True, False, False): { + "nodes": [fetch_node, generate_answer_node], + "edges": [(fetch_node, generate_answer_node)] + }, + (False, False, False): { + "nodes": [fetch_node, parse_node, generate_answer_node], + "edges": [(fetch_node, parse_node), (parse_node, generate_answer_node)] + }, + (False, True, True): { + "nodes": [fetch_node, parse_node, reasoning_node, generate_answer_node, cond_node, regen_node], + "edges": [(fetch_node, parse_node), (parse_node, reasoning_node), (reasoning_node, generate_answer_node), + (generate_answer_node, cond_node), (cond_node, regen_node), (cond_node, None)] + }, + (True, True, True): { + "nodes": [fetch_node, reasoning_node, generate_answer_node, cond_node, regen_node], + "edges": [(fetch_node, reasoning_node), (reasoning_node, generate_answer_node), + (generate_answer_node, cond_node), (cond_node, regen_node), (cond_node, None)] + }, + (True, False, True): { + "nodes": [fetch_node, generate_answer_node, cond_node, regen_node], + "edges": [(fetch_node, generate_answer_node), (generate_answer_node, cond_node), + (cond_node, regen_node), (cond_node, None)] + }, + (False, False, True): { + "nodes": [fetch_node, parse_node, generate_answer_node, cond_node, regen_node], + "edges": [(fetch_node, parse_node), (parse_node, generate_answer_node), + (generate_answer_node, cond_node), (cond_node, regen_node), (cond_node, None)] + } + } - if self.config.get("html_mode") is False and self.config.get("reasoning") is True: - - return BaseGraph( - nodes=[ - fetch_node, - parse_node, - reasoning_node, - generate_answer_node, - ], - edges=[ - (fetch_node, parse_node), - (parse_node, reasoning_node), - (reasoning_node, generate_answer_node) - ], - entry_point=fetch_node, - graph_name=self.__class__.__name__ - ) - - elif self.config.get("html_mode") is True and self.config.get("reasoning") is True: + # Get the current conditions + html_mode = self.config.get("html_mode", False) + reasoning = self.config.get("reasoning", False) + reattempt = self.config.get("reattempt", False) - return BaseGraph( - nodes=[ - fetch_node, - reasoning_node, - generate_answer_node, - ], - edges=[ - (fetch_node, reasoning_node), - (reasoning_node, generate_answer_node) - ], - entry_point=fetch_node, - graph_name=self.__class__.__name__ - ) + # Retrieve the appropriate graph configuration + config = graph_variation_config.get((html_mode, reasoning, reattempt)) - elif self.config.get("html_mode") is True and self.config.get("reasoning") is False: + if config: return BaseGraph( - nodes=[ - fetch_node, - generate_answer_node, - ], - edges=[ - (fetch_node, generate_answer_node) - ], + nodes=config["nodes"], + edges=config["edges"], entry_point=fetch_node, graph_name=self.__class__.__name__ ) + # Default return if no conditions match return BaseGraph( - nodes=[ - fetch_node, - parse_node, - generate_answer_node, - ], - edges=[ - (fetch_node, parse_node), - (parse_node, generate_answer_node) - ], - entry_point=fetch_node, - graph_name=self.__class__.__name__ - ) - + nodes=[fetch_node, parse_node, generate_answer_node], + edges=[(fetch_node, parse_node), (parse_node, generate_answer_node)], + entry_point=fetch_node, + graph_name=self.__class__.__name__ + ) + def run(self) -> str: """ Executes the scraping process and returns the answer to the prompt. diff --git a/scrapegraphai/prompts/__init__.py b/scrapegraphai/prompts/__init__.py index b23374a4..ea916842 100644 --- a/scrapegraphai/prompts/__init__.py +++ b/scrapegraphai/prompts/__init__.py @@ -5,7 +5,7 @@ from .generate_answer_node_prompts import (TEMPLATE_CHUNKS, TEMPLATE_NO_CHUNKS, TEMPLATE_MERGE, TEMPLATE_CHUNKS_MD, - TEMPLATE_NO_CHUNKS_MD, TEMPLATE_MERGE_MD) + TEMPLATE_NO_CHUNKS_MD, TEMPLATE_MERGE_MD, REGEN_ADDITIONAL_INFO) from .generate_answer_node_csv_prompts import (TEMPLATE_CHUKS_CSV, TEMPLATE_NO_CHUKS_CSV, TEMPLATE_MERGE_CSV) diff --git a/scrapegraphai/prompts/generate_answer_node_prompts.py b/scrapegraphai/prompts/generate_answer_node_prompts.py index f9506a7b..a14f27f4 100644 --- a/scrapegraphai/prompts/generate_answer_node_prompts.py +++ b/scrapegraphai/prompts/generate_answer_node_prompts.py @@ -86,3 +86,7 @@ USER QUESTION: {question}\n WEBSITE CONTENT: {context}\n """ + +REGEN_ADDITIONAL_INFO = """ +You are a scraper and you have just failed to scrape the requested information from a website. \n +I want you to try again and provide the missing informations. \n""" From 038d2ef916ff0306c1fa5258a161889281d54235 Mon Sep 17 00:00:00 2001 From: ekinsenler Date: Tue, 15 Oct 2024 15:23:05 +0300 Subject: [PATCH 2/3] refactor cond node structure to fit with the new implementation --- scrapegraphai/graphs/base_graph.py | 7 ++++++- scrapegraphai/graphs/smart_scraper_graph.py | 10 +++++----- scrapegraphai/nodes/conditional_node.py | 2 +- 3 files changed, 12 insertions(+), 7 deletions(-) diff --git a/scrapegraphai/graphs/base_graph.py b/scrapegraphai/graphs/base_graph.py index 5fa9ff34..d3a9cf85 100644 --- a/scrapegraphai/graphs/base_graph.py +++ b/scrapegraphai/graphs/base_graph.py @@ -95,7 +95,10 @@ def _set_conditional_node_edges(self): raise ValueError(f"ConditionalNode '{node.node_name}' must have exactly two outgoing edges.") # Assign true_node_name and false_node_name node.true_node_name = outgoing_edges[0][1].node_name - node.false_node_name = outgoing_edges[1][1].node_name + try: + node.false_node_name = outgoing_edges[1][1].node_name + except: + node.false_node_name = None def _execute_standard(self, initial_state: dict) -> Tuple[dict, list]: """ @@ -221,6 +224,8 @@ def _execute_standard(self, initial_state: dict) -> Tuple[dict, list]: node_names = {node.node_name for node in self.nodes} if result in node_names: current_node_name = result + elif result is None: + current_node_name = None else: raise ValueError(f"Conditional Node returned a node name '{result}' that does not exist in the graph") diff --git a/scrapegraphai/graphs/smart_scraper_graph.py b/scrapegraphai/graphs/smart_scraper_graph.py index 478f6634..03e8cd5d 100644 --- a/scrapegraphai/graphs/smart_scraper_graph.py +++ b/scrapegraphai/graphs/smart_scraper_graph.py @@ -94,16 +94,16 @@ def _create_graph(self) -> BaseGraph: regen_node = None if self.config.get("reattempt") is True: cond_node = ConditionalNode( - input="results", - output=["results"], + input="answer", + output=["answer"], node_name="ConditionalNode", node_config={ - "key_name": "results", - "condition": 'results and results!="NA"', + "key_name": "answer", + "condition": 'not answer or answer=="NA"', } ) regen_node = GenerateAnswerNode( - input="user_prompt & results", + input="user_prompt & answer", output=["answer"], node_config={ "llm_model": self.llm_model, diff --git a/scrapegraphai/nodes/conditional_node.py b/scrapegraphai/nodes/conditional_node.py index 02ff61e9..c5ff58f3 100644 --- a/scrapegraphai/nodes/conditional_node.py +++ b/scrapegraphai/nodes/conditional_node.py @@ -61,7 +61,7 @@ def execute(self, state: dict) -> dict: str: The name of the next node to execute based on the presence of the key. """ - if self.true_node_name is None or self.false_node_name is None: + if self.true_node_name is None: raise ValueError("ConditionalNode's next nodes are not set properly.") if self.condition: From eaa83edc04b803f2a14c7705549fae62c64275fb Mon Sep 17 00:00:00 2001 From: ekinsenler Date: Wed, 16 Oct 2024 15:21:23 +0300 Subject: [PATCH 3/3] update project requirement and add example --- examples/extras/cond_smartscraper_usage.py | 38 ++++++++++++++++++++++ pyproject.toml | 3 +- 2 files changed, 40 insertions(+), 1 deletion(-) create mode 100644 examples/extras/cond_smartscraper_usage.py diff --git a/examples/extras/cond_smartscraper_usage.py b/examples/extras/cond_smartscraper_usage.py new file mode 100644 index 00000000..54c40712 --- /dev/null +++ b/examples/extras/cond_smartscraper_usage.py @@ -0,0 +1,38 @@ +""" +Basic example of scraping pipeline using SmartScraperMultiConcatGraph with Groq +""" + +import os +import json +from dotenv import load_dotenv +from scrapegraphai.graphs import SmartScraperGraph + +load_dotenv() + +# ************************************************ +# Define the configuration for the graph +# ************************************************ + +graph_config = { + "llm": { + "api_key": os.getenv("GROQ_APIKEY"), + "model": "groq/gemma-7b-it", + }, + "verbose": True, + "headless": True, + "reattempt": True #Setting this to True will allow the graph to reattempt the scraping process +} + +# ******************************************************* +# Create the SmartScraperMultiCondGraph instance and run it +# ******************************************************* + +multiple_search_graph = SmartScraperGraph( + prompt="Who is Marco Perini?", + source="https://perinim.github.io/", + schema=None, + config=graph_config +) + +result = multiple_search_graph.run() +print(json.dumps(result, indent=4)) diff --git a/pyproject.toml b/pyproject.toml index 6bade627..7a374c97 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -38,7 +38,8 @@ dependencies = [ "async-timeout>=4.0.3", "transformers>=4.44.2", "googlesearch-python>=1.2.5", - "simpleeval>=1.0.0" + "simpleeval>=1.0.0", + "async_timeout>=4.0.3" ] license = "MIT"