From 073d226723f5f03b960865d07408905b7a506180 Mon Sep 17 00:00:00 2001 From: Marco Vinciguerra Date: Tue, 18 Jun 2024 14:35:13 +0200 Subject: [PATCH 1/8] feat: add new search engine avaiability and new tests --- examples/single_node/search_internet_node.py | 50 +++++++++++++++++ scrapegraphai/nodes/search_internet_node.py | 4 +- scrapegraphai/utils/research_web.py | 3 +- tests/nodes/search_internet_node_test.py | 58 ++++++++++++++++++++ 4 files changed, 113 insertions(+), 2 deletions(-) create mode 100644 examples/single_node/search_internet_node.py create mode 100644 tests/nodes/search_internet_node_test.py diff --git a/examples/single_node/search_internet_node.py b/examples/single_node/search_internet_node.py new file mode 100644 index 00000000..8a8149fa --- /dev/null +++ b/examples/single_node/search_internet_node.py @@ -0,0 +1,50 @@ +""" +Example of custom graph using existing nodes +""" + +from scrapegraphai.models import Ollama +from scrapegraphai.nodes import SearchInternetNode + +# ************************************************ +# Define the configuration for the graph +# ************************************************ + +graph_config = { + "llm": { + "model": "llama3", + "temperature": 0, + "streaming": True + }, + "search_engine": "google", + "max_results": 3, + "verbose": True +} + +# ************************************************ +# Define the node +# ************************************************ + +llm_model = Ollama(graph_config["llm"]) + +search_node = SearchInternetNode( + input="user_input", + output=["search_results"], + node_config={ + "llm_model": llm_model, + "search_engine": graph_config["search_engine"], + "max_results": graph_config["max_results"], + "verbose": graph_config["verbose"] + } +) + +# ************************************************ +# Test the node +# ************************************************ + +state = { + "user_input": "What is the capital of France?" +} + +result = search_node.execute(state) + +print(result) diff --git a/scrapegraphai/nodes/search_internet_node.py b/scrapegraphai/nodes/search_internet_node.py index 9fa4a8f5..59c56975 100644 --- a/scrapegraphai/nodes/search_internet_node.py +++ b/scrapegraphai/nodes/search_internet_node.py @@ -43,6 +43,7 @@ def __init__( self.verbose = ( False if node_config is None else node_config.get("verbose", False) ) + self.search_engine = node_config.get("search_engine", "google") self.max_results = node_config.get("max_results", 3) def execute(self, state: dict) -> dict: @@ -97,7 +98,8 @@ def execute(self, state: dict) -> dict: self.logger.info(f"Search Query: {search_query}") - answer = search_on_web(query=search_query, max_results=self.max_results) + answer = search_on_web(query=search_query, max_results=self.max_results, + search_engine=self.search_engine) if len(answer) == 0: # raise an exception if no answer is found diff --git a/scrapegraphai/utils/research_web.py b/scrapegraphai/utils/research_web.py index a839a680..62ffd2ee 100644 --- a/scrapegraphai/utils/research_web.py +++ b/scrapegraphai/utils/research_web.py @@ -26,7 +26,8 @@ def search_on_web(query: str, search_engine: str = "Google", max_results: int = >>> search_on_web("example query", search_engine="Google", max_results=5) ['http://example.com', 'http://example.org', ...] - This function allows switching between Google and DuckDuckGo to perform internet searches, returning a list of result URLs. + This function allows switching between Google and DuckDuckGo to perform + internet searches, returning a list of result URLs. """ if search_engine.lower() == "google": diff --git a/tests/nodes/search_internet_node_test.py b/tests/nodes/search_internet_node_test.py new file mode 100644 index 00000000..db2cbdee --- /dev/null +++ b/tests/nodes/search_internet_node_test.py @@ -0,0 +1,58 @@ +import unittest +from scrapegraphai.models import Ollama +from scrapegraphai.nodes import SearchInternetNode + +class TestSearchInternetNode(unittest.TestCase): + + def setUp(self): + # Configuration for the graph + self.graph_config = { + "llm": { + "model": "llama3", + "temperature": 0, + "streaming": True + }, + "search_engine": "google", + "max_results": 3, + "verbose": True + } + + # Define the model + self.llm_model = Ollama(self.graph_config["llm"]) + + # Initialize the SearchInternetNode + self.search_node = SearchInternetNode( + input="user_input", + output=["search_results"], + node_config={ + "llm_model": self.llm_model, + "search_engine": self.graph_config["search_engine"], + "max_results": self.graph_config["max_results"], + "verbose": self.graph_config["verbose"] + } + ) + + def test_execute_search_node(self): + # Initial state + state = { + "user_input": "What is the capital of France?" + } + + # Expected output + expected_output = { + "user_input": "What is the capital of France?", + "search_results": [ + "https://en.wikipedia.org/wiki/Paris", + "https://en.wikipedia.org/wiki/France", + "https://en.wikipedia.org/wiki/%C3%8Ele-de-France" + ] + } + + # Execute the node + result = self.search_node.execute(state) + + # Assert the results + self.assertEqual(result, expected_output) + +if __name__ == "__main__": + unittest.main() From aedda448682ce5a921a62e661bffb02478bab75f Mon Sep 17 00:00:00 2001 From: Jason Vertrees Date: Tue, 18 Jun 2024 12:36:50 -0500 Subject: [PATCH 2/8] fix: updated for schema changes docs: updated for schema changes --- examples/ernie/smart_scraper_schema_ernie.py | 39 +++++++++---------- .../smart_scraper_schema_huggingfacehub.py | 27 +++++-------- .../smart_scraper_schema_groq_openai.py | 31 +++++++-------- scrapegraphai/graphs/abstract_graph.py | 2 +- .../graphs/csv_scraper_multi_graph.py | 6 ++- scrapegraphai/graphs/deep_scraper_graph.py | 4 +- scrapegraphai/graphs/json_scraper_graph.py | 4 +- .../graphs/json_scraper_multi_graph.py | 2 +- scrapegraphai/graphs/omni_scraper_graph.py | 4 +- scrapegraphai/graphs/omni_search_graph.py | 2 +- scrapegraphai/graphs/pdf_scraper_graph.py | 4 +- .../graphs/pdf_scraper_multi_graph.py | 2 +- scrapegraphai/graphs/script_creator_graph.py | 4 +- .../graphs/script_creator_multi_graph.py | 6 ++- scrapegraphai/graphs/search_graph.py | 2 +- scrapegraphai/graphs/smart_scraper_graph.py | 4 +- .../graphs/smart_scraper_multi_graph.py | 2 +- scrapegraphai/graphs/speech_graph.py | 4 +- scrapegraphai/graphs/xml_scraper_graph.py | 4 +- .../graphs/xml_scraper_multi_graph.py | 2 +- 20 files changed, 73 insertions(+), 82 deletions(-) diff --git a/examples/ernie/smart_scraper_schema_ernie.py b/examples/ernie/smart_scraper_schema_ernie.py index 65448821..64a74937 100644 --- a/examples/ernie/smart_scraper_schema_ernie.py +++ b/examples/ernie/smart_scraper_schema_ernie.py @@ -2,32 +2,31 @@ Basic example of scraping pipeline using SmartScraper with schema """ -import os, json +import json +import os +from typing import Dict + from dotenv import load_dotenv +from pydantic import BaseModel + from scrapegraphai.graphs import SmartScraperGraph + load_dotenv() # ************************************************ # Define the output schema for the graph # ************************************************ -schema= """ - { - "Projects": [ - "Project #": - { - "title": "...", - "description": "...", - }, - "Project #": - { - "title": "...", - "description": "...", - } - ] - } -""" + +class Project(BaseModel): + title: str + description: str + + +class Projects(BaseModel): + Projects: Dict[str, Project] + # ************************************************ # Define the configuration for the graph @@ -37,7 +36,7 @@ graph_config = { "llm": { - "api_key":openai_key, + "api_key": openai_key, "model": "gpt-3.5-turbo", }, "verbose": True, @@ -51,8 +50,8 @@ smart_scraper_graph = SmartScraperGraph( prompt="List me all the projects with their description", source="https://perinim.github.io/projects/", - schema=schema, - config=graph_config + schema=Projects, + config=graph_config, ) result = smart_scraper_graph.run() diff --git a/examples/huggingfacehub/smart_scraper_schema_huggingfacehub.py b/examples/huggingfacehub/smart_scraper_schema_huggingfacehub.py index 1e0c94d6..784079e4 100644 --- a/examples/huggingfacehub/smart_scraper_schema_huggingfacehub.py +++ b/examples/huggingfacehub/smart_scraper_schema_huggingfacehub.py @@ -4,6 +4,9 @@ import os from dotenv import load_dotenv +from typing import Dict + +from pydantic import BaseModel from scrapegraphai.graphs import SmartScraperGraph from scrapegraphai.utils import prettify_exec_info from langchain_community.llms import HuggingFaceEndpoint @@ -13,22 +16,12 @@ # Define the output schema for the graph # ************************************************ -schema= """ - { - "Projects": [ - "Project #": - { - "title": "...", - "description": "...", - }, - "Project #": - { - "title": "...", - "description": "...", - } - ] - } -""" +class Project(BaseModel): + title: str + description: str + +class Projects(BaseModel): + Projects: Dict[str, Project] ## required environment variable in .env #HUGGINGFACEHUB_API_TOKEN @@ -61,7 +54,7 @@ smart_scraper_graph = SmartScraperGraph( prompt="List me all the projects with their description", source="https://perinim.github.io/projects/", - schema=schema, + schema=Projects, config=graph_config ) result = smart_scraper_graph.run() diff --git a/examples/mixed_models/smart_scraper_schema_groq_openai.py b/examples/mixed_models/smart_scraper_schema_groq_openai.py index 321c71b8..f177cb61 100644 --- a/examples/mixed_models/smart_scraper_schema_groq_openai.py +++ b/examples/mixed_models/smart_scraper_schema_groq_openai.py @@ -2,8 +2,13 @@ Basic example of scraping pipeline using SmartScraper with schema """ -import os, json +import json +import os +from typing import Dict, List + from dotenv import load_dotenv +from pydantic import BaseModel + from scrapegraphai.graphs import SmartScraperGraph from scrapegraphai.utils import prettify_exec_info @@ -13,22 +18,12 @@ # Define the output schema for the graph # ************************************************ -schema= """ - { - "Projects": [ - "Project #": - { - "title": "...", - "description": "...", - }, - "Project #": - { - "title": "...", - "description": "...", - } - ] - } -""" +class Project(BaseModel): + title: str + description: str + +class Projects(BaseModel): + Projects: Dict[str, Project] # ************************************************ # Define the configuration for the graph @@ -60,7 +55,7 @@ prompt="List me all the projects with their description.", # also accepts a string with the already downloaded HTML code source="https://perinim.github.io/projects/", - schema=schema, + schema=Projects, config=graph_config ) diff --git a/scrapegraphai/graphs/abstract_graph.py b/scrapegraphai/graphs/abstract_graph.py index ed61255c..ef188b27 100644 --- a/scrapegraphai/graphs/abstract_graph.py +++ b/scrapegraphai/graphs/abstract_graph.py @@ -39,7 +39,7 @@ class AbstractGraph(ABC): prompt (str): The prompt for the graph. source (str): The source of the graph. config (dict): Configuration parameters for the graph. - schema (str): The schema for the graph output. + schema (BaseModel): The schema for the graph output. llm_model: An instance of a language model client, configured for generating answers. embedder_model: An instance of an embedding model client, configured for generating embeddings. diff --git a/scrapegraphai/graphs/csv_scraper_multi_graph.py b/scrapegraphai/graphs/csv_scraper_multi_graph.py index fd15f49a..716e9aca 100644 --- a/scrapegraphai/graphs/csv_scraper_multi_graph.py +++ b/scrapegraphai/graphs/csv_scraper_multi_graph.py @@ -5,6 +5,8 @@ from copy import copy, deepcopy from typing import List, Optional +from pydantic import BaseModel + from .base_graph import BaseGraph from .abstract_graph import AbstractGraph from .csv_scraper_graph import CSVScraperGraph @@ -32,7 +34,7 @@ class CSVScraperMultiGraph(AbstractGraph): prompt (str): The user prompt to search the internet. source (List[str]): The source of the graph. config (dict): Configuration parameters for the graph. - schema (Optional[str]): The schema for the graph output. + schema (Optional[BaseModel]): The schema for the graph output. Example: >>> search_graph = MultipleSearchGraph( @@ -42,7 +44,7 @@ class CSVScraperMultiGraph(AbstractGraph): >>> result = search_graph.run() """ - def __init__(self, prompt: str, source: List[str], config: dict, schema: Optional[str] = None): + def __init__(self, prompt: str, source: List[str], config: dict, schema: Optional[BaseModel] = None): self.max_results = config.get("max_results", 3) diff --git a/scrapegraphai/graphs/deep_scraper_graph.py b/scrapegraphai/graphs/deep_scraper_graph.py index e9e41771..df04c9ce 100644 --- a/scrapegraphai/graphs/deep_scraper_graph.py +++ b/scrapegraphai/graphs/deep_scraper_graph.py @@ -34,7 +34,7 @@ class DeepScraperGraph(AbstractGraph): prompt (str): The prompt for the graph. source (str): The source of the graph. config (dict): Configuration parameters for the graph. - schema (str): The schema for the graph output. + schema (BaseModel): The schema for the graph output. llm_model: An instance of a language model client, configured for generating answers. embedder_model: An instance of an embedding model client, configured for generating embeddings. @@ -45,7 +45,7 @@ class DeepScraperGraph(AbstractGraph): prompt (str): The prompt for the graph. source (str): The source of the graph. config (dict): Configuration parameters for the graph. - schema (str): The schema for the graph output. + schema (BaseModel): The schema for the graph output. Example: >>> deep_scraper = DeepScraperGraph( diff --git a/scrapegraphai/graphs/json_scraper_graph.py b/scrapegraphai/graphs/json_scraper_graph.py index 09a5f02e..4165a194 100644 --- a/scrapegraphai/graphs/json_scraper_graph.py +++ b/scrapegraphai/graphs/json_scraper_graph.py @@ -23,7 +23,7 @@ class JSONScraperGraph(AbstractGraph): prompt (str): The prompt for the graph. source (str): The source of the graph. config (dict): Configuration parameters for the graph. - schema (str): The schema for the graph output. + schema (BaseModel): The schema for the graph output. llm_model: An instance of a language model client, configured for generating answers. embedder_model: An instance of an embedding model client, configured for generating embeddings. @@ -34,7 +34,7 @@ class JSONScraperGraph(AbstractGraph): prompt (str): The prompt for the graph. source (str): The source of the graph. config (dict): Configuration parameters for the graph. - schema (str): The schema for the graph output. + schema (BaseModel): The schema for the graph output. Example: >>> json_scraper = JSONScraperGraph( diff --git a/scrapegraphai/graphs/json_scraper_multi_graph.py b/scrapegraphai/graphs/json_scraper_multi_graph.py index 2824c416..48fd8217 100644 --- a/scrapegraphai/graphs/json_scraper_multi_graph.py +++ b/scrapegraphai/graphs/json_scraper_multi_graph.py @@ -33,7 +33,7 @@ class JSONScraperMultiGraph(AbstractGraph): prompt (str): The user prompt to search the internet. source (List[str]): The source of the graph. config (dict): Configuration parameters for the graph. - schema (Optional[str]): The schema for the graph output. + schema (Optional[BaseModel]): The schema for the graph output. Example: >>> search_graph = MultipleSearchGraph( diff --git a/scrapegraphai/graphs/omni_scraper_graph.py b/scrapegraphai/graphs/omni_scraper_graph.py index a5eefad2..5b1ad30b 100644 --- a/scrapegraphai/graphs/omni_scraper_graph.py +++ b/scrapegraphai/graphs/omni_scraper_graph.py @@ -29,7 +29,7 @@ class OmniScraperGraph(AbstractGraph): prompt (str): The prompt for the graph. source (str): The source of the graph. config (dict): Configuration parameters for the graph. - schema (str): The schema for the graph output. + schema (BaseModel): The schema for the graph output. llm_model: An instance of a language model client, configured for generating answers. embedder_model: An instance of an embedding model client, configured for generating embeddings. @@ -41,7 +41,7 @@ class OmniScraperGraph(AbstractGraph): prompt (str): The prompt for the graph. source (str): The source of the graph. config (dict): Configuration parameters for the graph. - schema (str): The schema for the graph output. + schema (BaseModel): The schema for the graph output. Example: >>> omni_scraper = OmniScraperGraph( diff --git a/scrapegraphai/graphs/omni_search_graph.py b/scrapegraphai/graphs/omni_search_graph.py index df525949..b6f6df59 100644 --- a/scrapegraphai/graphs/omni_search_graph.py +++ b/scrapegraphai/graphs/omni_search_graph.py @@ -34,7 +34,7 @@ class OmniSearchGraph(AbstractGraph): Args: prompt (str): The user prompt to search the internet. config (dict): Configuration parameters for the graph. - schema (Optional[str]): The schema for the graph output. + schema (Optional[BaseModel]): The schema for the graph output. Example: >>> omni_search_graph = OmniSearchGraph( diff --git a/scrapegraphai/graphs/pdf_scraper_graph.py b/scrapegraphai/graphs/pdf_scraper_graph.py index 41099d8b..89d8018c 100644 --- a/scrapegraphai/graphs/pdf_scraper_graph.py +++ b/scrapegraphai/graphs/pdf_scraper_graph.py @@ -26,7 +26,7 @@ class PDFScraperGraph(AbstractGraph): prompt (str): The prompt for the graph. source (str): The source of the graph. config (dict): Configuration parameters for the graph. - schema (str): The schema for the graph output. + schema (BaseModel): The schema for the graph output. llm_model: An instance of a language model client, configured for generating answers. embedder_model: An instance of an embedding model client, configured for generating embeddings. @@ -38,7 +38,7 @@ class PDFScraperGraph(AbstractGraph): prompt (str): The prompt for the graph. source (str): The source of the graph. config (dict): Configuration parameters for the graph. - schema (str): The schema for the graph output. + schema (BaseModel): The schema for the graph output. Example: >>> pdf_scraper = PDFScraperGraph( diff --git a/scrapegraphai/graphs/pdf_scraper_multi_graph.py b/scrapegraphai/graphs/pdf_scraper_multi_graph.py index e9b5660b..86b2477f 100644 --- a/scrapegraphai/graphs/pdf_scraper_multi_graph.py +++ b/scrapegraphai/graphs/pdf_scraper_multi_graph.py @@ -34,7 +34,7 @@ class PdfScraperMultiGraph(AbstractGraph): prompt (str): The user prompt to search the internet. source (List[str]): The source of the graph. config (dict): Configuration parameters for the graph. - schema (Optional[str]): The schema for the graph output. + schema (Optional[BaseModel]): The schema for the graph output. Example: >>> search_graph = MultipleSearchGraph( diff --git a/scrapegraphai/graphs/script_creator_graph.py b/scrapegraphai/graphs/script_creator_graph.py index ce3fa319..83bef2ab 100644 --- a/scrapegraphai/graphs/script_creator_graph.py +++ b/scrapegraphai/graphs/script_creator_graph.py @@ -23,7 +23,7 @@ class ScriptCreatorGraph(AbstractGraph): prompt (str): The prompt for the graph. source (str): The source of the graph. config (dict): Configuration parameters for the graph. - schema (str): The schema for the graph output. + schema (BaseModel): The schema for the graph output. llm_model: An instance of a language model client, configured for generating answers. embedder_model: An instance of an embedding model client, configured for generating embeddings. @@ -36,7 +36,7 @@ class ScriptCreatorGraph(AbstractGraph): prompt (str): The prompt for the graph. source (str): The source of the graph. config (dict): Configuration parameters for the graph. - schema (str): The schema for the graph output. + schema (BaseModel): The schema for the graph output. Example: >>> script_creator = ScriptCreatorGraph( diff --git a/scrapegraphai/graphs/script_creator_multi_graph.py b/scrapegraphai/graphs/script_creator_multi_graph.py index 2b36f4ed..a415a82c 100644 --- a/scrapegraphai/graphs/script_creator_multi_graph.py +++ b/scrapegraphai/graphs/script_creator_multi_graph.py @@ -5,6 +5,8 @@ from copy import copy, deepcopy from typing import List, Optional +from pydantic import BaseModel + from .base_graph import BaseGraph from .abstract_graph import AbstractGraph from .script_creator_graph import ScriptCreatorGraph @@ -30,7 +32,7 @@ class ScriptCreatorMultiGraph(AbstractGraph): prompt (str): The user prompt to search the internet. source (List[str]): The source of the graph. config (dict): Configuration parameters for the graph. - schema (Optional[str]): The schema for the graph output. + schema (Optional[BaseModel]): The schema for the graph output. Example: >>> script_graph = ScriptCreatorMultiGraph( ... "What is Chioggia famous for?", @@ -41,7 +43,7 @@ class ScriptCreatorMultiGraph(AbstractGraph): >>> result = script_graph.run() """ - def __init__(self, prompt: str, source: List[str], config: dict, schema: Optional[str] = None): + def __init__(self, prompt: str, source: List[str], config: dict, schema: Optional[BaseModel] = None): self.max_results = config.get("max_results", 3) diff --git a/scrapegraphai/graphs/search_graph.py b/scrapegraphai/graphs/search_graph.py index 6bece062..7efcccc2 100644 --- a/scrapegraphai/graphs/search_graph.py +++ b/scrapegraphai/graphs/search_graph.py @@ -33,7 +33,7 @@ class SearchGraph(AbstractGraph): Args: prompt (str): The user prompt to search the internet. config (dict): Configuration parameters for the graph. - schema (Optional[str]): The schema for the graph output. + schema (Optional[BaseModel]): The schema for the graph output. Example: >>> search_graph = SearchGraph( diff --git a/scrapegraphai/graphs/smart_scraper_graph.py b/scrapegraphai/graphs/smart_scraper_graph.py index 9ee0c3cc..cfbfc000 100644 --- a/scrapegraphai/graphs/smart_scraper_graph.py +++ b/scrapegraphai/graphs/smart_scraper_graph.py @@ -26,7 +26,7 @@ class SmartScraperGraph(AbstractGraph): prompt (str): The prompt for the graph. source (str): The source of the graph. config (dict): Configuration parameters for the graph. - schema (str): The schema for the graph output. + schema (BaseModel): The schema for the graph output. llm_model: An instance of a language model client, configured for generating answers. embedder_model: An instance of an embedding model client, configured for generating embeddings. @@ -37,7 +37,7 @@ class SmartScraperGraph(AbstractGraph): prompt (str): The prompt for the graph. source (str): The source of the graph. config (dict): Configuration parameters for the graph. - schema (str): The schema for the graph output. + schema (BaseModel): The schema for the graph output. Example: >>> smart_scraper = SmartScraperGraph( diff --git a/scrapegraphai/graphs/smart_scraper_multi_graph.py b/scrapegraphai/graphs/smart_scraper_multi_graph.py index 996beff1..84e028fc 100644 --- a/scrapegraphai/graphs/smart_scraper_multi_graph.py +++ b/scrapegraphai/graphs/smart_scraper_multi_graph.py @@ -33,7 +33,7 @@ class SmartScraperMultiGraph(AbstractGraph): prompt (str): The user prompt to search the internet. source (List[str]): The source of the graph. config (dict): Configuration parameters for the graph. - schema (Optional[str]): The schema for the graph output. + schema (Optional[BaseModel]): The schema for the graph output. Example: >>> search_graph = MultipleSearchGraph( diff --git a/scrapegraphai/graphs/speech_graph.py b/scrapegraphai/graphs/speech_graph.py index 1058d127..4816a154 100644 --- a/scrapegraphai/graphs/speech_graph.py +++ b/scrapegraphai/graphs/speech_graph.py @@ -28,7 +28,7 @@ class SpeechGraph(AbstractGraph): prompt (str): The prompt for the graph. source (str): The source of the graph. config (dict): Configuration parameters for the graph. - schema (str): The schema for the graph output. + schema (BaseModel): The schema for the graph output. llm_model: An instance of a language model client, configured for generating answers. embedder_model: An instance of an embedding model client, configured for generating embeddings. verbose (bool): A flag indicating whether to show print statements during execution. @@ -39,7 +39,7 @@ class SpeechGraph(AbstractGraph): prompt (str): The prompt for the graph. source (str): The source of the graph. config (dict): Configuration parameters for the graph. - schema (str): The schema for the graph output. + schema (BaseModel): The schema for the graph output. Example: >>> speech_graph = SpeechGraph( diff --git a/scrapegraphai/graphs/xml_scraper_graph.py b/scrapegraphai/graphs/xml_scraper_graph.py index dbab0b73..4513422b 100644 --- a/scrapegraphai/graphs/xml_scraper_graph.py +++ b/scrapegraphai/graphs/xml_scraper_graph.py @@ -24,7 +24,7 @@ class XMLScraperGraph(AbstractGraph): prompt (str): The prompt for the graph. source (str): The source of the graph. config (dict): Configuration parameters for the graph. - schema (str): The schema for the graph output. + schema (BaseModel): The schema for the graph output. llm_model: An instance of a language model client, configured for generating answers. embedder_model: An instance of an embedding model client, configured for generating embeddings. @@ -36,7 +36,7 @@ class XMLScraperGraph(AbstractGraph): prompt (str): The prompt for the graph. source (str): The source of the graph. config (dict): Configuration parameters for the graph. - schema (str): The schema for the graph output. + schema (BaseModel): The schema for the graph output. Example: >>> xml_scraper = XMLScraperGraph( diff --git a/scrapegraphai/graphs/xml_scraper_multi_graph.py b/scrapegraphai/graphs/xml_scraper_multi_graph.py index e1f4423c..da772647 100644 --- a/scrapegraphai/graphs/xml_scraper_multi_graph.py +++ b/scrapegraphai/graphs/xml_scraper_multi_graph.py @@ -34,7 +34,7 @@ class XMLScraperMultiGraph(AbstractGraph): prompt (str): The user prompt to search the internet. source (List[str]): The source of the graph. config (dict): Configuration parameters for the graph. - schema (Optional[str]): The schema for the graph output. + schema (Optional[BaseModel]): The schema for the graph output. Example: >>> search_graph = MultipleSearchGraph( From ce0a47aee5edbb26fd82e41f6688a4bc48a10822 Mon Sep 17 00:00:00 2001 From: semantic-release-bot Date: Tue, 18 Jun 2024 18:54:59 +0000 Subject: [PATCH 3/8] ci(release): 1.7.0-beta.13 [skip ci] ## [1.7.0-beta.13](https://github.com/VinciGit00/Scrapegraph-ai/compare/v1.7.0-beta.12...v1.7.0-beta.13) (2024-06-18) ### Bug Fixes * updated for schema changes ([aedda44](https://github.com/VinciGit00/Scrapegraph-ai/commit/aedda448682ce5a921a62e661bffb02478bab75f)) --- CHANGELOG.md | 7 +++++++ pyproject.toml | 2 +- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index fe471b0c..7ca50119 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,10 @@ +## [1.7.0-beta.13](https://github.com/VinciGit00/Scrapegraph-ai/compare/v1.7.0-beta.12...v1.7.0-beta.13) (2024-06-18) + + +### Bug Fixes + +* updated for schema changes ([aedda44](https://github.com/VinciGit00/Scrapegraph-ai/commit/aedda448682ce5a921a62e661bffb02478bab75f)) + ## [1.7.0-beta.12](https://github.com/VinciGit00/Scrapegraph-ai/compare/v1.7.0-beta.11...v1.7.0-beta.12) (2024-06-17) diff --git a/pyproject.toml b/pyproject.toml index 7901a1a0..b94b3e4c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -2,7 +2,7 @@ name = "scrapegraphai" -version = "1.7.0b12" +version = "1.7.0b13" From aa2160c108764745a696ffc16038f370e9702c14 Mon Sep 17 00:00:00 2001 From: Marco Vinciguerra Date: Tue, 18 Jun 2024 21:28:29 +0200 Subject: [PATCH 4/8] feat: add research with bing + test function --- scrapegraphai/utils/research_web.py | 34 ++++++++++++++++++++--------- tests/utils/research_web_test.py | 28 ++++++++++++++++++++++++ 2 files changed, 52 insertions(+), 10 deletions(-) create mode 100644 tests/utils/research_web_test.py diff --git a/scrapegraphai/utils/research_web.py b/scrapegraphai/utils/research_web.py index 62ffd2ee..ac7fc09d 100644 --- a/scrapegraphai/utils/research_web.py +++ b/scrapegraphai/utils/research_web.py @@ -1,11 +1,12 @@ """ -Module for making the request on the web +research web module """ import re from typing import List from langchain_community.tools import DuckDuckGoSearchResults from googlesearch import search as google_search - +import requests +from bs4 import BeautifulSoup def search_on_web(query: str, search_engine: str = "Google", max_results: int = 10) -> List[str]: """ @@ -13,35 +14,48 @@ def search_on_web(query: str, search_engine: str = "Google", max_results: int = Args: query (str): The search query to find on the internet. - search_engine (str, optional): Specifies the search engine to use, options include 'Google' or 'DuckDuckGo'. Default is 'Google'. + search_engine (str, optional): Specifies the search engine to use, options include 'Google', 'DuckDuckGo', or 'Bing'. Default is 'Google'. max_results (int, optional): The maximum number of search results to return. Returns: List[str]: A list of URLs as strings that are the search results. Raises: - ValueError: If the search engine specified is neither 'Google' nor 'DuckDuckGo'. + ValueError: If the search engine specified is neither 'Google', 'DuckDuckGo', nor 'Bing'. Example: >>> search_on_web("example query", search_engine="Google", max_results=5) ['http://example.com', 'http://example.org', ...] - This function allows switching between Google and DuckDuckGo to perform + This function allows switching between Google, DuckDuckGo, and Bing to perform internet searches, returning a list of result URLs. """ if search_engine.lower() == "google": res = [] - for url in google_search(query, stop=max_results): res.append(url) return res + elif search_engine.lower() == "duckduckgo": research = DuckDuckGoSearchResults(max_results=max_results) res = research.run(query) - links = re.findall(r'https?://[^\s,\]]+', res) - return links - raise ValueError( - "The only search engines available are DuckDuckGo or Google") + + elif search_engine.lower() == "bing": + headers = { + "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36" + } + search_url = f"https://www.bing.com/search?q={query}" + response = requests.get(search_url, headers=headers) + response.raise_for_status() + soup = BeautifulSoup(response.text, "html.parser") + + search_results = [] + for result in soup.find_all('li', class_='b_algo', limit=max_results): + link = result.find('a')['href'] + search_results.append(link) + return search_results + + raise ValueError("The only search engines available are DuckDuckGo, Google, or Bing") diff --git a/tests/utils/research_web_test.py b/tests/utils/research_web_test.py new file mode 100644 index 00000000..46630625 --- /dev/null +++ b/tests/utils/research_web_test.py @@ -0,0 +1,28 @@ +import pytest +from scrapegraphai.utils.research_web import search_on_web # Replace with actual path to your file + + +def test_google_search(): + """Tests search_on_web with Google search engine.""" + results = search_on_web("test query", search_engine="Google", max_results=2) + assert len(results) == 2 + # You can further assert if the results actually contain 'test query' in the title/snippet using additional libraries + +def test_bing_search(): + """Tests search_on_web with Bing search engine.""" + results = search_on_web("test query", search_engine="Bing", max_results=1) + assert results is not None + # You can further assert if the results contain '.com' or '.org' in the domain + + +def test_invalid_search_engine(): + """Tests search_on_web with invalid search engine.""" + with pytest.raises(ValueError): + search_on_web("test query", search_engine="Yahoo", max_results=5) + + +def test_max_results(): + """Tests search_on_web with different max_results values.""" + results_5 = search_on_web("test query", max_results=5) + results_10 = search_on_web("test query", max_results=10) + assert len(results_5) <= len(results_10) From ec77ff7ea4eb071469c2fb53e5959d4ea1f73ad6 Mon Sep 17 00:00:00 2001 From: semantic-release-bot Date: Wed, 19 Jun 2024 08:40:20 +0000 Subject: [PATCH 5/8] ci(release): 1.7.0-beta.14 [skip ci] ## [1.7.0-beta.14](https://github.com/VinciGit00/Scrapegraph-ai/compare/v1.7.0-beta.13...v1.7.0-beta.14) (2024-06-19) ### Features * add new search engine avaiability and new tests ([073d226](https://github.com/VinciGit00/Scrapegraph-ai/commit/073d226723f5f03b960865d07408905b7a506180)) * add research with bing + test function ([aa2160c](https://github.com/VinciGit00/Scrapegraph-ai/commit/aa2160c108764745a696ffc16038f370e9702c14)) --- CHANGELOG.md | 8 ++++++++ pyproject.toml | 2 +- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 7ca50119..dc1c5bf8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,11 @@ +## [1.7.0-beta.14](https://github.com/VinciGit00/Scrapegraph-ai/compare/v1.7.0-beta.13...v1.7.0-beta.14) (2024-06-19) + + +### Features + +* add new search engine avaiability and new tests ([073d226](https://github.com/VinciGit00/Scrapegraph-ai/commit/073d226723f5f03b960865d07408905b7a506180)) +* add research with bing + test function ([aa2160c](https://github.com/VinciGit00/Scrapegraph-ai/commit/aa2160c108764745a696ffc16038f370e9702c14)) + ## [1.7.0-beta.13](https://github.com/VinciGit00/Scrapegraph-ai/compare/v1.7.0-beta.12...v1.7.0-beta.13) (2024-06-18) diff --git a/pyproject.toml b/pyproject.toml index b94b3e4c..02114c26 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -2,7 +2,7 @@ name = "scrapegraphai" -version = "1.7.0b13" +version = "1.7.0b14" From f75e0835fc21f7a5acd10980b4b017b35510709c Mon Sep 17 00:00:00 2001 From: Maorsg Date: Mon, 24 Jun 2024 21:31:28 -0400 Subject: [PATCH 6/8] added a function to the search_graph class to allow user to return URLs considered in the search --- scrapegraphai/graphs/search_graph.py | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/scrapegraphai/graphs/search_graph.py b/scrapegraphai/graphs/search_graph.py index 6bece062..1f4e8950 100644 --- a/scrapegraphai/graphs/search_graph.py +++ b/scrapegraphai/graphs/search_graph.py @@ -3,7 +3,7 @@ """ from copy import copy, deepcopy -from typing import Optional +from typing import Optional, List from pydantic import BaseModel from .base_graph import BaseGraph @@ -17,6 +17,7 @@ ) + class SearchGraph(AbstractGraph): """ SearchGraph is a scraping pipeline that searches the internet for answers to a given prompt. @@ -29,6 +30,7 @@ class SearchGraph(AbstractGraph): headless (bool): A flag to run the browser in headless mode. verbose (bool): A flag to display the execution information. model_token (int): The token limit for the language model. + considered_urls (List[str]): A list of URLs considered during the search. Args: prompt (str): The user prompt to search the internet. @@ -41,10 +43,10 @@ class SearchGraph(AbstractGraph): ... {"llm": {"model": "gpt-3.5-turbo"}} ... ) >>> result = search_graph.run() + >>> print(search_graph.get_considered_urls()) """ def __init__(self, prompt: str, config: dict, schema: Optional[BaseModel] = None): - self.max_results = config.get("max_results", 3) if all(isinstance(value, str) for value in config.values()): @@ -53,6 +55,7 @@ def __init__(self, prompt: str, config: dict, schema: Optional[BaseModel] = None self.copy_config = deepcopy(config) self.copy_schema = deepcopy(schema) + self.considered_urls = [] # New attribute to store URLs super().__init__(prompt, config, schema) @@ -64,10 +67,7 @@ def _create_graph(self) -> BaseGraph: BaseGraph: A graph instance representing the web scraping and searching workflow. """ - # ************************************************ # Create a SmartScraperGraph instance - # ************************************************ - smart_scraper_instance = SmartScraperGraph( prompt="", source="", @@ -75,10 +75,7 @@ def _create_graph(self) -> BaseGraph: schema=self.copy_schema ) - # ************************************************ # Define the graph nodes - # ************************************************ - search_internet_node = SearchInternetNode( input="user_prompt", output=["urls"], @@ -128,4 +125,17 @@ def run(self) -> str: inputs = {"user_prompt": self.prompt} self.final_state, self.execution_info = self.graph.execute(inputs) + # Store the URLs after execution + if 'urls' in self.final_state: + self.considered_urls = self.final_state['urls'] + return self.final_state.get("answer", "No answer found.") + + def get_considered_urls(self) -> List[str]: + """ + Returns the list of URLs considered during the search. + + Returns: + List[str]: A list of URLs considered during the search. + """ + return self.considered_urls From bbfbbd93be3c87c5f25e3c75ec7d677832d37467 Mon Sep 17 00:00:00 2001 From: semantic-release-bot Date: Tue, 25 Jun 2024 08:49:47 +0000 Subject: [PATCH 7/8] ci(release): 1.8.0-beta.1 [skip ci] ## [1.8.0-beta.1](https://github.com/VinciGit00/Scrapegraph-ai/compare/v1.7.4...v1.8.0-beta.1) (2024-06-25) ### Features * add new search engine avaiability and new tests ([073d226](https://github.com/VinciGit00/Scrapegraph-ai/commit/073d226723f5f03b960865d07408905b7a506180)) * add research with bing + test function ([aa2160c](https://github.com/VinciGit00/Scrapegraph-ai/commit/aa2160c108764745a696ffc16038f370e9702c14)) ### Bug Fixes * updated for schema changes ([aedda44](https://github.com/VinciGit00/Scrapegraph-ai/commit/aedda448682ce5a921a62e661bffb02478bab75f)) ### CI * **release:** 1.7.0-beta.13 [skip ci] ([ce0a47a](https://github.com/VinciGit00/Scrapegraph-ai/commit/ce0a47aee5edbb26fd82e41f6688a4bc48a10822)) * **release:** 1.7.0-beta.14 [skip ci] ([ec77ff7](https://github.com/VinciGit00/Scrapegraph-ai/commit/ec77ff7ea4eb071469c2fb53e5959d4ea1f73ad6)) --- CHANGELOG.md | 19 +++++++++++++++++++ pyproject.toml | 2 +- 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index e276828d..a69adb32 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,22 @@ +## [1.8.0-beta.1](https://github.com/VinciGit00/Scrapegraph-ai/compare/v1.7.4...v1.8.0-beta.1) (2024-06-25) + + +### Features + +* add new search engine avaiability and new tests ([073d226](https://github.com/VinciGit00/Scrapegraph-ai/commit/073d226723f5f03b960865d07408905b7a506180)) +* add research with bing + test function ([aa2160c](https://github.com/VinciGit00/Scrapegraph-ai/commit/aa2160c108764745a696ffc16038f370e9702c14)) + + +### Bug Fixes + +* updated for schema changes ([aedda44](https://github.com/VinciGit00/Scrapegraph-ai/commit/aedda448682ce5a921a62e661bffb02478bab75f)) + + +### CI + +* **release:** 1.7.0-beta.13 [skip ci] ([ce0a47a](https://github.com/VinciGit00/Scrapegraph-ai/commit/ce0a47aee5edbb26fd82e41f6688a4bc48a10822)) +* **release:** 1.7.0-beta.14 [skip ci] ([ec77ff7](https://github.com/VinciGit00/Scrapegraph-ai/commit/ec77ff7ea4eb071469c2fb53e5959d4ea1f73ad6)) + ## [1.7.4](https://github.com/VinciGit00/Scrapegraph-ai/compare/v1.7.3...v1.7.4) (2024-06-21) diff --git a/pyproject.toml b/pyproject.toml index f9df8d3e..0df19e6f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -2,7 +2,7 @@ name = "scrapegraphai" -version = "1.7.4" +version = "1.8.0b1" From a3c43c9a9941023ee9a9ddcf8eed7337870cf5aa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Vin=C3=ADcius=20Feitosa=20da=20Silva?= Date: Thu, 27 Jun 2024 17:53:20 -0300 Subject: [PATCH 8/8] =?UTF-8?q?=F0=9F=90=9B=20Rename=20`user=5Fprompt`=20p?= =?UTF-8?q?arameter=20to=20`prompt`?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This adjustment makes the class consistent with the definition of the arguments. --- scrapegraphai/builders/graph_builder.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/scrapegraphai/builders/graph_builder.py b/scrapegraphai/builders/graph_builder.py index ab19a251..e807a0df 100644 --- a/scrapegraphai/builders/graph_builder.py +++ b/scrapegraphai/builders/graph_builder.py @@ -40,11 +40,11 @@ class GraphBuilder: ValueError: If 'api_key' is not included in llm_config. """ - def __init__(self, user_prompt: str, config: dict): + def __init__(self, prompt: str, config: dict): """ Initializes the GraphBuilder with a user prompt and language model configuration. """ - self.user_prompt = user_prompt + self.prompt = prompt self.config = config self.llm = self._create_llm(config["llm"]) self.nodes_description = self._generate_nodes_description() @@ -122,7 +122,7 @@ def build_graph(self): Returns: dict: A JSON representation of the graph configuration. """ - return self.chain.invoke(self.user_prompt) + return self.chain.invoke(self.prompt) @staticmethod def convert_json_to_graphviz(json_data, format: str = 'pdf'):