From 073d226723f5f03b960865d07408905b7a506180 Mon Sep 17 00:00:00 2001
From: Marco Vinciguerra
Date: Tue, 18 Jun 2024 14:35:13 +0200
Subject: [PATCH 1/2] feat: add new search engine avaiability and new tests

---
 examples/single_node/search_internet_node.py | 50 +++++++++++++++++
 scrapegraphai/nodes/search_internet_node.py  |  4 +-
 scrapegraphai/utils/research_web.py          |  3 +-
 tests/nodes/search_internet_node_test.py     | 58 ++++++++++++++++++++
 4 files changed, 113 insertions(+), 2 deletions(-)
 create mode 100644 examples/single_node/search_internet_node.py
 create mode 100644 tests/nodes/search_internet_node_test.py

diff --git a/examples/single_node/search_internet_node.py b/examples/single_node/search_internet_node.py
new file mode 100644
index 00000000..8a8149fa
--- /dev/null
+++ b/examples/single_node/search_internet_node.py
@@ -0,0 +1,50 @@
+"""
+Example of custom graph using existing nodes
+"""
+
+from scrapegraphai.models import Ollama
+from scrapegraphai.nodes import SearchInternetNode
+
+# ************************************************
+# Define the configuration for the graph
+# ************************************************
+
+graph_config = {
+    "llm": {
+        "model": "llama3",
+        "temperature": 0,
+        "streaming": True
+    },
+    "search_engine": "google",
+    "max_results": 3,
+    "verbose": True
+}
+
+# ************************************************
+# Define the node
+# ************************************************
+
+llm_model = Ollama(graph_config["llm"])
+
+search_node = SearchInternetNode(
+    input="user_input",
+    output=["search_results"],
+    node_config={
+        "llm_model": llm_model,
+        "search_engine": graph_config["search_engine"],
+        "max_results": graph_config["max_results"],
+        "verbose": graph_config["verbose"]
+    }
+)
+
+# ************************************************
+# Test the node
+# ************************************************
+
+state = {
+    "user_input": "What is the capital of France?"
+}
+
+result = search_node.execute(state)
+
+print(result)
diff --git a/scrapegraphai/nodes/search_internet_node.py b/scrapegraphai/nodes/search_internet_node.py
index 9fa4a8f5..59c56975 100644
--- a/scrapegraphai/nodes/search_internet_node.py
+++ b/scrapegraphai/nodes/search_internet_node.py
@@ -43,6 +43,7 @@ def __init__(
         self.verbose = (
             False if node_config is None else node_config.get("verbose", False)
         )
+        self.search_engine = node_config.get("search_engine", "google")
         self.max_results = node_config.get("max_results", 3)

     def execute(self, state: dict) -> dict:
@@ -97,7 +98,8 @@ def execute(self, state: dict) -> dict:

         self.logger.info(f"Search Query: {search_query}")

-        answer = search_on_web(query=search_query, max_results=self.max_results)
+        answer = search_on_web(query=search_query, max_results=self.max_results,
+                               search_engine=self.search_engine)

         if len(answer) == 0:
             # raise an exception if no answer is found
diff --git a/scrapegraphai/utils/research_web.py b/scrapegraphai/utils/research_web.py
index a839a680..62ffd2ee 100644
--- a/scrapegraphai/utils/research_web.py
+++ b/scrapegraphai/utils/research_web.py
@@ -26,7 +26,8 @@ def search_on_web(query: str, search_engine: str = "Google", max_results: int =
     >>> search_on_web("example query", search_engine="Google", max_results=5)
     ['http://example.com', 'http://example.org', ...]

-    This function allows switching between Google and DuckDuckGo to perform internet searches, returning a list of result URLs.
+    This function allows switching between Google and DuckDuckGo to perform
+    internet searches, returning a list of result URLs.
""" if search_engine.lower() == "google": diff --git a/tests/nodes/search_internet_node_test.py b/tests/nodes/search_internet_node_test.py new file mode 100644 index 00000000..db2cbdee --- /dev/null +++ b/tests/nodes/search_internet_node_test.py @@ -0,0 +1,58 @@ +import unittest +from scrapegraphai.models import Ollama +from scrapegraphai.nodes import SearchInternetNode + +class TestSearchInternetNode(unittest.TestCase): + + def setUp(self): + # Configuration for the graph + self.graph_config = { + "llm": { + "model": "llama3", + "temperature": 0, + "streaming": True + }, + "search_engine": "google", + "max_results": 3, + "verbose": True + } + + # Define the model + self.llm_model = Ollama(self.graph_config["llm"]) + + # Initialize the SearchInternetNode + self.search_node = SearchInternetNode( + input="user_input", + output=["search_results"], + node_config={ + "llm_model": self.llm_model, + "search_engine": self.graph_config["search_engine"], + "max_results": self.graph_config["max_results"], + "verbose": self.graph_config["verbose"] + } + ) + + def test_execute_search_node(self): + # Initial state + state = { + "user_input": "What is the capital of France?" + } + + # Expected output + expected_output = { + "user_input": "What is the capital of France?", + "search_results": [ + "https://en.wikipedia.org/wiki/Paris", + "https://en.wikipedia.org/wiki/France", + "https://en.wikipedia.org/wiki/%C3%8Ele-de-France" + ] + } + + # Execute the node + result = self.search_node.execute(state) + + # Assert the results + self.assertEqual(result, expected_output) + +if __name__ == "__main__": + unittest.main() From aa2160c108764745a696ffc16038f370e9702c14 Mon Sep 17 00:00:00 2001 From: Marco Vinciguerra Date: Tue, 18 Jun 2024 21:28:29 +0200 Subject: [PATCH 2/2] feat: add research with bing + test function --- scrapegraphai/utils/research_web.py | 34 ++++++++++++++++++++--------- tests/utils/research_web_test.py | 28 ++++++++++++++++++++++++ 2 files changed, 52 insertions(+), 10 deletions(-) create mode 100644 tests/utils/research_web_test.py diff --git a/scrapegraphai/utils/research_web.py b/scrapegraphai/utils/research_web.py index 62ffd2ee..ac7fc09d 100644 --- a/scrapegraphai/utils/research_web.py +++ b/scrapegraphai/utils/research_web.py @@ -1,11 +1,12 @@ """ -Module for making the request on the web +research web module """ import re from typing import List from langchain_community.tools import DuckDuckGoSearchResults from googlesearch import search as google_search - +import requests +from bs4 import BeautifulSoup def search_on_web(query: str, search_engine: str = "Google", max_results: int = 10) -> List[str]: """ @@ -13,35 +14,48 @@ def search_on_web(query: str, search_engine: str = "Google", max_results: int = Args: query (str): The search query to find on the internet. - search_engine (str, optional): Specifies the search engine to use, options include 'Google' or 'DuckDuckGo'. Default is 'Google'. + search_engine (str, optional): Specifies the search engine to use, options include 'Google', 'DuckDuckGo', or 'Bing'. Default is 'Google'. max_results (int, optional): The maximum number of search results to return. Returns: List[str]: A list of URLs as strings that are the search results. Raises: - ValueError: If the search engine specified is neither 'Google' nor 'DuckDuckGo'. + ValueError: If the search engine specified is neither 'Google', 'DuckDuckGo', nor 'Bing'. 
     Example:
     >>> search_on_web("example query", search_engine="Google", max_results=5)
     ['http://example.com', 'http://example.org', ...]

-    This function allows switching between Google and DuckDuckGo to perform
+    This function allows switching between Google, DuckDuckGo, and Bing to perform
     internet searches, returning a list of result URLs.
     """

     if search_engine.lower() == "google":
         res = []
-
         for url in google_search(query, stop=max_results):
             res.append(url)
         return res
+
     elif search_engine.lower() == "duckduckgo":
         research = DuckDuckGoSearchResults(max_results=max_results)
         res = research.run(query)
-        links = re.findall(r'https?://[^\s,\]]+', res)
-        return links
-    raise ValueError(
-        "The only search engines available are DuckDuckGo or Google")
+
+    elif search_engine.lower() == "bing":
+        headers = {
+            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
+        }
+        search_url = f"https://www.bing.com/search?q={query}"
+        response = requests.get(search_url, headers=headers)
+        response.raise_for_status()
+        soup = BeautifulSoup(response.text, "html.parser")
+
+        search_results = []
+        for result in soup.find_all('li', class_='b_algo', limit=max_results):
+            link = result.find('a')['href']
+            search_results.append(link)
+        return search_results
+
+    raise ValueError("The only search engines available are DuckDuckGo, Google, or Bing")
diff --git a/tests/utils/research_web_test.py b/tests/utils/research_web_test.py
new file mode 100644
index 00000000..46630625
--- /dev/null
+++ b/tests/utils/research_web_test.py
@@ -0,0 +1,28 @@
+import pytest
+from scrapegraphai.utils.research_web import search_on_web  # Replace with actual path to your file
+
+
+def test_google_search():
+    """Tests search_on_web with Google search engine."""
+    results = search_on_web("test query", search_engine="Google", max_results=2)
+    assert len(results) == 2
+    # You can further assert if the results actually contain 'test query' in the title/snippet using additional libraries
+
+def test_bing_search():
+    """Tests search_on_web with Bing search engine."""
+    results = search_on_web("test query", search_engine="Bing", max_results=1)
+    assert results is not None
+    # You can further assert if the results contain '.com' or '.org' in the domain
+
+
+def test_invalid_search_engine():
+    """Tests search_on_web with invalid search engine."""
+    with pytest.raises(ValueError):
+        search_on_web("test query", search_engine="Yahoo", max_results=5)
+
+
+def test_max_results():
+    """Tests search_on_web with different max_results values."""
+    results_5 = search_on_web("test query", max_results=5)
+    results_10 = search_on_web("test query", max_results=10)
+    assert len(results_5) <= len(results_10)
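
A minimal usage sketch for the new Bing option once both patches are applied; the import path follows the test file above, and the query string is only illustrative:

    from scrapegraphai.utils.research_web import search_on_web

    # "bing" routes through the new requests + BeautifulSoup branch, which fetches
    # https://www.bing.com/search?q=<query> and collects links from the li.b_algo result blocks
    urls = search_on_web("scrapegraphai documentation", search_engine="bing", max_results=5)
    print(urls)

Because the Bing branch scrapes the HTML result page rather than calling an official API, the returned URLs depend on Bing's current markup and may come back empty if that markup changes.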