From 073d226723f5f03b960865d07408905b7a506180 Mon Sep 17 00:00:00 2001
From: Marco Vinciguerra
Date: Tue, 18 Jun 2024 14:35:13 +0200
Subject: [PATCH 1/2] feat: add new search engine avaiability and new tests

---
 examples/single_node/search_internet_node.py | 50 +++++++++++++++++
 scrapegraphai/nodes/search_internet_node.py  |  4 +-
 scrapegraphai/utils/research_web.py          |  3 +-
 tests/nodes/search_internet_node_test.py     | 58 ++++++++++++++++++++
 4 files changed, 113 insertions(+), 2 deletions(-)
 create mode 100644 examples/single_node/search_internet_node.py
 create mode 100644 tests/nodes/search_internet_node_test.py

diff --git a/examples/single_node/search_internet_node.py b/examples/single_node/search_internet_node.py
new file mode 100644
index 00000000..8a8149fa
--- /dev/null
+++ b/examples/single_node/search_internet_node.py
@@ -0,0 +1,50 @@
+"""
+Example of custom graph using existing nodes
+"""
+
+from scrapegraphai.models import Ollama
+from scrapegraphai.nodes import SearchInternetNode
+
+# ************************************************
+# Define the configuration for the graph
+# ************************************************
+
+graph_config = {
+    "llm": {
+        "model": "llama3",
+        "temperature": 0,
+        "streaming": True
+    },
+    "search_engine": "google",
+    "max_results": 3,
+    "verbose": True
+}
+
+# ************************************************
+# Define the node
+# ************************************************
+
+llm_model = Ollama(graph_config["llm"])
+
+search_node = SearchInternetNode(
+    input="user_input",
+    output=["search_results"],
+    node_config={
+        "llm_model": llm_model,
+        "search_engine": graph_config["search_engine"],
+        "max_results": graph_config["max_results"],
+        "verbose": graph_config["verbose"]
+    }
+)
+
+# ************************************************
+# Test the node
+# ************************************************
+
+state = {
+    "user_input": "What is the capital of France?"
+}
+
+result = search_node.execute(state)
+
+print(result)
diff --git a/scrapegraphai/nodes/search_internet_node.py b/scrapegraphai/nodes/search_internet_node.py
index 9fa4a8f5..59c56975 100644
--- a/scrapegraphai/nodes/search_internet_node.py
+++ b/scrapegraphai/nodes/search_internet_node.py
@@ -43,6 +43,7 @@ def __init__(
         self.verbose = (
             False if node_config is None else node_config.get("verbose", False)
         )
+        self.search_engine = node_config.get("search_engine", "google")
         self.max_results = node_config.get("max_results", 3)

     def execute(self, state: dict) -> dict:
@@ -97,7 +98,8 @@ def execute(self, state: dict) -> dict:

         self.logger.info(f"Search Query: {search_query}")

-        answer = search_on_web(query=search_query, max_results=self.max_results)
+        answer = search_on_web(query=search_query, max_results=self.max_results,
+                               search_engine=self.search_engine)

         if len(answer) == 0:
             # raise an exception if no answer is found
diff --git a/scrapegraphai/utils/research_web.py b/scrapegraphai/utils/research_web.py
index a839a680..62ffd2ee 100644
--- a/scrapegraphai/utils/research_web.py
+++ b/scrapegraphai/utils/research_web.py
@@ -26,7 +26,8 @@ def search_on_web(query: str, search_engine: str = "Google", max_results: int =
     >>> search_on_web("example query", search_engine="Google", max_results=5)
     ['http://example.com', 'http://example.org', ...]

-    This function allows switching between Google and DuckDuckGo to perform internet searches, returning a list of result URLs.
+    This function allows switching between Google and DuckDuckGo to perform
+    internet searches, returning a list of result URLs.
""" if search_engine.lower() == "google": diff --git a/tests/nodes/search_internet_node_test.py b/tests/nodes/search_internet_node_test.py new file mode 100644 index 00000000..db2cbdee --- /dev/null +++ b/tests/nodes/search_internet_node_test.py @@ -0,0 +1,58 @@ +import unittest +from scrapegraphai.models import Ollama +from scrapegraphai.nodes import SearchInternetNode + +class TestSearchInternetNode(unittest.TestCase): + + def setUp(self): + # Configuration for the graph + self.graph_config = { + "llm": { + "model": "llama3", + "temperature": 0, + "streaming": True + }, + "search_engine": "google", + "max_results": 3, + "verbose": True + } + + # Define the model + self.llm_model = Ollama(self.graph_config["llm"]) + + # Initialize the SearchInternetNode + self.search_node = SearchInternetNode( + input="user_input", + output=["search_results"], + node_config={ + "llm_model": self.llm_model, + "search_engine": self.graph_config["search_engine"], + "max_results": self.graph_config["max_results"], + "verbose": self.graph_config["verbose"] + } + ) + + def test_execute_search_node(self): + # Initial state + state = { + "user_input": "What is the capital of France?" + } + + # Expected output + expected_output = { + "user_input": "What is the capital of France?", + "search_results": [ + "https://en.wikipedia.org/wiki/Paris", + "https://en.wikipedia.org/wiki/France", + "https://en.wikipedia.org/wiki/%C3%8Ele-de-France" + ] + } + + # Execute the node + result = self.search_node.execute(state) + + # Assert the results + self.assertEqual(result, expected_output) + +if __name__ == "__main__": + unittest.main() From aa2160c108764745a696ffc16038f370e9702c14 Mon Sep 17 00:00:00 2001 From: Marco Vinciguerra Date: Tue, 18 Jun 2024 21:28:29 +0200 Subject: [PATCH 2/2] feat: add research with bing + test function --- scrapegraphai/utils/research_web.py | 34 ++++++++++++++++++++--------- tests/utils/research_web_test.py | 28 ++++++++++++++++++++++++ 2 files changed, 52 insertions(+), 10 deletions(-) create mode 100644 tests/utils/research_web_test.py diff --git a/scrapegraphai/utils/research_web.py b/scrapegraphai/utils/research_web.py index 62ffd2ee..ac7fc09d 100644 --- a/scrapegraphai/utils/research_web.py +++ b/scrapegraphai/utils/research_web.py @@ -1,11 +1,12 @@ """ -Module for making the request on the web +research web module """ import re from typing import List from langchain_community.tools import DuckDuckGoSearchResults from googlesearch import search as google_search - +import requests +from bs4 import BeautifulSoup def search_on_web(query: str, search_engine: str = "Google", max_results: int = 10) -> List[str]: """ @@ -13,35 +14,48 @@ def search_on_web(query: str, search_engine: str = "Google", max_results: int = Args: query (str): The search query to find on the internet. - search_engine (str, optional): Specifies the search engine to use, options include 'Google' or 'DuckDuckGo'. Default is 'Google'. + search_engine (str, optional): Specifies the search engine to use, options include 'Google', 'DuckDuckGo', or 'Bing'. Default is 'Google'. max_results (int, optional): The maximum number of search results to return. Returns: List[str]: A list of URLs as strings that are the search results. Raises: - ValueError: If the search engine specified is neither 'Google' nor 'DuckDuckGo'. + ValueError: If the search engine specified is neither 'Google', 'DuckDuckGo', nor 'Bing'. 
     Example:
     >>> search_on_web("example query", search_engine="Google", max_results=5)
     ['http://example.com', 'http://example.org', ...]

-    This function allows switching between Google and DuckDuckGo to perform
+    This function allows switching between Google, DuckDuckGo, and Bing to perform
     internet searches, returning a list of result URLs.
     """

     if search_engine.lower() == "google":
         res = []
-
         for url in google_search(query, stop=max_results):
             res.append(url)
         return res
+
     elif search_engine.lower() == "duckduckgo":
         research = DuckDuckGoSearchResults(max_results=max_results)
         res = research.run(query)
-        links = re.findall(r'https?://[^\s,\]]+', res)
-        return links
-    raise ValueError(
-        "The only search engines available are DuckDuckGo or Google")
+
+    elif search_engine.lower() == "bing":
+        headers = {
+            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
+        }
+        search_url = f"https://www.bing.com/search?q={query}"
+        response = requests.get(search_url, headers=headers)
+        response.raise_for_status()
+        soup = BeautifulSoup(response.text, "html.parser")
+
+        search_results = []
+        for result in soup.find_all('li', class_='b_algo', limit=max_results):
+            link = result.find('a')['href']
+            search_results.append(link)
+        return search_results
+
+    raise ValueError("The only search engines available are DuckDuckGo, Google, or Bing")
diff --git a/tests/utils/research_web_test.py b/tests/utils/research_web_test.py
new file mode 100644
index 00000000..46630625
--- /dev/null
+++ b/tests/utils/research_web_test.py
@@ -0,0 +1,28 @@
+import pytest
+from scrapegraphai.utils.research_web import search_on_web  # Replace with actual path to your file
+
+
+def test_google_search():
+    """Tests search_on_web with Google search engine."""
+    results = search_on_web("test query", search_engine="Google", max_results=2)
+    assert len(results) == 2
+    # You can further assert if the results actually contain 'test query' in the title/snippet using additional libraries
+
+def test_bing_search():
+    """Tests search_on_web with Bing search engine."""
+    results = search_on_web("test query", search_engine="Bing", max_results=1)
+    assert results is not None
+    # You can further assert if the results contain '.com' or '.org' in the domain
+
+
+def test_invalid_search_engine():
+    """Tests search_on_web with invalid search engine."""
+    with pytest.raises(ValueError):
+        search_on_web("test query", search_engine="Yahoo", max_results=5)
+
+
+def test_max_results():
+    """Tests search_on_web with different max_results values."""
+    results_5 = search_on_web("test query", max_results=5)
+    results_10 = search_on_web("test query", max_results=10)
+    assert len(results_5) <= len(results_10)
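
A minimal usage sketch for the new Bing option once both patches are applied; the import path follows the test file above, and the query string is only illustrative:

    from scrapegraphai.utils.research_web import search_on_web

    # "bing" routes through the new requests + BeautifulSoup branch, which fetches
    # https://www.bing.com/search?q=<query> and collects links from the li.b_algo result blocks
    urls = search_on_web("scrapegraphai documentation", search_engine="bing", max_results=5)
    print(urls)

Because the Bing branch scrapes the HTML result page rather than calling an official API, the returned URLs depend on Bing's current markup and may come back empty if that markup changes.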