From c218546a3ddbdf987888e150942a244856af66cc Mon Sep 17 00:00:00 2001 From: aziz-ullah-khan Date: Sat, 2 Nov 2024 15:02:08 +0500 Subject: [PATCH 1/6] feat: Serper API integration for Google search --- scrapegraphai/graphs/search_graph.py | 3 +- scrapegraphai/nodes/search_internet_node.py | 9 +++++- scrapegraphai/utils/research_web.py | 32 ++++++++++++++++++--- 3 files changed, 38 insertions(+), 6 deletions(-) diff --git a/scrapegraphai/graphs/search_graph.py b/scrapegraphai/graphs/search_graph.py index 29a91376..987eab8b 100644 --- a/scrapegraphai/graphs/search_graph.py +++ b/scrapegraphai/graphs/search_graph.py @@ -66,7 +66,8 @@ def _create_graph(self) -> BaseGraph: "llm_model": self.llm_model, "max_results": self.max_results, "loader_kwargs": self.loader_kwargs, - "search_engine": self.copy_config.get("search_engine") + "search_engine": self.copy_config.get("search_engine"), + "serper_api_key": self.copy_config.get("serper_api_key") } ) diff --git a/scrapegraphai/nodes/search_internet_node.py b/scrapegraphai/nodes/search_internet_node.py index e318f923..278f81ab 100644 --- a/scrapegraphai/nodes/search_internet_node.py +++ b/scrapegraphai/nodes/search_internet_node.py @@ -47,6 +47,13 @@ def __init__( if node_config.get("search_engine") else "google" ) + + self.serper_api_key = ( + node_config["serper_api_key"] + if node_config.get("serper_api_key") + else None + ) + self.max_results = node_config.get("max_results", 3) def execute(self, state: dict) -> dict: @@ -95,7 +102,7 @@ def execute(self, state: dict) -> dict: self.logger.info(f"Search Query: {search_query}") answer = search_on_web(query=search_query, max_results=self.max_results, - search_engine=self.search_engine, proxy=self.proxy) + search_engine=self.search_engine, proxy=self.proxy, serper_api_key=self.serper_api_key) if len(answer) == 0: raise ValueError("Zero results found for the search query.") diff --git a/scrapegraphai/utils/research_web.py b/scrapegraphai/utils/research_web.py index 86f9f5f3..f2d0c254 100644 --- a/scrapegraphai/utils/research_web.py +++ b/scrapegraphai/utils/research_web.py @@ -7,10 +7,12 @@ from googlesearch import search as google_search import requests from bs4 import BeautifulSoup +import json def search_on_web(query: str, search_engine: str = "Google", max_results: int = 10, port: int = 8080, - timeout: int = 10, proxy: str | dict = None) -> List[str]: + timeout: int = 10, proxy: str | dict = None, + serper_api_key: str = None) -> List[str]: """Search web function with improved error handling and validation""" # Input validation @@ -18,7 +20,7 @@ def search_on_web(query: str, search_engine: str = "Google", raise ValueError("Query must be a non-empty string") search_engine = search_engine.lower() - valid_engines = {"google", "duckduckgo", "bing", "searxng"} + valid_engines = {"google", "duckduckgo", "bing", "searxng", "serper"} if search_engine not in valid_engines: raise ValueError(f"Search engine must be one of: {', '.join(valid_engines)}") @@ -42,7 +44,10 @@ def search_on_web(query: str, search_engine: str = "Google", elif search_engine == "searxng": results = _search_searxng(query, max_results, port, timeout) - + + elif search_engine.lower() == "serper": + results = _search_serper(query, max_results, serper_api_key, timeout) + return filter_pdf_links(results) except requests.Timeout: @@ -76,6 +81,25 @@ def _search_searxng(query: str, max_results: int, port: int, timeout: int) -> Li response.raise_for_status() return [result['url'] for result in response.json().get("results", [])[:max_results]] +def _search_serper(query: str, max_results: int, serper_api_key: str, timeout: int) -> List[str]: + """Helper function for serper api""" + if not serper_api_key: + raise ValueError("API key is required for serper api.") + + url = "https://google.serper.dev/search" + payload = json.dumps({ + "q": query, + "num": max_results + }) + headers = { + 'X-API-KEY': serper_api_key, + 'Content-Type': 'application/json' + } + response = requests.post(url, headers=headers, data=payload, timeout=timeout) + response.raise_for_status() + return [result.get("link") for result in response.json().get("organic", [])] + + def format_proxy(proxy): if isinstance(proxy, dict): server = proxy.get('server') @@ -102,4 +126,4 @@ def filter_pdf_links(links: List[str]) -> List[str]: Returns: List[str]: A list of URLs excluding any that end with '.pdf'. """ - return [link for link in links if not link.lower().endswith('.pdf')] + return [link for link in links if not link.lower().endswith('.pdf')] \ No newline at end of file From 65d39bbaf0671fa5ac84705e94adb42078a36c3b Mon Sep 17 00:00:00 2001 From: semantic-release-bot Date: Sat, 2 Nov 2024 17:36:10 +0000 Subject: [PATCH 2/6] ci(release): 1.28.0-beta.3 [skip ci] ## [1.28.0-beta.3](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.28.0-beta.2...v1.28.0-beta.3) (2024-11-02) ### Features * Serper API integration for Google search ([c218546](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/c218546a3ddbdf987888e150942a244856af66cc)) --- CHANGELOG.md | 7 +++++++ pyproject.toml | 2 +- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 2ee28236..03ac9b41 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,10 @@ +## [1.28.0-beta.3](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.28.0-beta.2...v1.28.0-beta.3) (2024-11-02) + + +### Features + +* Serper API integration for Google search ([c218546](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/c218546a3ddbdf987888e150942a244856af66cc)) + ## [1.28.0-beta.2](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.28.0-beta.1...v1.28.0-beta.2) (2024-10-31) diff --git a/pyproject.toml b/pyproject.toml index dad622d9..b8157393 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -2,7 +2,7 @@ name = "scrapegraphai" -version = "1.28.0b2" +version = "1.28.0b3" From e8cabfd1ae7cc93abc04745948db1f6933fd2e26 Mon Sep 17 00:00:00 2001 From: aziz-ullah-khan Date: Sun, 3 Nov 2024 20:23:29 +0500 Subject: [PATCH 3/6] fix: resolved outparser issue --- .ipynb_checkpoints/Untitled-checkpoint.ipynb | 6 ++++++ scrapegraphai/nodes/generate_answer_node.py | 6 +++--- 2 files changed, 9 insertions(+), 3 deletions(-) create mode 100644 .ipynb_checkpoints/Untitled-checkpoint.ipynb diff --git a/.ipynb_checkpoints/Untitled-checkpoint.ipynb b/.ipynb_checkpoints/Untitled-checkpoint.ipynb new file mode 100644 index 00000000..363fcab7 --- /dev/null +++ b/.ipynb_checkpoints/Untitled-checkpoint.ipynb @@ -0,0 +1,6 @@ +{ + "cells": [], + "metadata": {}, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/scrapegraphai/nodes/generate_answer_node.py b/scrapegraphai/nodes/generate_answer_node.py index 30058ec5..56d57d09 100644 --- a/scrapegraphai/nodes/generate_answer_node.py +++ b/scrapegraphai/nodes/generate_answer_node.py @@ -122,11 +122,11 @@ def execute(self, state: dict) -> dict: partial_variables={"context": doc, "format_instructions": format_instructions} ) chain = prompt | self.llm_model - raw_response = str((prompt | self.llm_model).invoke({"question": user_prompt})) + raw_response = chain.invoke({"question": user_prompt}) if output_parser: try: - answer = output_parser.parse(raw_response) + answer = output_parser.parse(raw_response.content) except JSONDecodeError: lines = raw_response.split('\n') if lines[0].strip().startswith('```'): @@ -136,7 +136,7 @@ def execute(self, state: dict) -> dict: cleaned_response = '\n'.join(lines) answer = output_parser.parse(cleaned_response) else: - answer = raw_response + answer = raw_response.content state.update({self.output[0]: answer}) return state From 7666a91f4f5a8582d6e6150d6885f90d7c5c56f3 Mon Sep 17 00:00:00 2001 From: aziz-ullah-khan Date: Sun, 3 Nov 2024 20:28:50 +0500 Subject: [PATCH 4/6] cleaning --- .ipynb_checkpoints/Untitled-checkpoint.ipynb | 6 ------ 1 file changed, 6 deletions(-) delete mode 100644 .ipynb_checkpoints/Untitled-checkpoint.ipynb diff --git a/.ipynb_checkpoints/Untitled-checkpoint.ipynb b/.ipynb_checkpoints/Untitled-checkpoint.ipynb deleted file mode 100644 index 363fcab7..00000000 --- a/.ipynb_checkpoints/Untitled-checkpoint.ipynb +++ /dev/null @@ -1,6 +0,0 @@ -{ - "cells": [], - "metadata": {}, - "nbformat": 4, - "nbformat_minor": 5 -} From b90bb00beb8497b8dd16fa4d1ef5af22042a55f3 Mon Sep 17 00:00:00 2001 From: semantic-release-bot Date: Sun, 3 Nov 2024 16:57:13 +0000 Subject: [PATCH 5/6] ci(release): 1.28.0-beta.4 [skip ci] ## [1.28.0-beta.4](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.28.0-beta.3...v1.28.0-beta.4) (2024-11-03) ### Bug Fixes * resolved outparser issue ([e8cabfd](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/e8cabfd1ae7cc93abc04745948db1f6933fd2e26)) --- CHANGELOG.md | 7 +++++++ pyproject.toml | 2 +- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 03ac9b41..8d879b51 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,10 @@ +## [1.28.0-beta.4](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.28.0-beta.3...v1.28.0-beta.4) (2024-11-03) + + +### Bug Fixes + +* resolved outparser issue ([e8cabfd](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/e8cabfd1ae7cc93abc04745948db1f6933fd2e26)) + ## [1.28.0-beta.3](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.28.0-beta.2...v1.28.0-beta.3) (2024-11-02) diff --git a/pyproject.toml b/pyproject.toml index b8157393..e16e9aed 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -2,7 +2,7 @@ name = "scrapegraphai" -version = "1.28.0b3" +version = "1.28.0b4" From 950e859b1b90c7d5b85cbfcb0948e93d4487f78d Mon Sep 17 00:00:00 2001 From: semantic-release-bot Date: Mon, 4 Nov 2024 08:10:24 +0000 Subject: [PATCH 6/6] ci(release): 1.29.0-beta.1 [skip ci] ## [1.29.0-beta.1](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.28.0...v1.29.0-beta.1) (2024-11-04) ### Features * Serper API integration for Google search ([c218546](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/c218546a3ddbdf987888e150942a244856af66cc)) ### Bug Fixes * resolved outparser issue ([e8cabfd](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/e8cabfd1ae7cc93abc04745948db1f6933fd2e26)) ### CI * **release:** 1.28.0-beta.3 [skip ci] ([65d39bb](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/65d39bbaf0671fa5ac84705e94adb42078a36c3b)) * **release:** 1.28.0-beta.4 [skip ci] ([b90bb00](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/b90bb00beb8497b8dd16fa4d1ef5af22042a55f3)) --- CHANGELOG.md | 18 ++++++++++++++++++ pyproject.toml | 2 +- 2 files changed, 19 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index c7e2b452..0de76f18 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,21 @@ +## [1.29.0-beta.1](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.28.0...v1.29.0-beta.1) (2024-11-04) + + +### Features + +* Serper API integration for Google search ([c218546](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/c218546a3ddbdf987888e150942a244856af66cc)) + + +### Bug Fixes + +* resolved outparser issue ([e8cabfd](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/e8cabfd1ae7cc93abc04745948db1f6933fd2e26)) + + +### CI + +* **release:** 1.28.0-beta.3 [skip ci] ([65d39bb](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/65d39bbaf0671fa5ac84705e94adb42078a36c3b)) +* **release:** 1.28.0-beta.4 [skip ci] ([b90bb00](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/b90bb00beb8497b8dd16fa4d1ef5af22042a55f3)) + ## [1.28.0-beta.4](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.28.0-beta.3...v1.28.0-beta.4) (2024-11-03) diff --git a/pyproject.toml b/pyproject.toml index 7f5b148d..88fed28e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -2,7 +2,7 @@ name = "scrapegraphai" -version = "1.28.0b4" +version = "1.29.0b1"