diff --git a/CHANGELOG.md b/CHANGELOG.md
index 6c6eb5ce..0de76f18 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,35 @@
+## [1.29.0-beta.1](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.28.0...v1.29.0-beta.1) (2024-11-04)
+
+
+### Features
+
+* Serper API integration for Google search ([c218546](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/c218546a3ddbdf987888e150942a244856af66cc))
+
+
+### Bug Fixes
+
+* resolved outparser issue ([e8cabfd](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/e8cabfd1ae7cc93abc04745948db1f6933fd2e26))
+
+
+### CI
+
+* **release:** 1.28.0-beta.3 [skip ci] ([65d39bb](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/65d39bbaf0671fa5ac84705e94adb42078a36c3b))
+* **release:** 1.28.0-beta.4 [skip ci] ([b90bb00](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/b90bb00beb8497b8dd16fa4d1ef5af22042a55f3))
+
+## [1.28.0-beta.4](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.28.0-beta.3...v1.28.0-beta.4) (2024-11-03)
+
+
+### Bug Fixes
+
+* resolved outparser issue ([e8cabfd](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/e8cabfd1ae7cc93abc04745948db1f6933fd2e26))
+
+## [1.28.0-beta.3](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.28.0-beta.2...v1.28.0-beta.3) (2024-11-02)
+
+
+### Features
+
+* Serper API integration for Google search ([c218546](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/c218546a3ddbdf987888e150942a244856af66cc))
+
 ## [1.28.0](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.27.0...v1.28.0) (2024-11-01)
 
 
diff --git a/pyproject.toml b/pyproject.toml
index b9c50f43..88fed28e 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -2,7 +2,8 @@
 name = "scrapegraphai"
-version = "1.28.0"
+version = "1.29.0b1"
+
 
 
 
 
diff --git a/scrapegraphai/graphs/search_graph.py b/scrapegraphai/graphs/search_graph.py
index 29a91376..987eab8b 100644
--- a/scrapegraphai/graphs/search_graph.py
+++ b/scrapegraphai/graphs/search_graph.py
@@ -66,7 +66,8 @@ def _create_graph(self) -> BaseGraph:
                 "llm_model": self.llm_model,
                 "max_results": self.max_results,
                 "loader_kwargs": self.loader_kwargs,
-                "search_engine": self.copy_config.get("search_engine")
+                "search_engine": self.copy_config.get("search_engine"),
+                "serper_api_key": self.copy_config.get("serper_api_key")
             }
         )
 
diff --git a/scrapegraphai/nodes/generate_answer_node.py b/scrapegraphai/nodes/generate_answer_node.py
index 30058ec5..56d57d09 100644
--- a/scrapegraphai/nodes/generate_answer_node.py
+++ b/scrapegraphai/nodes/generate_answer_node.py
@@ -122,11 +122,11 @@ def execute(self, state: dict) -> dict:
                 partial_variables={"context": doc, "format_instructions": format_instructions}
             )
             chain = prompt | self.llm_model
-            raw_response = str((prompt | self.llm_model).invoke({"question": user_prompt}))
+            raw_response = chain.invoke({"question": user_prompt})
 
             if output_parser:
                 try:
-                    answer = output_parser.parse(raw_response)
+                    answer = output_parser.parse(raw_response.content)
                 except JSONDecodeError:
                     lines = raw_response.split('\n')
                     if lines[0].strip().startswith('```'):
@@ -136,7 +136,7 @@ def execute(self, state: dict) -> dict:
                     cleaned_response = '\n'.join(lines)
                     answer = output_parser.parse(cleaned_response)
             else:
-                answer = raw_response
+                answer = raw_response.content
 
             state.update({self.output[0]: answer})
             return state
diff --git a/scrapegraphai/nodes/search_internet_node.py b/scrapegraphai/nodes/search_internet_node.py
index e318f923..278f81ab 100644
--- a/scrapegraphai/nodes/search_internet_node.py
+++ b/scrapegraphai/nodes/search_internet_node.py
@@ -47,6 +47,13 @@ def __init__(
             if node_config.get("search_engine")
             else "google"
         )
+
+        self.serper_api_key = (
+            node_config["serper_api_key"]
+            if node_config.get("serper_api_key")
+            else None
+        )
+
         self.max_results = node_config.get("max_results", 3)
 
     def execute(self, state: dict) -> dict:
@@ -95,7 +102,7 @@ def execute(self, state: dict) -> dict:
         self.logger.info(f"Search Query: {search_query}")
 
         answer = search_on_web(query=search_query, max_results=self.max_results,
-                               search_engine=self.search_engine, proxy=self.proxy)
+                               search_engine=self.search_engine, proxy=self.proxy, serper_api_key=self.serper_api_key)
 
         if len(answer) == 0:
             raise ValueError("Zero results found for the search query.")
diff --git a/scrapegraphai/utils/research_web.py b/scrapegraphai/utils/research_web.py
index 86f9f5f3..f2d0c254 100644
--- a/scrapegraphai/utils/research_web.py
+++ b/scrapegraphai/utils/research_web.py
@@ -7,10 +7,12 @@
 from googlesearch import search as google_search
 import requests
 from bs4 import BeautifulSoup
+import json
 
 def search_on_web(query: str, search_engine: str = "Google",
                   max_results: int = 10, port: int = 8080,
-                  timeout: int = 10, proxy: str | dict = None) -> List[str]:
+                  timeout: int = 10, proxy: str | dict = None,
+                  serper_api_key: str = None) -> List[str]:
     """Search web function with improved error handling and validation"""
 
     # Input validation
@@ -18,7 +20,7 @@ def search_on_web(query: str, search_engine: str = "Google",
         raise ValueError("Query must be a non-empty string")
 
     search_engine = search_engine.lower()
-    valid_engines = {"google", "duckduckgo", "bing", "searxng"}
+    valid_engines = {"google", "duckduckgo", "bing", "searxng", "serper"}
     if search_engine not in valid_engines:
         raise ValueError(f"Search engine must be one of: {', '.join(valid_engines)}")
 
@@ -42,7 +44,10 @@ def search_on_web(query: str, search_engine: str = "Google",
 
         elif search_engine == "searxng":
             results = _search_searxng(query, max_results, port, timeout)
-
+
+        elif search_engine.lower() == "serper":
+            results = _search_serper(query, max_results, serper_api_key, timeout)
+
         return filter_pdf_links(results)
 
     except requests.Timeout:
@@ -76,6 +81,25 @@ def _search_searxng(query: str, max_results: int, port: int, timeout: int) -> Li
     response.raise_for_status()
     return [result['url'] for result in response.json().get("results", [])[:max_results]]
 
+def _search_serper(query: str, max_results: int, serper_api_key: str, timeout: int) -> List[str]:
+    """Helper function for serper api"""
+    if not serper_api_key:
+        raise ValueError("API key is required for serper api.")
+
+    url = "https://google.serper.dev/search"
+    payload = json.dumps({
+        "q": query,
+        "num": max_results
+    })
+    headers = {
+        'X-API-KEY': serper_api_key,
+        'Content-Type': 'application/json'
+    }
+    response = requests.post(url, headers=headers, data=payload, timeout=timeout)
+    response.raise_for_status()
+    return [result.get("link") for result in response.json().get("organic", [])]
+
+
 def format_proxy(proxy):
     if isinstance(proxy, dict):
         server = proxy.get('server')
@@ -102,4 +126,4 @@ def filter_pdf_links(links: List[str]) -> List[str]:
     Returns:
         List[str]: A list of URLs excluding any that end with '.pdf'.
     """
-    return [link for link in links if not link.lower().endswith('.pdf')]
+    return [link for link in links if not link.lower().endswith('.pdf')]
\ No newline at end of file
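
Taken together, these changes let the Serper engine be selected from the top-level graph config: `SearchGraph` forwards `search_engine` and `serper_api_key` from `copy_config` into `SearchInternetNode`, which passes them through to `search_on_web`. A minimal usage sketch, assuming the usual `SearchGraph(prompt, config)` constructor; the `llm` block, model name, prompt, and keys below are illustrative placeholders, not values from this patch:

```python
from scrapegraphai.graphs import SearchGraph

graph_config = {
    "llm": {
        "api_key": "YOUR_OPENAI_API_KEY",   # placeholder credentials
        "model": "openai/gpt-4o-mini",      # placeholder model name
    },
    "search_engine": "serper",              # engine added by this patch
    "serper_api_key": "YOUR_SERPER_API_KEY",
    "max_results": 5,
}

search_graph = SearchGraph(
    prompt="List the top open-source web scraping libraries for Python",
    config=graph_config,
)
print(search_graph.run())
```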
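
The new helper can also be exercised directly through `search_on_web`; per the patch, it raises `ValueError` when `serper_api_key` is omitted for the `serper` engine, and `.pdf` links are filtered from the returned URLs. A short sketch with placeholder query and key:

```python
from scrapegraphai.utils.research_web import search_on_web

# Returns a list of organic-result URLs from Serper, PDFs excluded.
links = search_on_web(
    query="web scraping frameworks",         # placeholder query
    search_engine="serper",
    max_results=5,
    serper_api_key="YOUR_SERPER_API_KEY",    # required for this engine
)
print(links)
```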