Skip to content

fixed json on generate answer #784

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 9 commits into from
Nov 4, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 32 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,35 @@
## [1.29.0-beta.1](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.28.0...v1.29.0-beta.1) (2024-11-04)


### Features

* Serper API integration for Google search ([c218546](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/c218546a3ddbdf987888e150942a244856af66cc))


### Bug Fixes

* resolved outparser issue ([e8cabfd](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/e8cabfd1ae7cc93abc04745948db1f6933fd2e26))


### CI

* **release:** 1.28.0-beta.3 [skip ci] ([65d39bb](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/65d39bbaf0671fa5ac84705e94adb42078a36c3b))
* **release:** 1.28.0-beta.4 [skip ci] ([b90bb00](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/b90bb00beb8497b8dd16fa4d1ef5af22042a55f3))

## [1.28.0-beta.4](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.28.0-beta.3...v1.28.0-beta.4) (2024-11-03)


### Bug Fixes

* resolved outparser issue ([e8cabfd](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/e8cabfd1ae7cc93abc04745948db1f6933fd2e26))

## [1.28.0-beta.3](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.28.0-beta.2...v1.28.0-beta.3) (2024-11-02)


### Features

* Serper API integration for Google search ([c218546](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/c218546a3ddbdf987888e150942a244856af66cc))

## [1.28.0](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.27.0...v1.28.0) (2024-11-01)


Expand Down
3 changes: 2 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,8 @@
name = "scrapegraphai"


version = "1.28.0"
version = "1.29.0b1"




Expand Down
3 changes: 2 additions & 1 deletion scrapegraphai/graphs/search_graph.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,8 @@ def _create_graph(self) -> BaseGraph:
"llm_model": self.llm_model,
"max_results": self.max_results,
"loader_kwargs": self.loader_kwargs,
"search_engine": self.copy_config.get("search_engine")
"search_engine": self.copy_config.get("search_engine"),
"serper_api_key": self.copy_config.get("serper_api_key")
}
)

Expand Down
6 changes: 3 additions & 3 deletions scrapegraphai/nodes/generate_answer_node.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,11 +122,11 @@ def execute(self, state: dict) -> dict:
partial_variables={"context": doc, "format_instructions": format_instructions}
)
chain = prompt | self.llm_model
raw_response = str((prompt | self.llm_model).invoke({"question": user_prompt}))
raw_response = chain.invoke({"question": user_prompt})

if output_parser:
try:
answer = output_parser.parse(raw_response)
answer = output_parser.parse(raw_response.content)
except JSONDecodeError:
lines = raw_response.split('\n')
if lines[0].strip().startswith('```'):
Expand All @@ -136,7 +136,7 @@ def execute(self, state: dict) -> dict:
cleaned_response = '\n'.join(lines)
answer = output_parser.parse(cleaned_response)
else:
answer = raw_response
answer = raw_response.content

state.update({self.output[0]: answer})
return state
Expand Down
9 changes: 8 additions & 1 deletion scrapegraphai/nodes/search_internet_node.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,13 @@ def __init__(
if node_config.get("search_engine")
else "google"
)

self.serper_api_key = (
node_config["serper_api_key"]
if node_config.get("serper_api_key")
else None
)

self.max_results = node_config.get("max_results", 3)

def execute(self, state: dict) -> dict:
Expand Down Expand Up @@ -95,7 +102,7 @@ def execute(self, state: dict) -> dict:
self.logger.info(f"Search Query: {search_query}")

answer = search_on_web(query=search_query, max_results=self.max_results,
search_engine=self.search_engine, proxy=self.proxy)
search_engine=self.search_engine, proxy=self.proxy, serper_api_key=self.serper_api_key)

if len(answer) == 0:
raise ValueError("Zero results found for the search query.")
Expand Down
32 changes: 28 additions & 4 deletions scrapegraphai/utils/research_web.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,18 +7,20 @@
from googlesearch import search as google_search
import requests
from bs4 import BeautifulSoup
import json

def search_on_web(query: str, search_engine: str = "Google",
max_results: int = 10, port: int = 8080,
timeout: int = 10, proxy: str | dict = None) -> List[str]:
timeout: int = 10, proxy: str | dict = None,
serper_api_key: str = None) -> List[str]:
"""Search web function with improved error handling and validation"""

# Input validation
if not query or not isinstance(query, str):
raise ValueError("Query must be a non-empty string")

search_engine = search_engine.lower()
valid_engines = {"google", "duckduckgo", "bing", "searxng"}
valid_engines = {"google", "duckduckgo", "bing", "searxng", "serper"}
if search_engine not in valid_engines:
raise ValueError(f"Search engine must be one of: {', '.join(valid_engines)}")

Expand All @@ -42,7 +44,10 @@ def search_on_web(query: str, search_engine: str = "Google",

elif search_engine == "searxng":
results = _search_searxng(query, max_results, port, timeout)


elif search_engine.lower() == "serper":
results = _search_serper(query, max_results, serper_api_key, timeout)

return filter_pdf_links(results)

except requests.Timeout:
Expand Down Expand Up @@ -76,6 +81,25 @@ def _search_searxng(query: str, max_results: int, port: int, timeout: int) -> Li
response.raise_for_status()
return [result['url'] for result in response.json().get("results", [])[:max_results]]

def _search_serper(query: str, max_results: int, serper_api_key: str, timeout: int) -> List[str]:
"""Helper function for serper api"""
if not serper_api_key:
raise ValueError("API key is required for serper api.")

url = "https://google.serper.dev/search"
payload = json.dumps({
"q": query,
"num": max_results
})
headers = {
'X-API-KEY': serper_api_key,
'Content-Type': 'application/json'
}
response = requests.post(url, headers=headers, data=payload, timeout=timeout)
response.raise_for_status()
return [result.get("link") for result in response.json().get("organic", [])]


def format_proxy(proxy):
if isinstance(proxy, dict):
server = proxy.get('server')
Expand All @@ -102,4 +126,4 @@ def filter_pdf_links(links: List[str]) -> List[str]:
Returns:
List[str]: A list of URLs excluding any that end with '.pdf'.
"""
return [link for link in links if not link.lower().endswith('.pdf')]
return [link for link in links if not link.lower().endswith('.pdf')]