diff --git a/CHANGELOG.md b/CHANGELOG.md index ca56021a..16e9a1c1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,10 @@ +## [1.43.1-beta.1](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.43.0...v1.43.1-beta.1) (2025-03-21) + + +### Bug Fixes + +* Fixes schema option not working ([df1645c](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/df1645c5ebc6bc2362992fec3887dcbedf519ba9)) + ## [1.43.0](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.42.1...v1.43.0) (2025-03-13) diff --git a/pyproject.toml b/pyproject.toml index af92ed87..912d22a6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,7 +1,7 @@ [project] name = "scrapegraphai" -version = "1.43.0" +version = "1.43.1b1" @@ -31,7 +31,8 @@ dependencies = [ "async-timeout>=4.0.3", "simpleeval>=1.0.0", "jsonschema>=4.23.0", - "duckduckgo-search>=7.2.1" + "duckduckgo-search>=7.2.1", + "pydantic>=2.10.2", ] readme = "README.md" diff --git a/scrapegraphai/nodes/generate_answer_node.py b/scrapegraphai/nodes/generate_answer_node.py index 1df3091e..5e267a4d 100644 --- a/scrapegraphai/nodes/generate_answer_node.py +++ b/scrapegraphai/nodes/generate_answer_node.py @@ -2,8 +2,8 @@ GenerateAnswerNode Module """ -import time import json +import time from typing import List, Optional from langchain.prompts import PromptTemplate @@ -105,10 +105,7 @@ def process(self, state: dict) -> dict: raise ValueError("No user prompt found in state") # Create the chain input with both content and question keys - chain_input = { - "content": content, - "question": user_prompt - } + chain_input = {"content": content, "question": user_prompt} try: response = self.invoke_with_timeout(self.chain, chain_input, self.timeout) @@ -167,25 +164,13 @@ def execute(self, state: dict) -> dict: and not self.script_creator or self.is_md_scraper ): - template_no_chunks_prompt = ( - TEMPLATE_NO_CHUNKS_MD + "\n\nIMPORTANT: " + format_instructions - ) - template_chunks_prompt = ( - TEMPLATE_CHUNKS_MD + "\n\nIMPORTANT: " + format_instructions - ) - template_merge_prompt = ( - TEMPLATE_MERGE_MD + "\n\nIMPORTANT: " + format_instructions - ) + template_no_chunks_prompt = TEMPLATE_NO_CHUNKS_MD + template_chunks_prompt = TEMPLATE_CHUNKS_MD + template_merge_prompt = TEMPLATE_MERGE_MD else: - template_no_chunks_prompt = ( - TEMPLATE_NO_CHUNKS + "\n\nIMPORTANT: " + format_instructions - ) - template_chunks_prompt = ( - TEMPLATE_CHUNKS + "\n\nIMPORTANT: " + format_instructions - ) - template_merge_prompt = ( - TEMPLATE_MERGE + "\n\nIMPORTANT: " + format_instructions - ) + template_no_chunks_prompt = TEMPLATE_NO_CHUNKS + template_chunks_prompt = TEMPLATE_CHUNKS + template_merge_prompt = TEMPLATE_MERGE if self.additional_info is not None: template_no_chunks_prompt = self.additional_info + template_no_chunks_prompt @@ -210,8 +195,14 @@ def execute(self, state: dict) -> dict: chain, {"question": user_prompt}, self.timeout ) except (Timeout, json.JSONDecodeError) as e: - error_msg = "Response timeout exceeded" if isinstance(e, Timeout) else "Invalid JSON response format" - state.update({self.output[0]: {"error": error_msg, "raw_response": str(e)}}) + error_msg = ( + "Response timeout exceeded" + if isinstance(e, Timeout) + else "Invalid JSON response format" + ) + state.update( + {self.output[0]: {"error": error_msg, "raw_response": str(e)}} + ) return state state.update({self.output[0]: answer}) @@ -241,7 +232,11 @@ def execute(self, state: dict) -> dict: async_runner, {"question": user_prompt}, self.timeout ) except (Timeout, json.JSONDecodeError) as e: - error_msg = "Response timeout exceeded during chunk processing" if isinstance(e, Timeout) else "Invalid JSON response format in chunk processing" + error_msg = ( + "Response timeout exceeded during chunk processing" + if isinstance(e, Timeout) + else "Invalid JSON response format in chunk processing" + ) state.update({self.output[0]: {"error": error_msg, "raw_response": str(e)}}) return state @@ -261,7 +256,11 @@ def execute(self, state: dict) -> dict: self.timeout, ) except (Timeout, json.JSONDecodeError) as e: - error_msg = "Response timeout exceeded during merge" if isinstance(e, Timeout) else "Invalid JSON response format during merge" + error_msg = ( + "Response timeout exceeded during merge" + if isinstance(e, Timeout) + else "Invalid JSON response format during merge" + ) state.update({self.output[0]: {"error": error_msg, "raw_response": str(e)}}) return state diff --git a/tests/graphs/smart_scraper_openai_test.py b/tests/graphs/smart_scraper_openai_test.py index 90d6a7a7..cfe5cb2a 100644 --- a/tests/graphs/smart_scraper_openai_test.py +++ b/tests/graphs/smart_scraper_openai_test.py @@ -6,6 +6,7 @@ import pytest from dotenv import load_dotenv +from pydantic import BaseModel from scrapegraphai.graphs import SmartScraperGraph @@ -53,3 +54,27 @@ def test_get_execution_info(graph_config): graph_exec_info = smart_scraper_graph.get_execution_info() assert graph_exec_info is not None + + +def test_get_execution_info_with_schema(graph_config): + """Get the execution info with schema""" + + class ProjectSchema(BaseModel): + title: str + description: str + + class ProjectListSchema(BaseModel): + projects: list[ProjectSchema] + + smart_scraper_graph = SmartScraperGraph( + prompt="List me all the projects with their description.", + source="https://perinim.github.io/projects/", + config=graph_config, + schema=ProjectListSchema, + ) + + smart_scraper_graph.run() + + graph_exec_info = smart_scraper_graph.get_execution_info() + + assert graph_exec_info is not None diff --git a/uv.lock b/uv.lock index 4a3e6063..9ab0fd3e 100644 --- a/uv.lock +++ b/uv.lock @@ -1,4 +1,5 @@ version = 1 +revision = 1 requires-python = ">=3.10, <4.0" resolution-markers = [ "python_full_version < '3.11' and sys_platform == 'darwin'", @@ -3377,7 +3378,7 @@ wheels = [ [[package]] name = "scrapegraphai" -version = "1.40.1" +version = "1.43.0" source = { editable = "." } dependencies = [ { name = "async-timeout", version = "4.0.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.12'" }, @@ -3395,6 +3396,7 @@ dependencies = [ { name = "langchain-openai" }, { name = "minify-html" }, { name = "playwright" }, + { name = "pydantic" }, { name = "python-dotenv" }, { name = "semchunk" }, { name = "simpleeval" }, @@ -3456,6 +3458,7 @@ requires-dist = [ { name = "minify-html", specifier = ">=0.15.0" }, { name = "pillow", marker = "extra == 'ocr'", specifier = ">=10.4.0" }, { name = "playwright", specifier = ">=1.43.0" }, + { name = "pydantic", specifier = ">=2.10.2" }, { name = "python-dotenv", specifier = ">=1.0.1" }, { name = "semchunk", specifier = ">=2.2.0" }, { name = "simpleeval", specifier = ">=1.0.0" }, @@ -3465,6 +3468,7 @@ requires-dist = [ { name = "tqdm", specifier = ">=4.66.4" }, { name = "undetected-playwright", specifier = ">=0.3.0" }, ] +provides-extras = ["burr", "docs", "ocr"] [package.metadata.requires-dev] dev = [