diff --git a/CHANGELOG.md b/CHANGELOG.md index e2d0ea20..ca56021a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,43 @@ +## [1.43.0](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.42.1...v1.43.0) (2025-03-13) + + +### Features + +* add integration for o3-mini ([fc0a148](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/fc0a1480174e59e395232af123ad8ce64595e029)) + +## [1.42.1](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.42.0...v1.42.1) (2025-03-12) + + +### Bug Fixes + +* add new gpt model ([cff799b](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/cff799b50d60089f175649eec00da1c5dceeed95)) + +## [1.42.0](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.41.0...v1.42.0) (2025-03-10) + + +### Features + +* update terms ([ff7b33b](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/ff7b33b376720c81984142f2783f2e8729b5a525)) + +## [1.41.0](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.40.1...v1.41.0) (2025-03-09) + + +### Features + +* add CLoD integration ([4e0e785](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/4e0e78582c3a75e64c5eba26ce40b5ffbf05d58e)) + + +### Test + +* Add coverage improvement test for tests/test_generate_answer_node.py ([6769c0d](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/6769c0d43ab72f1c8b520dd28d19f747b22f9b7c)) +* Add coverage improvement test for tests/test_models_tokens.py ([b21e781](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/b21e781ce340c7fa2c5a99a28b7c23e06e950f1e)) +* Update coverage improvement test for tests/graphs/abstract_graph_test.py ([f296ac4](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/f296ac4d5088a74d4f50e7262631f202a68b152c)) + + +### CI + +* **release:** 1.41.0-beta.1 [skip ci] ([7bfe494](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/7bfe494237279d73cefe4161a0b8e95491329ccb)) + ## [1.41.0-beta.1](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.40.1...v1.41.0-beta.1) (2025-03-07) diff --git 
a/README.md b/README.md index 00a169cb..a345061f 100644 --- a/README.md +++ b/README.md @@ -93,7 +93,7 @@ The output will be a dictionary like the following: "description": "ScrapeGraphAI transforms websites into clean, organized data for AI agents and data analytics. It offers an AI-powered API for effortless and cost-effective data extraction.", "founders": [ { - "name": "Marco Perini", + "name": "", "role": "Founder & Technical Lead", "linkedin": "https://www.linkedin.com/in/perinim/" }, @@ -193,7 +193,7 @@ We collect anonymous usage metrics to enhance our package's quality and user exp If you have used our library for research purposes please quote us with the following reference: ```text @misc{scrapegraph-ai, - author = {Marco Perini, Lorenzo Padoan, Marco Vinciguerra}, + author = {Lorenzo Padoan, Marco Vinciguerra}, title = {Scrapegraph-ai}, year = {2024}, url = {https://github.com/VinciGit00/Scrapegraph-ai}, @@ -203,14 +203,9 @@ If you have used our library for research purposes please quote us with the foll ## Authors -

- Authors_logos -

- | | Contact Info | |--------------------|----------------------| | Marco Vinciguerra | [![Linkedin Badge](https://img.shields.io/badge/-Linkedin-blue?style=flat&logo=Linkedin&logoColor=white)](https://www.linkedin.com/in/marco-vinciguerra-7ba365242/) | -| Marco Perini | [![Linkedin Badge](https://img.shields.io/badge/-Linkedin-blue?style=flat&logo=Linkedin&logoColor=white)](https://www.linkedin.com/in/perinim/) | | Lorenzo Padoan | [![Linkedin Badge](https://img.shields.io/badge/-Linkedin-blue?style=flat&logo=Linkedin&logoColor=white)](https://www.linkedin.com/in/lorenzo-padoan-4521a2154/) | ## 📜 License diff --git a/docs/chinese.md b/docs/chinese.md index 5eb6460c..9eb6b73e 100644 --- a/docs/chinese.md +++ b/docs/chinese.md @@ -195,7 +195,7 @@ print(result) 如果您将我们的库用于研究目的,请引用以下参考文献: ```text @misc{scrapegraph-ai, - author = {Marco Perini, Lorenzo Padoan, Marco Vinciguerra}, + author = {Lorenzo Padoan, Marco Vinciguerra}, title = {Scrapegraph-ai}, year = {2024}, url = {https://github.com/VinciGit00/Scrapegraph-ai}, @@ -212,7 +212,7 @@ print(result) | | Contact Info | |--------------------|----------------------| | Marco Vinciguerra | [![Linkedin Badge](https://img.shields.io/badge/-Linkedin-blue?style=flat&logo=Linkedin&logoColor=white)](https://www.linkedin.com/in/marco-vinciguerra-7ba365242/) | -| Marco Perini | [![Linkedin Badge](https://img.shields.io/badge/-Linkedin-blue?style=flat&logo=Linkedin&logoColor=white)](https://www.linkedin.com/in/perinim/) | +| | [![Linkedin Badge](https://img.shields.io/badge/-Linkedin-blue?style=flat&logo=Linkedin&logoColor=white)](https://www.linkedin.com/in/perinim/) | | Lorenzo Padoan | [![Linkedin Badge](https://img.shields.io/badge/-Linkedin-blue?style=flat&logo=Linkedin&logoColor=white)](https://www.linkedin.com/in/lorenzo-padoan-4521a2154/) | ## 📜 许可证 diff --git a/docs/japanese.md b/docs/japanese.md index 7279321f..1b4a911f 100644 --- a/docs/japanese.md +++ b/docs/japanese.md @@ -195,7 +195,7 @@ print(result) 
研究目的で当社のライブラリを使用する場合は、以下の参考文献を引用してください: ```text @misc{scrapegraph-ai, - author = {Marco Perini, Lorenzo Padoan, Marco Vinciguerra}, + author = {Lorenzo Padoan, Marco Vinciguerra}, title = {Scrapegraph-ai}, year = {2024}, url = {https://github.com/VinciGit00/Scrapegraph-ai}, @@ -212,7 +212,7 @@ print(result) | | 連絡先 | |--------------------|----------------------| | Marco Vinciguerra | [![Linkedin Badge](https://img.shields.io/badge/-Linkedin-blue?style=flat&logo=Linkedin&logoColor=white)](https://www.linkedin.com/in/marco-vinciguerra-7ba365242/) | -| Marco Perini | [![Linkedin Badge](https://img.shields.io/badge/-Linkedin-blue?style=flat&logo=Linkedin&logoColor=white)](https://www.linkedin.com/in/perinim/) | +| | [![Linkedin Badge](https://img.shields.io/badge/-Linkedin-blue?style=flat&logo=Linkedin&logoColor=white)](https://www.linkedin.com/in/perinim/) | | Lorenzo Padoan | [![Linkedin Badge](https://img.shields.io/badge/-Linkedin-blue?style=flat&logo=Linkedin&logoColor=white)](https://www.linkedin.com/in/lorenzo-padoan-4521a2154/) | ## 📜 ライセンス diff --git a/docs/korean.md b/docs/korean.md index 64c287a0..756b9e71 100644 --- a/docs/korean.md +++ b/docs/korean.md @@ -211,7 +211,7 @@ graph LR 우리의 라이브러리를 연구 목적으로 사용한 경우 다음과 같이 인용해 주세요: ```text @misc{scrapegraph-ai, - author = {Marco Perini, Lorenzo Padoan, Marco Vinciguerra}, + author = {Lorenzo Padoan, Marco Vinciguerra}, title = {Scrapegraph-ai}, year = {2024}, url = {https://github.com/VinciGit00/Scrapegraph-ai}, @@ -228,7 +228,7 @@ graph LR | | 연락처 | |--------------------|---------------| | Marco Vinciguerra | [![Linkedin Badge](https://img.shields.io/badge/-Linkedin-blue?style=flat&logo=Linkedin&logoColor=white)](https://www.linkedin.com/in/marco-vinciguerra-7ba365242/) | -| Marco Perini | [![Linkedin Badge](https://img.shields.io/badge/-Linkedin-blue?style=flat&logo=Linkedin&logoColor=white)](https://www.linkedin.com/in/perinim/) | +| | [![Linkedin 
Badge](https://img.shields.io/badge/-Linkedin-blue?style=flat&logo=Linkedin&logoColor=white)](https://www.linkedin.com/in/perinim/) | | Lorenzo Padoan | [![Linkedin Badge](https://img.shields.io/badge/-Linkedin-blue?style=flat&logo=Linkedin&logoColor=white)](https://www.linkedin.com/in/lorenzo-padoan-4521a2154/) | ## 📜 라이선스 diff --git a/docs/russian.md b/docs/russian.md index ac16ef41..995916d8 100644 --- a/docs/russian.md +++ b/docs/russian.md @@ -218,7 +218,7 @@ print(result) | | Контактная информация | |--------------------|------------------------| | Marco Vinciguerra | [![Linkedin Badge](https://img.shields.io/badge/-Linkedin-blue?style=flat&logo=Linkedin&logoColor=white)](https://www.linkedin.com/in/marco-vinciguerra-7ba365242/) | -| Marco Perini | [![Linkedin Badge](https://img.shields.io/badge/-Linkedin-blue?style=flat&logo=Linkedin&logoColor=white)](https://www.linkedin.com/in/perinim/) | +| | [![Linkedin Badge](https://img.shields.io/badge/-Linkedin-blue?style=flat&logo=Linkedin&logoColor=white)](https://www.linkedin.com/in/perinim/) | | Lorenzo Padoan | [![Linkedin Badge](https://img.shields.io/badge/-Linkedin-blue?style=flat&logo=Linkedin&logoColor=white)](https://www.linkedin.com/in/lorenzo-padoan-4521a2154/) | ## 📜 Лицензия diff --git a/docs/source/conf.py b/docs/source/conf.py index f7d44113..790cfa15 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -16,7 +16,7 @@ project = "ScrapeGraphAI" copyright = "2024, ScrapeGraphAI" -author = "Marco Vinciguerra, Marco Perini, Lorenzo Padoan" +author = "Marco Vinciguerra, , Lorenzo Padoan" html_last_updated_fmt = "%b %d, %Y" diff --git a/docs/turkish.md b/docs/turkish.md index f5799dd2..fb94acb4 100644 --- a/docs/turkish.md +++ b/docs/turkish.md @@ -148,7 +148,7 @@ Kütüphanemizi araştırma amaçlı kullandıysanız, lütfen bizi aşağıdaki ```text @misc{scrapegraph-ai, - author = {Marco Perini, Lorenzo Padoan, Marco Vinciguerra}, + author = {Lorenzo Padoan, Marco Vinciguerra}, title = {Scrapegraph-ai}, 
year = {2024}, url = {https://github.com/VinciGit00/Scrapegraph-ai}, @@ -165,7 +165,7 @@ Kütüphanemizi araştırma amaçlı kullandıysanız, lütfen bizi aşağıdaki | | İletişim Bilgileri | | ----------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | Marco Vinciguerra | [![Linkedin Badge](https://img.shields.io/badge/-Linkedin-blue?style=flat&logo=Linkedin&logoColor=white)](https://www.linkedin.com/in/marco-vinciguerra-7ba365242/) | -| Marco Perini | [![Linkedin Badge](https://img.shields.io/badge/-Linkedin-blue?style=flat&logo=Linkedin&logoColor=white)](https://www.linkedin.com/in/perinim/) | +| | [![Linkedin Badge](https://img.shields.io/badge/-Linkedin-blue?style=flat&logo=Linkedin&logoColor=white)](https://www.linkedin.com/in/perinim/) | | Lorenzo Padoan | [![Linkedin Badge](https://img.shields.io/badge/-Linkedin-blue?style=flat&logo=Linkedin&logoColor=white)](https://www.linkedin.com/in/lorenzo-padoan-4521a2154/) | ## 📜 Lisans diff --git a/examples/ScrapegraphAI_cookbook.ipynb b/examples/ScrapegraphAI_cookbook.ipynb index 5016c8c7..3ef7eb1e 100644 --- a/examples/ScrapegraphAI_cookbook.ipynb +++ b/examples/ScrapegraphAI_cookbook.ipynb @@ -537,7 +537,7 @@ "text": [ "{\n", " \"summary\": {\n", - " \"title\": \"Projects | Marco Perini\",\n", + " \"title\": \"Projects | \",\n", " \"projects\": [\n", " {\n", " \"title\": \"Rotary Pendulum RL\",\n", diff --git a/examples/document_scraper_graph/ollama/inputs/plain_html_example.txt b/examples/document_scraper_graph/ollama/inputs/plain_html_example.txt index 2476565e..3320b6cb 100644 --- a/examples/document_scraper_graph/ollama/inputs/plain_html_example.txt +++ b/examples/document_scraper_graph/ollama/inputs/plain_html_example.txt @@ -99,7 +99,7 @@
diff --git a/examples/document_scraper_graph/openai/inputs/markdown_example.md b/examples/document_scraper_graph/openai/inputs/markdown_example.md index d062d5d0..e83f92b6 100644 --- a/examples/document_scraper_graph/openai/inputs/markdown_example.md +++ b/examples/document_scraper_graph/openai/inputs/markdown_example.md @@ -1,4 +1,4 @@ -Marco Perini Toggle navigation + Toggle navigation * About * Projects(current) @@ -30,6 +30,6 @@ surroundings, considering uncertainties in their readings. Drones Modular drone architecture proposal and proof of concept. The project received maximum grade. ](/projects/wireless-esc-drone/) -© Copyright 2023 Marco Perini. Powered by Jekyll with +© Copyright 2023 . Powered by Jekyll with al-folio theme. Hosted by [GitHub Pages](https://pages.github.com/). diff --git a/examples/document_scraper_graph/openai/inputs/plain_html_example.txt b/examples/document_scraper_graph/openai/inputs/plain_html_example.txt index 2476565e..3320b6cb 100644 --- a/examples/document_scraper_graph/openai/inputs/plain_html_example.txt +++ b/examples/document_scraper_graph/openai/inputs/plain_html_example.txt @@ -99,7 +99,7 @@
diff --git a/examples/extras/cond_smartscraper_usage.py b/examples/extras/cond_smartscraper_usage.py index 5c5e82e9..5832e477 100644 --- a/examples/extras/cond_smartscraper_usage.py +++ b/examples/extras/cond_smartscraper_usage.py @@ -30,7 +30,7 @@ # ******************************************************* multiple_search_graph = SmartScraperGraph( - prompt="Who is Marco Perini?", + prompt="Who is ?", source="https://perinim.github.io/", schema=None, config=graph_config, diff --git a/examples/smart_scraper_graph/ollama/smart_scraper_lite_ollama.py b/examples/smart_scraper_graph/ollama/smart_scraper_lite_ollama.py index 9c4b9a69..72db77aa 100644 --- a/examples/smart_scraper_graph/ollama/smart_scraper_lite_ollama.py +++ b/examples/smart_scraper_graph/ollama/smart_scraper_lite_ollama.py @@ -19,7 +19,7 @@ } smart_scraper_lite_graph = SmartScraperLiteGraph( - prompt="Who is Marco Perini?", + prompt="Who is ?", source="https://perinim.github.io/", config=graph_config, ) diff --git a/examples/smart_scraper_graph/ollama/smart_scraper_multi_concat_ollama.py b/examples/smart_scraper_graph/ollama/smart_scraper_multi_concat_ollama.py index a29ac3fc..1264a0ad 100644 --- a/examples/smart_scraper_graph/ollama/smart_scraper_multi_concat_ollama.py +++ b/examples/smart_scraper_graph/ollama/smart_scraper_multi_concat_ollama.py @@ -29,7 +29,7 @@ # ******************************************************* multiple_search_graph = SmartScraperMultiConcatGraph( - prompt="Who is Marco Perini?", + prompt="Who is ?", source=["https://perinim.github.io/", "https://perinim.github.io/cv/"], schema=None, config=graph_config, diff --git a/examples/smart_scraper_graph/ollama/smart_scraper_multi_lite_ollama.py b/examples/smart_scraper_graph/ollama/smart_scraper_multi_lite_ollama.py index 15055f96..a93803f3 100644 --- a/examples/smart_scraper_graph/ollama/smart_scraper_multi_lite_ollama.py +++ b/examples/smart_scraper_graph/ollama/smart_scraper_multi_lite_ollama.py @@ -26,7 +26,7 @@ # 
************************************************ smart_scraper_multi_lite_graph = SmartScraperMultiLiteGraph( - prompt="Who is Marco Perini?", + prompt="Who is ?", source=["https://perinim.github.io/", "https://perinim.github.io/cv/"], config=graph_config, ) diff --git a/examples/smart_scraper_graph/ollama/smart_scraper_multi_ollama.py b/examples/smart_scraper_graph/ollama/smart_scraper_multi_ollama.py index 04eb0e67..2bc0c425 100644 --- a/examples/smart_scraper_graph/ollama/smart_scraper_multi_ollama.py +++ b/examples/smart_scraper_graph/ollama/smart_scraper_multi_ollama.py @@ -25,7 +25,7 @@ # ******************************************************* multiple_search_graph = SmartScraperMultiGraph( - prompt="Who is Marco Perini?", + prompt="Who is ?", source=["https://perinim.github.io/", "https://perinim.github.io/cv/"], schema=None, config=graph_config, diff --git a/examples/smart_scraper_graph/openai/smart_scraper_lite_openai.py b/examples/smart_scraper_graph/openai/smart_scraper_lite_openai.py index 3d768548..32d6860b 100644 --- a/examples/smart_scraper_graph/openai/smart_scraper_lite_openai.py +++ b/examples/smart_scraper_graph/openai/smart_scraper_lite_openai.py @@ -22,7 +22,7 @@ } smart_scraper_lite_graph = SmartScraperLiteGraph( - prompt="Who is Marco Perini?", + prompt="Who is ?", source="https://perinim.github.io/", config=graph_config, ) diff --git a/examples/smart_scraper_graph/openai/smart_scraper_multi_concat_openai.py b/examples/smart_scraper_graph/openai/smart_scraper_multi_concat_openai.py index 4774e620..9ae14800 100644 --- a/examples/smart_scraper_graph/openai/smart_scraper_multi_concat_openai.py +++ b/examples/smart_scraper_graph/openai/smart_scraper_multi_concat_openai.py @@ -30,7 +30,7 @@ # ******************************************************* multiple_search_graph = SmartScraperMultiConcatGraph( - prompt="Who is Marco Perini?", + prompt="Who is ?", source=["https://perinim.github.io/", "https://perinim.github.io/cv/"], schema=None, 
config=graph_config, diff --git a/examples/smart_scraper_graph/openai/smart_scraper_multi_lite_openai.py b/examples/smart_scraper_graph/openai/smart_scraper_multi_lite_openai.py index acc970be..31dbd7f3 100644 --- a/examples/smart_scraper_graph/openai/smart_scraper_multi_lite_openai.py +++ b/examples/smart_scraper_graph/openai/smart_scraper_multi_lite_openai.py @@ -31,7 +31,7 @@ # ************************************************ smart_scraper_multi_lite_graph = SmartScraperMultiLiteGraph( - prompt="Who is Marco Perini?", + prompt="Who is ?", source=["https://perinim.github.io/", "https://perinim.github.io/cv/"], config=graph_config, ) diff --git a/examples/smart_scraper_graph/openai/smart_scraper_multi_openai.py b/examples/smart_scraper_graph/openai/smart_scraper_multi_openai.py index ec510fc2..a07b992e 100644 --- a/examples/smart_scraper_graph/openai/smart_scraper_multi_openai.py +++ b/examples/smart_scraper_graph/openai/smart_scraper_multi_openai.py @@ -31,7 +31,7 @@ # ******************************************************* multiple_search_graph = SmartScraperMultiGraph( - prompt="Who is Marco Perini?", + prompt="Who is ?", source=["https://perinim.github.io/", "https://perinim.github.io/cv/"], schema=None, config=graph_config, diff --git a/pyproject.toml b/pyproject.toml index e3dd29bd..4e642625 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,14 +1,13 @@ [project] name = "scrapegraphai" -version = "1.41.0b1" +version = "1.43.0" description = "A web scraping library based on LangChain which uses LLM and direct graph logic to create scraping pipelines." 
authors = [ { name = "Marco Vinciguerra", email = "mvincig11@gmail.com" }, - { name = "Marco Perini", email = "perinim.98@gmail.com" }, { name = "Lorenzo Padoan", email = "lorenzo.padoan977@gmail.com" } ] @@ -32,7 +31,8 @@ dependencies = [ "async-timeout>=4.0.3", "simpleeval>=1.0.0", "jsonschema>=4.23.0", - "duckduckgo-search>=7.2.1" + "duckduckgo-search>=7.2.1", + "pydantic>=2.10.2", ] readme = "README.md" diff --git a/scrapegraphai/graphs/abstract_graph.py b/scrapegraphai/graphs/abstract_graph.py index d9ed45a2..3908545d 100644 --- a/scrapegraphai/graphs/abstract_graph.py +++ b/scrapegraphai/graphs/abstract_graph.py @@ -53,9 +53,6 @@ def __init__( source: Optional[str] = None, schema: Optional[Type[BaseModel]] = None, ): - if config.get("llm").get("temperature") is None: - config["llm"]["temperature"] = 0 - self.prompt = prompt self.source = source self.config = config @@ -125,7 +122,7 @@ def _create_llm(self, llm_config: dict) -> object: KeyError: If the model is not supported. """ - llm_defaults = {"temperature": 0, "streaming": False} + llm_defaults = {"streaming": False} llm_params = {**llm_defaults, **llm_config} rate_limit_params = llm_params.pop("rate_limit", {}) diff --git a/scrapegraphai/graphs/smart_scraper_multi_graph.py b/scrapegraphai/graphs/smart_scraper_multi_graph.py index a0518fb7..226aae00 100644 --- a/scrapegraphai/graphs/smart_scraper_multi_graph.py +++ b/scrapegraphai/graphs/smart_scraper_multi_graph.py @@ -38,7 +38,7 @@ class SmartScraperMultiGraph(AbstractGraph): Example: >>> smart_scraper_multi_graph = SmartScraperMultiGraph( - ... prompt="Who is Marco Perini?", + ... prompt="Who is ?", ... source= [ ... "https://perinim.github.io/", ... 
"https://perinim.github.io/cv/" diff --git a/scrapegraphai/graphs/smart_scraper_multi_lite_graph.py b/scrapegraphai/graphs/smart_scraper_multi_lite_graph.py index d212b08a..849c85c8 100644 --- a/scrapegraphai/graphs/smart_scraper_multi_lite_graph.py +++ b/scrapegraphai/graphs/smart_scraper_multi_lite_graph.py @@ -38,7 +38,7 @@ class SmartScraperMultiLiteGraph(AbstractGraph): Example: >>> smart_scraper_multi_lite_graph = SmartScraperMultiLiteGraph( - ... prompt="Who is Marco Perini?", + ... prompt="Who is ?", ... source= [ ... "https://perinim.github.io/", ... "https://perinim.github.io/cv/" diff --git a/scrapegraphai/helpers/models_tokens.py b/scrapegraphai/helpers/models_tokens.py index 0871250c..eedf1cb9 100644 --- a/scrapegraphai/helpers/models_tokens.py +++ b/scrapegraphai/helpers/models_tokens.py @@ -23,8 +23,11 @@ "gpt-4o-2024-08-06": 128000, "gpt-4o-2024-05-13": 128000, "gpt-4o-mini": 128000, + "gpt-4.5": 128000, + "gpt-4.5-preview": 128000, "o1-preview": 128000, "o1-mini": 128000, + "o3-mini": 200000, }, "azure_openai": { "gpt-3.5-turbo-0125": 16385, diff --git a/scrapegraphai/nodes/generate_answer_node.py b/scrapegraphai/nodes/generate_answer_node.py index 1df3091e..5e267a4d 100644 --- a/scrapegraphai/nodes/generate_answer_node.py +++ b/scrapegraphai/nodes/generate_answer_node.py @@ -2,8 +2,8 @@ GenerateAnswerNode Module """ -import time import json +import time from typing import List, Optional from langchain.prompts import PromptTemplate @@ -105,10 +105,7 @@ def process(self, state: dict) -> dict: raise ValueError("No user prompt found in state") # Create the chain input with both content and question keys - chain_input = { - "content": content, - "question": user_prompt - } + chain_input = {"content": content, "question": user_prompt} try: response = self.invoke_with_timeout(self.chain, chain_input, self.timeout) @@ -167,25 +164,13 @@ def execute(self, state: dict) -> dict: and not self.script_creator or self.is_md_scraper ): - template_no_chunks_prompt 
= ( - TEMPLATE_NO_CHUNKS_MD + "\n\nIMPORTANT: " + format_instructions - ) - template_chunks_prompt = ( - TEMPLATE_CHUNKS_MD + "\n\nIMPORTANT: " + format_instructions - ) - template_merge_prompt = ( - TEMPLATE_MERGE_MD + "\n\nIMPORTANT: " + format_instructions - ) + template_no_chunks_prompt = TEMPLATE_NO_CHUNKS_MD + template_chunks_prompt = TEMPLATE_CHUNKS_MD + template_merge_prompt = TEMPLATE_MERGE_MD else: - template_no_chunks_prompt = ( - TEMPLATE_NO_CHUNKS + "\n\nIMPORTANT: " + format_instructions - ) - template_chunks_prompt = ( - TEMPLATE_CHUNKS + "\n\nIMPORTANT: " + format_instructions - ) - template_merge_prompt = ( - TEMPLATE_MERGE + "\n\nIMPORTANT: " + format_instructions - ) + template_no_chunks_prompt = TEMPLATE_NO_CHUNKS + template_chunks_prompt = TEMPLATE_CHUNKS + template_merge_prompt = TEMPLATE_MERGE if self.additional_info is not None: template_no_chunks_prompt = self.additional_info + template_no_chunks_prompt @@ -210,8 +195,14 @@ def execute(self, state: dict) -> dict: chain, {"question": user_prompt}, self.timeout ) except (Timeout, json.JSONDecodeError) as e: - error_msg = "Response timeout exceeded" if isinstance(e, Timeout) else "Invalid JSON response format" - state.update({self.output[0]: {"error": error_msg, "raw_response": str(e)}}) + error_msg = ( + "Response timeout exceeded" + if isinstance(e, Timeout) + else "Invalid JSON response format" + ) + state.update( + {self.output[0]: {"error": error_msg, "raw_response": str(e)}} + ) return state state.update({self.output[0]: answer}) @@ -241,7 +232,11 @@ def execute(self, state: dict) -> dict: async_runner, {"question": user_prompt}, self.timeout ) except (Timeout, json.JSONDecodeError) as e: - error_msg = "Response timeout exceeded during chunk processing" if isinstance(e, Timeout) else "Invalid JSON response format in chunk processing" + error_msg = ( + "Response timeout exceeded during chunk processing" + if isinstance(e, Timeout) + else "Invalid JSON response format in chunk processing" 
+ ) state.update({self.output[0]: {"error": error_msg, "raw_response": str(e)}}) return state @@ -261,7 +256,11 @@ def execute(self, state: dict) -> dict: self.timeout, ) except (Timeout, json.JSONDecodeError) as e: - error_msg = "Response timeout exceeded during merge" if isinstance(e, Timeout) else "Invalid JSON response format during merge" + error_msg = ( + "Response timeout exceeded during merge" + if isinstance(e, Timeout) + else "Invalid JSON response format during merge" + ) state.update({self.output[0]: {"error": error_msg, "raw_response": str(e)}}) return state diff --git a/tests/graphs/inputs/plain_html_example.txt b/tests/graphs/inputs/plain_html_example.txt index 2476565e..3320b6cb 100644 --- a/tests/graphs/inputs/plain_html_example.txt +++ b/tests/graphs/inputs/plain_html_example.txt @@ -99,7 +99,7 @@
diff --git a/tests/graphs/smart_scraper_multi_lite_graph_openai_test.py b/tests/graphs/smart_scraper_multi_lite_graph_openai_test.py index e92b3239..a7a1ed53 100644 --- a/tests/graphs/smart_scraper_multi_lite_graph_openai_test.py +++ b/tests/graphs/smart_scraper_multi_lite_graph_openai_test.py @@ -30,7 +30,7 @@ def graph_config(): def test_scraping_pipeline(graph_config): """Start of the scraping pipeline""" smart_scraper_multi_lite_graph = SmartScraperMultiLiteGraph( - prompt="Who is Marco Perini?", + prompt="Who is ?", source=["https://perinim.github.io/", "https://perinim.github.io/cv/"], config=graph_config, ) @@ -44,7 +44,7 @@ def test_scraping_pipeline(graph_config): def test_get_execution_info(graph_config): """Get the execution info""" smart_scraper_multi_lite_graph = SmartScraperMultiLiteGraph( - prompt="Who is Marco Perini?", + prompt="Who is ?", source=["https://perinim.github.io/", "https://perinim.github.io/cv/"], config=graph_config, ) diff --git a/tests/graphs/smart_scraper_openai_test.py b/tests/graphs/smart_scraper_openai_test.py index 90d6a7a7..cfe5cb2a 100644 --- a/tests/graphs/smart_scraper_openai_test.py +++ b/tests/graphs/smart_scraper_openai_test.py @@ -6,6 +6,7 @@ import pytest from dotenv import load_dotenv +from pydantic import BaseModel from scrapegraphai.graphs import SmartScraperGraph @@ -53,3 +54,27 @@ def test_get_execution_info(graph_config): graph_exec_info = smart_scraper_graph.get_execution_info() assert graph_exec_info is not None + + +def test_get_execution_info_with_schema(graph_config): + """Get the execution info with schema""" + + class ProjectSchema(BaseModel): + title: str + description: str + + class ProjectListSchema(BaseModel): + projects: list[ProjectSchema] + + smart_scraper_graph = SmartScraperGraph( + prompt="List me all the projects with their description.", + source="https://perinim.github.io/projects/", + config=graph_config, + schema=ProjectListSchema, + ) + + smart_scraper_graph.run() + + graph_exec_info = 
smart_scraper_graph.get_execution_info() + + assert graph_exec_info is not None diff --git a/tests/inputs/plain_html_example.txt b/tests/inputs/plain_html_example.txt index 2476565e..3320b6cb 100644 --- a/tests/inputs/plain_html_example.txt +++ b/tests/inputs/plain_html_example.txt @@ -99,7 +99,7 @@
diff --git a/tests/nodes/inputs/plain_html_example.txt b/tests/nodes/inputs/plain_html_example.txt index 2476565e..3320b6cb 100644 --- a/tests/nodes/inputs/plain_html_example.txt +++ b/tests/nodes/inputs/plain_html_example.txt @@ -99,7 +99,7 @@
diff --git a/uv.lock b/uv.lock index 4a3e6063..9ab0fd3e 100644 --- a/uv.lock +++ b/uv.lock @@ -1,4 +1,5 @@ version = 1 +revision = 1 requires-python = ">=3.10, <4.0" resolution-markers = [ "python_full_version < '3.11' and sys_platform == 'darwin'", @@ -3377,7 +3378,7 @@ wheels = [ [[package]] name = "scrapegraphai" -version = "1.40.1" +version = "1.43.0" source = { editable = "." } dependencies = [ { name = "async-timeout", version = "4.0.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.12'" }, @@ -3395,6 +3396,7 @@ dependencies = [ { name = "langchain-openai" }, { name = "minify-html" }, { name = "playwright" }, + { name = "pydantic" }, { name = "python-dotenv" }, { name = "semchunk" }, { name = "simpleeval" }, @@ -3456,6 +3458,7 @@ requires-dist = [ { name = "minify-html", specifier = ">=0.15.0" }, { name = "pillow", marker = "extra == 'ocr'", specifier = ">=10.4.0" }, { name = "playwright", specifier = ">=1.43.0" }, + { name = "pydantic", specifier = ">=2.10.2" }, { name = "python-dotenv", specifier = ">=1.0.1" }, { name = "semchunk", specifier = ">=2.2.0" }, { name = "simpleeval", specifier = ">=1.0.0" }, @@ -3465,6 +3468,7 @@ requires-dist = [ { name = "tqdm", specifier = ">=4.66.4" }, { name = "undetected-playwright", specifier = ">=0.3.0" }, ] +provides-extras = ["burr", "docs", "ocr"] [package.metadata.requires-dev] dev = [