diff --git a/CHANGELOG.md b/CHANGELOG.md index d807e233..58aba1fb 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,15 +1,113 @@ +## [1.27.0-beta.10](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.27.0-beta.9...v1.27.0-beta.10) (2024-10-25) + + +### Bug Fixes + +* fix export function ([c8a000f](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/c8a000f1d943734a921b34e91498b2f29c8c9422)) + +## [1.27.0-beta.9](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.27.0-beta.8...v1.27.0-beta.9) (2024-10-24) + + +### Features + +* add model integration gpt4 ([51c55eb](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/51c55eb3a2984ba60572edbcdea4c30620e18d76)) + +## [1.27.0-beta.8](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.27.0-beta.7...v1.27.0-beta.8) (2024-10-24) + + +### Bug Fixes + +* removed tokenizer ([a184716](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/a18471688f0b79f06fb7078b01b68eeddc88eae4)) + + +### CI + +* **release:** 1.26.7 [skip ci] ([ec9ef2b](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/ec9ef2bcda9aa81f66b943829fcdb22fe265976e)) + +## [1.27.0-beta.7](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.27.0-beta.6...v1.27.0-beta.7) (2024-10-24) + + +### Features + +* refactoring of get_probable_tags node ([f658092](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/f658092dffb20ea111cc00950f617057482788f4)) + +## [1.27.0-beta.6](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.27.0-beta.5...v1.27.0-beta.6) (2024-10-23) + + +### Features + +* add integration with scrape.do ([ae275ec](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/ae275ec5e86c0bb8fdbeadc2e5f69816d1dea635)) + +## [1.27.0-beta.5](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.27.0-beta.4...v1.27.0-beta.5) (2024-10-22) + + +### Features + +* refactoring of export functions ([0ea00c0](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/0ea00c078f2811f0d1b356bd84cafde80763c703)) + +## 
[1.27.0-beta.4](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.27.0-beta.3...v1.27.0-beta.4) (2024-10-21) + + +### Features + +* refactoring of ScrapeGraph to SmartScraperLiteGraph ([52b6bf5](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/52b6bf5fb8c570aa8ef026916230c5d52996f887)) + +## [1.27.0-beta.3](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.27.0-beta.2...v1.27.0-beta.3) (2024-10-20) + + +### Features + +* implement ScrapeGraph class for only web scraping automation ([612c644](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/612c644623fa6f4fe77a64a5f1a6a4d6cd5f4254)) +* Implement SmartScraperMultiParseMergeFirstGraph class that scrapes a list of URLs and merge the content first and finally generates answers to a given prompt. ([3e3e1b2](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/3e3e1b2f3ae8ed803d03b3b44b199e139baa68d4)) +======= ## [1.26.7](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.26.6...v1.26.7) (2024-10-19) ### Bug Fixes +* fix the example variable name ([69ff649](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/69ff6495564a5c670b89c0f802ebb1602f0e7cfa)) + + +### chore + +* fix example ([9cd9a87](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/9cd9a874f91bbbb2990444818e8ab2d0855cc361)) + + +### Test + +* Add scrape_graph test ([cdb3c11](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/cdb3c1100ee1117afedbc70437317acaf7c7c1d3)) +* Add smart_scraper_multi_parse_merge_first_graph test ([464b8b0](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/464b8b04ea0d51280849173d5eda92d4d4db8612)) + +## [1.27.0-beta.2](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.27.0-beta.1...v1.27.0-beta.2) (2024-10-18) + + +### Bug Fixes + +* refactoring of gpt2 tokenizer ([44c3f9c](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/44c3f9c98939c44caa86dc582242819a7c6a0f80)) + + +### CI + +* **release:** 1.26.6 [skip ci] 
([a4634c7](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/a4634c73312b5c08581a8d670d53b7eebe8dadc1)) + +## [1.27.0-beta.1](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.26.6-beta.1...v1.27.0-beta.1) (2024-10-16) + + +### Features + +* add conditional node structure to the smart_scraper_graph and implemented a structured way to check condition ([cacd9cd](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/cacd9cde004dace1a7dcc27981245632a78b95f3)) + + * removed tokenizer ([a184716](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/a18471688f0b79f06fb7078b01b68eeddc88eae4)) ## [1.26.6](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.26.5...v1.26.6) (2024-10-18) +## [1.26.6-beta.1](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.26.5...v1.26.6-beta.1) (2024-10-14) ### Bug Fixes +* remove variable "max_result" not being used in the code ([e76a68a](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/e76a68a782e5bce48d421cb620d0b7bffa412918)) + * refactoring of gpt2 tokenizer ([44c3f9c](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/44c3f9c98939c44caa86dc582242819a7c6a0f80)) ## [1.26.5](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.26.4...v1.26.5) (2024-10-13) diff --git a/examples/anthropic/smart_scraper_lite_anthropic.py b/examples/anthropic/smart_scraper_lite_anthropic.py new file mode 100644 index 00000000..698623c6 --- /dev/null +++ b/examples/anthropic/smart_scraper_lite_anthropic.py @@ -0,0 +1,32 @@ +""" +Basic example of scraping pipeline using SmartScraper +""" +import os +import json +from dotenv import load_dotenv +from scrapegraphai.graphs import SmartScraperLiteGraph +from scrapegraphai.utils import prettify_exec_info + +load_dotenv() + +graph_config = { + "llm": { + "api_key": os.getenv("ANTHROPIC_API_KEY"), + "model": "anthropic/claude-3-haiku-20240307", + }, + "verbose": True, + "headless": False, +} + +smart_scraper_lite_graph = SmartScraperLiteGraph( + prompt="Who is Marco 
Perini?", + source="https://perinim.github.io/", + config=graph_config +) + +result = smart_scraper_lite_graph.run() +print(json.dumps(result, indent=4)) + +graph_exec_info = smart_scraper_lite_graph.get_execution_info() +print(prettify_exec_info(graph_exec_info)) + diff --git a/examples/anthropic/smart_scraper_multi_lite_anthropic.py b/examples/anthropic/smart_scraper_multi_lite_anthropic.py new file mode 100644 index 00000000..7cf3c09d --- /dev/null +++ b/examples/anthropic/smart_scraper_multi_lite_anthropic.py @@ -0,0 +1,35 @@ +""" +Basic example of scraping pipeline using SmartScraper +""" +import os +import json +from dotenv import load_dotenv +from scrapegraphai.graphs import SmartScraperMultiLiteGraph +from scrapegraphai.utils import prettify_exec_info + +load_dotenv() + +graph_config = { + "llm": { + "api_key": os.getenv("ANTHROPIC_API_KEY"), + "model": "anthropic/claude-3-haiku-20240307", + }, + "verbose": True, + "headless": False, +} + +smart_scraper_multi_lite_graph = SmartScraperMultiLiteGraph( + prompt="Who is Marco Perini?", + source= [ + "https://perinim.github.io/", + "https://perinim.github.io/cv/" + ], + config=graph_config +) + +result = smart_scraper_multi_lite_graph.run() +print(json.dumps(result, indent=4)) + +graph_exec_info = smart_scraper_multi_lite_graph.get_execution_info() +print(prettify_exec_info(graph_exec_info)) + diff --git a/examples/azure/smart_scraper_lite_azure.py b/examples/azure/smart_scraper_lite_azure.py new file mode 100644 index 00000000..335c4832 --- /dev/null +++ b/examples/azure/smart_scraper_lite_azure.py @@ -0,0 +1,31 @@ +""" +Basic example of scraping pipeline using SmartScraper +""" +import os +import json +from dotenv import load_dotenv +from scrapegraphai.graphs import SmartScraperLiteGraph +from scrapegraphai.utils import prettify_exec_info + +load_dotenv() + +graph_config = { + "llm": { + "api_key": os.environ["AZURE_OPENAI_KEY"], + "model": "azure_openai/gpt-4o" + }, + "verbose": True, + "headless": False +} + 
+smart_scraper_lite_graph = SmartScraperLiteGraph( + prompt="Who is Marco Perini?", + source="https://perinim.github.io/", + config=graph_config +) + +result = smart_scraper_lite_graph.run() +print(json.dumps(result, indent=4)) + +graph_exec_info = smart_scraper_lite_graph.get_execution_info() +print(prettify_exec_info(graph_exec_info)) diff --git a/examples/azure/smart_scraper_multi_lite_azure.py b/examples/azure/smart_scraper_multi_lite_azure.py new file mode 100644 index 00000000..b9046d9f --- /dev/null +++ b/examples/azure/smart_scraper_multi_lite_azure.py @@ -0,0 +1,35 @@ +""" +Basic example of scraping pipeline using SmartScraper +""" +import os +import json +from dotenv import load_dotenv +from scrapegraphai.graphs import SmartScraperMultiLiteGraph +from scrapegraphai.utils import prettify_exec_info + +load_dotenv() + +graph_config = { + "llm": { + "api_key": os.environ["AZURE_OPENAI_KEY"], + "model": "azure_openai/gpt-4o" + }, + "verbose": True, + "headless": False +} + +smart_scraper_multi_lite_graph = SmartScraperMultiLiteGraph( + prompt="Who is Marco Perini?", + source= [ + "https://perinim.github.io/", + "https://perinim.github.io/cv/" + ], + config=graph_config +) + +result = smart_scraper_multi_lite_graph.run() +print(json.dumps(result, indent=4)) + +graph_exec_info = smart_scraper_multi_lite_graph.get_execution_info() +print(prettify_exec_info(graph_exec_info)) + diff --git a/examples/bedrock/smart_scraper_lite_bedrock.py b/examples/bedrock/smart_scraper_lite_bedrock.py new file mode 100644 index 00000000..2bf0471c --- /dev/null +++ b/examples/bedrock/smart_scraper_lite_bedrock.py @@ -0,0 +1,26 @@ +""" +Basic example of scraping pipeline using SmartScraper +""" +import json +from scrapegraphai.graphs import SmartScraperLiteGraph +from scrapegraphai.utils import prettify_exec_info + +graph_config = { + "llm": { + "client": "client_name", + "model": "bedrock/anthropic.claude-3-sonnet-20240229-v1:0", + "temperature": 0.0 + } +} + 
+smart_scraper_lite_graph = SmartScraperLiteGraph( + prompt="Who is Marco Perini?", + source="https://perinim.github.io/", + config=graph_config +) + +result = smart_scraper_lite_graph.run() +print(json.dumps(result, indent=4)) + +graph_exec_info = smart_scraper_lite_graph.get_execution_info() +print(prettify_exec_info(graph_exec_info)) diff --git a/examples/bedrock/smart_scraper_multi_lite_bedrock.py b/examples/bedrock/smart_scraper_multi_lite_bedrock.py new file mode 100644 index 00000000..5cb26067 --- /dev/null +++ b/examples/bedrock/smart_scraper_multi_lite_bedrock.py @@ -0,0 +1,29 @@ +""" +Basic example of scraping pipeline using SmartScraper +""" +import json +from scrapegraphai.graphs import SmartScraperMultiLiteGraph +from scrapegraphai.utils import prettify_exec_info + +graph_config = { + "llm": { + "client": "client_name", + "model": "bedrock/anthropic.claude-3-sonnet-20240229-v1:0", + "temperature": 0.0 + } +} + +smart_scraper_multi_lite_graph = SmartScraperMultiLiteGraph( + prompt="Who is Marco Perini?", + source= [ + "https://perinim.github.io/", + "https://perinim.github.io/cv/" + ], + config=graph_config +) + +result = smart_scraper_multi_lite_graph.run() +print(json.dumps(result, indent=4)) + +graph_exec_info = smart_scraper_multi_lite_graph.get_execution_info() +print(prettify_exec_info(graph_exec_info)) diff --git a/examples/deepseek/smart_scraper_lite_deepseek.py b/examples/deepseek/smart_scraper_lite_deepseek.py new file mode 100644 index 00000000..a70d76b0 --- /dev/null +++ b/examples/deepseek/smart_scraper_lite_deepseek.py @@ -0,0 +1,31 @@ +""" +Basic example of scraping pipeline using SmartScraper +""" +import os +import json +from dotenv import load_dotenv +from scrapegraphai.graphs import SmartScraperLiteGraph +from scrapegraphai.utils import prettify_exec_info + +load_dotenv() + +graph_config = { + "llm": { + "api_key": os.getenv("DEEPSEEK_API_KEY"), + "model": "deepseek/deepseek-coder-33b-instruct", + }, + "verbose": True, + "headless": 
False, +} + +smart_scraper_lite_graph = SmartScraperLiteGraph( + prompt="Who is Marco Perini?", + source="https://perinim.github.io/", + config=graph_config +) + +result = smart_scraper_lite_graph.run() +print(json.dumps(result, indent=4)) + +graph_exec_info = smart_scraper_lite_graph.get_execution_info() +print(prettify_exec_info(graph_exec_info)) diff --git a/examples/deepseek/smart_scraper_multi_lite_deepseek.py b/examples/deepseek/smart_scraper_multi_lite_deepseek.py new file mode 100644 index 00000000..eb5eea01 --- /dev/null +++ b/examples/deepseek/smart_scraper_multi_lite_deepseek.py @@ -0,0 +1,35 @@ +""" +Basic example of scraping pipeline using SmartScraper +""" +import os +import json +from dotenv import load_dotenv +from scrapegraphai.graphs import SmartScraperMultiLiteGraph +from scrapegraphai.utils import prettify_exec_info + +load_dotenv() + +graph_config = { + "llm": { + "api_key": os.getenv("DEEPSEEK_API_KEY"), + "model": "deepseek/deepseek-coder-33b-instruct", + }, + "verbose": True, + "headless": False, +} + +smart_scraper_multi_lite_graph = SmartScraperMultiLiteGraph( + prompt="Who is Marco Perini?", + source= [ + "https://perinim.github.io/", + "https://perinim.github.io/cv/" + ], + config=graph_config +) + +result = smart_scraper_multi_lite_graph.run() +print(json.dumps(result, indent=4)) + +graph_exec_info = smart_scraper_multi_lite_graph.get_execution_info() +print(prettify_exec_info(graph_exec_info)) + diff --git a/examples/ernie/smart_scraper_lite_ernie.py b/examples/ernie/smart_scraper_lite_ernie.py new file mode 100644 index 00000000..5d3ba9d9 --- /dev/null +++ b/examples/ernie/smart_scraper_lite_ernie.py @@ -0,0 +1,31 @@ +""" +Basic example of scraping pipeline using SmartScraper +""" +import os +import json +from dotenv import load_dotenv +from scrapegraphai.graphs import SmartScraperLiteGraph +from scrapegraphai.utils import prettify_exec_info + +load_dotenv() + +graph_config = { + "llm": { + "api_key": os.getenv("ERNIE_API_KEY"), + 
"model": "ernie/ernie-bot-4", + }, + "verbose": True, + "headless": False, +} + +smart_scraper_lite_graph = SmartScraperLiteGraph( + prompt="Who is Marco Perini?", + source="https://perinim.github.io/", + config=graph_config +) + +result = smart_scraper_lite_graph.run() +print(json.dumps(result, indent=4)) + +graph_exec_info = smart_scraper_lite_graph.get_execution_info() +print(prettify_exec_info(graph_exec_info)) diff --git a/examples/ernie/smart_scraper_multi_lite_ernie.py b/examples/ernie/smart_scraper_multi_lite_ernie.py new file mode 100644 index 00000000..777a760e --- /dev/null +++ b/examples/ernie/smart_scraper_multi_lite_ernie.py @@ -0,0 +1,35 @@ +""" +Basic example of scraping pipeline using SmartScraper +""" +import os +import json +from dotenv import load_dotenv +from scrapegraphai.graphs import SmartScraperMultiLiteGraph +from scrapegraphai.utils import prettify_exec_info + +load_dotenv() + +graph_config = { + "llm": { + "api_key": os.getenv("ERNIE_API_KEY"), + "model": "ernie/ernie-bot-4", + }, + "verbose": True, + "headless": False, +} + +smart_scraper_multi_lite_graph = SmartScraperMultiLiteGraph( + prompt="Who is Marco Perini?", + source= [ + "https://perinim.github.io/", + "https://perinim.github.io/cv/" + ], + config=graph_config +) + +result = smart_scraper_multi_lite_graph.run() +print(json.dumps(result, indent=4)) + +graph_exec_info = smart_scraper_multi_lite_graph.get_execution_info() +print(prettify_exec_info(graph_exec_info)) + diff --git a/examples/extras/cond_smartscraper_usage.py b/examples/extras/cond_smartscraper_usage.py new file mode 100644 index 00000000..54c40712 --- /dev/null +++ b/examples/extras/cond_smartscraper_usage.py @@ -0,0 +1,38 @@ +""" +Basic example of scraping pipeline using SmartScraperMultiConcatGraph with Groq +""" + +import os +import json +from dotenv import load_dotenv +from scrapegraphai.graphs import SmartScraperGraph + +load_dotenv() + +# ************************************************ +# Define the 
configuration for the graph +# ************************************************ + +graph_config = { + "llm": { + "api_key": os.getenv("GROQ_APIKEY"), + "model": "groq/gemma-7b-it", + }, + "verbose": True, + "headless": True, + "reattempt": True # Setting this to True will allow the graph to reattempt the scraping process +} + +# ******************************************************* +# Create the SmartScraperGraph instance and run it +# ******************************************************* + +multiple_search_graph = SmartScraperGraph( + prompt="Who is Marco Perini?", + source="https://perinim.github.io/", + schema=None, + config=graph_config +) + +result = multiple_search_graph.run() +print(json.dumps(result, indent=4)) diff --git a/examples/fireworks/smart_scraper_lite_fireworks.py b/examples/fireworks/smart_scraper_lite_fireworks.py new file mode 100644 index 00000000..6c9a7745 --- /dev/null +++ b/examples/fireworks/smart_scraper_lite_fireworks.py @@ -0,0 +1,31 @@ +""" +Basic example of scraping pipeline using SmartScraper +""" +import os +import json +from dotenv import load_dotenv +from scrapegraphai.graphs import SmartScraperLiteGraph +from scrapegraphai.utils import prettify_exec_info + +load_dotenv() + +graph_config = { + "llm": { + "api_key": os.getenv("FIREWORKS_API_KEY"), + "model": "fireworks/llama-v2-70b-chat", + }, + "verbose": True, + "headless": False, +} + +smart_scraper_lite_graph = SmartScraperLiteGraph( + prompt="Who is Marco Perini?", + source="https://perinim.github.io/", + config=graph_config +) + +result = smart_scraper_lite_graph.run() +print(json.dumps(result, indent=4)) + +graph_exec_info = smart_scraper_lite_graph.get_execution_info() +print(prettify_exec_info(graph_exec_info)) diff --git a/examples/fireworks/smart_scraper_multi_lite_fireworks.py b/examples/fireworks/smart_scraper_multi_lite_fireworks.py new file mode 100644 index 00000000..4ffaf6bb --- /dev/null +++ b/examples/fireworks/smart_scraper_multi_lite_fireworks.py @@
-0,0 +1,35 @@ +""" +Basic example of scraping pipeline using SmartScraper +""" +import os +import json +from dotenv import load_dotenv +from scrapegraphai.graphs import SmartScraperMultiLiteGraph +from scrapegraphai.utils import prettify_exec_info + +load_dotenv() + +graph_config = { + "llm": { + "api_key": os.getenv("FIREWORKS_API_KEY"), + "model": "fireworks/llama-v2-70b-chat", + }, + "verbose": True, + "headless": False, +} + +smart_scraper_multi_lite_graph = SmartScraperMultiLiteGraph( + prompt="Who is Marco Perini?", + source= [ + "https://perinim.github.io/", + "https://perinim.github.io/cv/" + ], + config=graph_config +) + +result = smart_scraper_multi_lite_graph.run() +print(json.dumps(result, indent=4)) + +graph_exec_info = smart_scraper_multi_lite_graph.get_execution_info() +print(prettify_exec_info(graph_exec_info)) + diff --git a/examples/google_genai/smart_scraper_lite_google_genai.py b/examples/google_genai/smart_scraper_lite_google_genai.py new file mode 100644 index 00000000..9b776735 --- /dev/null +++ b/examples/google_genai/smart_scraper_lite_google_genai.py @@ -0,0 +1,31 @@ +""" +Basic example of scraping pipeline using SmartScraper +""" +import os +import json +from dotenv import load_dotenv +from scrapegraphai.graphs import SmartScraperLiteGraph +from scrapegraphai.utils import prettify_exec_info + +load_dotenv() + +graph_config = { + "llm": { + "api_key": os.getenv("GOOGLE_API_KEY"), + "model": "gemini-pro", + }, + "verbose": True, + "headless": False, +} + +smart_scraper_lite_graph = SmartScraperLiteGraph( + prompt="Who is Marco Perini?", + source="https://perinim.github.io/", + config=graph_config +) + +result = smart_scraper_lite_graph.run() +print(json.dumps(result, indent=4)) + +graph_exec_info = smart_scraper_lite_graph.get_execution_info() +print(prettify_exec_info(graph_exec_info)) diff --git a/examples/google_genai/smart_scraper_multi_lite_gemini.py b/examples/google_genai/smart_scraper_multi_lite_gemini.py new file mode 100644 index 
00000000..e69de29b diff --git a/examples/google_genai/smart_scraper_multi_lite_google_genai.py b/examples/google_genai/smart_scraper_multi_lite_google_genai.py new file mode 100644 index 00000000..e14e2ceb --- /dev/null +++ b/examples/google_genai/smart_scraper_multi_lite_google_genai.py @@ -0,0 +1,34 @@ +""" +Basic example of scraping pipeline using SmartScraper +""" +import os +import json +from dotenv import load_dotenv +from scrapegraphai.graphs import SmartScraperMultiLiteGraph +from scrapegraphai.utils import prettify_exec_info + +load_dotenv() + +graph_config = { + "llm": { + "api_key": os.getenv("GOOGLE_API_KEY"), + "model": "gemini-pro", + }, + "verbose": True, + "headless": False, +} + +smart_scraper_multi_lite_graph = SmartScraperMultiLiteGraph( + prompt="Who is Marco Perini?", + source= [ + "https://perinim.github.io/", + "https://perinim.github.io/cv/" + ], + config=graph_config +) + +result = smart_scraper_multi_lite_graph.run() +print(json.dumps(result, indent=4)) + +graph_exec_info = smart_scraper_multi_lite_graph.get_execution_info() +print(prettify_exec_info(graph_exec_info)) diff --git a/examples/google_vertexai/smart_scraper_lite_google_vertexai.py b/examples/google_vertexai/smart_scraper_lite_google_vertexai.py new file mode 100644 index 00000000..eca61bbb --- /dev/null +++ b/examples/google_vertexai/smart_scraper_lite_google_vertexai.py @@ -0,0 +1,33 @@ +""" +Basic example of scraping pipeline using SmartScraper +""" +import os +import json +from dotenv import load_dotenv +from scrapegraphai.graphs import SmartScraperLiteGraph +from scrapegraphai.utils import prettify_exec_info + +load_dotenv() + +graph_config = { + "llm": { + "project": os.getenv("GOOGLE_CLOUD_PROJECT"), + "location": "us-central1", + "model": "text-bison@001", + }, + "verbose": True, + "headless": False, +} + +smart_scraper_lite_graph = SmartScraperLiteGraph( + prompt="Who is Marco Perini?", + source="https://perinim.github.io/", + config=graph_config +) + +result = 
smart_scraper_lite_graph.run() +print(json.dumps(result, indent=4)) + +graph_exec_info = smart_scraper_lite_graph.get_execution_info() +print(prettify_exec_info(graph_exec_info)) + diff --git a/examples/google_vertexai/smart_scraper_multi_lite_google_vertexai.py b/examples/google_vertexai/smart_scraper_multi_lite_google_vertexai.py new file mode 100644 index 00000000..5c293416 --- /dev/null +++ b/examples/google_vertexai/smart_scraper_multi_lite_google_vertexai.py @@ -0,0 +1,35 @@ +""" +Basic example of scraping pipeline using SmartScraper +""" +import os +import json +from dotenv import load_dotenv +from scrapegraphai.graphs import SmartScraperMultiLiteGraph +from scrapegraphai.utils import prettify_exec_info + +load_dotenv() + +graph_config = { + "llm": { + "project": os.getenv("GOOGLE_CLOUD_PROJECT"), + "location": "us-central1", + "model": "text-bison@001", + }, + "verbose": True, + "headless": False, +} + +smart_scraper_multi_lite_graph = SmartScraperMultiLiteGraph( + prompt="Who is Marco Perini?", + source= [ + "https://perinim.github.io/", + "https://perinim.github.io/cv/" + ], + config=graph_config +) + +result = smart_scraper_multi_lite_graph.run() +print(json.dumps(result, indent=4)) + +graph_exec_info = smart_scraper_multi_lite_graph.get_execution_info() +print(prettify_exec_info(graph_exec_info)) diff --git a/examples/google_vertexai/smart_scraper_multi_lite_vertex.py b/examples/google_vertexai/smart_scraper_multi_lite_vertex.py new file mode 100644 index 00000000..60ff3638 --- /dev/null +++ b/examples/google_vertexai/smart_scraper_multi_lite_vertex.py @@ -0,0 +1,47 @@ +""" +Basic example of scraping pipeline using SmartScraper +""" +import os +import json +from dotenv import load_dotenv +from scrapegraphai.graphs import SmartScraperMultiLiteGraph +from scrapegraphai.utils import prettify_exec_info + +load_dotenv() + +# ************************************************ +# Define the configuration for the graph +# 
************************************************ + +graph_config = { + "llm": { + "project": os.getenv("GOOGLE_CLOUD_PROJECT"), + "location": "us-central1", + "model": "text-bison@001", + }, + "verbose": True, + "headless": False, +} + +# ************************************************ +# Create the SmartScraperMultiLiteGraph instance and run it +# ************************************************ + +smart_scraper_multi_lite_graph = SmartScraperMultiLiteGraph( + prompt="Who is Marco Perini?", + source= [ + "https://perinim.github.io/", + "https://perinim.github.io/cv/" + ], + config=graph_config +) + +result = smart_scraper_multi_lite_graph.run() +print(json.dumps(result, indent=4)) + +# ************************************************ +# Get graph execution info +# ************************************************ + +graph_exec_info = smart_scraper_multi_lite_graph.get_execution_info() +print(prettify_exec_info(graph_exec_info)) diff --git a/examples/groq/smart_scraper_lite_groq.py b/examples/groq/smart_scraper_lite_groq.py new file mode 100644 index 00000000..5fe6022f --- /dev/null +++ b/examples/groq/smart_scraper_lite_groq.py @@ -0,0 +1,31 @@ +""" +Basic example of scraping pipeline using SmartScraper +""" +import os +import json +from dotenv import load_dotenv +from scrapegraphai.graphs import SmartScraperLiteGraph +from scrapegraphai.utils import prettify_exec_info + +load_dotenv() + +graph_config = { + "llm": { + "api_key": os.getenv("GROQ_API_KEY"), + "model": "mixtral-8x7b-32768", + }, + "verbose": True, + "headless": False, +} + +smart_scraper_lite_graph = SmartScraperLiteGraph( + prompt="Who is Marco Perini?", + source="https://perinim.github.io/", + config=graph_config +) + +result = smart_scraper_lite_graph.run() +print(json.dumps(result, indent=4)) + +graph_exec_info = smart_scraper_lite_graph.get_execution_info() +print(prettify_exec_info(graph_exec_info)) diff --git a/examples/groq/smart_scraper_multi_lite_groq.py
b/examples/groq/smart_scraper_multi_lite_groq.py new file mode 100644 index 00000000..9c8e4d1d --- /dev/null +++ b/examples/groq/smart_scraper_multi_lite_groq.py @@ -0,0 +1,35 @@ +""" +Basic example of scraping pipeline using SmartScraper +""" +import os +import json +from dotenv import load_dotenv +from scrapegraphai.graphs import SmartScraperMultiLiteGraph +from scrapegraphai.utils import prettify_exec_info + +load_dotenv() + +graph_config = { + "llm": { + "api_key": os.getenv("GROQ_API_KEY"), + "model": "mixtral-8x7b-32768", + }, + "verbose": True, + "headless": False, +} + +smart_scraper_multi_lite_graph = SmartScraperMultiLiteGraph( + prompt="Who is Marco Perini?", + source= [ + "https://perinim.github.io/", + "https://perinim.github.io/cv/" + ], + config=graph_config +) + +result = smart_scraper_multi_lite_graph.run() +print(json.dumps(result, indent=4)) + +graph_exec_info = smart_scraper_multi_lite_graph.get_execution_info() +print(prettify_exec_info(graph_exec_info)) + diff --git a/examples/huggingfacehub/smart_scraper_lite_huggingfacehub.py b/examples/huggingfacehub/smart_scraper_lite_huggingfacehub.py new file mode 100644 index 00000000..4faa8a47 --- /dev/null +++ b/examples/huggingfacehub/smart_scraper_lite_huggingfacehub.py @@ -0,0 +1,31 @@ +""" +Basic example of scraping pipeline using SmartScraper +""" +import os +import json +from dotenv import load_dotenv +from scrapegraphai.graphs import SmartScraperLiteGraph +from scrapegraphai.utils import prettify_exec_info + +load_dotenv() + +graph_config = { + "llm": { + "api_key": os.getenv("HUGGINGFACEHUB_API_TOKEN"), + "model": "huggingfacehub/meta-llama/Llama-2-70b-chat-hf", + }, + "verbose": True, + "headless": False, +} + +smart_scraper_lite_graph = SmartScraperLiteGraph( + prompt="Who is Marco Perini?", + source="https://perinim.github.io/", + config=graph_config +) + +result = smart_scraper_lite_graph.run() +print(json.dumps(result, indent=4)) + +graph_exec_info = 
smart_scraper_lite_graph.get_execution_info() +print(prettify_exec_info(graph_exec_info)) diff --git a/examples/huggingfacehub/smart_scraper_multi_lite_huggingfacehub.py b/examples/huggingfacehub/smart_scraper_multi_lite_huggingfacehub.py new file mode 100644 index 00000000..2d7a3a45 --- /dev/null +++ b/examples/huggingfacehub/smart_scraper_multi_lite_huggingfacehub.py @@ -0,0 +1,34 @@ +""" +Basic example of scraping pipeline using SmartScraper +""" +import os +import json +from dotenv import load_dotenv +from scrapegraphai.graphs import SmartScraperMultiLiteGraph +from scrapegraphai.utils import prettify_exec_info + +load_dotenv() + +graph_config = { + "llm": { + "api_key": os.getenv("HUGGINGFACEHUB_API_TOKEN"), + "model": "huggingfacehub/meta-llama/Llama-2-70b-chat-hf", + }, + "verbose": True, + "headless": False, +} + +smart_scraper_multi_lite_graph = SmartScraperMultiLiteGraph( + prompt="Who is Marco Perini?", + source= [ + "https://perinim.github.io/", + "https://perinim.github.io/cv/" + ], + config=graph_config +) + +result = smart_scraper_multi_lite_graph.run() +print(json.dumps(result, indent=4)) + +graph_exec_info = smart_scraper_multi_lite_graph.get_execution_info() +print(prettify_exec_info(graph_exec_info)) diff --git a/examples/huggingfacehub/smart_scraper_multi_lite_uhggingfacehub.py b/examples/huggingfacehub/smart_scraper_multi_lite_uhggingfacehub.py new file mode 100644 index 00000000..e69de29b diff --git a/examples/local_models/smart_scraper_lite_ollama.py b/examples/local_models/smart_scraper_lite_ollama.py new file mode 100644 index 00000000..2cf6c402 --- /dev/null +++ b/examples/local_models/smart_scraper_lite_ollama.py @@ -0,0 +1,30 @@ +""" +Basic example of scraping pipeline using SmartScraper + +""" +import json +from scrapegraphai.graphs import SmartScraperLiteGraph +from scrapegraphai.utils import prettify_exec_info + +graph_config = { + "llm": { + "model": "ollama/llama3.1", + "temperature": 0, + "format": "json", + "base_url": 
"http://localhost:11434", + }, + "verbose": True, + "headless": False +} + +smart_scraper_lite_graph = SmartScraperLiteGraph( + prompt="Who is Marco Perini?", + source="https://perinim.github.io/", + config=graph_config +) + +result = smart_scraper_lite_graph.run() +print(json.dumps(result, indent=4)) + +graph_exec_info = smart_scraper_lite_graph.get_execution_info() +print(prettify_exec_info(graph_exec_info)) diff --git a/examples/local_models/smart_scraper_multi_lite_ollama.py b/examples/local_models/smart_scraper_multi_lite_ollama.py new file mode 100644 index 00000000..f09c4cb4 --- /dev/null +++ b/examples/local_models/smart_scraper_multi_lite_ollama.py @@ -0,0 +1,45 @@ +""" +Basic example of scraping pipeline using SmartScraper +""" +import json +from scrapegraphai.graphs import SmartScraperMultiLiteGraph +from scrapegraphai.utils import prettify_exec_info + +# ************************************************ +# Define the configuration for the graph +# ************************************************ + +graph_config = { + "llm": { + "model": "ollama/llama3.1", + "temperature": 0, + "format": "json", # Ollama needs the format to be specified explicitly + "base_url": "http://localhost:11434", # set ollama URL arbitrarily + }, + "verbose": True, + "headless": False +} + +# ************************************************ +# Create the SmartScraperGraph instance and run it +# ************************************************ + +smart_scraper_multi_lite_graph = SmartScraperMultiLiteGraph( + prompt="Who is Marco Perini?", + source= [ + "https://perinim.github.io/", + "https://perinim.github.io/cv/" + ], + config=graph_config +) + +result = smart_scraper_multi_lite_graph.run() +print(json.dumps(result, indent=4)) + +# ************************************************ +# Get graph execution info +# ************************************************ + +graph_exec_info = smart_scraper_multi_lite_graph.get_execution_info() +print(prettify_exec_info(graph_exec_info)) + diff 
--git a/examples/mistral/smart_scraper_lite_mistral.py b/examples/mistral/smart_scraper_lite_mistral.py new file mode 100644 index 00000000..390371f9 --- /dev/null +++ b/examples/mistral/smart_scraper_lite_mistral.py @@ -0,0 +1,31 @@ +""" +Basic example of scraping pipeline using SmartScraper +""" +import os +import json +from dotenv import load_dotenv +from scrapegraphai.graphs import SmartScraperLiteGraph +from scrapegraphai.utils import prettify_exec_info + +load_dotenv() + +graph_config = { + "llm": { + "api_key": os.getenv("MISTRAL_API_KEY"), + "model": "mistral/mistral-medium", + }, + "verbose": True, + "headless": False, +} + +smart_scraper_lite_graph = SmartScraperLiteGraph( + prompt="Who is Marco Perini?", + source="https://perinim.github.io/", + config=graph_config +) + +result = smart_scraper_lite_graph.run() +print(json.dumps(result, indent=4)) + +graph_exec_info = smart_scraper_lite_graph.get_execution_info() +print(prettify_exec_info(graph_exec_info)) diff --git a/examples/mistral/smart_scraper_multi_lite_mistral.py b/examples/mistral/smart_scraper_multi_lite_mistral.py new file mode 100644 index 00000000..ce2d19bf --- /dev/null +++ b/examples/mistral/smart_scraper_multi_lite_mistral.py @@ -0,0 +1,35 @@ +""" +Basic example of scraping pipeline using SmartScraper +""" +import os +import json +from dotenv import load_dotenv +from scrapegraphai.graphs import SmartScraperMultiLiteGraph +from scrapegraphai.utils import prettify_exec_info + +load_dotenv() + +graph_config = { + "llm": { + "api_key": os.getenv("MISTRAL_API_KEY"), + "model": "mistral/mistral-medium", + }, + "verbose": True, + "headless": False, +} + +smart_scraper_multi_lite_graph = SmartScraperMultiLiteGraph( + prompt="Who is Marco Perini?", + source= [ + "https://perinim.github.io/", + "https://perinim.github.io/cv/" + ], + config=graph_config +) + +result = smart_scraper_multi_lite_graph.run() +print(json.dumps(result, indent=4)) + +graph_exec_info = 
smart_scraper_multi_lite_graph.get_execution_info() +print(prettify_exec_info(graph_exec_info)) + diff --git a/examples/moonshot/smart_scraper_lite_moonshot.py b/examples/moonshot/smart_scraper_lite_moonshot.py new file mode 100644 index 00000000..509027fb --- /dev/null +++ b/examples/moonshot/smart_scraper_lite_moonshot.py @@ -0,0 +1,31 @@ +""" +Basic example of scraping pipeline using SmartScraper +""" +import os +import json +from dotenv import load_dotenv +from scrapegraphai.graphs import SmartScraperLiteGraph +from scrapegraphai.utils import prettify_exec_info + +load_dotenv() + +graph_config = { + "llm": { + "api_key": os.getenv("ANTHROPIC_API_KEY"), + "model": "anthropic/claude-3-haiku-20240307", + }, + "verbose": True, + "headless": False, +} + +smart_scraper_lite_graph = SmartScraperLiteGraph( + prompt="Who is Marco Perini?", + source="https://perinim.github.io/", + config=graph_config +) + +result = smart_scraper_lite_graph.run() +print(json.dumps(result, indent=4)) + +graph_exec_info = smart_scraper_lite_graph.get_execution_info() +print(prettify_exec_info(graph_exec_info)) diff --git a/examples/moonshot/smart_scraper_multi_lite_moonshot.py b/examples/moonshot/smart_scraper_multi_lite_moonshot.py new file mode 100644 index 00000000..b3e2b7be --- /dev/null +++ b/examples/moonshot/smart_scraper_multi_lite_moonshot.py @@ -0,0 +1,34 @@ +""" +Basic example of scraping pipeline using SmartScraper +""" +import os +import json +from dotenv import load_dotenv +from scrapegraphai.graphs import SmartScraperMultiLiteGraph +from scrapegraphai.utils import prettify_exec_info + +load_dotenv() + +graph_config = { + "llm": { + "api_key": os.getenv("MOONSHOT_API_KEY"), + "model": "moonshot/moonshot-v1-8b", + }, + "verbose": True, + "headless": False, +} + +smart_scraper_multi_lite_graph = SmartScraperMultiLiteGraph( + prompt="Who is Marco Perini?", + source= [ + "https://perinim.github.io/", + "https://perinim.github.io/cv/" + ], + config=graph_config +) + +result = 
smart_scraper_multi_lite_graph.run() +print(json.dumps(result, indent=4)) + +graph_exec_info = smart_scraper_multi_lite_graph.get_execution_info() +print(prettify_exec_info(graph_exec_info)) diff --git a/examples/nemotron/smart_scraper_lite_nemotron.py b/examples/nemotron/smart_scraper_lite_nemotron.py new file mode 100644 index 00000000..6c1d8528 --- /dev/null +++ b/examples/nemotron/smart_scraper_lite_nemotron.py @@ -0,0 +1,32 @@ +""" +Basic example of scraping pipeline using SmartScraper +""" +import os +import json +from dotenv import load_dotenv +from scrapegraphai.graphs import SmartScraperLiteGraph +from scrapegraphai.utils import prettify_exec_info + +load_dotenv() + +graph_config = { + "llm": { + "api_key": os.getenv("NEMOTRON_API_KEY"), + "model": "nemotron/nemotron-3.5-turbo", + "base_url": "http://127.0.0.1:3000/v1", + }, + "verbose": True, + "headless": False, +} + +smart_scraper_lite_graph = SmartScraperLiteGraph( + prompt="Who is Marco Perini?", + source="https://perinim.github.io/", + config=graph_config +) + +result = smart_scraper_lite_graph.run() +print(json.dumps(result, indent=4)) + +graph_exec_info = smart_scraper_lite_graph.get_execution_info() +print(prettify_exec_info(graph_exec_info)) diff --git a/examples/nemotron/smart_scraper_multi_lite_nemotron.py b/examples/nemotron/smart_scraper_multi_lite_nemotron.py new file mode 100644 index 00000000..7639d820 --- /dev/null +++ b/examples/nemotron/smart_scraper_multi_lite_nemotron.py @@ -0,0 +1,46 @@ +""" +Basic example of scraping pipeline using SmartScraper +""" +import os +import json +from dotenv import load_dotenv +from scrapegraphai.graphs import SmartScraperMultiLiteGraph +from scrapegraphai.utils import prettify_exec_info + +load_dotenv() + +# ************************************************ +# Define the configuration for the graph +# ************************************************ + +graph_config = { + "llm": { + "api_key": os.getenv("NEMOTRON_API_KEY"), + "model": 
"nemotron/nemotron-3-8b-chat", + }, + "verbose": True, + "headless": False, +} + +# ************************************************ +# Create the SmartScraperGraph instance and run it +# ************************************************ + +smart_scraper_multi_lite_graph = SmartScraperMultiLiteGraph( + prompt="Who is Marco Perini?", + source= [ + "https://perinim.github.io/", + "https://perinim.github.io/cv/" + ], + config=graph_config +) + +result = smart_scraper_multi_lite_graph.run() +print(json.dumps(result, indent=4)) + +# ************************************************ +# Get graph execution info +# ************************************************ + +graph_exec_info = smart_scraper_multi_lite_graph.get_execution_info() +print(prettify_exec_info(graph_exec_info)) diff --git a/examples/oneapi/smart_scraper_lite_oneapi.py b/examples/oneapi/smart_scraper_lite_oneapi.py new file mode 100644 index 00000000..b271acb3 --- /dev/null +++ b/examples/oneapi/smart_scraper_lite_oneapi.py @@ -0,0 +1,32 @@ +""" +Basic example of scraping pipeline using SmartScraper +""" +import os +import json +from dotenv import load_dotenv +from scrapegraphai.graphs import SmartScraperLiteGraph +from scrapegraphai.utils import prettify_exec_info + +load_dotenv() + +graph_config = { + "llm": { + "api_key": os.getenv("ONEAPI_API_KEY"), + "model": "oneapi/gpt-3.5-turbo", + "base_url": "http://127.0.0.1:3000/v1", + }, + "verbose": True, + "headless": False, +} + +smart_scraper_lite_graph = SmartScraperLiteGraph( + prompt="Who is Marco Perini?", + source="https://perinim.github.io/", + config=graph_config +) + +result = smart_scraper_lite_graph.run() +print(json.dumps(result, indent=4)) + +graph_exec_info = smart_scraper_lite_graph.get_execution_info() +print(prettify_exec_info(graph_exec_info)) diff --git a/examples/oneapi/smart_scraper_multi_lite_oneapi.py b/examples/oneapi/smart_scraper_multi_lite_oneapi.py new file mode 100644 index 00000000..8cf66dea --- /dev/null +++ 
b/examples/oneapi/smart_scraper_multi_lite_oneapi.py @@ -0,0 +1,43 @@ +""" +Basic example of scraping pipeline using SmartScraper +""" +import os +import json +from dotenv import load_dotenv +from scrapegraphai.graphs import SmartScraperMultiLiteGraph +from scrapegraphai.utils import prettify_exec_info + +load_dotenv() + +together_key = os.getenv("TOGETHER_APIKEY") + +graph_config = { + "llm": { + "model": "togetherai/meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo", + "api_key": together_key, + }, + "verbose": True, +} + +# ************************************************ +# Create the SmartScraperGraph instance and run it +# ************************************************ + +smart_scraper_multi_lite_graph = SmartScraperMultiLiteGraph( + prompt="Who is Marco Perini?", + source= [ + "https://perinim.github.io/", + "https://perinim.github.io/cv/" + ], + config=graph_config +) + +result = smart_scraper_multi_lite_graph.run() +print(json.dumps(result, indent=4)) + +# ************************************************ +# Get graph execution info +# ************************************************ + +graph_exec_info = smart_scraper_multi_lite_graph.get_execution_info() +print(prettify_exec_info(graph_exec_info)) diff --git a/examples/openai/smart_scraper_lite_openai.py b/examples/openai/smart_scraper_lite_openai.py new file mode 100644 index 00000000..5de725bb --- /dev/null +++ b/examples/openai/smart_scraper_lite_openai.py @@ -0,0 +1,32 @@ +""" +Basic example of scraping pipeline using SmartScraper +""" +import os +import json +from dotenv import load_dotenv +from scrapegraphai.graphs import SmartScraperLiteGraph +from scrapegraphai.utils import prettify_exec_info + +load_dotenv() + +graph_config = { + "llm": { + "api_key": os.getenv("OPENAI_API_KEY"), + "model": "openai/gpt-4o", + }, + "verbose": True, + "headless": False, +} + +smart_scraper_lite_graph = SmartScraperLiteGraph( + prompt="Who is Marco Perini?", + source="https://perinim.github.io/", + config=graph_config +) 
+ +result = smart_scraper_lite_graph.run() +print(json.dumps(result, indent=4)) + +graph_exec_info = smart_scraper_lite_graph.get_execution_info() +print(prettify_exec_info(graph_exec_info)) + diff --git a/examples/openai/smart_scraper_multi_lite_openai.py b/examples/openai/smart_scraper_multi_lite_openai.py new file mode 100644 index 00000000..69eeafc7 --- /dev/null +++ b/examples/openai/smart_scraper_multi_lite_openai.py @@ -0,0 +1,47 @@ +""" +Basic example of scraping pipeline using SmartScraper +""" +import os +import json +from dotenv import load_dotenv +from scrapegraphai.graphs import SmartScraperMultiLiteGraph +from scrapegraphai.utils import prettify_exec_info + +load_dotenv() + +# ************************************************ +# Define the configuration for the graph +# ************************************************ + + +graph_config = { + "llm": { + "api_key": os.getenv("OPENAI_API_KEY"), + "model": "openai/gpt-4o", + }, + "verbose": True, + "headless": False, +} + +# ************************************************ +# Create the SmartScraperGraph instance and run it +# ************************************************ + +smart_scraper_multi_lite_graph = SmartScraperMultiLiteGraph( + prompt="Who is Marco Perini?", + source= [ + "https://perinim.github.io/", + "https://perinim.github.io/cv/" + ], + config=graph_config +) + +result = smart_scraper_multi_lite_graph.run() +print(json.dumps(result, indent=4)) + +# ************************************************ +# Get graph execution info +# ************************************************ + +graph_exec_info = smart_scraper_multi_lite_graph.get_execution_info() +print(prettify_exec_info(graph_exec_info)) diff --git a/examples/together/depth_search_graph_together.py b/examples/together/depth_search_graph_together.py index 7a2e7f3e..fb7b4d9e 100644 --- a/examples/together/depth_search_graph_together.py +++ b/examples/together/depth_search_graph_together.py @@ -7,13 +7,12 @@ load_dotenv() -openai_key = 
os.getenv("OPENAI_APIKEY") +together_key = os.getenv("TOGETHER_APIKEY") graph_config = { "llm": { - "api_key": "***************************", - "model": "oneapi/qwen-turbo", - "base_url": "http://127.0.0.1:3000/v1", # 设置 OneAPI URL + "model": "togetherai/meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo", + "api_key": together_key, }, "verbose": True, "headless": False, diff --git a/examples/together/smart_scraper_lite_together.py b/examples/together/smart_scraper_lite_together.py new file mode 100644 index 00000000..0519ecba --- /dev/null +++ b/examples/together/smart_scraper_lite_together.py @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/examples/together/smart_scraper_multi_lite_together.py b/examples/together/smart_scraper_multi_lite_together.py new file mode 100644 index 00000000..8cf66dea --- /dev/null +++ b/examples/together/smart_scraper_multi_lite_together.py @@ -0,0 +1,43 @@ +""" +Basic example of scraping pipeline using SmartScraper +""" +import os +import json +from dotenv import load_dotenv +from scrapegraphai.graphs import SmartScraperMultiLiteGraph +from scrapegraphai.utils import prettify_exec_info + +load_dotenv() + +together_key = os.getenv("TOGETHER_APIKEY") + +graph_config = { + "llm": { + "model": "togetherai/meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo", + "api_key": together_key, + }, + "verbose": True, +} + +# ************************************************ +# Create the SmartScraperGraph instance and run it +# ************************************************ + +smart_scraper_multi_lite_graph = SmartScraperMultiLiteGraph( + prompt="Who is Marco Perini?", + source= [ + "https://perinim.github.io/", + "https://perinim.github.io/cv/" + ], + config=graph_config +) + +result = smart_scraper_multi_lite_graph.run() +print(json.dumps(result, indent=4)) + +# ************************************************ +# Get graph execution info +# ************************************************ + +graph_exec_info = 
smart_scraper_multi_lite_graph.get_execution_info() +print(prettify_exec_info(graph_exec_info)) diff --git a/pyproject.toml b/pyproject.toml index 11b98499..be705469 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,7 +1,9 @@ [project] name = "scrapegraphai" -version = "1.26.7" +version = "1.27.0b10" + + description = "A web scraping library based on LangChain which uses LLM and direct graph logic to create scraping pipelines." authors = [ @@ -38,7 +40,8 @@ dependencies = [ "async-timeout>=4.0.3", "transformers>=4.44.2", "googlesearch-python>=1.2.5", - "simpleeval>=1.0.0" + "simpleeval>=1.0.0", + "async_timeout>=4.0.3" ] license = "MIT" diff --git a/requirements-dev.lock b/requirements-dev.lock index bca5e9c2..61bd3e2b 100644 --- a/requirements-dev.lock +++ b/requirements-dev.lock @@ -30,8 +30,6 @@ anyio==4.4.0 astroid==3.2.4 # via pylint async-timeout==4.0.3 - # via aiohttp - # via langchain # via scrapegraphai attrs==24.2.0 # via aiohttp @@ -80,9 +78,6 @@ distro==1.9.0 # via openai docutils==0.19 # via sphinx -exceptiongroup==1.2.2 - # via anyio - # via pytest fastapi==0.112.0 # via burr fastapi-pagination==0.12.26 @@ -136,6 +131,7 @@ graphviz==0.20.3 # via burr greenlet==3.0.3 # via playwright + # via sqlalchemy grpcio==1.65.4 # via google-api-core # via grpcio-status @@ -504,9 +500,6 @@ tokenizers==0.19.1 # via transformers toml==0.10.2 # via streamlit -tomli==2.0.1 - # via pylint - # via pytest tomlkit==0.13.0 # via pylint tornado==6.4.1 @@ -524,8 +517,6 @@ transformers==4.44.2 # via scrapegraphai typing-extensions==4.12.2 # via altair - # via anyio - # via astroid # via fastapi # via fastapi-pagination # via google-generativeai @@ -540,7 +531,6 @@ typing-extensions==4.12.2 # via sqlalchemy # via streamlit # via typing-inspect - # via uvicorn typing-inspect==0.9.0 # via dataclasses-json # via sf-hamilton diff --git a/requirements.lock b/requirements.lock index 38be6e68..c2c40996 100644 --- a/requirements.lock +++ b/requirements.lock @@ -19,8 +19,6 @@ 
anyio==4.4.0 # via httpx # via openai async-timeout==4.0.3 - # via aiohttp - # via langchain # via scrapegraphai attrs==23.2.0 # via aiohttp @@ -50,8 +48,6 @@ dill==0.3.8 # via multiprocess distro==1.9.0 # via openai -exceptiongroup==1.2.2 - # via anyio fastembed==0.3.6 # via scrapegraphai filelock==3.15.4 @@ -91,6 +87,7 @@ googlesearch-python==1.2.5 # via scrapegraphai greenlet==3.0.3 # via playwright + # via sqlalchemy grpcio==1.65.1 # via google-api-core # via grpcio-status @@ -371,7 +368,6 @@ tqdm==4.66.4 transformers==4.44.2 # via scrapegraphai typing-extensions==4.12.2 - # via anyio # via google-generativeai # via huggingface-hub # via langchain-core diff --git a/requirements.txt b/requirements.txt index c72ad1bb..6c7a0326 100644 --- a/requirements.txt +++ b/requirements.txt @@ -18,4 +18,5 @@ undetected-playwright>=0.3.0 semchunk>=1.0.1 langchain-ollama>=0.1.3 simpleeval>=0.9.13 -googlesearch-python>=1.2.5 \ No newline at end of file +googlesearch-python>=1.2.5 +async_timeout>=4.0.3 \ No newline at end of file diff --git a/scrapegraphai/graphs/__init__.py b/scrapegraphai/graphs/__init__.py index 5b217bc9..2c75f0f7 100644 --- a/scrapegraphai/graphs/__init__.py +++ b/scrapegraphai/graphs/__init__.py @@ -25,3 +25,5 @@ from .smart_scraper_multi_concat_graph import SmartScraperMultiConcatGraph from .code_generator_graph import CodeGeneratorGraph from .depth_search_graph import DepthSearchGraph +from .smart_scraper_multi_lite_graph import SmartScraperMultiLiteGraph +from .smart_scraper_lite_graph import SmartScraperLiteGraph diff --git a/scrapegraphai/graphs/base_graph.py b/scrapegraphai/graphs/base_graph.py index 45953b2f..74135108 100644 --- a/scrapegraphai/graphs/base_graph.py +++ b/scrapegraphai/graphs/base_graph.py @@ -93,7 +93,10 @@ def _set_conditional_node_edges(self): if len(outgoing_edges) != 2: raise ValueError(f"ConditionalNode '{node.node_name}' must have exactly two outgoing edges.") node.true_node_name = outgoing_edges[0][1].node_name - 
node.false_node_name = outgoing_edges[1][1].node_name + try: + node.false_node_name = outgoing_edges[1][1].node_name + except: + node.false_node_name = None def _execute_standard(self, initial_state: dict) -> Tuple[dict, list]: """ @@ -219,6 +222,8 @@ def _execute_standard(self, initial_state: dict) -> Tuple[dict, list]: node_names = {node.node_name for node in self.nodes} if result in node_names: current_node_name = result + elif result is None: + current_node_name = None else: raise ValueError(f"Conditional Node returned a node name '{result}' that does not exist in the graph") diff --git a/scrapegraphai/graphs/csv_scraper_multi_graph.py b/scrapegraphai/graphs/csv_scraper_multi_graph.py index c3463c40..325ffb45 100644 --- a/scrapegraphai/graphs/csv_scraper_multi_graph.py +++ b/scrapegraphai/graphs/csv_scraper_multi_graph.py @@ -44,7 +44,6 @@ class CSVScraperMultiGraph(AbstractGraph): def __init__(self, prompt: str, source: List[str], config: dict, schema: Optional[BaseModel] = None): - self.max_results = config.get("max_results", 3) self.copy_config = safe_deepcopy(config) self.copy_schema = deepcopy(schema) diff --git a/scrapegraphai/graphs/json_scraper_multi_graph.py b/scrapegraphai/graphs/json_scraper_multi_graph.py index 086651af..7c1e4e45 100644 --- a/scrapegraphai/graphs/json_scraper_multi_graph.py +++ b/scrapegraphai/graphs/json_scraper_multi_graph.py @@ -44,7 +44,6 @@ class JSONScraperMultiGraph(AbstractGraph): def __init__(self, prompt: str, source: List[str], config: dict, schema: Optional[BaseModel] = None): - self.max_results = config.get("max_results", 3) self.copy_config = safe_deepcopy(config) self.copy_schema = deepcopy(schema) diff --git a/scrapegraphai/graphs/script_creator_multi_graph.py b/scrapegraphai/graphs/script_creator_multi_graph.py index 1d782d0f..0eb3200a 100644 --- a/scrapegraphai/graphs/script_creator_multi_graph.py +++ b/scrapegraphai/graphs/script_creator_multi_graph.py @@ -43,8 +43,6 @@ class 
ScriptCreatorMultiGraph(AbstractGraph): def __init__(self, prompt: str, source: List[str], config: dict, schema: Optional[BaseModel] = None): - self.max_results = config.get("max_results", 3) - self.copy_config = safe_deepcopy(config) self.copy_schema = deepcopy(schema) super().__init__(prompt, config, source, schema) diff --git a/scrapegraphai/graphs/smart_scraper_graph.py b/scrapegraphai/graphs/smart_scraper_graph.py index bc85bb00..594420f5 100644 --- a/scrapegraphai/graphs/smart_scraper_graph.py +++ b/scrapegraphai/graphs/smart_scraper_graph.py @@ -2,7 +2,6 @@ SmartScraperGraph Module """ from typing import Optional -import logging from pydantic import BaseModel from .base_graph import BaseGraph from .abstract_graph import AbstractGraph @@ -10,8 +9,10 @@ FetchNode, ParseNode, ReasoningNode, - GenerateAnswerNode + GenerateAnswerNode, + ConditionalNode ) +from ..prompts import REGEN_ADDITIONAL_INFO class SmartScraperGraph(AbstractGraph): """ @@ -90,6 +91,28 @@ def _create_graph(self) -> BaseGraph: } ) + cond_node = None + regen_node = None + if self.config.get("reattempt") is True: + cond_node = ConditionalNode( + input="answer", + output=["answer"], + node_name="ConditionalNode", + node_config={ + "key_name": "answer", + "condition": 'not answer or answer=="NA"', + } + ) + regen_node = GenerateAnswerNode( + input="user_prompt & answer", + output=["answer"], + node_config={ + "llm_model": self.llm_model, + "additional_info": REGEN_ADDITIONAL_INFO, + "schema": self.schema, + } + ) + if self.config.get("html_mode") is False: parse_node = ParseNode( input="doc", @@ -100,6 +123,7 @@ def _create_graph(self) -> BaseGraph: } ) + reasoning_node = None if self.config.get("reasoning"): reasoning_node = ReasoningNode( input="user_prompt & (relevant_chunks | parsed_doc | doc)", @@ -110,68 +134,72 @@ def _create_graph(self) -> BaseGraph: "schema": self.schema, } ) + + # Define the graph variation configurations + # (html_mode, reasoning, reattempt) + graph_variation_config = 
{ + (False, True, False): { + "nodes": [fetch_node, parse_node, reasoning_node, generate_answer_node], + "edges": [(fetch_node, parse_node), (parse_node, reasoning_node), (reasoning_node, generate_answer_node)] + }, + (True, True, False): { + "nodes": [fetch_node, reasoning_node, generate_answer_node], + "edges": [(fetch_node, reasoning_node), (reasoning_node, generate_answer_node)] + }, + (True, False, False): { + "nodes": [fetch_node, generate_answer_node], + "edges": [(fetch_node, generate_answer_node)] + }, + (False, False, False): { + "nodes": [fetch_node, parse_node, generate_answer_node], + "edges": [(fetch_node, parse_node), (parse_node, generate_answer_node)] + }, + (False, True, True): { + "nodes": [fetch_node, parse_node, reasoning_node, generate_answer_node, cond_node, regen_node], + "edges": [(fetch_node, parse_node), (parse_node, reasoning_node), (reasoning_node, generate_answer_node), + (generate_answer_node, cond_node), (cond_node, regen_node), (cond_node, None)] + }, + (True, True, True): { + "nodes": [fetch_node, reasoning_node, generate_answer_node, cond_node, regen_node], + "edges": [(fetch_node, reasoning_node), (reasoning_node, generate_answer_node), + (generate_answer_node, cond_node), (cond_node, regen_node), (cond_node, None)] + }, + (True, False, True): { + "nodes": [fetch_node, generate_answer_node, cond_node, regen_node], + "edges": [(fetch_node, generate_answer_node), (generate_answer_node, cond_node), + (cond_node, regen_node), (cond_node, None)] + }, + (False, False, True): { + "nodes": [fetch_node, parse_node, generate_answer_node, cond_node, regen_node], + "edges": [(fetch_node, parse_node), (parse_node, generate_answer_node), + (generate_answer_node, cond_node), (cond_node, regen_node), (cond_node, None)] + } + } - if self.config.get("html_mode") is False and self.config.get("reasoning") is True: - - return BaseGraph( - nodes=[ - fetch_node, - parse_node, - reasoning_node, - generate_answer_node, - ], - edges=[ - (fetch_node, 
parse_node), - (parse_node, reasoning_node), - (reasoning_node, generate_answer_node) - ], - entry_point=fetch_node, - graph_name=self.__class__.__name__ - ) - - elif self.config.get("html_mode") is True and self.config.get("reasoning") is True: + # Get the current conditions + html_mode = self.config.get("html_mode", False) + reasoning = self.config.get("reasoning", False) + reattempt = self.config.get("reattempt", False) - return BaseGraph( - nodes=[ - fetch_node, - reasoning_node, - generate_answer_node, - ], - edges=[ - (fetch_node, reasoning_node), - (reasoning_node, generate_answer_node) - ], - entry_point=fetch_node, - graph_name=self.__class__.__name__ - ) + # Retrieve the appropriate graph configuration + config = graph_variation_config.get((html_mode, reasoning, reattempt)) - elif self.config.get("html_mode") is True and self.config.get("reasoning") is False: + if config: return BaseGraph( - nodes=[ - fetch_node, - generate_answer_node, - ], - edges=[ - (fetch_node, generate_answer_node) - ], + nodes=config["nodes"], + edges=config["edges"], entry_point=fetch_node, graph_name=self.__class__.__name__ ) + # Default return if no conditions match return BaseGraph( - nodes=[ - fetch_node, - parse_node, - generate_answer_node, - ], - edges=[ - (fetch_node, parse_node), - (parse_node, generate_answer_node) - ], - entry_point=fetch_node, - graph_name=self.__class__.__name__ - ) - + nodes=[fetch_node, parse_node, generate_answer_node], + edges=[(fetch_node, parse_node), (parse_node, generate_answer_node)], + entry_point=fetch_node, + graph_name=self.__class__.__name__ + ) + def run(self) -> str: """ Executes the scraping process and returns the answer to the prompt. 
diff --git a/scrapegraphai/graphs/smart_scraper_lite_graph.py b/scrapegraphai/graphs/smart_scraper_lite_graph.py new file mode 100644 index 00000000..77437145 --- /dev/null +++ b/scrapegraphai/graphs/smart_scraper_lite_graph.py @@ -0,0 +1,99 @@ +""" +SmartScraperGraph Module +""" +from typing import Optional +from pydantic import BaseModel +from .base_graph import BaseGraph +from .abstract_graph import AbstractGraph +from ..nodes import ( + FetchNode, + ParseNode, +) + +class SmartScraperLiteGraph(AbstractGraph): + """ + SmartScraperLiteGraph is a scraping pipeline that automates the process of + extracting information from web pages. + + Attributes: + prompt (str): The prompt for the graph. + source (str): The source of the graph. + config (dict): Configuration parameters for the graph. + schema (BaseModel): The schema for the graph output. + verbose (bool): A flag indicating whether to show print statements during execution. + headless (bool): A flag indicating whether to run the graph in headless mode. + + Args: + prompt (str): The prompt for the graph. + source (str): The source of the graph. + config (dict): Configuration parameters for the graph. + schema (BaseModel): The schema for the graph output. + + Example: + >>> scraper = SmartScraperLiteGraph( + ... "https://en.wikipedia.org/wiki/Chioggia", + ... {"llm": {"model": "openai/gpt-3.5-turbo"}} + ... ) + >>> result = smart_scraper.run() + ) + """ + + def __init__(self, source: str, config: dict, prompt: str = "", + schema: Optional[BaseModel] = None): + super().__init__(prompt, config, source, schema) + + self.input_key = "url" if source.startswith("http") else "local_dir" + + def _create_graph(self) -> BaseGraph: + """ + Creates the graph of nodes representing the workflow for web scraping. + + Returns: + BaseGraph: A graph instance representing the web scraping workflow. 
+ """ + fetch_node = FetchNode( + input="url| local_dir", + output=["doc"], + node_config={ + "llm_model": self.llm_model, + "force": self.config.get("force", False), + "cut": self.config.get("cut", True), + "loader_kwargs": self.config.get("loader_kwargs", {}), + "browser_base": self.config.get("browser_base"), + "scrape_do": self.config.get("scrape_do") + } + ) + + parse_node = ParseNode( + input="doc", + output=["parsed_doc"], + node_config={ + "llm_model": self.llm_model, + "chunk_size": self.model_token + } + ) + + return BaseGraph( + nodes=[ + fetch_node, + parse_node, + ], + edges=[ + (fetch_node, parse_node), + ], + entry_point=fetch_node, + graph_name=self.__class__.__name__ + ) + + def run(self) -> str: + """ + Executes the scraping process and returns the scraping content. + + Returns: + str: The scraping content. + """ + + inputs = {"user_prompt": self.prompt, self.input_key: self.source} + self.final_state, self.execution_info = self.graph.execute(inputs) + + return self.final_state.get("parsed_doc", "No document found.") diff --git a/scrapegraphai/graphs/smart_scraper_multi_concat_graph.py b/scrapegraphai/graphs/smart_scraper_multi_concat_graph.py index f7a6bee4..35eefb6a 100644 --- a/scrapegraphai/graphs/smart_scraper_multi_concat_graph.py +++ b/scrapegraphai/graphs/smart_scraper_multi_concat_graph.py @@ -35,17 +35,16 @@ class SmartScraperMultiConcatGraph(AbstractGraph): schema (Optional[BaseModel]): The schema for the graph output. Example: - >>> search_graph = MultipleSearchGraph( + >>> smart_scraper_multi_concat_graph = SmartScraperMultiConcatGraph( ... "What is Chioggia famous for?", ... {"llm": {"model": "openai/gpt-3.5-turbo"}} ... 
) - >>> result = search_graph.run() + >>> result = smart_scraper_multi_concat_graph.run() """ def __init__(self, prompt: str, source: List[str], config: dict, schema: Optional[BaseModel] = None): - self.max_results = config.get("max_results", 3) self.copy_config = safe_deepcopy(config) self.copy_schema = deepcopy(schema) diff --git a/scrapegraphai/graphs/smart_scraper_multi_graph.py b/scrapegraphai/graphs/smart_scraper_multi_graph.py index 8d0063a3..a2e21d1b 100644 --- a/scrapegraphai/graphs/smart_scraper_multi_graph.py +++ b/scrapegraphai/graphs/smart_scraper_multi_graph.py @@ -18,6 +18,8 @@ class SmartScraperMultiGraph(AbstractGraph): SmartScraperMultiGraph is a scraping pipeline that scrapes a list of URLs and generates answers to a given prompt. It only requires a user prompt and a list of URLs. + The difference with the SmartScraperMultiLiteGraph is that in this case the content will be abstracted + by llm and then merged finally passed to the llm. Attributes: prompt (str): The user prompt to search the internet. @@ -34,11 +36,15 @@ class SmartScraperMultiGraph(AbstractGraph): schema (Optional[BaseModel]): The schema for the graph output. Example: - >>> search_graph = MultipleSearchGraph( - ... "What is Chioggia famous for?", - ... {"llm": {"model": "openai/gpt-3.5-turbo"}} + >>> smart_scraper_multi_graph = SmartScraperMultiGraph( + ... prompt="Who is Marco Perini?", + ... source= [ + ... "https://perinim.github.io/", + ... "https://perinim.github.io/cv/" + ... ], + ... config={"llm": {"model": "openai/gpt-3.5-turbo"}} ... 
) - >>> result = search_graph.run() + >>> result = smart_scraper_multi_graph.run() """ def __init__(self, prompt: str, source: List[str], diff --git a/scrapegraphai/graphs/smart_scraper_multi_lite_graph.py b/scrapegraphai/graphs/smart_scraper_multi_lite_graph.py new file mode 100644 index 00000000..bb17bd03 --- /dev/null +++ b/scrapegraphai/graphs/smart_scraper_multi_lite_graph.py @@ -0,0 +1,103 @@ +""" +SmartScraperMultiGraph Module +""" +from copy import deepcopy +from typing import List, Optional +from pydantic import BaseModel +from .base_graph import BaseGraph +from .abstract_graph import AbstractGraph +from .smart_scraper_lite_graph import SmartScraperLiteGraph +from ..nodes import ( + GraphIteratorNode, + MergeAnswersNode, +) +from ..utils.copy import safe_deepcopy + +class SmartScraperMultiLiteGraph(AbstractGraph): + """ + SmartScraperMultiLiteGraph is a scraping pipeline that scrapes a + list of URLs and merge the content first and finally generates answers to a given prompt. + It only requires a user prompt and a list of URLs. + The difference with the SmartScraperMultiGraph is that in this case the content is merged + before to be passed to the llm. + + Attributes: + prompt (str): The user prompt to search the internet. + llm_model (dict): The configuration for the language model. + embedder_model (dict): The configuration for the embedder model. + headless (bool): A flag to run the browser in headless mode. + verbose (bool): A flag to display the execution information. + model_token (int): The token limit for the language model. + + Args: + prompt (str): The user prompt to search the internet. + source (List[str]): The source of the graph. + config (dict): Configuration parameters for the graph. + schema (Optional[BaseModel]): The schema for the graph output. + + Example: + >>> smart_scraper_multi_lite_graph = SmartScraperMultiLiteGraph( + ... prompt="Who is Marco Perini?", + ... source= [ + ... "https://perinim.github.io/", + ... 
"https://perinim.github.io/cv/" + ... ], + ... config={"llm": {"model": "openai/gpt-3.5-turbo"}} + ... ) + >>> result = smart_scraper_multi_lite_graph.run() + """ + + def __init__(self, prompt: str, source: List[str], + config: dict, schema: Optional[BaseModel] = None): + + self.copy_config = safe_deepcopy(config) + self.copy_schema = deepcopy(schema) + super().__init__(prompt, config, source, schema) + + def _create_graph(self) -> BaseGraph: + """ + Creates the graph of nodes representing the workflow for web scraping + and parsing and then merge the content and generates answers to a given prompt. + """ + graph_iterator_node = GraphIteratorNode( + input="user_prompt & urls", + output=["parsed_doc"], + node_config={ + "graph_instance": SmartScraperLiteGraph, + "scraper_config": self.copy_config, + }, + schema=self.copy_schema + ) + + merge_answers_node = MergeAnswersNode( + input="user_prompt & parsed_doc", + output=["answer"], + node_config={ + "llm_model": self.llm_model, + "schema": self.copy_schema + } + ) + + return BaseGraph( + nodes=[ + graph_iterator_node, + merge_answers_node, + ], + edges=[ + (graph_iterator_node, merge_answers_node), + ], + entry_point=graph_iterator_node, + graph_name=self.__class__.__name__ + ) + + def run(self) -> str: + """ + Executes the web scraping and parsing process first and + then concatenate the content and generates answers to a given prompt. + + Returns: + str: The answer to the prompt. 
+ """ + inputs = {"user_prompt": self.prompt, "urls": self.source} + self.final_state, self.execution_info = self.graph.execute(inputs) + return self.final_state.get("answer", "No answer found.") diff --git a/scrapegraphai/nodes/conditional_node.py b/scrapegraphai/nodes/conditional_node.py index 02ff61e9..c5ff58f3 100644 --- a/scrapegraphai/nodes/conditional_node.py +++ b/scrapegraphai/nodes/conditional_node.py @@ -61,7 +61,7 @@ def execute(self, state: dict) -> dict: str: The name of the next node to execute based on the presence of the key. """ - if self.true_node_name is None or self.false_node_name is None: + if self.true_node_name is None: raise ValueError("ConditionalNode's next nodes are not set properly.") if self.condition: diff --git a/scrapegraphai/nodes/fetch_node.py b/scrapegraphai/nodes/fetch_node.py index 4cd549a5..d90864e9 100644 --- a/scrapegraphai/nodes/fetch_node.py +++ b/scrapegraphai/nodes/fetch_node.py @@ -270,10 +270,10 @@ def handle_web_source(self, state, source): else: loader_kwargs = {} - if self.node_config is not None: + if self.node_config: loader_kwargs = self.node_config.get("loader_kwargs", {}) - if self.browser_base is not None: + if self.browser_base: try: from ..docloaders.browser_base import browser_base_fetch except ImportError: @@ -285,7 +285,7 @@ def handle_web_source(self, state, source): document = [Document(page_content=content, metadata={"source": source}) for content in data] - elif self.scrape_do is not None: + elif self.scrape_do: from ..docloaders.scrape_do import scrape_do_fetch if (self.scrape_do.get("use_proxy") is None) or \ self.scrape_do.get("geoCode") is None or \ diff --git a/scrapegraphai/nodes/fetch_node_level_k.py b/scrapegraphai/nodes/fetch_node_level_k.py index 0f772edf..ce8e4042 100644 --- a/scrapegraphai/nodes/fetch_node_level_k.py +++ b/scrapegraphai/nodes/fetch_node_level_k.py @@ -57,6 +57,7 @@ def __init__( self.headless = node_config.get("headless", True) if node_config else True self.loader_kwargs 
= node_config.get("loader_kwargs", {}) if node_config else {} self.browser_base = node_config.get("browser_base", None) + self.scrape_do = node_config.get("scrape_do", None) self.depth = node_config.get("depth", 1) if node_config else 1 self.only_inside_links = node_config.get("only_inside_links", False) if node_config else False self.min_input_len = 1 @@ -115,6 +116,11 @@ def fetch_content(self, source: str, loader_kwargs) -> Optional[str]: self.browser_base.get("project_id"), [source]) document = [Document(page_content=content, metadata={"source": source}) for content in data] + elif self.scrape_do: + from ..docloaders.scrape_do import scrape_do_fetch + data = scrape_do_fetch(self.scrape_do.get("api_key"), source) + document = [Document(page_content=data, + metadata={"source": source})] else: loader = ChromiumLoader([source], headless=self.headless, **loader_kwargs) document = loader.load() diff --git a/scrapegraphai/nodes/generate_answer_from_image_node.py b/scrapegraphai/nodes/generate_answer_from_image_node.py index 7134cabe..9359b2bb 100644 --- a/scrapegraphai/nodes/generate_answer_from_image_node.py +++ b/scrapegraphai/nodes/generate_answer_from_image_node.py @@ -71,10 +71,10 @@ async def execute_async(self, state: dict) -> dict: images = state.get('screenshots', []) analyses = [] - supported_models = ("gpt-4o", "gpt-4o-mini", "gpt-4-turbo") + supported_models = ("gpt-4o", "gpt-4o-mini", "gpt-4-turbo", "gpt-4") if self.node_config["config"]["llm"]["model"].split("/")[-1]not in supported_models: - raise ValueError(f"""Model '{self.node_config['config']['llm']['model']}' + raise ValueError(f"""The model provided is not supported. 
Supported models are: {', '.join(supported_models)}.""") diff --git a/scrapegraphai/nodes/get_probable_tags_node.py b/scrapegraphai/nodes/get_probable_tags_node.py index 9ba38283..e34bbbb4 100644 --- a/scrapegraphai/nodes/get_probable_tags_node.py +++ b/scrapegraphai/nodes/get_probable_tags_node.py @@ -4,6 +4,7 @@ from typing import List, Optional from langchain.output_parsers import CommaSeparatedListOutputParser from langchain.prompts import PromptTemplate +from ..prompts import TEMPLATE_GET_PROBABLE_TAGS from ..utils.logging import get_logger from .base_node import BaseNode @@ -68,14 +69,7 @@ def execute(self, state: dict) -> dict: output_parser = CommaSeparatedListOutputParser() format_instructions = output_parser.get_format_instructions() - template = """ - PROMPT: - You are a website scraper that knows all the types of html tags. - You are now asked to list all the html tags where you think you can find the information of the asked question.\n - INSTRUCTIONS: {format_instructions} \n - WEBPAGE: The webpage is: {webpage} \n - QUESTION: The asked question is the following: {question} - """ + template = TEMPLATE_GET_PROBABLE_TAGS tag_prompt = PromptTemplate( template=template, diff --git a/scrapegraphai/prompts/__init__.py b/scrapegraphai/prompts/__init__.py index b23374a4..15889108 100644 --- a/scrapegraphai/prompts/__init__.py +++ b/scrapegraphai/prompts/__init__.py @@ -5,7 +5,7 @@ from .generate_answer_node_prompts import (TEMPLATE_CHUNKS, TEMPLATE_NO_CHUNKS, TEMPLATE_MERGE, TEMPLATE_CHUNKS_MD, - TEMPLATE_NO_CHUNKS_MD, TEMPLATE_MERGE_MD) + TEMPLATE_NO_CHUNKS_MD, TEMPLATE_MERGE_MD, REGEN_ADDITIONAL_INFO) from .generate_answer_node_csv_prompts import (TEMPLATE_CHUKS_CSV, TEMPLATE_NO_CHUKS_CSV, TEMPLATE_MERGE_CSV) @@ -36,3 +36,4 @@ from .reasoning_node_prompts import (TEMPLATE_REASONING, TEMPLATE_REASONING_WITH_CONTEXT) from .merge_generated_scripts_prompts import TEMPLATE_MERGE_SCRIPTS_PROMPT +from .get_probable_tags_node_prompts import 
TEMPLATE_GET_PROBABLE_TAGS diff --git a/scrapegraphai/prompts/description_node_prompts.py b/scrapegraphai/prompts/description_node_prompts.py index 86264d0b..944ed24e 100644 --- a/scrapegraphai/prompts/description_node_prompts.py +++ b/scrapegraphai/prompts/description_node_prompts.py @@ -7,4 +7,4 @@ following content from a website. \n Please provide a description summary of maximum of 20 words. \n CONTENT OF THE WEBSITE: {content} -""" \ No newline at end of file +""" diff --git a/scrapegraphai/prompts/generate_answer_node_prompts.py b/scrapegraphai/prompts/generate_answer_node_prompts.py index f9506a7b..a14f27f4 100644 --- a/scrapegraphai/prompts/generate_answer_node_prompts.py +++ b/scrapegraphai/prompts/generate_answer_node_prompts.py @@ -86,3 +86,7 @@ USER QUESTION: {question}\n WEBSITE CONTENT: {context}\n """ + +REGEN_ADDITIONAL_INFO = """ +You are a scraper and you have just failed to scrape the requested information from a website. \n +I want you to try again and provide the missing informations. \n""" diff --git a/scrapegraphai/prompts/get_probable_tags_node_prompts.py b/scrapegraphai/prompts/get_probable_tags_node_prompts.py new file mode 100644 index 00000000..ed86e163 --- /dev/null +++ b/scrapegraphai/prompts/get_probable_tags_node_prompts.py @@ -0,0 +1,12 @@ +""" +Get probable tags node prompts +""" + +TEMPLATE_GET_PROBABLE_TAGS = """ + PROMPT: + You are a website scraper that knows all the types of html tags. 
+ You are now asked to list all the html tags where you think you can find the information of the asked question.\n + INSTRUCTIONS: {format_instructions} \n + WEBPAGE: The webpage is: {webpage} \n + QUESTION: The asked question is the following: {question} +""" diff --git a/scrapegraphai/utils/__init__.py b/scrapegraphai/utils/__init__.py index d5badca9..22f6a4bc 100644 --- a/scrapegraphai/utils/__init__.py +++ b/scrapegraphai/utils/__init__.py @@ -1,8 +1,6 @@ """ __init__.py file for utils folder """ -from .convert_to_csv import convert_to_csv -from .convert_to_json import convert_to_json from .prettify_exec_info import prettify_exec_info from .proxy_rotation import Proxy, parse_or_search_proxy, search_proxy_servers from .save_audio_from_bytes import save_audio_from_bytes @@ -28,3 +26,4 @@ validation_focused_code_generation, semantic_focused_code_generation) from .save_code_to_file import save_code_to_file +from .data_export import export_to_json, export_to_csv, export_to_xml diff --git a/scrapegraphai/utils/convert_to_csv.py b/scrapegraphai/utils/convert_to_csv.py deleted file mode 100644 index e0664541..00000000 --- a/scrapegraphai/utils/convert_to_csv.py +++ /dev/null @@ -1,55 +0,0 @@ -""" -Module that given a filename and a position saves the file in the csv format -""" -import os -import sys -import pandas as pd - -def convert_to_csv(data: dict, filename: str, position: str = None) -> None: - """ - Converts a dictionary to a CSV file and saves it at a specified location. - - Args: - data (dict): The data to be converted into CSV format. - filename (str): The name of the output CSV file, without the '.csv' extension. - position (str, optional): The file path where the CSV should be saved. - Defaults to the directory of the caller script if not provided. - - Returns: - None: The function does not return anything. - - Raises: - FileNotFoundError: If the specified directory does not exist. - PermissionError: If write permissions are lacking for the directory. 
- TypeError: If `data` is not a dictionary. - Exception: For other issues that may arise during the creation or saving of the CSV file. - - Example: - >>> convert_to_csv({'id': [1, 2], 'value': [10, 20]}, 'output', '/path/to/save') - Saves a CSV file named 'output.csv' at '/path/to/save'. - """ - - if ".csv" in filename: - filename = filename.replace(".csv", "") - - if position is None: - caller_dir = os.path.dirname(os.path.abspath(sys.argv[0])) - position = caller_dir - - try: - if not isinstance(data, dict): - raise TypeError("Input data must be a dictionary") - - os.makedirs(position, exist_ok=True) - - df = pd.DataFrame.from_dict(data, orient='index') - df.to_csv(os.path.join(position, f"{filename}.csv"), index=False) - - except FileNotFoundError as fnfe: - raise FileNotFoundError( - f"The specified directory '{position}' does not exist.") from fnfe - except PermissionError as pe: - raise PermissionError( - f"You don't have permission to write to '{position}'.") from pe - except Exception as e: - raise e diff --git a/scrapegraphai/utils/convert_to_json.py b/scrapegraphai/utils/convert_to_json.py deleted file mode 100644 index 4e1711f1..00000000 --- a/scrapegraphai/utils/convert_to_json.py +++ /dev/null @@ -1,52 +0,0 @@ -""" -Convert to json module -""" -import json -import os -import sys - -def convert_to_json(data: dict, filename: str, position: str = None) -> None: - """ - Converts a dictionary to a JSON file and saves it at a specified location. - - Args: - data (dict): The data to be converted into JSON format. - filename (str): The name of the output JSON file, without the '.json' extension. - position (str, optional): The file path where the JSON file should be saved. - Defaults to the directory of the caller script if not provided. - - Returns: - None: The function does not return anything. - - Raises: - ValueError: If 'filename' contains '.json'. - FileNotFoundError: If the specified directory does not exist. 
- PermissionError: If write permissions are lacking for the directory. - - Example: - >>> convert_to_json({'id': [1, 2], 'value': [10, 20]}, 'output', '/path/to/save') - Saves a JSON file named 'output.json' at '/path/to/save'. - - Notes: - This function automatically ensures the directory exists before - attempting to write the file. - If the directory does not exist, it will attempt to create it. - """ - - if ".json" in filename: - filename = filename.replace(".json", "") # Remove .json extension - - if position is None: - caller_dir = os.path.dirname(os.path.abspath(sys.argv[0])) - position = caller_dir - - try: - os.makedirs(position, exist_ok=True) - with open(os.path.join(position, f"{filename}.json"), "w", encoding="utf-8") as f: - f.write(json.dumps(data)) - except FileNotFoundError as fnfe: - raise FileNotFoundError( - f"The specified directory '{position}' does not exist.") from fnfe - except PermissionError as pe: - raise PermissionError( - f"You don't have permission to write to '{position}'.") from pe diff --git a/scrapegraphai/utils/data_export.py b/scrapegraphai/utils/data_export.py new file mode 100644 index 00000000..fbff45e2 --- /dev/null +++ b/scrapegraphai/utils/data_export.py @@ -0,0 +1,57 @@ +""" +data_export module +This module provides functions to export data to various file formats. +""" +import json +import csv +import xml.etree.ElementTree as ET +from typing import List, Dict, Any + +def export_to_json(data: List[Dict[str, Any]], filename: str) -> None: + """ + Export data to a JSON file. + + :param data: List of dictionaries containing the data to export + :param filename: Name of the file to save the JSON data + """ + with open(filename, 'w', encoding='utf-8') as f: + json.dump(data, f, ensure_ascii=False, indent=4) + print(f"Data exported to {filename}") + +def export_to_csv(data: List[Dict[str, Any]], filename: str) -> None: + """ + Export data to a CSV file. 
+ + :param data: List of dictionaries containing the data to export + :param filename: Name of the file to save the CSV data + """ + if not data: + print("No data to export") + return + + keys = data[0].keys() + with open(filename, 'w', newline='', encoding='utf-8') as f: + writer = csv.DictWriter(f, fieldnames=keys) + writer.writeheader() + writer.writerows(data) + print(f"Data exported to {filename}") + +def export_to_xml(data: List[Dict[str, Any]], filename: str, root_element: str = "data") -> None: + """ + Export data to an XML file. + + :param data: List of dictionaries containing the data to export + :param filename: Name of the file to save the XML data + :param root_element: Name of the root element in the XML structure + """ + root = ET.Element(root_element) + for item in data: + element = ET.SubElement(root, "item") + for key, value in item.items(): + sub_element = ET.SubElement(element, key) + sub_element.text = str(value) + + tree = ET.ElementTree(root) + tree.write(filename, encoding='utf-8', xml_declaration=True) + print(f"Data exported to {filename}") + diff --git a/tests/graphs/scrape_graph_test.py b/tests/graphs/scrape_graph_test.py new file mode 100644 index 00000000..00d3f4fb --- /dev/null +++ b/tests/graphs/scrape_graph_test.py @@ -0,0 +1,50 @@ +""" +Module for testing the scrape graph class +""" + +import os +import pytest +import pandas as pd +from dotenv import load_dotenv +from scrapegraphai.graphs import ScrapeGraph +from scrapegraphai.utils import prettify_exec_info + +load_dotenv() + +@pytest.fixture +def graph_config(): + """Configuration of the graph""" + openai_key = os.getenv("OPENAI_APIKEY") + return { + "llm": { + "api_key": openai_key, + "model": "openai/gpt-3.5-turbo", + }, + "verbose": True, + "headless": False, + } + +def test_scraping_pipeline(graph_config): + """Start of the scraping pipeline""" + scrape_graph = ScrapeGraph( + source="https://perinim.github.io/projects/", + config=graph_config, + ) + + result = 
scrape_graph.run() + + assert result is not None + assert isinstance(result, list) + +def test_get_execution_info(graph_config): + """Get the execution info""" + scrape_graph = ScrapeGraph( + source="https://perinim.github.io/projects/", + config=graph_config, + ) + + scrape_graph.run() + + graph_exec_info = scrape_graph.get_execution_info() + + assert graph_exec_info is not None diff --git a/tests/graphs/smart_scraper_multi_lite_graph_openai_test.py b/tests/graphs/smart_scraper_multi_lite_graph_openai_test.py new file mode 100644 index 00000000..0a0e0a69 --- /dev/null +++ b/tests/graphs/smart_scraper_multi_lite_graph_openai_test.py @@ -0,0 +1,59 @@ +""" +Module for testing the smart scraper class +""" + +import os +import pytest +import pandas as pd +from dotenv import load_dotenv +from scrapegraphai.graphs import SmartScraperMultiLiteGraph +from scrapegraphai.utils import prettify_exec_info + +load_dotenv() + +@pytest.fixture +def graph_config(): + """Configuration of the graph""" + openai_key = os.getenv("OPENAI_APIKEY") + + return { + "llm": { + "api_key": openai_key, + "model": "openai/gpt-3.5-turbo", + }, + "verbose": True, + "headless": False, + } + +def test_scraping_pipeline(graph_config): + """Start of the scraping pipeline""" + smart_scraper_multi_lite_graph = SmartScraperMultiLiteGraph( + prompt="Who is Marco Perini?", + source= [ + "https://perinim.github.io/", + "https://perinim.github.io/cv/" + ], + config=graph_config, + ) + + result = smart_scraper_multi_lite_graph.run() + + assert result is not None + assert isinstance(result, dict) + +def test_get_execution_info(graph_config): + """Get the execution info""" + smart_scraper_multi_lite_graph = SmartScraperMultiLiteGraph( + prompt="Who is Marco Perini?", + source= [ + "https://perinim.github.io/", + "https://perinim.github.io/cv/" + ], + config=graph_config, + ) + + smart_scraper_multi_lite_graph.run() + + graph_exec_info = smart_scraper_multi_lite_graph.get_execution_info() + + assert 
graph_exec_info is not None