diff --git a/CHANGELOG.md b/CHANGELOG.md index dff2fccf..01fb0c3a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,22 @@ +## [1.10.0-beta.5](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.10.0-beta.4...v1.10.0-beta.5) (2024-07-20) + + +### Bug Fixes + +* parse_node ([07f1e23](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/07f1e23d235db1a0db2cb155f10b73b0bf882269)) + +## [1.10.0-beta.4](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.10.0-beta.3...v1.10.0-beta.4) (2024-07-20) + + +### Bug Fixes + +* azure models ([03f4a3a](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/03f4a3aa29c42a9a312c4afb6818de3450e7cedf)) + + +### CI + +* **release:** 1.9.2 [skip ci] ([b4b90b3](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/b4b90b3c121911de68a860640419907ca7674953)) + ## [1.9.2](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.9.1...v1.9.2) (2024-07-20) @@ -13,46 +32,47 @@ ## [1.9.1](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.9.0...v1.9.1) (2024-07-12) + ### Bug Fixes -* solve a burr integration ([881290b](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/881290b5066b39c505532656671fbf65f8fc312c)) +* add gpt o mini for azure ([77777c8](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/77777c898d1fad40f340b06c5b36d35b65409ea6)) -## [1.9.0](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.8.0...v1.9.0) (2024-07-09) +## [1.10.0-beta.2](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.10.0-beta.1...v1.10.0-beta.2) (2024-07-19) ### Features -* add fireworks integration ([df0e310](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/df0e3108299071b849d7e055bd11d72764d24f08)) -* add integration for infos ([3bf5f57](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/3bf5f570a8f8e1b037a7ad3c9f583261a1536421)) -* add integrations for markdown files ([2804434](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/2804434a9ee12c52ae8956a88b1778a4dd3ec32f)) -* add vertexai 
integration ([119514b](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/119514bdfc2a16dfb8918b0c34ae7cc43a01384c)) -* improve md prompt recognition ([5fe694b](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/5fe694b6b4545a5091d16110318b992acfca4f58)) +* add gpt4o omni ([431edb7](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/431edb7bb2504f4c1335c3ae3ce2f91867fa7222)) +## [1.10.0-beta.1](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.9.1...v1.10.0-beta.1) (2024-07-19) -### Bug Fixes -* add test ([3a537ee](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/3a537eec6fef1743924a9aa5cef0ba2f8d44bf11)) -* fix pyproject.toml ([7570bf8](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/7570bf8294e49bc54ec9e296aaadb763873390ca)) +### Features +* add searchngx integration ([5c92186](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/5c9218608140bf694fbfd96aa90276bc438bb475)) +* refactoring_to_md function ([602dd00](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/602dd00209ee1d72a1223fc4793759450921fcf9)) -### chore -* **Docker:** fix port number ([afeb81f](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/afeb81f77a884799192d79dcac85666190fb1c9d)) -* **CI:** fix pylint workflow ([583c321](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/583c32106e827f50235d8fc69511652fd4b07a35)) -* **rye:** rebuild lockfiles ([27c2dd2](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/27c2dd23517a7e4b14fafd00320a8b81f73145dc)) +### Bug Fixes +* search link node ([cf3ab55](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/cf3ab5564ae5c415c63d1771b32ea68f5169ca82)) -### Docs -* **roadmap:** fix urls ([14faba4](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/14faba4f00dd9f947f8dc5e0b51be49ea684179f)) -* **roadmap:** next steps ([3e644f4](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/3e644f498f05eb505fbd4e94b144c81567569aaa)) +### chore + +* correct search engine name 
([7ba2f6a](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/7ba2f6ae0b9d2e9336e973e1f57ab8355c739e27)) +* remove unused import ([fd1b7cb](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/fd1b7cb24a7c252277607abde35826e3c58e34ef)) +* remove unused workflow ([5c6dd8d](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/5c6dd8de4da08f09b5dd93c525d14b44778c9659)) +* **ci:** upgrade lockfiles ([c7b05a4](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/c7b05a4993df14d6ed4848121a3cd209571232f7)) +* upgrade tiktoken ([7314bc3](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/7314bc383068db590662bf7e512f799529308991)) ### CI -* **release:** 1.8.1-beta.1 [skip ci] ([8f9f96f](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/8f9f96f7e7ff41d2fff5bbbf18bf4fc85d4f98b3)) -* **release:** 1.9.0-beta.1 [skip ci] ([146432d](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/146432d476f775510441b062935adc47190141e2)) -* **release:** 1.9.0-beta.2 [skip ci] ([5cb5fbf](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/5cb5fbf5503eec9b34a6691eb993716cc9a821d6)) +* **release:** 1.9.0-beta.3 [skip ci] ([d3e63d9](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/d3e63d91be79f74e8a3fdb00e692d546c24cead5)) +* **release:** 1.9.0-beta.4 [skip ci] ([2fa04b5](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/2fa04b58159abf7af890ebc0768fe23d51bf177f)) +* **release:** 1.9.0-beta.5 [skip ci] ([bb62439](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/bb624399cfc3924825892dd48697fc298ad3b002)) +* **release:** 1.9.0-beta.6 [skip ci] ([54a69de](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/54a69de69e8077e02fd5584783ca62cc2e0ec5bb)) ## [1.9.0-beta.2](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.9.0-beta.1...v1.9.0-beta.2) (2024-07-05) diff --git a/examples/anthropic/search_link_graph_haiku.py b/examples/anthropic/search_link_graph_haiku.py new file mode 100644 index 00000000..ccfbc1d2 --- /dev/null +++ 
b/examples/anthropic/search_link_graph_haiku.py @@ -0,0 +1,57 @@ +""" +Example of Search Graph (NOTE: despite the anthropic/ path, this script builds Azure OpenAI model instances) +""" +import os +from dotenv import load_dotenv +from scrapegraphai.graphs import SearchGraph +from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info +from langchain_openai import AzureChatOpenAI +from langchain_openai import AzureOpenAIEmbeddings + +# ************************************************ +# Define the configuration for the graph +# ************************************************ + +load_dotenv() + +llm_model_instance = AzureChatOpenAI( + openai_api_version=os.environ["AZURE_OPENAI_API_VERSION"], + azure_deployment=os.environ["AZURE_OPENAI_CHAT_DEPLOYMENT_NAME"] +) + +embedder_model_instance = AzureOpenAIEmbeddings( + azure_deployment=os.environ["AZURE_OPENAI_EMBEDDINGS_DEPLOYMENT_NAME"], + openai_api_version=os.environ["AZURE_OPENAI_API_VERSION"], +) + +# ************************************************ +# Assemble the graph configuration from the model instances +# ************************************************ + +graph_config = { + "llm": {"model_instance": llm_model_instance}, + "embeddings": {"model_instance": embedder_model_instance} +} + +# ************************************************ +# Create the SearchGraph instance and run it +# ************************************************ + +search_graph = SearchGraph( + prompt="List me the best escursions near Trento", + config=graph_config +) + +result = search_graph.run() +print(result) + +# ************************************************ +# Get graph execution info +# ************************************************ + +graph_exec_info = search_graph.get_execution_info() +print(prettify_exec_info(graph_exec_info)) + +# Save to json and csv +convert_to_csv(result, "result") +convert_to_json(result, "result") diff --git a/examples/azure/csv_scraper_azure.py b/examples/azure/csv_scraper_azure.py index 3124498e..d1871952 100644 --- a/examples/azure/csv_scraper_azure.py +++
b/examples/azure/csv_scraper_azure.py @@ -5,8 +5,6 @@ import os from dotenv import load_dotenv import pandas as pd -from langchain_openai import AzureChatOpenAI -from langchain_openai import AzureOpenAIEmbeddings from scrapegraphai.graphs import CSVScraperGraph from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info load_dotenv() @@ -24,25 +22,15 @@ # ************************************************ # Define the configuration for the graph # ************************************************ - -llm_model_instance = AzureChatOpenAI( - openai_api_version=os.environ["AZURE_OPENAI_API_VERSION"], - azure_deployment=os.environ["AZURE_OPENAI_CHAT_DEPLOYMENT_NAME"] -) - -embedder_model_instance = AzureOpenAIEmbeddings( - azure_deployment=os.environ["AZURE_OPENAI_EMBEDDINGS_DEPLOYMENT_NAME"], - openai_api_version=os.environ["AZURE_OPENAI_API_VERSION"], -) - -# ************************************************ -# Create the JSONScraperGraph instance and run it -# ************************************************ - graph_config = { - "llm": {"model_instance": llm_model_instance}, - "embeddings": {"model_instance": embedder_model_instance} + "llm": { + "api_key": os.environ["AZURE_OPENAI_KEY"], + "model": "azure/gpt-3.5-turbo", + }, + "verbose": True, + "headless": False } + # ************************************************ # Create the CSVScraperGraph instance and run it # ************************************************ diff --git a/examples/azure/csv_scraper_graph_multi_azure.py b/examples/azure/csv_scraper_graph_multi_azure.py index c8a29829..e8ce1961 100644 --- a/examples/azure/csv_scraper_graph_multi_azure.py +++ b/examples/azure/csv_scraper_graph_multi_azure.py @@ -6,8 +6,6 @@ from dotenv import load_dotenv import pandas as pd from scrapegraphai.graphs import CSVScraperMultiGraph -from langchain_openai import AzureChatOpenAI -from langchain_openai import AzureOpenAIEmbeddings from scrapegraphai.utils import convert_to_csv, convert_to_json, 
prettify_exec_info load_dotenv() @@ -24,19 +22,15 @@ # ************************************************ # Define the configuration for the graph # ************************************************ -llm_model_instance = AzureChatOpenAI( - openai_api_version=os.environ["AZURE_OPENAI_API_VERSION"], - azure_deployment=os.environ["AZURE_OPENAI_CHAT_DEPLOYMENT_NAME"] -) - -embedder_model_instance = AzureOpenAIEmbeddings( - azure_deployment=os.environ["AZURE_OPENAI_EMBEDDINGS_DEPLOYMENT_NAME"], - openai_api_version=os.environ["AZURE_OPENAI_API_VERSION"], -) graph_config = { - "llm": {"model_instance": llm_model_instance}, - "embeddings": {"model_instance": embedder_model_instance} + "llm": { + "api_key": os.environ["AZURE_OPENAI_KEY"], + "model": "azure/gpt-3.5-turbo", + }, + "verbose": True, + "headless": False } + # ************************************************ # Create the CSVScraperMultiGraph instance and run it # ************************************************ diff --git a/examples/azure/custom_graph_azure.py b/examples/azure/custom_graph_azure.py deleted file mode 100644 index 33ac1703..00000000 --- a/examples/azure/custom_graph_azure.py +++ /dev/null @@ -1,117 +0,0 @@ -""" -Example of custom graph using existing nodes -""" - -import os -from dotenv import load_dotenv -from langchain_openai import OpenAIEmbeddings -from langchain_openai import AzureChatOpenAI -from langchain_openai import AzureOpenAIEmbeddings -from scrapegraphai.graphs import BaseGraph -from scrapegraphai.nodes import FetchNode, ParseNode, RAGNode, GenerateAnswerNode, RobotsNode -load_dotenv() - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -llm_model_instance = AzureChatOpenAI( - openai_api_version=os.environ["AZURE_OPENAI_API_VERSION"], - 
azure_deployment=os.environ["AZURE_OPENAI_CHAT_DEPLOYMENT_NAME"] -) - -embedder_model_instance = AzureOpenAIEmbeddings( - azure_deployment=os.environ["AZURE_OPENAI_EMBEDDINGS_DEPLOYMENT_NAME"], - openai_api_version=os.environ["AZURE_OPENAI_API_VERSION"], -) - -# ************************************************ -# Create the JSONScraperGraph instance and run it -# ************************************************ - -graph_config = { - "llm": {"model_instance": llm_model_instance}, - "embeddings": {"model_instance": embedder_model_instance} -} -# define the nodes for the graph -robot_node = RobotsNode( - input="url", - output=["is_scrapable"], - node_config={ - "llm_model": llm_model_instance, - "force_scraping": True, - "verbose": True, - } -) - -fetch_node = FetchNode( - input="url | local_dir", - output=["doc", "link_urls", "img_urls"], - node_config={ - "verbose": True, - "headless": True, - } -) -parse_node = ParseNode( - input="doc", - output=["parsed_doc"], - node_config={ - "chunk_size": 4096, - "verbose": True, - } -) -rag_node = RAGNode( - input="user_prompt & (parsed_doc | doc)", - output=["relevant_chunks"], - node_config={ - "llm_model": llm_model_instance, - "embedder_model": embedder_model_instance, - "verbose": True, - } -) -generate_answer_node = GenerateAnswerNode( - input="user_prompt & (relevant_chunks | parsed_doc | doc)", - output=["answer"], - node_config={ - "llm_model": llm_model_instance, - "verbose": True, - } -) - -# ************************************************ -# Create the graph by defining the connections -# ************************************************ - -graph = BaseGraph( - nodes=[ - robot_node, - fetch_node, - parse_node, - rag_node, - generate_answer_node, - ], - edges=[ - (robot_node, fetch_node), - (fetch_node, parse_node), - (parse_node, rag_node), - (rag_node, generate_answer_node) - ], - entry_point=robot_node -) - -# ************************************************ -# Execute the graph -# 
************************************************ - -result, execution_info = graph.execute({ - "user_prompt": "Describe the content", - "url": "https://example.com/" -}) - -# get the answer from the result -result = result.get("answer", "No answer found.") -print(result) diff --git a/examples/azure/json_scraper_azure.py b/examples/azure/json_scraper_azure.py index 5e634862..522e20f7 100644 --- a/examples/azure/json_scraper_azure.py +++ b/examples/azure/json_scraper_azure.py @@ -4,23 +4,11 @@ import os from dotenv import load_dotenv -from langchain_openai import AzureChatOpenAI -from langchain_openai import AzureOpenAIEmbeddings from scrapegraphai.graphs import JSONScraperGraph from scrapegraphai.utils import prettify_exec_info - -# required environment variable in .env -# AZURE_OPENAI_ENDPOINT -# AZURE_OPENAI_CHAT_DEPLOYMENT_NAME -# MODEL_NAME -# AZURE_OPENAI_API_KEY -# OPENAI_API_TYPE -# AZURE_OPENAI_API_VERSION -# AZURE_OPENAI_EMBEDDINGS_DEPLOYMENT_NAME load_dotenv() - FILE_NAME = "inputs/example.json" curr_dir = os.path.dirname(os.path.realpath(__file__)) file_path = os.path.join(curr_dir, FILE_NAME) @@ -32,23 +20,13 @@ # Initialize the model instances # ************************************************ -llm_model_instance = AzureChatOpenAI( - openai_api_version=os.environ["AZURE_OPENAI_API_VERSION"], - azure_deployment=os.environ["AZURE_OPENAI_CHAT_DEPLOYMENT_NAME"] -) - -embedder_model_instance = AzureOpenAIEmbeddings( - azure_deployment=os.environ["AZURE_OPENAI_EMBEDDINGS_DEPLOYMENT_NAME"], - openai_api_version=os.environ["AZURE_OPENAI_API_VERSION"], -) - -# ************************************************ -# Create the JSONScraperGraph instance and run it -# ************************************************ - graph_config = { - "llm": {"model_instance": llm_model_instance}, - "embeddings": {"model_instance": embedder_model_instance} + "llm": { + "api_key": os.environ["AZURE_OPENAI_KEY"], + "model": "azure/gpt-3.5-turbo", + }, + "verbose": True, + "headless": 
False } smart_scraper_graph = JSONScraperGraph( diff --git a/examples/azure/json_scraper_multi_azure.py b/examples/azure/json_scraper_multi_azure.py index c6295328..66d7a4bc 100644 --- a/examples/azure/json_scraper_multi_azure.py +++ b/examples/azure/json_scraper_multi_azure.py @@ -2,24 +2,22 @@ Module for showing how JSONScraperMultiGraph multi works """ import os +from dotenv import load_dotenv import json -from langchain_openai import AzureChatOpenAI -from langchain_openai import AzureOpenAIEmbeddings from scrapegraphai.graphs import JSONScraperMultiGraph -llm_model_instance = AzureChatOpenAI( - openai_api_version=os.environ["AZURE_OPENAI_API_VERSION"], - azure_deployment=os.environ["AZURE_OPENAI_CHAT_DEPLOYMENT_NAME"] -) +load_dotenv() + -embedder_model_instance = AzureOpenAIEmbeddings( - azure_deployment=os.environ["AZURE_OPENAI_EMBEDDINGS_DEPLOYMENT_NAME"], - openai_api_version=os.environ["AZURE_OPENAI_API_VERSION"], -) graph_config = { - "llm": {"model_instance": llm_model_instance}, - "embeddings": {"model_instance": embedder_model_instance} + "llm": { + "api_key": os.environ["AZURE_OPENAI_KEY"], + "model": "azure/gpt-3.5-turbo", + }, + "verbose": True, + "headless": False } + FILE_NAME = "inputs/example.json" curr_dir = os.path.dirname(os.path.realpath(__file__)) file_path = os.path.join(curr_dir, FILE_NAME) diff --git a/examples/azure/pdf_scraper_azure.py b/examples/azure/pdf_scraper_azure.py index f64712ec..01f558ae 100644 --- a/examples/azure/pdf_scraper_azure.py +++ b/examples/azure/pdf_scraper_azure.py @@ -1,7 +1,5 @@ import os, json from dotenv import load_dotenv -from langchain_openai import AzureChatOpenAI -from langchain_openai import AzureOpenAIEmbeddings from scrapegraphai.graphs import PDFScraperGraph load_dotenv() @@ -9,18 +7,13 @@ # ************************************************ # Define the configuration for the graph # ************************************************ -llm_model_instance = AzureChatOpenAI( - 
openai_api_version=os.environ["AZURE_OPENAI_API_VERSION"], - azure_deployment=os.environ["AZURE_OPENAI_CHAT_DEPLOYMENT_NAME"] -) - -embedder_model_instance = AzureOpenAIEmbeddings( - azure_deployment=os.environ["AZURE_OPENAI_EMBEDDINGS_DEPLOYMENT_NAME"], - openai_api_version=os.environ["AZURE_OPENAI_API_VERSION"], -) graph_config = { - "llm": {"model_instance": llm_model_instance}, - "embeddings": {"model_instance": embedder_model_instance} + "llm": { + "api_key": os.environ["AZURE_OPENAI_KEY"], + "model": "azure/gpt-3.5-turbo", + }, + "verbose": True, + "headless": False } source = """ diff --git a/examples/azure/scrape_plain_text_azure.py b/examples/azure/scrape_plain_text_azure.py index df8cab79..04d8587f 100644 --- a/examples/azure/scrape_plain_text_azure.py +++ b/examples/azure/scrape_plain_text_azure.py @@ -5,8 +5,6 @@ import os from dotenv import load_dotenv from scrapegraphai.graphs import SmartScraperGraph -from langchain_openai import AzureChatOpenAI -from langchain_openai import AzureOpenAIEmbeddings from scrapegraphai.utils import prettify_exec_info load_dotenv() @@ -27,23 +25,13 @@ # Define the configuration for the graph # ************************************************ -llm_model_instance = AzureChatOpenAI( - openai_api_version=os.environ["AZURE_OPENAI_API_VERSION"], - azure_deployment=os.environ["AZURE_OPENAI_CHAT_DEPLOYMENT_NAME"] -) - -embedder_model_instance = AzureOpenAIEmbeddings( - azure_deployment=os.environ["AZURE_OPENAI_EMBEDDINGS_DEPLOYMENT_NAME"], - openai_api_version=os.environ["AZURE_OPENAI_API_VERSION"], -) - -# ************************************************ -# Create the JSONScraperGraph instance and run it -# ************************************************ - graph_config = { - "llm": {"model_instance": llm_model_instance}, - "embeddings": {"model_instance": embedder_model_instance} + "llm": { + "api_key": os.environ["AZURE_OPENAI_KEY"], + "model": "azure/gpt-3.5-turbo", + }, + "verbose": True, + "headless": False } # 
************************************************ diff --git a/examples/azure/script_generator_azure.py b/examples/azure/script_generator_azure.py index 17135f07..8c9fd456 100644 --- a/examples/azure/script_generator_azure.py +++ b/examples/azure/script_generator_azure.py @@ -5,8 +5,6 @@ import os from dotenv import load_dotenv from scrapegraphai.graphs import ScriptCreatorGraph -from langchain_openai import AzureChatOpenAI -from langchain_openai import AzureOpenAIEmbeddings from scrapegraphai.utils import prettify_exec_info load_dotenv() @@ -14,19 +12,14 @@ # ************************************************ # Define the configuration for the graph # ************************************************ -llm_model_instance = AzureChatOpenAI( - openai_api_version=os.environ["AZURE_OPENAI_API_VERSION"], - azure_deployment=os.environ["AZURE_OPENAI_CHAT_DEPLOYMENT_NAME"] -) - -embedder_model_instance = AzureOpenAIEmbeddings( - azure_deployment=os.environ["AZURE_OPENAI_EMBEDDINGS_DEPLOYMENT_NAME"], - openai_api_version=os.environ["AZURE_OPENAI_API_VERSION"], -) graph_config = { - "llm": {"model_instance": llm_model_instance}, - "embeddings": {"model_instance": embedder_model_instance}, - "library": "beautifulsoup" + "llm": { + "api_key": os.environ["AZURE_OPENAI_KEY"], + "model": "azure/gpt-3.5-turbo", + }, + "verbose": True, + "headless": False, + "library": "beautifulsoup" } # ************************************************ diff --git a/examples/azure/script_multi_generator_azure.py b/examples/azure/script_multi_generator_azure.py index 389eac03..a3f69fda 100644 --- a/examples/azure/script_multi_generator_azure.py +++ b/examples/azure/script_multi_generator_azure.py @@ -6,30 +6,23 @@ from dotenv import load_dotenv from scrapegraphai.graphs import ScriptCreatorMultiGraph from scrapegraphai.utils import prettify_exec_info -from langchain_openai import AzureChatOpenAI -from langchain_openai import AzureOpenAIEmbeddings load_dotenv() # ************************************************ # Define the
configuration for the graph # ************************************************ -llm_model_instance = AzureChatOpenAI( - openai_api_version=os.environ["AZURE_OPENAI_API_VERSION"], - azure_deployment=os.environ["AZURE_OPENAI_CHAT_DEPLOYMENT_NAME"] -) -embedder_model_instance = AzureOpenAIEmbeddings( - azure_deployment=os.environ["AZURE_OPENAI_EMBEDDINGS_DEPLOYMENT_NAME"], - openai_api_version=os.environ["AZURE_OPENAI_API_VERSION"], -) graph_config = { - "llm": {"model_instance": llm_model_instance}, - "embeddings": {"model_instance": embedder_model_instance}, - "library": "beautifulsoup" + "llm": { + "api_key": os.environ["AZURE_OPENAI_KEY"], + "model": "azure/gpt-3.5-turbo", + }, + "verbose": True, + "headless": False, + "library": "beautifulsoup" } - # ************************************************ # Create the ScriptCreatorGraph instance and run it # ************************************************ diff --git a/examples/azure/search_graph_azure.py b/examples/azure/search_graph_azure.py index fe8efb31..7725e482 100644 --- a/examples/azure/search_graph_azure.py +++ b/examples/azure/search_graph_azure.py @@ -4,8 +4,6 @@ import os from dotenv import load_dotenv -from langchain_openai import AzureChatOpenAI -from langchain_openai import AzureOpenAIEmbeddings from scrapegraphai.graphs import SearchGraph from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info load_dotenv() @@ -21,23 +19,13 @@ # Initialize the model instances # ************************************************ -llm_model_instance = AzureChatOpenAI( - openai_api_version=os.environ["AZURE_OPENAI_API_VERSION"], - azure_deployment=os.environ["AZURE_OPENAI_CHAT_DEPLOYMENT_NAME"] -) - -embedder_model_instance = AzureOpenAIEmbeddings( - azure_deployment=os.environ["AZURE_OPENAI_EMBEDDINGS_DEPLOYMENT_NAME"], - openai_api_version=os.environ["AZURE_OPENAI_API_VERSION"], -) - -# ************************************************ -# Create the JSONScraperGraph instance and run it -#
************************************************ - graph_config = { - "llm": {"model_instance": llm_model_instance}, - "embeddings": {"model_instance": embedder_model_instance} + "llm": { + "api_key": os.environ["AZURE_OPENAI_KEY"], + "model": "azure/gpt-3.5-turbo", + }, + "verbose": True, + "headless": False } # ************************************************ diff --git a/examples/azure/search_graph_schema_azure.py b/examples/azure/search_graph_schema_azure.py index f435b547..aa6bf346 100644 --- a/examples/azure/search_graph_schema_azure.py +++ b/examples/azure/search_graph_schema_azure.py @@ -11,8 +11,6 @@ from pydantic import BaseModel, Field from typing import List -from langchain_openai import AzureChatOpenAI -from langchain_openai import AzureOpenAIEmbeddings # ************************************************ # Define the output schema for the graph @@ -29,24 +27,13 @@ class Dishes(BaseModel): # Define the configuration for the graph # ************************************************ - -llm_model_instance = AzureChatOpenAI( - openai_api_version=os.environ["AZURE_OPENAI_API_VERSION"], - azure_deployment=os.environ["AZURE_OPENAI_CHAT_DEPLOYMENT_NAME"] -) - -embedder_model_instance = AzureOpenAIEmbeddings( - azure_deployment=os.environ["AZURE_OPENAI_EMBEDDINGS_DEPLOYMENT_NAME"], - openai_api_version=os.environ["AZURE_OPENAI_API_VERSION"], -) - -# ************************************************ -# Create the SmartScraperGraph instance and run it -# ************************************************ - graph_config = { - "llm": {"model_instance": llm_model_instance}, - "embeddings": {"model_instance": embedder_model_instance} + "llm": { + "api_key": os.environ["AZURE_OPENAI_KEY"], + "model": "azure/gpt-3.5-turbo", + }, + "verbose": True, + "headless": False } # ************************************************ diff --git a/examples/azure/search_link_graph_azure.py b/examples/azure/search_link_graph_azure.py new file mode 100644 index 00000000..54b26dec --- /dev/null 
+++ b/examples/azure/search_link_graph_azure.py @@ -0,0 +1,45 @@ +""" +Example of Search Graph +""" +import os +from dotenv import load_dotenv +from scrapegraphai.graphs import SearchGraph +from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info + +# ************************************************ +# Define the configuration for the graph +# ************************************************ + +load_dotenv() + +graph_config = { + "llm": { + "api_key": os.environ["AZURE_OPENAI_KEY"], + "model": "azure/gpt-3.5-turbo", + }, + "verbose": True, + "headless": False +} + +# ************************************************ +# Create the SearchGraph instance and run it +# ************************************************ + +search_graph = SearchGraph( + prompt="List me the best escursions near Trento", + config=graph_config +) + +result = search_graph.run() +print(result) + +# ************************************************ +# Get graph execution info +# ************************************************ + +graph_exec_info = search_graph.get_execution_info() +print(prettify_exec_info(graph_exec_info)) + +# Save to json and csv +convert_to_csv(result, "result") +convert_to_json(result, "result") diff --git a/examples/azure/smart_scraper_azure.py b/examples/azure/smart_scraper_azure.py index 53d4a197..b061a340 100644 --- a/examples/azure/smart_scraper_azure.py +++ b/examples/azure/smart_scraper_azure.py @@ -4,8 +4,6 @@ import os from dotenv import load_dotenv -from langchain_openai import AzureChatOpenAI -from langchain_openai import AzureOpenAIEmbeddings from scrapegraphai.graphs import SmartScraperGraph from scrapegraphai.utils import prettify_exec_info @@ -25,23 +23,13 @@ # Initialize the model instances # ************************************************ -llm_model_instance = AzureChatOpenAI( - openai_api_version=os.environ["AZURE_OPENAI_API_VERSION"], - azure_deployment=os.environ["AZURE_OPENAI_CHAT_DEPLOYMENT_NAME"] -) - -embedder_model_instance 
= AzureOpenAIEmbeddings( - azure_deployment=os.environ["AZURE_OPENAI_EMBEDDINGS_DEPLOYMENT_NAME"], - openai_api_version=os.environ["AZURE_OPENAI_API_VERSION"], -) - -# ************************************************ -# Create the SmartScraperGraph instance and run it -# ************************************************ - graph_config = { - "llm": {"model_instance": llm_model_instance}, - "embeddings": {"model_instance": embedder_model_instance} + "llm": { + "api_key": os.environ["AZURE_OPENAI_KEY"], + "model": "azure/gpt-3.5-turbo", + }, + "verbose": True, + "headless": False } smart_scraper_graph = SmartScraperGraph( diff --git a/examples/azure/smart_scraper_multi_azure.py b/examples/azure/smart_scraper_multi_azure.py index 91020d1a..971e4333 100644 --- a/examples/azure/smart_scraper_multi_azure.py +++ b/examples/azure/smart_scraper_multi_azure.py @@ -4,8 +4,6 @@ import os, json from dotenv import load_dotenv -from langchain_openai import AzureChatOpenAI -from langchain_openai import AzureOpenAIEmbeddings from scrapegraphai.graphs import SmartScraperMultiGraph load_dotenv() @@ -13,24 +11,15 @@ # ************************************************ # Define the configuration for the graph # ************************************************ -llm_model_instance = AzureChatOpenAI( - openai_api_version=os.environ["AZURE_OPENAI_API_VERSION"], - azure_deployment=os.environ["AZURE_OPENAI_CHAT_DEPLOYMENT_NAME"] -) - -embedder_model_instance = AzureOpenAIEmbeddings( - azure_deployment=os.environ["AZURE_OPENAI_EMBEDDINGS_DEPLOYMENT_NAME"], - openai_api_version=os.environ["AZURE_OPENAI_API_VERSION"], -) - -# ************************************************ -# Create the SmartScraperGraph instance and run it -# ************************************************ - graph_config = { - "llm": {"model_instance": llm_model_instance}, - "embeddings": {"model_instance": embedder_model_instance} + "llm": { + "api_key": os.environ["AZURE_OPENAI_KEY"], + "model": "azure/gpt-3.5-turbo", + }, + 
"verbose": True, + "headless": False } + # ******************************************************* # Create the SmartScraperMultiGraph instance and run it # ******************************************************* diff --git a/examples/azure/smart_scraper_schema_azure.py b/examples/azure/smart_scraper_schema_azure.py index 34fbe3d3..6f15253e 100644 --- a/examples/azure/smart_scraper_schema_azure.py +++ b/examples/azure/smart_scraper_schema_azure.py @@ -6,8 +6,6 @@ from typing import List from pydantic import BaseModel, Field from dotenv import load_dotenv -from langchain_openai import AzureChatOpenAI -from langchain_openai import AzureOpenAIEmbeddings from scrapegraphai.graphs import SmartScraperGraph load_dotenv() @@ -27,24 +25,15 @@ class Projects(BaseModel): # Initialize the model instances # ************************************************ -llm_model_instance = AzureChatOpenAI( - openai_api_version=os.environ["AZURE_OPENAI_API_VERSION"], - azure_deployment=os.environ["AZURE_OPENAI_CHAT_DEPLOYMENT_NAME"] -) - -embedder_model_instance = AzureOpenAIEmbeddings( - azure_deployment=os.environ["AZURE_OPENAI_EMBEDDINGS_DEPLOYMENT_NAME"], - openai_api_version=os.environ["AZURE_OPENAI_API_VERSION"], -) - -# ************************************************ -# Create the SmartScraperGraph instance and run it -# ************************************************ - graph_config = { - "llm": {"model_instance": llm_model_instance}, - "embeddings": {"model_instance": embedder_model_instance} + "llm": { + "api_key": os.environ["AZURE_OPENAI_KEY"], + "model": "azure/gpt-3.5-turbo", + }, + "verbose": True, + "headless": False } + # ************************************************ # Create the SmartScraperGraph instance and run it # ************************************************ diff --git a/examples/azure/xml_scraper_azure.py b/examples/azure/xml_scraper_azure.py index 696b8817..6bc010da 100644 --- a/examples/azure/xml_scraper_azure.py +++ b/examples/azure/xml_scraper_azure.py @@ 
-4,20 +4,9 @@ import os from dotenv import load_dotenv -from langchain_openai import AzureChatOpenAI -from langchain_openai import AzureOpenAIEmbeddings from scrapegraphai.graphs import XMLScraperGraph from scrapegraphai.utils import prettify_exec_info - -# required environment variable in .env -# AZURE_OPENAI_ENDPOINT -# AZURE_OPENAI_CHAT_DEPLOYMENT_NAME -# MODEL_NAME -# AZURE_OPENAI_API_KEY -# OPENAI_API_TYPE -# AZURE_OPENAI_API_VERSION -# AZURE_OPENAI_EMBEDDINGS_DEPLOYMENT_NAME load_dotenv() FILE_NAME = "inputs/books.xml" @@ -32,23 +21,13 @@ # Initialize the model instances # ************************************************ -llm_model_instance = AzureChatOpenAI( - openai_api_version=os.environ["AZURE_OPENAI_API_VERSION"], - azure_deployment=os.environ["AZURE_OPENAI_CHAT_DEPLOYMENT_NAME"] -) - -embedder_model_instance = AzureOpenAIEmbeddings( - azure_deployment=os.environ["AZURE_OPENAI_EMBEDDINGS_DEPLOYMENT_NAME"], - openai_api_version=os.environ["AZURE_OPENAI_API_VERSION"], -) - -# ************************************************ -# Create the XMLScraperGraph instance and run it -# ************************************************ - graph_config = { - "llm": {"model_instance": llm_model_instance}, - "embeddings": {"model_instance": embedder_model_instance} + "llm": { + "api_key": os.environ["AZURE_OPENAI_KEY"], + "model": "azure/gpt-3.5-turbo", + }, + "verbose": True, + "headless": False } smart_scraper_graph = XMLScraperGraph( diff --git a/examples/azure/xml_scraper_graph_multi_azure.py b/examples/azure/xml_scraper_graph_multi_azure.py index e0d55bd4..c7a73ed7 100644 --- a/examples/azure/xml_scraper_graph_multi_azure.py +++ b/examples/azure/xml_scraper_graph_multi_azure.py @@ -5,8 +5,6 @@ import os from dotenv import load_dotenv from scrapegraphai.graphs import XMLScraperMultiGraph -from langchain_openai import AzureChatOpenAI -from langchain_openai import AzureOpenAIEmbeddings from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info 
load_dotenv() @@ -24,19 +22,13 @@ # ************************************************ # Define the configuration for the graph # ************************************************ - -llm_model_instance = AzureChatOpenAI( - openai_api_version=os.environ["AZURE_OPENAI_API_VERSION"], - azure_deployment=os.environ["AZURE_OPENAI_CHAT_DEPLOYMENT_NAME"] -) - -embedder_model_instance = AzureOpenAIEmbeddings( - azure_deployment=os.environ["AZURE_OPENAI_EMBEDDINGS_DEPLOYMENT_NAME"], - openai_api_version=os.environ["AZURE_OPENAI_API_VERSION"], -) graph_config = { - "llm": {"model_instance": llm_model_instance}, - "embeddings": {"model_instance": embedder_model_instance} + "llm": { + "api_key": os.environ["AZURE_OPENAI_KEY"], + "model": "azure/gpt-3.5-turbo", + }, + "verbose": True, + "headless": False } # ************************************************ diff --git a/examples/bedrock/search_link_graph_bedrock.py b/examples/bedrock/search_link_graph_bedrock.py new file mode 100644 index 00000000..116dea01 --- /dev/null +++ b/examples/bedrock/search_link_graph_bedrock.py @@ -0,0 +1,45 @@ +""" +Example of Search Graph +""" +import os +from dotenv import load_dotenv +from scrapegraphai.graphs import SearchGraph +from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info + +# ************************************************ +# Define the configuration for the graph +# ************************************************ + +graph_config = { + "llm": { + "client": "client_name", + "model": "bedrock/anthropic.claude-3-sonnet-20240229-v1:0", + "temperature": 0.0 + }, + "embeddings": { + "model": "bedrock/cohere.embed-multilingual-v3" + } +} + +# ************************************************ +# Create the SearchGraph instance and run it +# ************************************************ + +search_graph = SearchGraph( + prompt="List me the best escursions near Trento", + config=graph_config +) + +result = search_graph.run() +print(result) + +# 
************************************************ +# Get graph execution info +# ************************************************ + +graph_exec_info = search_graph.get_execution_info() +print(prettify_exec_info(graph_exec_info)) + +# Save to json and csv +convert_to_csv(result, "result") +convert_to_json(result, "result") diff --git a/examples/deepseek/search_link_graph_deepseek.py b/examples/deepseek/search_link_graph_deepseek.py new file mode 100644 index 00000000..30e4a9b3 --- /dev/null +++ b/examples/deepseek/search_link_graph_deepseek.py @@ -0,0 +1,52 @@ +""" +Example of Search Graph +""" +import os +from dotenv import load_dotenv +from scrapegraphai.graphs import SearchGraph +from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info + +# ************************************************ +# Define the configuration for the graph +# ************************************************ + +load_dotenv() + +deepseek_key = os.getenv("DEEPSEEK_APIKEY") + +graph_config = { + "llm": { + "model": "deepseek-chat", + "openai_api_key": deepseek_key, + "openai_api_base": 'https://api.deepseek.com/v1', + }, + "embeddings": { + "model": "ollama/nomic-embed-text", + "temperature": 0, + # "base_url": "http://localhost:11434", # set ollama URL arbitrarily + }, + "verbose": True, +} + +# ************************************************ +# Create the SearchGraph instance and run it +# ************************************************ + +search_graph = SearchGraph( + prompt="List me the best escursions near Trento", + config=graph_config +) + +result = search_graph.run() +print(result) + +# ************************************************ +# Get graph execution info +# ************************************************ + +graph_exec_info = search_graph.get_execution_info() +print(prettify_exec_info(graph_exec_info)) + +# Save to json and csv +convert_to_csv(result, "result") +convert_to_json(result, "result") diff --git a/examples/ernie/search_graph_ernie.py 
b/examples/ernie/search_graph_ernie.py index 22802c6e..c04d9f9b 100644 --- a/examples/ernie/search_graph_ernie.py +++ b/examples/ernie/search_graph_ernie.py @@ -12,15 +12,18 @@ # Define the configuration for the graph # ************************************************ -openai_key = os.getenv("OPENAI_APIKEY") - graph_config = { "llm": { - "api_key": openai_key, - "model": "gpt-3.5-turbo", - }, - "max_results": 2, - "verbose": True, + "model": "ernie-bot-turbo", + "ernie_client_id": "", + "ernie_client_secret": "", + "temperature": 0.1 + }, + "embeddings": { + "model": "ollama/nomic-embed-text", + "temperature": 0, + "base_url": "http://localhost:11434"}, + "library": "beautifulsoup" } # ************************************************ diff --git a/examples/ernie/search_link_graph_ernie.py b/examples/ernie/search_link_graph_ernie.py new file mode 100644 index 00000000..466b230c --- /dev/null +++ b/examples/ernie/search_link_graph_ernie.py @@ -0,0 +1,46 @@ +""" +Example of Search Graph +""" +from scrapegraphai.graphs import SearchGraph +from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info + +# ************************************************ +# Define the configuration for the graph +# ************************************************ + +graph_config = { + "llm": { + "model": "ernie-bot-turbo", + "ernie_client_id": "", + "ernie_client_secret": "", + "temperature": 0.1 + }, + "embeddings": { + "model": "ollama/nomic-embed-text", + "temperature": 0, + "base_url": "http://localhost:11434"}, + "library": "beautifulsoup" +} + +# ************************************************ +# Create the SearchGraph instance and run it +# ************************************************ + +search_graph = SearchGraph( + prompt="List me the best escursions near Trento", + config=graph_config +) + +result = search_graph.run() +print(result) + +# ************************************************ +# Get graph execution info +# 
************************************************ + +graph_exec_info = search_graph.get_execution_info() +print(prettify_exec_info(graph_exec_info)) + +# Save to json and csv +convert_to_csv(result, "result") +convert_to_json(result, "result") diff --git a/examples/fireworks/search_link_graph_fireworks.py b/examples/fireworks/search_link_graph_fireworks.py new file mode 100644 index 00000000..a1d3a979 --- /dev/null +++ b/examples/fireworks/search_link_graph_fireworks.py @@ -0,0 +1,52 @@ +""" +Example of Search Graph +""" +import os +from dotenv import load_dotenv +from scrapegraphai.graphs import SearchGraph +from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info + +# ************************************************ +# Define the configuration for the graph +# ************************************************ + +load_dotenv() + +fireworks_api_key = os.getenv("FIREWORKS_APIKEY") + +graph_config = { + "llm": { + "api_key": fireworks_api_key, + "model": "fireworks/accounts/fireworks/models/mixtral-8x7b-instruct" + }, + "embeddings": { + "model": "ollama/nomic-embed-text", + "temperature": 0, + # "base_url": "http://localhost:11434", # set ollama URL arbitrarily + }, + "max_results": 2, + "verbose": True, + "headless": False, +} +# ************************************************ +# Create the SearchGraph instance and run it +# ************************************************ + +search_graph = SearchGraph( + prompt="List me the best escursions near Trento", + config=graph_config +) + +result = search_graph.run() +print(result) + +# ************************************************ +# Get graph execution info +# ************************************************ + +graph_exec_info = search_graph.get_execution_info() +print(prettify_exec_info(graph_exec_info)) + +# Save to json and csv +convert_to_csv(result, "result") +convert_to_json(result, "result") diff --git a/examples/gemini/search_link_graph_gemini.py 
b/examples/gemini/search_link_graph_gemini.py new file mode 100644 index 00000000..937038bd --- /dev/null +++ b/examples/gemini/search_link_graph_gemini.py @@ -0,0 +1,44 @@ +""" +Example of Search Graph +""" +import os +from dotenv import load_dotenv +from scrapegraphai.graphs import SearchGraph +from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info + +# ************************************************ +# Define the configuration for the graph +# ************************************************ + +load_dotenv() + +gemini_key = os.getenv("GOOGLE_APIKEY") + +graph_config = { + "llm": { + "api_key": gemini_key, + "model": "gemini-pro", + }, +} +# ************************************************ +# Create the SearchGraph instance and run it +# ************************************************ + +search_graph = SearchGraph( + prompt="List me the best escursions near Trento", + config=graph_config +) + +result = search_graph.run() +print(result) + +# ************************************************ +# Get graph execution info +# ************************************************ + +graph_exec_info = search_graph.get_execution_info() +print(prettify_exec_info(graph_exec_info)) + +# Save to json and csv +convert_to_csv(result, "result") +convert_to_json(result, "result") diff --git a/examples/groq/search_link_graph_groq.py b/examples/groq/search_link_graph_groq.py new file mode 100644 index 00000000..f940c2a4 --- /dev/null +++ b/examples/groq/search_link_graph_groq.py @@ -0,0 +1,52 @@ +""" +Example of Search Graph +""" +import os +from dotenv import load_dotenv +from scrapegraphai.graphs import SearchGraph +from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info + +# ************************************************ +# Define the configuration for the graph +# ************************************************ + +load_dotenv() + +groq_key = os.getenv("GROQ_APIKEY") + +graph_config = { + "llm": { + "model": 
"groq/gemma-7b-it", + "api_key": groq_key, + "temperature": 0 + }, + "embeddings": { + "model": "ollama/nomic-embed-text", + "temperature": 0, + # "base_url": "http://localhost:11434", # set ollama URL arbitrarily + }, + "headless": False +} + +# ************************************************ +# Create the SearchGraph instance and run it +# ************************************************ + +search_graph = SearchGraph( + prompt="List me the best escursions near Trento", + config=graph_config +) + +result = search_graph.run() +print(result) + +# ************************************************ +# Get graph execution info +# ************************************************ + +graph_exec_info = search_graph.get_execution_info() +print(prettify_exec_info(graph_exec_info)) + +# Save to json and csv +convert_to_csv(result, "result") +convert_to_json(result, "result") diff --git a/examples/groq/smart_scraper_groq.py b/examples/groq/smart_scraper_groq.py index c1a5d319..f828cdec 100644 --- a/examples/groq/smart_scraper_groq.py +++ b/examples/groq/smart_scraper_groq.py @@ -9,7 +9,6 @@ load_dotenv() - # ************************************************ # Define the configuration for the graph # ************************************************ diff --git a/examples/huggingfacehub/search_link_graph_huggingfacehub.py b/examples/huggingfacehub/search_link_graph_huggingfacehub.py new file mode 100644 index 00000000..a49fb3b9 --- /dev/null +++ b/examples/huggingfacehub/search_link_graph_huggingfacehub.py @@ -0,0 +1,54 @@ +""" +Example of Search Graph +""" +import os +from scrapegraphai.graphs import SearchGraph +from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info +from langchain_community.llms import HuggingFaceEndpoint +from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings + +# ************************************************ +# Define the configuration for the graph +# ************************************************ +# 
************************************************ + +HUGGINGFACEHUB_API_TOKEN = os.getenv('HUGGINGFACEHUB_API_TOKEN') + +repo_id = "mistralai/Mistral-7B-Instruct-v0.2" + +llm_model_instance = HuggingFaceEndpoint( + repo_id=repo_id, max_length=128, temperature=0.5, token=HUGGINGFACEHUB_API_TOKEN +) + +embedder_model_instance = HuggingFaceInferenceAPIEmbeddings( + api_key=HUGGINGFACEHUB_API_TOKEN, model_name="sentence-transformers/all-MiniLM-l6-v2" +) + +graph_config = { + "llm": {"model_instance": llm_model_instance}, + "embeddings": {"model_instance": embedder_model_instance} +} + + +# ************************************************ +# Create the SearchGraph instance and run it +# ************************************************ + +search_graph = SearchGraph( + prompt="List me the best escursions near Trento", + config=graph_config +) + +result = search_graph.run() +print(result) + +# ************************************************ +# Get graph execution info +# ************************************************ + +graph_exec_info = search_graph.get_execution_info() +print(prettify_exec_info(graph_exec_info)) + +# Save to json and csv +convert_to_csv(result, "result") +convert_to_json(result, "result") diff --git a/examples/local_models/search_link_graph_ollama.py b/examples/local_models/search_link_graph_ollama.py new file mode 100644 index 00000000..5c594270 --- /dev/null +++ b/examples/local_models/search_link_graph_ollama.py @@ -0,0 +1,43 @@ +""" +Basic example of scraping pipeline using SmartScraper +""" +from scrapegraphai.graphs import SearchLinkGraph +from scrapegraphai.utils import prettify_exec_info +# ************************************************ +# Define the configuration for the graph +# ************************************************ + +graph_config = { + "llm": { + "model": "ollama/llama3", + "temperature": 0, + "format": "json", # Ollama needs the format to be specified explicitly + # "base_url": "http://localhost:11434", # set ollama URL 
arbitrarily + }, + "embeddings": { + "model": "ollama/nomic-embed-text", + "temperature": 0, + # "base_url": "http://localhost:11434", # set ollama URL arbitrarily + }, + "verbose": True, + "headless": False +} + +# ************************************************ +# Create the SearchLinkGraph instance and run it +# ************************************************ + +smart_scraper_graph = SearchLinkGraph( + source="https://sport.sky.it/nba?gr=www", + config=graph_config +) + +result = smart_scraper_graph.run() +print(result) + +# ************************************************ +# Get graph execution info +# ************************************************ + +graph_exec_info = smart_scraper_graph.get_execution_info() +print(prettify_exec_info(graph_exec_info)) diff --git a/examples/local_models/smart_scraper_ollama.py b/examples/local_models/smart_scraper_ollama.py index e80413c2..0b3fcbfc 100644 --- a/examples/local_models/smart_scraper_ollama.py +++ b/examples/local_models/smart_scraper_ollama.py @@ -29,7 +29,7 @@ smart_scraper_graph = SmartScraperGraph( prompt="List me all the titles", - source="https://sport.sky.it/nba?gr=www", + source="https://perinim.github.io/projects", config=graph_config ) diff --git a/examples/openai/search_link_graph_openai.py b/examples/openai/search_link_graph_openai.py new file mode 100644 index 00000000..10d10d4c --- /dev/null +++ b/examples/openai/search_link_graph_openai.py @@ -0,0 +1,36 @@ +""" +Basic example of scraping pipeline using SmartScraper +""" +from scrapegraphai.graphs import SearchLinkGraph +from scrapegraphai.utils import prettify_exec_info +# ************************************************ +# Define the configuration for the graph +# ************************************************ + +graph_config = { + "llm": { + "api_key": "s", + "model": "gpt-3.5-turbo", + }, + "verbose": True, + "headless": False, +} + +# ************************************************ +# Create the SearchLinkGraph instance and run it +# 
************************************************ + +smart_scraper_graph = SearchLinkGraph( + source="https://sport.sky.it/nba?gr=www", + config=graph_config +) + +result = smart_scraper_graph.run() +print(result) + +# ************************************************ +# Get graph execution info +# ************************************************ + +graph_exec_info = smart_scraper_graph.get_execution_info() +print(prettify_exec_info(graph_exec_info)) diff --git a/examples/openai/smart_scraper_openai.py b/examples/openai/smart_scraper_openai.py index 513a9b03..b0515770 100644 --- a/examples/openai/smart_scraper_openai.py +++ b/examples/openai/smart_scraper_openai.py @@ -5,7 +5,8 @@ import os, json from scrapegraphai.graphs import SmartScraperGraph from scrapegraphai.utils import prettify_exec_info - +from dotenv import load_dotenv +load_dotenv() # ************************************************ # Define the configuration for the graph @@ -14,7 +15,7 @@ graph_config = { "llm": { - "api_key": "s", + "api_key": os.getenv("OPENAI_API_KEY"), "model": "gpt-3.5-turbo", }, "verbose": True, diff --git a/pyproject.toml b/pyproject.toml index ef211cd4..c42bf33b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -2,7 +2,8 @@ name = "scrapegraphai" -version = "1.9.2" +version = "1.10.0b5" + description = "A web scraping library based on LangChain which uses LLM and direct graph logic to create scraping pipelines." 
@@ -24,7 +25,7 @@ dependencies = [ "beautifulsoup4==4.12.3", "pandas==2.2.2", "python-dotenv==1.0.1", - "tiktoken==0.6.0", + "tiktoken==0.7", "tqdm==4.66.4", "graphviz==0.20.3", "minify-html==0.15.0", @@ -34,7 +35,6 @@ dependencies = [ "undetected-playwright==0.3.0", "semchunk==1.0.1", "html2text==2024.2.26", - "trafilatura==1.10.0", "langchain-fireworks==0.1.3" ] diff --git a/requirements-dev.lock b/requirements-dev.lock index f3d4786c..b0bcaaa0 100644 --- a/requirements-dev.lock +++ b/requirements-dev.lock @@ -41,7 +41,6 @@ attrs==23.2.0 # via jsonschema # via referencing babel==2.15.0 - # via courlan # via sphinx beautifulsoup4==4.12.3 # via furo @@ -63,11 +62,8 @@ certifi==2024.2.2 # via httpcore # via httpx # via requests - # via trafilatura charset-normalizer==3.3.2 - # via htmldate # via requests - # via trafilatura click==8.1.7 # via burr # via streamlit @@ -75,15 +71,11 @@ click==8.1.7 # via uvicorn contourpy==1.2.1 # via matplotlib -courlan==1.2.0 - # via trafilatura cycler==0.12.1 # via matplotlib dataclasses-json==0.6.6 # via langchain # via langchain-community -dateparser==1.2.0 - # via htmldate defusedxml==0.7.1 # via langchain-anthropic dill==0.3.8 @@ -204,8 +196,6 @@ h11==0.14.0 # via uvicorn html2text==2024.2.26 # via scrapegraphai -htmldate==1.8.1 - # via trafilatura httpcore==1.0.5 # via httpx httplib2==0.22.0 @@ -259,8 +249,6 @@ jsonschema==4.22.0 # via altair jsonschema-specifications==2023.12.1 # via jsonschema -justext==3.0.1 - # via trafilatura kiwisolver==1.4.5 # via matplotlib langchain==0.1.15 @@ -302,12 +290,6 @@ loguru==0.7.2 # via burr lxml==5.2.2 # via free-proxy - # via htmldate - # via justext - # via lxml-html-clean - # via trafilatura -lxml-html-clean==0.1.1 - # via lxml markdown-it-py==3.0.0 # via rich markupsafe==2.1.5 @@ -430,9 +412,7 @@ pytest==8.0.0 pytest-mock==3.14.0 python-dateutil==2.9.0.post0 # via botocore - # via dateparser # via google-cloud-bigquery - # via htmldate # via matplotlib # via pandas python-dotenv==1.0.1 
@@ -441,7 +421,6 @@ python-dotenv==1.0.1 python-multipart==0.0.9 # via fastapi pytz==2024.1 - # via dateparser # via pandas pyyaml==6.0.1 # via huggingface-hub @@ -453,7 +432,6 @@ referencing==0.35.1 # via jsonschema # via jsonschema-specifications regex==2024.5.15 - # via dateparser # via tiktoken requests==2.32.2 # via burr @@ -531,11 +509,9 @@ tenacity==8.3.0 # via langchain-community # via langchain-core # via streamlit -tiktoken==0.6.0 +tiktoken==0.7.0 # via langchain-openai # via scrapegraphai -tld==0.13 - # via courlan tokenizers==0.19.1 # via anthropic toml==0.10.2 @@ -555,8 +531,6 @@ tqdm==4.66.4 # via openai # via scrapegraphai # via semchunk -trafilatura==1.10.0 - # via scrapegraphai typer==0.12.3 # via fastapi-cli typing-extensions==4.12.0 @@ -586,8 +560,6 @@ typing-inspect==0.9.0 # via sf-hamilton tzdata==2024.1 # via pandas -tzlocal==5.2 - # via dateparser ujson==5.10.0 # via fastapi undetected-playwright==0.3.0 @@ -596,10 +568,7 @@ uritemplate==4.1.1 # via google-api-python-client urllib3==1.26.18 # via botocore - # via courlan - # via htmldate # via requests - # via trafilatura uvicorn==0.29.0 # via burr # via fastapi diff --git a/requirements.lock b/requirements.lock index 21b276eb..7a8bb455 100644 --- a/requirements.lock +++ b/requirements.lock @@ -28,8 +28,6 @@ async-timeout==4.0.3 # via langchain attrs==23.2.0 # via aiohttp -babel==2.15.0 - # via courlan beautifulsoup4==4.12.3 # via google # via scrapegraphai @@ -44,18 +42,11 @@ certifi==2024.2.2 # via httpcore # via httpx # via requests - # via trafilatura charset-normalizer==3.3.2 - # via htmldate # via requests - # via trafilatura -courlan==1.2.0 - # via trafilatura dataclasses-json==0.6.6 # via langchain # via langchain-community -dateparser==1.2.0 - # via htmldate defusedxml==0.7.1 # via langchain-anthropic distro==1.9.0 @@ -150,8 +141,6 @@ h11==0.14.0 # via httpcore html2text==2024.2.26 # via scrapegraphai -htmldate==1.8.1 - # via trafilatura httpcore==1.0.5 # via httpx httplib2==0.22.0 @@ 
-181,8 +170,6 @@ jsonpatch==1.33 # via langchain-core jsonpointer==2.4 # via jsonpatch -justext==3.0.1 - # via trafilatura langchain==0.1.15 # via scrapegraphai langchain-anthropic==0.1.11 @@ -220,12 +207,6 @@ langsmith==0.1.63 # via langchain-core lxml==5.2.2 # via free-proxy - # via htmldate - # via justext - # via lxml-html-clean - # via trafilatura -lxml-html-clean==0.1.1 - # via lxml marshmallow==3.21.2 # via dataclasses-json minify-html==0.15.0 @@ -298,14 +279,11 @@ pyparsing==3.1.2 # via httplib2 python-dateutil==2.9.0.post0 # via botocore - # via dateparser # via google-cloud-bigquery - # via htmldate # via pandas python-dotenv==1.0.1 # via scrapegraphai pytz==2024.1 - # via dateparser # via pandas pyyaml==6.0.1 # via huggingface-hub @@ -313,7 +291,6 @@ pyyaml==6.0.1 # via langchain-community # via langchain-core regex==2024.5.15 - # via dateparser # via tiktoken requests==2.32.2 # via free-proxy @@ -351,11 +328,9 @@ tenacity==8.3.0 # via langchain # via langchain-community # via langchain-core -tiktoken==0.6.0 +tiktoken==0.7.0 # via langchain-openai # via scrapegraphai -tld==0.13 - # via courlan tokenizers==0.19.1 # via anthropic tqdm==4.66.4 @@ -364,8 +339,6 @@ tqdm==4.66.4 # via openai # via scrapegraphai # via semchunk -trafilatura==1.10.0 - # via scrapegraphai typing-extensions==4.12.0 # via anthropic # via anyio @@ -382,17 +355,12 @@ typing-inspect==0.9.0 # via dataclasses-json tzdata==2024.1 # via pandas -tzlocal==5.2 - # via dateparser undetected-playwright==0.3.0 # via scrapegraphai uritemplate==4.1.1 # via google-api-python-client urllib3==1.26.18 # via botocore - # via courlan - # via htmldate # via requests - # via trafilatura yarl==1.9.4 # via aiohttp diff --git a/scrapegraphai/graphs/__init__.py b/scrapegraphai/graphs/__init__.py index b1bf1242..26a0b9e1 100644 --- a/scrapegraphai/graphs/__init__.py +++ b/scrapegraphai/graphs/__init__.py @@ -23,3 +23,4 @@ from .script_creator_multi_graph import ScriptCreatorMultiGraph from 
.markdown_scraper_graph import MDScraperGraph from .markdown_scraper_multi_graph import MDScraperMultiGraph +from .search_link_graph import SearchLinkGraph diff --git a/scrapegraphai/graphs/csv_scraper_graph.py b/scrapegraphai/graphs/csv_scraper_graph.py index ea205bb3..f4efd1fb 100644 --- a/scrapegraphai/graphs/csv_scraper_graph.py +++ b/scrapegraphai/graphs/csv_scraper_graph.py @@ -10,7 +10,6 @@ from ..nodes import ( FetchNode, - RAGNode, GenerateAnswerCSVNode ) @@ -37,14 +36,7 @@ def _create_graph(self): input="csv | csv_dir", output=["doc"], ) - rag_node = RAGNode( - input="user_prompt & doc", - output=["relevant_chunks"], - node_config={ - "llm_model": self.llm_model, - "embedder_model": self.embedder_model, - } - ) + generate_answer_node = GenerateAnswerCSVNode( input="user_prompt & (relevant_chunks | doc)", output=["answer"], @@ -58,12 +50,10 @@ def _create_graph(self): return BaseGraph( nodes=[ fetch_node, - rag_node, generate_answer_node, ], edges=[ - (fetch_node, rag_node), - (rag_node, generate_answer_node) + (fetch_node, generate_answer_node) ], entry_point=fetch_node, graph_name=self.__class__.__name__ diff --git a/scrapegraphai/graphs/json_scraper_graph.py b/scrapegraphai/graphs/json_scraper_graph.py index b85a34dc..fe54ebec 100644 --- a/scrapegraphai/graphs/json_scraper_graph.py +++ b/scrapegraphai/graphs/json_scraper_graph.py @@ -10,7 +10,6 @@ from ..nodes import ( FetchNode, - RAGNode, GenerateAnswerNode ) @@ -62,14 +61,7 @@ def _create_graph(self) -> BaseGraph: input="json | json_dir", output=["doc", "link_urls", "img_urls"], ) - rag_node = RAGNode( - input="user_prompt & (parsed_doc | doc)", - output=["relevant_chunks"], - node_config={ - "llm_model": self.llm_model, - "embedder_model": self.embedder_model - } - ) + generate_answer_node = GenerateAnswerNode( input="user_prompt & (relevant_chunks | parsed_doc | doc)", output=["answer"], @@ -83,12 +75,10 @@ def _create_graph(self) -> BaseGraph: return BaseGraph( nodes=[ fetch_node, - rag_node, 
generate_answer_node, ], edges=[ - (fetch_node, rag_node), - (rag_node, generate_answer_node) + (fetch_node, generate_answer_node) ], entry_point=fetch_node, graph_name=self.__class__.__name__ diff --git a/scrapegraphai/graphs/markdown_scraper_graph.py b/scrapegraphai/graphs/markdown_scraper_graph.py index 66b161dc..c177facd 100644 --- a/scrapegraphai/graphs/markdown_scraper_graph.py +++ b/scrapegraphai/graphs/markdown_scraper_graph.py @@ -3,7 +3,7 @@ from pydantic import BaseModel from .base_graph import BaseGraph from .abstract_graph import AbstractGraph -from ..nodes import FetchNode, ParseNode, RAGNode, GenerateAnswerNode +from ..nodes import FetchNode, ParseNode, GenerateAnswerNode class MDScraperGraph(AbstractGraph): """ @@ -63,14 +63,6 @@ def _create_graph(self) -> BaseGraph: "chunk_size": self.model_token } ) - rag_node = RAGNode( - input="user_prompt & (parsed_doc | doc)", - output=["relevant_chunks"], - node_config={ - "llm_model": self.llm_model, - "embedder_model": self.embedder_model - } - ) generate_answer_node = GenerateAnswerNode( input="user_prompt & (relevant_chunks | parsed_doc | doc)", output=["answer"], @@ -86,13 +78,11 @@ def _create_graph(self) -> BaseGraph: nodes=[ fetch_node, parse_node, - rag_node, generate_answer_node, ], edges=[ (fetch_node, parse_node), - (parse_node, rag_node), - (rag_node, generate_answer_node) + (parse_node, generate_answer_node) ], entry_point=fetch_node, graph_name=self.__class__.__name__ diff --git a/scrapegraphai/graphs/omni_scraper_graph.py b/scrapegraphai/graphs/omni_scraper_graph.py index 7e34dab7..1965dc04 100644 --- a/scrapegraphai/graphs/omni_scraper_graph.py +++ b/scrapegraphai/graphs/omni_scraper_graph.py @@ -12,7 +12,6 @@ FetchNode, ParseNode, ImageToTextNode, - RAGNode, GenerateAnswerOmniNode ) @@ -89,14 +88,7 @@ def _create_graph(self) -> BaseGraph: "max_images": self.max_images } ) - rag_node = RAGNode( - input="user_prompt & (parsed_doc | doc)", - output=["relevant_chunks"], - node_config={ - 
"llm_model": self.llm_model, - "embedder_model": self.embedder_model - } - ) + generate_answer_omni_node = GenerateAnswerOmniNode( input="user_prompt & (relevant_chunks | parsed_doc | doc) & img_desc", output=["answer"], @@ -112,14 +104,12 @@ def _create_graph(self) -> BaseGraph: fetch_node, parse_node, image_to_text_node, - rag_node, generate_answer_omni_node, ], edges=[ (fetch_node, parse_node), (parse_node, image_to_text_node), - (image_to_text_node, rag_node), - (rag_node, generate_answer_omni_node) + (image_to_text_node, generate_answer_omni_node) ], entry_point=fetch_node, graph_name=self.__class__.__name__ @@ -136,4 +126,4 @@ def run(self) -> str: inputs = {"user_prompt": self.prompt, self.input_key: self.source} self.final_state, self.execution_info = self.graph.execute(inputs) - return self.final_state.get("answer", "No answer found.") \ No newline at end of file + return self.final_state.get("answer", "No answer found.") diff --git a/scrapegraphai/graphs/pdf_scraper_graph.py b/scrapegraphai/graphs/pdf_scraper_graph.py index 732b4789..049425d0 100644 --- a/scrapegraphai/graphs/pdf_scraper_graph.py +++ b/scrapegraphai/graphs/pdf_scraper_graph.py @@ -12,7 +12,6 @@ from ..nodes import ( FetchNode, ParseNode, - RAGNode, GenerateAnswerPDFNode ) @@ -76,14 +75,6 @@ def _create_graph(self) -> BaseGraph: } ) - rag_node = RAGNode( - input="user_prompt & (parsed_doc | doc)", - output=["relevant_chunks"], - node_config={ - "llm_model": self.llm_model, - "embedder_model": self.embedder_model - } - ) generate_answer_node_pdf = GenerateAnswerPDFNode( input="user_prompt & (relevant_chunks | doc)", output=["answer"], @@ -98,13 +89,11 @@ def _create_graph(self) -> BaseGraph: nodes=[ fetch_node, parse_node, - rag_node, generate_answer_node_pdf, ], edges=[ (fetch_node, parse_node), - (parse_node, rag_node), - (rag_node, generate_answer_node_pdf) + (parse_node, generate_answer_node_pdf) ], entry_point=fetch_node, graph_name=self.__class__.__name__ diff --git 
a/scrapegraphai/graphs/search_link_graph.py b/scrapegraphai/graphs/search_link_graph.py new file mode 100644 index 00000000..2e23357c --- /dev/null +++ b/scrapegraphai/graphs/search_link_graph.py @@ -0,0 +1,104 @@ +""" SearchLinkGraph Module """ +from typing import Optional +import logging +from pydantic import BaseModel +from .base_graph import BaseGraph +from .abstract_graph import AbstractGraph + + +from ..nodes import ( FetchNode, ParseNode, SearchLinkNode ) + +class SearchLinkGraph(AbstractGraph): + """ + SearchLinkGraph is a scraping pipeline that automates the process of extracting information from web pages using a natural language model to interpret and answer prompts. + + Attributes: + prompt (str): The prompt for the graph. + source (str): The source of the graph. + config (dict): Configuration parameters for the graph. + schema (BaseModel): The schema for the graph output. + llm_model: An instance of a language model client, configured for generating answers. + embedder_model: An instance of an embedding model client, + configured for generating embeddings. + verbose (bool): A flag indicating whether to show print statements during execution. + headless (bool): A flag indicating whether to run the graph in headless mode. + + Args: + source (str): The source of the graph. + config (dict): Configuration parameters for the graph. + schema (BaseModel, optional): The schema for the graph output. Defaults to None. + + Example: + >>> smart_scraper = SearchLinkGraph( + ... "List me all the attractions in Chioggia.", + ... "https://en.wikipedia.org/wiki/Chioggia", + ... {"llm": {"model": "gpt-3.5-turbo"}} + ... 
) + >>> result = smart_scraper.run() + """ + + def __init__(self, source: str, config: dict, schema: Optional[BaseModel] = None): + super().__init__("", config, source, schema) + + self.input_key = "url" if source.startswith("http") else "local_dir" + + def _create_graph(self) -> BaseGraph: + """ + Creates the graph of nodes representing the workflow for web scraping. + + Returns: + BaseGraph: A graph instance representing the web scraping workflow. + """ + + fetch_node = FetchNode( + input="url| local_dir", + output=["doc", "link_urls", "img_urls"], + node_config={ + "llm_model": self.llm_model, + "force": self.config.get("force", False), + "cut": self.config.get("cut", True), + "loader_kwargs": self.config.get("loader_kwargs", {}), + } + ) + parse_node = ParseNode( + input="doc", + output=["parsed_doc"], + node_config={ + "chunk_size": self.model_token + } + ) + search_link_node = SearchLinkNode( + input="doc", + output=["parsed_doc"], + node_config={ + "llm_model": self.llm_model, + "chunk_size": self.model_token + } + ) + + return BaseGraph( + nodes=[ + fetch_node, + parse_node, + search_link_node + ], + edges=[ + (fetch_node, parse_node), + (parse_node, search_link_node) + ], + entry_point=fetch_node, + graph_name=self.__class__.__name__ + ) + + def run(self) -> str: + """ + Executes the scraping process and returns the answer to the prompt. + + Returns: + str: The answer to the prompt. 
+ """ + + inputs = {"user_prompt": self.prompt, self.input_key: self.source} + self.final_state, self.execution_info = self.graph.execute(inputs) + + return self.final_state.get("parsed_doc", "No answer found.") \ No newline at end of file diff --git a/scrapegraphai/graphs/smart_scraper_graph.py b/scrapegraphai/graphs/smart_scraper_graph.py index ba27b60e..cb4777a8 100644 --- a/scrapegraphai/graphs/smart_scraper_graph.py +++ b/scrapegraphai/graphs/smart_scraper_graph.py @@ -11,7 +11,6 @@ from ..nodes import ( FetchNode, ParseNode, - RAGNode, GenerateAnswerNode ) @@ -78,14 +77,7 @@ def _create_graph(self) -> BaseGraph: "chunk_size": self.model_token } ) - rag_node = RAGNode( - input="user_prompt & (parsed_doc | doc)", - output=["relevant_chunks"], - node_config={ - "llm_model": self.llm_model, - "embedder_model": self.embedder_model - } - ) + generate_answer_node = GenerateAnswerNode( input="user_prompt & (relevant_chunks | parsed_doc | doc)", output=["answer"], @@ -100,13 +92,11 @@ def _create_graph(self) -> BaseGraph: nodes=[ fetch_node, parse_node, - rag_node, generate_answer_node, ], edges=[ (fetch_node, parse_node), - (parse_node, rag_node), - (rag_node, generate_answer_node) + (parse_node, generate_answer_node) ], entry_point=fetch_node, graph_name=self.__class__.__name__ diff --git a/scrapegraphai/graphs/xml_scraper_graph.py b/scrapegraphai/graphs/xml_scraper_graph.py index 28c58bb2..24b1ff0d 100644 --- a/scrapegraphai/graphs/xml_scraper_graph.py +++ b/scrapegraphai/graphs/xml_scraper_graph.py @@ -10,7 +10,6 @@ from ..nodes import ( FetchNode, - RAGNode, GenerateAnswerNode ) @@ -64,14 +63,7 @@ def _create_graph(self) -> BaseGraph: input="xml | xml_dir", output=["doc", "link_urls", "img_urls"] ) - rag_node = RAGNode( - input="user_prompt & doc", - output=["relevant_chunks"], - node_config={ - "llm_model": self.llm_model, - "embedder_model": self.embedder_model - } - ) + generate_answer_node = GenerateAnswerNode( input="user_prompt & (relevant_chunks | doc)", 
output=["answer"], @@ -85,12 +77,10 @@ def _create_graph(self) -> BaseGraph: return BaseGraph( nodes=[ fetch_node, - rag_node, generate_answer_node, ], edges=[ - (fetch_node, rag_node), - (rag_node, generate_answer_node) + (fetch_node, generate_answer_node) ], entry_point=fetch_node, graph_name=self.__class__.__name__ diff --git a/scrapegraphai/helpers/models_tokens.py b/scrapegraphai/helpers/models_tokens.py index ab96c46a..df990bf4 100644 --- a/scrapegraphai/helpers/models_tokens.py +++ b/scrapegraphai/helpers/models_tokens.py @@ -17,6 +17,7 @@ "gpt-4-32k-0613": 32768, "gpt-4o": 128000, "gpt-4o-mini":128000, + }, "azure": { "gpt-3.5-turbo-0125": 16385, diff --git a/scrapegraphai/models/bedrock.py b/scrapegraphai/models/bedrock.py index b7cbe288..06299075 100644 --- a/scrapegraphai/models/bedrock.py +++ b/scrapegraphai/models/bedrock.py @@ -1,5 +1,5 @@ """ -bedrock configuration wrapper +Bedrock Module """ from langchain_aws import ChatBedrock diff --git a/scrapegraphai/models/ernie.py b/scrapegraphai/models/ernie.py index 0b4701e1..75e2a261 100644 --- a/scrapegraphai/models/ernie.py +++ b/scrapegraphai/models/ernie.py @@ -1,5 +1,5 @@ """ -Ollama Module +Ernie Module """ from langchain_community.chat_models import ErnieBotChat diff --git a/scrapegraphai/models/oneapi.py b/scrapegraphai/models/oneapi.py index 00dddbf9..54e846d9 100644 --- a/scrapegraphai/models/oneapi.py +++ b/scrapegraphai/models/oneapi.py @@ -1,5 +1,5 @@ """ -OpenAI Module +OneAPI Module """ from langchain_openai import ChatOpenAI diff --git a/scrapegraphai/nodes/base_node.py b/scrapegraphai/nodes/base_node.py index bd95cd28..d1b59500 100644 --- a/scrapegraphai/nodes/base_node.py +++ b/scrapegraphai/nodes/base_node.py @@ -88,7 +88,6 @@ def update_config(self, params: dict, overwrite: bool = False): param (dict): The dictionary to update node_config with. overwrite (bool): Flag indicating if the values of node_config should be overwritten if their value is not None. 
""" - for key, val in params.items(): if hasattr(self, key) and not overwrite: continue diff --git a/scrapegraphai/nodes/conditional_node.py b/scrapegraphai/nodes/conditional_node.py index 894a42f3..0ce424b1 100644 --- a/scrapegraphai/nodes/conditional_node.py +++ b/scrapegraphai/nodes/conditional_node.py @@ -14,7 +14,7 @@ class ConditionalNode(BaseNode): This node type is used to implement branching logic within the graph, allowing for dynamic paths based on the data available in the current state. - It is expected thar exactly two edges are created out of this node. + It is expected that exactly two edges are created out of this node. The first node is chosen for execution if the key exists and has a non-empty value, and the second node is chosen if the key does not exist or is empty. diff --git a/scrapegraphai/nodes/generate_answer_csv_node.py b/scrapegraphai/nodes/generate_answer_csv_node.py index 58adb1d4..43657b50 100644 --- a/scrapegraphai/nodes/generate_answer_csv_node.py +++ b/scrapegraphai/nodes/generate_answer_csv_node.py @@ -125,19 +125,20 @@ def execute(self, state): template=template_no_chunks_csv_prompt, input_variables=["question"], partial_variables={ - "context": chunk.page_content, + "context": chunk, "format_instructions": format_instructions, }, ) chain = prompt | self.llm_model | output_parser answer = chain.invoke({"question": user_prompt}) - else: - prompt = PromptTemplate( + break + + prompt = PromptTemplate( template=template_chunks_csv_prompt, input_variables=["question"], partial_variables={ - "context": chunk.page_content, + "context": chunk, "chunk_id": i + 1, "format_instructions": format_instructions, }, diff --git a/scrapegraphai/nodes/generate_answer_node.py b/scrapegraphai/nodes/generate_answer_node.py index fabb4e66..f764e58b 100644 --- a/scrapegraphai/nodes/generate_answer_node.py +++ b/scrapegraphai/nodes/generate_answer_node.py @@ -115,16 +115,16 @@ def execute(self, state: dict) -> dict: prompt = PromptTemplate( 
template=template_no_chunks_prompt, input_variables=["question"], - partial_variables={"context": chunk.page_content, + partial_variables={"context": chunk, "format_instructions": format_instructions}) chain = prompt | self.llm_model | output_parser answer = chain.invoke({"question": user_prompt}) + break - else: - prompt = PromptTemplate( + prompt = PromptTemplate( template=template_chunks_prompt, input_variables=["question"], - partial_variables={"context": chunk.page_content, + partial_variables={"context": chunk, "chunk_id": i + 1, "format_instructions": format_instructions}) # Dynamically name the chains based on their index diff --git a/scrapegraphai/nodes/generate_answer_omni_node.py b/scrapegraphai/nodes/generate_answer_omni_node.py index e6ea9206..7a030c6f 100644 --- a/scrapegraphai/nodes/generate_answer_omni_node.py +++ b/scrapegraphai/nodes/generate_answer_omni_node.py @@ -110,7 +110,7 @@ def execute(self, state: dict) -> dict: template=template_no_chunk_omni_prompt, input_variables=["question"], partial_variables={ - "context": chunk.page_content, + "context": chunk, "format_instructions": format_instructions, "img_desc": imag_desc, }, @@ -118,12 +118,13 @@ def execute(self, state: dict) -> dict: chain = prompt | self.llm_model | output_parser answer = chain.invoke({"question": user_prompt}) - else: - prompt = PromptTemplate( + break + + prompt = PromptTemplate( template=template_chunks_omni_prompt, input_variables=["question"], partial_variables={ - "context": chunk.page_content, + "context": chunk, "chunk_id": i + 1, "format_instructions": format_instructions, }, diff --git a/scrapegraphai/nodes/generate_answer_pdf_node.py b/scrapegraphai/nodes/generate_answer_pdf_node.py index c6509f34..db6152bc 100644 --- a/scrapegraphai/nodes/generate_answer_pdf_node.py +++ b/scrapegraphai/nodes/generate_answer_pdf_node.py @@ -124,15 +124,15 @@ def execute(self, state): template=template_no_chunks_pdf_prompt, input_variables=["question"], partial_variables={ - 
"context":chunk.page_content, + "context":chunk, "format_instructions": format_instructions, }, ) chain = prompt | self.llm_model | output_parser answer = chain.invoke({"question": user_prompt}) - else: - prompt = PromptTemplate( + break + prompt = PromptTemplate( template=template_chunks_pdf_prompt, input_variables=["question"], partial_variables={ diff --git a/scrapegraphai/nodes/parse_node.py b/scrapegraphai/nodes/parse_node.py index d2d9caad..cdca1b55 100644 --- a/scrapegraphai/nodes/parse_node.py +++ b/scrapegraphai/nodes/parse_node.py @@ -75,7 +75,7 @@ def execute(self, state: dict) -> dict: chunks = chunk(text=docs_transformed.page_content, chunk_size= self.node_config.get("chunk_size", 4096)-250, - token_counter=lambda x: len(x.split()), + token_counter=lambda x: len(x), memoize=False) else: docs_transformed = docs_transformed[0] @@ -83,15 +83,15 @@ def execute(self, state: dict) -> dict: if type(docs_transformed) == Document: chunks = chunk(text=docs_transformed.page_content, chunk_size= self.node_config.get("chunk_size", 4096)-250, - token_counter=lambda x: len(x.split()), + token_counter=lambda x: len(x), memoize=False) else: - + chunks = chunk(text=docs_transformed, chunk_size= self.node_config.get("chunk_size", 4096)-250, - token_counter=lambda x: len(x.split()), + token_counter=lambda x: len(x), memoize=False) - + state.update({self.output[0]: chunks}) - return state + return state \ No newline at end of file diff --git a/scrapegraphai/nodes/search_link_node.py b/scrapegraphai/nodes/search_link_node.py index 2a0c5f18..b3d289d9 100644 --- a/scrapegraphai/nodes/search_link_node.py +++ b/scrapegraphai/nodes/search_link_node.py @@ -4,6 +4,7 @@ # Imports from standard library from typing import List, Optional +import re from tqdm import tqdm # Imports from Langchain @@ -20,7 +21,7 @@ class SearchLinkNode(BaseNode): """ A node that can filter out the relevant links in the webpage content for the user prompt. 
- Node expects the aleready scrapped links on the webpage and hence it is expected + Node expects the already scrapped links on the webpage and hence it is expected that this node be used after the FetchNode. Attributes: @@ -67,39 +68,10 @@ def execute(self, state: dict) -> dict: self.logger.info(f"--- Executing {self.node_name} Node ---") - # Interpret input keys based on the provided input expression - input_keys = self.get_input_keys(state) - user_prompt = state[input_keys[0]] - parsed_content_chunks = state[input_keys[1]] + parsed_content_chunks = state.get("doc") output_parser = JsonOutputParser() - prompt_relevant_links = """ - You are a website scraper and you have just scraped the following content from a website. - Content: {content} - - You are now tasked with identifying all hyper links within the content that are potentially - relevant to the user task: {user_prompt} - - Assume relevance broadly, including any links that might be related or potentially useful - in relation to the task. - - Sort it in order of importance, the first one should be the most important one, the last one - the least important - - Please list only valid URLs and make sure to err on the side of inclusion if it's uncertain - whether the content at the link is directly relevant. - - Output only a list of relevant links in the format: - [ - "link1", - "link2", - "link3", - . - . - . 
- ] - """ relevant_links = [] for i, chunk in enumerate( @@ -109,15 +81,47 @@ def execute(self, state: dict) -> dict: disable=not self.verbose, ) ): - merge_prompt = PromptTemplate( - template=prompt_relevant_links, - input_variables=["content", "user_prompt"], - ) - merge_chain = merge_prompt | self.llm_model | output_parser - # merge_chain = merge_prompt | self.llm_model - answer = merge_chain.invoke( - {"content": chunk.page_content, "user_prompt": user_prompt} - ) - relevant_links += answer + try: + # Primary approach: Regular expression to extract links + links = re.findall(r'https?://[^\s"<>\]]+', str(chunk.page_content)) + + relevant_links += links + except Exception as e: + # Fallback approach: Using the LLM to extract links + self.logger.error(f"Error extracting links: {e}. Falling back to LLM.") + prompt_relevant_links = """ + You are a website scraper and you have just scraped the following content from a website. + Content: {content} + + Assume relevance broadly, including any links that might be related or potentially useful + in relation to the task. + + Sort it in order of importance, the first one should be the most important one, the last one + the least important + + Please list only valid URLs and make sure to err on the side of inclusion if it's uncertain + whether the content at the link is directly relevant. + + Output only a list of relevant links in the format: + [ + "link1", + "link2", + "link3", + . + . + . 
+ ] + """ + + merge_prompt = PromptTemplate( + template=prompt_relevant_links, + input_variables=["content", "user_prompt"], + ) + merge_chain = merge_prompt | self.llm_model | output_parser + answer = merge_chain.invoke( + {"content": chunk.page_content} + ) + relevant_links += answer + state.update({self.output[0]: relevant_links}) return state diff --git a/scrapegraphai/utils/convert_to_md.py b/scrapegraphai/utils/convert_to_md.py index a2ec04db..35123042 100644 --- a/scrapegraphai/utils/convert_to_md.py +++ b/scrapegraphai/utils/convert_to_md.py @@ -2,8 +2,6 @@ convert_to_md modul """ import html2text -from trafilatura import extract - def convert_to_md(html): """ Convert HTML to Markdown. @@ -20,6 +18,6 @@ def convert_to_md(html): 'This is a paragraph.\n\n# This is a heading.' Note: All the styles and links are ignored during the conversion. """ - - return extract(filecontent=html,include_images=True, - include_links=True, include_tables=True, output_format="markdown") + h = html2text.HTML2Text() + h.ignore_links = False + return h.handle(html) diff --git a/scrapegraphai/utils/research_web.py b/scrapegraphai/utils/research_web.py index ac7fc09d..101693e4 100644 --- a/scrapegraphai/utils/research_web.py +++ b/scrapegraphai/utils/research_web.py @@ -1,6 +1,3 @@ -""" -research web module -""" import re from typing import List from langchain_community.tools import DuckDuckGoSearchResults @@ -8,41 +5,39 @@ import requests from bs4 import BeautifulSoup -def search_on_web(query: str, search_engine: str = "Google", max_results: int = 10) -> List[str]: +def search_on_web(query: str, search_engine: str = "Google", max_results: int = 10, port: int = 8080) -> List[str]: """ Searches the web for a given query using specified search engine options. Args: query (str): The search query to find on the internet. - search_engine (str, optional): Specifies the search engine to use, options include 'Google', 'DuckDuckGo', or 'Bing'. Default is 'Google'. 
+ search_engine (str, optional): Specifies the search engine to use, options include 'Google', 'DuckDuckGo', 'Bing', or 'SearXNG'. Default is 'Google'. max_results (int, optional): The maximum number of search results to return. + port (int, optional): The port number to use when searching with 'SearXNG'. Default is 8080. Returns: List[str]: A list of URLs as strings that are the search results. Raises: - ValueError: If the search engine specified is neither 'Google', 'DuckDuckGo', nor 'Bing'. + ValueError: If the search engine specified is not supported. Example: >>> search_on_web("example query", search_engine="Google", max_results=5) ['http://example.com', 'http://example.org', ...] - - This function allows switching between Google, DuckDuckGo, and Bing to perform - internet searches, returning a list of result URLs. """ - + if search_engine.lower() == "google": res = [] for url in google_search(query, stop=max_results): res.append(url) return res - + elif search_engine.lower() == "duckduckgo": research = DuckDuckGoSearchResults(max_results=max_results) res = research.run(query) links = re.findall(r'https?://[^\s,\]]+', res) return links - + elif search_engine.lower() == "bing": headers = { "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36" @@ -51,11 +46,24 @@ def search_on_web(query: str, search_engine: str = "Google", max_results: int = response = requests.get(search_url, headers=headers) response.raise_for_status() soup = BeautifulSoup(response.text, "html.parser") - + search_results = [] for result in soup.find_all('li', class_='b_algo', limit=max_results): link = result.find('a')['href'] search_results.append(link) return search_results - - raise ValueError("The only search engines available are DuckDuckGo, Google, or Bing") + + elif search_engine.lower() == "searxng": + url = f"http://localhost:{port}" + params = {"q": query, "format": "json"} + + # Send the GET request to the 
server + response = requests.get(url, params=params) + + # Parse the response and limit to the specified max_results + data = response.json() + limited_results = data["results"][:max_results] + return limited_results + + else: + raise ValueError("The only search engines available are DuckDuckGo, Google, Bing, or SearXNG") diff --git a/tests/graphs/search_link_ollama.py b/tests/graphs/search_link_ollama.py new file mode 100644 index 00000000..3b41f699 --- /dev/null +++ b/tests/graphs/search_link_ollama.py @@ -0,0 +1,26 @@ +from scrapegraphai.graphs import SearchLinkGraph +from scrapegraphai.utils import prettify_exec_info + +def test_smart_scraper_pipeline(): + graph_config = { + "llm": { + "model": "ollama/llama3", + "temperature": 0, + "format": "json", + }, + "embeddings": { + "model": "ollama/nomic-embed-text", + "temperature": 0, + }, + "verbose": True, + "headless": False + } + + smart_scraper_graph = SearchLinkGraph( + source="https://sport.sky.it/nba?gr=www", + config=graph_config + ) + + result = smart_scraper_graph.run() + + assert result is not None diff --git a/tests/utils/convert_to_md_test.py b/tests/utils/convert_to_md_test.py index 0b6d552e..72270913 100644 --- a/tests/utils/convert_to_md_test.py +++ b/tests/utils/convert_to_md_test.py @@ -7,7 +7,7 @@ def test_basic_html_to_md(): def test_html_with_links_and_images(): html = '

This is a link and this is an image

' - assert convert_to_md(html) is None + assert convert_to_md(html) is not None def test_html_with_tables(): html = ''' @@ -17,11 +17,11 @@ def test_html_with_tables(): Row 2, Cell 1Row 2, Cell 2 ''' - assert convert_to_md(html) is None + assert convert_to_md(html) is not None def test_empty_html(): html = "" - assert convert_to_md(html) is None + assert convert_to_md(html) is not None def test_complex_html_structure(): html = '''