diff --git a/CHANGELOG.md b/CHANGELOG.md index 0433c0a7..b3bac5dd 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,15 +1,44 @@ -## [1.14.1](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.14.0...v1.14.1) (2024-08-24) +## [1.15.0-beta.3](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.15.0-beta.2...v1.15.0-beta.3) (2024-08-24) + + + +### Bug Fixes + +* update abstract graph ([86fe5fc](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/86fe5fcaf1a6ba28786678874378f07fba1db40f)) + +## [1.15.0-beta.2](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.15.0-beta.1...v1.15.0-beta.2) (2024-08-23) ### Bug Fixes -* add claude3.5 sonnet ([ee8f8b3](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/ee8f8b31ecfe4ffd311528d2f48cb055e4609d99)) +* abstract graph ([cf1fada](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/cf1fada36a6716cb0e24bbc5da7509446a964145)) + ### Docs * added sponsors ([b3a2d0d](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/b3a2d0d65a41f6e645fac3fc84f702fdf64b951c)) +## [1.15.0-beta.1](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.14.1-beta.1...v1.15.0-beta.1) (2024-08-23) + + +### Features + +* ligthweigthing the library ([62f32e9](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/62f32e994bcb748dfef4f7e1b2e5213a989c33cc)) + + +### Bug Fixes + +* Azure OpenAI issue ([a92b9c6](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/a92b9c6970049a4ba9dbdf8eff3eeb7f98c6c639)) + +## [1.14.1-beta.1](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.14.0...v1.14.1-beta.1) (2024-08-21) + + +### Bug Fixes + +* **models_tokens:** add llama2 and llama3 sizes explicitly ([b05ec16](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/b05ec16b252d00c9c9ee7c6d4605b420851c7754)) + + ## [1.14.0](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.13.3...v1.14.0) (2024-08-20) diff --git a/README.md b/README.md index c6120f1d..c1156d23 100644 --- a/README.md +++ b/README.md @@ -32,6 +32,28 @@ playwright install **Note**: it is recommended to install the library in a virtual environment to avoid conflicts with other libraries 🐱 +The optional modules below are not installed by default; if you need them, install the corresponding extras yourself with the following commands: + +### Installing "Other Language Models" + +This group allows you to use additional language models like Fireworks, Groq, Anthropic, Hugging Face, and Nvidia AI Endpoints. +```bash +pip install scrapegraphai[other-language-models] +``` +### Installing "More Semantic Options" + +This group includes tools for advanced semantic processing, such as Graphviz. +```bash +pip install scrapegraphai[more-semantic-options] +``` +### Installing "More Browser Options" + +This group includes additional browser management options, such as BrowserBase. +```bash +pip install scrapegraphai[more-browser-options] +``` + ## 💻 Usage There are multiple standard scraping pipelines that can be used to extract information from a website (or local file).
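For instance, a minimal SmartScraperGraph run using the provider-prefixed model naming introduced in this changeset might look as follows (a sketch assembled from the updated Anthropic and Moonshot examples in this PR; any other supported provider config works the same way):

```python
import os
from scrapegraphai.graphs import SmartScraperGraph

# LLM configuration, as in the renamed Anthropic examples:
# model names are now prefixed with the provider ("anthropic/...")
graph_config = {
    "llm": {
        "api_key": os.getenv("ANTHROPIC_API_KEY"),
        "model": "anthropic/claude-3-haiku-20240307",
        "max_tokens": 4000,
    },
}

# Build the pipeline and run it against a live page
smart_scraper_graph = SmartScraperGraph(
    prompt="List me what does the company do, the name and a contact email.",
    source="https://scrapegraphai.com/",
    config=graph_config,
)

print(smart_scraper_graph.run())
```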
diff --git a/docs/README.md b/docs/README.md index 676a9e89..037f0117 100644 --- a/docs/README.md +++ b/docs/README.md @@ -9,12 +9,6 @@ markmap: ## **Short-Term Goals** -- Integration with more llm APIs - -- Test proxy rotation implementation - -- Add more search engines inside the SearchInternetNode - - Improve the documentation (ReadTheDocs) - [Issue #102](https://github.com/VinciGit00/Scrapegraph-ai/issues/102) @@ -23,9 +17,6 @@ markmap: ## **Medium-Term Goals** - Node for handling API requests - -- Improve SearchGraph to look into the first 5 results of the search engine - - Make scraping more deterministic - Create DOM tree of the website - HTML tag text embeddings with tags metadata @@ -70,5 +61,3 @@ markmap: - Automatic generation of scraping pipelines from a given prompt - Create API for the library - -- Finetune a LLM for html content diff --git a/docs/source/scrapers/llm.rst b/docs/source/scrapers/llm.rst index e76c56f1..7b1df30e 100644 --- a/docs/source/scrapers/llm.rst +++ b/docs/source/scrapers/llm.rst @@ -194,3 +194,35 @@ We can also pass a model instance for the chat model and the embedding model. Fo "model_instance": embedder_model_instance } } + +Other LLM models +^^^^^^^^^^^^^^^^ + +We can also pass a model instance for the chat model and the embedding model through the **model_instance** parameter. +This lets you use any LangChain model instance. +You can find the model you need in the following lists: + +- `chat model list `_ +- `embedding model list `_. + +For instance, consider the **chat model** Moonshot. We can integrate it as follows: + +.. code-block:: python + + from langchain_community.chat_models.moonshot import MoonshotChat + + # The configuration parameters depend on the specific model you select + llm_instance_config = { + "model": "moonshot-v1-8k", + "base_url": "https://api.moonshot.cn/v1", + "moonshot_api_key": "MOONSHOT_API_KEY", + } + + llm_model_instance = MoonshotChat(**llm_instance_config) + graph_config = { + "llm": { + "model_instance": llm_model_instance, + "model_tokens": 5000 + }, + } + \ No newline at end of file diff --git a/examples/anthropic/csv_scraper_haiku.py b/examples/anthropic/csv_scraper_anthropic.py similarity index 96% rename from examples/anthropic/csv_scraper_haiku.py rename to examples/anthropic/csv_scraper_anthropic.py index 2e0ebe81..01a26a2b 100644 --- a/examples/anthropic/csv_scraper_haiku.py +++ b/examples/anthropic/csv_scraper_anthropic.py @@ -32,7 +32,7 @@ graph_config = { "llm": { "api_key": os.getenv("ANTHROPIC_API_KEY"), - "model": "claude-3-haiku-20240307", + "model": "anthropic/claude-3-haiku-20240307", "max_tokens": 4000 }, } diff --git a/examples/anthropic/csv_scraper_graph_multi_haiku.py b/examples/anthropic/csv_scraper_graph_multi_anthropic.py similarity index 96% rename from examples/anthropic/csv_scraper_graph_multi_haiku.py rename to examples/anthropic/csv_scraper_graph_multi_anthropic.py index b833af01..fcc297ab 100644 --- a/examples/anthropic/csv_scraper_graph_multi_haiku.py +++ b/examples/anthropic/csv_scraper_graph_multi_anthropic.py @@ -26,7 +26,7 @@ graph_config = { "llm": { "api_key": os.getenv("ANTHROPIC_API_KEY"), - "model": "claude-3-haiku-20240307", + "model": "anthropic/claude-3-haiku-20240307", "max_tokens": 4000}, } diff --git a/examples/anthropic/custom_graph_haiku.py b/examples/anthropic/custom_graph_anthropic.py similarity index 97% rename from examples/anthropic/custom_graph_haiku.py rename to examples/anthropic/custom_graph_anthropic.py index
cea14361..6d787484 100644 --- a/examples/anthropic/custom_graph_haiku.py +++ b/examples/anthropic/custom_graph_anthropic.py @@ -18,7 +18,7 @@ graph_config = { "llm": { "api_key": os.getenv("ANTHROPIC_API_KEY"), - "model": "claude-3-haiku-20240307", + "model": "anthropic/claude-3-haiku-20240307", "max_tokens": 4000 }, } diff --git a/examples/anthropic/json_scraper_haiku.py b/examples/anthropic/json_scraper_anthropic.py similarity index 96% rename from examples/anthropic/json_scraper_haiku.py rename to examples/anthropic/json_scraper_anthropic.py index 2610b658..05ee7fdf 100644 --- a/examples/anthropic/json_scraper_haiku.py +++ b/examples/anthropic/json_scraper_anthropic.py @@ -26,7 +26,7 @@ graph_config = { "llm": { "api_key": os.getenv("ANTHROPIC_API_KEY"), - "model": "claude-3-haiku-20240307", + "model": "anthropic/claude-3-haiku-20240307", "max_tokens": 4000 }, } diff --git a/examples/anthropic/json_scraper_multi_haiku.py b/examples/anthropic/json_scraper_multi_anthropic.py similarity index 93% rename from examples/anthropic/json_scraper_multi_haiku.py rename to examples/anthropic/json_scraper_multi_anthropic.py index 0327673b..c07fc54f 100644 --- a/examples/anthropic/json_scraper_multi_haiku.py +++ b/examples/anthropic/json_scraper_multi_anthropic.py @@ -11,7 +11,7 @@ graph_config = { "llm": { "api_key": os.getenv("ANTHROPIC_API_KEY"), - "model": "claude-3-haiku-20240307", + "model": "anthropic/claude-3-haiku-20240307", "max_tokens": 4000 }, } diff --git a/examples/anthropic/pdf_scraper_graph_haiku.py b/examples/anthropic/pdf_scraper_graph_anthropic.py similarity index 96% rename from examples/anthropic/pdf_scraper_graph_haiku.py rename to examples/anthropic/pdf_scraper_graph_anthropic.py index 61be06b4..3e4191a6 100644 --- a/examples/anthropic/pdf_scraper_graph_haiku.py +++ b/examples/anthropic/pdf_scraper_graph_anthropic.py @@ -14,7 +14,7 @@ graph_config = { "llm": { "api_key": os.getenv("ANTHROPIC_API_KEY"), - "model": "claude-3-haiku-20240307", + "model": "anthropic/claude-3-haiku-20240307", "max_tokens": 4000 }, } diff --git a/examples/anthropic/pdf_scraper_multi_haiku.py b/examples/anthropic/pdf_scraper_multi_anthropic.py similarity index 99% rename from examples/anthropic/pdf_scraper_multi_haiku.py rename to examples/anthropic/pdf_scraper_multi_anthropic.py index 974dd2f8..0c842787 100644 --- a/examples/anthropic/pdf_scraper_multi_haiku.py +++ b/examples/anthropic/pdf_scraper_multi_anthropic.py @@ -11,7 +11,7 @@ graph_config = { "llm": { "api_key": os.getenv("ANTHROPIC_API_KEY"), - "model": "claude-3-haiku-20240307", + "model": "anthropic/claude-3-haiku-20240307", "max_tokens": 4000 }, } diff --git a/examples/anthropic/scrape_plain_text_haiku.py b/examples/anthropic/scrape_plain_text_anthropic.py similarity index 96% rename from examples/anthropic/scrape_plain_text_haiku.py rename to examples/anthropic/scrape_plain_text_anthropic.py index d3f36638..7ebf84da 100644 --- a/examples/anthropic/scrape_plain_text_haiku.py +++ b/examples/anthropic/scrape_plain_text_anthropic.py @@ -28,7 +28,7 @@ graph_config = { "llm": { "api_key": os.getenv("ANTHROPIC_API_KEY"), - "model": "claude-3-haiku-20240307", + "model": "anthropic/claude-3-haiku-20240307", "max_tokens": 4000 }, } diff --git a/examples/anthropic/script_generator_haiku.py b/examples/anthropic/script_generator_anthropic.py similarity index 95% rename from examples/anthropic/script_generator_haiku.py rename to examples/anthropic/script_generator_anthropic.py index 889ce0b5..160987cc 100644 --- 
a/examples/anthropic/script_generator_haiku.py +++ b/examples/anthropic/script_generator_anthropic.py @@ -16,7 +16,7 @@ graph_config = { "llm": { "api_key": os.getenv("ANTHROPIC_API_KEY"), - "model": "claude-3-haiku-20240307", + "model": "anthropic/claude-3-haiku-20240307", "max_tokens": 4000 }, } diff --git a/examples/anthropic/script_multi_generator_haiku.py b/examples/anthropic/script_multi_generator_anthropic.py similarity index 96% rename from examples/anthropic/script_multi_generator_haiku.py rename to examples/anthropic/script_multi_generator_anthropic.py index f7c69010..c4b3f09b 100644 --- a/examples/anthropic/script_multi_generator_haiku.py +++ b/examples/anthropic/script_multi_generator_anthropic.py @@ -16,7 +16,7 @@ graph_config = { "llm": { "api_key": os.getenv("ANTHROPIC_API_KEY"), - "model": "claude-3-haiku-20240307", + "model": "anthropic/claude-3-haiku-20240307", "max_tokens": 4000 }, "library": "beautifulsoup" diff --git a/examples/anthropic/search_graph_haiku.py b/examples/anthropic/search_graph_anthropic.py similarity index 95% rename from examples/anthropic/search_graph_haiku.py rename to examples/anthropic/search_graph_anthropic.py index f90d7598..4ae0e6b3 100644 --- a/examples/anthropic/search_graph_haiku.py +++ b/examples/anthropic/search_graph_anthropic.py @@ -15,7 +15,7 @@ graph_config = { "llm": { "api_key": os.getenv("ANTHROPIC_API_KEY"), - "model": "claude-3-haiku-20240307", + "model": "anthropic/claude-3-haiku-20240307", "max_tokens": 4000 }, } diff --git a/examples/anthropic/search_graph_schema_haiku.py b/examples/anthropic/search_graph_schema_anthropic.py similarity index 92% rename from examples/anthropic/search_graph_schema_haiku.py rename to examples/anthropic/search_graph_schema_anthropic.py index c9e7a875..58e1ca0f 100644 --- a/examples/anthropic/search_graph_schema_haiku.py +++ b/examples/anthropic/search_graph_schema_anthropic.py @@ -27,8 +27,9 @@ class Dishes(BaseModel): graph_config = { "llm": { "api_key": os.getenv("ANTHROPIC_API_KEY"), - "model": "claude-3-haiku-20240307", - "max_tokens": 4000}, + "model": "anthropic/claude-3-haiku-20240307", + "max_tokens": 4000 + }, } # ************************************************ diff --git a/examples/anthropic/search_link_graph_haiku.py b/examples/anthropic/search_link_graph_anthropic.py similarity index 91% rename from examples/anthropic/search_link_graph_haiku.py rename to examples/anthropic/search_link_graph_anthropic.py index ccfbc1d2..4d671817 100644 --- a/examples/anthropic/search_link_graph_haiku.py +++ b/examples/anthropic/search_link_graph_anthropic.py @@ -29,8 +29,11 @@ # ************************************************ graph_config = { - "llm": {"model_instance": llm_model_instance}, - "embeddings": {"model_instance": embedder_model_instance} + "llm": { + "api_key": os.getenv("ANTHROPIC_API_KEY"), + "model": "anthropic/claude-3-haiku-20240307", + "max_tokens": 4000 + }, } # ************************************************ diff --git a/examples/anthropic/smart_scraper_haiku.py b/examples/anthropic/smart_scraper_anthropic.py similarity index 96% rename from examples/anthropic/smart_scraper_haiku.py rename to examples/anthropic/smart_scraper_anthropic.py index f0bb2a57..612363c7 100644 --- a/examples/anthropic/smart_scraper_haiku.py +++ b/examples/anthropic/smart_scraper_anthropic.py @@ -19,7 +19,7 @@ graph_config = { "llm": { "api_key": os.getenv("ANTHROPIC_API_KEY"), - "model": "claude-3-haiku-20240307", + "model": "anthropic/claude-3-haiku-20240307", "max_tokens": 4000 }, } diff --git 
a/examples/anthropic/smart_scraper_multi_haiku.py b/examples/anthropic/smart_scraper_multi_anthropic.py similarity index 96% rename from examples/anthropic/smart_scraper_multi_haiku.py rename to examples/anthropic/smart_scraper_multi_anthropic.py index eb2001d4..6e2af361 100644 --- a/examples/anthropic/smart_scraper_multi_haiku.py +++ b/examples/anthropic/smart_scraper_multi_anthropic.py @@ -17,7 +17,7 @@ graph_config = { "llm": { "api_key": os.getenv("ANTHROPIC_API_KEY"), - "model": "claude-3-haiku-20240307", + "model": "anthropic/claude-3-haiku-20240307", "max_tokens": 4000 }, } diff --git a/examples/anthropic/smart_scraper_schema_haiku.py b/examples/anthropic/smart_scraper_schema_anthropic.py similarity index 96% rename from examples/anthropic/smart_scraper_schema_haiku.py rename to examples/anthropic/smart_scraper_schema_anthropic.py index 83cedd2a..0a444923 100644 --- a/examples/anthropic/smart_scraper_schema_haiku.py +++ b/examples/anthropic/smart_scraper_schema_anthropic.py @@ -33,7 +33,7 @@ class Projects(BaseModel): graph_config = { "llm": { "api_key": os.getenv("ANTHROPIC_API_KEY"), - "model": "claude-3-haiku-20240307", + "model": "anthropic/claude-3-haiku-20240307", "max_tokens": 4000}, } diff --git a/examples/anthropic/xml_scraper_haiku.py b/examples/anthropic/xml_scraper_anthropic.py similarity index 96% rename from examples/anthropic/xml_scraper_haiku.py rename to examples/anthropic/xml_scraper_anthropic.py index dd64f571..cd60f0d6 100644 --- a/examples/anthropic/xml_scraper_haiku.py +++ b/examples/anthropic/xml_scraper_anthropic.py @@ -26,7 +26,7 @@ graph_config = { "llm": { "api_key": os.getenv("ANTHROPIC_API_KEY"), - "model": "claude-3-haiku-20240307", + "model": "anthropic/claude-3-haiku-20240307", "max_tokens": 4000 }, } diff --git a/examples/anthropic/xml_scraper_graph_multi_haiku.py b/examples/anthropic/xml_scraper_graph_multi_anthropic.py similarity index 96% rename from examples/anthropic/xml_scraper_graph_multi_haiku.py rename to examples/anthropic/xml_scraper_graph_multi_anthropic.py index 6b79f709..31b350c2 100644 --- a/examples/anthropic/xml_scraper_graph_multi_haiku.py +++ b/examples/anthropic/xml_scraper_graph_multi_anthropic.py @@ -26,7 +26,7 @@ graph_config = { "llm": { "api_key": os.getenv("ANTHROPIC_API_KEY"), - "model": "claude-3-haiku-20240307", + "model": "anthropic/claude-3-haiku-20240307", "max_tokens": 4000}, } diff --git a/examples/model_instance/.env.example b/examples/model_instance/.env.example new file mode 100644 index 00000000..c5a7ed85 --- /dev/null +++ b/examples/model_instance/.env.example @@ -0,0 +1 @@ +MOONLIGHT_API_KEY="YOUR MOONLIGHT API KEY" \ No newline at end of file diff --git a/examples/model_instance/smart_scraper_with_model_instace.py b/examples/model_instance/smart_scraper_with_model_instace.py new file mode 100644 index 00000000..b362414f --- /dev/null +++ b/examples/model_instance/smart_scraper_with_model_instace.py @@ -0,0 +1,53 @@ +""" +Basic example of a scraping pipeline using SmartScraper and model_instance +""" + +import os, json +from scrapegraphai.graphs import SmartScraperGraph +from scrapegraphai.utils import prettify_exec_info +from langchain_community.chat_models.moonshot import MoonshotChat +from dotenv import load_dotenv +load_dotenv() + +# ************************************************ +# Define the configuration for the graph +# ************************************************ + + +llm_instance_config = { + "model": "moonshot-v1-8k", + "base_url": "https://api.moonshot.cn/v1", + "moonshot_api_key":
os.getenv("MOONLIGHT_API_KEY"), +} + + +llm_model_instance = MoonshotChat(**llm_instance_config) + +graph_config = { + "llm": { + "model_instance": llm_model_instance, + "model_tokens": 10000 + }, + "verbose": True, + "headless": True, +} + +# ************************************************ +# Create the SmartScraperGraph instance and run it +# ************************************************ + +smart_scraper_graph = SmartScraperGraph( + prompt="List me what does the company do, the name and a contact email.", + source="https://scrapegraphai.com/", + config=graph_config +) + +result = smart_scraper_graph.run() +print(json.dumps(result, indent=4)) + +# ************************************************ +# Get graph execution info +# ************************************************ + +graph_exec_info = smart_scraper_graph.get_execution_info() +print(prettify_exec_info(graph_exec_info)) diff --git a/examples/moonshot/.env.example b/examples/moonshot/.env.example new file mode 100644 index 00000000..c5a7ed85 --- /dev/null +++ b/examples/moonshot/.env.example @@ -0,0 +1 @@ +MOONLIGHT_API_KEY="YOUR MOONLIGHT API KEY" \ No newline at end of file diff --git a/examples/moonshot/readme.md b/examples/moonshot/readme.md new file mode 100644 index 00000000..6b9b2f21 --- /dev/null +++ b/examples/moonshot/readme.md @@ -0,0 +1 @@ +This folder offer an example of how to use ScrapeGraph-AI with Moonshot and SmartScraperGraph. More usage examples can refer to openai exapmles. \ No newline at end of file diff --git a/examples/moonshot/smart_scraper_with_moonshot.py b/examples/moonshot/smart_scraper_with_moonshot.py new file mode 100644 index 00000000..b362414f --- /dev/null +++ b/examples/moonshot/smart_scraper_with_moonshot.py @@ -0,0 +1,53 @@ +""" +Basic example of scraping pipeline using SmartScraper and model_instace +""" + +import os, json +from scrapegraphai.graphs import SmartScraperGraph +from scrapegraphai.utils import prettify_exec_info +from langchain_community.chat_models.moonshot import MoonshotChat +from dotenv import load_dotenv +load_dotenv() + +# ************************************************ +# Define the configuration for the graph +# ************************************************ + + +llm_instance_config = { + "model": "moonshot-v1-8k", + "base_url": "https://api.moonshot.cn/v1", + "moonshot_api_key": os.getenv("MOONLIGHT_API_KEY"), +} + + +llm_model_instance = MoonshotChat(**llm_instance_config) + +graph_config = { + "llm": { + "model_instance": llm_model_instance, + "model_tokens": 10000 + }, + "verbose": True, + "headless": True, +} + +# ************************************************ +# Create the SmartScraperGraph instance and run it +# ************************************************ + +smart_scraper_graph = SmartScraperGraph( + prompt="List me what does the company do, the name and a contact email.", + source="https://scrapegraphai.com/", + config=graph_config +) + +result = smart_scraper_graph.run() +print(json.dumps(result, indent=4)) + +# ************************************************ +# Get graph execution info +# ************************************************ + +graph_exec_info = smart_scraper_graph.get_execution_info() +print(prettify_exec_info(graph_exec_info)) diff --git a/pyproject.toml b/pyproject.toml index d47f9b29..b388837c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,12 +1,8 @@ [project] name = "scrapegraphai" - - -version = "1.14.1" - +version = "1.15.0b3" description = "A web scraping library based on LangChain which uses LLM and direct graph logic 
to create scraping pipelines." - authors = [ { name = "Marco Vinciguerra", email = "mvincig11@gmail.com" }, { name = "Marco Perini", email = "perinim.98@gmail.com" }, @@ -15,17 +11,11 @@ authors = [ dependencies = [ "langchain>=0.2.14", - "langchain-fireworks>=0.1.3", - "langchain_community>=0.2.9", "langchain-google-genai>=1.0.7", - "langchain-google-vertexai>=1.0.7", "langchain-openai>=0.1.22", - "langchain-groq>=0.1.3", - "langchain-aws>=0.1.3", - "langchain-anthropic>=0.1.11", "langchain-mistralai>=0.1.12", - "langchain-huggingface>=0.0.3", - "langchain-nvidia-ai-endpoints>=0.1.6", + "langchain_community>=0.2.9", + "langchain-aws>=0.1.3", "html2text>=2024.2.26", "faiss-cpu>=1.8.0", "beautifulsoup4>=4.12.3", @@ -33,14 +23,12 @@ dependencies = [ "python-dotenv>=1.0.1", "tiktoken>=0.7", "tqdm>=4.66.4", - "graphviz>=0.20.3", "minify-html>=0.15.0", "free-proxy>=1.1.1", "playwright>=1.43.0", - "google>=3.0.0", "undetected-playwright>=0.3.0", + "google>=3.0.0", "semchunk>=1.0.1", - "browserbase>=0.3.0", ] license = "MIT" @@ -79,6 +67,25 @@ requires-python = ">=3.9,<4.0" burr = ["burr[start]==0.22.1"] docs = ["sphinx==6.0", "furo==2024.5.6"] +# Group 1: Other Language Models +other-language-models = [ + "langchain-fireworks>=0.1.3", + "langchain-groq>=0.1.3", + "langchain-anthropic>=0.1.11", + "langchain-huggingface>=0.0.3", + "langchain-nvidia-ai-endpoints>=0.1.6", +] + +# Group 2: More Semantic Options +more-semantic-options = [ + "graphviz>=0.20.3", +] + +# Group 3: More Browser Options +more-browser-options = [ + "browserbase>=0.3.0", +] + [build-system] requires = ["hatchling"] build-backend = "hatchling.build" diff --git a/scrapegraphai/graphs/abstract_graph.py b/scrapegraphai/graphs/abstract_graph.py index ae1e90b2..f80e430d 100644 --- a/scrapegraphai/graphs/abstract_graph.py +++ b/scrapegraphai/graphs/abstract_graph.py @@ -7,8 +7,6 @@ import uuid import warnings from pydantic import BaseModel -from langchain_community.chat_models import ErnieBotChat -from langchain_nvidia_ai_endpoints import ChatNVIDIA from langchain.chat_models import init_chat_model from ..helpers import models_tokens from ..models import ( @@ -65,13 +63,10 @@ def __init__(self, prompt: str, config: dict, self.cache_path = self.config.get("cache_path", False) self.browser_base = self.config.get("browser_base") - # Create the graph self.graph = self._create_graph() self.final_state = None self.execution_info = None - # Set common configuration parameters - verbose = bool(config and config.get("verbose")) if verbose: @@ -89,12 +84,10 @@ def __init__(self, prompt: str, config: dict, self.set_common_params(common_params, overwrite=True) - # set burr config self.burr_kwargs = config.get("burr_kwargs", None) if self.burr_kwargs is not None: self.graph.use_burr = True if "app_instance_id" not in self.burr_kwargs: - # set a random uuid for the app_instance_id to avoid conflicts self.burr_kwargs["app_instance_id"] = str(uuid.uuid4()) self.graph.burr_config = self.burr_kwargs @@ -127,7 +120,6 @@ def _create_llm(self, llm_config: dict) -> object: llm_defaults = {"temperature": 0, "streaming": False} llm_params = {**llm_defaults, **llm_config} - # If model instance is passed directly instead of the model details if "model_instance" in llm_params: try: self.model_token = llm_params["model_tokens"] @@ -147,7 +139,10 @@ def handle_model(model_name, provider, token_key, default_token=8192): warnings.simplefilter("ignore") return init_chat_model(**llm_params) - known_models = ["chatgpt","gpt","openai", "azure_openai", "google_genai", 
"ollama", "oneapi", "nvidia", "groq", "google_vertexai", "bedrock", "mistralai", "hugging_face", "deepseek", "ernie", "fireworks"] + known_models = {"chatgpt","gpt","openai", "azure_openai", "google_genai", + "ollama", "oneapi", "nvidia", "groq", "google_vertexai", + "bedrock", "mistralai", "hugging_face", "deepseek", "ernie", + "fireworks", "anthropic"} if llm_params["model"].split("/")[0] not in known_models and llm_params["model"].split("-")[0] not in known_models: raise ValueError(f"Model '{llm_params['model']}' is not supported") @@ -177,14 +172,14 @@ def handle_model(model_name, provider, token_key, default_token=8192): token_key = model_name if "model_tokens" not in llm_params else llm_params["model_tokens"] return handle_model(model_name, "ollama", token_key) - elif "claude-3-" in llm_params["model"]: - return handle_model(llm_params["model"], "anthropic", "claude3") + elif "anthropic" in llm_params["model"]: + model_name = llm_params["model"].split("anthropic/")[-1] + return handle_model(model_name, "anthropic", model_name) elif llm_params["model"].startswith("mistral"): model_name = llm_params["model"].split("/")[-1] return handle_model(model_name, "mistralai", model_name) - # Instantiate the language model based on the model name (models that do not use the common interface) elif "deepseek" in llm_params["model"]: try: self.model_token = models_tokens["deepseek"][llm_params["model"]] @@ -194,6 +189,8 @@ def handle_model(model_name, provider, token_key, default_token=8192): return DeepSeek(llm_params) elif "ernie" in llm_params["model"]: + from langchain_community.chat_models import ErnieBotChat + try: self.model_token = models_tokens["ernie"][llm_params["model"]] except KeyError: @@ -202,7 +199,6 @@ def handle_model(model_name, provider, token_key, default_token=8192): return ErnieBotChat(llm_params) elif "oneapi" in llm_params["model"]: - # take the model after the last dash llm_params["model"] = llm_params["model"].split("/")[-1] try: self.model_token = models_tokens["oneapi"][llm_params["model"]] @@ -211,6 +207,8 @@ def handle_model(model_name, provider, token_key, default_token=8192): return OneApi(llm_params) elif "nvidia" in llm_params["model"]: + from langchain_nvidia_ai_endpoints import ChatNVIDIA + try: self.model_token = models_tokens["nvidia"][llm_params["model"].split("/")[-1]] llm_params["model"] = "/".join(llm_params["model"].split("/")[1:]) diff --git a/scrapegraphai/helpers/models_tokens.py b/scrapegraphai/helpers/models_tokens.py index 711aa75f..1e3e1910 100644 --- a/scrapegraphai/helpers/models_tokens.py +++ b/scrapegraphai/helpers/models_tokens.py @@ -51,70 +51,80 @@ "gemini-1.5-pro-latest": 128000, "models/embedding-001": 2048 }, - "ollama": { "command-r": 12800, - "codellama": 16000, - "dbrx": 32768, - "deepseek-coder:33b": 16000, - "falcon": 2048, - "llama2": 4096, - "llama3": 8192, - "llama3:70b": 8192, - "llama3.1":128000, - "llama3.1:8b": 128000, - "llama3.1:70b": 128000, - "lama3.1:405b": 128000, - "scrapegraph": 8192, - "llava": 4096, - "mixtral:8x22b-instruct": 65536, - "mistral-openorca": 32000, - "nomic-embed-text": 8192, - "nous-hermes2:34b": 4096, - "orca-mini": 2048, - "phi3:3.8b": 12800, - "qwen:0.5b": 32000, - "qwen:1.8b": 32000, - "qwen:4b": 32000, - "qwen:14b": 32000, - "qwen:32b": 32000, - "qwen:72b": 32000, - "qwen:110b": 32000, - "stablelm-zephyr": 8192, - "wizardlm2:8x22b": 65536, - # embedding models - "shaw/dmeta-embedding-zh-small-q4": 8192, - "shaw/dmeta-embedding-zh-q4": 8192, - "chevalblanc/acge_text_embedding": 8192, - 
"martcreation/dmeta-embedding-zh": 8192, - "snowflake-arctic-embed": 8192, - "mxbai-embed-large": 512 + "google_vertexai": { + "gemini-1.5-flash": 128000, + "gemini-1.5-pro": 128000, + "gemini-1.0-pro": 128000, + }, + "ollama": { + "command-r": 12800, + "codellama": 16000, + "dbrx": 32768, + "deepseek-coder:33b": 16000, + "falcon": 2048, + "llama2": 4096, + "llama2:7b": 4096, + "llama2:13b": 4096, + "llama2:70b": 4096, + "llama3": 8192, + "llama3:8b": 8192, + "llama3:70b": 8192, + "llama3.1":128000, + "llama3.1:8b": 128000, + "llama3.1:70b": 128000, + "lama3.1:405b": 128000, + "scrapegraph": 8192, + "llava": 4096, + "mixtral:8x22b-instruct": 65536, + "mistral-openorca": 32000, + "nomic-embed-text": 8192, + "nous-hermes2:34b": 4096, + "orca-mini": 2048, + "phi3:3.8b": 12800, + "qwen:0.5b": 32000, + "qwen:1.8b": 32000, + "qwen:4b": 32000, + "qwen:14b": 32000, + "qwen:32b": 32000, + "qwen:72b": 32000, + "qwen:110b": 32000, + "stablelm-zephyr": 8192, + "wizardlm2:8x22b": 65536, + # embedding models + "shaw/dmeta-embedding-zh-small-q4": 8192, + "shaw/dmeta-embedding-zh-q4": 8192, + "chevalblanc/acge_text_embedding": 8192, + "martcreation/dmeta-embedding-zh": 8192, + "snowflake-arctic-embed": 8192, + "mxbai-embed-large": 512, }, "oneapi": { - "qwen-turbo": 6000 + "qwen-turbo": 6000, }, - "nvidia": { + "nvdia": { "meta/llama3-70b-instruct": 419, "meta/llama3-8b-instruct": 419, "nemotron-4-340b-instruct": 1024, - "databricks/dbrx-instruct": 4096, - "google/codegemma-7b": 8192, - "google/gemma-2b": 2048, - "google/gemma-7b": 8192, - "google/recurrentgemma-2b": 2048, - "meta/codellama-70b": 16384, - "meta/llama2-70b": 4096, - "microsoft/phi-3-mini-128k-instruct": 122880, - "mistralai/mistral-7b-instruct-v0.2": 4096, - "mistralai/mistral-large": 8192, - "mistralai/mixtral-8x22b-instruct-v0.1": 32768, - "mistralai/mixtral-8x7b-instruct-v0.1": 8192, - "snowflake/arctic": 16384, + "databricks/dbrx-instruct": 4096, + "google/codegemma-7b": 8192, + "google/gemma-2b": 2048, + "google/gemma-7b": 8192, + "google/recurrentgemma-2b": 2048, + "meta/codellama-70b": 16384, + "meta/llama2-70b": 4096, + "microsoft/phi-3-mini-128k-instruct": 122880, + "mistralai/mistral-7b-instruct-v0.2": 4096, + "mistralai/mistral-large": 8192, + "mistralai/mixtral-8x22b-instruct-v0.1": 32768, + "mistralai/mixtral-8x7b-instruct-v0.1": 8192, + "snowflake/arctic": 16384, }, "groq": { "llama3-8b-8192": 8192, "llama3-70b-8192": 8192, "mixtral-8x7b-32768": 32768, "gemma-7b-it": 8192, - "claude-3-haiku-20240307'": 8192 + "claude-3-haiku-20240307'": 8192, }, "claude": { "claude_instant": 100000, @@ -125,12 +135,7 @@ "claude-3-opus-20240229": 200000, "claude-3-sonnet-20240229": 200000, "claude-3-haiku-20240307": 200000, - "claude-3-5-sonnet-20240620": 200000 - }, - "google_vertexai": { - "gemini-1.5-flash": 128000, - "gemini-1.5-pro": 128000, - "gemini-1.0-pro": 128000 + "claude-3-5-sonnet-20240620": 200000, }, "bedrock": { "anthropic.claude-3-5-sonnet-20240620": 200000, @@ -150,7 +155,7 @@ "amazon.titan-embed-text-v1": 8000, "amazon.titan-embed-text-v2:0": 8000, "cohere.embed-english-v3": 512, - "cohere.embed-multilingual-v3": 512 + "cohere.embed-multilingual-v3": 512, }, "mistralai": { "mistral-large-latest": 128000, @@ -160,7 +165,7 @@ "open-mistral-7b": 32000, "open-mixtral-8x7b": 32000, "open-mixtral-8x22b": 64000, - "open-codestral-mamba": 256000 + "open-codestral-mamba": 256000, }, "hugging_face": { "xai-org/grok-1": 8192, @@ -194,11 +199,11 @@ "TheBloke/dolphin-2.7-mixtral-8x7b-GGUF": 32768, "deepseek-ai/DeepSeek-V2": 131072, 
"deepseek-ai/DeepSeek-V2-Chat": 131072, - "claude-3-haiku": 200000 + "claude-3-haiku": 200000, }, "deepseek": { "deepseek-chat": 28672, - "deepseek-coder": 16384 + "deepseek-coder": 16384, }, "ernie": { "ernie-bot-turbo": 4096, @@ -208,16 +213,16 @@ "ernie-bot-2-base-zh": 4096, "ernie-bot-2-base-en": 4096, "ernie-bot-2-base-en-zh": 4096, - "ernie-bot-2-base-zh-en": 4096 + "ernie-bot-2-base-zh-en": 4096, }, "fireworks": { "llama-v2-7b": 4096, "mixtral-8x7b-instruct": 4096, - "nomic-ai/nomic-embed-text-v1.5": 8192, + "nomic-ai/nomic-embed-text-v1.5": 8192, "llama-3.1-405B-instruct": 131072, "llama-3.1-70B-instruct": 131072, "llama-3.1-8B-instruct": 131072, "mixtral-moe-8x22B-instruct": 65536, - "mixtral-moe-8x7B-instruct": 65536 + "mixtral-moe-8x7B-instruct": 65536, }, } diff --git a/scrapegraphai/nodes/fetch_node.py b/scrapegraphai/nodes/fetch_node.py index b60c3652..4119ee9a 100644 --- a/scrapegraphai/nodes/fetch_node.py +++ b/scrapegraphai/nodes/fetch_node.py @@ -10,7 +10,6 @@ from langchain_core.documents import Document from ..utils.cleanup_html import cleanup_html from ..docloaders import ChromiumLoader -from ..docloaders.browser_base import browser_base_fetch from ..utils.convert_to_md import convert_to_md from ..utils.logging import get_logger from .base_node import BaseNode @@ -269,6 +268,8 @@ def handle_web_source(self, state, source): loader_kwargs = self.node_config.get("loader_kwargs", {}) if self.browser_base is not None: + from ..docloaders.browser_base import browser_base_fetch + data = browser_base_fetch(self.browser_base.get("api_key"), self.browser_base.get("project_id"), [source]) diff --git a/scrapegraphai/nodes/generate_answer_csv_node.py b/scrapegraphai/nodes/generate_answer_csv_node.py index b7d7471a..0907dfb9 100644 --- a/scrapegraphai/nodes/generate_answer_csv_node.py +++ b/scrapegraphai/nodes/generate_answer_csv_node.py @@ -9,7 +9,8 @@ from tqdm import tqdm from ..utils.logging import get_logger from .base_node import BaseNode -from ..prompts.generate_answer_node_csv_prompts import TEMPLATE_CHUKS_CSV, TEMPLATE_NO_CHUKS_CSV, TEMPLATE_MERGE_CSV +from ..prompts.generate_answer_node_csv_prompts import (TEMPLATE_CHUKS_CSV, + TEMPLATE_NO_CHUKS_CSV, TEMPLATE_MERGE_CSV) class GenerateAnswerCSVNode(BaseNode): """ @@ -95,14 +96,14 @@ def execute(self, state): else: output_parser = JsonOutputParser() - TEMPLATE_NO_CHUKS_CSV_prompt = TEMPLATE_NO_CHUKS_CSV - TEMPLATE_CHUKS_CSV_prompt = TEMPLATE_CHUKS_CSV - TEMPLATE_MERGE_CSV_prompt = TEMPLATE_MERGE_CSV + TEMPLATE_NO_CHUKS_CSV_PROMPT = TEMPLATE_NO_CHUKS_CSV + TEMPLATE_CHUKS_CSV_PROMPT = TEMPLATE_CHUKS_CSV + TEMPLATE_MERGE_CSV_PROMPT = TEMPLATE_MERGE_CSV if self.additional_info is not None: - TEMPLATE_NO_CHUKS_CSV_prompt = self.additional_info + TEMPLATE_NO_CHUKS_CSV - TEMPLATE_CHUKS_CSV_prompt = self.additional_info + TEMPLATE_CHUKS_CSV - TEMPLATE_MERGE_CSV_prompt = self.additional_info + TEMPLATE_MERGE_CSV + TEMPLATE_NO_CHUKS_CSV_PROMPT = self.additional_info + TEMPLATE_NO_CHUKS_CSV + TEMPLATE_CHUKS_CSV_PROMPT = self.additional_info + TEMPLATE_CHUKS_CSV + TEMPLATE_MERGE_CSV_PROMPT = self.additional_info + TEMPLATE_MERGE_CSV format_instructions = output_parser.get_format_instructions() @@ -110,7 +111,7 @@ def execute(self, state): if len(doc) == 1: prompt = PromptTemplate( - template=TEMPLATE_NO_CHUKS_CSV_prompt, + template=TEMPLATE_NO_CHUKS_CSV_PROMPT, input_variables=["question"], partial_variables={ "context": doc, @@ -127,7 +128,7 @@ def execute(self, state): tqdm(doc, desc="Processing chunks", disable=not self.verbose) ): 
prompt = PromptTemplate( - template=TEMPLATE_CHUKS_CSV_prompt, + template=TEMPLATE_CHUKS_CSV_PROMPT, input_variables=["question"], partial_variables={ "context": chunk, @@ -144,7 +145,7 @@ def execute(self, state): batch_results = async_runner.invoke({"question": user_prompt}) merge_prompt = PromptTemplate( - template = TEMPLATE_MERGE_CSV_prompt, + template = TEMPLATE_MERGE_CSV_PROMPT, input_variables=["context", "question"], partial_variables={"format_instructions": format_instructions}, ) @@ -153,4 +154,4 @@ def execute(self, state): answer = merge_chain.invoke({"context": batch_results, "question": user_prompt}) state.update({self.output[0]: answer}) - return state \ No newline at end of file + return state diff --git a/scrapegraphai/nodes/generate_scraper_node.py b/scrapegraphai/nodes/generate_scraper_node.py index 4f077091..a7c5e5bb 100644 --- a/scrapegraphai/nodes/generate_scraper_node.py +++ b/scrapegraphai/nodes/generate_scraper_node.py @@ -67,10 +67,8 @@ def execute(self, state: dict) -> dict: self.logger.info(f"--- Executing {self.node_name} Node ---") - # Interpret input keys based on the provided input expression input_keys = self.get_input_keys(state) - # Fetching data from the state based on the input keys input_data = [state[key] for key in input_keys] user_prompt = input_data[0] diff --git a/scrapegraphai/nodes/get_probable_tags_node.py b/scrapegraphai/nodes/get_probable_tags_node.py index 4d12b985..9ba38283 100644 --- a/scrapegraphai/nodes/get_probable_tags_node.py +++ b/scrapegraphai/nodes/get_probable_tags_node.py @@ -58,10 +58,8 @@ def execute(self, state: dict) -> dict: self.logger.info(f"--- Executing {self.node_name} Node ---") - # Interpret input keys based on the provided input expression input_keys = self.get_input_keys(state) - # Fetching data from the state based on the input keys input_data = [state[key] for key in input_keys] user_prompt = input_data[0] @@ -88,10 +86,8 @@ def execute(self, state: dict) -> dict: }, ) - # Execute the chain to get probable tags tag_answer = tag_prompt | self.llm_model | output_parser probable_tags = tag_answer.invoke({"question": user_prompt}) - # Update the dictionary with probable tags state.update({self.output[0]: probable_tags}) return state diff --git a/scrapegraphai/nodes/graph_iterator_node.py b/scrapegraphai/nodes/graph_iterator_node.py index cd71b8e1..a765da28 100644 --- a/scrapegraphai/nodes/graph_iterator_node.py +++ b/scrapegraphai/nodes/graph_iterator_node.py @@ -103,7 +103,6 @@ async def _async_execute(self, state: dict, batchsize: int) -> dict: if graph_instance is None: raise ValueError("graph instance is required for concurrent execution") - # Assign depth level to the graph if "graph_depth" in graph_instance.config: graph_instance.config["graph_depth"] += 1 else: @@ -113,14 +112,12 @@ async def _async_execute(self, state: dict, batchsize: int) -> dict: participants = [] - # semaphore to limit the number of concurrent tasks semaphore = asyncio.Semaphore(batchsize) async def _async_run(graph): async with semaphore: return await asyncio.to_thread(graph.run) - # creates a deepcopy of the graph instance for each endpoint for url in urls: instance = copy.copy(graph_instance) instance.source = url diff --git a/scrapegraphai/nodes/merge_answers_node.py b/scrapegraphai/nodes/merge_answers_node.py index f00880e9..f2559a09 100644 --- a/scrapegraphai/nodes/merge_answers_node.py +++ b/scrapegraphai/nodes/merge_answers_node.py @@ -56,21 +56,17 @@ def execute(self, state: dict) -> dict: self.logger.info(f"--- Executing 
{self.node_name} Node ---") - # Interpret input keys based on the provided input expression input_keys = self.get_input_keys(state) - # Fetching data from the state based on the input keys input_data = [state[key] for key in input_keys] user_prompt = input_data[0] answers = input_data[1] - # merge the answers in one string answers_str = "" for i, answer in enumerate(answers): answers_str += f"CONTENT WEBSITE {i+1}: {answer}\n" - # Initialize the output parser if self.node_config.get("schema", None) is not None: output_parser = JsonOutputParser(pydantic_object=self.node_config["schema"]) else: @@ -90,6 +86,5 @@ def execute(self, state: dict) -> dict: merge_chain = prompt_template | self.llm_model | output_parser answer = merge_chain.invoke({"user_prompt": user_prompt}) - # Update the state with the generated answer state.update({self.output[0]: answer}) return state diff --git a/scrapegraphai/nodes/parse_node.py b/scrapegraphai/nodes/parse_node.py index dbecdbf9..3e8ed5ac 100644 --- a/scrapegraphai/nodes/parse_node.py +++ b/scrapegraphai/nodes/parse_node.py @@ -59,13 +59,11 @@ def execute(self, state: dict) -> dict: self.logger.info(f"--- Executing {self.node_name} Node ---") - # Interpret input keys based on the provided input expression input_keys = self.get_input_keys(state) - # Fetching data from the state based on the input keys input_data = [state[key] for key in input_keys] - # Parse the document docs_transformed = input_data[0] + if self.parse_html: docs_transformed = Html2TextTransformer().transform_documents(input_data[0]) docs_transformed = docs_transformed[0] @@ -77,7 +75,6 @@ def execute(self, state: dict) -> dict: else: docs_transformed = docs_transformed[0] - # Adapt the chunk size, leaving room for the reply, the prompt and the schema chunk_size = self.node_config.get("chunk_size", 4096) chunk_size = min(chunk_size - 500, int(chunk_size * 0.9)) diff --git a/scrapegraphai/nodes/rag_node.py b/scrapegraphai/nodes/rag_node.py index ea5efe7a..868044a0 100644 --- a/scrapegraphai/nodes/rag_node.py +++ b/scrapegraphai/nodes/rag_node.py @@ -80,10 +80,8 @@ def execute(self, state: dict) -> dict: self.logger.info(f"--- Executing {self.node_name} Node ---") - # Interpret input keys based on the provided input expression input_keys = self.get_input_keys(state) - # Fetching data from the state based on the input keys input_data = [state[key] for key in input_keys] user_prompt = input_data[0] @@ -102,7 +100,6 @@ def execute(self, state: dict) -> dict: self.logger.info("--- (updated chunks metadata) ---") - # check if embedder_model is provided, if not use llm_model if self.embedder_model is not None: embeddings = self.embedder_model elif 'embeddings' in self.node_config: @@ -144,23 +141,17 @@ def execute(self, state: dict) -> dict: pipeline_compressor = DocumentCompressorPipeline( transformers=[redundant_filter, relevant_filter] ) - # redundant + relevant filter compressor compression_retriever = ContextualCompressionRetriever( base_compressor=pipeline_compressor, base_retriever=retriever ) - # relevant filter compressor only - # compression_retriever = ContextualCompressionRetriever( - # base_compressor=relevant_filter, base_retriever=retriever - # ) - compressed_docs = compression_retriever.invoke(user_prompt) self.logger.info("--- (tokens compressed and vector stored) ---") state.update({self.output[0]: compressed_docs}) return state - + def _create_default_embedder(self, llm_config=None) -> object: """ @@ -223,7 +214,6 @@ def _create_embedder(self, embedder_config: dict) -> object: 
embedder_params = {**embedder_config} if "model_instance" in embedder_config: return embedder_params["model_instance"] - # Instantiate the embedding model based on the model name if "openai" in embedder_params["model"]: return OpenAIEmbeddings(api_key=embedder_params["api_key"]) if "azure" in embedder_params["model"]: diff --git a/scrapegraphai/nodes/robots_node.py b/scrapegraphai/nodes/robots_node.py index b33d49c1..6f9bc352 100644 --- a/scrapegraphai/nodes/robots_node.py +++ b/scrapegraphai/nodes/robots_node.py @@ -75,10 +75,8 @@ def execute(self, state: dict) -> dict: self.logger.info(f"--- Executing {self.node_name} Node ---") - # Interpret input keys based on the provided input expression input_keys = self.get_input_keys(state) - # Fetching data from the state based on the input keys input_data = [state[key] for key in input_keys] source = input_data[0] diff --git a/scrapegraphai/nodes/search_internet_node.py b/scrapegraphai/nodes/search_internet_node.py index b23e8e8b..df1b6277 100644 --- a/scrapegraphai/nodes/search_internet_node.py +++ b/scrapegraphai/nodes/search_internet_node.py @@ -67,7 +67,6 @@ def execute(self, state: dict) -> dict: input_keys = self.get_input_keys(state) - # Fetching data from the state based on the input keys input_data = [state[key] for key in input_keys] user_prompt = input_data[0] @@ -79,10 +78,8 @@ def execute(self, state: dict) -> dict: input_variables=["user_prompt"], ) - # Execute the chain to get the search query search_answer = search_prompt | self.llm_model | output_parser - - # Ollama: Use no json format when creating the search query + if isinstance(self.llm_model, ChatOllama) and self.llm_model.format == 'json': self.llm_model.format = None search_query = search_answer.invoke({"user_prompt": user_prompt})[0] @@ -96,9 +93,7 @@ def execute(self, state: dict) -> dict: search_engine=self.search_engine) if len(answer) == 0: - # raise an exception if no answer is found raise ValueError("Zero results found for the search query.") - # Update the state with the generated answer state.update({self.output[0]: answer}) return state diff --git a/scrapegraphai/nodes/search_link_node.py b/scrapegraphai/nodes/search_link_node.py index c39c469d..60c3e1aa 100644 --- a/scrapegraphai/nodes/search_link_node.py +++ b/scrapegraphai/nodes/search_link_node.py @@ -49,7 +49,6 @@ def __init__( self.filter_config = {**default_filters.filter_dict, **provided_filter_config} self.filter_links = True else: - # Skip filtering if not enabled self.filter_config = None self.filter_links = False @@ -58,29 +57,26 @@ def __init__( def _is_same_domain(self, url, domain): if not self.filter_links or not self.filter_config.get("diff_domain_filter", True): - return True # Skip the domain filter if not enabled + return True parsed_url = urlparse(url) parsed_domain = urlparse(domain) return parsed_url.netloc == parsed_domain.netloc def _is_image_url(self, url): if not self.filter_links: - return False # Skip image filtering if filtering is not enabled - + return False image_extensions = self.filter_config.get("img_exts", []) return any(url.lower().endswith(ext) for ext in image_extensions) def _is_language_url(self, url): if not self.filter_links: - return False # Skip language filtering if filtering is not enabled + return False lang_indicators = self.filter_config.get("lang_indicators", []) parsed_url = urlparse(url) query_params = parse_qs(parsed_url.query) - # Check if the URL path or query string indicates a language-specific version return any(indicator in parsed_url.path.lower() or 
indicator in query_params for indicator in lang_indicators) - def _is_potentially_irrelevant(self, url): if not self.filter_links: return False # Skip irrelevant URL filtering if filtering is not enabled @@ -88,12 +84,11 @@ def _is_potentially_irrelevant(self, url): irrelevant_keywords = self.filter_config.get("irrelevant_keywords", []) return any(keyword in url.lower() for keyword in irrelevant_keywords) - + def execute(self, state: dict) -> dict: """ - Filter out relevant links from the webpage that are relavant to prompt. Out of the filtered links, also - ensure that all links are navigable. - + Filter the links extracted from the webpage, keeping only those relevant to the prompt. + Out of the filtered links, also ensure that all links are navigable. Args: state (dict): The current state of the graph. The input keys will be used to fetch the correct data types from the state. @@ -108,7 +103,6 @@ def execute(self, state: dict) -> dict: self.logger.info(f"--- Executing {self.node_name} Node ---") - parsed_content_chunks = state.get("doc") source_url = state.get("url") or state.get("local_dir") output_parser = JsonOutputParser() @@ -148,7 +142,7 @@ def execute(self, state: dict) -> dict: except Exception as e: # Fallback approach: Using the LLM to extract links self.logger.error(f"Error extracting links: {e}. Falling back to LLM.") - + merge_prompt = PromptTemplate( template=TEMPLATE_RELEVANT_LINKS, input_variables=["content", "user_prompt"], diff --git a/scrapegraphai/nodes/search_node_with_context.py b/scrapegraphai/nodes/search_node_with_context.py index 37a05d0f..7343b64c 100644 --- a/scrapegraphai/nodes/search_node_with_context.py +++ b/scrapegraphai/nodes/search_node_with_context.py @@ -58,10 +58,8 @@ def execute(self, state: dict) -> dict: self.logger.info(f"--- Executing {self.node_name} Node ---") - # Interpret input keys based on the provided input expression input_keys = self.get_input_keys(state) - # Fetching data from the state based on the input keys input_data = [state[key] for key in input_keys] doc = input_data[1] @@ -71,7 +69,6 @@ def execute(self, state: dict) -> dict: result = [] - # Use tqdm to add progress bar for i, chunk in enumerate( tqdm(doc, desc="Processing chunks", disable=not self.verbose) ): diff --git a/scrapegraphai/nodes/text_to_speech_node.py b/scrapegraphai/nodes/text_to_speech_node.py index e8e43cb5..dfa3a64e 100644 --- a/scrapegraphai/nodes/text_to_speech_node.py +++ b/scrapegraphai/nodes/text_to_speech_node.py @@ -43,7 +43,8 @@ def execute(self, state: dict) -> dict: correct data types from the state. Returns: - dict: The updated state with the output key containing the audio generated from the text. + dict: The updated state with the output + key containing the audio generated from the text.
Raises: KeyError: If the input keys are not found in the state, indicating that the @@ -52,15 +53,11 @@ def execute(self, state: dict) -> dict: self.logger.info(f"--- Executing {self.node_name} Node ---") - # Interpret input keys based on the provided input expression input_keys = self.get_input_keys(state) - # Fetching data from the state based on the input keys input_data = [state[key] for key in input_keys] - # get the text to translate text2translate = str(next(iter(input_data[0].values()))) - # text2translate = str(input_data[0]) audio = self.tts_model.run(text2translate) diff --git a/scrapegraphai/utils/cleanup_html.py b/scrapegraphai/utils/cleanup_html.py index 23c9f803..6c7c3c4c 100644 --- a/scrapegraphai/utils/cleanup_html.py +++ b/scrapegraphai/utils/cleanup_html.py @@ -28,35 +28,28 @@ def cleanup_html(html_content: str, base_url: str) -> str: soup = BeautifulSoup(html_content, 'html.parser') - # Title Extraction title_tag = soup.find('title') title = title_tag.get_text() if title_tag else "" - # Script and Style Tag Removal for tag in soup.find_all(['script', 'style']): tag.extract() - # Links extraction link_urls = [urljoin(base_url, link['href']) for link in soup.find_all('a', href=True)] - # Images extraction images = soup.find_all('img') image_urls = [] for image in images: if 'src' in image.attrs: - # if http or https is not present in the image url, join it with the base url if 'http' not in image['src']: image_urls.append(urljoin(base_url, image['src'])) else: image_urls.append(image['src']) - # Body Extraction (if it exists) body_content = soup.find('body') if body_content: - # Minify the HTML within the body tag minimized_body = minify(str(body_content)) return title, minimized_body, link_urls, image_urls else: - raise ValueError(f"""No HTML body content found, please try setting the 'headless' + raise ValueError(f"""No HTML body content found, please try setting the 'headless' flag to False in the graph configuration. 
HTML content: {html_content}""") diff --git a/scrapegraphai/utils/convert_to_csv.py b/scrapegraphai/utils/convert_to_csv.py index 44897c7c..850f9416 100644 --- a/scrapegraphai/utils/convert_to_csv.py +++ b/scrapegraphai/utils/convert_to_csv.py @@ -29,9 +29,8 @@ def convert_to_csv(data: dict, filename: str, position: str = None) -> None: """ if ".csv" in filename: - filename = filename.replace(".csv", "") # Remove .csv extension + filename = filename.replace(".csv", "") - # Get the directory of the caller script if position is not provided if position is None: caller_dir = os.path.dirname(os.path.abspath(sys.argv[0])) position = caller_dir @@ -40,7 +39,7 @@ def convert_to_csv(data: dict, filename: str, position: str = None) -> None: if not isinstance(data, dict): raise TypeError("Input data must be a dictionary") - os.makedirs(position, exist_ok=True) # Create directory if needed + os.makedirs(position, exist_ok=True) df = pd.DataFrame.from_dict(data, orient='index') df.to_csv(os.path.join(position, f"{filename}.csv"), index=False) @@ -52,4 +51,4 @@ def convert_to_csv(data: dict, filename: str, position: str = None) -> None: raise PermissionError( f"You don't have permission to write to '{position}'.") from pe except Exception as e: - raise e # Re-raise other potential errors + raise e diff --git a/scrapegraphai/utils/convert_to_json.py b/scrapegraphai/utils/convert_to_json.py index 45b1ea55..4e1711f1 100644 --- a/scrapegraphai/utils/convert_to_json.py +++ b/scrapegraphai/utils/convert_to_json.py @@ -28,15 +28,15 @@ def convert_to_json(data: dict, filename: str, position: str = None) -> None: Saves a JSON file named 'output.json' at '/path/to/save'. Notes: - This function automatically ensures the directory exists before attempting to write the file. If the directory does not exist, it will attempt to create it. + This function automatically ensures the directory exists before + attempting to write the file. + If the directory does not exist, it will attempt to create it. """ if ".json" in filename: filename = filename.replace(".json", "") # Remove .json extension - # Get the directory of the caller script if position is None: - # Get directory of the main script caller_dir = os.path.dirname(os.path.abspath(sys.argv[0])) position = caller_dir diff --git a/scrapegraphai/utils/convert_to_md.py b/scrapegraphai/utils/convert_to_md.py index 123f3457..ff0bbbd7 100644 --- a/scrapegraphai/utils/convert_to_md.py +++ b/scrapegraphai/utils/convert_to_md.py @@ -18,7 +18,8 @@ def convert_to_md(html: str, url: str = None) -> str:

Example: >>> convert_to_md("<html><body><p>This is a paragraph.</p> <h1>This is a heading.</h1></body></html>
") 'This is a paragraph.\n\n# This is a heading.' - Note: All the styles and links are ignored during the conversion. """ + Note: All the styles and links are ignored during the conversion. + """ h = html2text.HTML2Text() h.ignore_links = False diff --git a/scrapegraphai/utils/logging.py b/scrapegraphai/utils/logging.py index 335bcbf1..44f40aff 100644 --- a/scrapegraphai/utils/logging.py +++ b/scrapegraphai/utils/logging.py @@ -48,7 +48,6 @@ def _set_library_root_logger() -> None: DEFAULT_HANDLER = logging.StreamHandler() # sys.stderr as stream - # https://github.com/pyinstaller/pyinstaller/issues/7334#issuecomment-1357447176 if sys.stderr is None: sys.stderr = open(os.devnull, "w", encoding="utf-8") @@ -66,7 +65,8 @@ def get_logger(name: Optional[str] = None) -> logging.Logger: If no name is provided, the root logger for the library is returned. Args: - name (Optional[str]): The name of the logger. If None, the root logger for the library is returned. + name (Optional[str]): The name of the logger. + If None, the root logger for the library is returned. Returns: logging.Logger: The logger with the specified name. @@ -199,7 +199,8 @@ def warning_once(self, *args, **kwargs): """ Emit a warning log with the same message only once. - This function is added as a method to the logging.Logger class. It emits a warning log with the same message only once, + This function is added as a method to the logging.Logger class. + It emits a warning log with the same message only once, even if it is called multiple times with the same message. Args: diff --git a/scrapegraphai/utils/parse_state_keys.py b/scrapegraphai/utils/parse_state_keys.py index 107397e9..f4bd2ea5 100644 --- a/scrapegraphai/utils/parse_state_keys.py +++ b/scrapegraphai/utils/parse_state_keys.py @@ -31,11 +31,9 @@ def parse_expression(expression, state: dict) -> list: incorrect adjacency of operators, and empty expressions. 
""" - # Check for empty expression if not expression: raise ValueError("Empty expression.") - # Check for adjacent state keys without an operator between them pattern = r'\b(' + '|'.join(re.escape(key) for key in state.keys()) + \ r')(\b\s*\b)(' + '|'.join(re.escape(key) for key in state.keys()) + r')\b' @@ -43,37 +41,29 @@ def parse_expression(expression, state: dict) -> list: raise ValueError( "Adjacent state keys found without an operator between them.") - # Remove spaces expression = expression.replace(" ", "") - # Check for operators with empty adjacent tokens or at the start/end if expression[0] in '&|' or expression[-1] in '&|' or \ '&&' in expression or '||' in expression or \ '&|' in expression or '|&' in expression: raise ValueError("Invalid operator usage.") - # Check for balanced parentheses and valid operator placement open_parentheses = close_parentheses = 0 for i, char in enumerate(expression): if char == '(': open_parentheses += 1 elif char == ')': close_parentheses += 1 - # Check for invalid operator sequences if char in "&|" and i + 1 < len(expression) and expression[i + 1] in "&|": raise ValueError( "Invalid operator placement: operators cannot be adjacent.") - # Check for missing or balanced parentheses if open_parentheses != close_parentheses: raise ValueError("Missing or unbalanced parentheses in expression.") - # Helper function to evaluate an expression without parentheses def evaluate_simple_expression(exp): - # Split the expression by the OR operator and process each segment for or_segment in exp.split('|'): - # Check if all elements in an AND segment are in state and_segment = or_segment.split('&') if all(elem.strip() in state for elem in and_segment): return [elem.strip() for elem in and_segment if elem.strip() in state] @@ -85,9 +75,7 @@ def evaluate_expression(expression): start = expression.rfind('(') end = expression.find(')', start) sub_exp = expression[start + 1:end] - # Replace the evaluated part with a placeholder and then evaluate it sub_result = evaluate_simple_expression(sub_exp) - # For simplicity in handling, join sub-results with OR to reprocess them later expression = expression[:start] + \ '|'.join(sub_result) + expression[end+1:] return evaluate_simple_expression(expression) @@ -97,7 +85,6 @@ def evaluate_expression(expression): if not temp_result: raise ValueError("No state keys matched the expression.") - # Remove redundant state keys from the result, without changing their order final_result = [] for key in temp_result: if key not in final_result: diff --git a/scrapegraphai/utils/sys_dynamic_import.py b/scrapegraphai/utils/sys_dynamic_import.py index 8905ed5f..14910b3f 100644 --- a/scrapegraphai/utils/sys_dynamic_import.py +++ b/scrapegraphai/utils/sys_dynamic_import.py @@ -6,7 +6,7 @@ import sys import typing -import importlib.util # noqa: F401 +import importlib.util if typing.TYPE_CHECKING: import types @@ -36,7 +36,6 @@ def srcfile_import(modpath: str, modname: str) -> "types.ModuleType": module = importlib.util.module_from_spec(spec) - # adds the module to the global scope sys.modules[modname] = module spec.loader.exec_module(module) @@ -56,7 +55,7 @@ def dynamic_import(modname: str, message: str = "") -> None: """ if modname not in sys.modules: try: - import importlib # noqa: F401 + import importlib module = importlib.import_module(modname) sys.modules[modname] = module