diff --git a/CHANGELOG.md b/CHANGELOG.md
index 31738a95..c7edf62f 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -22,11 +22,15 @@
## [1.24.1](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.24.0...v1.24.1) (2024-09-26)
+
### Bug Fixes
* script creator multi ([9905be8](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/9905be8a37dc1ff4b90fe9b8be987887253be8bd))
## [1.24.0](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.23.1...v1.24.0) (2024-09-26)
+
+### Features
+
+* integration with html_mode ([f87ffa1](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/f87ffa1d8db32b38c47d9f5aa2ae88f1d7978a04))
+
+## [1.22.0-beta.5](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.22.0-beta.4...v1.22.0-beta.5) (2024-09-27)
+
### Features
@@ -51,6 +55,14 @@
* **release:** 1.22.0-beta.1 [skip ci] ([f42a95f](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/f42a95faa05de39bd9cfc05e377d4b3da372e482))
* **release:** 1.22.0-beta.2 [skip ci] ([431c09f](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/431c09f551ac28581674c6061f055fde0350ed4c))
* **release:** 1.22.0-beta.3 [skip ci] ([e5ac020](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/e5ac0205d1e04a8b31e86166c3673915b70fd1e3))
+* add reasoning integration ([b2822f6](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/b2822f620a610e61d295cbf4b670aa08fde9de24))
+
+## [1.22.0-beta.4](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.22.0-beta.3...v1.22.0-beta.4) (2024-09-27)
+
+
+### Features
+
+* add html_mode to smart_scraper ([bdcffd6](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/bdcffd6360237b27797546a198ceece55ce4bc81))
## [1.22.0-beta.3](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.22.0-beta.2...v1.22.0-beta.3) (2024-09-25)
diff --git a/README.md b/README.md
index d2539518..94beb617 100644
--- a/README.md
+++ b/README.md
@@ -38,10 +38,9 @@ Additional dependencies can be added while installing the library:
- More Language Models: additional language models are installed, such as Fireworks, Groq, Anthropic, Hugging Face, and Nvidia AI Endpoints.
-
-This group allows you to use additional language models like Fireworks, Groq, Anthropic, Together AI, Hugging Face, and Nvidia AI Endpoints.
-```bash
-pip install scrapegraphai[other-language-models]
+ This group allows you to use additional language models like Fireworks, Groq, Anthropic, Together AI, Hugging Face, and Nvidia AI Endpoints.
+ ```bash
+ pip install scrapegraphai[other-language-models]
```
- Semantic Options: this group includes tools for advanced semantic processing, such as Graphviz.
@@ -55,23 +54,9 @@ pip install scrapegraphai[other-language-models]
pip install scrapegraphai[more-browser-options]
```
-- faiss Options: this group includes faiss integration
-
- ```bash
- pip install scrapegraphai[faiss-cpu]
- ```
-
-
-### Installing "More Browser Options"
-
-This group includes an ocr scraper for websites
-```bash
-pip install scrapegraphai[screenshot_scraper]
-```
-
## 💻 Usage
There are multiple standard scraping pipelines that can be used to extract information from a website (or local file).
diff --git a/examples/anthropic/csv_scraper_haiku.py b/examples/anthropic/csv_scraper_anthropic.py
similarity index 100%
rename from examples/anthropic/csv_scraper_haiku.py
rename to examples/anthropic/csv_scraper_anthropic.py
diff --git a/examples/anthropic/csv_scraper_graph_multi_haiku.py b/examples/anthropic/csv_scraper_graph_multi_anthropic.py
similarity index 100%
rename from examples/anthropic/csv_scraper_graph_multi_haiku.py
rename to examples/anthropic/csv_scraper_graph_multi_anthropic.py
diff --git a/examples/anthropic/custom_graph_haiku.py b/examples/anthropic/custom_graph_anthropic.py
similarity index 100%
rename from examples/anthropic/custom_graph_haiku.py
rename to examples/anthropic/custom_graph_anthropic.py
diff --git a/examples/anthropic/depth_search_graph_anthropic.py b/examples/anthropic/depth_search_graph_anthropic.py
new file mode 100644
index 00000000..8cac7bea
--- /dev/null
+++ b/examples/anthropic/depth_search_graph_anthropic.py
@@ -0,0 +1,28 @@
+"""
+depth_search_graph_anthropic example
+"""
+import os
+from dotenv import load_dotenv
+from scrapegraphai.graphs import DepthSearchGraph
+
+load_dotenv()
+
+graph_config = {
+ "llm": {
+ "api_key": os.getenv("ANTHROPIC_API_KEY"),
+        "model": "anthropic/claude-3-haiku-20240307",
+ },
+ "verbose": True,
+ "headless": False,
+ "depth": 2,
+ "only_inside_links": False,
+}
+
+search_graph = DepthSearchGraph(
+ prompt="List me all the projects with their description",
+ source="https://perinim.github.io",
+ config=graph_config
+)
+
+result = search_graph.run()
+print(result)
diff --git a/examples/anthropic/json_scraper_haiku.py b/examples/anthropic/json_scraper_anthropic.py
similarity index 100%
rename from examples/anthropic/json_scraper_haiku.py
rename to examples/anthropic/json_scraper_anthropic.py
diff --git a/examples/anthropic/json_scraper_multi_haiku.py b/examples/anthropic/json_scraper_multi_anthropic.py
similarity index 100%
rename from examples/anthropic/json_scraper_multi_haiku.py
rename to examples/anthropic/json_scraper_multi_anthropic.py
diff --git a/examples/anthropic/pdf_scraper_graph_haiku.py b/examples/anthropic/pdf_scraper_graph_anthropic.py
similarity index 100%
rename from examples/anthropic/pdf_scraper_graph_haiku.py
rename to examples/anthropic/pdf_scraper_graph_anthropic.py
diff --git a/examples/anthropic/pdf_scraper_multi_haiku.py b/examples/anthropic/pdf_scraper_multi_anthropic.py
similarity index 100%
rename from examples/anthropic/pdf_scraper_multi_haiku.py
rename to examples/anthropic/pdf_scraper_multi_anthropic.py
diff --git a/examples/anthropic/rate_limit_haiku.py b/examples/anthropic/rate_limit_anthropic.py
similarity index 100%
rename from examples/anthropic/rate_limit_haiku.py
rename to examples/anthropic/rate_limit_anthropic.py
diff --git a/examples/anthropic/scrape_plain_text_haiku.py b/examples/anthropic/scrape_plain_text_anthropic.py
similarity index 100%
rename from examples/anthropic/scrape_plain_text_haiku.py
rename to examples/anthropic/scrape_plain_text_anthropic.py
diff --git a/examples/anthropic/script_generator_haiku.py b/examples/anthropic/script_generator_anthropic.py
similarity index 100%
rename from examples/anthropic/script_generator_haiku.py
rename to examples/anthropic/script_generator_anthropic.py
diff --git a/examples/anthropic/script_multi_generator_haiku.py b/examples/anthropic/script_multi_generator_anthropic.py
similarity index 100%
rename from examples/anthropic/script_multi_generator_haiku.py
rename to examples/anthropic/script_multi_generator_anthropic.py
diff --git a/examples/anthropic/search_graph_haiku.py b/examples/anthropic/search_graph_anthropic.py
similarity index 100%
rename from examples/anthropic/search_graph_haiku.py
rename to examples/anthropic/search_graph_anthropic.py
diff --git a/examples/anthropic/search_graph_schema_haiku.py b/examples/anthropic/search_graph_schema_anthropic.py
similarity index 100%
rename from examples/anthropic/search_graph_schema_haiku.py
rename to examples/anthropic/search_graph_schema_anthropic.py
diff --git a/examples/anthropic/search_link_graph_haiku.py b/examples/anthropic/search_link_graph_anthropic.py
similarity index 100%
rename from examples/anthropic/search_link_graph_haiku.py
rename to examples/anthropic/search_link_graph_anthropic.py
diff --git a/examples/anthropic/smart_scraper_haiku.py b/examples/anthropic/smart_scraper_anthropic.py
similarity index 100%
rename from examples/anthropic/smart_scraper_haiku.py
rename to examples/anthropic/smart_scraper_anthropic.py
diff --git a/examples/anthropic/smart_scraper_multi_haiku.py b/examples/anthropic/smart_scraper_multi_anthropic.py
similarity index 100%
rename from examples/anthropic/smart_scraper_multi_haiku.py
rename to examples/anthropic/smart_scraper_multi_anthropic.py
diff --git a/examples/anthropic/smart_scraper_multi_concat_haiku.py b/examples/anthropic/smart_scraper_multi_concat_anthropic.py
similarity index 100%
rename from examples/anthropic/smart_scraper_multi_concat_haiku.py
rename to examples/anthropic/smart_scraper_multi_concat_anthropic.py
diff --git a/examples/anthropic/smart_scraper_schema_haiku.py b/examples/anthropic/smart_scraper_schema_anthropic.py
similarity index 100%
rename from examples/anthropic/smart_scraper_schema_haiku.py
rename to examples/anthropic/smart_scraper_schema_anthropic.py
diff --git a/examples/anthropic/xml_scraper_haiku.py b/examples/anthropic/xml_scraper_anthropic.py
similarity index 100%
rename from examples/anthropic/xml_scraper_haiku.py
rename to examples/anthropic/xml_scraper_anthropic.py
diff --git a/examples/anthropic/xml_scraper_graph_multi_haiku.py b/examples/anthropic/xml_scraper_graph_multi_anthropic.py
similarity index 100%
rename from examples/anthropic/xml_scraper_graph_multi_haiku.py
rename to examples/anthropic/xml_scraper_graph_multi_anthropic.py
diff --git a/examples/azure/code_generator_graph_azure.py b/examples/azure/code_generator_graph_azure.py
index ad48933f..4bad1b0d 100644
--- a/examples/azure/code_generator_graph_azure.py
+++ b/examples/azure/code_generator_graph_azure.py
@@ -28,7 +28,7 @@ class Projects(BaseModel):
graph_config = {
"llm": {
"api_key": os.environ["AZURE_OPENAI_KEY"],
- "model": "azure_openai/gpt-3.5-turbo",
+ "model": "azure_openai/gpt-4o"
},
"verbose": True,
"headless": False,
diff --git a/examples/azure/csv_scraper_azure.py b/examples/azure/csv_scraper_azure.py
index efc99758..272527b3 100644
--- a/examples/azure/csv_scraper_azure.py
+++ b/examples/azure/csv_scraper_azure.py
@@ -25,7 +25,7 @@
graph_config = {
"llm": {
"api_key": os.environ["AZURE_OPENAI_KEY"],
- "model": "azure_openai/gpt-3.5-turbo",
+ "model": "azure_openai/gpt-4o"
},
"verbose": True,
"headless": False
diff --git a/examples/azure/csv_scraper_graph_multi_azure.py b/examples/azure/csv_scraper_graph_multi_azure.py
index d9160c40..cccbf88e 100644
--- a/examples/azure/csv_scraper_graph_multi_azure.py
+++ b/examples/azure/csv_scraper_graph_multi_azure.py
@@ -25,7 +25,7 @@
graph_config = {
"llm": {
"api_key": os.environ["AZURE_OPENAI_KEY"],
- "model": "azure_openai/gpt-3.5-turbo",
+ "model": "azure_openai/gpt-4o"
},
"verbose": True,
"headless": False
diff --git a/examples/azure/depth_search_graph_azure.py b/examples/azure/depth_search_graph_azure.py
new file mode 100644
index 00000000..88b2cd1b
--- /dev/null
+++ b/examples/azure/depth_search_graph_azure.py
@@ -0,0 +1,30 @@
+"""
+depth_search_graph_azure example
+"""
+import os
+from dotenv import load_dotenv
+from scrapegraphai.graphs import DepthSearchGraph
+
+load_dotenv()
+
+graph_config = {
+ "llm": {
+ "api_key": os.environ["AZURE_OPENAI_KEY"],
+ "model": "azure_openai/gpt-4o",
+ },
+ "verbose": True,
+ "headless": False,
+ "depth": 2,
+ "only_inside_links": False,
+}
+
+search_graph = DepthSearchGraph(
+ prompt="List me all the projects with their description",
+ source="https://perinim.github.io",
+ config=graph_config
+)
+
+result = search_graph.run()
+print(result)
diff --git a/examples/azure/json_scraper_azure.py b/examples/azure/json_scraper_azure.py
index 483544fe..5ba54f7b 100644
--- a/examples/azure/json_scraper_azure.py
+++ b/examples/azure/json_scraper_azure.py
@@ -23,7 +23,7 @@
graph_config = {
"llm": {
"api_key": os.environ["AZURE_OPENAI_KEY"],
- "model": "azure_openai/gpt-3.5-turbo",
+ "model": "azure_openai/gpt-4o"
},
"verbose": True,
"headless": False
diff --git a/examples/azure/json_scraper_multi_azure.py b/examples/azure/json_scraper_multi_azure.py
index ecf97280..befc4e84 100644
--- a/examples/azure/json_scraper_multi_azure.py
+++ b/examples/azure/json_scraper_multi_azure.py
@@ -12,7 +12,7 @@
graph_config = {
"llm": {
"api_key": os.environ["AZURE_OPENAI_KEY"],
- "model": "azure_openai/gpt-3.5-turbo",
+ "model": "azure_openai/gpt-4o"
},
"verbose": True,
"headless": False
diff --git a/examples/azure/pdf_scraper_azure.py b/examples/azure/pdf_scraper_azure.py
index f8926489..02b3b7e6 100644
--- a/examples/azure/pdf_scraper_azure.py
+++ b/examples/azure/pdf_scraper_azure.py
@@ -10,7 +10,7 @@
graph_config = {
"llm": {
"api_key": os.environ["AZURE_OPENAI_KEY"],
- "model": "azure_openai/gpt-3.5-turbo",
+ "model": "azure_openai/gpt-4o"
},
"verbose": True,
"headless": False
diff --git a/examples/azure/rate_limit_azure.py b/examples/azure/rate_limit_azure.py
index cfd05f1f..892996c7 100644
--- a/examples/azure/rate_limit_azure.py
+++ b/examples/azure/rate_limit_azure.py
@@ -26,7 +26,7 @@
graph_config = {
"llm": {
"api_key": os.environ["AZURE_OPENAI_KEY"],
- "model": "azure_openai/gpt-3.5-turbo",
+ "model": "azure_openai/gpt-4o",
"rate_limit": {
"requests_per_second": 1
},
diff --git a/examples/azure/scrape_plain_text_azure.py b/examples/azure/scrape_plain_text_azure.py
index ef0d7d1c..9ea18d07 100644
--- a/examples/azure/scrape_plain_text_azure.py
+++ b/examples/azure/scrape_plain_text_azure.py
@@ -28,7 +28,7 @@
graph_config = {
"llm": {
"api_key": os.environ["AZURE_OPENAI_KEY"],
- "model": "azure_openai/gpt-3.5-turbo",
+ "model": "azure_openai/gpt-4o"
},
"verbose": True,
"headless": False
diff --git a/examples/azure/script_generator_azure.py b/examples/azure/script_generator_azure.py
index 12f5d6be..b2bbb220 100644
--- a/examples/azure/script_generator_azure.py
+++ b/examples/azure/script_generator_azure.py
@@ -15,7 +15,7 @@
graph_config = {
"llm": {
"api_key": os.environ["AZURE_OPENAI_KEY"],
- "model": "azure_openai/gpt-3.5-turbo",
+ "model": "azure_openai/gpt-4o"
},
"verbose": True,
"headless": False
diff --git a/examples/azure/script_multi_generator_azure.py b/examples/azure/script_multi_generator_azure.py
index a1bb8dbd..8c52cb95 100644
--- a/examples/azure/script_multi_generator_azure.py
+++ b/examples/azure/script_multi_generator_azure.py
@@ -16,7 +16,7 @@
graph_config = {
"llm": {
"api_key": os.environ["AZURE_OPENAI_KEY"],
- "model": "azure_openai/gpt-3.5-turbo",
+ "model": "azure_openai/gpt-4o"
},
"verbose": True,
"headless": False
diff --git a/examples/azure/search_graph_azure.py b/examples/azure/search_graph_azure.py
index 13547e06..949f134c 100644
--- a/examples/azure/search_graph_azure.py
+++ b/examples/azure/search_graph_azure.py
@@ -22,7 +22,7 @@
graph_config = {
"llm": {
"api_key": os.environ["AZURE_OPENAI_KEY"],
- "model": "azure_openai/gpt-3.5-turbo",
+ "model": "azure_openai/gpt-4o"
},
"verbose": True,
"headless": False
diff --git a/examples/azure/search_graph_schema_azure.py b/examples/azure/search_graph_schema_azure.py
index 629c92ab..e8c10093 100644
--- a/examples/azure/search_graph_schema_azure.py
+++ b/examples/azure/search_graph_schema_azure.py
@@ -30,7 +30,7 @@ class Dishes(BaseModel):
graph_config = {
"llm": {
"api_key": os.environ["AZURE_OPENAI_KEY"],
- "model": "azure_openai/gpt-3.5-turbo",
+ "model": "azure_openai/gpt-4o"
},
"verbose": True,
"headless": False
diff --git a/examples/azure/search_link_graph_azure.py b/examples/azure/search_link_graph_azure.py
index aec2297b..42ed07ad 100644
--- a/examples/azure/search_link_graph_azure.py
+++ b/examples/azure/search_link_graph_azure.py
@@ -15,7 +15,7 @@
graph_config = {
"llm": {
"api_key": os.environ["AZURE_OPENAI_KEY"],
- "model": "azure_openai/gpt-3.5-turbo",
+ "model": "azure_openai/gpt-4o"
},
"verbose": True,
"headless": False
diff --git a/examples/azure/smart_scraper_azure.py b/examples/azure/smart_scraper_azure.py
index bf3bc8d7..933dc5b0 100644
--- a/examples/azure/smart_scraper_azure.py
+++ b/examples/azure/smart_scraper_azure.py
@@ -26,7 +26,7 @@
graph_config = {
"llm": {
"api_key": os.environ["AZURE_OPENAI_KEY"],
- "model": "azure_openai/gpt-3.5-turbo",
+ "model": "azure_openai/gpt-4o"
},
"verbose": True,
"headless": False
diff --git a/examples/azure/smart_scraper_multi_azure.py b/examples/azure/smart_scraper_multi_azure.py
index f1f3451e..e066eaf1 100644
--- a/examples/azure/smart_scraper_multi_azure.py
+++ b/examples/azure/smart_scraper_multi_azure.py
@@ -14,7 +14,7 @@
graph_config = {
"llm": {
"api_key": os.environ["AZURE_OPENAI_KEY"],
- "model": "azure_openai/gpt-3.5-turbo",
+ "model": "azure_openai/gpt-4o"
},
"verbose": True,
"headless": False
diff --git a/examples/azure/smart_scraper_multi_concat_azure.py b/examples/azure/smart_scraper_multi_concat_azure.py
index e3870a4c..06d08b9a 100644
--- a/examples/azure/smart_scraper_multi_concat_azure.py
+++ b/examples/azure/smart_scraper_multi_concat_azure.py
@@ -15,7 +15,7 @@
graph_config = {
"llm": {
"api_key": os.environ["AZURE_OPENAI_KEY"],
- "model": "azure_openai/gpt-3.5-turbo",
+ "model": "azure_openai/gpt-4o"
},
"verbose": True,
"headless": False
diff --git a/examples/azure/smart_scraper_schema_azure.py b/examples/azure/smart_scraper_schema_azure.py
index d0816bf5..d2766ecb 100644
--- a/examples/azure/smart_scraper_schema_azure.py
+++ b/examples/azure/smart_scraper_schema_azure.py
@@ -29,7 +29,7 @@ class Projects(BaseModel):
graph_config = {
"llm": {
"api_key": os.environ["AZURE_OPENAI_KEY"],
- "model": "azure_openai/gpt-3.5-turbo",
+ "model": "azure_openai/gpt-4o"
},
"verbose": True,
"headless": False
diff --git a/examples/azure/xml_scraper_azure.py b/examples/azure/xml_scraper_azure.py
index ecfb8743..1c40f3e7 100644
--- a/examples/azure/xml_scraper_azure.py
+++ b/examples/azure/xml_scraper_azure.py
@@ -24,7 +24,7 @@
graph_config = {
"llm": {
"api_key": os.environ["AZURE_OPENAI_KEY"],
- "model": "azure_openai/gpt-3.5-turbo",
+ "model": "azure_openai/gpt-4o"
},
"verbose": True,
"headless": False
diff --git a/examples/azure/xml_scraper_graph_multi_azure.py b/examples/azure/xml_scraper_graph_multi_azure.py
index db4db108..972eb823 100644
--- a/examples/azure/xml_scraper_graph_multi_azure.py
+++ b/examples/azure/xml_scraper_graph_multi_azure.py
@@ -25,7 +25,7 @@
graph_config = {
"llm": {
"api_key": os.environ["AZURE_OPENAI_KEY"],
- "model": "azure_openai/gpt-3.5-turbo",
+ "model": "azure_openai/gpt-4o",
},
"verbose": True,
"headless": False
diff --git a/examples/bedrock/depth_search_graph_bedrock.py b/examples/bedrock/depth_search_graph_bedrock.py
new file mode 100644
index 00000000..2ab88291
--- /dev/null
+++ b/examples/bedrock/depth_search_graph_bedrock.py
@@ -0,0 +1,31 @@
+"""
+depth_search_graph_bedrock example
+"""
+import os
+from dotenv import load_dotenv
+from scrapegraphai.graphs import DepthSearchGraph
+
+load_dotenv()
+
+graph_config = {
+ "llm": {
+ "client": "client_name",
+ "model": "bedrock/anthropic.claude-3-sonnet-20240229-v1:0",
+ "temperature": 0.0
+ },
+ "verbose": True,
+ "headless": False,
+ "depth": 2,
+ "only_inside_links": False,
+}
+
+search_graph = DepthSearchGraph(
+ prompt="List me all the projects with their description",
+ source="https://perinim.github.io",
+ config=graph_config
+)
+
+result = search_graph.run()
+print(result)
diff --git a/examples/deepseek/depth_search_graph_deepseek.py b/examples/deepseek/depth_search_graph_deepseek.py
new file mode 100644
index 00000000..064690a5
--- /dev/null
+++ b/examples/deepseek/depth_search_graph_deepseek.py
@@ -0,0 +1,30 @@
+"""
+depth_search_graph_deepseek example
+"""
+import os
+from dotenv import load_dotenv
+from scrapegraphai.graphs import DepthSearchGraph
+
+load_dotenv()
+
+deepseek_key = os.getenv("DEEPSEEK_APIKEY")
+
+graph_config = {
+ "llm": {
+ "model": "deepseek/deepseek-chat",
+ "api_key": deepseek_key,
+ },
+ "verbose": True,
+ "headless": False,
+ "depth": 2,
+ "only_inside_links": False,
+}
+
+search_graph = DepthSearchGraph(
+ prompt="List me all the projects with their description",
+ source="https://perinim.github.io",
+ config=graph_config
+)
+
+result = search_graph.run()
+print(result)
diff --git a/examples/ernie/custom_graph_ernie.py b/examples/ernie/custom_graph_ernie.py
index 57d422e5..a3082cf7 100644
--- a/examples/ernie/custom_graph_ernie.py
+++ b/examples/ernie/custom_graph_ernie.py
@@ -14,7 +14,7 @@
# Define the configuration for the graph
# ************************************************
-graph_config = {
+graph_config = {
"llm": {
"model": "ernie/ernie-bot-turbo",
"ernie_client_id": "",
diff --git a/examples/ernie/depth_search_graph_ernie.py b/examples/ernie/depth_search_graph_ernie.py
new file mode 100644
index 00000000..99470d8d
--- /dev/null
+++ b/examples/ernie/depth_search_graph_ernie.py
@@ -0,0 +1,26 @@
+"""
+depth_search_graph_ernie example
+"""
+from scrapegraphai.graphs import DepthSearchGraph
+
+graph_config = {
+ "llm": {
+ "model": "ernie/ernie-bot-turbo",
+ "ernie_client_id": "",
+ "ernie_client_secret": "",
+ "temperature": 0.1
+ },
+ "verbose": True,
+ "headless": False,
+ "depth": 2,
+ "only_inside_links": False,
+}
+
+search_graph = DepthSearchGraph(
+ prompt="List me all the projects with their description",
+ source="https://perinim.github.io",
+ config=graph_config
+)
+
+result = search_graph.run()
+print(result)
diff --git a/examples/extras/html_mode.py b/examples/extras/html_mode.py
new file mode 100644
index 00000000..6e2670a0
--- /dev/null
+++ b/examples/extras/html_mode.py
@@ -0,0 +1,49 @@
+"""
+Basic example of scraping pipeline using SmartScraper
+By default smart scraper converts in md format the
+code. If you want to just use the original code, you have
+to specify in the confi
+"""
+
+import os
+import json
+from dotenv import load_dotenv
+from scrapegraphai.graphs import SmartScraperGraph
+from scrapegraphai.utils import prettify_exec_info
+
+load_dotenv()
+
+# ************************************************
+# Define the configuration for the graph
+# ************************************************
+
+
+graph_config = {
+ "llm": {
+ "api_key": os.getenv("OPENAI_API_KEY"),
+ "model": "openai/gpt-4o",
+ },
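+    # html_mode=True sends the raw HTML to the LLM, skipping the Markdown conversion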
+ "html_mode": True,
+ "verbose": True,
+ "headless": False,
+}
+
+# ************************************************
+# Create the SmartScraperGraph instance and run it
+# ************************************************
+
+smart_scraper_graph = SmartScraperGraph(
+ prompt="List me what does the company do, the name and a contact email.",
+ source="https://scrapegraphai.com/",
+ config=graph_config
+)
+
+result = smart_scraper_graph.run()
+print(json.dumps(result, indent=4))
+
+# ************************************************
+# Get graph execution info
+# ************************************************
+
+graph_exec_info = smart_scraper_graph.get_execution_info()
+print(prettify_exec_info(graph_exec_info))
diff --git a/examples/extras/reasoning.py b/examples/extras/reasoning.py
new file mode 100644
index 00000000..80e57faa
--- /dev/null
+++ b/examples/extras/reasoning.py
@@ -0,0 +1,46 @@
+"""
+Basic example of scraping pipeline using SmartScraper
+"""
+
+import os
+import json
+from dotenv import load_dotenv
+from scrapegraphai.graphs import SmartScraperGraph
+from scrapegraphai.utils import prettify_exec_info
+
+load_dotenv()
+
+# ************************************************
+# Define the configuration for the graph
+# ************************************************
+
+
+graph_config = {
+ "llm": {
+ "api_key": os.getenv("OPENAI_API_KEY"),
+ "model": "openai/gpt-4o",
+ },
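+    # reasoning=True adds a ReasoningNode pass before answer generation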
+ "reasoning": True,
+ "verbose": True,
+ "headless": False,
+}
+
+# ************************************************
+# Create the SmartScraperGraph instance and run it
+# ************************************************
+
+smart_scraper_graph = SmartScraperGraph(
+ prompt="List me what does the company do, the name and a contact email.",
+ source="https://scrapegraphai.com/",
+ config=graph_config
+)
+
+result = smart_scraper_graph.run()
+print(json.dumps(result, indent=4))
+
+# ************************************************
+# Get graph execution info
+# ************************************************
+
+graph_exec_info = smart_scraper_graph.get_execution_info()
+print(prettify_exec_info(graph_exec_info))
diff --git a/examples/fireworks/depth_search_graph_fireworks.py b/examples/fireworks/depth_search_graph_fireworks.py
new file mode 100644
index 00000000..f467be9f
--- /dev/null
+++ b/examples/fireworks/depth_search_graph_fireworks.py
@@ -0,0 +1,30 @@
+"""
+depth_search_graph_fireworks example
+"""
+import os
+from dotenv import load_dotenv
+from scrapegraphai.graphs import DepthSearchGraph
+
+load_dotenv()
+
+fireworks_api_key = os.getenv("FIREWORKS_APIKEY")
+
+graph_config = {
+ "llm": {
+ "api_key": fireworks_api_key,
+ "model": "fireworks/accounts/fireworks/models/mixtral-8x7b-instruct"
+ },
+ "verbose": True,
+ "headless": False,
+ "depth": 2,
+ "only_inside_links": False,
+}
+
+search_graph = DepthSearchGraph(
+ prompt="List me all the projects with their description",
+ source="https://perinim.github.io",
+ config=graph_config
+)
+
+result = search_graph.run()
+print(result)
diff --git a/examples/google_genai/depth_search_graph_gemini.py b/examples/google_genai/depth_search_graph_gemini.py
new file mode 100644
index 00000000..956341f4
--- /dev/null
+++ b/examples/google_genai/depth_search_graph_gemini.py
@@ -0,0 +1,30 @@
+"""
+depth_search_graph_gemini example (Google GenAI)
+"""
+import os
+from dotenv import load_dotenv
+from scrapegraphai.graphs import DepthSearchGraph
+
+load_dotenv()
+
+gemini_key = os.getenv("GOOGLE_APIKEY")
+
+graph_config = {
+ "llm": {
+ "api_key": gemini_key,
+ "model": "google_genai/gemini-pro",
+ },
+ "verbose": True,
+ "headless": False,
+ "depth": 2,
+ "only_inside_links": False,
+}
+
+search_graph = DepthSearchGraph(
+ prompt="List me all the projects with their description",
+ source="https://perinim.github.io",
+ config=graph_config
+)
+
+result = search_graph.run()
+print(result)
diff --git a/examples/google_vertexai/depth_search_graph_gemini.py b/examples/google_vertexai/depth_search_graph_gemini.py
new file mode 100644
index 00000000..13bba630
--- /dev/null
+++ b/examples/google_vertexai/depth_search_graph_gemini.py
@@ -0,0 +1,30 @@
+"""
+depth_search_graph_gemini example (Google Vertex AI)
+"""
+import os
+from dotenv import load_dotenv
+from scrapegraphai.graphs import DepthSearchGraph
+
+load_dotenv()
+
+gemini_key = os.getenv("GOOGLE_APIKEY")
+
+graph_config = {
+ "llm": {
+ "api_key": gemini_key,
+ "model": "google_vertexai/gemini-1.5-pro",
+ },
+ "verbose": True,
+ "headless": False,
+ "depth": 2,
+ "only_inside_links": False,
+}
+
+search_graph = DepthSearchGraph(
+ prompt="List me all the projects with their description",
+ source="https://perinim.github.io",
+ config=graph_config
+)
+
+result = search_graph.run()
+print(result)
diff --git a/examples/groq/depth_search_graph_groq.py b/examples/groq/depth_search_graph_groq.py
new file mode 100644
index 00000000..2d1ed8b1
--- /dev/null
+++ b/examples/groq/depth_search_graph_groq.py
@@ -0,0 +1,31 @@
+"""
+depth_search_graph_groq example
+"""
+import os
+from dotenv import load_dotenv
+from scrapegraphai.graphs import DepthSearchGraph
+
+load_dotenv()
+
+groq_key = os.getenv("GROQ_APIKEY")
+
+graph_config = {
+ "llm": {
+ "model": "groq/gemma-7b-it",
+ "api_key": groq_key,
+ "temperature": 0
+ },
+ "verbose": True,
+ "headless": False,
+ "depth": 2,
+ "only_inside_links": False,
+}
+
+search_graph = DepthSearchGraph(
+ prompt="List me all the projects with their description",
+ source="https://perinim.github.io",
+ config=graph_config
+)
+
+result = search_graph.run()
+print(result)
diff --git a/examples/huggingfacehub/custom_graph_huggingfacehub.py b/examples/huggingfacehub/custom_graph_huggingfacehub.py
index cec007b7..06b2f089 100644
--- a/examples/huggingfacehub/custom_graph_huggingfacehub.py
+++ b/examples/huggingfacehub/custom_graph_huggingfacehub.py
@@ -4,7 +4,6 @@
import os
from dotenv import load_dotenv
-
from langchain_openai import OpenAIEmbeddings
from langchain_openai import ChatOpenAI
from scrapegraphai.graphs import BaseGraph
diff --git a/examples/huggingfacehub/depth_search_graph_huggingfacehub.py b/examples/huggingfacehub/depth_search_graph_huggingfacehub.py
new file mode 100644
index 00000000..48df3e37
--- /dev/null
+++ b/examples/huggingfacehub/depth_search_graph_huggingfacehub.py
@@ -0,0 +1,38 @@
+"""
+depth_search_graph_huggingfacehub example
+"""
+import os
+from dotenv import load_dotenv
+from scrapegraphai.graphs import DepthSearchGraph
+from langchain_community.llms import HuggingFaceEndpoint
+from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings
+
+load_dotenv()
+HUGGINGFACEHUB_API_TOKEN = os.getenv('HUGGINGFACEHUB_API_TOKEN')
+
+repo_id = "mistralai/Mistral-7B-Instruct-v0.2"
+
+llm_model_instance = HuggingFaceEndpoint(
+ repo_id=repo_id, max_length=128, temperature=0.5, token=HUGGINGFACEHUB_API_TOKEN
+)
+
+embedder_model_instance = HuggingFaceInferenceAPIEmbeddings(
+ api_key=HUGGINGFACEHUB_API_TOKEN, model_name="sentence-transformers/all-MiniLM-l6-v2"
+)
+
+graph_config = {
+ "llm": {"model_instance": llm_model_instance},
+ "verbose": True,
+ "headless": False,
+ "depth": 2,
+ "only_inside_links": False,
+}
+
+search_graph = DepthSearchGraph(
+ prompt="List me all the projects with their description",
+ source="https://perinim.github.io",
+ config=graph_config
+)
+
+result = search_graph.run()
+print(result)
diff --git a/examples/local_models/depth_search_graph_ollama.py b/examples/local_models/depth_search_graph_ollama.py
new file mode 100644
index 00000000..d0f960b5
--- /dev/null
+++ b/examples/local_models/depth_search_graph_ollama.py
@@ -0,0 +1,32 @@
+"""
+depth_search_graph_ollama example
+"""
+import os
+from dotenv import load_dotenv
+from scrapegraphai.graphs import DepthSearchGraph
+
+load_dotenv()
+
+graph_config = {
+ "llm": {
+ "model": "ollama/llama3.1",
+ "temperature": 0,
+ "format": "json", # Ollama needs the format to be specified explicitly
+ # "base_url": "http://localhost:11434", # set ollama URL arbitrarily
+ },
+ "verbose": True,
+ "headless": False,
+ "depth": 2,
+ "only_inside_links": False,
+}
+
+search_graph = DepthSearchGraph(
+ prompt="List me all the projects with their description",
+ source="https://perinim.github.io",
+ config=graph_config
+)
+
+result = search_graph.run()
+print(result)
diff --git a/examples/local_models/json_scraper_multi_ollama.py b/examples/local_models/json_scraper_multi_ollama.py
index 6e9c3da3..e80bf5ec 100644
--- a/examples/local_models/json_scraper_multi_ollama.py
+++ b/examples/local_models/json_scraper_multi_ollama.py
@@ -15,6 +15,7 @@
"verbose": True,
"headless": False,
}
+
FILE_NAME = "inputs/example.json"
curr_dir = os.path.dirname(os.path.realpath(__file__))
file_path = os.path.join(curr_dir, FILE_NAME)
diff --git a/examples/local_models/smart_scraper_schema_ollama.py b/examples/local_models/smart_scraper_schema_ollama.py
index 35503bd7..5a5b3cea 100644
--- a/examples/local_models/smart_scraper_schema_ollama.py
+++ b/examples/local_models/smart_scraper_schema_ollama.py
@@ -24,7 +24,6 @@ class Projects(BaseModel):
"format": "json", # Ollama needs the format to be specified explicitly
# "base_url": "http://localhost:11434", # set ollama URL arbitrarily
},
-
"verbose": True,
"headless": False
}
diff --git a/examples/mistral/depth_search_graph_mistral.py b/examples/mistral/depth_search_graph_mistral.py
new file mode 100644
index 00000000..ae18ffba
--- /dev/null
+++ b/examples/mistral/depth_search_graph_mistral.py
@@ -0,0 +1,30 @@
+"""
+depth_search_graph_mistral example
+"""
+import os
+from dotenv import load_dotenv
+from scrapegraphai.graphs import DepthSearchGraph
+
+load_dotenv()
+
+mistral_key = os.getenv("MISTRAL_API_KEY")
+
+graph_config = {
+ "llm": {
+ "api_key": mistral_key,
+ "model": "mistralai/open-mistral-nemo",
+ },
+ "verbose": True,
+ "headless": False,
+ "depth": 2,
+ "only_inside_links": False,
+}
+
+search_graph = DepthSearchGraph(
+ prompt="List me all the projects with their description",
+ source="https://perinim.github.io",
+ config=graph_config
+)
+
+result = search_graph.run()
+print(result)
diff --git a/examples/nemotron/depth_search_graph_nemotron.py b/examples/nemotron/depth_search_graph_nemotron.py
new file mode 100644
index 00000000..edd80463
--- /dev/null
+++ b/examples/nemotron/depth_search_graph_nemotron.py
@@ -0,0 +1,30 @@
+"""
+depth_search_graph_nemotron example
+"""
+import os
+from dotenv import load_dotenv
+from scrapegraphai.graphs import DepthSearchGraph
+
+load_dotenv()
+
+graph_config = {
+ "llm": {
+ "api_key": os.getenv("NEMOTRON_KEY"),
+        "model": "nvidia/llama-3.1-nemotron-70b-instruct",  # assumed Nemotron model id; the original listed a Claude model here
+ },
+ "verbose": True,
+ "headless": False,
+ "depth": 2,
+ "only_inside_links": False,
+}
+
+search_graph = DepthSearchGraph(
+ prompt="List me all the projects with their description",
+ source="https://perinim.github.io",
+ config=graph_config
+)
+
+result = search_graph.run()
+print(result)
diff --git a/examples/oneapi/depth_search_graph_onenapi.py b/examples/oneapi/depth_search_graph_onenapi.py
new file mode 100644
index 00000000..7a2e7f3e
--- /dev/null
+++ b/examples/oneapi/depth_search_graph_onenapi.py
@@ -0,0 +1,31 @@
+"""
+depth_search_graph_oneapi example
+"""
+import os
+from dotenv import load_dotenv
+from scrapegraphai.graphs import DepthSearchGraph
+
+load_dotenv()
+
+graph_config = {
+ "llm": {
+ "api_key": "***************************",
+ "model": "oneapi/qwen-turbo",
+        "base_url": "http://127.0.0.1:3000/v1",  # set the OneAPI URL
+ },
+ "verbose": True,
+ "headless": False,
+ "depth": 2,
+ "only_inside_links": False,
+}
+
+search_graph = DepthSearchGraph(
+ prompt="List me all the projects with their description",
+ source="https://perinim.github.io",
+ config=graph_config
+)
+
+result = search_graph.run()
+print(result)
diff --git a/examples/openai/depth_search_graph_openai.py b/examples/openai/depth_search_graph_openai.py
new file mode 100644
index 00000000..dff07ad4
--- /dev/null
+++ b/examples/openai/depth_search_graph_openai.py
@@ -0,0 +1,30 @@
+"""
+depth_search_graph_openai example
+"""
+import os
+from dotenv import load_dotenv
+from scrapegraphai.graphs import DepthSearchGraph
+
+load_dotenv()
+
+openai_key = os.getenv("OPENAI_APIKEY")
+
+graph_config = {
+ "llm": {
+ "api_key": openai_key,
+ "model": "openai/gpt-4o-mini",
+ },
+ "verbose": True,
+ "headless": False,
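+    # depth: how many levels of hyperlinks to follow from the source page
+    # only_inside_links: when True, restrict traversal to links on the same site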
+ "depth": 2,
+ "only_inside_links": False,
+}
+
+search_graph = DepthSearchGraph(
+ prompt="List me all the projects with their description",
+ source="https://perinim.github.io",
+ config=graph_config
+)
+
+result = search_graph.run()
+print(result)
diff --git a/examples/together/code_generator_graph_togehter.py b/examples/together/code_generator_graph_togehter.py
new file mode 100644
index 00000000..aefbeba4
--- /dev/null
+++ b/examples/together/code_generator_graph_togehter.py
@@ -0,0 +1,60 @@
+"""
+Basic example of scraping pipeline using Code Generator with schema
+"""
+
+import os
+from typing import List
+from dotenv import load_dotenv
+from pydantic import BaseModel, Field
+from scrapegraphai.graphs import CodeGeneratorGraph
+
+load_dotenv()
+
+# ************************************************
+# Define the output schema for the graph
+# ************************************************
+
+class Project(BaseModel):
+ title: str = Field(description="The title of the project")
+ description: str = Field(description="The description of the project")
+
+class Projects(BaseModel):
+ projects: List[Project]
+
+# ************************************************
+# Define the configuration for the graph
+# ************************************************
+
+together_key = os.getenv("TOGETHER_KEY")
+
+graph_config = {
+ "llm": {
+ "model": "togetherai/meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo",
+ "api_key": together_key,
+ },
+ "verbose": True,
+ "headless": False,
+ "reduction": 2,
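+    # caps for the generate-and-check loop: total attempts and per-phase retries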
+ "max_iterations": {
+ "overall": 10,
+ "syntax": 3,
+ "execution": 3,
+ "validation": 3,
+ "semantic": 3
+ },
+ "output_file_name": "extracted_data.py"
+}
+
+# ************************************************
+# Create the CodeGeneratorGraph instance and run it
+# ************************************************
+
+code_generator_graph = CodeGeneratorGraph(
+ prompt="List me all the projects with their description",
+ source="https://perinim.github.io/projects/",
+ schema=Projects,
+ config=graph_config
+)
+
+result = code_generator_graph.run()
+print(result)
diff --git a/examples/together/depth_search_graph_together.py b/examples/together/depth_search_graph_together.py
new file mode 100644
index 00000000..7a2e7f3e
--- /dev/null
+++ b/examples/together/depth_search_graph_together.py
@@ -0,0 +1,31 @@
+"""
+depth_search_graph_together example
+"""
+import os
+from dotenv import load_dotenv
+from scrapegraphai.graphs import DepthSearchGraph
+
+load_dotenv()
+
+together_key = os.getenv("TOGETHER_KEY")
+
+graph_config = {
+    "llm": {
+        "api_key": together_key,
+        "model": "togetherai/meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo",
+ },
+ "verbose": True,
+ "headless": False,
+ "depth": 2,
+ "only_inside_links": False,
+}
+
+search_graph = DepthSearchGraph(
+ prompt="List me all the projects with their description",
+ source="https://perinim.github.io",
+ config=graph_config
+)
+
+result = search_graph.run()
+print(result)
diff --git a/pyproject.toml b/pyproject.toml
index f0bce7b0..f6033f60 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -3,7 +3,6 @@ name = "scrapegraphai"
version = "1.25.2"
-
description = "A web scraping library based on LangChain which uses LLM and direct graph logic to create scraping pipelines."
authors = [
{ name = "Marco Vinciguerra", email = "mvincig11@gmail.com" },
@@ -31,6 +30,8 @@ dependencies = [
"undetected-playwright>=0.3.0",
"google>=3.0.0",
"langchain-ollama>=0.1.3",
+ "qdrant-client>=1.11.3",
+    "fastembed>=0.3.6",
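+    # qdrant-client and fastembed back the vector store used by the new RAGNode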
"semchunk>=2.2.0",
"transformers>=4.44.2"
]
@@ -100,11 +101,6 @@ screenshot_scraper = [
"pillow>=10.4.0",
]
-# Group 5: Faiss CPU
-faiss-cpu = [
- "faiss-cpu>=1.8.0",
-]
-
[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"
diff --git a/requirements-dev.lock b/requirements-dev.lock
index 1d9d469a..3423cef0 100644
--- a/requirements-dev.lock
+++ b/requirements-dev.lock
@@ -64,6 +64,8 @@ click==8.1.7
# via burr
# via streamlit
# via uvicorn
+coloredlogs==15.0.1
+ # via onnxruntime
contourpy==1.2.1
# via matplotlib
cycler==0.12.1
@@ -84,9 +86,13 @@ fastapi==0.112.0
# via burr
fastapi-pagination==0.12.26
# via burr
+fastembed==0.3.6
+ # via scrapegraphai
filelock==3.15.4
# via huggingface-hub
# via transformers
+flatbuffers==24.3.25
+ # via onnxruntime
fonttools==4.53.1
# via matplotlib
free-proxy==1.1.1
@@ -132,11 +138,19 @@ greenlet==3.0.3
grpcio==1.65.4
# via google-api-core
# via grpcio-status
+ # via grpcio-tools
+ # via qdrant-client
grpcio-status==1.62.3
# via google-api-core
+grpcio-tools==1.62.3
+ # via qdrant-client
h11==0.14.0
# via httpcore
# via uvicorn
+h2==4.1.0
+ # via httpx
+hpack==4.0.0
+ # via h2
html2text==2024.2.26
# via scrapegraphai
httpcore==1.0.5
@@ -149,11 +163,17 @@ httpx==0.27.0
# via langsmith
# via ollama
# via openai
+ # via qdrant-client
httpx-sse==0.4.0
# via langchain-mistralai
huggingface-hub==0.24.5
+ # via fastembed
# via tokenizers
# via transformers
+humanfriendly==10.0
+ # via coloredlogs
+hyperframe==6.0.1
+ # via h2
idna==3.7
# via anyio
# via httpx
@@ -218,6 +238,7 @@ langsmith==0.1.121
# via langchain-core
loguru==0.7.2
# via burr
+ # via fastembed
lxml==5.3.0
# via free-proxy
markdown-it-py==3.0.0
@@ -236,8 +257,12 @@ minify-html==0.15.0
# via scrapegraphai
mistral-common==1.4.1
# via scrapegraphai
+mmh3==4.1.0
+ # via fastembed
mpire==2.10.2
# via semchunk
+mpmath==1.3.0
+ # via sympy
multidict==6.0.5
# via aiohttp
# via yarl
@@ -249,19 +274,27 @@ narwhals==1.3.0
# via altair
numpy==1.26.4
# via contourpy
+ # via fastembed
# via langchain
# via langchain-aws
# via langchain-community
# via matplotlib
+ # via onnx
+ # via onnxruntime
# via opencv-python-headless
# via pandas
# via pyarrow
# via pydeck
+ # via qdrant-client
# via sf-hamilton
# via streamlit
# via transformers
ollama==0.3.2
# via langchain-ollama
+onnx==1.17.0
+ # via fastembed
+onnxruntime==1.19.2
+ # via fastembed
openai==1.40.3
# via burr
# via langchain-openai
@@ -275,6 +308,7 @@ packaging==24.1
# via langchain-core
# via marshmallow
# via matplotlib
+ # via onnxruntime
# via pytest
# via sphinx
# via streamlit
@@ -284,6 +318,7 @@ pandas==2.2.2
# via sf-hamilton
# via streamlit
pillow==10.4.0
+ # via fastembed
# via matplotlib
# via mistral-common
# via streamlit
@@ -294,6 +329,8 @@ playwright==1.45.1
# via undetected-playwright
pluggy==1.5.0
# via pytest
+portalocker==2.10.1
+ # via qdrant-client
proto-plus==1.24.0
# via google-ai-generativelanguage
# via google-api-core
@@ -303,6 +340,9 @@ protobuf==4.25.4
# via google-generativeai
# via googleapis-common-protos
# via grpcio-status
+ # via grpcio-tools
+ # via onnx
+ # via onnxruntime
# via proto-plus
# via streamlit
pyarrow==17.0.0
@@ -326,6 +366,7 @@ pydantic==2.8.2
# via mistral-common
# via openai
# via pydantic-settings
+ # via qdrant-client
pydantic-core==2.20.1
# via pydantic
pydantic-settings==2.5.2
@@ -343,6 +384,8 @@ pylint==3.2.6
pyparsing==3.1.2
# via httplib2
# via matplotlib
+pystemmer==2.2.0.1
+ # via fastembed
pytest==8.0.0
# via pytest-mock
pytest-mock==3.14.0
@@ -361,6 +404,8 @@ pyyaml==6.0.2
# via langchain-community
# via langchain-core
# via transformers
+qdrant-client==1.11.3
+ # via scrapegraphai
referencing==0.35.1
# via jsonschema
# via jsonschema-specifications
@@ -369,6 +414,7 @@ regex==2024.7.24
# via transformers
requests==2.32.3
# via burr
+ # via fastembed
# via free-proxy
# via google-api-core
# via huggingface-hub
@@ -395,6 +441,8 @@ semchunk==2.2.0
# via scrapegraphai
sentencepiece==0.2.0
# via mistral-common
+setuptools==75.1.0
+ # via grpcio-tools
sf-hamilton==1.73.1
# via burr
six==1.16.0
@@ -406,6 +454,7 @@ sniffio==1.3.1
# via httpx
# via openai
snowballstemmer==2.2.0
+ # via fastembed
# via sphinx
soupsieve==2.5
# via beautifulsoup4
@@ -434,6 +483,8 @@ starlette==0.37.2
# via fastapi
streamlit==1.37.1
# via burr
+sympy==1.13.3
+ # via onnxruntime
tenacity==8.5.0
# via langchain
# via langchain-community
@@ -444,6 +495,7 @@ tiktoken==0.7.0
# via mistral-common
# via scrapegraphai
tokenizers==0.19.1
+ # via fastembed
# via langchain-mistralai
# via transformers
toml==0.10.2
@@ -456,6 +508,7 @@ tomlkit==0.13.0
tornado==6.4.1
# via streamlit
tqdm==4.66.5
+ # via fastembed
# via google-generativeai
# via huggingface-hub
# via mpire
@@ -495,6 +548,7 @@ uritemplate==4.1.1
# via google-api-python-client
urllib3==1.26.19
# via botocore
+ # via qdrant-client
# via requests
uvicorn==0.30.5
# via burr
diff --git a/requirements.lock b/requirements.lock
index 84e25a0f..8949648a 100644
--- a/requirements.lock
+++ b/requirements.lock
@@ -41,6 +41,8 @@ certifi==2024.7.4
# via requests
charset-normalizer==3.3.2
# via requests
+coloredlogs==15.0.1
+ # via onnxruntime
dataclasses-json==0.6.7
# via langchain-community
dill==0.3.8
@@ -49,9 +51,13 @@ distro==1.9.0
# via openai
exceptiongroup==1.2.2
# via anyio
+fastembed==0.3.6
+ # via scrapegraphai
filelock==3.15.4
# via huggingface-hub
# via transformers
+flatbuffers==24.3.25
+ # via onnxruntime
free-proxy==1.1.1
# via scrapegraphai
frozenlist==1.4.1
@@ -87,10 +93,18 @@ greenlet==3.0.3
grpcio==1.65.1
# via google-api-core
# via grpcio-status
+ # via grpcio-tools
+ # via qdrant-client
grpcio-status==1.62.2
# via google-api-core
+grpcio-tools==1.62.3
+ # via qdrant-client
h11==0.14.0
# via httpcore
+h2==4.1.0
+ # via httpx
+hpack==4.0.0
+ # via h2
html2text==2024.2.26
# via scrapegraphai
httpcore==1.0.5
@@ -103,11 +117,17 @@ httpx==0.27.0
# via langsmith
# via ollama
# via openai
+ # via qdrant-client
httpx-sse==0.4.0
# via langchain-mistralai
huggingface-hub==0.24.1
+ # via fastembed
# via tokenizers
# via transformers
+humanfriendly==10.0
+ # via coloredlogs
+hyperframe==6.0.1
+ # via h2
idna==3.7
# via anyio
# via httpx
@@ -156,6 +176,8 @@ langsmith==0.1.121
# via langchain
# via langchain-community
# via langchain-core
+loguru==0.7.2
+ # via fastembed
lxml==5.2.2
# via free-proxy
marshmallow==3.21.3
@@ -164,8 +186,12 @@ minify-html==0.15.0
# via scrapegraphai
mistral-common==1.4.1
# via scrapegraphai
+mmh3==4.1.0
+ # via fastembed
mpire==2.10.2
# via semchunk
+mpmath==1.3.0
+ # via sympy
multidict==6.0.5
# via aiohttp
# via yarl
@@ -174,14 +200,22 @@ multiprocess==0.70.16
mypy-extensions==1.0.0
# via typing-inspect
numpy==1.26.4
+ # via fastembed
# via langchain
# via langchain-aws
# via langchain-community
+ # via onnx
+ # via onnxruntime
# via opencv-python-headless
# via pandas
+ # via qdrant-client
# via transformers
ollama==0.3.2
# via langchain-ollama
+onnx==1.17.0
+ # via fastembed
+onnxruntime==1.19.2
+ # via fastembed
openai==1.41.0
# via langchain-openai
opencv-python-headless==4.10.0.84
@@ -192,14 +226,18 @@ packaging==24.1
# via huggingface-hub
# via langchain-core
# via marshmallow
+ # via onnxruntime
# via transformers
pandas==2.2.2
# via scrapegraphai
pillow==10.4.0
+ # via fastembed
# via mistral-common
playwright==1.45.1
# via scrapegraphai
# via undetected-playwright
+portalocker==2.10.1
+ # via qdrant-client
proto-plus==1.24.0
# via google-ai-generativelanguage
# via google-api-core
@@ -209,6 +247,9 @@ protobuf==4.25.3
# via google-generativeai
# via googleapis-common-protos
# via grpcio-status
+ # via grpcio-tools
+ # via onnx
+ # via onnxruntime
# via proto-plus
pyasn1==0.6.0
# via pyasn1-modules
@@ -226,6 +267,7 @@ pydantic==2.8.2
# via mistral-common
# via openai
# via pydantic-settings
+ # via qdrant-client
pydantic-core==2.20.1
# via pydantic
pydantic-settings==2.5.2
@@ -236,6 +278,8 @@ pygments==2.18.0
# via mpire
pyparsing==3.1.2
# via httplib2
+pystemmer==2.2.0.1
+ # via fastembed
python-dateutil==2.9.0.post0
# via botocore
# via pandas
@@ -250,6 +294,8 @@ pyyaml==6.0.1
# via langchain-community
# via langchain-core
# via transformers
+qdrant-client==1.11.3
+ # via scrapegraphai
referencing==0.35.1
# via jsonschema
# via jsonschema-specifications
@@ -257,6 +303,7 @@ regex==2024.5.15
# via tiktoken
# via transformers
requests==2.32.3
+ # via fastembed
# via free-proxy
# via google-api-core
# via huggingface-hub
@@ -279,17 +326,23 @@ semchunk==2.2.0
# via scrapegraphai
sentencepiece==0.2.0
# via mistral-common
+setuptools==75.1.0
+ # via grpcio-tools
six==1.16.0
# via python-dateutil
sniffio==1.3.1
# via anyio
# via httpx
# via openai
+snowballstemmer==2.2.0
+ # via fastembed
soupsieve==2.5
# via beautifulsoup4
sqlalchemy==2.0.31
# via langchain
# via langchain-community
+sympy==1.13.3
+ # via onnxruntime
tenacity==8.5.0
# via langchain
# via langchain-community
@@ -299,9 +352,11 @@ tiktoken==0.7.0
# via mistral-common
# via scrapegraphai
tokenizers==0.19.1
+ # via fastembed
# via langchain-mistralai
# via transformers
tqdm==4.66.4
+ # via fastembed
# via google-generativeai
# via huggingface-hub
# via mpire
@@ -333,6 +388,7 @@ uritemplate==4.1.1
# via google-api-python-client
urllib3==1.26.19
# via botocore
+ # via qdrant-client
# via requests
yarl==1.9.4
# via aiohttp
diff --git a/scrapegraphai/graphs/__init__.py b/scrapegraphai/graphs/__init__.py
index efd6bd7e..b5ffcc47 100644
--- a/scrapegraphai/graphs/__init__.py
+++ b/scrapegraphai/graphs/__init__.py
@@ -26,3 +26,4 @@
from .screenshot_scraper_graph import ScreenshotScraperGraph
from .smart_scraper_multi_concat_graph import SmartScraperMultiConcatGraph
from .code_generator_graph import CodeGeneratorGraph
+from .depth_search_graph import DepthSearchGraph
diff --git a/scrapegraphai/graphs/depth_search_graph.py b/scrapegraphai/graphs/depth_search_graph.py
new file mode 100644
index 00000000..13b39129
--- /dev/null
+++ b/scrapegraphai/graphs/depth_search_graph.py
@@ -0,0 +1,151 @@
+"""
+... Module
+"""
+from typing import Optional
+import logging
+from pydantic import BaseModel
+from .base_graph import BaseGraph
+from .abstract_graph import AbstractGraph
+from ..utils.save_code_to_file import save_code_to_file
+from ..nodes import (
+ FetchNodeLevelK,
+ ParseNodeDepthK,
+ DescriptionNode,
+ RAGNode,
+ GenerateAnswerNodeKLevel
+)
+
+class DepthSearchGraph(AbstractGraph):
+    """
+    DepthSearchGraph is a scraping pipeline that fetches a source page and its
+    hyperlinks recursively up to a configurable depth, generates a description
+    of each fetched page, indexes the descriptions in a vector database, and
+    answers the user prompt through retrieval.
+
+    Attributes:
+        prompt (str): The prompt for the graph.
+        source (str): The source of the graph.
+        config (dict): Configuration parameters for the graph.
+        schema (BaseModel): The schema for the graph output.
+        llm_model: An instance of a language model client, configured for generating answers.
+        embedder_model: An instance of an embedding model client,
+        configured for generating embeddings.
+        verbose (bool): A flag indicating whether to show print statements during execution.
+        headless (bool): A flag indicating whether to run the graph in headless mode.
+
+    Args:
+        prompt (str): The prompt for the graph.
+        source (str): The source of the graph.
+        config (dict): Configuration parameters for the graph.
+        schema (BaseModel): The schema for the graph output.
+
+    Example:
+        >>> search_graph = DepthSearchGraph(
+        ...     "List me all the projects with their description",
+        ...     "https://perinim.github.io",
+        ...     {"llm": {"model": "openai/gpt-4o-mini"}, "depth": 2}
+        ... )
+        >>> result = search_graph.run()
+    """
+
+ def __init__(self, prompt: str, source: str, config: dict, schema: Optional[BaseModel] = None):
+
+ super().__init__(prompt, config, source, schema)
+
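+        # route the source to the right fetcher input: web URL vs. local directory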
+ self.input_key = "url" if source.startswith("http") else "local_dir"
+
+ def _create_graph(self) -> BaseGraph:
+ """
+ Creates the graph of nodes representing the workflow for web scraping.
+
+ Returns:
+ BaseGraph: A graph instance representing the web scraping workflow.
+ """
+
+ fetch_node_k = FetchNodeLevelK(
+            input="url | local_dir",
+ output=["docs"],
+ node_config={
+ "loader_kwargs": self.config.get("loader_kwargs", {}),
+ "force": self.config.get("force", False),
+ "cut": self.config.get("cut", True),
+ "browser_base": self.config.get("browser_base"),
+ "depth": self.config.get("depth", 1),
+ "only_inside_links": self.config.get("only_inside_links", False)
+ }
+ )
+
+ parse_node_k = ParseNodeDepthK(
+ input="docs",
+ output=["docs"],
+ node_config={
+ "verbose": self.config.get("verbose", False)
+ }
+ )
+
+ description_node = DescriptionNode(
+ input="docs",
+ output=["docs"],
+ node_config={
+ "llm_model": self.llm_model,
+ "verbose": self.config.get("verbose", False),
+ "cache_path": self.config.get("cache_path", False)
+ }
+ )
+
+        rag_node = RAGNode(
+ input="docs",
+ output=["vectorial_db"],
+ node_config={
+ "llm_model": self.llm_model,
+ "embedder_model": self.config.get("embedder_model", False),
+ "verbose": self.config.get("verbose", False),
+ }
+ )
+
+ generate_answer_k = GenerateAnswerNodeKLevel(
+ input="vectorial_db",
+ output=["answer"],
+ node_config={
+ "llm_model": self.llm_model,
+ "embedder_model": self.config.get("embedder_model", False),
+ "verbose": self.config.get("verbose", False),
+ }
+ )
+
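+        # Pipeline: fetch pages up to the configured depth, parse and clean them,
+        # summarize each page, index the summaries in a vector store, and answer
+        # the prompt through retrieval.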
+ return BaseGraph(
+ nodes=[
+ fetch_node_k,
+ parse_node_k,
+ description_node,
+ rag_node,
+ generate_answer_k
+ ],
+ edges=[
+ (fetch_node_k, parse_node_k),
+ (parse_node_k, description_node),
+ (description_node, rag_node),
+ (rag_node, generate_answer_k)
+ ],
+ entry_point=fetch_node_k,
+ graph_name=self.__class__.__name__
+ )
+
+ def run(self) -> str:
+ """
+        Executes the scraping process and returns the answer to the prompt.
+
+        Returns:
+            str: The answer to the prompt.
+ """
+
+        inputs = {"user_prompt": self.prompt, self.input_key: self.source}
+        self.final_state, self.execution_info = self.graph.execute(inputs)
+
+        return self.final_state.get("answer", "No answer")
diff --git a/scrapegraphai/graphs/smart_scraper_graph.py b/scrapegraphai/graphs/smart_scraper_graph.py
index e2bda63f..60407624 100644
--- a/scrapegraphai/graphs/smart_scraper_graph.py
+++ b/scrapegraphai/graphs/smart_scraper_graph.py
@@ -9,6 +9,7 @@
from ..nodes import (
FetchNode,
ParseNode,
+ ReasoningNode,
GenerateAnswerNode
)
@@ -69,7 +70,6 @@ def _create_graph(self) -> BaseGraph:
"scrape_do": self.config.get("scrape_do")
}
)
-
parse_node = ParseNode(
input="doc",
output=["parsed_doc"],
@@ -89,19 +89,87 @@ def _create_graph(self) -> BaseGraph:
}
)
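+
+        # Choose the graph shape from two optional config flags:
+        #   html_mode: skip the ParseNode and feed the raw HTML to the answer node
+        #   reasoning: insert a ReasoningNode pass before answer generation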
+        html_mode = self.config.get("html_mode", False)
+        reasoning = self.config.get("reasoning", False)
+
+        if reasoning:
+            reasoning_node = ReasoningNode(
+                input="user_prompt & (relevant_chunks | parsed_doc | doc)",
+                output=["answer"],
+                node_config={
+                    "llm_model": self.llm_model,
+                    "additional_info": self.config.get("additional_info"),
+                    "schema": self.schema,
+                }
+            )
+
+        if not html_mode and reasoning:
+            return BaseGraph(
+                nodes=[
+                    fetch_node,
+                    parse_node,
+                    reasoning_node,
+                    generate_answer_node,
+                ],
+                edges=[
+                    (fetch_node, parse_node),
+                    (parse_node, reasoning_node),
+                    (reasoning_node, generate_answer_node)
+                ],
+                entry_point=fetch_node,
+                graph_name=self.__class__.__name__
+            )
+
+        if html_mode and reasoning:
+            return BaseGraph(
+                nodes=[
+                    fetch_node,
+                    reasoning_node,
+                    generate_answer_node,
+                ],
+                edges=[
+                    (fetch_node, reasoning_node),
+                    (reasoning_node, generate_answer_node)
+                ],
+                entry_point=fetch_node,
+                graph_name=self.__class__.__name__
+            )
+
+        if html_mode and not reasoning:
+            return BaseGraph(
+                nodes=[
+                    fetch_node,
+                    generate_answer_node,
+                ],
+                edges=[
+                    (fetch_node, generate_answer_node)
+                ],
+                entry_point=fetch_node,
+                graph_name=self.__class__.__name__
+            )
+
+        # default: parse the fetched document, then generate the answer
         return BaseGraph(
             nodes=[
                 fetch_node,
                 parse_node,
                 generate_answer_node,
             ],
             edges=[
                 (fetch_node, parse_node),
                 (parse_node, generate_answer_node)
             ],
             entry_point=fetch_node,
             graph_name=self.__class__.__name__
         )
def run(self) -> str:
"""
diff --git a/scrapegraphai/nodes/__init__.py b/scrapegraphai/nodes/__init__.py
index 5a56ac1e..edb195a5 100644
--- a/scrapegraphai/nodes/__init__.py
+++ b/scrapegraphai/nodes/__init__.py
@@ -27,3 +27,8 @@
from .html_analyzer_node import HtmlAnalyzerNode
from .generate_code_node import GenerateCodeNode
from .search_node_with_context import SearchLinksWithContext
+from .reasoning_node import ReasoningNode
+from .fetch_node_level_k import FetchNodeLevelK
+from .generate_answer_node_k_level import GenerateAnswerNodeKLevel
+from .description_node import DescriptionNode
+from .parse_node_depth_k import ParseNodeDepthK
diff --git a/scrapegraphai/nodes/description_node.py b/scrapegraphai/nodes/description_node.py
new file mode 100644
index 00000000..4201a61d
--- /dev/null
+++ b/scrapegraphai/nodes/description_node.py
@@ -0,0 +1,67 @@
+"""
+DescriptionNode Module
+"""
+from typing import List, Optional
+from tqdm import tqdm
+from langchain.prompts import PromptTemplate
+from langchain_core.runnables import RunnableParallel
+from .base_node import BaseNode
+from ..prompts.description_node_prompts import DESCRIPTION_NODE_PROMPT
+
+class DescriptionNode(BaseNode):
+ """
+    A node responsible for generating a short description (summary) of each fetched
+    document. The summaries are stored in the state and indexed later for retrieval.
+
+ It allows scraping of big documents without exceeding the token limit of the language model.
+
+ Attributes:
+ llm_model: An instance of a language model client, configured for generating answers.
+ verbose (bool): A flag indicating whether to show print statements during execution.
+
+ Args:
+ input (str): Boolean expression defining the input keys needed from the state.
+ output (List[str]): List of output keys to be updated in the state.
+ node_config (dict): Additional configuration for the node.
+        node_name (str): The unique identifier name for the node, defaulting to "DESCRIPTION".
+ """
+
+ def __init__(
+ self,
+ input: str,
+ output: List[str],
+ node_config: Optional[dict] = None,
+ node_name: str = "DESCRIPTION",
+ ):
+ super().__init__(node_name, "node", input, output, 2, node_config)
+ self.llm_model = node_config["llm_model"]
+        self.verbose = node_config.get("verbose", False)
+ self.cache_path = node_config.get("cache_path", False)
+
+    def execute(self, state: dict) -> dict:
+        """
+        Generates a short description for each document in the state and stores
+        it under the document's "summary" key.
+        """
+ self.logger.info(f"--- Executing {self.node_name} Node ---")
+
+        docs = list(state.get("docs"))
+
+ chains_dict = {}
+
+ for i, chunk in enumerate(tqdm(docs, desc="Processing chunks", disable=not self.verbose)):
+ prompt = PromptTemplate(
+ template=DESCRIPTION_NODE_PROMPT,
+ partial_variables={"content": chunk.get("document")}
+ )
+ chain_name = f"chunk{i+1}"
+ chains_dict[chain_name] = prompt | self.llm_model
+
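+        # Invoke every chunk's chain concurrently; results come back keyed as "chunk1", "chunk2", ...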
+ async_runner = RunnableParallel(**chains_dict)
+ batch_results = async_runner.invoke({})
+
+ for i in range(1, len(docs)+1):
+ docs[i-1]["summary"] = batch_results.get(f"chunk{i}").content
+
+ state.update({self.output[0]: docs})
+
+ return state
diff --git a/scrapegraphai/nodes/fetch_node_level_k.py b/scrapegraphai/nodes/fetch_node_level_k.py
new file mode 100644
index 00000000..d321b33c
--- /dev/null
+++ b/scrapegraphai/nodes/fetch_node_level_k.py
@@ -0,0 +1,214 @@
+"""
+FetchNodeLevelK Module
+"""
+from typing import List, Optional
+from urllib.parse import urljoin
+from bs4 import BeautifulSoup
+from langchain_core.documents import Document
+from .base_node import BaseNode
+from ..docloaders import ChromiumLoader
+
+class FetchNodeLevelK(BaseNode):
+ """
+ A node responsible for fetching the HTML content of a specified URL and all its sub-links
+    recursively up to a certain depth of the hyperlink graph. This content is then used to update
+ the graph's state. It uses ChromiumLoader to fetch the content from a web page asynchronously
+ (with proxy protection).
+
+ Attributes:
+ embedder_model: An optional model for embedding the fetched content.
+ verbose (bool): A flag indicating whether to show print statements during execution.
+ cache_path (str): Path to cache fetched content.
+ headless (bool): Whether to run the Chromium browser in headless mode.
+ loader_kwargs (dict): Additional arguments for the content loader.
+ browser_base (dict): Optional configuration for the browser base API.
+ depth (int): Maximum depth of hyperlink graph traversal.
+ only_inside_links (bool): Whether to fetch only internal links.
+ min_input_len (int): Minimum required length of input data.
+
+ Args:
+ input (str): Boolean expression defining the input keys needed from the state.
+ output (List[str]): List of output keys to be updated in the state.
+ node_config (dict): Additional configuration for the node.
+ node_name (str): The unique identifier name for the node, defaulting to "FetchLevelK".
+ """
+
+ def __init__(
+ self,
+ input: str,
+ output: List[str],
+ node_config: Optional[dict] = None,
+ node_name: str = "FetchLevelK",
+ ):
+ """
+ Initializes the FetchNodeLevelK instance.
+
+ Args:
+ input (str): Boolean expression defining the input keys needed from the state.
+ output (List[str]): List of output keys to be updated in the state.
+ node_config (Optional[dict]): Additional configuration for the node.
+ node_name (str): The name of the node (default is "FetchLevelK").
+ """
+ super().__init__(node_name, "node", input, output, 2, node_config)
+
+ self.embedder_model = node_config.get("embedder_model", None)
+ self.verbose = node_config.get("verbose", False) if node_config else False
+ self.cache_path = node_config.get("cache_path", False)
+ self.headless = node_config.get("headless", True) if node_config else True
+ self.loader_kwargs = node_config.get("loader_kwargs", {}) if node_config else {}
+ self.browser_base = node_config.get("browser_base", None)
+ self.depth = node_config.get("depth", 1) if node_config else 1
+ self.only_inside_links = node_config.get("only_inside_links", False) if node_config else False
+ self.min_input_len = 1
+
+ def execute(self, state: dict) -> dict:
+ """
+ Executes the node's logic to fetch the HTML content of a specified URL and its sub-links
+ recursively, then updates the graph's state with the fetched content.
+
+ Args:
+ state (dict): The current state of the graph.
+
+ Returns:
+ dict: The updated state with a new output key containing the fetched HTML content.
+
+ Raises:
+ KeyError: If the input key is not found in the state.
+ """
+ self.logger.info(f"--- Executing {self.node_name} Node ---")
+
+ input_keys = self.get_input_keys(state)
+ input_data = [state[key] for key in input_keys]
+ source = input_data[0]
+
+ documents = [{"source": source}]
+ loader_kwargs = self.node_config.get("loader_kwargs", {}) if self.node_config else {}
+
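+        # Each pass expands the crawl one hop: fetch pending sources, then queue their links.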
+ for _ in range(self.depth):
+ documents = self.obtain_content(documents, loader_kwargs)
+
+ filtered_documents = [doc for doc in documents if 'document' in doc]
+ state.update({self.output[0]: filtered_documents})
+ return state
+
+    def fetch_content(self, source: str, loader_kwargs) -> List[Document]:
+ """
+ Fetches the HTML content of a given source URL.
+
+ Args:
+ source (str): The URL to fetch content from.
+ loader_kwargs (dict): Additional arguments for the content loader.
+
+ Returns:
+            List[Document]: The fetched documents, which may be empty if fetching failed.
+ """
+ self.logger.info(f"--- (Fetching HTML from: {source}) ---")
+
+ if self.browser_base is not None:
+ try:
+ from ..docloaders.browser_base import browser_base_fetch
+ except ImportError:
+ raise ImportError("""The browserbase module is not installed.
+ Please install it using `pip install browserbase`.""")
+
+ data = browser_base_fetch(self.browser_base.get("api_key"),
+ self.browser_base.get("project_id"), [source])
+ document = [Document(page_content=content, metadata={"source": source}) for content in data]
+ else:
+ loader = ChromiumLoader([source], headless=self.headless, **loader_kwargs)
+ document = loader.load()
+ return document
+
+ def extract_links(self, html_content: str) -> list:
+ """
+ Extracts all hyperlinks from the HTML content.
+
+ Args:
+ html_content (str): The HTML content to extract links from.
+
+ Returns:
+ list: A list of extracted hyperlinks.
+ """
+ soup = BeautifulSoup(html_content, 'html.parser')
+ links = [link['href'] for link in soup.find_all('a', href=True)]
+ self.logger.info(f"Extracted {len(links)} links.")
+ return links
+
+ def get_full_links(self, base_url: str, links: list) -> list:
+ """
+ Converts relative URLs to full URLs based on the base URL.
+
+ Args:
+ base_url (str): The base URL for resolving relative links.
+ links (list): A list of links to convert.
+
+ Returns:
+ list: A list of full URLs.
+ """
+ full_links = []
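+        # When only_inside_links is set, absolute URLs are treated as external and skipped.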
+ for link in links:
+ if self.only_inside_links and link.startswith("http"):
+ continue
+ full_link = link if link.startswith("http") else urljoin(base_url, link)
+ full_links.append(full_link)
+ return full_links
+
+ def obtain_content(self, documents: List, loader_kwargs) -> List:
+ """
+ Iterates through documents, fetching and updating content recursively.
+
+ Args:
+ documents (List): A list of documents containing the source URLs.
+ loader_kwargs (dict): Additional arguments for the content loader.
+
+ Returns:
+ List: The updated list of documents with fetched content.
+ """
+ new_documents = []
+        for doc in list(documents):  # iterate over a copy, since entries may be removed below
+ source = doc['source']
+ if 'document' not in doc:
+ document = self.fetch_content(source, loader_kwargs)
+
+ if not document or not document[0].page_content.strip():
+ self.logger.warning(f"Failed to fetch content for {source}")
+ documents.remove(doc)
+ continue
+
+ doc['document'] = document
+ links = self.extract_links(doc['document'][0].page_content)
+ full_links = self.get_full_links(source, links)
+
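+                # Queue each link only once across both the current and new document lists.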
+ for link in full_links:
+ if not any(d.get('source', '') == link for d in documents) and not any(d.get('source', '') == link for d in new_documents):
+ new_documents.append({"source": link})
+
+ documents.extend(new_documents)
+ return documents
+
+ def process_links(self, base_url: str, links: list,
+ loader_kwargs, depth: int, current_depth: int = 1) -> dict:
+ """
+ Processes a list of links recursively up to a given depth.
+
+ Args:
+ base_url (str): The base URL for resolving relative links.
+ links (list): A list of links to process.
+ loader_kwargs (dict): Additional arguments for the content loader.
+ depth (int): The maximum depth for recursion.
+ current_depth (int): The current depth of recursion (default is 1).
+
+ Returns:
+ dict: A dictionary containing processed link content.
+ """
+ content_dict = {}
+ for idx, link in enumerate(links, start=1):
+ full_link = link if link.startswith("http") else urljoin(base_url, link)
+ self.logger.info(f"Processing link {idx}: {full_link}")
+            link_content = self.fetch_content(full_link, loader_kwargs)
+
+            if link_content:
+                content_dict[full_link] = link_content
+                if current_depth < depth:
+                    new_links = self.extract_links(link_content[0].page_content)
+                    content_dict.update(self.process_links(full_link, new_links,
+                                                           loader_kwargs, depth, current_depth + 1))
+            else:
+                self.logger.warning(f"Failed to fetch content for {full_link}")
+ return content_dict
diff --git a/scrapegraphai/nodes/generate_answer_node.py b/scrapegraphai/nodes/generate_answer_node.py
index 15686ec1..d5034a1e 100644
--- a/scrapegraphai/nodes/generate_answer_node.py
+++ b/scrapegraphai/nodes/generate_answer_node.py
@@ -1,3 +1,6 @@
+"""
+generate_answer_node module
+"""
from typing import List, Optional
from langchain.prompts import PromptTemplate
from langchain_core.output_parsers import JsonOutputParser
@@ -15,6 +18,26 @@
)
class GenerateAnswerNode(BaseNode):
+ """
+ Initializes the GenerateAnswerNode class.
+
+ Args:
+ input (str): The input data type for the node.
+ output (List[str]): The output data type(s) for the node.
+ node_config (Optional[dict]): Configuration dictionary for the node,
+ which includes the LLM model, verbosity, schema, and other settings.
+ Defaults to None.
+ node_name (str): The name of the node. Defaults to "GenerateAnswer".
+
+ Attributes:
+ llm_model: The language model specified in the node configuration.
+ verbose (bool): Whether verbose mode is enabled.
+ force (bool): Whether to force certain behaviors, overriding defaults.
+ script_creator (bool): Whether the node is in script creation mode.
+ is_md_scraper (bool): Whether the node is scraping markdown data.
+ additional_info (Optional[str]): Any additional information to be
+ included in the prompt templates.
+ """
def __init__(
self,
input: str,
@@ -100,7 +123,9 @@ def execute(self, state: dict) -> dict:
prompt = PromptTemplate(
template=template_chunks_prompt,
input_variables=["question"],
- partial_variables={"context": chunk, "chunk_id": i + 1, "format_instructions": format_instructions}
+ partial_variables={"context": chunk,
+ "chunk_id": i + 1,
+ "format_instructions": format_instructions}
)
chain_name = f"chunk{i+1}"
chains_dict[chain_name] = prompt | self.llm_model
diff --git a/scrapegraphai/nodes/generate_answer_node_k_level.py b/scrapegraphai/nodes/generate_answer_node_k_level.py
new file mode 100644
index 00000000..291109f2
--- /dev/null
+++ b/scrapegraphai/nodes/generate_answer_node_k_level.py
@@ -0,0 +1,150 @@
+"""
+GenerateAnswerNodeKLevel Module
+"""
+from typing import List, Optional
+from langchain.prompts import PromptTemplate
+from tqdm import tqdm
+from langchain_core.output_parsers import JsonOutputParser
+from langchain_core.runnables import RunnableParallel
+from langchain_openai import ChatOpenAI, AzureChatOpenAI
+from langchain_mistralai import ChatMistralAI
+from langchain_aws import ChatBedrock
+from ..utils.output_parser import get_structured_output_parser, get_pydantic_output_parser
+from .base_node import BaseNode
+from ..prompts import (
+ TEMPLATE_CHUNKS, TEMPLATE_NO_CHUNKS, TEMPLATE_MERGE,
+ TEMPLATE_CHUNKS_MD, TEMPLATE_NO_CHUNKS_MD, TEMPLATE_MERGE_MD
+)
+
+class GenerateAnswerNodeKLevel(BaseNode):
+ """
+    A node responsible for generating the final answer: it retrieves the relevant
+    chunks from the vector database and queries the language model with them.
+
+ It allows scraping of big documents without exceeding the token limit of the language model.
+
+ Attributes:
+ llm_model: An instance of a language model client, configured for generating answers.
+ verbose (bool): A flag indicating whether to show print statements during execution.
+
+ Args:
+ input (str): Boolean expression defining the input keys needed from the state.
+ output (List[str]): List of output keys to be updated in the state.
+ node_config (dict): Additional configuration for the node.
+        node_name (str): The unique identifier name for the node, defaulting to "GANLK".
+ """
+
+ def __init__(
+ self,
+ input: str,
+ output: List[str],
+ node_config: Optional[dict] = None,
+ node_name: str = "GANLK",
+ ):
+ super().__init__(node_name, "node", input, output, 2, node_config)
+
+ self.llm_model = node_config["llm_model"]
+ self.embedder_model = node_config.get("embedder_model", None)
+ self.verbose = node_config.get("verbose", False)
+ self.force = node_config.get("force", False)
+ self.script_creator = node_config.get("script_creator", False)
+ self.is_md_scraper = node_config.get("is_md_scraper", False)
+ self.additional_info = node_config.get("additional_info")
+
+ def execute(self, state: dict) -> dict:
+ self.logger.info(f"--- Executing {self.node_name} Node ---")
+
+ user_prompt = state.get("user_prompt")
+
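+        # Choose an output strategy: native structured output for OpenAI/Mistral,
+        # a Pydantic/JSON parser otherwise, and no parser for Bedrock.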
+ if self.node_config.get("schema", None) is not None:
+ if isinstance(self.llm_model, (ChatOpenAI, ChatMistralAI)):
+ self.llm_model = self.llm_model.with_structured_output(
+ schema=self.node_config["schema"]
+ )
+ output_parser = get_structured_output_parser(self.node_config["schema"])
+ format_instructions = "NA"
+ else:
+ if not isinstance(self.llm_model, ChatBedrock):
+ output_parser = get_pydantic_output_parser(self.node_config["schema"])
+ format_instructions = output_parser.get_format_instructions()
+ else:
+ output_parser = None
+ format_instructions = ""
+ else:
+ if not isinstance(self.llm_model, ChatBedrock):
+ output_parser = JsonOutputParser()
+ format_instructions = output_parser.get_format_instructions()
+ else:
+ output_parser = None
+ format_instructions = ""
+
+        if (isinstance(self.llm_model, (ChatOpenAI, AzureChatOpenAI))
+                and not self.script_creator
+                or self.force and not self.script_creator
+                or self.is_md_scraper):
+ template_no_chunks_prompt = TEMPLATE_NO_CHUNKS_MD
+ template_chunks_prompt = TEMPLATE_CHUNKS_MD
+ template_merge_prompt = TEMPLATE_MERGE_MD
+ else:
+ template_no_chunks_prompt = TEMPLATE_NO_CHUNKS
+ template_chunks_prompt = TEMPLATE_CHUNKS
+ template_merge_prompt = TEMPLATE_MERGE
+
+ if self.additional_info is not None:
+ template_no_chunks_prompt = self.additional_info + template_no_chunks_prompt
+ template_chunks_prompt = self.additional_info + template_chunks_prompt
+ template_merge_prompt = self.additional_info + template_merge_prompt
+
+ client = state["vectorial_db"]
+
+ if state.get("embeddings"):
+ import openai
+ openai_client = openai.Client()
+
+ answer_db = client.search(
+ collection_name="collection",
+ query_vector=openai_client.embeddings.create(
+ input=["What is the best to use for vector search scaling?"],
+ model=state.get("embeddings").get("model"),
+ )
+ .data[0]
+ .embedding,
+ )
+ else:
+ answer_db = client.query(
+ collection_name="vectorial_collection",
+ query_text=user_prompt
+ )
+
+ chains_dict = {}
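+        # Keep only retrieved chunks whose similarity score clears the 0.5 threshold;
+        # point ids are 1-based, hence the -1 offset into the docs list.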
+        elems = [state.get("docs")[elem.id - 1] for elem in answer_db if elem.score > 0.5]
+
+ for i, chunk in enumerate(tqdm(elems,
+ desc="Processing chunks", disable=not self.verbose)):
+            prompt = PromptTemplate(
+                template=template_chunks_prompt,
+                input_variables=["question"],
+                partial_variables={"context": chunk.get("document"),
+                                   "chunk_id": i + 1,
+                                   "format_instructions": format_instructions}
+            )
+ chain_name = f"chunk{i+1}"
+ chains_dict[chain_name] = prompt | self.llm_model
+
+ async_runner = RunnableParallel(**chains_dict)
+        batch_results = async_runner.invoke({"question": user_prompt})
+
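+        # Merge the per-chunk answers into one response; attach the output parser
+        # only when one exists (none is set for Bedrock).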
+ merge_prompt = PromptTemplate(
+ template=template_merge_prompt,
+ input_variables=["context", "question"],
+ partial_variables={"format_instructions": format_instructions}
+ )
+
+ merge_chain = merge_prompt | self.llm_model
+ if output_parser:
+ merge_chain = merge_chain | output_parser
+ answer = merge_chain.invoke({"context": batch_results, "question": user_prompt})
+
+ state["answer"] = answer
+
+ return state
diff --git a/scrapegraphai/nodes/generate_code_node.py b/scrapegraphai/nodes/generate_code_node.py
index cc72aaf4..746b10a5 100644
--- a/scrapegraphai/nodes/generate_code_node.py
+++ b/scrapegraphai/nodes/generate_code_node.py
@@ -26,7 +26,6 @@
from .base_node import BaseNode
from jsonschema import validate, ValidationError
-
class GenerateCodeNode(BaseNode):
"""
A node that generates Python code for a function that extracts data
@@ -96,7 +95,7 @@ def execute(self, state: dict) -> dict:
Raises:
KeyError: If the input keys are not found in the state, indicating
that the necessary information for generating an answer is missing.
- RuntimeError: If the maximum number of iterations is
+ RuntimeError: If the maximum number of iterations is
reached without obtaining the desired code.
"""
@@ -170,7 +169,7 @@ def overall_reasoning_loop(self, state: dict) -> dict:
self.logger.info(f"--- (Checking if the informations exctrcated are the ones Requested) ---")
state = self.semantic_comparison_loop(state)
if state["errors"]["semantic"]:
- continue
+ continue
break
if state["iteration"] == self.max_iterations["overall"] and \
@@ -195,9 +194,9 @@ def syntax_reasoning_loop(self, state: dict) -> dict:
state["errors"]["syntax"] = [syntax_message]
self.logger.info(f"--- (Synax Error Found: {syntax_message}) ---")
analysis = syntax_focused_analysis(state, self.llm_model)
- self.logger.info(f"""--- (Regenerating Code
+ self.logger.info(f"""--- (Regenerating Code
to fix the Error) ---""")
- state["generated_code"] = syntax_focused_code_generation(state,
+ state["generated_code"] = syntax_focused_code_generation(state,
analysis, self.llm_model)
state["generated_code"] = extract_code(state["generated_code"])
return state
@@ -217,14 +216,14 @@ def execution_reasoning_loop(self, state: dict) -> dict:
self.logger.info(f"--- (Code Execution Error: {execution_result}) ---")
analysis = execution_focused_analysis(state, self.llm_model)
self.logger.info(f"--- (Regenerating Code to fix the Error) ---")
- state["generated_code"] = execution_focused_code_generation(state,
+ state["generated_code"] = execution_focused_code_generation(state,
analysis, self.llm_model)
state["generated_code"] = extract_code(state["generated_code"])
return state
def validation_reasoning_loop(self, state: dict) -> dict:
for _ in range(self.max_iterations["validation"]):
- validation, errors = self.validate_dict(state["execution_result"],
+ validation, errors = self.validate_dict(state["execution_result"],
self.output_schema.schema())
if validation:
state["errors"]["validation"] = []
@@ -240,7 +239,7 @@ def validation_reasoning_loop(self, state: dict) -> dict:
def semantic_comparison_loop(self, state: dict) -> dict:
for _ in range(self.max_iterations["semantic"]):
- comparison_result = self.semantic_comparison(state["execution_result"],
+ comparison_result = self.semantic_comparison(state["execution_result"],
state["reference_answer"])
if comparison_result["are_semantically_equivalent"]:
state["errors"]["semantic"] = []
@@ -342,7 +341,7 @@ def create_sandbox_and_execute(self, function_code):
if not extract_data:
raise NameError("Function 'extract_data' not found in the generated code.")
- result = extract_data(self.raw_html)
+ result = extract_data(self.raw_html)
return True, result
except Exception as e:
return False, f"Error during execution: {str(e)}"
@@ -357,5 +356,5 @@ def validate_dict(self, data: dict, schema):
validate(instance=data, schema=schema)
return True, None
except ValidationError as e:
- errors = e.errors()
+ errors = [e.message]
return False, errors
diff --git a/scrapegraphai/nodes/html_analyzer_node.py b/scrapegraphai/nodes/html_analyzer_node.py
index b07c4040..26304dcd 100644
--- a/scrapegraphai/nodes/html_analyzer_node.py
+++ b/scrapegraphai/nodes/html_analyzer_node.py
@@ -4,10 +4,7 @@
from typing import List, Optional
from langchain.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser
-from langchain_core.runnables import RunnableParallel
-from langchain_core.utils.pydantic import is_basemodel_subclass
from langchain_community.chat_models import ChatOllama
-from tqdm import tqdm
from .base_node import BaseNode
from ..utils import reduce_html
from ..prompts import (
diff --git a/scrapegraphai/nodes/parse_node_depth_k.py b/scrapegraphai/nodes/parse_node_depth_k.py
new file mode 100644
index 00000000..6427b051
--- /dev/null
+++ b/scrapegraphai/nodes/parse_node_depth_k.py
@@ -0,0 +1,67 @@
+"""
+ParseNodeDepthK Module
+"""
+from typing import List, Optional
+from langchain_community.document_transformers import Html2TextTransformer
+from .base_node import BaseNode
+
+class ParseNodeDepthK(BaseNode):
+ """
+ A node responsible for parsing HTML content from a series of documents.
+
+ This node enhances the scraping workflow by allowing for targeted extraction of
+ content, thereby optimizing the processing of large HTML documents.
+
+ Attributes:
+ verbose (bool): A flag indicating whether to show print statements during execution.
+
+ Args:
+ input (str): Boolean expression defining the input keys needed from the state.
+ output (List[str]): List of output keys to be updated in the state.
+ node_config (dict): Additional configuration for the node.
+ node_name (str): The unique identifier name for the node, defaulting to "Parse".
+ """
+
+ def __init__(
+ self,
+ input: str,
+ output: List[str],
+ node_config: Optional[dict] = None,
+ node_name: str = "ParseNodeDepthK",
+ ):
+ super().__init__(node_name, "node", input, output, 1, node_config)
+
+ self.verbose = (
+ False if node_config is None else node_config.get("verbose", False)
+ )
+
+ def execute(self, state: dict) -> dict:
+ """
+ Executes the node's logic to parse the HTML documents content.
+
+ Args:
+ state (dict): The current state of the graph. The input keys will be used to fetch the
+ correct data from the state.
+
+ Returns:
+ dict: The updated state with the output key containing the parsed content chunks.
+
+ Raises:
+ KeyError: If the input keys are not found in the state, indicating that the
+ necessary information for parsing the content is missing.
+ """
+
+ self.logger.info(f"--- Executing {self.node_name} Node ---")
+
+ input_keys = self.get_input_keys(state)
+ input_data = [state[key] for key in input_keys]
+
+ documents = input_data[0]
+
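+        # Convert each page's HTML to plain text, dropping hyperlinks to shrink the content.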
+ for doc in documents:
+ document_md = Html2TextTransformer(ignore_links=True).transform_documents(doc["document"])
+ doc["document"] = document_md[0].page_content
+
+ state.update({self.output[0]: documents})
+
+ return state
diff --git a/scrapegraphai/nodes/rag_node.py b/scrapegraphai/nodes/rag_node.py
index 1174beee..b67c50e9 100644
--- a/scrapegraphai/nodes/rag_node.py
+++ b/scrapegraphai/nodes/rag_node.py
@@ -1,29 +1,10 @@
"""
RAGNode Module
"""
-import os
-import sys
from typing import List, Optional
-from langchain.docstore.document import Document
-from langchain.retrievers import ContextualCompressionRetriever
-from langchain.retrievers.document_compressors import (
- DocumentCompressorPipeline,
- EmbeddingsFilter,
-)
-from langchain_community.document_transformers import EmbeddingsRedundantFilter
-from langchain_community.vectorstores import FAISS
-from langchain_community.chat_models import ChatOllama
-from langchain_community.embeddings import OllamaEmbeddings
-from langchain_aws import BedrockEmbeddings, ChatBedrock
-from langchain_google_genai import GoogleGenerativeAIEmbeddings, ChatGoogleGenerativeAI
-from langchain_openai import AzureOpenAIEmbeddings, OpenAIEmbeddings, ChatOpenAI, AzureChatOpenAI
-from ..utils.logging import get_logger
from .base_node import BaseNode
-from ..helpers import models_tokens
-from ..models import DeepSeek
-
-optional_modules = {"langchain_anthropic", "langchain_fireworks",
- "langchain_groq", "langchain_google_vertexai"}
+from qdrant_client import QdrantClient
+from qdrant_client.models import PointStruct, VectorParams, Distance
class RAGNode(BaseNode):
"""
@@ -34,7 +15,6 @@ class RAGNode(BaseNode):
Attributes:
llm_model: An instance of a language model client, configured for generating answers.
- embedder_model: An instance of an embedding model client, configured for generating embeddings.
verbose (bool): A flag indicating whether to show print statements during execution.
Args:
@@ -58,125 +38,62 @@ def __init__(
self.verbose = (
False if node_config is None else node_config.get("verbose", False)
)
- self.cache_path = node_config.get("cache_path", False)
def execute(self, state: dict) -> dict:
- # Execution logic
- pass
+ self.logger.info(f"--- Executing {self.node_name} Node ---")
+
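+        # Pick the Qdrant backend: in-memory (default), an embedded on-disk store,
+        # or a server reachable at localhost:6333.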
+ if self.node_config.get("client_type") in ["memory", None]:
+ client = QdrantClient(":memory:")
+ elif self.node_config.get("client_type") == "local_db":
+ client = QdrantClient(path="path/to/db")
+ elif self.node_config.get("client_type") == "image":
+ client = QdrantClient(url="http://localhost:6333")
+ else:
+ raise ValueError("client_type provided not correct")
+
+ docs = [elem.get("summary") for elem in state.get("docs")]
+        ids = list(range(1, len(docs) + 1))
+
+ if state.get("embeddings"):
+ import openai
+ openai_client = openai.Client()
+
+ files = state.get("documents")
+
+ array_of_embeddings = []
+        i = 0
- def _create_default_embedder(self, llm_config=None) -> object:
- """
- Create an embedding model instance based on the chosen llm model.
+ for file in files:
+            embeddings = openai_client.embeddings.create(input=file,
+                                                         model=state.get("embeddings").get("model"))
+            i += 1
+            points = PointStruct(
+                id=i,
+                vector=embeddings.data[0].embedding,
+                payload={"text": file},
+            )
- Returns:
- object: An instance of the embedding model client.
+ array_of_embeddings.append(points)
- Raises:
- ValueError: If the model is not supported.
- """
+ collection_name = "collection"
- if isinstance(self.llm_model, ChatGoogleGenerativeAI):
- return GoogleGenerativeAIEmbeddings(
- google_api_key=llm_config["api_key"], model="models/embedding-001"
+ client.create_collection(
+ collection_name,
+ vectors_config=VectorParams(
+ size=1536,
+ distance=Distance.COSINE,
+ ),
)
- if isinstance(self.llm_model, ChatOpenAI):
- return OpenAIEmbeddings(api_key=self.llm_model.openai_api_key,
- base_url=self.llm_model.openai_api_base)
- elif isinstance(self.llm_model, DeepSeek):
- return OpenAIEmbeddings(api_key=self.llm_model.openai_api_key)
- elif isinstance(self.llm_model, AzureOpenAIEmbeddings):
- return self.llm_model
- elif isinstance(self.llm_model, AzureChatOpenAI):
- return AzureOpenAIEmbeddings()
- elif isinstance(self.llm_model, ChatOllama):
- params = self.llm_model._lc_kwargs
- params.pop("streaming", None)
- params.pop("temperature", None)
- return OllamaEmbeddings(**params)
- elif isinstance(self.llm_model, ChatBedrock):
- return BedrockEmbeddings(client=None, model_id=self.llm_model.model_id)
- elif all(key in sys.modules for key in optional_modules):
- if isinstance(self.llm_model, ChatFireworks):
- from langchain_fireworks import FireworksEmbeddings
- return FireworksEmbeddings(model=self.llm_model.model_name)
- if isinstance(self.llm_model, ChatNVIDIA):
- from langchain_nvidia import NVIDIAEmbeddings
- return NVIDIAEmbeddings(model=self.llm_model.model_name)
- if isinstance(self.llm_model, ChatHuggingFace):
- from langchain_huggingface import HuggingFaceEmbeddings
- return HuggingFaceEmbeddings(model=self.llm_model.model)
- if isinstance(self.llm_model, ChatVertexAI):
- from langchain_vertexai import VertexAIEmbeddings
- return VertexAIEmbeddings()
- else:
- raise ValueError("Embedding Model missing or not supported")
-
- def _create_embedder(self, embedder_config: dict) -> object:
- """
- Create an embedding model instance based on the configuration provided.
-
- Args:
- embedder_config (dict): Configuration parameters for the embedding model.
-
- Returns:
- object: An instance of the embedding model client.
-
- Raises:
- KeyError: If the model is not supported.
- """
- embedder_params = {**embedder_config}
- if "model_instance" in embedder_config:
- return embedder_params["model_instance"]
- if "openai" in embedder_params["model"]:
- return OpenAIEmbeddings(api_key=embedder_params["api_key"])
- if "azure" in embedder_params["model"]:
- return AzureOpenAIEmbeddings()
- if "ollama" in embedder_params["model"]:
- embedder_params["model"] = "/".join(embedder_params["model"].split("/")[1:])
- try:
- models_tokens["ollama"][embedder_params["model"]]
- except KeyError as exc:
- raise KeyError("Model not supported") from exc
- return OllamaEmbeddings(**embedder_params)
- if "gemini" in embedder_params["model"]:
- try:
- models_tokens["gemini"][embedder_params["model"]]
- except KeyError as exc:
- raise KeyError("Model not supported") from exc
- return GoogleGenerativeAIEmbeddings(model=embedder_params["model"])
- if "bedrock" in embedder_params["model"]:
- embedder_params["model"] = embedder_params["model"].split("/")[-1]
- client = embedder_params.get("client", None)
- try:
- models_tokens["bedrock"][embedder_params["model"]]
- except KeyError as exc:
- raise KeyError("Model not supported") from exc
- return BedrockEmbeddings(client=client, model_id=embedder_params["model"])
- if all(key in sys.modules for key in optional_modules):
- if "hugging_face" in embedder_params["model"]:
- from langchain_huggingface import HuggingFaceEmbeddings
- embedder_params["model"] = "/".join(embedder_params["model"].split("/")[1:])
- try:
- models_tokens["hugging_face"][embedder_params["model"]]
- except KeyError as exc:
- raise KeyError("Model not supported") from exc
- return HuggingFaceEmbeddings(model=embedder_params["model"])
- elif "fireworks" in embedder_params["model"]:
- from langchain_fireworks import FireworksEmbeddings
- embedder_params["model"] = "/".join(embedder_params["model"].split("/")[1:])
- try:
- models_tokens["fireworks"][embedder_params["model"]]
- except KeyError as exc:
- raise KeyError("Model not supported") from exc
- return FireworksEmbeddings(model=embedder_params["model"])
- elif "nvidia" in embedder_params["model"]:
- from langchain_nvidia import NVIDIAEmbeddings
- embedder_params["model"] = "/".join(embedder_params["model"].split("/")[1:])
- try:
- models_tokens["nvidia"][embedder_params["model"]]
- except KeyError as exc:
- raise KeyError("Model not supported") from exc
- return NVIDIAEmbeddings(model=embedder_params["model"],
- nvidia_api_key=embedder_params["api_key"])
-
- raise ValueError("Model provided by the configuration not supported")
+        client.upsert(collection_name, points=array_of_embeddings)
+
+ state["vectorial_db"] = client
+ return state
+
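+        # No embedding config: client.add embeds the summaries locally (via fastembed) before storing them.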
+ client.add(
+ collection_name="vectorial_collection",
+ documents=docs,
+ ids=ids
+ )
+
+ state["vectorial_db"] = client
+ return state
diff --git a/scrapegraphai/nodes/reasoning_node.py b/scrapegraphai/nodes/reasoning_node.py
new file mode 100644
index 00000000..6b91155c
--- /dev/null
+++ b/scrapegraphai/nodes/reasoning_node.py
@@ -0,0 +1,97 @@
+"""
+ReasoningNode Module
+"""
+from typing import List, Optional
+from langchain.prompts import PromptTemplate
+from langchain_core.output_parsers import StrOutputParser
+from langchain_community.chat_models import ChatOllama
+from .base_node import BaseNode
+from ..utils import transform_schema
+from ..prompts import (
+ TEMPLATE_REASONING, TEMPLATE_REASONING_WITH_CONTEXT
+)
+
+class ReasoningNode(BaseNode):
+ """
+    A node that refines the user prompt using the schema and any additional context,
+    producing a precise prompt for the subsequent steps that explicitly links elements
+    in the user's original input to their corresponding representations in the JSON schema.
+
+ Attributes:
+ llm_model: An instance of a language model client, configured for generating answers.
+ verbose (bool): A flag indicating whether to show print statements during execution.
+
+ Args:
+ input (str): Boolean expression defining the input keys needed from the state.
+ output (List[str]): List of output keys to be updated in the state.
+ node_config (dict): Additional configuration for the node.
+        node_name (str): The unique identifier name for the node, defaulting to "PromptRefiner".
+ """
+
+ def __init__(
+ self,
+ input: str,
+ output: List[str],
+ node_config: Optional[dict] = None,
+ node_name: str = "PromptRefiner",
+ ):
+ super().__init__(node_name, "node", input, output, 2, node_config)
+
+ self.llm_model = node_config["llm_model"]
+
+ if isinstance(node_config["llm_model"], ChatOllama):
+            self.llm_model.format = "json"
+
+ self.verbose = (
+            False if node_config is None else node_config.get("verbose", False)
+ )
+ self.force = (
+ False if node_config is None else node_config.get("force", False)
+ )
+
+ self.additional_info = node_config.get("additional_info", None)
+
+ self.output_schema = node_config.get("schema")
+
+ def execute(self, state: dict) -> dict:
+ """
+ Generate a refined prompt for the reasoning task based
+ on the user's input and the JSON schema.
+
+ Args:
+ state (dict): The current state of the graph. The input keys will be used
+ to fetch the correct data from the state.
+
+ Returns:
+ dict: The updated state with the output key containing the generated answer.
+
+ Raises:
+ KeyError: If the input keys are not found in the state, indicating
+ that the necessary information for generating an answer is missing.
+ """
+
+ self.logger.info(f"--- Executing {self.node_name} Node ---")
+
+ user_prompt = state['user_prompt']
+
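+        # Flatten the Pydantic schema into a simpler dict so the prompt stays readable.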
+        self.simplified_schema = transform_schema(self.output_schema.schema())
+
+ if self.additional_info is not None:
+ prompt = PromptTemplate(
+ template=TEMPLATE_REASONING_WITH_CONTEXT,
+ partial_variables={"user_input": user_prompt,
+ "json_schema": str(self.simplefied_schema),
+ "additional_context": self.additional_info})
+ else:
+ prompt = PromptTemplate(
+ template=TEMPLATE_REASONING,
+ partial_variables={"user_input": user_prompt,
+ "json_schema": str(self.simplefied_schema)})
+
+ output_parser = StrOutputParser()
+
+ chain = prompt | self.llm_model | output_parser
+ refined_prompt = chain.invoke({})
+
+ state.update({self.output[0]: refined_prompt})
+ return state
diff --git a/scrapegraphai/prompts/__init__.py b/scrapegraphai/prompts/__init__.py
index f7be89c1..ab34580b 100644
--- a/scrapegraphai/prompts/__init__.py
+++ b/scrapegraphai/prompts/__init__.py
@@ -18,4 +18,5 @@
TEMPLATE_EXECUTION_ANALYSIS, TEMPLATE_EXECUTION_CODE_GENERATION,
TEMPLATE_VALIDATION_ANALYSIS, TEMPLATE_VALIDATION_CODE_GENERATION,
TEMPLATE_SEMANTIC_COMPARISON, TEMPLATE_SEMANTIC_ANALYSIS,
- TEMPLATE_SEMANTIC_CODE_GENERATION)
\ No newline at end of file
+ TEMPLATE_SEMANTIC_CODE_GENERATION)
+from .reasoning_node_prompts import TEMPLATE_REASONING, TEMPLATE_REASONING_WITH_CONTEXT
\ No newline at end of file
diff --git a/scrapegraphai/prompts/description_node_prompts.py b/scrapegraphai/prompts/description_node_prompts.py
new file mode 100644
index 00000000..20df481a
--- /dev/null
+++ b/scrapegraphai/prompts/description_node_prompts.py
@@ -0,0 +1,10 @@
+"""
+description node prompts
+"""
+
+DESCRIPTION_NODE_PROMPT = """
+You are a scraper and you have just scraped the
+following content from a website. \n
+Please provide a summary description of at most 20 words.
+Content of the website: {content}
+"""
\ No newline at end of file
diff --git a/scrapegraphai/prompts/generate_answer_node_prompts.py b/scrapegraphai/prompts/generate_answer_node_prompts.py
index 7c098fe2..1b336fb4 100644
--- a/scrapegraphai/prompts/generate_answer_node_prompts.py
+++ b/scrapegraphai/prompts/generate_answer_node_prompts.py
@@ -2,6 +2,7 @@
Generate answer node prompts
"""
+
TEMPLATE_CHUNKS_MD = """
You are a website scraper and you have just scraped the
following content from a website converted in markdown format.
@@ -32,6 +33,7 @@
You are now asked to answer a user question about the content you have scraped.\n
You have scraped many chunks since the website is big and now you are asked to merge them into a single answer without repetitions (if there are any).\n
Make sure that if a maximum number of items is specified in the instructions that you get that maximum number and do not exceed it. \n
+The structure should be coherent. \n
Make sure the output format is a valid JSON and does not contain errors. \n
OUTPUT INSTRUCTIONS: {format_instructions}\n
USER QUESTION: {question}\n
diff --git a/scrapegraphai/prompts/reasoning_node_prompts.py b/scrapegraphai/prompts/reasoning_node_prompts.py
new file mode 100644
index 00000000..d9caf937
--- /dev/null
+++ b/scrapegraphai/prompts/reasoning_node_prompts.py
@@ -0,0 +1,72 @@
+"""
+Reasoning prompts helper
+"""
+
+TEMPLATE_REASONING = """
+**Task**: Analyze the user's request and the provided JSON schema to guide an LLM in extracting information directly from a markdown file previously parsed from an HTML file.
+
+**User's Request**:
+{user_input}
+
+**Target JSON Schema**:
+```json
+{json_schema}
+```
+
+**Analysis Instructions**:
+1. **Interpret User Request:**
+* Identify the key information types or entities the user is seeking.
+* Note any specific attributes, relationships, or constraints mentioned.
+
+2. **Map to JSON Schema**:
+* For each identified element in the user request, locate its corresponding field in the JSON schema.
+* Explain how the schema structure represents the requested information.
+* Highlight any relevant schema elements not explicitly mentioned in the user's request.
+
+3. **Data Transformation Guidance**:
+* Provide guidance on any necessary transformations to align extracted data with the JSON schema requirements.
+
+This analysis will be used to instruct an LLM that has the HTML content in its context. The LLM will use this guidance to extract the information and return it directly in the specified JSON format.
+
+**Reasoning Output**:
+[Your detailed analysis based on the above instructions]
+"""
+
+TEMPLATE_REASONING_WITH_CONTEXT = """
+**Task**: Analyze the user's request and the provided JSON schema to guide an LLM in extracting information directly from a markdown file previously parsed from an HTML file.
+
+**User's Request**:
+{user_input}
+
+**Target JSON Schema**:
+```json
+{json_schema}
+```
+
+**Additional Context**:
+{additional_context}
+
+**Analysis Instructions**:
+1. **Interpret User Request and Context:**
+* Identify the key information types or entities the user is seeking.
+* Note any specific attributes, relationships, or constraints mentioned.
+* Incorporate insights from the additional context to refine understanding of the task.
+
+2. **Map to JSON Schema**:
+* For each identified element in the user request, locate its corresponding field in the JSON schema.
+* Explain how the schema structure represents the requested information.
+* Highlight any relevant schema elements not explicitly mentioned in the user's request.
+
+3. **Extraction Strategy**:
+* Based on the additional context, suggest specific strategies for locating and extracting the required information from the HTML.
+* Highlight any potential challenges or special considerations mentioned in the context.
+
+4. **Data Transformation Guidance**:
+* Provide guidance on any necessary transformations to align extracted data with the JSON schema requirements.
+* Note any special formatting, validation, or business logic considerations from the additional context.
+
+This analysis will be used to instruct an LLM that has the HTML content in its context. The LLM will use this guidance to extract the information and return it directly in the specified JSON format.
+
+**Reasoning Output**:
+[Your detailed analysis based on the above instructions, incorporating insights from the additional context]
+"""
diff --git a/scrapegraphai/utils/1_manual.py b/scrapegraphai/utils/1_manual.py
new file mode 100644
index 00000000..21703b7b
--- /dev/null
+++ b/scrapegraphai/utils/1_manual.py
@@ -0,0 +1,92 @@
+"""
+Manual fetching example built on the scrape.do API.
+"""
+import json
+import logging
+import os
+import time
+from typing import Optional
+from urllib.parse import quote, urljoin
+import requests
+import markdownify
+from bs4 import BeautifulSoup
+from dotenv import load_dotenv
+
+load_dotenv()
+
+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+
+def fetch_content(token: str, target_url: str, max_retries: int = 5, retry_delay: int = 3) -> Optional[str]:
+ encoded_url = quote(target_url)
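+    # render=true has scrape.do render the page in a headless browser;
+    # waitUntil=networkidle0 waits for the network to go idle before returning.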
+ url = f"http://api.scrape.do?url={encoded_url}&token={token}&render=true&waitUntil=networkidle0"
+
+ for attempt in range(max_retries):
+ try:
+            response = requests.get(url, timeout=60)  # timeout guards against a hung connection
+ if response.status_code == 200:
+ logging.info(f"Successfully fetched content from {target_url}")
+ return response.text
+ logging.warning(f"Failed with status {response.status_code}. Retrying in {retry_delay}s...")
+ except requests.RequestException as e:
+ logging.error(f"Error fetching {target_url}: {e}. Retrying in {retry_delay}s...")
+ time.sleep(retry_delay)
+
+ logging.error(f"Failed to fetch {target_url} after {max_retries} attempts.")
+ return None
+
+def extract_links(html_content: str) -> list:
+ soup = BeautifulSoup(html_content, 'html.parser')
+ links = [link['href'] for link in soup.find_all('a', href=True)]
+ logging.info(f"Extracted {len(links)} links.")
+ return links
+
+def process_links(token: str, base_url: str, links: list, depth: int, current_depth: int = 1) -> dict:
+ content_dict = {}
+ for idx, link in enumerate(links, start=1):
+ full_link = link if link.startswith("http") else urljoin(base_url, link)
+ logging.info(f"Processing link {idx}: {full_link}")
+ link_content = fetch_content(token, full_link)
+ if link_content:
+ markdown_content = markdownify.markdownify(link_content, heading_style="ATX")
+ content_dict[full_link] = markdown_content
+ save_content_to_json(content_dict, idx)
+
+ if current_depth < depth:
+ new_links = extract_links(link_content)
+ content_dict.update(process_links(token, full_link, new_links, depth, current_depth + 1))
+ else:
+ logging.warning(f"Failed to fetch content for {full_link}")
+ return content_dict
+
+def save_content_to_json(content_dict: dict, idx: int):
+ if not os.path.exists("downloaded_pages"):
+ os.makedirs("downloaded_pages")
+
+ file_name = f"scraped_content_{idx}.json"
+ file_path = os.path.join("downloaded_pages", file_name)
+
+ with open(file_path, "w", encoding="utf-8") as json_file:
+ json.dump(content_dict, json_file, ensure_ascii=False, indent=4)
+
+ logging.info(f"Content saved to {file_path}")
+
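+# Example driver: fetch the target page, crawl its links to the configured depth, and log content samples.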
+if __name__ == "__main__":
+ token = os.getenv("TOKEN")
+ target_url = "https://www.wired.com"
+ depth = 2
+
+    if not token:
+        logging.error("Please set the TOKEN environment variable.")
+ exit(1)
+
+ html_content = fetch_content(token, target_url)
+
+ if html_content:
+ links = extract_links(html_content)
+ logging.info("Links found:")
+ for link in links:
+ logging.info(link)
+
+ content_dict = process_links(token, target_url, links, depth)
+ for link, content in content_dict.items():
+ logging.info(f"Link: {link}")
+ logging.info(f"Content: {content[:500]}...")
+ else:
+ logging.error("Failed to fetch the content.")