From bdcffd6360237b27797546a198ceece55ce4bc81 Mon Sep 17 00:00:00 2001 From: Marco Vinciguerra Date: Fri, 27 Sep 2024 16:41:45 +0200 Subject: [PATCH 1/4] feat: add html_mode to smart_scraper --- examples/extras/html_mode.py | 48 +++++++++++++++++ scrapegraphai/graphs/smart_scraper_graph.py | 57 +++++++++++++-------- 2 files changed, 85 insertions(+), 20 deletions(-) create mode 100644 examples/extras/html_mode.py diff --git a/examples/extras/html_mode.py b/examples/extras/html_mode.py new file mode 100644 index 00000000..c13ba694 --- /dev/null +++ b/examples/extras/html_mode.py @@ -0,0 +1,48 @@ +""" +Basic example of scraping pipeline using SmartScraper +By default smart scraper converts in md format the +code. +""" + +import os +import json +from dotenv import load_dotenv +from scrapegraphai.graphs import SmartScraperGraph +from scrapegraphai.utils import prettify_exec_info + +load_dotenv() + +# ************************************************ +# Define the configuration for the graph +# ************************************************ + + +graph_config = { + "llm": { + "api_key": os.getenv("OPENAI_API_KEY"), + "model": "openai/gpt-4o", + }, + "html_mode": True, + "verbose": True, + "headless": False, +} + +# ************************************************ +# Create the SmartScraperGraph instance and run it +# ************************************************ + +smart_scraper_graph = SmartScraperGraph( + prompt="List me what does the company do, the name and a contact email.", + source="https://scrapegraphai.com/", + config=graph_config +) + +result = smart_scraper_graph.run() +print(json.dumps(result, indent=4)) + +# ************************************************ +# Get graph execution info +# ************************************************ + +graph_exec_info = smart_scraper_graph.get_execution_info() +print(prettify_exec_info(graph_exec_info)) diff --git a/scrapegraphai/graphs/smart_scraper_graph.py b/scrapegraphai/graphs/smart_scraper_graph.py index 
0c025c3a..7792ed58 100644 --- a/scrapegraphai/graphs/smart_scraper_graph.py +++ b/scrapegraphai/graphs/smart_scraper_graph.py @@ -69,14 +69,7 @@ def _create_graph(self) -> BaseGraph: "scrape_do": self.config.get("scrape_do") } ) - parse_node = ParseNode( - input="doc", - output=["parsed_doc"], - node_config={ - "llm_model": self.llm_model, - "chunk_size": self.model_token - } - ) + generate_answer_node = GenerateAnswerNode( input="user_prompt & (relevant_chunks | parsed_doc | doc)", @@ -88,19 +81,43 @@ def _create_graph(self) -> BaseGraph: } ) + if self.config.get("html_mode") is not True: + + parse_node = ParseNode( + input="doc", + output=["parsed_doc"], + node_config={ + "llm_model": self.llm_model, + "chunk_size": self.model_token + } + ) + + return BaseGraph( + nodes=[ + fetch_node, + parse_node, + generate_answer_node, + ], + edges=[ + (fetch_node, parse_node), + (parse_node, generate_answer_node) + ], + entry_point=fetch_node, + graph_name=self.__class__.__name__ + ) + return BaseGraph( - nodes=[ - fetch_node, - parse_node, - generate_answer_node, - ], - edges=[ - (fetch_node, parse_node), - (parse_node, generate_answer_node) - ], - entry_point=fetch_node, - graph_name=self.__class__.__name__ - ) + nodes=[ + fetch_node, + generate_answer_node, + ], + edges=[ + (fetch_node, generate_answer_node) + ], + entry_point=fetch_node, + graph_name=self.__class__.__name__ + ) + def run(self) -> str: """ From 1e4ee3abdf8dce321977bbc74f1976fba33877bc Mon Sep 17 00:00:00 2001 From: Marco Vinciguerra Date: Fri, 27 Sep 2024 16:42:51 +0200 Subject: [PATCH 2/4] Update html_mode.py --- examples/extras/html_mode.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/examples/extras/html_mode.py b/examples/extras/html_mode.py index c13ba694..6e2670a0 100644 --- a/examples/extras/html_mode.py +++ b/examples/extras/html_mode.py @@ -1,7 +1,8 @@ """ Basic example of scraping pipeline using SmartScraper By default smart scraper converts in md format the -code. +code. 
If you want to just use the original code, you have +to specify "html_mode": True in the graph config. """ import os From 4330179cb65674d65423c1763f90182e85c15a74 Mon Sep 17 00:00:00 2001 From: semantic-release-bot Date: Fri, 27 Sep 2024 14:47:04 +0000 Subject: [PATCH 3/4] ci(release): 1.22.0-beta.4 [skip ci] ## [1.22.0-beta.4](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.22.0-beta.3...v1.22.0-beta.4) (2024-09-27) ### Features * add html_mode to smart_scraper ([bdcffd6](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/bdcffd6360237b27797546a198ceece55ce4bc81)) --- CHANGELOG.md | 7 +++++++ pyproject.toml | 2 +- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 70bcbbde..bcc66ecd 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,10 @@ +## [1.22.0-beta.4](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.22.0-beta.3...v1.22.0-beta.4) (2024-09-27) + + +### Features + +* add html_mode to smart_scraper ([bdcffd6](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/bdcffd6360237b27797546a198ceece55ce4bc81)) + ## [1.22.0-beta.3](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.22.0-beta.2...v1.22.0-beta.3) (2024-09-25) diff --git a/pyproject.toml b/pyproject.toml index b7e0b1cc..fc61a859 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,7 +1,7 @@ [project] name = "scrapegraphai" -version = "1.22.0b3" +version = "1.22.0b4" description = "A web scraping library based on LangChain which uses LLM and direct graph logic to create scraping pipelines." 
authors = [ From 6d8f5435d1ecd2d90b06aade50abc064f75c9d78 Mon Sep 17 00:00:00 2001 From: semantic-release-bot Date: Fri, 27 Sep 2024 15:51:48 +0000 Subject: [PATCH 4/4] ci(release): 1.22.0-beta.5 [skip ci] ## [1.22.0-beta.5](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.22.0-beta.4...v1.22.0-beta.5) (2024-09-27) ### Features * add reasoning integration ([b2822f6](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/b2822f620a610e61d295cbf4b670aa08fde9de24)) --- CHANGELOG.md | 7 +++++++ pyproject.toml | 2 +- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index bcc66ecd..689eeec3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,10 @@ +## [1.22.0-beta.5](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.22.0-beta.4...v1.22.0-beta.5) (2024-09-27) + + +### Features + +* add reasoning integration ([b2822f6](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/b2822f620a610e61d295cbf4b670aa08fde9de24)) + ## [1.22.0-beta.4](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.22.0-beta.3...v1.22.0-beta.4) (2024-09-27) diff --git a/pyproject.toml b/pyproject.toml index fc61a859..ef0b104a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,7 +1,7 @@ [project] name = "scrapegraphai" -version = "1.22.0b4" +version = "1.22.0b5" description = "A web scraping library based on LangChain which uses LLM and direct graph logic to create scraping pipelines." authors = [