diff --git a/CHANGELOG.md b/CHANGELOG.md
index 70bcbbde..689eeec3 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,17 @@
+## [1.22.0-beta.5](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.22.0-beta.4...v1.22.0-beta.5) (2024-09-27)
+
+
+### Features
+
+* add reasoning integration ([b2822f6](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/b2822f620a610e61d295cbf4b670aa08fde9de24))
+
+## [1.22.0-beta.4](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.22.0-beta.3...v1.22.0-beta.4) (2024-09-27)
+
+
+### Features
+
+* add html_mode to smart_scraper ([bdcffd6](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/bdcffd6360237b27797546a198ceece55ce4bc81))
+
 ## [1.22.0-beta.3](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.22.0-beta.2...v1.22.0-beta.3) (2024-09-25)
diff --git a/examples/extras/html_mode.py b/examples/extras/html_mode.py
new file mode 100644
index 00000000..6e2670a0
--- /dev/null
+++ b/examples/extras/html_mode.py
@@ -0,0 +1,49 @@
+"""
+Basic example of scraping pipeline using SmartScraper
+By default smart scraper converts in md format the
+code. If you want to just use the original code, you have
+to specify it in the config
+"""
+
+import os
+import json
+from dotenv import load_dotenv
+from scrapegraphai.graphs import SmartScraperGraph
+from scrapegraphai.utils import prettify_exec_info
+
+load_dotenv()
+
+# ************************************************
+# Define the configuration for the graph
+# ************************************************
+
+
+graph_config = {
+    "llm": {
+        "api_key": os.getenv("OPENAI_API_KEY"),
+        "model": "openai/gpt-4o",
+    },
+    "html_mode": True,
+    "verbose": True,
+    "headless": False,
+}
+
+# ************************************************
+# Create the SmartScraperGraph instance and run it
+# ************************************************
+
+smart_scraper_graph = SmartScraperGraph(
+    prompt="List me what does the company do, the name and a contact email.",
+    source="https://scrapegraphai.com/",
+    config=graph_config
+)
+
+result = smart_scraper_graph.run()
+print(json.dumps(result, indent=4))
+
+# ************************************************
+# Get graph execution info
+# ************************************************
+
+graph_exec_info = smart_scraper_graph.get_execution_info()
+print(prettify_exec_info(graph_exec_info))
diff --git a/pyproject.toml b/pyproject.toml
index b7e0b1cc..ef0b104a 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,7 +1,7 @@
 [project]
 name = "scrapegraphai"
-version = "1.22.0b3"
+version = "1.22.0b5"
 description = "A web scraping library based on LangChain which uses LLM and direct graph logic to create scraping pipelines."
 authors = [
diff --git a/scrapegraphai/graphs/smart_scraper_graph.py b/scrapegraphai/graphs/smart_scraper_graph.py
index 95c2b460..4ffc6bed 100644
--- a/scrapegraphai/graphs/smart_scraper_graph.py
+++ b/scrapegraphai/graphs/smart_scraper_graph.py
@@ -70,14 +70,7 @@ def _create_graph(self) -> BaseGraph:
                 "scrape_do": self.config.get("scrape_do")
             }
         )
-        parse_node = ParseNode(
-            input="doc",
-            output=["parsed_doc"],
-            node_config={
-                "llm_model": self.llm_model,
-                "chunk_size": self.model_token
-            }
-        )
+
 
         generate_answer_node = GenerateAnswerNode(
             input="user_prompt & (relevant_chunks | parsed_doc | doc)",
@@ -89,6 +82,17 @@
             }
         )
 
+        if self.config.get("html_mode") is not True:
+
+            parse_node = ParseNode(
+                input="doc",
+                output=["parsed_doc"],
+                node_config={
+                    "llm_model": self.llm_model,
+                    "chunk_size": self.model_token
+                }
+            )
+
         if self.config.get("reasoning"):
             reasoning_node = ReasoningNode(
                 input="user_prompt & (relevant_chunks | parsed_doc | doc)",
@@ -104,11 +108,12 @@
                 nodes=[
                     fetch_node,
                     parse_node,
+                    reasoning_node,
                     generate_answer_node,
                 ],
                 edges=[
                     (fetch_node, parse_node),
                     (parse_node, reasoning_node),
                     (reasoning_node, generate_answer_node)
                 ],
                 entry_point=fetch_node,
@@ -117,18 +122,31 @@
         )
 
+        if self.config.get("html_mode") is not True:
+            return BaseGraph(
+                nodes=[
+                    fetch_node,
+                    parse_node,
+                    generate_answer_node,
+                ],
+                edges=[
+                    (fetch_node, parse_node),
+                    (parse_node, generate_answer_node)
+                ],
+                entry_point=fetch_node,
+                graph_name=self.__class__.__name__
+            )
+
         return BaseGraph(
-            nodes=[
-                fetch_node,
-                parse_node,
-                generate_answer_node,
-            ],
-            edges=[
-                (fetch_node, parse_node),
-                (parse_node, generate_answer_node)
-            ],
-            entry_point=fetch_node,
-            graph_name=self.__class__.__name__
-        )
+            nodes=[
+                fetch_node,
+                generate_answer_node,
+            ],
+            edges=[
+                (fetch_node, generate_answer_node)
+            ],
+            entry_point=fetch_node,
+            graph_name=self.__class__.__name__
+        )
+
     def run(self) -> str:
         """