From e76a68a782e5bce48d421cb620d0b7bffa412918 Mon Sep 17 00:00:00 2001 From: roryhaung Date: Sun, 13 Oct 2024 23:04:27 +0800 Subject: [PATCH 01/39] fix: remove variable "max_result" not being used in the code --- scrapegraphai/graphs/csv_scraper_multi_graph.py | 1 - scrapegraphai/graphs/json_scraper_multi_graph.py | 1 - scrapegraphai/graphs/script_creator_multi_graph.py | 2 -- scrapegraphai/graphs/smart_scraper_multi_concat_graph.py | 1 - 4 files changed, 5 deletions(-) diff --git a/scrapegraphai/graphs/csv_scraper_multi_graph.py b/scrapegraphai/graphs/csv_scraper_multi_graph.py index e7a028f3..284a49ac 100644 --- a/scrapegraphai/graphs/csv_scraper_multi_graph.py +++ b/scrapegraphai/graphs/csv_scraper_multi_graph.py @@ -44,7 +44,6 @@ class CSVScraperMultiGraph(AbstractGraph): def __init__(self, prompt: str, source: List[str], config: dict, schema: Optional[BaseModel] = None): - self.max_results = config.get("max_results", 3) self.copy_config = safe_deepcopy(config) self.copy_schema = deepcopy(schema) diff --git a/scrapegraphai/graphs/json_scraper_multi_graph.py b/scrapegraphai/graphs/json_scraper_multi_graph.py index 6e5434f0..a6dd22d4 100644 --- a/scrapegraphai/graphs/json_scraper_multi_graph.py +++ b/scrapegraphai/graphs/json_scraper_multi_graph.py @@ -44,7 +44,6 @@ class JSONScraperMultiGraph(AbstractGraph): def __init__(self, prompt: str, source: List[str], config: dict, schema: Optional[BaseModel] = None): - self.max_results = config.get("max_results", 3) self.copy_config = safe_deepcopy(config) self.copy_schema = deepcopy(schema) diff --git a/scrapegraphai/graphs/script_creator_multi_graph.py b/scrapegraphai/graphs/script_creator_multi_graph.py index de1ab6f7..a5598936 100644 --- a/scrapegraphai/graphs/script_creator_multi_graph.py +++ b/scrapegraphai/graphs/script_creator_multi_graph.py @@ -43,8 +43,6 @@ class ScriptCreatorMultiGraph(AbstractGraph): def __init__(self, prompt: str, source: List[str], config: dict, schema: Optional[BaseModel] = None): - self.max_results = config.get("max_results", 3) - self.copy_config = safe_deepcopy(config) self.copy_schema = deepcopy(schema) super().__init__(prompt, config, source, schema) diff --git a/scrapegraphai/graphs/smart_scraper_multi_concat_graph.py b/scrapegraphai/graphs/smart_scraper_multi_concat_graph.py index 0bd84a55..312d6457 100644 --- a/scrapegraphai/graphs/smart_scraper_multi_concat_graph.py +++ b/scrapegraphai/graphs/smart_scraper_multi_concat_graph.py @@ -45,7 +45,6 @@ class SmartScraperMultiConcatGraph(AbstractGraph): def __init__(self, prompt: str, source: List[str], config: dict, schema: Optional[BaseModel] = None): - self.max_results = config.get("max_results", 3) self.copy_config = safe_deepcopy(config) self.copy_schema = deepcopy(schema) From e0fc457d1a850f3306d473fbde55dd800133b404 Mon Sep 17 00:00:00 2001 From: semantic-release-bot Date: Mon, 14 Oct 2024 12:14:59 +0000 Subject: [PATCH 02/39] ci(release): 1.26.6-beta.1 [skip ci] ## [1.26.6-beta.1](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.26.5...v1.26.6-beta.1) (2024-10-14) ### Bug Fixes * remove variable "max_result" not being used in the code ([e76a68a](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/e76a68a782e5bce48d421cb620d0b7bffa412918)) --- CHANGELOG.md | 7 +++++++ pyproject.toml | 2 +- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index cbe12a5a..3b45f437 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,10 @@ +## 
[1.26.6-beta.1](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.26.5...v1.26.6-beta.1) (2024-10-14) + + +### Bug Fixes + +* remove variable "max_result" not being used in the code ([e76a68a](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/e76a68a782e5bce48d421cb620d0b7bffa412918)) + ## [1.26.5](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.26.4...v1.26.5) (2024-10-13) diff --git a/pyproject.toml b/pyproject.toml index 6bade627..c0b58843 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,7 +1,7 @@ [project] name = "scrapegraphai" -version = "1.26.5" +version = "1.26.6b1" description = "A web scraping library based on LangChain which uses LLM and direct graph logic to create scraping pipelines." authors = [ From cacd9cde004dace1a7dcc27981245632a78b95f3 Mon Sep 17 00:00:00 2001 From: ekinsenler Date: Tue, 15 Oct 2024 14:23:02 +0300 Subject: [PATCH 03/39] feat: add conditional node structure to the smart_scraper_graph and implemented a structured way to check condition --- requirements.txt | 3 +- scrapegraphai/graphs/smart_scraper_graph.py | 138 +++++++++++------- scrapegraphai/prompts/__init__.py | 2 +- .../prompts/generate_answer_node_prompts.py | 4 + 4 files changed, 90 insertions(+), 57 deletions(-) diff --git a/requirements.txt b/requirements.txt index c72ad1bb..6c7a0326 100644 --- a/requirements.txt +++ b/requirements.txt @@ -18,4 +18,5 @@ undetected-playwright>=0.3.0 semchunk>=1.0.1 langchain-ollama>=0.1.3 simpleeval>=0.9.13 -googlesearch-python>=1.2.5 \ No newline at end of file +googlesearch-python>=1.2.5 +async_timeout>=4.0.3 \ No newline at end of file diff --git a/scrapegraphai/graphs/smart_scraper_graph.py b/scrapegraphai/graphs/smart_scraper_graph.py index 60407624..478f6634 100644 --- a/scrapegraphai/graphs/smart_scraper_graph.py +++ b/scrapegraphai/graphs/smart_scraper_graph.py @@ -2,7 +2,6 @@ SmartScraperGraph Module """ from typing import Optional -import logging from pydantic import BaseModel from .base_graph import BaseGraph from .abstract_graph import AbstractGraph @@ -10,8 +9,10 @@ FetchNode, ParseNode, ReasoningNode, - GenerateAnswerNode + GenerateAnswerNode, + ConditionalNode ) +from ..prompts import REGEN_ADDITIONAL_INFO class SmartScraperGraph(AbstractGraph): """ @@ -89,6 +90,28 @@ def _create_graph(self) -> BaseGraph: } ) + cond_node = None + regen_node = None + if self.config.get("reattempt") is True: + cond_node = ConditionalNode( + input="results", + output=["results"], + node_name="ConditionalNode", + node_config={ + "key_name": "results", + "condition": 'results and results!="NA"', + } + ) + regen_node = GenerateAnswerNode( + input="user_prompt & results", + output=["answer"], + node_config={ + "llm_model": self.llm_model, + "additional_info": REGEN_ADDITIONAL_INFO, + "schema": self.schema, + } + ) + if self.config.get("html_mode") is False: parse_node = ParseNode( input="doc", @@ -99,6 +122,7 @@ def _create_graph(self) -> BaseGraph: } ) + reasoning_node = None if self.config.get("reasoning"): reasoning_node = ReasoningNode( input="user_prompt & (relevant_chunks | parsed_doc | doc)", @@ -109,68 +133,72 @@ def _create_graph(self) -> BaseGraph: "schema": self.schema, } ) + + # Define the graph variation configurations + # (html_mode, reasoning, reattempt) + graph_variation_config = { + (False, True, False): { + "nodes": [fetch_node, parse_node, reasoning_node, generate_answer_node], + "edges": [(fetch_node, parse_node), (parse_node, reasoning_node), (reasoning_node, generate_answer_node)] + }, + (True, True, False): { + "nodes": 
[fetch_node, reasoning_node, generate_answer_node], + "edges": [(fetch_node, reasoning_node), (reasoning_node, generate_answer_node)] + }, + (True, False, False): { + "nodes": [fetch_node, generate_answer_node], + "edges": [(fetch_node, generate_answer_node)] + }, + (False, False, False): { + "nodes": [fetch_node, parse_node, generate_answer_node], + "edges": [(fetch_node, parse_node), (parse_node, generate_answer_node)] + }, + (False, True, True): { + "nodes": [fetch_node, parse_node, reasoning_node, generate_answer_node, cond_node, regen_node], + "edges": [(fetch_node, parse_node), (parse_node, reasoning_node), (reasoning_node, generate_answer_node), + (generate_answer_node, cond_node), (cond_node, regen_node), (cond_node, None)] + }, + (True, True, True): { + "nodes": [fetch_node, reasoning_node, generate_answer_node, cond_node, regen_node], + "edges": [(fetch_node, reasoning_node), (reasoning_node, generate_answer_node), + (generate_answer_node, cond_node), (cond_node, regen_node), (cond_node, None)] + }, + (True, False, True): { + "nodes": [fetch_node, generate_answer_node, cond_node, regen_node], + "edges": [(fetch_node, generate_answer_node), (generate_answer_node, cond_node), + (cond_node, regen_node), (cond_node, None)] + }, + (False, False, True): { + "nodes": [fetch_node, parse_node, generate_answer_node, cond_node, regen_node], + "edges": [(fetch_node, parse_node), (parse_node, generate_answer_node), + (generate_answer_node, cond_node), (cond_node, regen_node), (cond_node, None)] + } + } - if self.config.get("html_mode") is False and self.config.get("reasoning") is True: - - return BaseGraph( - nodes=[ - fetch_node, - parse_node, - reasoning_node, - generate_answer_node, - ], - edges=[ - (fetch_node, parse_node), - (parse_node, reasoning_node), - (reasoning_node, generate_answer_node) - ], - entry_point=fetch_node, - graph_name=self.__class__.__name__ - ) - - elif self.config.get("html_mode") is True and self.config.get("reasoning") is True: + # Get the current conditions + html_mode = self.config.get("html_mode", False) + reasoning = self.config.get("reasoning", False) + reattempt = self.config.get("reattempt", False) - return BaseGraph( - nodes=[ - fetch_node, - reasoning_node, - generate_answer_node, - ], - edges=[ - (fetch_node, reasoning_node), - (reasoning_node, generate_answer_node) - ], - entry_point=fetch_node, - graph_name=self.__class__.__name__ - ) + # Retrieve the appropriate graph configuration + config = graph_variation_config.get((html_mode, reasoning, reattempt)) - elif self.config.get("html_mode") is True and self.config.get("reasoning") is False: + if config: return BaseGraph( - nodes=[ - fetch_node, - generate_answer_node, - ], - edges=[ - (fetch_node, generate_answer_node) - ], + nodes=config["nodes"], + edges=config["edges"], entry_point=fetch_node, graph_name=self.__class__.__name__ ) + # Default return if no conditions match return BaseGraph( - nodes=[ - fetch_node, - parse_node, - generate_answer_node, - ], - edges=[ - (fetch_node, parse_node), - (parse_node, generate_answer_node) - ], - entry_point=fetch_node, - graph_name=self.__class__.__name__ - ) - + nodes=[fetch_node, parse_node, generate_answer_node], + edges=[(fetch_node, parse_node), (parse_node, generate_answer_node)], + entry_point=fetch_node, + graph_name=self.__class__.__name__ + ) + def run(self) -> str: """ Executes the scraping process and returns the answer to the prompt. 
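The graph_variation_config table above replaces the earlier chain of if/elif branches: each (html_mode, reasoning, reattempt) tuple maps straight to the node and edge lists of one pipeline variant, and _create_graph falls back to the plain fetch-parse-answer pipeline when no key matches. A minimal, runnable sketch of the same dispatch pattern, using placeholder strings rather than the real node objects:

    # Hypothetical illustration of tuple-keyed graph dispatch; node names are placeholders.
    fetch, parse, answer = "FetchNode", "ParseNode", "GenerateAnswerNode"
    variants = {
        (False, False): {"nodes": [fetch, parse, answer], "edges": [(fetch, parse), (parse, answer)]},
        (True, False): {"nodes": [fetch, answer], "edges": [(fetch, answer)]},
    }
    config = {"html_mode": True}
    key = (config.get("html_mode", False), config.get("reasoning", False))
    variant = variants.get(key) or variants[(False, False)]  # default pipeline if no match
    print(variant["nodes"])  # ['FetchNode', 'GenerateAnswerNode']
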
diff --git a/scrapegraphai/prompts/__init__.py b/scrapegraphai/prompts/__init__.py
index b23374a4..ea916842 100644
--- a/scrapegraphai/prompts/__init__.py
+++ b/scrapegraphai/prompts/__init__.py
@@ -5,7 +5,7 @@
 from .generate_answer_node_prompts import (TEMPLATE_CHUNKS,
                                            TEMPLATE_NO_CHUNKS, TEMPLATE_MERGE,
                                            TEMPLATE_CHUNKS_MD,
-                                           TEMPLATE_NO_CHUNKS_MD, TEMPLATE_MERGE_MD)
+                                           TEMPLATE_NO_CHUNKS_MD, TEMPLATE_MERGE_MD, REGEN_ADDITIONAL_INFO)
 from .generate_answer_node_csv_prompts import (TEMPLATE_CHUKS_CSV,
                                                TEMPLATE_NO_CHUKS_CSV, TEMPLATE_MERGE_CSV)
diff --git a/scrapegraphai/prompts/generate_answer_node_prompts.py b/scrapegraphai/prompts/generate_answer_node_prompts.py
index f9506a7b..a14f27f4 100644
--- a/scrapegraphai/prompts/generate_answer_node_prompts.py
+++ b/scrapegraphai/prompts/generate_answer_node_prompts.py
@@ -86,3 +86,7 @@
 USER QUESTION: {question}\n
 WEBSITE CONTENT: {context}\n
 """
+
+REGEN_ADDITIONAL_INFO = """
+You are a scraper and you have just failed to scrape the requested information from a website. \n
+I want you to try again and provide the missing information. \n"""

From 038d2ef916ff0306c1fa5258a161889281d54235 Mon Sep 17 00:00:00 2001
From: ekinsenler
Date: Tue, 15 Oct 2024 15:23:05 +0300
Subject: [PATCH 04/39] refactor cond node structure to fit with the new implementation

---
 scrapegraphai/graphs/base_graph.py          |  7 ++++++-
 scrapegraphai/graphs/smart_scraper_graph.py | 10 +++++-----
 scrapegraphai/nodes/conditional_node.py     |  2 +-
 3 files changed, 12 insertions(+), 7 deletions(-)

diff --git a/scrapegraphai/graphs/base_graph.py b/scrapegraphai/graphs/base_graph.py
index 5fa9ff34..d3a9cf85 100644
--- a/scrapegraphai/graphs/base_graph.py
+++ b/scrapegraphai/graphs/base_graph.py
@@ -95,7 +95,10 @@ def _set_conditional_node_edges(self):
                 raise ValueError(f"ConditionalNode '{node.node_name}' must have exactly two outgoing edges.")
             # Assign true_node_name and false_node_name
             node.true_node_name = outgoing_edges[0][1].node_name
-            node.false_node_name = outgoing_edges[1][1].node_name
+            try:
+                node.false_node_name = outgoing_edges[1][1].node_name
+            except IndexError:
+                node.false_node_name = None

     def _execute_standard(self, initial_state: dict) -> Tuple[dict, list]:
         """
@@ -221,6 +224,8 @@ def _execute_standard(self, initial_state: dict) -> Tuple[dict, list]:
                     node_names = {node.node_name for node in self.nodes}
                     if result in node_names:
                         current_node_name = result
+                    elif result is None:
+                        current_node_name = None
                     else:
                         raise ValueError(f"Conditional Node returned a node name '{result}' that does not exist in the graph")

diff --git a/scrapegraphai/graphs/smart_scraper_graph.py b/scrapegraphai/graphs/smart_scraper_graph.py
index 478f6634..03e8cd5d 100644
--- a/scrapegraphai/graphs/smart_scraper_graph.py
+++ b/scrapegraphai/graphs/smart_scraper_graph.py
@@ -94,16 +94,16 @@ def _create_graph(self) -> BaseGraph:
         regen_node = None
         if self.config.get("reattempt") is True:
             cond_node = ConditionalNode(
-                input="results",
-                output=["results"],
+                input="answer",
+                output=["answer"],
                 node_name="ConditionalNode",
                 node_config={
-                    "key_name": "results",
-                    "condition": 'results and results!="NA"',
+                    "key_name": "answer",
+                    "condition": 'not answer or answer=="NA"',
                 }
             )
             regen_node = GenerateAnswerNode(
-                input="user_prompt & results",
+                input="user_prompt & answer",
                 output=["answer"],
                 node_config={
                     "llm_model": self.llm_model,

diff --git a/scrapegraphai/nodes/conditional_node.py b/scrapegraphai/nodes/conditional_node.py
index 02ff61e9..c5ff58f3 100644
--- a/scrapegraphai/nodes/conditional_node.py
+++ b/scrapegraphai/nodes/conditional_node.py
@@ -61,7 +61,7 @@ def execute(self, state: dict) -> dict:
             str: The name of the next node to execute based on the presence of the key.
         """
-        if self.true_node_name is None or self.false_node_name is None:
+        if self.true_node_name is None:
             raise ValueError("ConditionalNode's next nodes are not set properly.")

         if self.condition:

From 612c644623fa6f4fe77a64a5f1a6a4d6cd5f4254 Mon Sep 17 00:00:00 2001
From: roryhaung
Date: Wed, 16 Oct 2024 18:37:50 +0800
Subject: [PATCH 05/39] feat: implement ScrapeGraph class for only web scraping automation

---
 scrapegraphai/graphs/scrape_graph.py | 98 ++++++++++++++++++++++
 1 file changed, 98 insertions(+)
 create mode 100644 scrapegraphai/graphs/scrape_graph.py

diff --git a/scrapegraphai/graphs/scrape_graph.py b/scrapegraphai/graphs/scrape_graph.py
new file mode 100644
index 00000000..a08149aa
--- /dev/null
+++ b/scrapegraphai/graphs/scrape_graph.py
@@ -0,0 +1,98 @@
+"""
+ScrapeGraph Module
+"""
+from typing import Optional
+from pydantic import BaseModel
+from .base_graph import BaseGraph
+from .abstract_graph import AbstractGraph
+from ..nodes import (
+    FetchNode,
+    ParseNode,
+)
+
+class ScrapeGraph(AbstractGraph):
+    """
+    ScrapeGraph is a scraping pipeline that automates the process of
+    extracting information from web pages.
+
+    Attributes:
+        prompt (str): The prompt for the graph.
+        source (str): The source of the graph.
+        config (dict): Configuration parameters for the graph.
+        schema (BaseModel): The schema for the graph output.
+        verbose (bool): A flag indicating whether to show print statements during execution.
+        headless (bool): A flag indicating whether to run the graph in headless mode.
+
+    Args:
+        prompt (str): The prompt for the graph.
+        source (str): The source of the graph.
+        config (dict): Configuration parameters for the graph.
+        schema (BaseModel): The schema for the graph output.
+
+    Example:
+        >>> scraper = ScrapeGraph(
+        ...     "https://en.wikipedia.org/wiki/Chioggia",
+        ...     {"llm": {"model": "openai/gpt-3.5-turbo"}}
+        ... )
+        >>> result = scraper.run()
+
+    """
+
+    def __init__(self, source: str, config: dict, prompt: str = "", schema: Optional[BaseModel] = None):
+        super().__init__(prompt, config, source, schema)
+
+        self.input_key = "url" if source.startswith("http") else "local_dir"
+
+    def _create_graph(self) -> BaseGraph:
+        """
+        Creates the graph of nodes representing the workflow for web scraping.
+
+        Returns:
+            BaseGraph: A graph instance representing the web scraping workflow.
+        """
+        fetch_node = FetchNode(
+            input="url | local_dir",
+            output=["doc"],
+            node_config={
+                "llm_model": self.llm_model,
+                "force": self.config.get("force", False),
+                "cut": self.config.get("cut", True),
+                "loader_kwargs": self.config.get("loader_kwargs", {}),
+                "browser_base": self.config.get("browser_base"),
+                "scrape_do": self.config.get("scrape_do")
+            }
+        )
+
+        parse_node = ParseNode(
+            input="doc",
+            output=["parsed_doc"],
+            node_config={
+                "llm_model": self.llm_model,
+                "chunk_size": self.model_token
+            }
+        )
+
+        return BaseGraph(
+            nodes=[
+                fetch_node,
+                parse_node,
+            ],
+            edges=[
+                (fetch_node, parse_node),
+            ],
+            entry_point=fetch_node,
+            graph_name=self.__class__.__name__
+        )
+
+    def run(self) -> str:
+        """
+        Executes the scraping process and returns the scraping content.
+
+        Returns:
+            str: The scraping content.
+ """ + + inputs = {"user_prompt": self.prompt, self.input_key: self.source} + self.final_state, self.execution_info = self.graph.execute(inputs) + + return self.final_state.get("parsed_doc", "No document found.") From 3e3e1b2f3ae8ed803d03b3b44b199e139baa68d4 Mon Sep 17 00:00:00 2001 From: roryhaung Date: Wed, 16 Oct 2024 19:38:53 +0800 Subject: [PATCH 06/39] feat: Implement SmartScraperMultiParseMergeFirstGraph class that scrapes a list of URLs and merge the content first and finally generates answers to a given prompt. (Different from the SmartScraperMultiGraph is that in this case the content is merged before to be processed by the llm.) --- scrapegraphai/graphs/__init__.py | 2 + ...t_scraper_multi_parse_merge_first_graph.py | 103 ++++++++++++++++++ 2 files changed, 105 insertions(+) create mode 100644 scrapegraphai/graphs/smart_scraper_multi_parse_merge_first_graph.py diff --git a/scrapegraphai/graphs/__init__.py b/scrapegraphai/graphs/__init__.py index 5b217bc9..0acec56b 100644 --- a/scrapegraphai/graphs/__init__.py +++ b/scrapegraphai/graphs/__init__.py @@ -25,3 +25,5 @@ from .smart_scraper_multi_concat_graph import SmartScraperMultiConcatGraph from .code_generator_graph import CodeGeneratorGraph from .depth_search_graph import DepthSearchGraph +from .smart_scraper_multi_parse_merge_first_graph import SmartScraperMultiParseMergeFirstGraph +from .scrape_graph import ScrapeGraph diff --git a/scrapegraphai/graphs/smart_scraper_multi_parse_merge_first_graph.py b/scrapegraphai/graphs/smart_scraper_multi_parse_merge_first_graph.py new file mode 100644 index 00000000..860e2ca2 --- /dev/null +++ b/scrapegraphai/graphs/smart_scraper_multi_parse_merge_first_graph.py @@ -0,0 +1,103 @@ +""" +SmartScraperMultiGraph Module +""" +from copy import deepcopy +from typing import List, Optional +from pydantic import BaseModel +from .base_graph import BaseGraph +from .abstract_graph import AbstractGraph +from .scrape_graph import ScrapeGraph +from ..nodes import ( + GraphIteratorNode, + MergeAnswersNode, +) +from ..utils.copy import safe_deepcopy + +class SmartScraperMultiParseMergeFirstGraph(AbstractGraph): + """ + SmartScraperMultiParseMergeFirstGraph is a scraping pipeline that scrapes a + list of URLs and merge the content first and finally generates answers to a given prompt. + It only requires a user prompt and a list of URLs. + The difference with the SmartScraperMultiGraph is that in this case the content is merged + before to be passed to the llm. + + Attributes: + prompt (str): The user prompt to search the internet. + llm_model (dict): The configuration for the language model. + embedder_model (dict): The configuration for the embedder model. + headless (bool): A flag to run the browser in headless mode. + verbose (bool): A flag to display the execution information. + model_token (int): The token limit for the language model. + + Args: + prompt (str): The user prompt to search the internet. + source (List[str]): The source of the graph. + config (dict): Configuration parameters for the graph. + schema (Optional[BaseModel]): The schema for the graph output. + + Example: + >>> search_graph = SmartScraperMultiParseMergeFirstGraph( + ... prompt="Who is Marco Perini?", + ... source= [ + ... "https://perinim.github.io/", + ... "https://perinim.github.io/cv/" + ... ], + ... config={"llm": {"model": "openai/gpt-3.5-turbo"}} + ... 
) + >>> result = search_graph.run() + """ + + def __init__(self, prompt: str, source: List[str], + config: dict, schema: Optional[BaseModel] = None): + + self.copy_config = safe_deepcopy(config) + self.copy_schema = deepcopy(schema) + super().__init__(prompt, config, source, schema) + + def _create_graph(self) -> BaseGraph: + """ + Creates the graph of nodes representing the workflow for web scraping + and parsing and then merge the content and generates answers to a given prompt. + """ + graph_iterator_node = GraphIteratorNode( + input="user_prompt & urls", + output=["parsed_doc"], + node_config={ + "graph_instance": ScrapeGraph, + "scraper_config": self.copy_config, + }, + schema=self.copy_schema + ) + + merge_answers_node = MergeAnswersNode( + input="user_prompt & parsed_doc", + output=["answer"], + node_config={ + "llm_model": self.llm_model, + "schema": self.copy_schema + } + ) + + return BaseGraph( + nodes=[ + graph_iterator_node, + merge_answers_node, + ], + edges=[ + (graph_iterator_node, merge_answers_node), + ], + entry_point=graph_iterator_node, + graph_name=self.__class__.__name__ + ) + + def run(self) -> str: + """ + Executes the web scraping and parsing process first and + then concatenate the content and generates answers to a given prompt. + + Returns: + str: The answer to the prompt. + """ + inputs = {"user_prompt": self.prompt, "urls": self.source} + self.final_state, self.execution_info = self.graph.execute(inputs) + return self.final_state.get("answer", "No answer found.") From cdb3c1100ee1117afedbc70437317acaf7c7c1d3 Mon Sep 17 00:00:00 2001 From: roryhaung Date: Wed, 16 Oct 2024 20:05:03 +0800 Subject: [PATCH 07/39] test: Add scrape_graph test --- tests/graphs/scrape_graph_test.py | 50 +++++++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) create mode 100644 tests/graphs/scrape_graph_test.py diff --git a/tests/graphs/scrape_graph_test.py b/tests/graphs/scrape_graph_test.py new file mode 100644 index 00000000..00d3f4fb --- /dev/null +++ b/tests/graphs/scrape_graph_test.py @@ -0,0 +1,50 @@ +""" +Module for testing the scrape graph class +""" + +import os +import pytest +import pandas as pd +from dotenv import load_dotenv +from scrapegraphai.graphs import ScrapeGraph +from scrapegraphai.utils import prettify_exec_info + +load_dotenv() + +@pytest.fixture +def graph_config(): + """Configuration of the graph""" + openai_key = os.getenv("OPENAI_APIKEY") + return { + "llm": { + "api_key": openai_key, + "model": "openai/gpt-3.5-turbo", + }, + "verbose": True, + "headless": False, + } + +def test_scraping_pipeline(graph_config): + """Start of the scraping pipeline""" + scrape_graph = ScrapeGraph( + source="https://perinim.github.io/projects/", + config=graph_config, + ) + + result = scrape_graph.run() + + assert result is not None + assert isinstance(result, list) + +def test_get_execution_info(graph_config): + """Get the execution info""" + scrape_graph = ScrapeGraph( + source="https://perinim.github.io/projects/", + config=graph_config, + ) + + scrape_graph.run() + + graph_exec_info = scrape_graph.get_execution_info() + + assert graph_exec_info is not None From 464b8b04ea0d51280849173d5eda92d4d4db8612 Mon Sep 17 00:00:00 2001 From: roryhaung Date: Wed, 16 Oct 2024 20:05:36 +0800 Subject: [PATCH 08/39] test: Add smart_scraper_multi_parse_merge_first_graph test --- ...aper_multi_parse_merge_first_graph_test.py | 59 +++++++++++++++++++ 1 file changed, 59 insertions(+) create mode 100644 tests/graphs/smart_scraper_multi_parse_merge_first_graph_test.py diff --git 
a/tests/graphs/smart_scraper_multi_parse_merge_first_graph_test.py b/tests/graphs/smart_scraper_multi_parse_merge_first_graph_test.py new file mode 100644 index 00000000..506ce5da --- /dev/null +++ b/tests/graphs/smart_scraper_multi_parse_merge_first_graph_test.py @@ -0,0 +1,59 @@ +""" +Module for testing the smart scraper class +""" + +import os +import pytest +import pandas as pd +from dotenv import load_dotenv +from scrapegraphai.graphs import SmartScraperMultiParseConcatFirstGraph +from scrapegraphai.utils import prettify_exec_info + +load_dotenv() + +@pytest.fixture +def graph_config(): + """Configuration of the graph""" + openai_key = os.getenv("OPENAI_APIKEY") + + return { + "llm": { + "api_key": openai_key, + "model": "openai/gpt-3.5-turbo", + }, + "verbose": True, + "headless": False, + } + +def test_scraping_pipeline(graph_config): + """Start of the scraping pipeline""" + smart_scraper_multi_parse_concat_first_graph = SmartScraperMultiParseConcatFirstGraph( + prompt="Who is Marco Perini?", + source= [ + "https://perinim.github.io/", + "https://perinim.github.io/cv/" + ], + config=graph_config, + ) + + result = smart_scraper_multi_parse_concat_first_graph.run() + + assert result is not None + assert isinstance(result, dict) + +def test_get_execution_info(graph_config): + """Get the execution info""" + smart_scraper_multi_parse_concat_first_graph = SmartScraperMultiParseConcatFirstGraph( + prompt="Who is Marco Perini?", + source= [ + "https://perinim.github.io/", + "https://perinim.github.io/cv/" + ], + config=graph_config, + ) + + smart_scraper_multi_parse_concat_first_graph.run() + + graph_exec_info = smart_scraper_multi_parse_concat_first_graph.get_execution_info() + + assert graph_exec_info is not None From eaa83edc04b803f2a14c7705549fae62c64275fb Mon Sep 17 00:00:00 2001 From: ekinsenler Date: Wed, 16 Oct 2024 15:21:23 +0300 Subject: [PATCH 09/39] update project requirement and add example --- examples/extras/cond_smartscraper_usage.py | 38 ++++++++++++++++++++++ pyproject.toml | 3 +- 2 files changed, 40 insertions(+), 1 deletion(-) create mode 100644 examples/extras/cond_smartscraper_usage.py diff --git a/examples/extras/cond_smartscraper_usage.py b/examples/extras/cond_smartscraper_usage.py new file mode 100644 index 00000000..54c40712 --- /dev/null +++ b/examples/extras/cond_smartscraper_usage.py @@ -0,0 +1,38 @@ +""" +Basic example of scraping pipeline using SmartScraperMultiConcatGraph with Groq +""" + +import os +import json +from dotenv import load_dotenv +from scrapegraphai.graphs import SmartScraperGraph + +load_dotenv() + +# ************************************************ +# Define the configuration for the graph +# ************************************************ + +graph_config = { + "llm": { + "api_key": os.getenv("GROQ_APIKEY"), + "model": "groq/gemma-7b-it", + }, + "verbose": True, + "headless": True, + "reattempt": True #Setting this to True will allow the graph to reattempt the scraping process +} + +# ******************************************************* +# Create the SmartScraperMultiCondGraph instance and run it +# ******************************************************* + +multiple_search_graph = SmartScraperGraph( + prompt="Who is Marco Perini?", + source="https://perinim.github.io/", + schema=None, + config=graph_config +) + +result = multiple_search_graph.run() +print(json.dumps(result, indent=4)) diff --git a/pyproject.toml b/pyproject.toml index 6bade627..7a374c97 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -38,7 +38,8 @@ dependencies = [ 
"async-timeout>=4.0.3", "transformers>=4.44.2", "googlesearch-python>=1.2.5", - "simpleeval>=1.0.0" + "simpleeval>=1.0.0", + "async_timeout>=4.0.3" ] license = "MIT" From 9266a36b2efdf7027470d59aa14b654d68f7cb51 Mon Sep 17 00:00:00 2001 From: semantic-release-bot Date: Wed, 16 Oct 2024 15:54:35 +0000 Subject: [PATCH 10/39] ci(release): 1.27.0-beta.1 [skip ci] ## [1.27.0-beta.1](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.26.6-beta.1...v1.27.0-beta.1) (2024-10-16) ### Features * add conditional node structure to the smart_scraper_graph and implemented a structured way to check condition ([cacd9cd](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/cacd9cde004dace1a7dcc27981245632a78b95f3)) --- CHANGELOG.md | 7 +++++++ pyproject.toml | 2 +- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 3b45f437..54d4f9d9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,10 @@ +## [1.27.0-beta.1](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.26.6-beta.1...v1.27.0-beta.1) (2024-10-16) + + +### Features + +* add conditional node structure to the smart_scraper_graph and implemented a structured way to check condition ([cacd9cd](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/cacd9cde004dace1a7dcc27981245632a78b95f3)) + ## [1.26.6-beta.1](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.26.5...v1.26.6-beta.1) (2024-10-14) diff --git a/pyproject.toml b/pyproject.toml index 17488a43..be40f076 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,7 +1,7 @@ [project] name = "scrapegraphai" -version = "1.26.6b1" +version = "1.27.0b1" description = "A web scraping library based on LangChain which uses LLM and direct graph logic to create scraping pipelines." authors = [ From 2512262be81b686f559711584e69c725dd53a187 Mon Sep 17 00:00:00 2001 From: shenghong Date: Thu, 17 Oct 2024 06:46:34 +0800 Subject: [PATCH 11/39] Rename smart_scraper_multi_parse_merge_first_graph_test.py to smart_scraper_multi_parse_merge_first_graph_openai_test.py --- ...=> smart_scraper_multi_parse_merge_first_graph_openai_test.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename tests/graphs/{smart_scraper_multi_parse_merge_first_graph_test.py => smart_scraper_multi_parse_merge_first_graph_openai_test.py} (100%) diff --git a/tests/graphs/smart_scraper_multi_parse_merge_first_graph_test.py b/tests/graphs/smart_scraper_multi_parse_merge_first_graph_openai_test.py similarity index 100% rename from tests/graphs/smart_scraper_multi_parse_merge_first_graph_test.py rename to tests/graphs/smart_scraper_multi_parse_merge_first_graph_openai_test.py From 69ff6495564a5c670b89c0f802ebb1602f0e7cfa Mon Sep 17 00:00:00 2001 From: roryhaung Date: Fri, 18 Oct 2024 01:36:29 +0800 Subject: [PATCH 12/39] fix: fix the example variable name --- scrapegraphai/graphs/smart_scraper_multi_concat_graph.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scrapegraphai/graphs/smart_scraper_multi_concat_graph.py b/scrapegraphai/graphs/smart_scraper_multi_concat_graph.py index 312d6457..a13d8aa1 100644 --- a/scrapegraphai/graphs/smart_scraper_multi_concat_graph.py +++ b/scrapegraphai/graphs/smart_scraper_multi_concat_graph.py @@ -35,11 +35,11 @@ class SmartScraperMultiConcatGraph(AbstractGraph): schema (Optional[BaseModel]): The schema for the graph output. Example: - >>> search_graph = MultipleSearchGraph( + >>> smart_scraper_multi_concat_graph = SmartScraperMultiConcatGraph( ... "What is Chioggia famous for?", ... 
{"llm": {"model": "openai/gpt-3.5-turbo"}} ... ) - >>> result = search_graph.run() + >>> result = smart_scraper_multi_concat_graph.run() """ def __init__(self, prompt: str, source: List[str], From 94d8042c2a510b29138127e1abd4ddd9e0b49ed0 Mon Sep 17 00:00:00 2001 From: roryhaung Date: Fri, 18 Oct 2024 01:39:42 +0800 Subject: [PATCH 13/39] rename smart_scraper_multi_graph to smart_scraper_multi_abstract_graph --- .../smart_scraper_multi_abstract_graph.py | 104 ++++++++++++++++++ 1 file changed, 104 insertions(+) create mode 100644 scrapegraphai/graphs/smart_scraper_multi_abstract_graph.py diff --git a/scrapegraphai/graphs/smart_scraper_multi_abstract_graph.py b/scrapegraphai/graphs/smart_scraper_multi_abstract_graph.py new file mode 100644 index 00000000..f5ffdf96 --- /dev/null +++ b/scrapegraphai/graphs/smart_scraper_multi_abstract_graph.py @@ -0,0 +1,104 @@ +""" +SmartScraperMultiGraph Module +""" +from copy import deepcopy +from typing import List, Optional +from pydantic import BaseModel +from .base_graph import BaseGraph +from .abstract_graph import AbstractGraph +from .smart_scraper_graph import SmartScraperGraph +from ..nodes import ( + GraphIteratorNode, + MergeAnswersNode +) +from ..utils.copy import safe_deepcopy + +class SmartScraperMultiAbstractGraph(AbstractGraph): + """ + SmartScraperMultiAbstractGraph is a scraping pipeline that scrapes a + list of URLs and generates answers to a given prompt. + It only requires a user prompt and a list of URLs. + The difference with the SmartScraperMultiGraph is that in this case the content will be abstracted + by llm and then merged finally passed to the llm. + + Attributes: + prompt (str): The user prompt to search the internet. + llm_model (dict): The configuration for the language model. + embedder_model (dict): The configuration for the embedder model. + headless (bool): A flag to run the browser in headless mode. + verbose (bool): A flag to display the execution information. + model_token (int): The token limit for the language model. + + Args: + prompt (str): The user prompt to search the internet. + source (List[str]): The source of the graph. + config (dict): Configuration parameters for the graph. + schema (Optional[BaseModel]): The schema for the graph output. + + Example: + >>> smart_scraper_multi_abstract_graph = SmartScraperMultiAbstractGraph( + ... "What is Chioggia famous for?", + ... {"llm": {"model": "openai/gpt-3.5-turbo"}} + ... ) + >>> result = smart_scraper_multi_abstract_graph.run() + """ + + def __init__(self, prompt: str, source: List[str], + config: dict, schema: Optional[BaseModel] = None): + + self.max_results = config.get("max_results", 3) + self.copy_config = safe_deepcopy(config) + self.copy_schema = deepcopy(schema) + + super().__init__(prompt, config, source, schema) + + def _create_graph(self) -> BaseGraph: + """ + Creates the graph of nodes representing the workflow for web scraping and searching. + + Returns: + BaseGraph: A graph instance representing the web scraping and searching workflow. 
+ """ + + graph_iterator_node = GraphIteratorNode( + input="user_prompt & urls", + output=["results"], + node_config={ + "graph_instance": SmartScraperGraph, + "scraper_config": self.copy_config, + }, + schema=self.copy_schema + ) + + merge_answers_node = MergeAnswersNode( + input="user_prompt & results", + output=["answer"], + node_config={ + "llm_model": self.llm_model, + "schema": self.copy_schema + } + ) + + return BaseGraph( + nodes=[ + graph_iterator_node, + merge_answers_node, + ], + edges=[ + (graph_iterator_node, merge_answers_node), + ], + entry_point=graph_iterator_node, + graph_name=self.__class__.__name__ + ) + + def run(self) -> str: + """ + Executes the web scraping and searching process. + + Returns: + str: The answer to the prompt. + """ + inputs = {"user_prompt": self.prompt, "urls": self.source} + self.final_state, self.execution_info = self.graph.execute(inputs) + + return self.final_state.get("answer", "No answer found.") From dfc67c670d871fac5116223461a56c9560959eb9 Mon Sep 17 00:00:00 2001 From: roryhaung Date: Fri, 18 Oct 2024 01:49:54 +0800 Subject: [PATCH 14/39] rename the smart_scraper_multi_parse_merge_first_graph to smart_scraper_multi_graph,so delete this file --- ...t_scraper_multi_parse_merge_first_graph.py | 103 ------------------ 1 file changed, 103 deletions(-) delete mode 100644 scrapegraphai/graphs/smart_scraper_multi_parse_merge_first_graph.py diff --git a/scrapegraphai/graphs/smart_scraper_multi_parse_merge_first_graph.py b/scrapegraphai/graphs/smart_scraper_multi_parse_merge_first_graph.py deleted file mode 100644 index 860e2ca2..00000000 --- a/scrapegraphai/graphs/smart_scraper_multi_parse_merge_first_graph.py +++ /dev/null @@ -1,103 +0,0 @@ -""" -SmartScraperMultiGraph Module -""" -from copy import deepcopy -from typing import List, Optional -from pydantic import BaseModel -from .base_graph import BaseGraph -from .abstract_graph import AbstractGraph -from .scrape_graph import ScrapeGraph -from ..nodes import ( - GraphIteratorNode, - MergeAnswersNode, -) -from ..utils.copy import safe_deepcopy - -class SmartScraperMultiParseMergeFirstGraph(AbstractGraph): - """ - SmartScraperMultiParseMergeFirstGraph is a scraping pipeline that scrapes a - list of URLs and merge the content first and finally generates answers to a given prompt. - It only requires a user prompt and a list of URLs. - The difference with the SmartScraperMultiGraph is that in this case the content is merged - before to be passed to the llm. - - Attributes: - prompt (str): The user prompt to search the internet. - llm_model (dict): The configuration for the language model. - embedder_model (dict): The configuration for the embedder model. - headless (bool): A flag to run the browser in headless mode. - verbose (bool): A flag to display the execution information. - model_token (int): The token limit for the language model. - - Args: - prompt (str): The user prompt to search the internet. - source (List[str]): The source of the graph. - config (dict): Configuration parameters for the graph. - schema (Optional[BaseModel]): The schema for the graph output. - - Example: - >>> search_graph = SmartScraperMultiParseMergeFirstGraph( - ... prompt="Who is Marco Perini?", - ... source= [ - ... "https://perinim.github.io/", - ... "https://perinim.github.io/cv/" - ... ], - ... config={"llm": {"model": "openai/gpt-3.5-turbo"}} - ... 
) - >>> result = search_graph.run() - """ - - def __init__(self, prompt: str, source: List[str], - config: dict, schema: Optional[BaseModel] = None): - - self.copy_config = safe_deepcopy(config) - self.copy_schema = deepcopy(schema) - super().__init__(prompt, config, source, schema) - - def _create_graph(self) -> BaseGraph: - """ - Creates the graph of nodes representing the workflow for web scraping - and parsing and then merge the content and generates answers to a given prompt. - """ - graph_iterator_node = GraphIteratorNode( - input="user_prompt & urls", - output=["parsed_doc"], - node_config={ - "graph_instance": ScrapeGraph, - "scraper_config": self.copy_config, - }, - schema=self.copy_schema - ) - - merge_answers_node = MergeAnswersNode( - input="user_prompt & parsed_doc", - output=["answer"], - node_config={ - "llm_model": self.llm_model, - "schema": self.copy_schema - } - ) - - return BaseGraph( - nodes=[ - graph_iterator_node, - merge_answers_node, - ], - edges=[ - (graph_iterator_node, merge_answers_node), - ], - entry_point=graph_iterator_node, - graph_name=self.__class__.__name__ - ) - - def run(self) -> str: - """ - Executes the web scraping and parsing process first and - then concatenate the content and generates answers to a given prompt. - - Returns: - str: The answer to the prompt. - """ - inputs = {"user_prompt": self.prompt, "urls": self.source} - self.final_state, self.execution_info = self.graph.execute(inputs) - return self.final_state.get("answer", "No answer found.") From 78bd40c3b54cd656e0fe2e789e978b59dcb96d5b Mon Sep 17 00:00:00 2001 From: roryhaung Date: Fri, 18 Oct 2024 01:51:26 +0800 Subject: [PATCH 15/39] modify the graph name --- scrapegraphai/graphs/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scrapegraphai/graphs/__init__.py b/scrapegraphai/graphs/__init__.py index 0acec56b..bfb8e300 100644 --- a/scrapegraphai/graphs/__init__.py +++ b/scrapegraphai/graphs/__init__.py @@ -13,7 +13,7 @@ from .csv_scraper_graph import CSVScraperGraph from .omni_scraper_graph import OmniScraperGraph from .omni_search_graph import OmniSearchGraph -from .smart_scraper_multi_graph import SmartScraperMultiGraph +from .smart_scraper_multi_abstract_graph import SmartScraperMultiAbstractGraph from .json_scraper_multi_graph import JSONScraperMultiGraph from .csv_scraper_multi_graph import CSVScraperMultiGraph from .xml_scraper_multi_graph import XMLScraperMultiGraph @@ -25,5 +25,5 @@ from .smart_scraper_multi_concat_graph import SmartScraperMultiConcatGraph from .code_generator_graph import CodeGeneratorGraph from .depth_search_graph import DepthSearchGraph -from .smart_scraper_multi_parse_merge_first_graph import SmartScraperMultiParseMergeFirstGraph +from .smart_scraper_multi_graph import SmartScraperMultiGraph from .scrape_graph import ScrapeGraph From 6dbac936683042ef2e517a71b6fb1655508a1568 Mon Sep 17 00:00:00 2001 From: roryhaung Date: Fri, 18 Oct 2024 01:52:39 +0800 Subject: [PATCH 16/39] rename the SmartScraperMultiParseMergeFirstGraph to SmartScraperMultiGraph --- .../graphs/smart_scraper_multi_graph.py | 39 ++++++++++--------- 1 file changed, 20 insertions(+), 19 deletions(-) diff --git a/scrapegraphai/graphs/smart_scraper_multi_graph.py b/scrapegraphai/graphs/smart_scraper_multi_graph.py index 5dff3277..2f628e81 100644 --- a/scrapegraphai/graphs/smart_scraper_multi_graph.py +++ b/scrapegraphai/graphs/smart_scraper_multi_graph.py @@ -6,18 +6,20 @@ from pydantic import BaseModel from .base_graph import BaseGraph from .abstract_graph 
import AbstractGraph -from .smart_scraper_graph import SmartScraperGraph +from .scrape_graph import ScrapeGraph from ..nodes import ( GraphIteratorNode, - MergeAnswersNode + MergeAnswersNode, ) from ..utils.copy import safe_deepcopy class SmartScraperMultiGraph(AbstractGraph): """ SmartScraperMultiGraph is a scraping pipeline that scrapes a - list of URLs and generates answers to a given prompt. + list of URLs and merge the content first and finally generates answers to a given prompt. It only requires a user prompt and a list of URLs. + The difference with the SmartScraperMultiGraph is that in this case the content is merged + before to be passed to the llm. Attributes: prompt (str): The user prompt to search the internet. @@ -34,42 +36,41 @@ class SmartScraperMultiGraph(AbstractGraph): schema (Optional[BaseModel]): The schema for the graph output. Example: - >>> search_graph = MultipleSearchGraph( - ... "What is Chioggia famous for?", - ... {"llm": {"model": "openai/gpt-3.5-turbo"}} + >>> smart_scraper_multi_graph = SmartScraperMultiGraph( + ... prompt="Who is Marco Perini?", + ... source= [ + ... "https://perinim.github.io/", + ... "https://perinim.github.io/cv/" + ... ], + ... config={"llm": {"model": "openai/gpt-3.5-turbo"}} ... ) - >>> result = search_graph.run() + >>> result = smart_scraper_multi_graph.run() """ def __init__(self, prompt: str, source: List[str], config: dict, schema: Optional[BaseModel] = None): - self.max_results = config.get("max_results", 3) self.copy_config = safe_deepcopy(config) self.copy_schema = deepcopy(schema) - super().__init__(prompt, config, source, schema) def _create_graph(self) -> BaseGraph: """ - Creates the graph of nodes representing the workflow for web scraping and searching. - - Returns: - BaseGraph: A graph instance representing the web scraping and searching workflow. + Creates the graph of nodes representing the workflow for web scraping + and parsing and then merge the content and generates answers to a given prompt. """ - graph_iterator_node = GraphIteratorNode( input="user_prompt & urls", - output=["results"], + output=["parsed_doc"], node_config={ - "graph_instance": SmartScraperGraph, + "graph_instance": ScrapeGraph, "scraper_config": self.copy_config, }, schema=self.copy_schema ) merge_answers_node = MergeAnswersNode( - input="user_prompt & results", + input="user_prompt & parsed_doc", output=["answer"], node_config={ "llm_model": self.llm_model, @@ -91,12 +92,12 @@ def _create_graph(self) -> BaseGraph: def run(self) -> str: """ - Executes the web scraping and searching process. + Executes the web scraping and parsing process first and + then concatenate the content and generates answers to a given prompt. Returns: str: The answer to the prompt. 
""" inputs = {"user_prompt": self.prompt, "urls": self.source} self.final_state, self.execution_info = self.graph.execute(inputs) - return self.final_state.get("answer", "No answer found.") From 974f88a77e853884d8a83c0d44a79c013727cc55 Mon Sep 17 00:00:00 2001 From: roryhaung Date: Fri, 18 Oct 2024 03:01:59 +0800 Subject: [PATCH 17/39] rename SmartScraperMultiGraph to SmartScraperMultiLiteGraph --- ...r_multi_graph.py => smart_scraper_multi_lite_graph.py} | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) rename scrapegraphai/graphs/{smart_scraper_multi_graph.py => smart_scraper_multi_lite_graph.py} (93%) diff --git a/scrapegraphai/graphs/smart_scraper_multi_graph.py b/scrapegraphai/graphs/smart_scraper_multi_lite_graph.py similarity index 93% rename from scrapegraphai/graphs/smart_scraper_multi_graph.py rename to scrapegraphai/graphs/smart_scraper_multi_lite_graph.py index 2f628e81..14e576d9 100644 --- a/scrapegraphai/graphs/smart_scraper_multi_graph.py +++ b/scrapegraphai/graphs/smart_scraper_multi_lite_graph.py @@ -13,9 +13,9 @@ ) from ..utils.copy import safe_deepcopy -class SmartScraperMultiGraph(AbstractGraph): +class SmartScraperMultiLiteGraph(AbstractGraph): """ - SmartScraperMultiGraph is a scraping pipeline that scrapes a + SmartScraperMultiLiteGraph is a scraping pipeline that scrapes a list of URLs and merge the content first and finally generates answers to a given prompt. It only requires a user prompt and a list of URLs. The difference with the SmartScraperMultiGraph is that in this case the content is merged @@ -36,7 +36,7 @@ class SmartScraperMultiGraph(AbstractGraph): schema (Optional[BaseModel]): The schema for the graph output. Example: - >>> smart_scraper_multi_graph = SmartScraperMultiGraph( + >>> smart_scraper_multi_lite_graph = SmartScraperMultiLiteGraph( ... prompt="Who is Marco Perini?", ... source= [ ... "https://perinim.github.io/", @@ -44,7 +44,7 @@ class SmartScraperMultiGraph(AbstractGraph): ... ], ... config={"llm": {"model": "openai/gpt-3.5-turbo"}} ... ) - >>> result = smart_scraper_multi_graph.run() + >>> result = smart_scraper_multi_lite_graph.run() """ def __init__(self, prompt: str, source: List[str], From 3e8f047ab606db4549c5d3b28b681f47b8c08725 Mon Sep 17 00:00:00 2001 From: roryhaung Date: Fri, 18 Oct 2024 03:10:57 +0800 Subject: [PATCH 18/39] Renamed smart_scraper_multi_abstract_graph back to smart_scraper_multi_graph. 
--- scrapegraphai/graphs/__init__.py | 4 ++-- ...t_graph.py => smart_scraper_multi_graph.py} | 18 +++++++++++------- 2 files changed, 13 insertions(+), 9 deletions(-) rename scrapegraphai/graphs/{smart_scraper_multi_abstract_graph.py => smart_scraper_multi_graph.py} (84%) diff --git a/scrapegraphai/graphs/__init__.py b/scrapegraphai/graphs/__init__.py index bfb8e300..9c8bc820 100644 --- a/scrapegraphai/graphs/__init__.py +++ b/scrapegraphai/graphs/__init__.py @@ -13,7 +13,7 @@ from .csv_scraper_graph import CSVScraperGraph from .omni_scraper_graph import OmniScraperGraph from .omni_search_graph import OmniSearchGraph -from .smart_scraper_multi_abstract_graph import SmartScraperMultiAbstractGraph +from .smart_scraper_multi_graph import SmartScraperMultiGraph from .json_scraper_multi_graph import JSONScraperMultiGraph from .csv_scraper_multi_graph import CSVScraperMultiGraph from .xml_scraper_multi_graph import XMLScraperMultiGraph @@ -25,5 +25,5 @@ from .smart_scraper_multi_concat_graph import SmartScraperMultiConcatGraph from .code_generator_graph import CodeGeneratorGraph from .depth_search_graph import DepthSearchGraph -from .smart_scraper_multi_graph import SmartScraperMultiGraph +from .smart_scraper_multi_lite_graph import SmartScraperMultiLiteGraph from .scrape_graph import ScrapeGraph diff --git a/scrapegraphai/graphs/smart_scraper_multi_abstract_graph.py b/scrapegraphai/graphs/smart_scraper_multi_graph.py similarity index 84% rename from scrapegraphai/graphs/smart_scraper_multi_abstract_graph.py rename to scrapegraphai/graphs/smart_scraper_multi_graph.py index f5ffdf96..420dc784 100644 --- a/scrapegraphai/graphs/smart_scraper_multi_abstract_graph.py +++ b/scrapegraphai/graphs/smart_scraper_multi_graph.py @@ -13,12 +13,12 @@ ) from ..utils.copy import safe_deepcopy -class SmartScraperMultiAbstractGraph(AbstractGraph): +class SmartScraperMultiGraph(AbstractGraph): """ - SmartScraperMultiAbstractGraph is a scraping pipeline that scrapes a + SmartScraperMultiGraph is a scraping pipeline that scrapes a list of URLs and generates answers to a given prompt. It only requires a user prompt and a list of URLs. - The difference with the SmartScraperMultiGraph is that in this case the content will be abstracted + The difference with the SmartScraperMultiLiteGraph is that in this case the content will be abstracted by llm and then merged finally passed to the llm. Attributes: @@ -36,11 +36,15 @@ class SmartScraperMultiAbstractGraph(AbstractGraph): schema (Optional[BaseModel]): The schema for the graph output. Example: - >>> smart_scraper_multi_abstract_graph = SmartScraperMultiAbstractGraph( - ... "What is Chioggia famous for?", - ... {"llm": {"model": "openai/gpt-3.5-turbo"}} + >>> smart_scraper_multi_graph = SmartScraperMultiGraph( + ... prompt="Who is Marco Perini?", + ... source= [ + ... "https://perinim.github.io/", + ... "https://perinim.github.io/cv/" + ... ], + ... config={"llm": {"model": "openai/gpt-3.5-turbo"}} ... 
) - >>> result = smart_scraper_multi_abstract_graph.run() + >>> result = smart_scraper_multi_graph.run() """ def __init__(self, prompt: str, source: List[str], From 28dda2b476e1b2da9e39cc212133fcaca7cc5b11 Mon Sep 17 00:00:00 2001 From: roryhaung Date: Fri, 18 Oct 2024 03:14:08 +0800 Subject: [PATCH 19/39] rename graph name --- ...=> smart_scraper_multi_lite_graph_openai_test.py} | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) rename tests/graphs/{smart_scraper_multi_parse_merge_first_graph_openai_test.py => smart_scraper_multi_lite_graph_openai_test.py} (70%) diff --git a/tests/graphs/smart_scraper_multi_parse_merge_first_graph_openai_test.py b/tests/graphs/smart_scraper_multi_lite_graph_openai_test.py similarity index 70% rename from tests/graphs/smart_scraper_multi_parse_merge_first_graph_openai_test.py rename to tests/graphs/smart_scraper_multi_lite_graph_openai_test.py index 506ce5da..0a0e0a69 100644 --- a/tests/graphs/smart_scraper_multi_parse_merge_first_graph_openai_test.py +++ b/tests/graphs/smart_scraper_multi_lite_graph_openai_test.py @@ -6,7 +6,7 @@ import pytest import pandas as pd from dotenv import load_dotenv -from scrapegraphai.graphs import SmartScraperMultiParseConcatFirstGraph +from scrapegraphai.graphs import SmartScraperMultiLiteGraph from scrapegraphai.utils import prettify_exec_info load_dotenv() @@ -27,7 +27,7 @@ def graph_config(): def test_scraping_pipeline(graph_config): """Start of the scraping pipeline""" - smart_scraper_multi_parse_concat_first_graph = SmartScraperMultiParseConcatFirstGraph( + smart_scraper_multi_lite_graph = SmartScraperMultiLiteGraph( prompt="Who is Marco Perini?", source= [ "https://perinim.github.io/", @@ -36,14 +36,14 @@ def test_scraping_pipeline(graph_config): config=graph_config, ) - result = smart_scraper_multi_parse_concat_first_graph.run() + result = smart_scraper_multi_lite_graph.run() assert result is not None assert isinstance(result, dict) def test_get_execution_info(graph_config): """Get the execution info""" - smart_scraper_multi_parse_concat_first_graph = SmartScraperMultiParseConcatFirstGraph( + smart_scraper_multi_lite_graph = SmartScraperMultiLiteGraph( prompt="Who is Marco Perini?", source= [ "https://perinim.github.io/", @@ -52,8 +52,8 @@ def test_get_execution_info(graph_config): config=graph_config, ) - smart_scraper_multi_parse_concat_first_graph.run() + smart_scraper_multi_lite_graph.run() - graph_exec_info = smart_scraper_multi_parse_concat_first_graph.get_execution_info() + graph_exec_info = smart_scraper_multi_lite_graph.get_execution_info() assert graph_exec_info is not None From da2a3c8ec7d9c3c7e805fd6193035bd1bc284375 Mon Sep 17 00:00:00 2001 From: roryhaung Date: Fri, 18 Oct 2024 03:19:00 +0800 Subject: [PATCH 20/39] add smart_scraper_multi_lite_graph example --- .../openai/smart_scraper_multi_lite_openai.py | 47 +++++++++++++++++++ 1 file changed, 47 insertions(+) create mode 100644 examples/openai/smart_scraper_multi_lite_openai.py diff --git a/examples/openai/smart_scraper_multi_lite_openai.py b/examples/openai/smart_scraper_multi_lite_openai.py new file mode 100644 index 00000000..69eeafc7 --- /dev/null +++ b/examples/openai/smart_scraper_multi_lite_openai.py @@ -0,0 +1,47 @@ +""" +Basic example of scraping pipeline using SmartScraper +""" +import os +import json +from dotenv import load_dotenv +from scrapegraphai.graphs import SmartScraperMultiLiteGraph +from scrapegraphai.utils import prettify_exec_info + +load_dotenv() + +# ************************************************ +# Define 
the configuration for the graph
# ************************************************

graph_config = {
    "llm": {
        "api_key": os.getenv("OPENAI_API_KEY"),
        "model": "openai/gpt-4o",
    },
    "verbose": True,
    "headless": False,
}

# ************************************************
# Create the SmartScraperMultiLiteGraph instance and run it
# ************************************************

smart_scraper_multi_lite_graph = SmartScraperMultiLiteGraph(
    prompt="Who is Marco Perini?",
    source=[
        "https://perinim.github.io/",
        "https://perinim.github.io/cv/"
    ],
    config=graph_config
)

result = smart_scraper_multi_lite_graph.run()
print(json.dumps(result, indent=4))

# ************************************************
# Get graph execution info
# ************************************************

graph_exec_info = smart_scraper_multi_lite_graph.get_execution_info()
print(prettify_exec_info(graph_exec_info))

From d84d29538985ef8d04badfed547c6fdc73d7774d Mon Sep 17 00:00:00 2001
From: semantic-release-bot
Date: Fri, 18 Oct 2024 20:18:25 +0000
Subject: [PATCH 21/39] ci(release): 1.27.0-beta.2 [skip ci]

## [1.27.0-beta.2](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.27.0-beta.1...v1.27.0-beta.2) (2024-10-18)

### Bug Fixes

* refactoring of gpt2 tokenizer ([44c3f9c](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/44c3f9c98939c44caa86dc582242819a7c6a0f80))

### CI

* **release:** 1.26.6 [skip ci] ([a4634c7](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/a4634c73312b5c08581a8d670d53b7eebe8dadc1))
---
 CHANGELOG.md   | 12 ++++++++++++
 pyproject.toml |  2 +-
 2 files changed, 13 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 8f320431..9631d303 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,15 @@
+## [1.27.0-beta.2](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.27.0-beta.1...v1.27.0-beta.2) (2024-10-18)
+
+
+### Bug Fixes
+
+* refactoring of gpt2 tokenizer ([44c3f9c](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/44c3f9c98939c44caa86dc582242819a7c6a0f80))
+
+
+### CI
+
+* **release:** 1.26.6 [skip ci] ([a4634c7](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/a4634c73312b5c08581a8d670d53b7eebe8dadc1))
+
 ## [1.27.0-beta.1](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.26.6-beta.1...v1.27.0-beta.1) (2024-10-16)

diff --git a/pyproject.toml b/pyproject.toml
index 46e533e0..a7b15a89 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,7 +1,7 @@
 [project]

 name = "scrapegraphai"

-version = "1.27.0b1"
+version = "1.27.0b2"

 description = "A web scraping library based on LangChain which uses LLM and direct graph logic to create scraping pipelines."
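Note the two spellings of the version in this release commit: the changelog heading uses the semantic-release tag 1.27.0-beta.2, while pyproject.toml needs the PEP 440 form 1.27.0b2. Both denote the same version; a quick check with the packaging library (assuming it is installed) shows the normalization:

    from packaging.version import Version

    assert Version("1.27.0-beta.2") == Version("1.27.0b2")
    print(Version("1.27.0-beta.2"))  # prints 1.27.0b2: PEP 440 normalizes the -beta.N spelling
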
From 9cd9a874f91bbbb2990444818e8ab2d0855cc361 Mon Sep 17 00:00:00 2001 From: Federico Aguzzi <62149513+f-aguzzi@users.noreply.github.com> Date: Fri, 18 Oct 2024 22:34:42 +0200 Subject: [PATCH 22/39] chore: fix example Committing even though this is not the bug we were looking for --- examples/together/depth_search_graph_together.py | 7 +++---- requirements-dev.lock | 12 +----------- requirements.lock | 6 +----- 3 files changed, 5 insertions(+), 20 deletions(-) diff --git a/examples/together/depth_search_graph_together.py b/examples/together/depth_search_graph_together.py index 7a2e7f3e..fb7b4d9e 100644 --- a/examples/together/depth_search_graph_together.py +++ b/examples/together/depth_search_graph_together.py @@ -7,13 +7,12 @@ load_dotenv() -openai_key = os.getenv("OPENAI_APIKEY") +together_key = os.getenv("TOGETHER_APIKEY") graph_config = { "llm": { - "api_key": "***************************", - "model": "oneapi/qwen-turbo", - "base_url": "http://127.0.0.1:3000/v1", # 设置 OneAPI URL + "model": "togetherai/meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo", + "api_key": together_key, }, "verbose": True, "headless": False, diff --git a/requirements-dev.lock b/requirements-dev.lock index bca5e9c2..61bd3e2b 100644 --- a/requirements-dev.lock +++ b/requirements-dev.lock @@ -30,8 +30,6 @@ anyio==4.4.0 astroid==3.2.4 # via pylint async-timeout==4.0.3 - # via aiohttp - # via langchain # via scrapegraphai attrs==24.2.0 # via aiohttp @@ -80,9 +78,6 @@ distro==1.9.0 # via openai docutils==0.19 # via sphinx -exceptiongroup==1.2.2 - # via anyio - # via pytest fastapi==0.112.0 # via burr fastapi-pagination==0.12.26 @@ -136,6 +131,7 @@ graphviz==0.20.3 # via burr greenlet==3.0.3 # via playwright + # via sqlalchemy grpcio==1.65.4 # via google-api-core # via grpcio-status @@ -504,9 +500,6 @@ tokenizers==0.19.1 # via transformers toml==0.10.2 # via streamlit -tomli==2.0.1 - # via pylint - # via pytest tomlkit==0.13.0 # via pylint tornado==6.4.1 @@ -524,8 +517,6 @@ transformers==4.44.2 # via scrapegraphai typing-extensions==4.12.2 # via altair - # via anyio - # via astroid # via fastapi # via fastapi-pagination # via google-generativeai @@ -540,7 +531,6 @@ typing-extensions==4.12.2 # via sqlalchemy # via streamlit # via typing-inspect - # via uvicorn typing-inspect==0.9.0 # via dataclasses-json # via sf-hamilton diff --git a/requirements.lock b/requirements.lock index 38be6e68..c2c40996 100644 --- a/requirements.lock +++ b/requirements.lock @@ -19,8 +19,6 @@ anyio==4.4.0 # via httpx # via openai async-timeout==4.0.3 - # via aiohttp - # via langchain # via scrapegraphai attrs==23.2.0 # via aiohttp @@ -50,8 +48,6 @@ dill==0.3.8 # via multiprocess distro==1.9.0 # via openai -exceptiongroup==1.2.2 - # via anyio fastembed==0.3.6 # via scrapegraphai filelock==3.15.4 @@ -91,6 +87,7 @@ googlesearch-python==1.2.5 # via scrapegraphai greenlet==3.0.3 # via playwright + # via sqlalchemy grpcio==1.65.1 # via google-api-core # via grpcio-status @@ -371,7 +368,6 @@ tqdm==4.66.4 transformers==4.44.2 # via scrapegraphai typing-extensions==4.12.2 - # via anyio # via google-generativeai # via huggingface-hub # via langchain-core From f576afaf0c1dd6d1dbf79fd5e642f6dca9dbe862 Mon Sep 17 00:00:00 2001 From: semantic-release-bot Date: Sun, 20 Oct 2024 08:15:19 +0000 Subject: [PATCH 23/39] ci(release): 1.27.0-beta.3 [skip ci] ## [1.27.0-beta.3](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.27.0-beta.2...v1.27.0-beta.3) (2024-10-20) ### Features * implement ScrapeGraph class for only web scraping automation 
([612c644](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/612c644623fa6f4fe77a64a5f1a6a4d6cd5f4254))
* Implement SmartScraperMultiParseMergeFirstGraph class that scrapes a list of URLs, merges the content first, and finally generates answers to a given prompt. ([3e3e1b2](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/3e3e1b2f3ae8ed803d03b3b44b199e139baa68d4))


### Bug Fixes

* fix the example variable name ([69ff649](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/69ff6495564a5c670b89c0f802ebb1602f0e7cfa))


### chore

* fix example ([9cd9a87](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/9cd9a874f91bbbb2990444818e8ab2d0855cc361))


### Test

* Add scrape_graph test ([cdb3c11](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/cdb3c1100ee1117afedbc70437317acaf7c7c1d3))
* Add smart_scraper_multi_parse_merge_first_graph test ([464b8b0](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/464b8b04ea0d51280849173d5eda92d4d4db8612))
---
 CHANGELOG.md   | 24 ++++++++++++++++++++++++
 pyproject.toml |  2 +-
 2 files changed, 25 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 9631d303..6c029ea3 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,27 @@
+## [1.27.0-beta.3](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.27.0-beta.2...v1.27.0-beta.3) (2024-10-20)
+
+
+### Features
+
+* implement ScrapeGraph class for only web scraping automation ([612c644](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/612c644623fa6f4fe77a64a5f1a6a4d6cd5f4254))
+* Implement SmartScraperMultiParseMergeFirstGraph class that scrapes a list of URLs, merges the content first, and finally generates answers to a given prompt. ([3e3e1b2](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/3e3e1b2f3ae8ed803d03b3b44b199e139baa68d4))
+
+
+### Bug Fixes
+
+* fix the example variable name ([69ff649](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/69ff6495564a5c670b89c0f802ebb1602f0e7cfa))
+
+
+### chore
+
+* fix example ([9cd9a87](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/9cd9a874f91bbbb2990444818e8ab2d0855cc361))
+
+
+### Test
+
+* Add scrape_graph test ([cdb3c11](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/cdb3c1100ee1117afedbc70437317acaf7c7c1d3))
+* Add smart_scraper_multi_parse_merge_first_graph test ([464b8b0](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/464b8b04ea0d51280849173d5eda92d4d4db8612))
+
 ## [1.27.0-beta.2](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.27.0-beta.1...v1.27.0-beta.2) (2024-10-18)
 
 
diff --git a/pyproject.toml b/pyproject.toml
index a7b15a89..85d7b442 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,7 +1,7 @@
 [project]
 name = "scrapegraphai"
 
-version = "1.27.0b2"
+version = "1.27.0b3"
 description = "A web scraping library based on LangChain which uses LLM and direct graph logic to create scraping pipelines."
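As context for the release above: the new ScrapeGraph class runs only the fetch-and-parse stage of the pipeline, with no answer-generation step; a later patch in this series renames it to SmartScraperLiteGraph. A minimal usage sketch, assuming the constructor shown later in this series (source, config, optional prompt and schema); the target URL, model name, and environment variable are illustrative:

import json
import os
from scrapegraphai.graphs import ScrapeGraph

# Illustrative configuration; like the other graphs, ScrapeGraph reads
# its LLM settings from the "llm" entry of the config dict.
graph_config = {
    "llm": {
        "api_key": os.getenv("OPENAI_API_KEY"),
        "model": "openai/gpt-4o",
    },
    "verbose": True,
}

# Fetches and parses the page; no prompt is needed here because no
# question answering is performed at this stage.
scrape_graph = ScrapeGraph(
    source="https://perinim.github.io/",
    config=graph_config,
)

result = scrape_graph.run()
print(json.dumps(result, indent=4))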
From 2991ca8dd207cc83409a84c261a1f87e5da47e01 Mon Sep 17 00:00:00 2001 From: Marco Vinciguerra Date: Mon, 21 Oct 2024 09:33:40 +0200 Subject: [PATCH 24/39] add examples smart scraper lite --- .../smart_scraper_multi_lite_anthropic.py | 35 ++++++++++++++ .../azure/smart_scraper_multi_lite_azure.py | 35 ++++++++++++++ .../smart_scraper_multi_lite_bedrock.py | 29 ++++++++++++ .../smart_scraper_multi_lite_deepseek.py | 35 ++++++++++++++ .../ernie/smart_scraper_multi_lite_ernie.py | 35 ++++++++++++++ .../smart_scraper_multi_lite_fireworks.py | 35 ++++++++++++++ .../smart_scraper_multi_lite_gemini.py | 0 .../smart_scraper_multi_lite_google_genai.py | 34 ++++++++++++++ ...mart_scraper_multi_lite_google_vertexai.py | 35 ++++++++++++++ .../smart_scraper_multi_lite_vertex.py | 0 .../groq/smart_scraper_multi_lite_groq.py | 35 ++++++++++++++ ...smart_scraper_multi_lite_huggingfacehub.py | 34 ++++++++++++++ ...smart_scraper_multi_lite_uhggingfacehub.py | 0 .../smart_scraper_multi_lite_ollama.py | 45 ++++++++++++++++++ .../smart_scraper_multi_lite_mistral.py | 35 ++++++++++++++ .../smart_scraper_multi_lite_moonshot.py | 34 ++++++++++++++ .../smart_scraper_multi_lite_nemotron.py | 46 +++++++++++++++++++ .../oneapi/smart_scraper_multi_lite_oneapi.py | 43 +++++++++++++++++ .../smart_scraper_multi_lite_together.py | 43 +++++++++++++++++ 19 files changed, 588 insertions(+) create mode 100644 examples/anthropic/smart_scraper_multi_lite_anthropic.py create mode 100644 examples/azure/smart_scraper_multi_lite_azure.py create mode 100644 examples/bedrock/smart_scraper_multi_lite_bedrock.py create mode 100644 examples/deepseek/smart_scraper_multi_lite_deepseek.py create mode 100644 examples/ernie/smart_scraper_multi_lite_ernie.py create mode 100644 examples/fireworks/smart_scraper_multi_lite_fireworks.py create mode 100644 examples/google_genai/smart_scraper_multi_lite_gemini.py create mode 100644 examples/google_genai/smart_scraper_multi_lite_google_genai.py create mode 100644 examples/google_vertexai/smart_scraper_multi_lite_google_vertexai.py create mode 100644 examples/google_vertexai/smart_scraper_multi_lite_vertex.py create mode 100644 examples/groq/smart_scraper_multi_lite_groq.py create mode 100644 examples/huggingfacehub/smart_scraper_multi_lite_huggingfacehub.py create mode 100644 examples/huggingfacehub/smart_scraper_multi_lite_uhggingfacehub.py create mode 100644 examples/local_models/smart_scraper_multi_lite_ollama.py create mode 100644 examples/mistral/smart_scraper_multi_lite_mistral.py create mode 100644 examples/moonshot/smart_scraper_multi_lite_moonshot.py create mode 100644 examples/nemotron/smart_scraper_multi_lite_nemotron.py create mode 100644 examples/oneapi/smart_scraper_multi_lite_oneapi.py create mode 100644 examples/together/smart_scraper_multi_lite_together.py diff --git a/examples/anthropic/smart_scraper_multi_lite_anthropic.py b/examples/anthropic/smart_scraper_multi_lite_anthropic.py new file mode 100644 index 00000000..7cf3c09d --- /dev/null +++ b/examples/anthropic/smart_scraper_multi_lite_anthropic.py @@ -0,0 +1,35 @@ +""" +Basic example of scraping pipeline using SmartScraper +""" +import os +import json +from dotenv import load_dotenv +from scrapegraphai.graphs import SmartScraperMultiLiteGraph +from scrapegraphai.utils import prettify_exec_info + +load_dotenv() + +graph_config = { + "llm": { + "api_key": os.getenv("ANTHROPIC_API_KEY"), + "model": "anthropic/claude-3-haiku-20240307", + }, + "verbose": True, + "headless": False, +} + +smart_scraper_multi_lite_graph = 
SmartScraperMultiLiteGraph( + prompt="Who is Marco Perini?", + source= [ + "https://perinim.github.io/", + "https://perinim.github.io/cv/" + ], + config=graph_config +) + +result = smart_scraper_multi_lite_graph.run() +print(json.dumps(result, indent=4)) + +graph_exec_info = smart_scraper_multi_lite_graph.get_execution_info() +print(prettify_exec_info(graph_exec_info)) + diff --git a/examples/azure/smart_scraper_multi_lite_azure.py b/examples/azure/smart_scraper_multi_lite_azure.py new file mode 100644 index 00000000..b9046d9f --- /dev/null +++ b/examples/azure/smart_scraper_multi_lite_azure.py @@ -0,0 +1,35 @@ +""" +Basic example of scraping pipeline using SmartScraper +""" +import os +import json +from dotenv import load_dotenv +from scrapegraphai.graphs import SmartScraperMultiLiteGraph +from scrapegraphai.utils import prettify_exec_info + +load_dotenv() + +graph_config = { + "llm": { + "api_key": os.environ["AZURE_OPENAI_KEY"], + "model": "azure_openai/gpt-4o" + }, + "verbose": True, + "headless": False +} + +smart_scraper_multi_lite_graph = SmartScraperMultiLiteGraph( + prompt="Who is Marco Perini?", + source= [ + "https://perinim.github.io/", + "https://perinim.github.io/cv/" + ], + config=graph_config +) + +result = smart_scraper_multi_lite_graph.run() +print(json.dumps(result, indent=4)) + +graph_exec_info = smart_scraper_multi_lite_graph.get_execution_info() +print(prettify_exec_info(graph_exec_info)) + diff --git a/examples/bedrock/smart_scraper_multi_lite_bedrock.py b/examples/bedrock/smart_scraper_multi_lite_bedrock.py new file mode 100644 index 00000000..5cb26067 --- /dev/null +++ b/examples/bedrock/smart_scraper_multi_lite_bedrock.py @@ -0,0 +1,29 @@ +""" +Basic example of scraping pipeline using SmartScraper +""" +import json +from scrapegraphai.graphs import SmartScraperMultiLiteGraph +from scrapegraphai.utils import prettify_exec_info + +graph_config = { + "llm": { + "client": "client_name", + "model": "bedrock/anthropic.claude-3-sonnet-20240229-v1:0", + "temperature": 0.0 + } +} + +smart_scraper_multi_lite_graph = SmartScraperMultiLiteGraph( + prompt="Who is Marco Perini?", + source= [ + "https://perinim.github.io/", + "https://perinim.github.io/cv/" + ], + config=graph_config +) + +result = smart_scraper_multi_lite_graph.run() +print(json.dumps(result, indent=4)) + +graph_exec_info = smart_scraper_multi_lite_graph.get_execution_info() +print(prettify_exec_info(graph_exec_info)) diff --git a/examples/deepseek/smart_scraper_multi_lite_deepseek.py b/examples/deepseek/smart_scraper_multi_lite_deepseek.py new file mode 100644 index 00000000..eb5eea01 --- /dev/null +++ b/examples/deepseek/smart_scraper_multi_lite_deepseek.py @@ -0,0 +1,35 @@ +""" +Basic example of scraping pipeline using SmartScraper +""" +import os +import json +from dotenv import load_dotenv +from scrapegraphai.graphs import SmartScraperMultiLiteGraph +from scrapegraphai.utils import prettify_exec_info + +load_dotenv() + +graph_config = { + "llm": { + "api_key": os.getenv("DEEPSEEK_API_KEY"), + "model": "deepseek/deepseek-coder-33b-instruct", + }, + "verbose": True, + "headless": False, +} + +smart_scraper_multi_lite_graph = SmartScraperMultiLiteGraph( + prompt="Who is Marco Perini?", + source= [ + "https://perinim.github.io/", + "https://perinim.github.io/cv/" + ], + config=graph_config +) + +result = smart_scraper_multi_lite_graph.run() +print(json.dumps(result, indent=4)) + +graph_exec_info = smart_scraper_multi_lite_graph.get_execution_info() +print(prettify_exec_info(graph_exec_info)) + diff --git 
a/examples/ernie/smart_scraper_multi_lite_ernie.py b/examples/ernie/smart_scraper_multi_lite_ernie.py new file mode 100644 index 00000000..777a760e --- /dev/null +++ b/examples/ernie/smart_scraper_multi_lite_ernie.py @@ -0,0 +1,35 @@ +""" +Basic example of scraping pipeline using SmartScraper +""" +import os +import json +from dotenv import load_dotenv +from scrapegraphai.graphs import SmartScraperMultiLiteGraph +from scrapegraphai.utils import prettify_exec_info + +load_dotenv() + +graph_config = { + "llm": { + "api_key": os.getenv("ERNIE_API_KEY"), + "model": "ernie/ernie-bot-4", + }, + "verbose": True, + "headless": False, +} + +smart_scraper_multi_lite_graph = SmartScraperMultiLiteGraph( + prompt="Who is Marco Perini?", + source= [ + "https://perinim.github.io/", + "https://perinim.github.io/cv/" + ], + config=graph_config +) + +result = smart_scraper_multi_lite_graph.run() +print(json.dumps(result, indent=4)) + +graph_exec_info = smart_scraper_multi_lite_graph.get_execution_info() +print(prettify_exec_info(graph_exec_info)) + diff --git a/examples/fireworks/smart_scraper_multi_lite_fireworks.py b/examples/fireworks/smart_scraper_multi_lite_fireworks.py new file mode 100644 index 00000000..4ffaf6bb --- /dev/null +++ b/examples/fireworks/smart_scraper_multi_lite_fireworks.py @@ -0,0 +1,35 @@ +""" +Basic example of scraping pipeline using SmartScraper +""" +import os +import json +from dotenv import load_dotenv +from scrapegraphai.graphs import SmartScraperMultiLiteGraph +from scrapegraphai.utils import prettify_exec_info + +load_dotenv() + +graph_config = { + "llm": { + "api_key": os.getenv("FIREWORKS_API_KEY"), + "model": "fireworks/llama-v2-70b-chat", + }, + "verbose": True, + "headless": False, +} + +smart_scraper_multi_lite_graph = SmartScraperMultiLiteGraph( + prompt="Who is Marco Perini?", + source= [ + "https://perinim.github.io/", + "https://perinim.github.io/cv/" + ], + config=graph_config +) + +result = smart_scraper_multi_lite_graph.run() +print(json.dumps(result, indent=4)) + +graph_exec_info = smart_scraper_multi_lite_graph.get_execution_info() +print(prettify_exec_info(graph_exec_info)) + diff --git a/examples/google_genai/smart_scraper_multi_lite_gemini.py b/examples/google_genai/smart_scraper_multi_lite_gemini.py new file mode 100644 index 00000000..e69de29b diff --git a/examples/google_genai/smart_scraper_multi_lite_google_genai.py b/examples/google_genai/smart_scraper_multi_lite_google_genai.py new file mode 100644 index 00000000..e14e2ceb --- /dev/null +++ b/examples/google_genai/smart_scraper_multi_lite_google_genai.py @@ -0,0 +1,34 @@ +""" +Basic example of scraping pipeline using SmartScraper +""" +import os +import json +from dotenv import load_dotenv +from scrapegraphai.graphs import SmartScraperMultiLiteGraph +from scrapegraphai.utils import prettify_exec_info + +load_dotenv() + +graph_config = { + "llm": { + "api_key": os.getenv("GOOGLE_API_KEY"), + "model": "gemini-pro", + }, + "verbose": True, + "headless": False, +} + +smart_scraper_multi_lite_graph = SmartScraperMultiLiteGraph( + prompt="Who is Marco Perini?", + source= [ + "https://perinim.github.io/", + "https://perinim.github.io/cv/" + ], + config=graph_config +) + +result = smart_scraper_multi_lite_graph.run() +print(json.dumps(result, indent=4)) + +graph_exec_info = smart_scraper_multi_lite_graph.get_execution_info() +print(prettify_exec_info(graph_exec_info)) diff --git a/examples/google_vertexai/smart_scraper_multi_lite_google_vertexai.py 
b/examples/google_vertexai/smart_scraper_multi_lite_google_vertexai.py new file mode 100644 index 00000000..5c293416 --- /dev/null +++ b/examples/google_vertexai/smart_scraper_multi_lite_google_vertexai.py @@ -0,0 +1,35 @@ +""" +Basic example of scraping pipeline using SmartScraper +""" +import os +import json +from dotenv import load_dotenv +from scrapegraphai.graphs import SmartScraperMultiLiteGraph +from scrapegraphai.utils import prettify_exec_info + +load_dotenv() + +graph_config = { + "llm": { + "project": os.getenv("GOOGLE_CLOUD_PROJECT"), + "location": "us-central1", + "model": "text-bison@001", + }, + "verbose": True, + "headless": False, +} + +smart_scraper_multi_lite_graph = SmartScraperMultiLiteGraph( + prompt="Who is Marco Perini?", + source= [ + "https://perinim.github.io/", + "https://perinim.github.io/cv/" + ], + config=graph_config +) + +result = smart_scraper_multi_lite_graph.run() +print(json.dumps(result, indent=4)) + +graph_exec_info = smart_scraper_multi_lite_graph.get_execution_info() +print(prettify_exec_info(graph_exec_info)) diff --git a/examples/google_vertexai/smart_scraper_multi_lite_vertex.py b/examples/google_vertexai/smart_scraper_multi_lite_vertex.py new file mode 100644 index 00000000..e69de29b diff --git a/examples/groq/smart_scraper_multi_lite_groq.py b/examples/groq/smart_scraper_multi_lite_groq.py new file mode 100644 index 00000000..9c8e4d1d --- /dev/null +++ b/examples/groq/smart_scraper_multi_lite_groq.py @@ -0,0 +1,35 @@ +""" +Basic example of scraping pipeline using SmartScraper +""" +import os +import json +from dotenv import load_dotenv +from scrapegraphai.graphs import SmartScraperMultiLiteGraph +from scrapegraphai.utils import prettify_exec_info + +load_dotenv() + +graph_config = { + "llm": { + "api_key": os.getenv("GROQ_API_KEY"), + "model": "mixtral-8x7b-32768", + }, + "verbose": True, + "headless": False, +} + +smart_scraper_multi_lite_graph = SmartScraperMultiLiteGraph( + prompt="Who is Marco Perini?", + source= [ + "https://perinim.github.io/", + "https://perinim.github.io/cv/" + ], + config=graph_config +) + +result = smart_scraper_multi_lite_graph.run() +print(json.dumps(result, indent=4)) + +graph_exec_info = smart_scraper_multi_lite_graph.get_execution_info() +print(prettify_exec_info(graph_exec_info)) + diff --git a/examples/huggingfacehub/smart_scraper_multi_lite_huggingfacehub.py b/examples/huggingfacehub/smart_scraper_multi_lite_huggingfacehub.py new file mode 100644 index 00000000..2d7a3a45 --- /dev/null +++ b/examples/huggingfacehub/smart_scraper_multi_lite_huggingfacehub.py @@ -0,0 +1,34 @@ +""" +Basic example of scraping pipeline using SmartScraper +""" +import os +import json +from dotenv import load_dotenv +from scrapegraphai.graphs import SmartScraperMultiLiteGraph +from scrapegraphai.utils import prettify_exec_info + +load_dotenv() + +graph_config = { + "llm": { + "api_key": os.getenv("HUGGINGFACEHUB_API_TOKEN"), + "model": "huggingfacehub/meta-llama/Llama-2-70b-chat-hf", + }, + "verbose": True, + "headless": False, +} + +smart_scraper_multi_lite_graph = SmartScraperMultiLiteGraph( + prompt="Who is Marco Perini?", + source= [ + "https://perinim.github.io/", + "https://perinim.github.io/cv/" + ], + config=graph_config +) + +result = smart_scraper_multi_lite_graph.run() +print(json.dumps(result, indent=4)) + +graph_exec_info = smart_scraper_multi_lite_graph.get_execution_info() +print(prettify_exec_info(graph_exec_info)) diff --git a/examples/huggingfacehub/smart_scraper_multi_lite_uhggingfacehub.py 
b/examples/huggingfacehub/smart_scraper_multi_lite_uhggingfacehub.py new file mode 100644 index 00000000..e69de29b diff --git a/examples/local_models/smart_scraper_multi_lite_ollama.py b/examples/local_models/smart_scraper_multi_lite_ollama.py new file mode 100644 index 00000000..f09c4cb4 --- /dev/null +++ b/examples/local_models/smart_scraper_multi_lite_ollama.py @@ -0,0 +1,45 @@ +""" +Basic example of scraping pipeline using SmartScraper +""" +import json +from scrapegraphai.graphs import SmartScraperMultiLiteGraph +from scrapegraphai.utils import prettify_exec_info + +# ************************************************ +# Define the configuration for the graph +# ************************************************ + +graph_config = { + "llm": { + "model": "ollama/llama3.1", + "temperature": 0, + "format": "json", # Ollama needs the format to be specified explicitly + "base_url": "http://localhost:11434", # set ollama URL arbitrarily + }, + "verbose": True, + "headless": False +} + +# ************************************************ +# Create the SmartScraperGraph instance and run it +# ************************************************ + +smart_scraper_multi_lite_graph = SmartScraperMultiLiteGraph( + prompt="Who is Marco Perini?", + source= [ + "https://perinim.github.io/", + "https://perinim.github.io/cv/" + ], + config=graph_config +) + +result = smart_scraper_multi_lite_graph.run() +print(json.dumps(result, indent=4)) + +# ************************************************ +# Get graph execution info +# ************************************************ + +graph_exec_info = smart_scraper_multi_lite_graph.get_execution_info() +print(prettify_exec_info(graph_exec_info)) + diff --git a/examples/mistral/smart_scraper_multi_lite_mistral.py b/examples/mistral/smart_scraper_multi_lite_mistral.py new file mode 100644 index 00000000..ce2d19bf --- /dev/null +++ b/examples/mistral/smart_scraper_multi_lite_mistral.py @@ -0,0 +1,35 @@ +""" +Basic example of scraping pipeline using SmartScraper +""" +import os +import json +from dotenv import load_dotenv +from scrapegraphai.graphs import SmartScraperMultiLiteGraph +from scrapegraphai.utils import prettify_exec_info + +load_dotenv() + +graph_config = { + "llm": { + "api_key": os.getenv("MISTRAL_API_KEY"), + "model": "mistral/mistral-medium", + }, + "verbose": True, + "headless": False, +} + +smart_scraper_multi_lite_graph = SmartScraperMultiLiteGraph( + prompt="Who is Marco Perini?", + source= [ + "https://perinim.github.io/", + "https://perinim.github.io/cv/" + ], + config=graph_config +) + +result = smart_scraper_multi_lite_graph.run() +print(json.dumps(result, indent=4)) + +graph_exec_info = smart_scraper_multi_lite_graph.get_execution_info() +print(prettify_exec_info(graph_exec_info)) + diff --git a/examples/moonshot/smart_scraper_multi_lite_moonshot.py b/examples/moonshot/smart_scraper_multi_lite_moonshot.py new file mode 100644 index 00000000..b3e2b7be --- /dev/null +++ b/examples/moonshot/smart_scraper_multi_lite_moonshot.py @@ -0,0 +1,34 @@ +""" +Basic example of scraping pipeline using SmartScraper +""" +import os +import json +from dotenv import load_dotenv +from scrapegraphai.graphs import SmartScraperMultiLiteGraph +from scrapegraphai.utils import prettify_exec_info + +load_dotenv() + +graph_config = { + "llm": { + "api_key": os.getenv("MOONSHOT_API_KEY"), + "model": "moonshot/moonshot-v1-8b", + }, + "verbose": True, + "headless": False, +} + +smart_scraper_multi_lite_graph = SmartScraperMultiLiteGraph( + prompt="Who is Marco Perini?", + source= 
[ + "https://perinim.github.io/", + "https://perinim.github.io/cv/" + ], + config=graph_config +) + +result = smart_scraper_multi_lite_graph.run() +print(json.dumps(result, indent=4)) + +graph_exec_info = smart_scraper_multi_lite_graph.get_execution_info() +print(prettify_exec_info(graph_exec_info)) diff --git a/examples/nemotron/smart_scraper_multi_lite_nemotron.py b/examples/nemotron/smart_scraper_multi_lite_nemotron.py new file mode 100644 index 00000000..7639d820 --- /dev/null +++ b/examples/nemotron/smart_scraper_multi_lite_nemotron.py @@ -0,0 +1,46 @@ +""" +Basic example of scraping pipeline using SmartScraper +""" +import os +import json +from dotenv import load_dotenv +from scrapegraphai.graphs import SmartScraperMultiLiteGraph +from scrapegraphai.utils import prettify_exec_info + +load_dotenv() + +# ************************************************ +# Define the configuration for the graph +# ************************************************ + +graph_config = { + "llm": { + "api_key": os.getenv("NEMOTRON_API_KEY"), + "model": "nemotron/nemotron-3-8b-chat", + }, + "verbose": True, + "headless": False, +} + +# ************************************************ +# Create the SmartScraperGraph instance and run it +# ************************************************ + +smart_scraper_multi_lite_graph = SmartScraperMultiLiteGraph( + prompt="Who is Marco Perini?", + source= [ + "https://perinim.github.io/", + "https://perinim.github.io/cv/" + ], + config=graph_config +) + +result = smart_scraper_multi_lite_graph.run() +print(json.dumps(result, indent=4)) + +# ************************************************ +# Get graph execution info +# ************************************************ + +graph_exec_info = smart_scraper_multi_lite_graph.get_execution_info() +print(prettify_exec_info(graph_exec_info)) diff --git a/examples/oneapi/smart_scraper_multi_lite_oneapi.py b/examples/oneapi/smart_scraper_multi_lite_oneapi.py new file mode 100644 index 00000000..8cf66dea --- /dev/null +++ b/examples/oneapi/smart_scraper_multi_lite_oneapi.py @@ -0,0 +1,43 @@ +""" +Basic example of scraping pipeline using SmartScraper +""" +import os +import json +from dotenv import load_dotenv +from scrapegraphai.graphs import SmartScraperMultiLiteGraph +from scrapegraphai.utils import prettify_exec_info + +load_dotenv() + +together_key = os.getenv("TOGETHER_APIKEY") + +graph_config = { + "llm": { + "model": "togetherai/meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo", + "api_key": together_key, + }, + "verbose": True, +} + +# ************************************************ +# Create the SmartScraperGraph instance and run it +# ************************************************ + +smart_scraper_multi_lite_graph = SmartScraperMultiLiteGraph( + prompt="Who is Marco Perini?", + source= [ + "https://perinim.github.io/", + "https://perinim.github.io/cv/" + ], + config=graph_config +) + +result = smart_scraper_multi_lite_graph.run() +print(json.dumps(result, indent=4)) + +# ************************************************ +# Get graph execution info +# ************************************************ + +graph_exec_info = smart_scraper_multi_lite_graph.get_execution_info() +print(prettify_exec_info(graph_exec_info)) diff --git a/examples/together/smart_scraper_multi_lite_together.py b/examples/together/smart_scraper_multi_lite_together.py new file mode 100644 index 00000000..8cf66dea --- /dev/null +++ b/examples/together/smart_scraper_multi_lite_together.py @@ -0,0 +1,43 @@ +""" +Basic example of scraping pipeline using SmartScraper 
+""" +import os +import json +from dotenv import load_dotenv +from scrapegraphai.graphs import SmartScraperMultiLiteGraph +from scrapegraphai.utils import prettify_exec_info + +load_dotenv() + +together_key = os.getenv("TOGETHER_APIKEY") + +graph_config = { + "llm": { + "model": "togetherai/meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo", + "api_key": together_key, + }, + "verbose": True, +} + +# ************************************************ +# Create the SmartScraperGraph instance and run it +# ************************************************ + +smart_scraper_multi_lite_graph = SmartScraperMultiLiteGraph( + prompt="Who is Marco Perini?", + source= [ + "https://perinim.github.io/", + "https://perinim.github.io/cv/" + ], + config=graph_config +) + +result = smart_scraper_multi_lite_graph.run() +print(json.dumps(result, indent=4)) + +# ************************************************ +# Get graph execution info +# ************************************************ + +graph_exec_info = smart_scraper_multi_lite_graph.get_execution_info() +print(prettify_exec_info(graph_exec_info)) From b84883bfd12f4d1b4a0528e2c0503b649ea1e1fb Mon Sep 17 00:00:00 2001 From: Marco Vinciguerra Date: Mon, 21 Oct 2024 09:39:17 +0200 Subject: [PATCH 25/39] add smartscraper lite --- .../anthropic/smart_scraper_lite_anthropic.py | 32 +++++++++++++ examples/azure/smart_scraper_lite_azure.py | 31 ++++++++++++ .../bedrock/smart_scraper_lite_bedrock.py | 26 ++++++++++ .../deepseek/smart_scraper_lite_deepseek.py | 31 ++++++++++++ examples/ernie/smart_scraper_lite_ernie.py | 31 ++++++++++++ .../fireworks/smart_scraper_lite_fireworks.py | 31 ++++++++++++ .../smart_scraper_lite_google_genai.py | 31 ++++++++++++ .../smart_scraper_lite_google_vertexai.py | 33 +++++++++++++ .../smart_scraper_multi_lite_vertex.py | 47 +++++++++++++++++++ examples/groq/smart_scraper_lite_groq.py | 31 ++++++++++++ .../smart_scraper_lite_huggingfacehub.py | 31 ++++++++++++ .../local_models/smart_scraper_lite_ollama.py | 30 ++++++++++++ .../mistral/smart_scraper_lite_mistral.py | 31 ++++++++++++ .../moonshot/smart_scraper_lite_moonshot.py | 31 ++++++++++++ .../nemotron/smart_scraper_lite_nemotron.py | 32 +++++++++++++ examples/oneapi/smart_scraper_lite_oneapi.py | 32 +++++++++++++ examples/openai/smart_scraper_lite_openai.py | 32 +++++++++++++ .../together/smart_scraper_lite_together.py | 1 + 18 files changed, 544 insertions(+) create mode 100644 examples/anthropic/smart_scraper_lite_anthropic.py create mode 100644 examples/azure/smart_scraper_lite_azure.py create mode 100644 examples/bedrock/smart_scraper_lite_bedrock.py create mode 100644 examples/deepseek/smart_scraper_lite_deepseek.py create mode 100644 examples/ernie/smart_scraper_lite_ernie.py create mode 100644 examples/fireworks/smart_scraper_lite_fireworks.py create mode 100644 examples/google_genai/smart_scraper_lite_google_genai.py create mode 100644 examples/google_vertexai/smart_scraper_lite_google_vertexai.py create mode 100644 examples/groq/smart_scraper_lite_groq.py create mode 100644 examples/huggingfacehub/smart_scraper_lite_huggingfacehub.py create mode 100644 examples/local_models/smart_scraper_lite_ollama.py create mode 100644 examples/mistral/smart_scraper_lite_mistral.py create mode 100644 examples/moonshot/smart_scraper_lite_moonshot.py create mode 100644 examples/nemotron/smart_scraper_lite_nemotron.py create mode 100644 examples/oneapi/smart_scraper_lite_oneapi.py create mode 100644 examples/openai/smart_scraper_lite_openai.py create mode 100644 
examples/together/smart_scraper_lite_together.py diff --git a/examples/anthropic/smart_scraper_lite_anthropic.py b/examples/anthropic/smart_scraper_lite_anthropic.py new file mode 100644 index 00000000..698623c6 --- /dev/null +++ b/examples/anthropic/smart_scraper_lite_anthropic.py @@ -0,0 +1,32 @@ +""" +Basic example of scraping pipeline using SmartScraper +""" +import os +import json +from dotenv import load_dotenv +from scrapegraphai.graphs import SmartScraperLiteGraph +from scrapegraphai.utils import prettify_exec_info + +load_dotenv() + +graph_config = { + "llm": { + "api_key": os.getenv("ANTHROPIC_API_KEY"), + "model": "anthropic/claude-3-haiku-20240307", + }, + "verbose": True, + "headless": False, +} + +smart_scraper_lite_graph = SmartScraperLiteGraph( + prompt="Who is Marco Perini?", + source="https://perinim.github.io/", + config=graph_config +) + +result = smart_scraper_lite_graph.run() +print(json.dumps(result, indent=4)) + +graph_exec_info = smart_scraper_lite_graph.get_execution_info() +print(prettify_exec_info(graph_exec_info)) + diff --git a/examples/azure/smart_scraper_lite_azure.py b/examples/azure/smart_scraper_lite_azure.py new file mode 100644 index 00000000..335c4832 --- /dev/null +++ b/examples/azure/smart_scraper_lite_azure.py @@ -0,0 +1,31 @@ +""" +Basic example of scraping pipeline using SmartScraper +""" +import os +import json +from dotenv import load_dotenv +from scrapegraphai.graphs import SmartScraperLiteGraph +from scrapegraphai.utils import prettify_exec_info + +load_dotenv() + +graph_config = { + "llm": { + "api_key": os.environ["AZURE_OPENAI_KEY"], + "model": "azure_openai/gpt-4o" + }, + "verbose": True, + "headless": False +} + +smart_scraper_lite_graph = SmartScraperLiteGraph( + prompt="Who is Marco Perini?", + source="https://perinim.github.io/", + config=graph_config +) + +result = smart_scraper_lite_graph.run() +print(json.dumps(result, indent=4)) + +graph_exec_info = smart_scraper_lite_graph.get_execution_info() +print(prettify_exec_info(graph_exec_info)) diff --git a/examples/bedrock/smart_scraper_lite_bedrock.py b/examples/bedrock/smart_scraper_lite_bedrock.py new file mode 100644 index 00000000..2bf0471c --- /dev/null +++ b/examples/bedrock/smart_scraper_lite_bedrock.py @@ -0,0 +1,26 @@ +""" +Basic example of scraping pipeline using SmartScraper +""" +import json +from scrapegraphai.graphs import SmartScraperLiteGraph +from scrapegraphai.utils import prettify_exec_info + +graph_config = { + "llm": { + "client": "client_name", + "model": "bedrock/anthropic.claude-3-sonnet-20240229-v1:0", + "temperature": 0.0 + } +} + +smart_scraper_lite_graph = SmartScraperLiteGraph( + prompt="Who is Marco Perini?", + source="https://perinim.github.io/", + config=graph_config +) + +result = smart_scraper_lite_graph.run() +print(json.dumps(result, indent=4)) + +graph_exec_info = smart_scraper_lite_graph.get_execution_info() +print(prettify_exec_info(graph_exec_info)) diff --git a/examples/deepseek/smart_scraper_lite_deepseek.py b/examples/deepseek/smart_scraper_lite_deepseek.py new file mode 100644 index 00000000..a70d76b0 --- /dev/null +++ b/examples/deepseek/smart_scraper_lite_deepseek.py @@ -0,0 +1,31 @@ +""" +Basic example of scraping pipeline using SmartScraper +""" +import os +import json +from dotenv import load_dotenv +from scrapegraphai.graphs import SmartScraperLiteGraph +from scrapegraphai.utils import prettify_exec_info + +load_dotenv() + +graph_config = { + "llm": { + "api_key": os.getenv("DEEPSEEK_API_KEY"), + "model": 
"deepseek/deepseek-coder-33b-instruct", + }, + "verbose": True, + "headless": False, +} + +smart_scraper_lite_graph = SmartScraperLiteGraph( + prompt="Who is Marco Perini?", + source="https://perinim.github.io/", + config=graph_config +) + +result = smart_scraper_lite_graph.run() +print(json.dumps(result, indent=4)) + +graph_exec_info = smart_scraper_lite_graph.get_execution_info() +print(prettify_exec_info(graph_exec_info)) diff --git a/examples/ernie/smart_scraper_lite_ernie.py b/examples/ernie/smart_scraper_lite_ernie.py new file mode 100644 index 00000000..5d3ba9d9 --- /dev/null +++ b/examples/ernie/smart_scraper_lite_ernie.py @@ -0,0 +1,31 @@ +""" +Basic example of scraping pipeline using SmartScraper +""" +import os +import json +from dotenv import load_dotenv +from scrapegraphai.graphs import SmartScraperLiteGraph +from scrapegraphai.utils import prettify_exec_info + +load_dotenv() + +graph_config = { + "llm": { + "api_key": os.getenv("ERNIE_API_KEY"), + "model": "ernie/ernie-bot-4", + }, + "verbose": True, + "headless": False, +} + +smart_scraper_lite_graph = SmartScraperLiteGraph( + prompt="Who is Marco Perini?", + source="https://perinim.github.io/", + config=graph_config +) + +result = smart_scraper_lite_graph.run() +print(json.dumps(result, indent=4)) + +graph_exec_info = smart_scraper_lite_graph.get_execution_info() +print(prettify_exec_info(graph_exec_info)) diff --git a/examples/fireworks/smart_scraper_lite_fireworks.py b/examples/fireworks/smart_scraper_lite_fireworks.py new file mode 100644 index 00000000..6c9a7745 --- /dev/null +++ b/examples/fireworks/smart_scraper_lite_fireworks.py @@ -0,0 +1,31 @@ +""" +Basic example of scraping pipeline using SmartScraper +""" +import os +import json +from dotenv import load_dotenv +from scrapegraphai.graphs import SmartScraperLiteGraph +from scrapegraphai.utils import prettify_exec_info + +load_dotenv() + +graph_config = { + "llm": { + "api_key": os.getenv("FIREWORKS_API_KEY"), + "model": "fireworks/llama-v2-70b-chat", + }, + "verbose": True, + "headless": False, +} + +smart_scraper_lite_graph = SmartScraperLiteGraph( + prompt="Who is Marco Perini?", + source="https://perinim.github.io/", + config=graph_config +) + +result = smart_scraper_lite_graph.run() +print(json.dumps(result, indent=4)) + +graph_exec_info = smart_scraper_lite_graph.get_execution_info() +print(prettify_exec_info(graph_exec_info)) diff --git a/examples/google_genai/smart_scraper_lite_google_genai.py b/examples/google_genai/smart_scraper_lite_google_genai.py new file mode 100644 index 00000000..9b776735 --- /dev/null +++ b/examples/google_genai/smart_scraper_lite_google_genai.py @@ -0,0 +1,31 @@ +""" +Basic example of scraping pipeline using SmartScraper +""" +import os +import json +from dotenv import load_dotenv +from scrapegraphai.graphs import SmartScraperLiteGraph +from scrapegraphai.utils import prettify_exec_info + +load_dotenv() + +graph_config = { + "llm": { + "api_key": os.getenv("GOOGLE_API_KEY"), + "model": "gemini-pro", + }, + "verbose": True, + "headless": False, +} + +smart_scraper_lite_graph = SmartScraperLiteGraph( + prompt="Who is Marco Perini?", + source="https://perinim.github.io/", + config=graph_config +) + +result = smart_scraper_lite_graph.run() +print(json.dumps(result, indent=4)) + +graph_exec_info = smart_scraper_lite_graph.get_execution_info() +print(prettify_exec_info(graph_exec_info)) diff --git a/examples/google_vertexai/smart_scraper_lite_google_vertexai.py b/examples/google_vertexai/smart_scraper_lite_google_vertexai.py new file 
mode 100644 index 00000000..eca61bbb --- /dev/null +++ b/examples/google_vertexai/smart_scraper_lite_google_vertexai.py @@ -0,0 +1,33 @@ +""" +Basic example of scraping pipeline using SmartScraper +""" +import os +import json +from dotenv import load_dotenv +from scrapegraphai.graphs import SmartScraperLiteGraph +from scrapegraphai.utils import prettify_exec_info + +load_dotenv() + +graph_config = { + "llm": { + "project": os.getenv("GOOGLE_CLOUD_PROJECT"), + "location": "us-central1", + "model": "text-bison@001", + }, + "verbose": True, + "headless": False, +} + +smart_scraper_lite_graph = SmartScraperLiteGraph( + prompt="Who is Marco Perini?", + source="https://perinim.github.io/", + config=graph_config +) + +result = smart_scraper_lite_graph.run() +print(json.dumps(result, indent=4)) + +graph_exec_info = smart_scraper_lite_graph.get_execution_info() +print(prettify_exec_info(graph_exec_info)) + diff --git a/examples/google_vertexai/smart_scraper_multi_lite_vertex.py b/examples/google_vertexai/smart_scraper_multi_lite_vertex.py index e69de29b..60ff3638 100644 --- a/examples/google_vertexai/smart_scraper_multi_lite_vertex.py +++ b/examples/google_vertexai/smart_scraper_multi_lite_vertex.py @@ -0,0 +1,47 @@ +""" +Basic example of scraping pipeline using SmartScraper +""" +import os +import json +from dotenv import load_dotenv +from scrapegraphai.graphs import SmartScraperMultiLiteGraph +from scrapegraphai.utils import prettify_exec_info + +load_dotenv() + +# ************************************************ +# Define the configuration for the graph +# ************************************************ + +graph_config = { + "llm": { + "project": os.getenv("GOOGLE_CLOUD_PROJECT"), + "location": "us-central1", + "model": "text-bison@001", + }, + "verbose": True, + "headless": False, +} + +# ************************************************ +# Create the SmartScraperGraph instance and run it +# ************************************************ + +smart_scraper_multi_lite_graph = SmartScraperMultiLiteGraph( + prompt="Who is Marco Perini?", + source= [ + "https://perinim.github.io/", + "https://perinim.github.io/cv/" + ], + config=graph_config +) + +result = smart_scraper_multi_lite_graph.run() +print(json.dumps(result, indent=4)) + +# ************************************************ +# Get graph execution info +# ************************************************ + +graph_exec_info = smart_scraper_multi_lite_graph.get_execution_info() +print(prettify_exec_info(graph_exec_info)) diff --git a/examples/groq/smart_scraper_lite_groq.py b/examples/groq/smart_scraper_lite_groq.py new file mode 100644 index 00000000..5fe6022f --- /dev/null +++ b/examples/groq/smart_scraper_lite_groq.py @@ -0,0 +1,31 @@ +""" +Basic example of scraping pipeline using SmartScraper +""" +import os +import json +from dotenv import load_dotenv +from scrapegraphai.graphs import SmartScraperLiteGraph +from scrapegraphai.utils import prettify_exec_info + +load_dotenv() + +graph_config = { + "llm": { + "api_key": os.getenv("GROQ_API_KEY"), + "model": "mixtral-8x7b-32768", + }, + "verbose": True, + "headless": False, +} + +smart_scraper_lite_graph = SmartScraperLiteGraph( + prompt="Who is Marco Perini?", + source="https://perinim.github.io/", + config=graph_config +) + +result = smart_scraper_lite_graph.run() +print(json.dumps(result, indent=4)) + +graph_exec_info = smart_scraper_lite_graph.get_execution_info() +print(prettify_exec_info(graph_exec_info)) diff --git a/examples/huggingfacehub/smart_scraper_lite_huggingfacehub.py 
b/examples/huggingfacehub/smart_scraper_lite_huggingfacehub.py new file mode 100644 index 00000000..4faa8a47 --- /dev/null +++ b/examples/huggingfacehub/smart_scraper_lite_huggingfacehub.py @@ -0,0 +1,31 @@ +""" +Basic example of scraping pipeline using SmartScraper +""" +import os +import json +from dotenv import load_dotenv +from scrapegraphai.graphs import SmartScraperLiteGraph +from scrapegraphai.utils import prettify_exec_info + +load_dotenv() + +graph_config = { + "llm": { + "api_key": os.getenv("HUGGINGFACEHUB_API_TOKEN"), + "model": "huggingfacehub/meta-llama/Llama-2-70b-chat-hf", + }, + "verbose": True, + "headless": False, +} + +smart_scraper_lite_graph = SmartScraperLiteGraph( + prompt="Who is Marco Perini?", + source="https://perinim.github.io/", + config=graph_config +) + +result = smart_scraper_lite_graph.run() +print(json.dumps(result, indent=4)) + +graph_exec_info = smart_scraper_lite_graph.get_execution_info() +print(prettify_exec_info(graph_exec_info)) diff --git a/examples/local_models/smart_scraper_lite_ollama.py b/examples/local_models/smart_scraper_lite_ollama.py new file mode 100644 index 00000000..2cf6c402 --- /dev/null +++ b/examples/local_models/smart_scraper_lite_ollama.py @@ -0,0 +1,30 @@ +""" +Basic example of scraping pipeline using SmartScraper + +""" +import json +from scrapegraphai.graphs import SmartScraperLiteGraph +from scrapegraphai.utils import prettify_exec_info + +graph_config = { + "llm": { + "model": "ollama/llama3.1", + "temperature": 0, + "format": "json", + "base_url": "http://localhost:11434", + }, + "verbose": True, + "headless": False +} + +smart_scraper_lite_graph = SmartScraperLiteGraph( + prompt="Who is Marco Perini?", + source="https://perinim.github.io/", + config=graph_config +) + +result = smart_scraper_lite_graph.run() +print(json.dumps(result, indent=4)) + +graph_exec_info = smart_scraper_lite_graph.get_execution_info() +print(prettify_exec_info(graph_exec_info)) diff --git a/examples/mistral/smart_scraper_lite_mistral.py b/examples/mistral/smart_scraper_lite_mistral.py new file mode 100644 index 00000000..390371f9 --- /dev/null +++ b/examples/mistral/smart_scraper_lite_mistral.py @@ -0,0 +1,31 @@ +""" +Basic example of scraping pipeline using SmartScraper +""" +import os +import json +from dotenv import load_dotenv +from scrapegraphai.graphs import SmartScraperLiteGraph +from scrapegraphai.utils import prettify_exec_info + +load_dotenv() + +graph_config = { + "llm": { + "api_key": os.getenv("MISTRAL_API_KEY"), + "model": "mistral/mistral-medium", + }, + "verbose": True, + "headless": False, +} + +smart_scraper_lite_graph = SmartScraperLiteGraph( + prompt="Who is Marco Perini?", + source="https://perinim.github.io/", + config=graph_config +) + +result = smart_scraper_lite_graph.run() +print(json.dumps(result, indent=4)) + +graph_exec_info = smart_scraper_lite_graph.get_execution_info() +print(prettify_exec_info(graph_exec_info)) diff --git a/examples/moonshot/smart_scraper_lite_moonshot.py b/examples/moonshot/smart_scraper_lite_moonshot.py new file mode 100644 index 00000000..509027fb --- /dev/null +++ b/examples/moonshot/smart_scraper_lite_moonshot.py @@ -0,0 +1,31 @@ +""" +Basic example of scraping pipeline using SmartScraper +""" +import os +import json +from dotenv import load_dotenv +from scrapegraphai.graphs import SmartScraperLiteGraph +from scrapegraphai.utils import prettify_exec_info + +load_dotenv() + +graph_config = { + "llm": { + "api_key": os.getenv("ANTHROPIC_API_KEY"), + "model": "anthropic/claude-3-haiku-20240307", + 
}, + "verbose": True, + "headless": False, +} + +smart_scraper_lite_graph = SmartScraperLiteGraph( + prompt="Who is Marco Perini?", + source="https://perinim.github.io/", + config=graph_config +) + +result = smart_scraper_lite_graph.run() +print(json.dumps(result, indent=4)) + +graph_exec_info = smart_scraper_lite_graph.get_execution_info() +print(prettify_exec_info(graph_exec_info)) diff --git a/examples/nemotron/smart_scraper_lite_nemotron.py b/examples/nemotron/smart_scraper_lite_nemotron.py new file mode 100644 index 00000000..6c1d8528 --- /dev/null +++ b/examples/nemotron/smart_scraper_lite_nemotron.py @@ -0,0 +1,32 @@ +""" +Basic example of scraping pipeline using SmartScraper +""" +import os +import json +from dotenv import load_dotenv +from scrapegraphai.graphs import SmartScraperLiteGraph +from scrapegraphai.utils import prettify_exec_info + +load_dotenv() + +graph_config = { + "llm": { + "api_key": os.getenv("NEMOTRON_API_KEY"), + "model": "nemotron/nemotron-3.5-turbo", + "base_url": "http://127.0.0.1:3000/v1", + }, + "verbose": True, + "headless": False, +} + +smart_scraper_lite_graph = SmartScraperLiteGraph( + prompt="Who is Marco Perini?", + source="https://perinim.github.io/", + config=graph_config +) + +result = smart_scraper_lite_graph.run() +print(json.dumps(result, indent=4)) + +graph_exec_info = smart_scraper_lite_graph.get_execution_info() +print(prettify_exec_info(graph_exec_info)) diff --git a/examples/oneapi/smart_scraper_lite_oneapi.py b/examples/oneapi/smart_scraper_lite_oneapi.py new file mode 100644 index 00000000..b271acb3 --- /dev/null +++ b/examples/oneapi/smart_scraper_lite_oneapi.py @@ -0,0 +1,32 @@ +""" +Basic example of scraping pipeline using SmartScraper +""" +import os +import json +from dotenv import load_dotenv +from scrapegraphai.graphs import SmartScraperLiteGraph +from scrapegraphai.utils import prettify_exec_info + +load_dotenv() + +graph_config = { + "llm": { + "api_key": os.getenv("ONEAPI_API_KEY"), + "model": "oneapi/gpt-3.5-turbo", + "base_url": "http://127.0.0.1:3000/v1", + }, + "verbose": True, + "headless": False, +} + +smart_scraper_lite_graph = SmartScraperLiteGraph( + prompt="Who is Marco Perini?", + source="https://perinim.github.io/", + config=graph_config +) + +result = smart_scraper_lite_graph.run() +print(json.dumps(result, indent=4)) + +graph_exec_info = smart_scraper_lite_graph.get_execution_info() +print(prettify_exec_info(graph_exec_info)) diff --git a/examples/openai/smart_scraper_lite_openai.py b/examples/openai/smart_scraper_lite_openai.py new file mode 100644 index 00000000..5de725bb --- /dev/null +++ b/examples/openai/smart_scraper_lite_openai.py @@ -0,0 +1,32 @@ +""" +Basic example of scraping pipeline using SmartScraper +""" +import os +import json +from dotenv import load_dotenv +from scrapegraphai.graphs import SmartScraperLiteGraph +from scrapegraphai.utils import prettify_exec_info + +load_dotenv() + +graph_config = { + "llm": { + "api_key": os.getenv("OPENAI_API_KEY"), + "model": "openai/gpt-4o", + }, + "verbose": True, + "headless": False, +} + +smart_scraper_lite_graph = SmartScraperLiteGraph( + prompt="Who is Marco Perini?", + source="https://perinim.github.io/", + config=graph_config +) + +result = smart_scraper_lite_graph.run() +print(json.dumps(result, indent=4)) + +graph_exec_info = smart_scraper_lite_graph.get_execution_info() +print(prettify_exec_info(graph_exec_info)) + diff --git a/examples/together/smart_scraper_lite_together.py b/examples/together/smart_scraper_lite_together.py new file mode 100644 index 
00000000..0519ecba --- /dev/null +++ b/examples/together/smart_scraper_lite_together.py @@ -0,0 +1 @@ + \ No newline at end of file From 52b6bf5fb8c570aa8ef026916230c5d52996f887 Mon Sep 17 00:00:00 2001 From: Marco Vinciguerra Date: Mon, 21 Oct 2024 10:12:53 +0200 Subject: [PATCH 26/39] feat: refactoring of ScrapeGraph to SmartScraperLiteGraph --- scrapegraphai/graphs/__init__.py | 2 +- .../{scrape_graph.py => smart_scraper_lite_graph.py} | 9 +++++---- scrapegraphai/graphs/smart_scraper_multi_lite_graph.py | 4 ++-- 3 files changed, 8 insertions(+), 7 deletions(-) rename scrapegraphai/graphs/{scrape_graph.py => smart_scraper_lite_graph.py} (92%) diff --git a/scrapegraphai/graphs/__init__.py b/scrapegraphai/graphs/__init__.py index 9c8bc820..2c75f0f7 100644 --- a/scrapegraphai/graphs/__init__.py +++ b/scrapegraphai/graphs/__init__.py @@ -26,4 +26,4 @@ from .code_generator_graph import CodeGeneratorGraph from .depth_search_graph import DepthSearchGraph from .smart_scraper_multi_lite_graph import SmartScraperMultiLiteGraph -from .scrape_graph import ScrapeGraph +from .smart_scraper_lite_graph import SmartScraperLiteGraph diff --git a/scrapegraphai/graphs/scrape_graph.py b/scrapegraphai/graphs/smart_scraper_lite_graph.py similarity index 92% rename from scrapegraphai/graphs/scrape_graph.py rename to scrapegraphai/graphs/smart_scraper_lite_graph.py index a08149aa..77437145 100644 --- a/scrapegraphai/graphs/scrape_graph.py +++ b/scrapegraphai/graphs/smart_scraper_lite_graph.py @@ -10,9 +10,9 @@ ParseNode, ) -class ScrapeGraph(AbstractGraph): +class SmartScraperLiteGraph(AbstractGraph): """ - ScrapeGraph is a scraping pipeline that automates the process of + SmartScraperLiteGraph is a scraping pipeline that automates the process of extracting information from web pages. Attributes: @@ -30,7 +30,7 @@ class ScrapeGraph(AbstractGraph): schema (BaseModel): The schema for the graph output. Example: - >>> scraper = ScraperGraph( + >>> scraper = SmartScraperLiteGraph( ... "https://en.wikipedia.org/wiki/Chioggia", ... {"llm": {"model": "openai/gpt-3.5-turbo"}} ... 
) @@ -38,7 +38,8 @@ class ScrapeGraph(AbstractGraph): ) """ - def __init__(self, source: str, config: dict, prompt: str = "", schema: Optional[BaseModel] = None): + def __init__(self, source: str, config: dict, prompt: str = "", + schema: Optional[BaseModel] = None): super().__init__(prompt, config, source, schema) self.input_key = "url" if source.startswith("http") else "local_dir" diff --git a/scrapegraphai/graphs/smart_scraper_multi_lite_graph.py b/scrapegraphai/graphs/smart_scraper_multi_lite_graph.py index 14e576d9..bb17bd03 100644 --- a/scrapegraphai/graphs/smart_scraper_multi_lite_graph.py +++ b/scrapegraphai/graphs/smart_scraper_multi_lite_graph.py @@ -6,7 +6,7 @@ from pydantic import BaseModel from .base_graph import BaseGraph from .abstract_graph import AbstractGraph -from .scrape_graph import ScrapeGraph +from .smart_scraper_lite_graph import SmartScraperLiteGraph from ..nodes import ( GraphIteratorNode, MergeAnswersNode, @@ -63,7 +63,7 @@ def _create_graph(self) -> BaseGraph: input="user_prompt & urls", output=["parsed_doc"], node_config={ - "graph_instance": ScrapeGraph, + "graph_instance": SmartScraperLiteGraph, "scraper_config": self.copy_config, }, schema=self.copy_schema From 3d6bbcdaa3828ff257adb22f2f7c1a46343de5b5 Mon Sep 17 00:00:00 2001 From: semantic-release-bot Date: Mon, 21 Oct 2024 08:14:25 +0000 Subject: [PATCH 27/39] ci(release): 1.27.0-beta.4 [skip ci] ## [1.27.0-beta.4](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.27.0-beta.3...v1.27.0-beta.4) (2024-10-21) ### Features * refactoring of ScrapeGraph to SmartScraperLiteGraph ([52b6bf5](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/52b6bf5fb8c570aa8ef026916230c5d52996f887)) --- CHANGELOG.md | 7 +++++++ pyproject.toml | 2 +- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 6c029ea3..1c2d2eeb 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,10 @@ +## [1.27.0-beta.4](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.27.0-beta.3...v1.27.0-beta.4) (2024-10-21) + + +### Features + +* refactoring of ScrapeGraph to SmartScraperLiteGraph ([52b6bf5](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/52b6bf5fb8c570aa8ef026916230c5d52996f887)) + ## [1.27.0-beta.3](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.27.0-beta.2...v1.27.0-beta.3) (2024-10-20) diff --git a/pyproject.toml b/pyproject.toml index 85d7b442..912533e2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,7 +1,7 @@ [project] name = "scrapegraphai" -version = "1.27.0b3" +version = "1.27.0b4" description = "A web scraping library based on LangChain which uses LLM and direct graph logic to create scraping pipelines." 
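Beyond the rename, the GraphIteratorNode wiring in the diff above shows the general extension point used by the multi graphs: the node spawns one sub-graph per URL from the class passed as graph_instance. A sketch of that wiring under the same node_config keys shown in the diff; the inline LLM config is illustrative:

from scrapegraphai.nodes import GraphIteratorNode
from scrapegraphai.graphs import SmartScraperLiteGraph

# One SmartScraperLiteGraph instance is created per URL in the state;
# "scraper_config" is the config dict forwarded to each sub-graph.
graph_iterator_node = GraphIteratorNode(
    input="user_prompt & urls",
    output=["parsed_doc"],
    node_config={
        "graph_instance": SmartScraperLiteGraph,
        "scraper_config": {"llm": {"model": "openai/gpt-4o"}},  # illustrative
    },
)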
From 0ea00c078f2811f0d1b356bd84cafde80763c703 Mon Sep 17 00:00:00 2001 From: Marco Vinciguerra Date: Mon, 21 Oct 2024 10:30:21 +0200 Subject: [PATCH 28/39] feat: refactoring of export functions --- scrapegraphai/utils/__init__.py | 3 +- scrapegraphai/utils/convert_to_csv.py | 55 -------------------------- scrapegraphai/utils/convert_to_json.py | 52 ------------------------ 3 files changed, 1 insertion(+), 109 deletions(-) delete mode 100644 scrapegraphai/utils/convert_to_csv.py delete mode 100644 scrapegraphai/utils/convert_to_json.py diff --git a/scrapegraphai/utils/__init__.py b/scrapegraphai/utils/__init__.py index d5badca9..22f6a4bc 100644 --- a/scrapegraphai/utils/__init__.py +++ b/scrapegraphai/utils/__init__.py @@ -1,8 +1,6 @@ """ __init__.py file for utils folder """ -from .convert_to_csv import convert_to_csv -from .convert_to_json import convert_to_json from .prettify_exec_info import prettify_exec_info from .proxy_rotation import Proxy, parse_or_search_proxy, search_proxy_servers from .save_audio_from_bytes import save_audio_from_bytes @@ -28,3 +26,4 @@ validation_focused_code_generation, semantic_focused_code_generation) from .save_code_to_file import save_code_to_file +from .data_export import export_to_json, export_to_csv, export_to_xml diff --git a/scrapegraphai/utils/convert_to_csv.py b/scrapegraphai/utils/convert_to_csv.py deleted file mode 100644 index e0664541..00000000 --- a/scrapegraphai/utils/convert_to_csv.py +++ /dev/null @@ -1,55 +0,0 @@ -""" -Module that given a filename and a position saves the file in the csv format -""" -import os -import sys -import pandas as pd - -def convert_to_csv(data: dict, filename: str, position: str = None) -> None: - """ - Converts a dictionary to a CSV file and saves it at a specified location. - - Args: - data (dict): The data to be converted into CSV format. - filename (str): The name of the output CSV file, without the '.csv' extension. - position (str, optional): The file path where the CSV should be saved. - Defaults to the directory of the caller script if not provided. - - Returns: - None: The function does not return anything. - - Raises: - FileNotFoundError: If the specified directory does not exist. - PermissionError: If write permissions are lacking for the directory. - TypeError: If `data` is not a dictionary. - Exception: For other issues that may arise during the creation or saving of the CSV file. - - Example: - >>> convert_to_csv({'id': [1, 2], 'value': [10, 20]}, 'output', '/path/to/save') - Saves a CSV file named 'output.csv' at '/path/to/save'. 
- """ - - if ".csv" in filename: - filename = filename.replace(".csv", "") - - if position is None: - caller_dir = os.path.dirname(os.path.abspath(sys.argv[0])) - position = caller_dir - - try: - if not isinstance(data, dict): - raise TypeError("Input data must be a dictionary") - - os.makedirs(position, exist_ok=True) - - df = pd.DataFrame.from_dict(data, orient='index') - df.to_csv(os.path.join(position, f"{filename}.csv"), index=False) - - except FileNotFoundError as fnfe: - raise FileNotFoundError( - f"The specified directory '{position}' does not exist.") from fnfe - except PermissionError as pe: - raise PermissionError( - f"You don't have permission to write to '{position}'.") from pe - except Exception as e: - raise e diff --git a/scrapegraphai/utils/convert_to_json.py b/scrapegraphai/utils/convert_to_json.py deleted file mode 100644 index 4e1711f1..00000000 --- a/scrapegraphai/utils/convert_to_json.py +++ /dev/null @@ -1,52 +0,0 @@ -""" -Convert to json module -""" -import json -import os -import sys - -def convert_to_json(data: dict, filename: str, position: str = None) -> None: - """ - Converts a dictionary to a JSON file and saves it at a specified location. - - Args: - data (dict): The data to be converted into JSON format. - filename (str): The name of the output JSON file, without the '.json' extension. - position (str, optional): The file path where the JSON file should be saved. - Defaults to the directory of the caller script if not provided. - - Returns: - None: The function does not return anything. - - Raises: - ValueError: If 'filename' contains '.json'. - FileNotFoundError: If the specified directory does not exist. - PermissionError: If write permissions are lacking for the directory. - - Example: - >>> convert_to_json({'id': [1, 2], 'value': [10, 20]}, 'output', '/path/to/save') - Saves a JSON file named 'output.json' at '/path/to/save'. - - Notes: - This function automatically ensures the directory exists before - attempting to write the file. - If the directory does not exist, it will attempt to create it. 
- """ - - if ".json" in filename: - filename = filename.replace(".json", "") # Remove .json extension - - if position is None: - caller_dir = os.path.dirname(os.path.abspath(sys.argv[0])) - position = caller_dir - - try: - os.makedirs(position, exist_ok=True) - with open(os.path.join(position, f"{filename}.json"), "w", encoding="utf-8") as f: - f.write(json.dumps(data)) - except FileNotFoundError as fnfe: - raise FileNotFoundError( - f"The specified directory '{position}' does not exist.") from fnfe - except PermissionError as pe: - raise PermissionError( - f"You don't have permission to write to '{position}'.") from pe From 5002c713d5a76b2c2e4313f888d9768e3f3142e1 Mon Sep 17 00:00:00 2001 From: semantic-release-bot Date: Tue, 22 Oct 2024 07:06:26 +0000 Subject: [PATCH 29/39] ci(release): 1.27.0-beta.5 [skip ci] ## [1.27.0-beta.5](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.27.0-beta.4...v1.27.0-beta.5) (2024-10-22) ### Features * refactoring of export functions ([0ea00c0](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/0ea00c078f2811f0d1b356bd84cafde80763c703)) --- CHANGELOG.md | 7 +++++++ pyproject.toml | 2 +- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 1c2d2eeb..8d4aea50 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,10 @@ +## [1.27.0-beta.5](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.27.0-beta.4...v1.27.0-beta.5) (2024-10-22) + + +### Features + +* refactoring of export functions ([0ea00c0](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/0ea00c078f2811f0d1b356bd84cafde80763c703)) + ## [1.27.0-beta.4](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.27.0-beta.3...v1.27.0-beta.4) (2024-10-21) diff --git a/pyproject.toml b/pyproject.toml index 912533e2..b006de1a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,7 +1,7 @@ [project] name = "scrapegraphai" -version = "1.27.0b4" +version = "1.27.0b5" description = "A web scraping library based on LangChain which uses LLM and direct graph logic to create scraping pipelines." 
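The removed convert_to_csv and convert_to_json helpers are superseded by the new data_export module, which the utils package now re-exports as export_to_json, export_to_csv, and export_to_xml. A usage sketch, assuming the new functions keep a (data, filename)-style signature similar to the removed converters; the exact parameters live in scrapegraphai/utils/data_export.py:

from scrapegraphai.utils import export_to_csv, export_to_json, export_to_xml

result = {"title": "Example page", "links": ["https://example.com"]}

# The signatures below are assumptions mirroring the removed
# convert_to_* helpers; check data_export.py for the real ones.
export_to_json(result, "output.json")
export_to_csv([result], "output.csv")  # assuming CSV export takes a list of row dicts
export_to_xml(result, "output.xml")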
From ae275ec5e86c0bb8fdbeadc2e5f69816d1dea635 Mon Sep 17 00:00:00 2001 From: Marco Vinciguerra Date: Wed, 23 Oct 2024 12:08:00 +0200 Subject: [PATCH 30/39] feat: add integration with scrape.do --- scrapegraphai/nodes/fetch_node.py | 6 +++--- scrapegraphai/nodes/fetch_node_level_k.py | 6 ++++++ 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/scrapegraphai/nodes/fetch_node.py b/scrapegraphai/nodes/fetch_node.py index 4cd549a5..d90864e9 100644 --- a/scrapegraphai/nodes/fetch_node.py +++ b/scrapegraphai/nodes/fetch_node.py @@ -270,10 +270,10 @@ def handle_web_source(self, state, source): else: loader_kwargs = {} - if self.node_config is not None: + if self.node_config: loader_kwargs = self.node_config.get("loader_kwargs", {}) - if self.browser_base is not None: + if self.browser_base: try: from ..docloaders.browser_base import browser_base_fetch except ImportError: @@ -285,7 +285,7 @@ def handle_web_source(self, state, source): document = [Document(page_content=content, metadata={"source": source}) for content in data] - elif self.scrape_do is not None: + elif self.scrape_do: from ..docloaders.scrape_do import scrape_do_fetch if (self.scrape_do.get("use_proxy") is None) or \ self.scrape_do.get("geoCode") is None or \ diff --git a/scrapegraphai/nodes/fetch_node_level_k.py b/scrapegraphai/nodes/fetch_node_level_k.py index 0f772edf..ce8e4042 100644 --- a/scrapegraphai/nodes/fetch_node_level_k.py +++ b/scrapegraphai/nodes/fetch_node_level_k.py @@ -57,6 +57,7 @@ def __init__( self.headless = node_config.get("headless", True) if node_config else True self.loader_kwargs = node_config.get("loader_kwargs", {}) if node_config else {} self.browser_base = node_config.get("browser_base", None) + self.scrape_do = node_config.get("scrape_do", None) self.depth = node_config.get("depth", 1) if node_config else 1 self.only_inside_links = node_config.get("only_inside_links", False) if node_config else False self.min_input_len = 1 @@ -115,6 +116,11 @@ def fetch_content(self, source: str, loader_kwargs) -> Optional[str]: self.browser_base.get("project_id"), [source]) document = [Document(page_content=content, metadata={"source": source}) for content in data] + elif self.scrape_do: + from ..docloaders.scrape_do import scrape_do_fetch + data = scrape_do_fetch(self.scrape_do.get("api_key"), source) + document = [Document(page_content=data, + metadata={"source": source})] else: loader = ChromiumLoader([source], headless=self.headless, **loader_kwargs) document = loader.load() From 94b9836ef6cd9c24bb8c04d7049d5477cc8ed807 Mon Sep 17 00:00:00 2001 From: semantic-release-bot Date: Wed, 23 Oct 2024 10:09:36 +0000 Subject: [PATCH 31/39] ci(release): 1.27.0-beta.6 [skip ci] ## [1.27.0-beta.6](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.27.0-beta.5...v1.27.0-beta.6) (2024-10-23) ### Features * add integration with scrape.do ([ae275ec](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/ae275ec5e86c0bb8fdbeadc2e5f69816d1dea635)) --- CHANGELOG.md | 7 +++++++ pyproject.toml | 2 +- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8d4aea50..a3615122 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,10 @@ +## [1.27.0-beta.6](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.27.0-beta.5...v1.27.0-beta.6) (2024-10-23) + + +### Features + +* add integration with scrape.do ([ae275ec](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/ae275ec5e86c0bb8fdbeadc2e5f69816d1dea635)) + ## 
[1.27.0-beta.5](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.27.0-beta.4...v1.27.0-beta.5) (2024-10-22) diff --git a/pyproject.toml b/pyproject.toml index b006de1a..b3fc2f11 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,7 +1,7 @@ [project] name = "scrapegraphai" -version = "1.27.0b5" +version = "1.27.0b6" description = "A web scraping library based on LangChain which uses LLM and direct graph logic to create scraping pipelines." From f658092dffb20ea111cc00950f617057482788f4 Mon Sep 17 00:00:00 2001 From: Marco Vinciguerra Date: Wed, 23 Oct 2024 12:15:16 +0200 Subject: [PATCH 32/39] feat: refactoring of get_probable_tags node --- scrapegraphai/nodes/get_probable_tags_node.py | 10 ++-------- scrapegraphai/prompts/__init__.py | 1 + .../prompts/get_probable_tags_node_prompts.py | 12 ++++++++++++ 3 files changed, 15 insertions(+), 8 deletions(-) create mode 100644 scrapegraphai/prompts/get_probable_tags_node_prompts.py diff --git a/scrapegraphai/nodes/get_probable_tags_node.py b/scrapegraphai/nodes/get_probable_tags_node.py index 9ba38283..e34bbbb4 100644 --- a/scrapegraphai/nodes/get_probable_tags_node.py +++ b/scrapegraphai/nodes/get_probable_tags_node.py @@ -4,6 +4,7 @@ from typing import List, Optional from langchain.output_parsers import CommaSeparatedListOutputParser from langchain.prompts import PromptTemplate +from ..prompts import TEMPLATE_GET_PROBABLE_TAGS from ..utils.logging import get_logger from .base_node import BaseNode @@ -68,14 +69,7 @@ def execute(self, state: dict) -> dict: output_parser = CommaSeparatedListOutputParser() format_instructions = output_parser.get_format_instructions() - template = """ - PROMPT: - You are a website scraper that knows all the types of html tags. - You are now asked to list all the html tags where you think you can find the information of the asked question.\n - INSTRUCTIONS: {format_instructions} \n - WEBPAGE: The webpage is: {webpage} \n - QUESTION: The asked question is the following: {question} - """ + template = TEMPLATE_GET_PROBABLE_TAGS tag_prompt = PromptTemplate( template=template, diff --git a/scrapegraphai/prompts/__init__.py b/scrapegraphai/prompts/__init__.py index ea916842..15889108 100644 --- a/scrapegraphai/prompts/__init__.py +++ b/scrapegraphai/prompts/__init__.py @@ -36,3 +36,4 @@ from .reasoning_node_prompts import (TEMPLATE_REASONING, TEMPLATE_REASONING_WITH_CONTEXT) from .merge_generated_scripts_prompts import TEMPLATE_MERGE_SCRIPTS_PROMPT +from .get_probable_tags_node_prompts import TEMPLATE_GET_PROBABLE_TAGS diff --git a/scrapegraphai/prompts/get_probable_tags_node_prompts.py b/scrapegraphai/prompts/get_probable_tags_node_prompts.py new file mode 100644 index 00000000..ed86e163 --- /dev/null +++ b/scrapegraphai/prompts/get_probable_tags_node_prompts.py @@ -0,0 +1,12 @@ +""" +Get probable tags node prompts +""" + +TEMPLATE_GET_PROBABLE_TAGS = """ + PROMPT: + You are a website scraper that knows all the types of html tags. 
+ You are now asked to list all the html tags where you think you can find the information of the asked question.\n + INSTRUCTIONS: {format_instructions} \n + WEBPAGE: The webpage is: {webpage} \n + QUESTION: The asked question is the following: {question} +""" From 407f1ce4eb22fb284ef0624dd3f7bf7ba432fa5c Mon Sep 17 00:00:00 2001 From: semantic-release-bot Date: Thu, 24 Oct 2024 06:45:14 +0000 Subject: [PATCH 33/39] ci(release): 1.27.0-beta.7 [skip ci] ## [1.27.0-beta.7](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.27.0-beta.6...v1.27.0-beta.7) (2024-10-24) ### Features * refactoring of get_probable_tags node ([f658092](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/f658092dffb20ea111cc00950f617057482788f4)) --- CHANGELOG.md | 7 +++++++ pyproject.toml | 2 +- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index a3615122..346cf772 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,10 @@ +## [1.27.0-beta.7](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.27.0-beta.6...v1.27.0-beta.7) (2024-10-24) + + +### Features + +* refactoring of get_probable_tags node ([f658092](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/f658092dffb20ea111cc00950f617057482788f4)) + ## [1.27.0-beta.6](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.27.0-beta.5...v1.27.0-beta.6) (2024-10-23) diff --git a/pyproject.toml b/pyproject.toml index b3fc2f11..0fab27b6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,7 +1,7 @@ [project] name = "scrapegraphai" -version = "1.27.0b6" +version = "1.27.0b7" description = "A web scraping library based on LangChain which uses LLM and direct graph logic to create scraping pipelines." From 4f1ed939e671e46bb546b6b605db87e87c0d66ee Mon Sep 17 00:00:00 2001 From: semantic-release-bot Date: Thu, 24 Oct 2024 06:55:58 +0000 Subject: [PATCH 34/39] ci(release): 1.27.0-beta.8 [skip ci] ## [1.27.0-beta.8](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.27.0-beta.7...v1.27.0-beta.8) (2024-10-24) ### Bug Fixes * removed tokenizer ([a184716](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/a18471688f0b79f06fb7078b01b68eeddc88eae4)) ### CI * **release:** 1.26.7 [skip ci] ([ec9ef2b](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/ec9ef2bcda9aa81f66b943829fcdb22fe265976e)) --- CHANGELOG.md | 12 ++++++++++++ pyproject.toml | 2 +- 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index b908800e..71c7f6dd 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,15 @@ +## [1.27.0-beta.8](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.27.0-beta.7...v1.27.0-beta.8) (2024-10-24) + + +### Bug Fixes + +* removed tokenizer ([a184716](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/a18471688f0b79f06fb7078b01b68eeddc88eae4)) + + +### CI + +* **release:** 1.26.7 [skip ci] ([ec9ef2b](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/ec9ef2bcda9aa81f66b943829fcdb22fe265976e)) + ## [1.27.0-beta.7](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.27.0-beta.6...v1.27.0-beta.7) (2024-10-24) diff --git a/pyproject.toml b/pyproject.toml index 553c574c..e12c8ff7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,7 +1,7 @@ [project] name = "scrapegraphai" -version = "1.27.0b7" +version = "1.27.0b8" From 51c55eb3a2984ba60572edbcdea4c30620e18d76 Mon Sep 17 00:00:00 2001 From: Marco Vinciguerra Date: Thu, 24 Oct 2024 09:10:51 +0200 Subject: [PATCH 35/39] feat: add model integration gpt4 --- 
scrapegraphai/nodes/generate_answer_from_image_node.py | 4 ++-- scrapegraphai/prompts/description_node_prompts.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/scrapegraphai/nodes/generate_answer_from_image_node.py b/scrapegraphai/nodes/generate_answer_from_image_node.py index 7134cabe..9359b2bb 100644 --- a/scrapegraphai/nodes/generate_answer_from_image_node.py +++ b/scrapegraphai/nodes/generate_answer_from_image_node.py @@ -71,10 +71,10 @@ async def execute_async(self, state: dict) -> dict: images = state.get('screenshots', []) analyses = [] - supported_models = ("gpt-4o", "gpt-4o-mini", "gpt-4-turbo") + supported_models = ("gpt-4o", "gpt-4o-mini", "gpt-4-turbo", "gpt-4") if self.node_config["config"]["llm"]["model"].split("/")[-1]not in supported_models: - raise ValueError(f"""Model '{self.node_config['config']['llm']['model']}' + raise ValueError(f"""The model provided is not supported. Supported models are: {', '.join(supported_models)}.""") diff --git a/scrapegraphai/prompts/description_node_prompts.py b/scrapegraphai/prompts/description_node_prompts.py index 86264d0b..944ed24e 100644 --- a/scrapegraphai/prompts/description_node_prompts.py +++ b/scrapegraphai/prompts/description_node_prompts.py @@ -7,4 +7,4 @@ following content from a website. \n Please provide a description summary of maximum of 20 words. \n CONTENT OF THE WEBSITE: {content} -""" \ No newline at end of file +""" From c8a000f1d943734a921b34e91498b2f29c8c9422 Mon Sep 17 00:00:00 2001 From: Marco Vinciguerra Date: Thu, 24 Oct 2024 10:11:36 +0200 Subject: [PATCH 36/39] fix: fix export function --- scrapegraphai/utils/data_export.py | 53 ++++++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) create mode 100644 scrapegraphai/utils/data_export.py diff --git a/scrapegraphai/utils/data_export.py b/scrapegraphai/utils/data_export.py new file mode 100644 index 00000000..afa05af4 --- /dev/null +++ b/scrapegraphai/utils/data_export.py @@ -0,0 +1,53 @@ +import json +import csv +import xml.etree.ElementTree as ET +from typing import List, Dict, Any + +def export_to_json(data: List[Dict[str, Any]], filename: str) -> None: + """ + Export data to a JSON file. + + :param data: List of dictionaries containing the data to export + :param filename: Name of the file to save the JSON data + """ + with open(filename, 'w', encoding='utf-8') as f: + json.dump(data, f, ensure_ascii=False, indent=4) + print(f"Data exported to {filename}") + +def export_to_csv(data: List[Dict[str, Any]], filename: str) -> None: + """ + Export data to a CSV file. + + :param data: List of dictionaries containing the data to export + :param filename: Name of the file to save the CSV data + """ + if not data: + print("No data to export") + return + + keys = data[0].keys() + with open(filename, 'w', newline='', encoding='utf-8') as f: + writer = csv.DictWriter(f, fieldnames=keys) + writer.writeheader() + writer.writerows(data) + print(f"Data exported to {filename}") + +def export_to_xml(data: List[Dict[str, Any]], filename: str, root_element: str = "data") -> None: + """ + Export data to an XML file. 
+ + :param data: List of dictionaries containing the data to export + :param filename: Name of the file to save the XML data + :param root_element: Name of the root element in the XML structure + """ + root = ET.Element(root_element) + for item in data: + element = ET.SubElement(root, "item") + for key, value in item.items(): + sub_element = ET.SubElement(element, key) + sub_element.text = str(value) + + tree = ET.ElementTree(root) + tree.write(filename, encoding='utf-8', xml_declaration=True) + print(f"Data exported to {filename}") + From 6179ab99a4803c1d086848d72d5966bd184e3087 Mon Sep 17 00:00:00 2001 From: Marco Vinciguerra Date: Thu, 24 Oct 2024 15:20:36 +0200 Subject: [PATCH 37/39] Update data_export.py --- scrapegraphai/utils/data_export.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/scrapegraphai/utils/data_export.py b/scrapegraphai/utils/data_export.py index afa05af4..fbff45e2 100644 --- a/scrapegraphai/utils/data_export.py +++ b/scrapegraphai/utils/data_export.py @@ -1,3 +1,7 @@ +""" +data_export module +This module provides functions to export data to various file formats. +""" import json import csv import xml.etree.ElementTree as ET From fd57cc7c126658960e33b7214c2cc656ea032d8f Mon Sep 17 00:00:00 2001 From: semantic-release-bot Date: Thu, 24 Oct 2024 22:39:44 +0000 Subject: [PATCH 38/39] ci(release): 1.27.0-beta.9 [skip ci] ## [1.27.0-beta.9](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.27.0-beta.8...v1.27.0-beta.9) (2024-10-24) ### Features * add model integration gpt4 ([51c55eb](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/51c55eb3a2984ba60572edbcdea4c30620e18d76)) --- CHANGELOG.md | 7 +++++++ pyproject.toml | 2 +- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 71c7f6dd..abeac5ec 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,10 @@ +## [1.27.0-beta.9](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.27.0-beta.8...v1.27.0-beta.9) (2024-10-24) + + +### Features + +* add model integration gpt4 ([51c55eb](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/51c55eb3a2984ba60572edbcdea4c30620e18d76)) + ## [1.27.0-beta.8](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.27.0-beta.7...v1.27.0-beta.8) (2024-10-24) diff --git a/pyproject.toml b/pyproject.toml index e12c8ff7..539ef425 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,7 +1,7 @@ [project] name = "scrapegraphai" -version = "1.27.0b8" +version = "1.27.0b9" From eee131e959a36a4471f72610eefbc1764808b6be Mon Sep 17 00:00:00 2001 From: semantic-release-bot Date: Fri, 25 Oct 2024 06:45:23 +0000 Subject: [PATCH 39/39] ci(release): 1.27.0-beta.10 [skip ci] ## [1.27.0-beta.10](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.27.0-beta.9...v1.27.0-beta.10) (2024-10-25) ### Bug Fixes * fix export function ([c8a000f](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/c8a000f1d943734a921b34e91498b2f29c8c9422)) --- CHANGELOG.md | 7 +++++++ pyproject.toml | 2 +- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index abeac5ec..58aba1fb 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,10 @@ +## [1.27.0-beta.10](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.27.0-beta.9...v1.27.0-beta.10) (2024-10-25) + + +### Bug Fixes + +* fix export function ([c8a000f](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/c8a000f1d943734a921b34e91498b2f29c8c9422)) + ## 
[1.27.0-beta.9](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.27.0-beta.8...v1.27.0-beta.9) (2024-10-24) diff --git a/pyproject.toml b/pyproject.toml index 539ef425..be705469 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,7 +1,7 @@ [project] name = "scrapegraphai" -version = "1.27.0b9" +version = "1.27.0b10"
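
The series closes with the beta.10 release packaging the data_export fix.
A short usage sketch of the three helpers added in c8a000f: the import
path mirrors the file created by the patch, and the sample records are
illustrative, not taken from the library.

    from scrapegraphai.utils.data_export import (
        export_to_json,
        export_to_csv,
        export_to_xml,
    )

    # Illustrative records: any list of flat dictionaries works, since
    # export_to_csv derives its header from the first item's keys and
    # export_to_xml turns each key into a child element of <item>.
    records = [
        {"title": "Example A", "url": "https://example.com/a"},
        {"title": "Example B", "url": "https://example.com/b"},
    ]

    export_to_json(records, "results.json")
    export_to_csv(records, "results.csv")
    export_to_xml(records, "results.xml", root_element="pages")

Each helper writes UTF-8 output and prints a "Data exported to ..."
confirmation, matching the implementations in the diff above.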