diff --git a/docs/source/conf.py b/docs/source/conf.py index 43c849c4..9fc3aec7 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -36,4 +36,7 @@ "source_repository": "https://github.com/VinciGit00/Scrapegraph-ai/", "source_branch": "main", "source_directory": "docs/source/", -} \ No newline at end of file + 'navigation_with_keys': True, + 'sidebar_hide_name': False, +} + diff --git a/docs/source/index.rst b/docs/source/index.rst index e49f54a9..acc0db73 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -22,9 +22,6 @@ :caption: Scrapers scrapers/graphs - scrapers/llm - scrapers/graph_config - scrapers/benchmarks .. toctree:: :maxdepth: 2 diff --git a/docs/source/scrapers/graphs.rst b/docs/source/scrapers/graphs.rst index 892a4ef1..ee5f072f 100644 --- a/docs/source/scrapers/graphs.rst +++ b/docs/source/scrapers/graphs.rst @@ -3,224 +3,11 @@ Graphs Graphs are scraping pipelines aimed at solving specific tasks. They are composed by nodes which can be configured individually to address different aspects of the task (fetching data, extracting information, etc.). -There are several types of graphs available in the library, each with its own purpose and functionality. The most common ones are: - -- **SmartScraperGraph**: one-page scraper that requires a user-defined prompt and a URL (or local file) to extract information using LLM. -- **SearchGraph**: multi-page scraper that only requires a user-defined prompt to extract information from a search engine using LLM. It is built on top of SmartScraperGraph. -- **SpeechGraph**: text-to-speech pipeline that generates an answer as well as a requested audio file. It is built on top of SmartScraperGraph and requires a user-defined prompt and a URL (or local file). -- **ScriptCreatorGraph**: script generator that creates a Python script to scrape a website using the specified library (e.g. BeautifulSoup). It requires a user-defined prompt and a URL (or local file). 
- -There are also two additional graphs that can handle multiple sources: - -- **SmartScraperMultiGraph**: similar to `SmartScraperGraph`, but with the ability to handle multiple sources. -- **ScriptCreatorMultiGraph**: similar to `ScriptCreatorGraph`, but with the ability to handle multiple sources. - -With the introduction of `GPT-4o`, two new powerful graphs have been created: - -- **OmniScraperGraph**: similar to `SmartScraperGraph`, but with the ability to scrape images and describe them. -- **OmniSearchGraph**: similar to `SearchGraph`, but with the ability to scrape images and describe them. - - -.. note:: - - They all use a graph configuration to set up LLM models and other parameters. To find out more about the configurations, check the :ref:`LLM` and :ref:`Configuration` sections. - - -.. note:: - - We can pass an optional `schema` parameter to the graph constructor to specify the output schema. If not provided or set to `None`, the schema will be generated by the LLM itself. - -OmniScraperGraph -^^^^^^^^^^^^^^^^ - -.. image:: ../../assets/omniscrapergraph.png - :align: center - :width: 90% - :alt: OmniScraperGraph -| - -First we define the graph configuration, which includes the LLM model and other parameters. Then we create an instance of the OmniScraperGraph class, passing the prompt, source, and configuration as arguments. Finally, we run the graph and print the result. -It will fetch the data from the source and extract the information based on the prompt in JSON format. - -.. code-block:: python - - from scrapegraphai.graphs import OmniScraperGraph - - graph_config = { - "llm": {...}, - } - - omni_scraper_graph = OmniScraperGraph( - prompt="List me all the projects with their titles and image links and descriptions.", - source="https://perinim.github.io/projects", - config=graph_config, - schema=schema - ) - - result = omni_scraper_graph.run() - print(result) - -OmniSearchGraph -^^^^^^^^^^^^^^^ - -.. 
image:: ../../assets/omnisearchgraph.png - :align: center - :width: 80% - :alt: OmniSearchGraph -| - -Similar to OmniScraperGraph, we define the graph configuration, create multiple of the OmniSearchGraph class, and run the graph. -It will create a search query, fetch the first n results from the search engine, run n OmniScraperGraph instances, and return the results in JSON format. - -.. code-block:: python - - from scrapegraphai.graphs import OmniSearchGraph - - graph_config = { - "llm": {...}, - } - - # Create the OmniSearchGraph instance - omni_search_graph = OmniSearchGraph( - prompt="List me all Chioggia's famous dishes and describe their pictures.", - config=graph_config, - schema=schema - ) - - # Run the graph - result = omni_search_graph.run() - print(result) - -SmartScraperGraph & SmartScraperMultiGraph -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -.. image:: ../../assets/smartscrapergraph.png - :align: center - :width: 90% - :alt: SmartScraperGraph -| - -First we define the graph configuration, which includes the LLM model and other parameters. Then we create an instance of the SmartScraperGraph class, passing the prompt, source, and configuration as arguments. Finally, we run the graph and print the result. -It will fetch the data from the source and extract the information based on the prompt in JSON format. - -.. code-block:: python - - from scrapegraphai.graphs import SmartScraperGraph - - graph_config = { - "llm": {...}, - } - - smart_scraper_graph = SmartScraperGraph( - prompt="List me all the projects with their descriptions", - source="https://perinim.github.io/projects", - config=graph_config, - schema=schema - ) - - result = smart_scraper_graph.run() - print(result) - -**SmartScraperMultiGraph** is similar to SmartScraperGraph, but it can handle multiple sources. We define the graph configuration, create an instance of the SmartScraperMultiGraph class, and run the graph. - -SearchGraph -^^^^^^^^^^^ - -.. 
image:: ../../assets/searchgraph.png - :align: center - :width: 80% - :alt: SearchGraph -| - -Similar to SmartScraperGraph, we define the graph configuration, create an instance of the SearchGraph class, and run the graph. -It will create a search query, fetch the first n results from the search engine, run n SmartScraperGraph instances, and return the results in JSON format. - - -.. code-block:: python - - from scrapegraphai.graphs import SearchGraph - - graph_config = { - "llm": {...}, - "embeddings": {...}, - } - - # Create the SearchGraph instance - search_graph = SearchGraph( - prompt="List me all the traditional recipes from Chioggia", - config=graph_config, - schema=schema - ) - - # Run the graph - result = search_graph.run() - print(result) - - -SpeechGraph -^^^^^^^^^^^ - -.. image:: ../../assets/speechgraph.png - :align: center - :width: 90% - :alt: SpeechGraph -| - -Similar to SmartScraperGraph, we define the graph configuration, create an instance of the SpeechGraph class, and run the graph. -It will fetch the data from the source, extract the information based on the prompt, and generate an audio file with the answer, as well as the answer itself, in JSON format. - -.. code-block:: python - - from scrapegraphai.graphs import SpeechGraph - - graph_config = { - "llm": {...}, - "tts_model": {...}, - } - - # ************************************************ - # Create the SpeechGraph instance and run it - # ************************************************ - - speech_graph = SpeechGraph( - prompt="Make a detailed audio summary of the projects.", - source="https://perinim.github.io/projects/", - config=graph_config, - schema=schema - ) - - result = speech_graph.run() - print(result) - - -ScriptCreatorGraph & ScriptCreatorMultiGraph -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -.. 
image:: ../../assets/scriptcreatorgraph.png - :align: center - :width: 90% - :alt: ScriptCreatorGraph - -First we define the graph configuration, which includes the LLM model and other parameters. -Then we create an instance of the ScriptCreatorGraph class, passing the prompt, source, and configuration as arguments. Finally, we run the graph and print the result. - -.. code-block:: python - - from scrapegraphai.graphs import ScriptCreatorGraph - - graph_config = { - "llm": {...}, - "library": "beautifulsoup4" - } - - script_creator_graph = ScriptCreatorGraph( - prompt="Create a Python script to scrape the projects.", - source="https://perinim.github.io/projects/", - config=graph_config, - schema=schema - ) - - result = script_creator_graph.run() - print(result) - -**ScriptCreatorMultiGraph** is similar to ScriptCreatorGraph, but it can handle multiple sources. We define the graph configuration, create an instance of the ScriptCreatorMultiGraph class, and run the graph. +.. toctree:: + :maxdepth: 4 + + types + llm + graph_config + benchmarks + telemetry diff --git a/docs/source/scrapers/telemetry.rst b/docs/source/scrapers/telemetry.rst new file mode 100644 index 00000000..a6598092 --- /dev/null +++ b/docs/source/scrapers/telemetry.rst @@ -0,0 +1,72 @@ +=============== +Usage Analytics +=============== + +ScrapeGraphAI collects **anonymous** usage data by default to improve the library and guide development efforts. + +**Events Captured** + +We capture events in the following scenarios: + +1. When a ``Graph`` finishes running. +2. When an exception is raised in one of the nodes. + +**Data Collected** + +The data captured is limited to: + +- Operating System and Python version +- A persistent UUID to identify the session, stored in ``~/.scrapegraphai.conf`` + +Additionally, the following properties are collected: + +.. 
code-block:: python + + properties = { + "graph_name": graph_name, + "llm_model": llm_model_name, + "embedder_model": embedder_model_name, + "source_type": source_type, + "execution_time": execution_time, + "error_node": error_node_name, + } + +For more details, refer to the `telemetry.py `_ module. + +**Opting Out** + +If you prefer not to participate in telemetry, you can opt out using any of the following methods: + +1. **Programmatically Disable Telemetry**: + + Add the following code at the beginning of your script: + + .. code-block:: python + + from scrapegraphai import telemetry + telemetry.disable_telemetry() + +2. **Configuration File**: + + Set the ``telemetry_enabled`` key to ``false`` in ``~/.scrapegraphai.conf`` under the ``[DEFAULT]`` section: + + .. code-block:: ini + + [DEFAULT] + telemetry_enabled = False + +3. **Environment Variable**: + + - **For a Shell Session**: + + .. code-block:: bash + + export SCRAPEGRAPHAI_TELEMETRY_ENABLED=false + + - **For a Single Command**: + + .. code-block:: bash + + SCRAPEGRAPHAI_TELEMETRY_ENABLED=false python my_script.py + +By following any of these methods, you can easily opt out of telemetry and ensure your usage data is not collected. diff --git a/docs/source/scrapers/types.rst b/docs/source/scrapers/types.rst new file mode 100644 index 00000000..42613066 --- /dev/null +++ b/docs/source/scrapers/types.rst @@ -0,0 +1,225 @@ +Types +===== + + +There are several types of graphs available in the library, each with its own purpose and functionality. The most common ones are: + +- **SmartScraperGraph**: one-page scraper that requires a user-defined prompt and a URL (or local file) to extract information using LLM. +- **SearchGraph**: multi-page scraper that only requires a user-defined prompt to extract information from a search engine using LLM. It is built on top of SmartScraperGraph. +- **SpeechGraph**: text-to-speech pipeline that generates an answer as well as a requested audio file. 
It is built on top of SmartScraperGraph and requires a user-defined prompt and a URL (or local file). +- **ScriptCreatorGraph**: script generator that creates a Python script to scrape a website using the specified library (e.g. BeautifulSoup). It requires a user-defined prompt and a URL (or local file). + +There are also two additional graphs that can handle multiple sources: + +- **SmartScraperMultiGraph**: similar to `SmartScraperGraph`, but with the ability to handle multiple sources. +- **ScriptCreatorMultiGraph**: similar to `ScriptCreatorGraph`, but with the ability to handle multiple sources. + +With the introduction of `GPT-4o`, two new powerful graphs have been created: + +- **OmniScraperGraph**: similar to `SmartScraperGraph`, but with the ability to scrape images and describe them. +- **OmniSearchGraph**: similar to `SearchGraph`, but with the ability to scrape images and describe them. + + +.. note:: + + They all use a graph configuration to set up LLM models and other parameters. To find out more about the configurations, check the :ref:`LLM` and :ref:`Configuration` sections. + + +.. note:: + + We can pass an optional `schema` parameter to the graph constructor to specify the output schema. If not provided or set to `None`, the schema will be generated by the LLM itself. + +OmniScraperGraph +^^^^^^^^^^^^^^^^ + +.. image:: ../../assets/omniscrapergraph.png + :align: center + :width: 90% + :alt: OmniScraperGraph +| + +First we define the graph configuration, which includes the LLM model and other parameters. Then we create an instance of the OmniScraperGraph class, passing the prompt, source, and configuration as arguments. Finally, we run the graph and print the result. +It will fetch the data from the source and extract the information based on the prompt in JSON format. + +.. 
code-block:: python + + from scrapegraphai.graphs import OmniScraperGraph + + graph_config = { + "llm": {...}, + } + + omni_scraper_graph = OmniScraperGraph( + prompt="List me all the projects with their titles and image links and descriptions.", + source="https://perinim.github.io/projects", + config=graph_config, + schema=schema + ) + + result = omni_scraper_graph.run() + print(result) + +OmniSearchGraph +^^^^^^^^^^^^^^^ + +.. image:: ../../assets/omnisearchgraph.png + :align: center + :width: 80% + :alt: OmniSearchGraph +| + +Similar to OmniScraperGraph, we define the graph configuration, create multiple of the OmniSearchGraph class, and run the graph. +It will create a search query, fetch the first n results from the search engine, run n OmniScraperGraph instances, and return the results in JSON format. + +.. code-block:: python + + from scrapegraphai.graphs import OmniSearchGraph + + graph_config = { + "llm": {...}, + } + + # Create the OmniSearchGraph instance + omni_search_graph = OmniSearchGraph( + prompt="List me all Chioggia's famous dishes and describe their pictures.", + config=graph_config, + schema=schema + ) + + # Run the graph + result = omni_search_graph.run() + print(result) + +SmartScraperGraph & SmartScraperMultiGraph +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. image:: ../../assets/smartscrapergraph.png + :align: center + :width: 90% + :alt: SmartScraperGraph +| + +First we define the graph configuration, which includes the LLM model and other parameters. Then we create an instance of the SmartScraperGraph class, passing the prompt, source, and configuration as arguments. Finally, we run the graph and print the result. +It will fetch the data from the source and extract the information based on the prompt in JSON format. + +.. 
code-block:: python + + from scrapegraphai.graphs import SmartScraperGraph + + graph_config = { + "llm": {...}, + } + + smart_scraper_graph = SmartScraperGraph( + prompt="List me all the projects with their descriptions", + source="https://perinim.github.io/projects", + config=graph_config, + schema=schema + ) + + result = smart_scraper_graph.run() + print(result) + +**SmartScraperMultiGraph** is similar to SmartScraperGraph, but it can handle multiple sources. We define the graph configuration, create an instance of the SmartScraperMultiGraph class, and run the graph. + +SearchGraph +^^^^^^^^^^^ + +.. image:: ../../assets/searchgraph.png + :align: center + :width: 80% + :alt: SearchGraph +| + +Similar to SmartScraperGraph, we define the graph configuration, create an instance of the SearchGraph class, and run the graph. +It will create a search query, fetch the first n results from the search engine, run n SmartScraperGraph instances, and return the results in JSON format. + + +.. code-block:: python + + from scrapegraphai.graphs import SearchGraph + + graph_config = { + "llm": {...}, + "embeddings": {...}, + } + + # Create the SearchGraph instance + search_graph = SearchGraph( + prompt="List me all the traditional recipes from Chioggia", + config=graph_config, + schema=schema + ) + + # Run the graph + result = search_graph.run() + print(result) + + +SpeechGraph +^^^^^^^^^^^ + +.. image:: ../../assets/speechgraph.png + :align: center + :width: 90% + :alt: SpeechGraph +| + +Similar to SmartScraperGraph, we define the graph configuration, create an instance of the SpeechGraph class, and run the graph. +It will fetch the data from the source, extract the information based on the prompt, and generate an audio file with the answer, as well as the answer itself, in JSON format. + +.. 
code-block:: python + + from scrapegraphai.graphs import SpeechGraph + + graph_config = { + "llm": {...}, + "tts_model": {...}, + } + + # ************************************************ + # Create the SpeechGraph instance and run it + # ************************************************ + + speech_graph = SpeechGraph( + prompt="Make a detailed audio summary of the projects.", + source="https://perinim.github.io/projects/", + config=graph_config, + schema=schema + ) + + result = speech_graph.run() + print(result) + + +ScriptCreatorGraph & ScriptCreatorMultiGraph +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. image:: ../../assets/scriptcreatorgraph.png + :align: center + :width: 90% + :alt: ScriptCreatorGraph + +First we define the graph configuration, which includes the LLM model and other parameters. +Then we create an instance of the ScriptCreatorGraph class, passing the prompt, source, and configuration as arguments. Finally, we run the graph and print the result. + +.. code-block:: python + + from scrapegraphai.graphs import ScriptCreatorGraph + + graph_config = { + "llm": {...}, + "library": "beautifulsoup4" + } + + script_creator_graph = ScriptCreatorGraph( + prompt="Create a Python script to scrape the projects.", + source="https://perinim.github.io/projects/", + config=graph_config, + schema=schema + ) + + result = script_creator_graph.run() + print(result) + +**ScriptCreatorMultiGraph** is similar to ScriptCreatorGraph, but it can handle multiple sources. We define the graph configuration, create an instance of the ScriptCreatorMultiGraph class, and run the graph. 
diff --git a/examples/openai/smart_scraper_openai.py b/examples/openai/smart_scraper_openai.py index e353fd9b..bae4f688 100644 --- a/examples/openai/smart_scraper_openai.py +++ b/examples/openai/smart_scraper_openai.py @@ -2,7 +2,7 @@ Basic example of scraping pipeline using SmartScraper """ -import os +import os, json from dotenv import load_dotenv from scrapegraphai.graphs import SmartScraperGraph from scrapegraphai.utils import prettify_exec_info @@ -37,7 +37,7 @@ ) result = smart_scraper_graph.run() -print(result) +print(json.dumps(result, indent=4)) # ************************************************ # Get graph execution info diff --git a/scrapegraphai/graphs/abstract_graph.py b/scrapegraphai/graphs/abstract_graph.py index b5e15e8f..6cd4ac45 100644 --- a/scrapegraphai/graphs/abstract_graph.py +++ b/scrapegraphai/graphs/abstract_graph.py @@ -26,7 +26,7 @@ OneApi ) from ..models.ernie import Ernie -from ..utils.logging import set_verbosity_debug, set_verbosity_warning +from ..utils.logging import set_verbosity_debug, set_verbosity_warning, set_verbosity_info from ..helpers import models_tokens from ..models import AzureOpenAI, Bedrock, Gemini, Groq, HuggingFace, Ollama, OpenAI, Anthropic, DeepSeek @@ -90,7 +90,7 @@ def __init__(self, prompt: str, config: dict, verbose = bool(config and config.get("verbose")) if verbose: - set_verbosity_debug() + set_verbosity_info() else: set_verbosity_warning() diff --git a/scrapegraphai/graphs/base_graph.py b/scrapegraphai/graphs/base_graph.py index 1b2cb4da..90585e6a 100644 --- a/scrapegraphai/graphs/base_graph.py +++ b/scrapegraphai/graphs/base_graph.py @@ -1,12 +1,10 @@ -""" -BaseGraph Module -""" - import time import warnings from langchain_community.callbacks import get_openai_callback from typing import Tuple +# Import telemetry functions +from ..telemetry import log_graph_execution, log_event class BaseGraph: """ @@ -46,12 +44,12 @@ class BaseGraph: ... 
) """ - def __init__(self, nodes: list, edges: list, entry_point: str, use_burr: bool = False, burr_config: dict = None): - + def __init__(self, nodes: list, edges: list, entry_point: str, use_burr: bool = False, burr_config: dict = None, graph_name: str = "Custom"): self.nodes = nodes self.raw_edges = edges self.edges = self._create_edges({e for e in edges}) self.entry_point = entry_point.node_name + self.graph_name = graph_name self.initial_state = {} if nodes[0].node_name != entry_point.node_name: @@ -103,12 +101,46 @@ def _execute_standard(self, initial_state: dict) -> Tuple[dict, list]: "total_cost_USD": 0.0, } + start_time = time.time() + error_node = None + source_type = None + llm_model = None + embedder_model = None + while current_node_name: curr_time = time.time() current_node = next(node for node in self.nodes if node.node_name == current_node_name) + # check if there is a "source" key in the node config + if current_node.__class__.__name__ == "FetchNode": + # get the second key name of the state dictionary + source_type = list(state.keys())[1] + # quick fix for local_dir source type + if source_type == "local_dir": + source_type = "html_dir" + + # check if there is an "llm_model" variable in the class + if hasattr(current_node, "llm_model") and llm_model is None: + llm_model = current_node.llm_model + if hasattr(llm_model, "model_name"): + llm_model = llm_model.model_name + elif hasattr(llm_model, "model"): + llm_model = llm_model.model + + # check if there is an "embedder_model" variable in the class + if hasattr(current_node, "embedder_model") and embedder_model is None: + embedder_model = current_node.embedder_model + if hasattr(embedder_model, "model_name"): + embedder_model = embedder_model.model_name + elif hasattr(embedder_model, "model"): + embedder_model = embedder_model.model + with get_openai_callback() as cb: - result = current_node.execute(state) + try: + result = current_node.execute(state) + except Exception as e: + error_node = 
current_node.node_name + raise e node_exec_time = time.time() - curr_time total_exec_time += node_exec_time @@ -147,6 +179,17 @@ def _execute_standard(self, initial_state: dict) -> Tuple[dict, list]: "exec_time": total_exec_time, }) + # Log the graph execution telemetry + graph_execution_time = time.time() - start_time + log_graph_execution( + graph_name=self.graph_name, + llm_model=llm_model, + embedder_model=embedder_model, + source_type=source_type, + execution_time=graph_execution_time, + error_node=error_node + ) + return state, exec_info def execute(self, initial_state: dict) -> Tuple[dict, list]: @@ -162,7 +205,6 @@ def execute(self, initial_state: dict) -> Tuple[dict, list]: self.initial_state = initial_state if self.use_burr: - from ..integrations import BurrBridge bridge = BurrBridge(self, self.burr_config) @@ -190,4 +232,4 @@ def append_node(self, node): # add the node to the list of nodes self.nodes.append(node) # update the edges connecting the last node to the new node - self.edges = self._create_edges({e for e in self.raw_edges}) \ No newline at end of file + self.edges = self._create_edges({e for e in self.raw_edges}) diff --git a/scrapegraphai/graphs/csv_scraper_graph.py b/scrapegraphai/graphs/csv_scraper_graph.py index d8d25b4a..48fb5bdb 100644 --- a/scrapegraphai/graphs/csv_scraper_graph.py +++ b/scrapegraphai/graphs/csv_scraper_graph.py @@ -64,7 +64,8 @@ def _create_graph(self): (fetch_node, rag_node), (rag_node, generate_answer_node) ], - entry_point=fetch_node + entry_point=fetch_node, + graph_name=self.__class__.__name__ ) def run(self) -> str: diff --git a/scrapegraphai/graphs/csv_scraper_multi_graph.py b/scrapegraphai/graphs/csv_scraper_multi_graph.py index 85ed1727..fd15f49a 100644 --- a/scrapegraphai/graphs/csv_scraper_multi_graph.py +++ b/scrapegraphai/graphs/csv_scraper_multi_graph.py @@ -100,7 +100,8 @@ def _create_graph(self) -> BaseGraph: edges=[ (graph_iterator_node, merge_answers_node), ], - entry_point=graph_iterator_node + 
entry_point=graph_iterator_node, + graph_name=self.__class__.__name__ ) def run(self) -> str: diff --git a/scrapegraphai/graphs/deep_scraper_graph.py b/scrapegraphai/graphs/deep_scraper_graph.py index d8d5525f..e9e41771 100644 --- a/scrapegraphai/graphs/deep_scraper_graph.py +++ b/scrapegraphai/graphs/deep_scraper_graph.py @@ -141,7 +141,8 @@ def _create_repeated_graph(self) -> BaseGraph: (search_node, graph_iterator_node), (graph_iterator_node, merge_answers_node) ], - entry_point=fetch_node + entry_point=fetch_node, + graph_name=self.__class__.__name__ ) diff --git a/scrapegraphai/graphs/json_scraper_graph.py b/scrapegraphai/graphs/json_scraper_graph.py index 2dbee471..09a5f02e 100644 --- a/scrapegraphai/graphs/json_scraper_graph.py +++ b/scrapegraphai/graphs/json_scraper_graph.py @@ -89,7 +89,8 @@ def _create_graph(self) -> BaseGraph: (fetch_node, rag_node), (rag_node, generate_answer_node) ], - entry_point=fetch_node + entry_point=fetch_node, + graph_name=self.__class__.__name__ ) def run(self) -> str: diff --git a/scrapegraphai/graphs/json_scraper_multi_graph.py b/scrapegraphai/graphs/json_scraper_multi_graph.py index f86fdc67..2824c416 100644 --- a/scrapegraphai/graphs/json_scraper_multi_graph.py +++ b/scrapegraphai/graphs/json_scraper_multi_graph.py @@ -104,7 +104,8 @@ def _create_graph(self) -> BaseGraph: edges=[ (graph_iterator_node, merge_answers_node), ], - entry_point=graph_iterator_node + entry_point=graph_iterator_node, + graph_name=self.__class__.__name__ ) def run(self) -> str: diff --git a/scrapegraphai/graphs/omni_scraper_graph.py b/scrapegraphai/graphs/omni_scraper_graph.py index 3234dd02..a5eefad2 100644 --- a/scrapegraphai/graphs/omni_scraper_graph.py +++ b/scrapegraphai/graphs/omni_scraper_graph.py @@ -122,7 +122,8 @@ def _create_graph(self) -> BaseGraph: (image_to_text_node, rag_node), (rag_node, generate_answer_omni_node) ], - entry_point=fetch_node + entry_point=fetch_node, + graph_name=self.__class__.__name__ ) def run(self) -> str: diff 
--git a/scrapegraphai/graphs/omni_search_graph.py b/scrapegraphai/graphs/omni_search_graph.py index d5783729..df525949 100644 --- a/scrapegraphai/graphs/omni_search_graph.py +++ b/scrapegraphai/graphs/omni_search_graph.py @@ -115,7 +115,8 @@ def _create_graph(self) -> BaseGraph: (search_internet_node, graph_iterator_node), (graph_iterator_node, merge_answers_node) ], - entry_point=search_internet_node + entry_point=search_internet_node, + graph_name=self.__class__.__name__ ) def run(self) -> str: diff --git a/scrapegraphai/graphs/pdf_scraper_graph.py b/scrapegraphai/graphs/pdf_scraper_graph.py index c476e629..41099d8b 100644 --- a/scrapegraphai/graphs/pdf_scraper_graph.py +++ b/scrapegraphai/graphs/pdf_scraper_graph.py @@ -105,7 +105,8 @@ def _create_graph(self) -> BaseGraph: (parse_node, rag_node), (rag_node, generate_answer_node_pdf) ], - entry_point=fetch_node + entry_point=fetch_node, + graph_name=self.__class__.__name__ ) def run(self) -> str: diff --git a/scrapegraphai/graphs/pdf_scraper_multi_graph.py b/scrapegraphai/graphs/pdf_scraper_multi_graph.py index 60e81bf7..e9b5660b 100644 --- a/scrapegraphai/graphs/pdf_scraper_multi_graph.py +++ b/scrapegraphai/graphs/pdf_scraper_multi_graph.py @@ -105,7 +105,8 @@ def _create_graph(self) -> BaseGraph: edges=[ (graph_iterator_node, merge_answers_node), ], - entry_point=graph_iterator_node + entry_point=graph_iterator_node, + graph_name=self.__class__.__name__ ) def run(self) -> str: diff --git a/scrapegraphai/graphs/script_creator_graph.py b/scrapegraphai/graphs/script_creator_graph.py index 0697db0b..ce3fa319 100644 --- a/scrapegraphai/graphs/script_creator_graph.py +++ b/scrapegraphai/graphs/script_creator_graph.py @@ -95,7 +95,8 @@ def _create_graph(self) -> BaseGraph: (fetch_node, parse_node), (parse_node, generate_scraper_node), ], - entry_point=fetch_node + entry_point=fetch_node, + graph_name=self.__class__.__name__ ) def run(self) -> str: diff --git a/scrapegraphai/graphs/script_creator_multi_graph.py 
b/scrapegraphai/graphs/script_creator_multi_graph.py index 1660fd83..2b36f4ed 100644 --- a/scrapegraphai/graphs/script_creator_multi_graph.py +++ b/scrapegraphai/graphs/script_creator_multi_graph.py @@ -99,7 +99,8 @@ def _create_graph(self) -> BaseGraph: edges=[ (graph_iterator_node, merge_scripts_node), ], - entry_point=graph_iterator_node + entry_point=graph_iterator_node, + graph_name=self.__class__.__name__ ) def run(self) -> str: diff --git a/scrapegraphai/graphs/search_graph.py b/scrapegraphai/graphs/search_graph.py index 23d08854..6bece062 100644 --- a/scrapegraphai/graphs/search_graph.py +++ b/scrapegraphai/graphs/search_graph.py @@ -114,7 +114,8 @@ def _create_graph(self) -> BaseGraph: (search_internet_node, graph_iterator_node), (graph_iterator_node, merge_answers_node) ], - entry_point=search_internet_node + entry_point=search_internet_node, + graph_name=self.__class__.__name__ ) def run(self) -> str: diff --git a/scrapegraphai/graphs/smart_scraper_graph.py b/scrapegraphai/graphs/smart_scraper_graph.py index 0cc6a701..9ee0c3cc 100644 --- a/scrapegraphai/graphs/smart_scraper_graph.py +++ b/scrapegraphai/graphs/smart_scraper_graph.py @@ -104,7 +104,8 @@ def _create_graph(self) -> BaseGraph: (parse_node, rag_node), (rag_node, generate_answer_node) ], - entry_point=fetch_node + entry_point=fetch_node, + graph_name=self.__class__.__name__ ) def run(self) -> str: diff --git a/scrapegraphai/graphs/smart_scraper_multi_graph.py b/scrapegraphai/graphs/smart_scraper_multi_graph.py index 70fd570a..996beff1 100644 --- a/scrapegraphai/graphs/smart_scraper_multi_graph.py +++ b/scrapegraphai/graphs/smart_scraper_multi_graph.py @@ -104,7 +104,8 @@ def _create_graph(self) -> BaseGraph: edges=[ (graph_iterator_node, merge_answers_node), ], - entry_point=graph_iterator_node + entry_point=graph_iterator_node, + graph_name=self.__class__.__name__ ) def run(self) -> str: diff --git a/scrapegraphai/graphs/speech_graph.py b/scrapegraphai/graphs/speech_graph.py index 
9eb9b44a..1058d127 100644 --- a/scrapegraphai/graphs/speech_graph.py +++ b/scrapegraphai/graphs/speech_graph.py @@ -109,7 +109,8 @@ def _create_graph(self) -> BaseGraph: (rag_node, generate_answer_node), (generate_answer_node, text_to_speech_node) ], - entry_point=fetch_node + entry_point=fetch_node, + graph_name=self.__class__.__name__ ) def run(self) -> str: diff --git a/scrapegraphai/graphs/xml_scraper_graph.py b/scrapegraphai/graphs/xml_scraper_graph.py index 2ef5a1c4..dbab0b73 100644 --- a/scrapegraphai/graphs/xml_scraper_graph.py +++ b/scrapegraphai/graphs/xml_scraper_graph.py @@ -91,7 +91,8 @@ def _create_graph(self) -> BaseGraph: (fetch_node, rag_node), (rag_node, generate_answer_node) ], - entry_point=fetch_node + entry_point=fetch_node, + graph_name=self.__class__.__name__ ) def run(self) -> str: diff --git a/scrapegraphai/graphs/xml_scraper_multi_graph.py b/scrapegraphai/graphs/xml_scraper_multi_graph.py index a9127d5b..e1f4423c 100644 --- a/scrapegraphai/graphs/xml_scraper_multi_graph.py +++ b/scrapegraphai/graphs/xml_scraper_multi_graph.py @@ -105,7 +105,8 @@ def _create_graph(self) -> BaseGraph: edges=[ (graph_iterator_node, merge_answers_node), ], - entry_point=graph_iterator_node + entry_point=graph_iterator_node, + graph_name=self.__class__.__name__ ) def run(self) -> str: diff --git a/scrapegraphai/telemetry/__init__.py b/scrapegraphai/telemetry/__init__.py new file mode 100644 index 00000000..9586734d --- /dev/null +++ b/scrapegraphai/telemetry/__init__.py @@ -0,0 +1,5 @@ +""" +This module contains the telemetry module for the scrapegraphai package. 
"""
This module contains code that relates to sending ScrapeGraphAI usage telemetry.

To disable sending telemetry there are three ways:

1. Set it to false programmatically in your driver:
  >>> from scrapegraphai import telemetry
  >>> telemetry.disable_telemetry()
2. Set it to `false` in ~/.scrapegraphai.conf under `DEFAULT`
  [DEFAULT]
  telemetry_enabled = False
3. Set SCRAPEGRAPHAI_TELEMETRY_ENABLED=false as an environment variable:
  SCRAPEGRAPHAI_TELEMETRY_ENABLED=false python run.py
  or:
  export SCRAPEGRAPHAI_TELEMETRY_ENABLED=false
"""

import configparser
import functools
import importlib.metadata
import json
import os
import platform
import threading
import logging
import uuid
from typing import Any, Callable, Dict, Optional
from urllib import request

try:
    VERSION = importlib.metadata.version("scrapegraphai")
except importlib.metadata.PackageNotFoundError:
    # An editable/source checkout may not expose installed-package metadata;
    # telemetry must never break importing the library over a version string.
    VERSION = "unknown"
# BUGFIX: the original `".".join([str(i) for i in VERSION])` iterated the
# *characters* of the version string, turning "1.7.0" into "1...7...0".
# The version returned by importlib.metadata is already a dotted string.
STR_VERSION = VERSION
HOST = "https://eu.i.posthog.com"
TRACK_URL = f"{HOST}/capture/"  # https://posthog.com/docs/api/post-only-endpoints
API_KEY = "phc_orsfU4aHhtpTSLVcUE2hdUkQDLM4OEQZndKGFBKMEtn"
TIMEOUT = 2
DEFAULT_CONFIG_LOCATION = os.path.expanduser("~/.scrapegraphai.conf")


logger = logging.getLogger(__name__)


def _load_config(config_location: str) -> configparser.ConfigParser:
    """Load (or lazily create) the telemetry config file.

    Ensures a stable anonymous id exists under [DEFAULT]; any filesystem
    failure (missing file, read-only home dir) is swallowed so telemetry
    bookkeeping can never break the host application.

    Args:
        config_location: Path of the INI-style config file to read/write.

    Returns:
        The parsed (possibly freshly initialized) ConfigParser instance.
    """
    config = configparser.ConfigParser()
    try:
        with open(config_location) as f:
            config.read_file(f)
    except Exception:
        config["DEFAULT"] = {}
    else:
        if "DEFAULT" not in config:
            config["DEFAULT"] = {}

    if "anonymous_id" not in config["DEFAULT"]:
        config["DEFAULT"]["anonymous_id"] = str(uuid.uuid4())
        try:
            # Best-effort persistence so the same anonymous id is reused
            # across sessions; failure just means a new id next run.
            with open(config_location, "w") as f:
                config.write(f)
        except Exception:
            pass
    return config


def _check_config_and_environ_for_telemetry_flag(
    telemetry_default: bool, config_obj: configparser.ConfigParser
) -> bool:
    """Resolve the telemetry opt-in/out flag.

    Precedence (lowest to highest): built-in default, config-file value,
    SCRAPEGRAPHAI_TELEMETRY_ENABLED environment variable. Unparseable
    values are logged at debug level and ignored.

    Args:
        telemetry_default: Value used when neither config nor environment decides.
        config_obj: Parsed config whose [DEFAULT] section may hold the flag.

    Returns:
        True if telemetry should be enabled, False otherwise.
    """
    telemetry_enabled = telemetry_default
    if "telemetry_enabled" in config_obj["DEFAULT"]:
        try:
            telemetry_enabled = config_obj.getboolean("DEFAULT", "telemetry_enabled")
        except ValueError as e:
            logger.debug(f"Unable to parse value for `telemetry_enabled` from config. Encountered {e}")
    if os.environ.get("SCRAPEGRAPHAI_TELEMETRY_ENABLED") is not None:
        env_value = os.environ.get("SCRAPEGRAPHAI_TELEMETRY_ENABLED")
        config_obj["DEFAULT"]["telemetry_enabled"] = env_value
        try:
            telemetry_enabled = config_obj.getboolean("DEFAULT", "telemetry_enabled")
        except ValueError as e:
            logger.debug(f"Unable to parse value for `SCRAPEGRAPHAI_TELEMETRY_ENABLED` from environment. Encountered {e}")
    return telemetry_enabled


# Module-level telemetry state, initialized once at import time.
config = _load_config(DEFAULT_CONFIG_LOCATION)
g_telemetry_enabled = _check_config_and_environ_for_telemetry_flag(True, config)
g_anonymous_id = config["DEFAULT"]["anonymous_id"]
call_counter = 0
# Hard cap on events per process, so a pathological loop cannot flood the endpoint.
MAX_COUNT_SESSION = 1000

# Static properties attached to every event.
BASE_PROPERTIES = {
    "os_type": os.name,
    "os_version": platform.platform(),
    "python_version": f"{platform.python_version()}/{platform.python_implementation()}",
    "distinct_id": g_anonymous_id,
    "scrapegraphai_version": VERSION,
    "telemetry_version": "0.0.1",
}


def disable_telemetry():
    """Programmatically disable telemetry for the rest of this process."""
    global g_telemetry_enabled
    g_telemetry_enabled = False


def is_telemetry_enabled() -> bool:
    """Return True if an event may be sent now.

    Logs a one-time notice on the first check and enforces the per-session
    event cap (MAX_COUNT_SESSION).
    """
    if g_telemetry_enabled:
        global call_counter
        if call_counter == 0:
            logger.debug(
                "Note: ScrapeGraphAI collects anonymous usage data to improve the library. "
                "You can disable telemetry by setting SCRAPEGRAPHAI_TELEMETRY_ENABLED=false or "
                "by editing ~/.scrapegraphai.conf."
            )
        call_counter += 1
        if call_counter > MAX_COUNT_SESSION:
            return False
        return True
    else:
        return False


def _send_event_json(event_json: dict):
    """POST one event to the PostHog capture endpoint (blocking; runs in a worker thread).

    Any network/HTTP failure is logged at debug level and otherwise ignored.
    """
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {API_KEY}",
        "User-Agent": f"scrapegraphai/{STR_VERSION}",
    }
    try:
        data = json.dumps(event_json).encode()
        req = request.Request(TRACK_URL, data=data, headers=headers)
        with request.urlopen(req, timeout=TIMEOUT) as f:
            res = f.read()
            if f.code != 200:
                raise RuntimeError(res)
    except Exception as e:
        logger.debug(f"Failed to send telemetry data: {e}")
    else:
        logger.debug(f"Telemetry data sent: {data}")


def send_event_json(event_json: dict):
    """Fire-and-forget an event on a background thread.

    Raises:
        RuntimeError: If telemetry has been disabled.
    """
    if not g_telemetry_enabled:
        raise RuntimeError("Telemetry tracking is disabled!")
    try:
        # daemon=True: a slow or unreachable endpoint must never delay
        # interpreter shutdown (the original non-daemon thread could block
        # process exit for up to TIMEOUT seconds).
        th = threading.Thread(target=_send_event_json, args=(event_json,), daemon=True)
        th.start()
    except Exception as e:
        logger.debug(f"Failed to send telemetry data in a thread: {e}")


def log_event(event: str, properties: Dict[str, Any]):
    """Record a named event with the given properties, if telemetry is enabled.

    Args:
        event: Event name, e.g. "graph_execution".
        properties: Event-specific properties; merged over BASE_PROPERTIES.
    """
    if is_telemetry_enabled():
        event_json = {
            "api_key": API_KEY,
            "event": event,
            "properties": {**BASE_PROPERTIES, **properties},
        }
        send_event_json(event_json)


def log_graph_execution(graph_name: str, llm_model: str, embedder_model: str,
                        source_type: str, execution_time: float,
                        error_node: Optional[str] = None):
    """Record a single graph run (name, models, source type, duration, failing node if any)."""
    properties = {
        "graph_name": graph_name,
        "llm_model": llm_model,
        "embedder_model": embedder_model,
        "source_type": source_type,
        "execution_time": execution_time,
        "error_node": error_node,
    }
    log_event("graph_execution", properties)


def capture_function_usage(call_fn: Callable) -> Callable:
    """Decorator that logs a "function_usage" event each time *call_fn* is called.

    The wrapped function's return value and exceptions are passed through
    untouched; telemetry failures are logged at debug level and suppressed.
    """
    @functools.wraps(call_fn)
    def wrapped_fn(*args, **kwargs):
        try:
            return call_fn(*args, **kwargs)
        finally:
            if is_telemetry_enabled():
                try:
                    function_name = call_fn.__name__
                    log_event("function_usage", {"function_name": function_name})
                except Exception as e:
                    logger.debug(f"Failed to send telemetry for function usage. Encountered: {e}")
    return wrapped_fn