From 438b8127db941b5ac71094bf7c06f4dfdf80489a Mon Sep 17 00:00:00 2001
From: Marco Vinciguerra
Date: Sun, 15 Sep 2024 11:20:08 +0200
Subject: [PATCH] refactoring of the code

---
 scrapegraphai/builders/graph_builder.py      |  6 ------
 scrapegraphai/nodes/fetch_node.py            | 10 ++++++----
 scrapegraphai/telemetry/telemetry.py         | 17 +++++++----------
 scrapegraphai/utils/convert_to_md.py         |  1 +
 scrapegraphai/utils/copy.py                  |  3 +++
 scrapegraphai/utils/parse_state_keys.py      |  5 ++---
 scrapegraphai/utils/save_audio_from_bytes.py |  1 -
 scrapegraphai/utils/token_calculator.py      |  1 -
 8 files changed, 19 insertions(+), 25 deletions(-)

diff --git a/scrapegraphai/builders/graph_builder.py b/scrapegraphai/builders/graph_builder.py
index 69ebe492..1bfdab72 100644
--- a/scrapegraphai/builders/graph_builder.py
+++ b/scrapegraphai/builders/graph_builder.py
@@ -65,12 +65,10 @@ def _create_llm(self, llm_config: dict):
             "temperature": 0,
             "streaming": True
         }
-        # Update defaults with any LLM parameters that were provided
         llm_params = {**llm_defaults, **llm_config}
         if "api_key" not in llm_params:
             raise ValueError("LLM configuration must include an 'api_key'.")
 
-        # select the model based on the model name
         if "gpt-" in llm_params["model"]:
             return ChatOpenAI(llm_params)
         elif "gemini" in llm_params["model"]:
@@ -152,17 +150,13 @@ def convert_json_to_graphviz(json_data, format: str = 'pdf'):
     edges = graph_config.get('edges', [])
     entry_point = graph_config.get('entry_point')
 
-    # Add nodes to the graph
     for node in nodes:
-        # If this node is the entry point, use a double circle to denote it
         if node['node_name'] == entry_point:
             graph.node(node['node_name'], shape='doublecircle')
         else:
             graph.node(node['node_name'])
 
-    # Add edges to the graph
     for edge in edges:
-        # An edge could potentially have multiple 'to' nodes if it's from a conditional node
         if isinstance(edge['to'], list):
             for to_node in edge['to']:
                 graph.edge(edge['from'], to_node)
diff --git a/scrapegraphai/nodes/fetch_node.py b/scrapegraphai/nodes/fetch_node.py
index f015278d..bbc0abd7 100644
--- a/scrapegraphai/nodes/fetch_node.py
+++ b/scrapegraphai/nodes/fetch_node.py
@@ -252,8 +252,8 @@ def handle_web_source(self, state, source):
         if not self.cut:
             parsed_content = cleanup_html(response, source)
 
-        if ((isinstance(self.llm_model, ChatOpenAI) or isinstance(self.llm_model, AzureChatOpenAI))
-            and not self.script_creator) or (self.force and not self.script_creator):
+        if (isinstance(self.llm_model, (ChatOpenAI, AzureChatOpenAI))
+            and not self.script_creator) or (self.force and not self.script_creator):
             parsed_content = convert_to_md(source, parsed_content)
 
         compressed_document = [Document(page_content=parsed_content)]
@@ -271,7 +271,8 @@ def handle_web_source(self, state, source):
             try:
                 from ..docloaders.browser_base import browser_base_fetch
             except ImportError:
-                raise ImportError("The browserbase module is not installed. Please install it using `pip install browserbase`.")
+                raise ImportError("""The browserbase module is not installed.
+                                  Please install it using `pip install browserbase`.""")
 
             data = browser_base_fetch(self.browser_base.get("api_key"),
                                        self.browser_base.get("project_id"), [source])
@@ -283,7 +284,8 @@ def handle_web_source(self, state, source):
             document = loader.load()
         if not document or not document[0].page_content.strip():
-            raise ValueError("No HTML body content found in the document fetched by ChromiumLoader.")
+            raise ValueError("""No HTML body content found in
+                             the document fetched by ChromiumLoader.""")
 
         parsed_content = document[0].page_content
 
         if (isinstance(self.llm_model, ChatOpenAI) or isinstance(self.llm_model, AzureChatOpenAI)) and not self.script_creator or self.force and not self.script_creator and not self.openai_md_enabled:
diff --git a/scrapegraphai/telemetry/telemetry.py b/scrapegraphai/telemetry/telemetry.py
index 655b2cc5..61af900c 100644
--- a/scrapegraphai/telemetry/telemetry.py
+++ b/scrapegraphai/telemetry/telemetry.py
@@ -14,7 +14,6 @@
 or:
 export SCRAPEGRAPHAI_TELEMETRY_ENABLED=false
 """
-
 import configparser
 import functools
 import importlib.metadata
@@ -68,14 +67,16 @@ def _check_config_and_environ_for_telemetry_flag(
         try:
             telemetry_enabled = config_obj.getboolean("DEFAULT", "telemetry_enabled")
         except ValueError as e:
-            logger.debug(f"Unable to parse value for `telemetry_enabled` from config. Encountered {e}")
+            logger.debug(f"""Unable to parse value for
+                         `telemetry_enabled` from config. Encountered {e}""")
     if os.environ.get("SCRAPEGRAPHAI_TELEMETRY_ENABLED") is not None:
         env_value = os.environ.get("SCRAPEGRAPHAI_TELEMETRY_ENABLED")
         config_obj["DEFAULT"]["telemetry_enabled"] = env_value
         try:
             telemetry_enabled = config_obj.getboolean("DEFAULT", "telemetry_enabled")
         except ValueError as e:
-            logger.debug(f"Unable to parse value for `SCRAPEGRAPHAI_TELEMETRY_ENABLED` from environment. Encountered {e}")
+            logger.debug(f"""Unable to parse value for `SCRAPEGRAPHAI_TELEMETRY_ENABLED`
+                         from environment. Encountered {e}""")
     return telemetry_enabled
 
 
@@ -94,7 +95,6 @@ def _check_config_and_environ_for_telemetry_flag(
     "telemetry_version": "0.0.3",
 }
 
-
 def disable_telemetry():
     """
     function for disabling the telemetries
@@ -102,7 +102,6 @@ def disable_telemetry():
     global g_telemetry_enabled
     g_telemetry_enabled = False
 
-
 def is_telemetry_enabled() -> bool:
     """
     function for checking if a telemetry is enables
@@ -122,7 +121,6 @@ def is_telemetry_enabled() -> bool:
     else:
         return False
 
-
 def _send_event_json(event_json: dict):
     headers = {
         "Content-Type": "application/json",
@@ -141,7 +139,6 @@ def _send_event_json(event_json: dict):
     else:
         logger.debug(f"Telemetry data sent: {data}")
 
-
 def send_event_json(event_json: dict):
     """
     fucntion for sending event json
@@ -154,7 +151,6 @@ def send_event_json(event_json: dict):
     except Exception as e:
         logger.debug(f"Failed to send telemetry data in a thread: {e}")
 
-
 def log_event(event: str, properties: Dict[str, any]):
     """
     function for logging the events
@@ -167,7 +163,6 @@ def log_event(event: str, properties: Dict[str, any]):
     }
     send_event_json(event_json)
 
-
 def log_graph_execution(graph_name: str, source: str, prompt:str, schema:dict,
                         llm_model: str, embedder_model: str, source_type: str,
                         execution_time: float, content: str = None, response: dict = None,
@@ -193,8 +188,10 @@ def log_graph_execution(graph_name: str, source: str, prompt:str, schema:dict,
     }
     log_event("graph_execution", properties)
 
-
 def capture_function_usage(call_fn: Callable) -> Callable:
+    """
+    function that captures the usage
+    """
     @functools.wraps(call_fn)
     def wrapped_fn(*args, **kwargs):
         try:
diff --git a/scrapegraphai/utils/convert_to_md.py b/scrapegraphai/utils/convert_to_md.py
index ff0bbbd7..2f31c3a1 100644
--- a/scrapegraphai/utils/convert_to_md.py
+++ b/scrapegraphai/utils/convert_to_md.py
@@ -24,6 +24,7 @@ def convert_to_md(html: str, url: str = None) -> str:
     h = html2text.HTML2Text()
     h.ignore_links = False
     h.body_width = 0
+
     if url is not None:
         parsed_url = urlparse(url)
         domain = f"{parsed_url.scheme}://{parsed_url.netloc}"
diff --git a/scrapegraphai/utils/copy.py b/scrapegraphai/utils/copy.py
index 838f3c05..0cdda362 100644
--- a/scrapegraphai/utils/copy.py
+++ b/scrapegraphai/utils/copy.py
@@ -1,3 +1,6 @@
+"""
+copy module
+"""
 import copy
 from typing import Any
 
diff --git a/scrapegraphai/utils/parse_state_keys.py b/scrapegraphai/utils/parse_state_keys.py
index f4bd2ea5..79de329c 100644
--- a/scrapegraphai/utils/parse_state_keys.py
+++ b/scrapegraphai/utils/parse_state_keys.py
@@ -3,7 +3,6 @@
 """
 import re
 
-
 def parse_expression(expression, state: dict) -> list:
     """
     Parses a complex boolean expression involving state keys.
@@ -22,7 +21,8 @@ def parse_expression(expression, state: dict) -> list:
 
     Example:
     >>> parse_expression("user_input & (relevant_chunks | parsed_document | document)",
-            {"user_input": None, "document": None, "parsed_document": None, "relevant_chunks": None})
+            {"user_input": None, "document": None,
+            "parsed_document": None, "relevant_chunks": None})
     ['user_input', 'relevant_chunks', 'parsed_document', 'document']
 
     This function evaluates the expression to determine the
@@ -69,7 +69,6 @@ def evaluate_simple_expression(exp):
                 return [elem.strip() for elem in and_segment if elem.strip() in state]
         return []
 
-    # Helper function to evaluate expressions with parentheses
     def evaluate_expression(expression):
         while '(' in expression:
             start = expression.rfind('(')
diff --git a/scrapegraphai/utils/save_audio_from_bytes.py b/scrapegraphai/utils/save_audio_from_bytes.py
index 2bad3106..aeef411c 100644
--- a/scrapegraphai/utils/save_audio_from_bytes.py
+++ b/scrapegraphai/utils/save_audio_from_bytes.py
@@ -4,7 +4,6 @@
 from pathlib import Path
 from typing import Union
 
-
 def save_audio_from_bytes(byte_response: bytes, output_path: Union[str, Path]) -> None:
     """
     Saves the byte response as an audio file to the specified path.
diff --git a/scrapegraphai/utils/token_calculator.py b/scrapegraphai/utils/token_calculator.py
index c5e5fbbb..475a0a14 100644
--- a/scrapegraphai/utils/token_calculator.py
+++ b/scrapegraphai/utils/token_calculator.py
@@ -5,7 +5,6 @@
 import tiktoken
 from ..helpers.models_tokens import models_tokens
 
-
 def truncate_text_tokens(text: str, model: str, encoding_name: str) -> List[str]:
     """
     Truncates text into chunks that are small enough to be processed by specified llm models.