diff --git a/scrapegraphai/graphs/abstract_graph.py b/scrapegraphai/graphs/abstract_graph.py index 28eb27b2..68652dc8 100644 --- a/scrapegraphai/graphs/abstract_graph.py +++ b/scrapegraphai/graphs/abstract_graph.py @@ -10,7 +10,7 @@ from ..helpers import models_tokens from ..models import AzureOpenAI, Bedrock, Gemini, Groq, HuggingFace, Ollama, OpenAI, Anthropic from langchain_google_genai.embeddings import GoogleGenerativeAIEmbeddings - +from ..utils.logging import get_logger class AbstractGraph(ABC): """ @@ -61,6 +61,7 @@ def __init__(self, prompt: str, config: dict, source: Optional[str] = None): self.headless = True if config is None else config.get( "headless", True) self.loader_kwargs = config.get("loader_kwargs", {}) + self.logger = get_logger("graph") common_params = {"headless": self.headless, "verbose": self.verbose, @@ -79,7 +80,7 @@ def set_common_params(self, params: dict, overwrite=False): for node in self.graph.nodes: node.update_config(params, overwrite) - + def _set_model_token(self, llm): if 'Azure' in str(type(llm)): diff --git a/scrapegraphai/nodes/fetch_node.py b/scrapegraphai/nodes/fetch_node.py index 1edefdbd..6a87d9f4 100644 --- a/scrapegraphai/nodes/fetch_node.py +++ b/scrapegraphai/nodes/fetch_node.py @@ -1,4 +1,4 @@ -""" +""" FetchNode Module """ @@ -13,7 +13,7 @@ from ..docloaders import ChromiumLoader from .base_node import BaseNode from ..utils.cleanup_html import cleanup_html - +from ..utils.logging import get_logger class FetchNode(BaseNode): """ @@ -74,7 +74,7 @@ def execute(self, state): necessary information to perform the operation is missing. 
""" if self.verbose: - print(f"--- Executing {self.node_name} Node ---") + self.logger.info(f"--- Executing {self.node_name} Node ---") # Interpret input keys based on the provided input expression input_keys = self.get_input_keys(state) @@ -128,7 +128,7 @@ def execute(self, state): cleanedup_html = cleanup_html(response.text, source) compressed_document = [Document(page_content=cleanedup_html)] else: - print(f"Failed to retrieve contents from the webpage at url: {source}") + self.logger.warning(f"Failed to retrieve contents from the webpage at url: {source}") else: loader_kwargs = {} @@ -144,4 +144,4 @@ def execute(self, state): ] state.update({self.output[0]: compressed_document}) - return state \ No newline at end of file + return state diff --git a/scrapegraphai/nodes/generate_answer_csv_node.py b/scrapegraphai/nodes/generate_answer_csv_node.py index 53f7121b..cf32b411 100644 --- a/scrapegraphai/nodes/generate_answer_csv_node.py +++ b/scrapegraphai/nodes/generate_answer_csv_node.py @@ -1,4 +1,5 @@ """ +gg Module for generating the answer node """ # Imports from standard library @@ -9,6 +10,7 @@ from langchain.prompts import PromptTemplate from langchain_core.output_parsers import JsonOutputParser from langchain_core.runnables import RunnableParallel +from ..utils.logging import get_logger # Imports from the library from .base_node import BaseNode @@ -72,7 +74,7 @@ def execute(self, state): """ if self.verbose: - print(f"--- Executing {self.node_name} Node ---") + self.logger.info(f"--- Executing {self.node_name} Node ---") # Interpret input keys based on the provided input expression input_keys = self.get_input_keys(state) diff --git a/scrapegraphai/nodes/generate_answer_node.py b/scrapegraphai/nodes/generate_answer_node.py index f554f8d9..234e339e 100644 --- a/scrapegraphai/nodes/generate_answer_node.py +++ b/scrapegraphai/nodes/generate_answer_node.py @@ -10,6 +10,7 @@ from langchain.prompts import PromptTemplate from langchain_core.output_parsers import 
JsonOutputParser from langchain_core.runnables import RunnableParallel +from ..utils.logging import get_logger # Imports from the library from .base_node import BaseNode @@ -59,7 +60,7 @@ def execute(self, state: dict) -> dict: """ if self.verbose: - print(f"--- Executing {self.node_name} Node ---") + self.logger.info(f"--- Executing {self.node_name} Node ---") # Interpret input keys based on the provided input expression input_keys = self.get_input_keys(state) diff --git a/scrapegraphai/nodes/generate_answer_pdf_node.py b/scrapegraphai/nodes/generate_answer_pdf_node.py index 31839d22..1e7e0edf 100644 --- a/scrapegraphai/nodes/generate_answer_pdf_node.py +++ b/scrapegraphai/nodes/generate_answer_pdf_node.py @@ -9,6 +9,7 @@ from langchain.prompts import PromptTemplate from langchain_core.output_parsers import JsonOutputParser from langchain_core.runnables import RunnableParallel +from ..utils.logging import get_logger # Imports from the library from .base_node import BaseNode @@ -72,7 +73,7 @@ def execute(self, state): """ if self.verbose: - print(f"--- Executing {self.node_name} Node ---") + self.logger.info(f"--- Executing {self.node_name} Node ---") # Interpret input keys based on the provided input expression input_keys = self.get_input_keys(state) diff --git a/scrapegraphai/nodes/generate_scraper_node.py b/scrapegraphai/nodes/generate_scraper_node.py index 804635de..d35db233 100644 --- a/scrapegraphai/nodes/generate_scraper_node.py +++ b/scrapegraphai/nodes/generate_scraper_node.py @@ -10,6 +10,7 @@ from langchain.prompts import PromptTemplate from langchain_core.output_parsers import StrOutputParser from langchain_core.runnables import RunnableParallel +from ..utils.logging import get_logger # Imports from the library from .base_node import BaseNode @@ -63,7 +64,7 @@ def execute(self, state: dict) -> dict: """ if self.verbose: - print(f"--- Executing {self.node_name} Node ---") + self.logger.info(f"--- Executing {self.node_name} Node ---") # Interpret input 
keys based on the provided input expression input_keys = self.get_input_keys(state) diff --git a/scrapegraphai/nodes/get_probable_tags_node.py b/scrapegraphai/nodes/get_probable_tags_node.py index e970c285..39b437a5 100644 --- a/scrapegraphai/nodes/get_probable_tags_node.py +++ b/scrapegraphai/nodes/get_probable_tags_node.py @@ -6,7 +6,7 @@ from langchain.output_parsers import CommaSeparatedListOutputParser from langchain.prompts import PromptTemplate from .base_node import BaseNode - +from ..utils.logging import get_logger class GetProbableTagsNode(BaseNode): """ @@ -25,11 +25,12 @@ class GetProbableTagsNode(BaseNode): node_name (str): The unique identifier name for the node, defaulting to "GetProbableTags". """ - def __init__(self, input: str, output: List[str], model_config: dict, + def __init__(self, input: str, output: List[str], node_config: dict, node_name: str = "GetProbableTags"): - super().__init__(node_name, "node", input, output, 2, model_config) + super().__init__(node_name, "node", input, output, 2, node_config) - self.llm_model = model_config["llm_model"] + self.llm_model = node_config["llm_model"] + self.verbose = False if node_config is None else node_config.get("verbose", False) def execute(self, state: dict) -> dict: """ @@ -49,7 +50,8 @@ def execute(self, state: dict) -> dict: necessary information for generating tag predictions is missing. 
""" - print(f"--- Executing {self.node_name} Node ---") + if self.verbose: + self.logger.info(f"--- Executing {self.node_name} Node ---") # Interpret input keys based on the provided input expression input_keys = self.get_input_keys(state) diff --git a/scrapegraphai/nodes/graph_iterator_node.py b/scrapegraphai/nodes/graph_iterator_node.py index 8a71319a..063466a9 100644 --- a/scrapegraphai/nodes/graph_iterator_node.py +++ b/scrapegraphai/nodes/graph_iterator_node.py @@ -5,7 +5,7 @@ import asyncio import copy from typing import List, Optional - +from ..utils.logging import get_logger from tqdm.asyncio import tqdm from .base_node import BaseNode @@ -60,7 +60,8 @@ def execute(self, state: dict) -> dict: batchsize = self.node_config.get("batchsize", _default_batchsize) if self.verbose: - print(f"--- Executing {self.node_name} Node with batchsize {batchsize} ---") + self.logger.info(f"--- Executing {self.node_name} Node with batchsize {batchsize} ---") + try: eventloop = asyncio.get_event_loop() diff --git a/scrapegraphai/nodes/image_to_text_node.py b/scrapegraphai/nodes/image_to_text_node.py index 27f09016..314e26bc 100644 --- a/scrapegraphai/nodes/image_to_text_node.py +++ b/scrapegraphai/nodes/image_to_text_node.py @@ -4,6 +4,7 @@ from typing import List, Optional from .base_node import BaseNode +from ..utils.logging import get_logger class ImageToTextNode(BaseNode): @@ -42,7 +43,7 @@ def execute(self, state: dict) -> dict: """ if self.verbose: - print("---GENERATING TEXT FROM IMAGE---") + self.logger.info(f"--- Executing {self.node_name} Node ---") input_keys = self.get_input_keys(state) input_data = [state[key] for key in input_keys] diff --git a/scrapegraphai/nodes/merge_answers_node.py b/scrapegraphai/nodes/merge_answers_node.py index 63ed6afa..8d8c4e82 100644 --- a/scrapegraphai/nodes/merge_answers_node.py +++ b/scrapegraphai/nodes/merge_answers_node.py @@ -5,7 +5,7 @@ # Imports from standard library from typing import List, Optional from tqdm import tqdm - 
+from ..utils.logging import get_logger # Imports from Langchain from langchain.prompts import PromptTemplate from langchain_core.output_parsers import JsonOutputParser from langchain_core.runnables import RunnableParallel @@ -54,7 +54,7 @@ def execute(self, state: dict) -> dict: """ if self.verbose: - print(f"--- Executing {self.node_name} Node ---") + self.logger.info(f"--- Executing {self.node_name} Node ---") # Interpret input keys based on the provided input expression input_keys = self.get_input_keys(state) diff --git a/scrapegraphai/nodes/parse_node.py b/scrapegraphai/nodes/parse_node.py index 2cd7eb33..2f49106f 100644 --- a/scrapegraphai/nodes/parse_node.py +++ b/scrapegraphai/nodes/parse_node.py @@ -6,7 +6,7 @@ from langchain.text_splitter import RecursiveCharacterTextSplitter from langchain_community.document_transformers import Html2TextTransformer from .base_node import BaseNode - +from ..utils.logging import get_logger class ParseNode(BaseNode): """ @@ -49,7 +49,7 @@ def execute(self, state: dict) -> dict: """ if self.verbose: - print(f"--- Executing {self.node_name} Node ---") + self.logger.info(f"--- Executing {self.node_name} Node ---") # Interpret input keys based on the provided input expression input_keys = self.get_input_keys(state) diff --git a/scrapegraphai/nodes/rag_node.py b/scrapegraphai/nodes/rag_node.py index 27d97b6e..dae666cf 100644 --- a/scrapegraphai/nodes/rag_node.py +++ b/scrapegraphai/nodes/rag_node.py @@ -8,6 +8,7 @@ from langchain.retrievers.document_compressors import EmbeddingsFilter, DocumentCompressorPipeline from langchain_community.document_transformers import EmbeddingsRedundantFilter from langchain_community.vectorstores import FAISS +from ..utils.logging import get_logger from .base_node import BaseNode @@ -57,7 +58,7 @@ def execute(self, state: dict) -> dict: """ if self.verbose: - print(f"--- Executing {self.node_name} Node ---") + self.logger.info(f"--- Executing {self.node_name} Node ---") # Interpret input keys based on the provided input expression input_keys = 
self.get_input_keys(state) @@ -80,7 +81,7 @@ def execute(self, state: dict) -> dict: chunked_docs.append(doc) if self.verbose: - print("--- (updated chunks metadata) ---") + self.logger.info("--- (updated chunks metadata) ---") # check if embedder_model is provided, if not use llm_model self.embedder_model = self.embedder_model if self.embedder_model else self.llm_model @@ -108,7 +109,7 @@ def execute(self, state: dict) -> dict: compressed_docs = compression_retriever.invoke(user_prompt) if self.verbose: - print("--- (tokens compressed and vector stored) ---") + self.logger.info("--- (tokens compressed and vector stored) ---") state.update({self.output[0]: compressed_docs}) return state diff --git a/scrapegraphai/nodes/robots_node.py b/scrapegraphai/nodes/robots_node.py index 7aea6cae..29b71800 100644 --- a/scrapegraphai/nodes/robots_node.py +++ b/scrapegraphai/nodes/robots_node.py @@ -9,7 +9,7 @@ from langchain.output_parsers import CommaSeparatedListOutputParser from .base_node import BaseNode from ..helpers import robots_dictionary - +from ..utils.logging import get_logger class RobotsNode(BaseNode): """ @@ -61,9 +61,10 @@ def execute(self, state: dict) -> dict: ValueError: If the website is not scrapeable based on the robots.txt file and scraping is not enforced. 
""" + logger = get_logger("robots node") if self.verbose: - print(f"--- Executing {self.node_name} Node ---") + logger.info(f"--- Executing {self.node_name} Node ---") # Interpret input keys based on the provided input expression input_keys = self.get_input_keys(state) @@ -121,17 +122,17 @@ def execute(self, state: dict) -> dict: if "no" in is_scrapable: if self.verbose: - print("\033[31m(Scraping this website is not allowed)\033[0m") + self.logger.warning("\033[31m(Scraping this website is not allowed)\033[0m") if not self.force_scraping: raise ValueError( 'The website you selected is not scrapable') else: if self.verbose: - print("\033[33m(WARNING: Scraping this website is not allowed but you decided to force it)\033[0m") + self.logger.warning("\033[33m(WARNING: Scraping this website is not allowed but you decided to force it)\033[0m") else: if self.verbose: - print("\033[32m(Scraping this website is allowed)\033[0m") + self.logger.warning("\033[32m(Scraping this website is allowed)\033[0m") state.update({self.output[0]: is_scrapable}) return state diff --git a/scrapegraphai/nodes/search_internet_node.py b/scrapegraphai/nodes/search_internet_node.py index 87f8dcb2..9611407d 100644 --- a/scrapegraphai/nodes/search_internet_node.py +++ b/scrapegraphai/nodes/search_internet_node.py @@ -7,7 +7,7 @@ from langchain.prompts import PromptTemplate from ..utils.research_web import search_on_web from .base_node import BaseNode - +from ..utils.logging import get_logger class SearchInternetNode(BaseNode): """ @@ -56,7 +56,7 @@ def execute(self, state: dict) -> dict: """ if self.verbose: - print(f"--- Executing {self.node_name} Node ---") + self.logger.info(f"--- Executing {self.node_name} Node ---") input_keys = self.get_input_keys(state) @@ -88,7 +88,8 @@ def execute(self, state: dict) -> dict: search_query = search_answer.invoke({"user_prompt": user_prompt})[0] if self.verbose: - print(f"Search Query: {search_query}") + self.logger.info(f"Search Query: {search_query}") + 
answer = search_on_web( query=search_query, max_results=self.max_results) diff --git a/scrapegraphai/nodes/search_link_node.py b/scrapegraphai/nodes/search_link_node.py index bf64b5d9..a06ccdee 100644 --- a/scrapegraphai/nodes/search_link_node.py +++ b/scrapegraphai/nodes/search_link_node.py @@ -5,7 +5,7 @@ # Imports from standard library from typing import List, Optional from tqdm import tqdm - +from ..utils.logging import get_logger # Imports from Langchain from langchain.prompts import PromptTemplate @@ -59,7 +59,7 @@ def execute(self, state: dict) -> dict: """ if self.verbose: - print(f"--- Executing {self.node_name} Node ---") + self.logger.info(f"--- Executing {self.node_name} Node ---") # Interpret input keys based on the provided input expression input_keys = self.get_input_keys(state) diff --git a/scrapegraphai/nodes/text_to_speech_node.py b/scrapegraphai/nodes/text_to_speech_node.py index d9fe7ca4..497b2501 100644 --- a/scrapegraphai/nodes/text_to_speech_node.py +++ b/scrapegraphai/nodes/text_to_speech_node.py @@ -4,7 +4,7 @@ from typing import List, Optional from .base_node import BaseNode - +from ..utils.logging import get_logger class TextToSpeechNode(BaseNode): """ @@ -45,7 +45,7 @@ def execute(self, state: dict) -> dict: """ if self.verbose: - print(f"--- Executing {self.node_name} Node ---") + self.logger.info(f"--- Executing {self.node_name} Node ---") # Interpret input keys based on the provided input expression input_keys = self.get_input_keys(state) diff --git a/scrapegraphai/utils/__init__.py b/scrapegraphai/utils/__init__.py index 72a8b96c..ee647466 100644 --- a/scrapegraphai/utils/__init__.py +++ b/scrapegraphai/utils/__init__.py @@ -9,3 +9,4 @@ from .save_audio_from_bytes import save_audio_from_bytes from .sys_dynamic_import import dynamic_import, srcfile_import from .cleanup_html import cleanup_html +from .logging import * \ No newline at end of file diff --git a/scrapegraphai/utils/logging.py b/scrapegraphai/utils/logging.py new file mode 
100644 index 00000000..428fb8a7 --- /dev/null +++ b/scrapegraphai/utils/logging.py @@ -0,0 +1,137 @@ +"""A centralized logging system for any library + +source code inspired by https://github.com/huggingface/transformers/blob/main/src/transformers/utils/logging.py +""" +import logging +import os +import sys +import threading +from functools import lru_cache + + +_library_name = __name__.split(".", maxsplit=1)[0] + +_default_handler = None +_default_logging_level = logging.WARNING + +_semaphore = threading.Lock() + + +def _get_library_root_logger() -> logging.Logger: + return logging.getLogger(_library_name) + + +def _set_library_root_logger() -> None: + global _default_handler + + with _semaphore: + if _default_handler: return + + _default_handler = logging.StreamHandler() # sys.stderr as stream + + # https://github.com/pyinstaller/pyinstaller/issues/7334#issuecomment-1357447176 + if sys.stderr is None: + sys.stderr = open(os.devnull, "w") + + _default_handler.flush = sys.stderr.flush + + library_root_logger = _get_library_root_logger() + library_root_logger.addHandler(_default_handler) + library_root_logger.setLevel(_default_logging_level) + library_root_logger.propagate = False + + +def get_logger(name: str | None = None) -> logging.Logger: + _set_library_root_logger() + return logging.getLogger(name or _library_name) + + +def get_verbosity() -> int: + _set_library_root_logger() + return _get_library_root_logger().getEffectiveLevel() + + +def set_verbosity(verbosity: int) -> None: + _set_library_root_logger() + _get_library_root_logger().setLevel(verbosity) + + +def set_verbosity_debug() -> None: + set_verbosity(logging.DEBUG) + + +def set_verbosity_info() -> None: + set_verbosity(logging.INFO) + + +def set_verbosity_warning() -> None: + set_verbosity(logging.WARNING) + + +def set_verbosity_error() -> None: + set_verbosity(logging.ERROR) + + +def set_verbosity_fatal() -> None: + set_verbosity(logging.FATAL) + + +def set_handler(handler: logging.Handler) -> None: 
+ _set_library_root_logger() + + assert handler is not None + + _get_library_root_logger().addHandler(handler) + + +def set_default_handler() -> None: + set_handler(_default_handler) + + +def unset_handler(handler: logging.Handler) -> None: + _set_library_root_logger() + + assert handler is not None + + _get_library_root_logger().removeHandler(handler) + + +def unset_default_handler() -> None: + unset_handler(_default_handler) + + +def set_propagation() -> None: + _get_library_root_logger().propagate = True + + +def unset_propagation() -> None: + _get_library_root_logger().propagate = False + + +def set_formatting() -> None: + """sets formatting for all handlers bound to the root logger + + ``` + [levelname|filename|line number] time >> message + ``` + """ + formatter = logging.Formatter( + "[%(levelname)s|%(filename)s:%(lineno)s] %(asctime)s >> %(message)s" + ) + + for handler in _get_library_root_logger().handlers: + handler.setFormatter(formatter) + + +def unset_formatting() -> None: + for handler in _get_library_root_logger().handlers: + handler.setFormatter(None) + + +@lru_cache(None) +def warning_once(self, *args, **kwargs): + """emits warning logs with the same message only once""" + self.warning(*args, **kwargs) + + +logging.Logger.warning_once = warning_once \ No newline at end of file