diff --git a/scrapegraphai/graphs/smart_scraper_graph.py b/scrapegraphai/graphs/smart_scraper_graph.py
index 9636e32d..85b292c3 100644
--- a/scrapegraphai/graphs/smart_scraper_graph.py
+++ b/scrapegraphai/graphs/smart_scraper_graph.py
@@ -66,8 +66,10 @@ def _create_graph(self) -> BaseGraph:
             output=["doc", "link_urls", "img_urls"],
             node_config={
                 "loader_kwargs": self.config.get("loader_kwargs", {}),
+                "headless": self.config.get("headless", True)  # Ensure headless flag is passed
             }
         )
+        logging.info("FetchNode configured with headless: %s", self.config.get("headless", True))
         parse_node = ParseNode(
             input="doc",
             output=["parsed_doc"],
diff --git a/scrapegraphai/nodes/fetch_node.py b/scrapegraphai/nodes/fetch_node.py
index 5d2b575f..dbdd9925 100644
--- a/scrapegraphai/nodes/fetch_node.py
+++ b/scrapegraphai/nodes/fetch_node.py
@@ -131,6 +131,9 @@ def execute(self, state):
             pass
 
         elif not source.startswith("http"):
+            self.logger.info(f"Fetching local HTML content from: {source}")
+            if not source.strip():
+                raise ValueError("No HTML body content found in the local source.")
             title, minimized_body, link_urls, image_urls = cleanup_html(source, source)
             parsed_content = f"Title: {title}, Body: {minimized_body}, Links: {link_urls}, Images: {image_urls}"
             compressed_document = [
@@ -138,8 +141,11 @@
             ]
 
         elif self.useSoup:
+            self.logger.info(f"Fetching HTML content using requests from: {source}")
             response = requests.get(source)
             if response.status_code == 200:
+                if not response.text.strip():
+                    raise ValueError("No HTML body content found in the response.")
                 title, minimized_body, link_urls, image_urls = cleanup_html(
                     response.text, source
                 )
@@ -151,6 +157,7 @@
                 )
 
         else:
+            self.logger.info(f"Fetching HTML content using ChromiumLoader from: {source}")
             loader_kwargs = {}
 
             if self.node_config is not None:
@@ -159,6 +166,9 @@
             loader = ChromiumLoader([source], headless=self.headless, **loader_kwargs)
 
             document = loader.load()
+            if not document or not document[0].page_content.strip():
+                raise ValueError("No HTML body content found in the document fetched by ChromiumLoader.")
+
             title, minimized_body, link_urls, image_urls = cleanup_html(
                 str(document[0].page_content), source
             )
diff --git a/scrapegraphai/utils/cleanup_html.py b/scrapegraphai/utils/cleanup_html.py
index 1774af20..d3b4dd48 100644
--- a/scrapegraphai/utils/cleanup_html.py
+++ b/scrapegraphai/utils/cleanup_html.py
@@ -24,6 +24,12 @@ def cleanup_html(html_content: str, base_url: str) -> str:
     This function is particularly useful for preparing HTML content
     for environments where bandwidth usage needs to be minimized.
""" + import logging + logging.basicConfig(level=logging.DEBUG) + + # Add logging to capture the HTML content before parsing + logging.debug(f'HTML content before parsing: {html_content}') + soup = BeautifulSoup(html_content, 'html.parser') # Title Extraction @@ -53,9 +59,9 @@ def cleanup_html(html_content: str, base_url: str) -> str: if body_content: # Minify the HTML within the body tag minimized_body = minify(str(body_content)) - return title, minimized_body, link_urls, image_urls - # return "Title: " + title + ", Body: " + minimized_body + ", Links: " + str(link_urls) + ", Images: " + str(image_urls) - # throw an error if no body content is found - raise ValueError("No HTML body content found, please try setting the 'headless' flag to False in the graph configuration.") + else: + logging.error(f'No body content found in HTML: {html_content}') + raise ValueError(f"No HTML body content found, please try setting the 'headless' flag to False in the graph configuration. HTML content: {html_content}") +