Skip to content

Commit 0145b8f

Browse files
authored
Merge pull request #372 from supercoder-dev/supercoder-327
Issue 327 Resolved
2 parents b4d7532 + 5065aa0 commit 0145b8f

File tree

3 files changed

+22
-4
lines changed

3 files changed

+22
-4
lines changed

scrapegraphai/graphs/smart_scraper_graph.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -66,8 +66,10 @@ def _create_graph(self) -> BaseGraph:
             output=["doc", "link_urls", "img_urls"],
             node_config={
                 "loader_kwargs": self.config.get("loader_kwargs", {}),
+                "headless": self.config.get("headless", True)  # Ensure headless flag is passed
             }
         )
+        logging.info("FetchNode configured with headless: %s", self.config.get("headless", True))
         parse_node = ParseNode(
             input="doc",
             output=["parsed_doc"],

scrapegraphai/nodes/fetch_node.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -131,15 +131,21 @@ def execute(self, state):
             pass

         elif not source.startswith("http"):
+            self.logger.info(f"Fetching local HTML content from: {source}")
+            if not source.strip():
+                raise ValueError("No HTML body content found in the local source.")
             title, minimized_body, link_urls, image_urls = cleanup_html(source, source)
             parsed_content = f"Title: {title}, Body: {minimized_body}, Links: {link_urls}, Images: {image_urls}"
             compressed_document = [
                 Document(page_content=parsed_content, metadata={"source": "local_dir"})
             ]

         elif self.useSoup:
+            self.logger.info(f"Fetching HTML content using requests from: {source}")
             response = requests.get(source)
             if response.status_code == 200:
+                if not response.text.strip():
+                    raise ValueError("No HTML body content found in the response.")
                 title, minimized_body, link_urls, image_urls = cleanup_html(
                     response.text, source
                 )
@@ -151,6 +157,7 @@ def execute(self, state):
             )

         else:
+            self.logger.info(f"Fetching HTML content using ChromiumLoader from: {source}")
             loader_kwargs = {}

             if self.node_config is not None:
@@ -159,6 +166,9 @@ def execute(self, state):
             loader = ChromiumLoader([source], headless=self.headless, **loader_kwargs)
             document = loader.load()

+            if not document or not document[0].page_content.strip():
+                raise ValueError("No HTML body content found in the document fetched by ChromiumLoader.")
+
             title, minimized_body, link_urls, image_urls = cleanup_html(
                 str(document[0].page_content), source
             )

scrapegraphai/utils/cleanup_html.py

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,12 @@ def cleanup_html(html_content: str, base_url: str) -> str:
    This function is particularly useful for preparing HTML content for environments where bandwidth usage needs to be minimized.
    """

+    import logging
+    logging.basicConfig(level=logging.DEBUG)
+
+    # Add logging to capture the HTML content before parsing
+    logging.debug(f'HTML content before parsing: {html_content}')
+
    soup = BeautifulSoup(html_content, 'html.parser')

    # Title Extraction
@@ -53,9 +59,9 @@ def cleanup_html(html_content: str, base_url: str) -> str:
    if body_content:
        # Minify the HTML within the body tag
        minimized_body = minify(str(body_content))
-
        return title, minimized_body, link_urls, image_urls
-        # return "Title: " + title + ", Body: " + minimized_body + ", Links: " + str(link_urls) + ", Images: " + str(image_urls)

-    # throw an error if no body content is found
-    raise ValueError("No HTML body content found, please try setting the 'headless' flag to False in the graph configuration.")
+    else:
+        logging.error(f'No body content found in HTML: {html_content}')
+        raise ValueError(f"No HTML body content found, please try setting the 'headless' flag to False in the graph configuration. HTML content: {html_content}")

0 commit comments

Comments (0)