From acece72c28f40b4de00fec792fdfa81d5eb3af6e Mon Sep 17 00:00:00 2001
From: seyf97 <111386377+seyf97@users.noreply.github.com>
Date: Tue, 4 Jun 2024 13:49:00 +0300
Subject: [PATCH] Update cleanup_html.py

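Simplify links extraction: replace the explicit loop and the
`'href' in link.attrs` check with a single list comprehension over
`soup.find_all('a', href=True)`. Also add the missing newline at the
end of the file.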
---
 scrapegraphai/utils/cleanup_html.py | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/scrapegraphai/utils/cleanup_html.py b/scrapegraphai/utils/cleanup_html.py
index d9398c0f..1774af20 100644
--- a/scrapegraphai/utils/cleanup_html.py
+++ b/scrapegraphai/utils/cleanup_html.py
@@ -35,11 +35,7 @@ def cleanup_html(html_content: str, base_url: str) -> str:
         tag.extract()
 
     # Links extraction
-    links = soup.find_all('a')
-    link_urls = []
-    for link in links:
-        if 'href' in link.attrs:
-            link_urls.append(urljoin(base_url, link['href']))
+    link_urls = [urljoin(base_url, link['href']) for link in soup.find_all('a', href=True)]
 
     # Images extraction
     images = soup.find_all('img')
@@ -62,4 +58,4 @@ def cleanup_html(html_content: str, base_url: str) -> str:
         # return "Title: " + title + ", Body: " + minimized_body + ", Links: " + str(link_urls) + ", Images: " + str(image_urls)
 
     # throw an error if no body content is found
-    raise ValueError("No HTML body content found, please try setting the 'headless' flag to False in the graph configuration.")
\ No newline at end of file
+    raise ValueError("No HTML body content found, please try setting the 'headless' flag to False in the graph configuration.")