diff --git a/scrapegraphai/utils/cleanup_html.py b/scrapegraphai/utils/cleanup_html.py
index d9398c0f..1774af20 100644
--- a/scrapegraphai/utils/cleanup_html.py
+++ b/scrapegraphai/utils/cleanup_html.py
@@ -35,11 +35,7 @@ def cleanup_html(html_content: str, base_url: str) -> str:
tag.extract()
# Links extraction
- links = soup.find_all('a')
- link_urls = []
- for link in links:
- if 'href' in link.attrs:
- link_urls.append(urljoin(base_url, link['href']))
+ link_urls = [urljoin(base_url, link['href']) for link in soup.find_all('a', href=True)]
# Images extraction
images = soup.find_all('img')
@@ -62,4 +58,4 @@ def cleanup_html(html_content: str, base_url: str) -> str:
# return "Title: " + title + ", Body: " + minimized_body + ", Links: " + str(link_urls) + ", Images: " + str(image_urls)
# throw an error if no body content is found
- raise ValueError("No HTML body content found, please try setting the 'headless' flag to False in the graph configuration.")
\ No newline at end of file
+ raise ValueError("No HTML body content found, please try setting the 'headless' flag to False in the graph configuration.")