From acece72c28f40b4de00fec792fdfa81d5eb3af6e Mon Sep 17 00:00:00 2001
From: seyf97 <111386377+seyf97@users.noreply.github.com>
Date: Tue, 4 Jun 2024 13:49:00 +0300
Subject: [PATCH] Update cleanup_html.py

Remove redundant lines in Links extraction
---
 scrapegraphai/utils/cleanup_html.py | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/scrapegraphai/utils/cleanup_html.py b/scrapegraphai/utils/cleanup_html.py
index d9398c0f..1774af20 100644
--- a/scrapegraphai/utils/cleanup_html.py
+++ b/scrapegraphai/utils/cleanup_html.py
@@ -35,11 +35,7 @@ def cleanup_html(html_content: str, base_url: str) -> str:
         tag.extract()
 
     # Links extraction
-    links = soup.find_all('a')
-    link_urls = []
-    for link in links:
-        if 'href' in link.attrs:
-            link_urls.append(urljoin(base_url, link['href']))
+    link_urls = [urljoin(base_url, link['href']) for link in soup.find_all('a', href=True)]
 
     # Images extraction
     images = soup.find_all('img')
@@ -62,4 +58,4 @@ def cleanup_html(html_content: str, base_url: str) -> str:
         # return "Title: " + title + ", Body: " + minimized_body + ", Links: " + str(link_urls) + ", Images: " + str(image_urls)
 
     # throw an error if no body content is found
-    raise ValueError("No HTML body content found, please try setting the 'headless' flag to False in the graph configuration.")
\ No newline at end of file
+    raise ValueError("No HTML body content found, please try setting the 'headless' flag to False in the graph configuration.")