Skip to content

Commit 0145b8f

Browse files
authored
Merge pull request #372 from supercoder-dev/supercoder-327
Issue 327 Resolved
2 parents b4d7532 + 5065aa0 commit 0145b8f

File tree

3 files changed

+22
-4
lines changed

3 files changed

+22
-4
lines changed

scrapegraphai/graphs/smart_scraper_graph.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -66,8 +66,10 @@ def _create_graph(self) -> BaseGraph:
             output=["doc", "link_urls", "img_urls"],
             node_config={
                 "loader_kwargs": self.config.get("loader_kwargs", {}),
+                "headless": self.config.get("headless", True)  # Ensure headless flag is passed
             }
         )
+        logging.info("FetchNode configured with headless: %s", self.config.get("headless", True))
         parse_node = ParseNode(
             input="doc",
             output=["parsed_doc"],

scrapegraphai/nodes/fetch_node.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -131,15 +131,21 @@ def execute(self, state):
             pass

         elif not source.startswith("http"):
+            self.logger.info(f"Fetching local HTML content from: {source}")
+            if not source.strip():
+                raise ValueError("No HTML body content found in the local source.")
             title, minimized_body, link_urls, image_urls = cleanup_html(source, source)
             parsed_content = f"Title: {title}, Body: {minimized_body}, Links: {link_urls}, Images: {image_urls}"
             compressed_document = [
                 Document(page_content=parsed_content, metadata={"source": "local_dir"})
             ]

         elif self.useSoup:
+            self.logger.info(f"Fetching HTML content using requests from: {source}")
             response = requests.get(source)
             if response.status_code == 200:
+                if not response.text.strip():
+                    raise ValueError("No HTML body content found in the response.")
                 title, minimized_body, link_urls, image_urls = cleanup_html(
                     response.text, source
                 )
@@ -151,6 +157,7 @@ def execute(self, state):
             )

         else:
+            self.logger.info(f"Fetching HTML content using ChromiumLoader from: {source}")
             loader_kwargs = {}

             if self.node_config is not None:
@@ -159,6 +166,9 @@ def execute(self, state):
             loader = ChromiumLoader([source], headless=self.headless, **loader_kwargs)
             document = loader.load()

+            if not document or not document[0].page_content.strip():
+                raise ValueError("No HTML body content found in the document fetched by ChromiumLoader.")
+
             title, minimized_body, link_urls, image_urls = cleanup_html(
                 str(document[0].page_content), source
             )

scrapegraphai/utils/cleanup_html.py

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,12 @@ def cleanup_html(html_content: str, base_url: str) -> str:
    This function is particularly useful for preparing HTML content for environments where bandwidth usage needs to be minimized.
    """

+    import logging
+    logging.basicConfig(level=logging.DEBUG)
+
+    # Add logging to capture the HTML content before parsing
+    logging.debug(f'HTML content before parsing: {html_content}')
+
    soup = BeautifulSoup(html_content, 'html.parser')

    # Title Extraction
@@ -53,9 +59,9 @@ def cleanup_html(html_content: str, base_url: str) -> str:
    if body_content:
        # Minify the HTML within the body tag
        minimized_body = minify(str(body_content))
-
        return title, minimized_body, link_urls, image_urls
-        # return "Title: " + title + ", Body: " + minimized_body + ", Links: " + str(link_urls) + ", Images: " + str(image_urls)

-    # throw an error if no body content is found
-    raise ValueError("No HTML body content found, please try setting the 'headless' flag to False in the graph configuration.")
+    else:
+        logging.error(f'No body content found in HTML: {html_content}')
+        raise ValueError(f"No HTML body content found, please try setting the 'headless' flag to False in the graph configuration. HTML content: {html_content}")

0 commit comments

Comments (0)