
Commit 5467630

Merge pull request #836 from aleenprd/feat_chromium_scroller
Feat: chromium scroller
2 parents 60e2fdf + c396dcf commit 5467630

1 file changed: +136 −5 lines changed

scrapegraphai/docloaders/chromium.py

Lines changed: 136 additions & 5 deletions
@@ -4,6 +4,7 @@
 from langchain_core.documents import Document
 import aiohttp
 import async_timeout
+from typing import Union
 from ..utils import Proxy, dynamic_import, get_logger, parse_or_search_proxy
 
 logger = get_logger("web-loader")
@@ -102,14 +103,144 @@ async def ascrape_undetected_chromedriver(self, url: str) -> str:
 
         return results
 
+    async def ascrape_playwright_scroll(
+        self,
+        url: str,
+        timeout: Union[int, None] = 30,
+        scroll: int = 15000,
+        sleep: float = 2,
+        scroll_to_bottom: bool = False
+    ) -> str:
+        """
+        Asynchronously scrape the content of a given URL using Playwright's async API and scrolling.
+
+        Notes:
+        - The user decides between scrolling to the bottom of the page or scrolling for a finite amount of time.
+        - If scrolling to the bottom, the scraper stops when the page height stops changing or when the timeout
+          is reached. In that case the user should pick an appropriately large timeout value.
+        - Sleep must be greater than 0 to allow lazy-loaded content to load. When used with scroll_to_bottom=True,
+          the sleep value should be set even higher, so that the scrolling actually happens and the page height
+          can change.
+        - A good website to test this on is https://www.reddit.com/, as it has infinite scrolling.
+
+        Args:
+        - url (str): The URL to scrape.
+        - timeout (Union[int, None]): The maximum time to spend scrolling, separate from the global timeout.
+          If set, it must be greater than 0. Can also be None, in which case the scraper stops only when the
+          page height stops changing.
+        - scroll (int): The number of pixels to scroll down by. Defaults to 15000 and cannot be less than 5000;
+          any less and we don't scroll far enough to see the content change.
+        - sleep (float): The number of seconds to sleep after each scroll, to allow the page to load.
+          Defaults to 2. Must be greater than 0.
+        - scroll_to_bottom (bool): Whether to keep scrolling until the page height stops changing. Defaults to False.
+
+        Returns:
+            str: The scraped HTML content.
+
+        Raises:
+        - ValueError: If the timeout value is less than or equal to 0.
+        - ValueError: If the sleep value is less than or equal to 0.
+        - ValueError: If the scroll value is less than 5000.
+        """
+        # NB: Using scrollHeight to decide when to stop scrolling was tested, but it doesn't always work
+        # as expected: the page height doesn't change on some sites, e.g. https://www.steelwood.amsterdam/,
+        # which does not scroll to the bottom. In a regular browser it scrolls vertically, yet in Chromium
+        # it scrolls horizontally.
+
+        if timeout and timeout <= 0:
+            raise ValueError("If set, timeout value for scrolling scraper must be greater than 0.")
+
+        if sleep <= 0:
+            raise ValueError("Sleep value for scrolling scraper must be greater than 0.")
+
+        if scroll < 5000:
+            raise ValueError("Scroll value for scrolling scraper must be greater than or equal to 5000.")
+
+        from playwright.async_api import async_playwright
+        from undetected_playwright import Malenia
+        import time
+
+        logger.info(f"Starting scraping with scrolling support for {url}...")
+
+        results = ""
+        attempt = 0
+
+        while attempt < self.RETRY_LIMIT:
+            try:
+                async with async_playwright() as p:
+                    browser = await p.chromium.launch(
+                        headless=self.headless, proxy=self.proxy, **self.browser_config
+                    )
+                    context = await browser.new_context()
+                    await Malenia.apply_stealth(context)
+                    page = await context.new_page()
+                    await page.goto(url, wait_until="domcontentloaded")
+                    await page.wait_for_load_state(self.load_state)
+
+                    previous_height = None
+                    start_time = time.time()
+
+                    # Store the heights of the page after each scroll.
+                    # This is useful when we scroll with a timer and want to stop shortly after reaching the
+                    # bottom, or simply when the page stops changing for some reason.
+                    heights = []
+
+                    while True:
+                        current_height = await page.evaluate("document.body.scrollHeight")
+                        heights.append(current_height)
+                        heights = heights[-5:]  # Keep only the last 5 heights, to not run out of memory
+
+                        # Break if we've reached the bottom of the page, i.e. if scrolling makes no more progress.
+                        # Attention: this is not always reliable. Sometimes the page might not change due to lazy
+                        # loading or other reasons. In such cases, set scroll_to_bottom=False and set a timeout.
+                        if scroll_to_bottom and previous_height == current_height:
+                            logger.info(f"Reached bottom of page for url {url}")
+                            break
+
+                        previous_height = current_height
+
+                        await page.mouse.wheel(0, scroll)
+                        logger.debug(f"Scrolled {url} to current height {current_height}px...")
+                        time.sleep(sleep)  # Allow some time for any lazy-loaded content to load
+
+                        current_time = time.time()
+                        elapsed_time = current_time - start_time
+                        logger.debug(f"Elapsed time: {elapsed_time} seconds")
+
+                        if timeout:
+                            if elapsed_time >= timeout:
+                                logger.info(f"Reached timeout of {timeout} seconds for url {url}")
+                                break
+                        elif len(heights) == 5 and len(set(heights)) == 1:
+                            logger.info(f"Page height has not changed for url {url} for the last 5 scrolls. Stopping.")
+                            break
+
+                    results = await page.content()
+                    break
+
+            except (aiohttp.ClientError, asyncio.TimeoutError, Exception) as e:
+                attempt += 1
+                logger.error(f"Attempt {attempt} failed: {e}")
+                if attempt == self.RETRY_LIMIT:
+                    results = f"Error: Network error after {self.RETRY_LIMIT} attempts - {e}"
+            finally:
+                await browser.close()
+
+        return results
+
     async def ascrape_playwright(self, url: str) -> str:
         """
         Asynchronously scrape the content of a given URL using Playwright's async API.
+
+        Args:
+            url (str): The URL to scrape.
+
+        Returns:
+            str: The scraped HTML content or an error message if an exception occurs.
         """
         from playwright.async_api import async_playwright
         from undetected_playwright import Malenia
 
         logger.info(f"Starting scraping with {self.backend}...")
+        results = ""
         attempt = 0
 
         while attempt < self.RETRY_LIMIT:
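For context, here is a minimal sketch of how the new scroller might be called once this commit lands. The loader class name (ChromiumLoader) and its constructor arguments are assumed from the surrounding module rather than shown in this diff, so treat them as illustrative:

import asyncio
from scrapegraphai.docloaders.chromium import ChromiumLoader  # class name assumed, not shown in this diff

async def main():
    # Constructor arguments are assumptions; adjust to the actual ChromiumLoader signature.
    loader = ChromiumLoader(["https://www.reddit.com/"], backend="playwright", headless=True)
    # Timed scrolling: scroll 15000 px at a time, pause 2 s between scrolls, stop after ~60 s
    # (or earlier if the page height stays unchanged for 5 consecutive scrolls).
    html = await loader.ascrape_playwright_scroll(
        "https://www.reddit.com/",
        timeout=60,
        scroll=15000,
        sleep=2,
        scroll_to_bottom=False,
    )
    print(f"Scraped {len(html)} characters")

asyncio.run(main())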
@@ -127,16 +258,16 @@ async def ascrape_playwright(self, url: str) -> str:
                     await page.wait_for_load_state(self.load_state)
                     results = await page.content()
                     logger.info("Content scraped")
-                    return results
+                    break
             except (aiohttp.ClientError, asyncio.TimeoutError, Exception) as e:
                 attempt += 1
                 logger.error(f"Attempt {attempt} failed: {e}")
                 if attempt == self.RETRY_LIMIT:
-                    raise RuntimeError(
-                        f"Failed to fetch {url} after {self.RETRY_LIMIT} attempts: {e}"
-                    )
+                    results = f"Error: Network error after {self.RETRY_LIMIT} attempts - {e}"
             finally:
-                if "browser" in locals():
+                await browser.close()
+
+        return results
 
 
     async def ascrape_with_js_support(self, url: str) -> str:
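One behavioural note on this last hunk: ascrape_playwright no longer raises RuntimeError after exhausting retries; it now returns an "Error: ..." string. A minimal sketch of how a caller might account for that, continuing main() from the example above (the prefix check mirrors the string format used in the diff; the handling itself is illustrative):

    html = await loader.ascrape_playwright("https://www.reddit.com/")
    if html.startswith("Error:"):
        # All retries failed; the message carries the last exception text instead of a raised RuntimeError.
        print(f"Scrape failed: {html}")
    else:
        print(f"Scraped {len(html)} characters")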

0 commit comments
