from langchain_core.documents import Document
import aiohttp
import async_timeout
+ from typing import Union
from ..utils import Proxy, dynamic_import, get_logger, parse_or_search_proxy

logger = get_logger("web-loader")
@@ -102,14 +103,144 @@ async def ascrape_undetected_chromedriver(self, url: str) -> str:

        return results

+     async def ascrape_playwright_scroll(
+         self,
+         url: str,
+         timeout: Union[int, None] = 30,
+         scroll: int = 15000,
+         sleep: float = 2,
+         scroll_to_bottom: bool = False
+     ) -> str:
+         """
+         Asynchronously scrape the content of a given URL using Playwright's async API, with scrolling.
+
+         Notes:
+         - The user gets to decide between scrolling to the bottom of the page or scrolling for a finite amount of time.
+         - If the user chooses to scroll to the bottom, the scraper stops when the page height stops changing or when
+           the timeout is reached. In that case, the user should opt for a larger-than-usual timeout value.
+         - Sleep must be set to a value greater than 0 to allow lazy-loaded content to load.
+           Additionally, if used with scroll_to_bottom=True, the sleep value should be set higher, to
+           make sure that the scrolling actually happens, thereby allowing the page height to change.
+         - Probably the best website to test this on is https://www.reddit.com/ as it has infinite scrolling.
+
+         Args:
+         - url (str): The URL to scrape.
+         - timeout (Union[int, None]): The maximum time in seconds to spend scrolling. This is separate from the global timeout. If set, it must be greater than 0.
+           Can also be set to None, in which case the scraper only stops when the page height stops changing.
+         - scroll (int): The number of pixels to scroll down by. Defaults to 15000. Cannot be less than 5000 pixels;
+           less than that and we don't scroll enough to see any content change.
+         - sleep (float): The number of seconds to sleep after each scroll, to allow the page to load.
+           Defaults to 2. Must be greater than 0.
+         - scroll_to_bottom (bool): Whether to keep scrolling until the page height stops changing. Defaults to False.
+
+         Returns:
+             str: The scraped HTML content.
+
+         Raises:
+         - ValueError: If the timeout value is less than or equal to 0.
+         - ValueError: If the sleep value is less than or equal to 0.
+         - ValueError: If the scroll value is less than 5000.
+         """
+         # NB: I have tested using scrollHeight to determine when to stop scrolling,
+         # but it doesn't always work as expected. The page height doesn't change on some sites like
+         # https://www.steelwood.amsterdam/, and the site does not scroll to the bottom.
+         # In my browser I can scroll vertically, but in Chromium it scrolls horizontally?!?
+
+         if timeout is not None and timeout <= 0:
+             raise ValueError("If set, timeout value for scrolling scraper must be greater than 0.")
+
+         if sleep <= 0:
+             raise ValueError("Sleep value for scrolling scraper must be greater than 0.")
+
+         if scroll < 5000:
+             raise ValueError("Scroll value for scrolling scraper must be greater than or equal to 5000.")
+
+         from playwright.async_api import async_playwright
+         from undetected_playwright import Malenia
+         import time
+
+         logger.info(f"Starting scraping with scrolling support for {url}...")
+
+         results = ""
+         attempt = 0
+
+         while attempt < self.RETRY_LIMIT:
+             try:
+                 async with async_playwright() as p:
+                     browser = await p.chromium.launch(
+                         headless=self.headless, proxy=self.proxy, **self.browser_config
+                     )
+                     context = await browser.new_context()
+                     await Malenia.apply_stealth(context)
+                     page = await context.new_page()
+                     await page.goto(url, wait_until="domcontentloaded")
+                     await page.wait_for_load_state(self.load_state)
+
+                     previous_height = None
+                     start_time = time.time()
+
+                     # Store the page height after each scroll. This is useful when we scroll
+                     # with a timer and want to stop shortly after reaching the bottom,
+                     # or simply when the page stops changing for some reason.
+                     heights = []
+
+                     while True:
+                         current_height = await page.evaluate("document.body.scrollHeight")
+                         heights.append(current_height)
+                         heights = heights[-5:]  # Keep only the last 5 heights so the list does not grow unbounded
+
+                         # Break if we've reached the bottom of the page, i.e. if scrolling makes no more progress.
+                         # Attention!!! This is not always reliable. Sometimes the page might not change due to lazy loading
+                         # or other reasons. In such cases, the user should set scroll_to_bottom=False and set a timeout.
+                         if scroll_to_bottom and previous_height == current_height:
+                             logger.info(f"Reached bottom of page for url {url}")
+                             break
+
+                         previous_height = current_height
+
+                         await page.mouse.wheel(0, scroll)
+                         logger.debug(f"Scrolled {url} to current height {current_height}px...")
+                         await asyncio.sleep(sleep)  # Allow lazy-loaded content to load without blocking the event loop
+
+                         current_time = time.time()
+                         elapsed_time = current_time - start_time
+                         logger.debug(f"Elapsed time: {elapsed_time} seconds")
+
+                         if timeout:
+                             if elapsed_time >= timeout:
+                                 logger.info(f"Reached timeout of {timeout} seconds for url {url}")
+                                 break
+                             elif len(heights) == 5 and len(set(heights)) == 1:
+                                 logger.info(f"Page height has not changed for url {url} for the last 5 scrolls. Stopping.")
+                                 break
+
+                     results = await page.content()
+                     break
+
+             except (aiohttp.ClientError, asyncio.TimeoutError, Exception) as e:
+                 attempt += 1
+                 logger.error(f"Attempt {attempt} failed: {e}")
+                 if attempt == self.RETRY_LIMIT:
+                     results = f"Error: Network error after {self.RETRY_LIMIT} attempts - {e}"
+             finally:
+                 await browser.close()
+
+         return results
+
    async def ascrape_playwright(self, url: str) -> str:
        """
        Asynchronously scrape the content of a given URL using Playwright's async API.
+
+         Args:
+             url (str): The URL to scrape.
+
+         Returns:
+             str: The scraped HTML content or an error message if an exception occurs.
        """
        from playwright.async_api import async_playwright
        from undetected_playwright import Malenia

        logger.info(f"Starting scraping with {self.backend}...")
+         results = ""
        attempt = 0

        while attempt < self.RETRY_LIMIT:
@@ -127,16 +258,16 @@ async def ascrape_playwright(self, url: str) -> str:
                    await page.wait_for_load_state(self.load_state)
                    results = await page.content()
                    logger.info("Content scraped")
-                     return results
+                     break
            except (aiohttp.ClientError, asyncio.TimeoutError, Exception) as e:
                attempt += 1
                logger.error(f"Attempt {attempt} failed: {e}")
                if attempt == self.RETRY_LIMIT:
-                     raise RuntimeError(
-                         f"Failed to fetch {url} after {self.RETRY_LIMIT} attempts: {e}"
-                     )
+                     results = f"Error: Network error after {self.RETRY_LIMIT} attempts - {e}"
            finally:
-                 if "browser" in locals():
+                 await browser.close()
+
+         return results


    async def ascrape_with_js_support(self, url: str) -> str:
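
For anyone who wants to try the new method, here is a minimal usage sketch. It assumes these methods live on the library's Chromium-based loader class (called `ChromiumLoader` below; the class name, import path, and constructor arguments are assumptions, not confirmed by this diff):

```python
import asyncio

# Hypothetical usage sketch: `ChromiumLoader` and its import path are assumed,
# not confirmed by this diff. Adjust to the actual class holding these methods.
from scrapegraphai.docloaders import ChromiumLoader

async def main():
    loader = ChromiumLoader(["https://www.reddit.com/"], headless=True)
    # Scroll for at most 60 s, pausing 2 s after each 15000 px wheel step;
    # stop early if the page height plateaus for five consecutive scrolls.
    html = await loader.ascrape_playwright_scroll(
        "https://www.reddit.com/",
        timeout=60,
        scroll=15000,
        sleep=2,
        scroll_to_bottom=False,
    )
    print(f"Scraped {len(html)} characters")

asyncio.run(main())
```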
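The stopping rule in the timed branch (give up once the last five measured heights are identical) can be tested in isolation; here is a minimal self-contained sketch of just that heuristic:

```python
# Minimal sketch of the plateau heuristic used in the timed-scroll branch:
# keep a sliding window of the last 5 page heights and stop once they are
# all identical.
WINDOW = 5

def should_stop(heights: list[int], new_height: int) -> bool:
    heights.append(new_height)
    del heights[:-WINDOW]  # keep only the last WINDOW samples
    return len(heights) == WINDOW and len(set(heights)) == 1

heights: list[int] = []
for h in [1000, 2400, 3100, 3100, 3100, 3100, 3100]:
    if should_stop(heights, h):
        print(f"height plateaued at {h}px")
        break
```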
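One caveat on the `finally: await browser.close()` change in both methods: if `p.chromium.launch()` itself raises, `browser` is never bound and the `finally` block raises `NameError` instead of the original error. A defensive sketch (not what this patch does) binds the name before the try block:

```python
import asyncio
from playwright.async_api import async_playwright

# Sketch of a cleanup pattern that survives a failed launch(): bind `browser`
# before the try block so the finally clause never hits an unbound name.
async def fetch(url: str) -> str:
    async with async_playwright() as p:
        browser = None
        try:
            browser = await p.chromium.launch(headless=True)
            page = await browser.new_page()
            await page.goto(url, wait_until="domcontentloaded")
            return await page.content()
        finally:
            if browser is not None:
                await browser.close()

print(asyncio.run(fetch("https://example.com"))[:120])
```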