From 38c6dd2aa1ce31b981eb8c35a56e9533d19df81b Mon Sep 17 00:00:00 2001 From: Marco Vinciguerra Date: Mon, 4 Nov 2024 09:21:29 +0100 Subject: [PATCH] feat: update chromium --- scrapegraphai/docloaders/chromium.py | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/scrapegraphai/docloaders/chromium.py b/scrapegraphai/docloaders/chromium.py index 48058436..cf784e95 100644 --- a/scrapegraphai/docloaders/chromium.py +++ b/scrapegraphai/docloaders/chromium.py @@ -1,6 +1,3 @@ -""" -chromiumloader module -""" import asyncio from typing import Any, AsyncIterator, Iterator, List, Optional from langchain_community.document_loaders.base import BaseLoader @@ -12,15 +9,16 @@ logger = get_logger("web-loader") class ChromiumLoader(BaseLoader): - """scrapes HTML pages from URLs using a (headless) instance of the - Chromium web driver with proxy protection + """Scrapes HTML pages from URLs using a (headless) instance of the + Chromium web driver with proxy protection. Attributes: backend: The web driver backend library; defaults to 'playwright'. browser_config: A dictionary containing additional browser kwargs. - headless: whether to run browser in headless mode. + headless: Whether to run browser in headless mode. proxy: A dictionary containing proxy settings; None disables protection. urls: A list of URLs to scrape content from. + requires_js_support: Flag to determine if JS rendering is required. """ RETRY_LIMIT = 3 @@ -34,15 +32,17 @@ def __init__( headless: bool = True, proxy: Optional[Proxy] = None, load_state: str = "domcontentloaded", + requires_js_support: bool = False, **kwargs: Any, ): """Initialize the loader with a list of URL paths. Args: backend: The web driver backend library; defaults to 'playwright'. - headless: whether to run browser in headless mode. + headless: Whether to run browser in headless mode. proxy: A dictionary containing proxy information; None disables protection. urls: A list of URLs to scrape content from. + requires_js_support: Whether to use JS rendering for scraping. kwargs: A dictionary containing additional browser kwargs. Raises: @@ -61,6 +61,7 @@ def __init__( self.proxy = parse_or_search_proxy(proxy) if proxy else None self.urls = urls self.load_state = load_state + self.requires_js_support = requires_js_support async def ascrape_undetected_chromedriver(self, url: str) -> str: """ @@ -186,7 +187,9 @@ def lazy_load(self) -> Iterator[Document]: Yields: Document: The scraped content encapsulated within a Document object. """ - scraping_fn = getattr(self, f"ascrape_{self.backend}") + scraping_fn = ( + self.ascrape_with_js_support if self.requires_js_support else getattr(self, f"ascrape_{self.backend}") + ) for url in self.urls: html_content = asyncio.run(scraping_fn(url)) @@ -206,7 +209,9 @@ async def alazy_load(self) -> AsyncIterator[Document]: Document: A Document object containing the scraped content, along with its source URL as metadata. """ - scraping_fn = getattr(self, f"ascrape_{self.backend}") + scraping_fn = ( + self.ascrape_with_js_support if self.requires_js_support else getattr(self, f"ascrape_{self.backend}") + ) tasks = [scraping_fn(url) for url in self.urls] results = await asyncio.gather(*tasks)