diff --git a/scrapegraphai/docloaders/chromium.py b/scrapegraphai/docloaders/chromium.py
index f22a3fe6..579933e6 100644
--- a/scrapegraphai/docloaders/chromium.py
+++ b/scrapegraphai/docloaders/chromium.py
@@ -29,6 +29,7 @@ def __init__(
         backend: str = "playwright",
         headless: bool = True,
         proxy: Optional[Proxy] = None,
+        load_state: str = "domcontentloaded",
         **kwargs: Any,
     ):
         """Initialize the loader with a list of URL paths.
@@ -55,6 +56,7 @@ def __init__(
         self.headless = headless
         self.proxy = parse_or_search_proxy(proxy) if proxy else None
         self.urls = urls
+        self.load_state = load_state
 
     async def ascrape_playwright(self, url: str) -> str:
         """
@@ -81,6 +83,7 @@ async def ascrape_playwright(self, url: str) -> str:
                 await Malenia.apply_stealth(context)
                 page = await context.new_page()
                 await page.goto(url)
+                await page.wait_for_load_state(self.load_state)
                 results = await page.content()  # Simply get the HTML content
                 logger.info("Content scraped")
             except Exception as e:
diff --git a/scrapegraphai/nodes/search_link_node.py b/scrapegraphai/nodes/search_link_node.py
index 34886b24..2a0c5f18 100644
--- a/scrapegraphai/nodes/search_link_node.py
+++ b/scrapegraphai/nodes/search_link_node.py
@@ -83,6 +83,9 @@ def execute(self, state: dict) -> dict:
 
             Assume relevance broadly, including any links that might be related or
             potentially useful in relation to the task.
+
+            Sort the links in order of importance: the first one should be the most
+            important and the last one the least important.
 
             Please list only valid URLs and make sure to err on the side of inclusion if it's
             uncertain whether the content at the link is directly relevant.
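
Usage note (not part of the diff): Playwright's page.wait_for_load_state() accepts "load", "domcontentloaded", or "networkidle", so callers can now trade scrape speed for completeness on JavaScript-heavy pages. The default of "domcontentloaded" keeps the previous behaviour, since page.goto() already waits for at least that state before returning. Below is a minimal sketch of how the new parameter might be exercised; it assumes the enclosing class is the ChromiumLoader exported from this module, that its constructor keeps the shape shown in the hunk context, and that ascrape_playwright() is called directly. The URL is a placeholder.

import asyncio

from scrapegraphai.docloaders.chromium import ChromiumLoader

# Wait for network traffic to settle before capturing the HTML of a
# JavaScript-heavy page; the default "domcontentloaded" returns sooner.
loader = ChromiumLoader(
    ["https://example.com"],       # placeholder URL list
    backend="playwright",
    headless=True,
    load_state="networkidle",      # new parameter introduced by this diff
)

html = asyncio.run(loader.ascrape_playwright("https://example.com"))
print(len(html))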