From 81f89d88be0c829efe27e6f5dfcf2231c286a88e Mon Sep 17 00:00:00 2001
From: Marco Vinciguerra <88108002+VinciGit00@users.noreply.github.com>
Date: Mon, 10 Jun 2024 16:14:24 +0200
Subject: [PATCH 1/2] Update search_link_node.py

---
 scrapegraphai/nodes/search_link_node.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/scrapegraphai/nodes/search_link_node.py b/scrapegraphai/nodes/search_link_node.py
index 34886b24..2a0c5f18 100644
--- a/scrapegraphai/nodes/search_link_node.py
+++ b/scrapegraphai/nodes/search_link_node.py
@@ -83,6 +83,9 @@ def execute(self, state: dict) -> dict:
 
         Assume relevance broadly, including any links that might be related or potentially
         useful in relation to the task.
+
+        Sort the links in order of importance: the first one should be the most important
+        and the last one the least important.
 
         Please list only valid URLs and make sure to err on the side of inclusion if it's
         uncertain whether the content at the link is directly relevant.

From 8f405ff87a986dfa198fedc055e33675b718633d Mon Sep 17 00:00:00 2001
From: Steven Thomas
Date: Tue, 11 Jun 2024 11:22:39 -0400
Subject: [PATCH 2/2] Add the ability to specify load state

---
 scrapegraphai/docloaders/chromium.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/scrapegraphai/docloaders/chromium.py b/scrapegraphai/docloaders/chromium.py
index f22a3fe6..579933e6 100644
--- a/scrapegraphai/docloaders/chromium.py
+++ b/scrapegraphai/docloaders/chromium.py
@@ -29,6 +29,7 @@ def __init__(
         backend: str = "playwright",
         headless: bool = True,
         proxy: Optional[Proxy] = None,
+        load_state: str = "domcontentloaded",
         **kwargs: Any,
     ):
         """Initialize the loader with a list of URL paths.
@@ -55,6 +56,7 @@ def __init__(
         self.headless = headless
         self.proxy = parse_or_search_proxy(proxy) if proxy else None
         self.urls = urls
+        self.load_state = load_state
 
     async def ascrape_playwright(self, url: str) -> str:
         """
@@ -81,6 +83,7 @@ async def ascrape_playwright(self, url: str) -> str:
             await Malenia.apply_stealth(context)
             page = await context.new_page()
             await page.goto(url)
+            await page.wait_for_load_state(self.load_state)
             results = await page.content()  # Simply get the HTML content
             logger.info("Content scraped")
         except Exception as e:
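
Note (illustrative, not part of the patches): a minimal usage sketch of the new
load_state option added in PATCH 2/2. It assumes the loader class is ChromiumLoader,
that urls is the first constructor argument (both consistent with the diff above),
and that the backend is Playwright, whose wait_for_load_state accepts "load",
"domcontentloaded", and "networkidle".

    import asyncio

    from scrapegraphai.docloaders.chromium import ChromiumLoader

    # "networkidle" waits until network traffic has settled, which helps on
    # JavaScript-heavy pages; the patch's default is "domcontentloaded", and
    # "load" is the third value Playwright accepts.
    loader = ChromiumLoader(
        ["https://example.com"],
        backend="playwright",
        headless=True,
        load_state="networkidle",
    )

    html = asyncio.run(loader.ascrape_playwright("https://example.com"))
    print(html[:200])

Because ascrape_playwright calls page.wait_for_load_state(self.load_state) after
page.goto(url), the stricter the chosen state, the later page.content() runs, so
callers trade scrape speed for more fully rendered HTML.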