From a0125b7e900964a6487114f0158e39cb2e023b97 Mon Sep 17 00:00:00 2001
From: Marco Vinciguerra
Date: Sat, 18 May 2024 09:31:40 +0200
Subject: [PATCH 1/3] add orca mini integration

---
 scrapegraphai/helpers/models_tokens.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/scrapegraphai/helpers/models_tokens.py b/scrapegraphai/helpers/models_tokens.py
index f8881d75..16a42c17 100644
--- a/scrapegraphai/helpers/models_tokens.py
+++ b/scrapegraphai/helpers/models_tokens.py
@@ -49,6 +49,7 @@
     "dbrx": 32768,
     "dbrx:instruct": 32768,
     "nous-hermes2:34b": 4096,
+    "orca-mini": 2048,
     # embedding models
     "nomic-embed-text": 8192,
     "snowflake-arctic-embed:335m": 8192,

From 7f30da99308968f7b06a432aafab8d8a1365ffb9 Mon Sep 17 00:00:00 2001
From: Mobin Chowdhury <47663360+MobinX@users.noreply.github.com>
Date: Sat, 18 May 2024 23:32:27 +0600
Subject: [PATCH 2/3] Update models_tokens.py

---
 scrapegraphai/helpers/models_tokens.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/scrapegraphai/helpers/models_tokens.py b/scrapegraphai/helpers/models_tokens.py
index 16a42c17..f1780049 100644
--- a/scrapegraphai/helpers/models_tokens.py
+++ b/scrapegraphai/helpers/models_tokens.py
@@ -27,6 +27,8 @@
     },
     "gemini": {
         "gemini-pro": 128000,
+        "gemini-1.5-flash-latest":128000,
+        "gemini-1.5-pro-latest":128000,
         "models/embedding-001": 2048
     },

From 7b3ee4e71e4af04edeb47999d70d398b67c93ac4 Mon Sep 17 00:00:00 2001
From: QIN2DIM <62018067+QIN2DIM@users.noreply.github.com>
Date: Sun, 19 May 2024 18:01:03 +0800
Subject: [PATCH 3/3] feat(docloaders): undetected-playwright

---
 .gitignore                           | 3 +--
 pyproject.toml                       | 1 +
 requirements.txt                     | 1 +
 scrapegraphai/docloaders/chromium.py | 5 ++++-
 4 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/.gitignore b/.gitignore
index b8ab5703..4bd66401 100644
--- a/.gitignore
+++ b/.gitignore
@@ -32,5 +32,4 @@ examples/graph_examples/ScrapeGraphAI_generated_graph
 examples/**/result.csv
 examples/**/result.json
 main.py
-
-
\ No newline at end of file
+.idea
\ No newline at end of file

diff --git a/pyproject.toml b/pyproject.toml
index 19c714e8..29d0b419 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -30,6 +30,7 @@ dependencies = [
     "playwright==1.43.0",
     "google==3.0.0",
     "yahoo-search-py==0.3",
+    "undetected-playwright==0.3.0",
 ]

 license = "MIT"

diff --git a/requirements.txt b/requirements.txt
index 1e6224b4..2ccdf0d7 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -19,3 +19,4 @@ langchain-aws==0.1.2
 langchain-anthropic==0.1.11
 yahoo-search-py==0.3
 pypdf==4.2.0
+undetected-playwright==0.3.0
\ No newline at end of file

diff --git a/scrapegraphai/docloaders/chromium.py b/scrapegraphai/docloaders/chromium.py
index 7d499245..d3581a7a 100644
--- a/scrapegraphai/docloaders/chromium.py
+++ b/scrapegraphai/docloaders/chromium.py
@@ -69,6 +69,7 @@ async def ascrape_playwright(self, url: str) -> str:
         """

         from playwright.async_api import async_playwright
+        from undetected_playwright import Malenia

         logger.info("Starting scraping...")
         results = ""
@@ -77,7 +78,9 @@
                 headless=self.headless, proxy=self.proxy, **self.browser_config
             )
             try:
-                page = await browser.new_page()
+                context = await browser.new_context()
+                await Malenia.apply_stealth(context)
+                page = await context.new_page()
                 await page.goto(url)
                 results = await page.content()  # Simply get the HTML content
                 logger.info("Content scraped")
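
For reference, a minimal standalone sketch of the stealth-enabled scrape introduced in PATCH 3/3. It assumes undetected-playwright==0.3.0 exposes `Malenia.apply_stealth(context)` exactly as used in the diff; the `scrape` helper and the example URL are hypothetical, not part of the patch.

```python
import asyncio

from playwright.async_api import async_playwright
from undetected_playwright import Malenia  # stealth patches, as imported in the diff


async def scrape(url: str) -> str:
    """Fetch rendered HTML through a stealth-patched Chromium context."""
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        try:
            # Mirror the change in ChromiumLoader.ascrape_playwright:
            # apply stealth to the browser context, then open pages from it.
            context = await browser.new_context()
            await Malenia.apply_stealth(context)
            page = await context.new_page()
            await page.goto(url)
            return await page.content()  # rendered HTML
        finally:
            await browser.close()


if __name__ == "__main__":
    html = asyncio.run(scrape("https://example.com"))  # hypothetical target URL
    print(len(html))
```

Applying stealth at the context level (rather than per page) means every page opened from that context inherits the patches, which is why the diff swaps `browser.new_page()` for `context.new_page()`.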