diff --git a/.gitignore b/.gitignore index b8ab5703..4bd66401 100644 --- a/.gitignore +++ b/.gitignore @@ -32,5 +32,4 @@ examples/graph_examples/ScrapeGraphAI_generated_graph examples/**/result.csv examples/**/result.json main.py - - \ No newline at end of file +.idea \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index 19c714e8..29d0b419 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -30,6 +30,7 @@ dependencies = [ "playwright==1.43.0", "google==3.0.0", "yahoo-search-py==0.3", + "undetected-playwright==0.3.0", ] license = "MIT" diff --git a/requirements.txt b/requirements.txt index 1e6224b4..2ccdf0d7 100644 --- a/requirements.txt +++ b/requirements.txt @@ -19,3 +19,4 @@ langchain-aws==0.1.2 langchain-anthropic==0.1.11 yahoo-search-py==0.3 pypdf==4.2.0 +undetected-playwright==0.3.0 \ No newline at end of file diff --git a/scrapegraphai/docloaders/chromium.py b/scrapegraphai/docloaders/chromium.py index 7d499245..d3581a7a 100644 --- a/scrapegraphai/docloaders/chromium.py +++ b/scrapegraphai/docloaders/chromium.py @@ -69,6 +69,7 @@ async def ascrape_playwright(self, url: str) -> str: """ from playwright.async_api import async_playwright + from undetected_playwright import Malenia logger.info("Starting scraping...") results = "" @@ -77,7 +78,9 @@ async def ascrape_playwright(self, url: str) -> str: headless=self.headless, proxy=self.proxy, **self.browser_config ) try: - page = await browser.new_page() + context = await browser.new_context() + await Malenia.apply_stealth(context) + page = await context.new_page() await page.goto(url) results = await page.content() # Simply get the HTML content logger.info("Content scraped") diff --git a/scrapegraphai/helpers/models_tokens.py b/scrapegraphai/helpers/models_tokens.py index f8881d75..f1780049 100644 --- a/scrapegraphai/helpers/models_tokens.py +++ b/scrapegraphai/helpers/models_tokens.py @@ -27,6 +27,8 @@ }, "gemini": { "gemini-pro": 128000, + "gemini-1.5-flash-latest":128000, + "gemini-1.5-pro-latest":128000, "models/embedding-001": 2048 }, @@ -49,6 +51,7 @@ "dbrx": 32768, "dbrx:instruct": 32768, "nous-hermes2:34b": 4096, + "orca-mini": 2048, # embedding models "nomic-embed-text": 8192, "snowflake-arctic-embed:335m": 8192,