diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
index 4e6b9ee6..0ad43347 100644
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -14,40 +14,55 @@ jobs:
         run: |
           sudo apt update
           sudo apt install -y git
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.10'
+
       - name: Install uv
         uses: astral-sh/setup-uv@v3
+
       - name: Install Node Env
         uses: actions/setup-node@v4
         with:
           node-version: 20
+
       - name: Checkout
         uses: actions/checkout@v4.1.1
         with:
           fetch-depth: 0
           persist-credentials: false
-      - name: Build app
+
+      - name: Build and validate package
         run: |
+          uv venv
+          . .venv/bin/activate
+          uv pip install --upgrade setuptools wheel hatchling
           uv sync --frozen
+          uv pip install -e .
           uv build
-        id: build_cache
-        if: success()
+          uv pip install --upgrade pkginfo==1.12.0 twine==6.0.1  # upgrade pkginfo and install twine
+          python -m twine check dist/*
+
+      - name: Debug Dist Directory
+        run: ls -al dist
+
       - name: Cache build
-        uses: actions/cache@v2
+        uses: actions/cache@v3
         with:
           path: ./dist
-          key: ${{ runner.os }}-build-${{ hashFiles('dist/**') }}
-        if: steps.build_cache.outputs.id != ''
+          key: ${{ runner.os }}-build-${{ github.sha }}
 
   release:
     name: Release
     runs-on: ubuntu-latest
     needs: build
    environment: development
-    if: |
-      github.event_name == 'push' && github.ref == 'refs/heads/main' ||
-      github.event_name == 'push' && github.ref == 'refs/heads/pre/beta' ||
-      github.event_name == 'pull_request' && github.event.action == 'closed' && github.event.pull_request.merged && github.event.pull_request.base.ref == 'main' ||
-      github.event_name == 'pull_request' && github.event.action == 'closed' && github.event.pull_request.merged && github.event.pull_request.base.ref == 'pre/beta'
+    if: >
+      (github.event_name == 'push' && (github.ref == 'refs/heads/main' || github.ref == 'refs/heads/pre/beta')) ||
+      (github.event_name == 'pull_request' && github.event.action == 'closed' && github.event.pull_request.merged &&
+      (github.event.pull_request.base.ref == 'main' || github.event.pull_request.base.ref == 'pre/beta'))
     permissions:
       contents: write
       issues: write
@@ -59,6 +74,13 @@ jobs:
       with:
         fetch-depth: 0
         persist-credentials: false
+
+      - name: Restore build artifacts
+        uses: actions/cache@v3
+        with:
+          path: ./dist
+          key: ${{ runner.os }}-build-${{ github.sha }}
+
       - name: Semantic Release
        uses: cycjimmy/semantic-release-action@v4.1.0
        with:
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 6722b19b..dfc5d288 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,17 +1,144 @@
-## [1.33.11](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.33.10...v1.33.11) (2025-01-02)
+## [1.34.0-beta.14](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.34.0-beta.13...v1.34.0-beta.14) (2025-01-03)
 
 
 ### Bug Fixes
 
+* add model tokens ([9b16cb9](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/9b16cb987fd93132d814ebd933af1565eb166331))
 * revert ([b312251](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/b312251cc56ee4c82554ecf116b5e6edd1560726))
 * revert ([bb5de58](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/bb5de581c064a1d141f849081e52987500957d1c))
+* validate URL only if the input type is a URL ([e2caee6](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/e2caee695ecce2d13aa5a82306097b1a80ba0e18))
 
 
 ### Docs
 
+* added api reference šŸ”— ([67038e1](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/67038e195224e1a721fe123ad1d5604b3592df20))
 * added official cookbook reference ([98aa74f](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/98aa74ff2d35041884130be14efdf47ca5e716df))
+* fixed missing import ([96064f2](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/96064f20ee8a849a2548f293419cf9028386c47b))
 * updated documentation reference ([fe89ae2](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/fe89ae29e6dc5f4322c25c693e2c9f6ce958d6e2))
+
+
+### CI
+
+* **release:** 1.33.10 [skip ci] ([a44b74a](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/a44b74aa6f7be7cdb4bdbebebc3b51a6d54a51e6))
+* **release:** 1.33.11 [skip ci] ([30f48b3](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/30f48b394f6eb8c7c9a1fa113bffabd2ac1ac585))
+* **release:** 1.33.9 [skip ci] ([9b6d6c0](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/9b6d6c0efb2fd1af5bf87cf61a0ba3d79876d21d))
+
+## [1.34.0-beta.13](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.34.0-beta.12...v1.34.0-beta.13) (2025-01-03)
+
+
+### Bug Fixes
+
+* bump hatchling version to 1.26.3 ([159ed32](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/159ed329d2e8fa86015df1e59a7e2ebb439c6ec0))
+
+## [1.34.0-beta.12](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.34.0-beta.11...v1.34.0-beta.12) (2025-01-02)
+
+
+### Bug Fixes
+
+* removed license for license-files ([b5acfb4](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/b5acfb414321989c45f76fad82f0d720ec889274))
+
+## [1.34.0-beta.11](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.34.0-beta.10...v1.34.0-beta.11) (2025-01-02)
+
+
+### Bug Fixes
+
+* added license-files = [ ([9150e4c](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/9150e4c95fa468afe9ddda3f1278b5037a2d0f38))
+
+## [1.34.0-beta.10](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.34.0-beta.9...v1.34.0-beta.10) (2025-01-02)
+
+
+### Bug Fixes
+
+* upgrade twine ([020e211](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/020e21123889c6483459e9db1c3c796cbc116140))
+
+## [1.34.0-beta.9](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.34.0-beta.8...v1.34.0-beta.9) (2025-01-02)
+
+
+### Bug Fixes
+
+* update pkginfo ([9203ab9](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/9203ab9a4ab4400105fd34433684f9ac2453f35c))
+
+## [1.34.0-beta.8](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.34.0-beta.7...v1.34.0-beta.8) (2025-01-02)
+
+
+### Bug Fixes
+
+* added twine ([df07da9](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/df07da9bcc59cbccf1c45d69e3a3e904eaed565b))
+* twine ([eb36a2b](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/eb36a2b630d62363f3c57e243f2b90cf530c0a3b))
+* uv virtual env ([fce9886](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/fce988687b3dc6fc36ce9244a8c2744f4a25d561))
+* version ([95b8990](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/95b8990a3649646972e12d78b11c7e1b7e707bf6))
+* workflow ([abe2945](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/abe29457f2380932d070bfd607c8ab5f749627c3))
+
+## [1.34.0-beta.7](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.34.0-beta.6...v1.34.0-beta.7) (2025-01-02)
+
+
+### Bug Fixes
+
+* revert to d1b2104 ([a0c0a7f](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/a0c0a7ff5c5dc9a107e7be8d5b5e1854886d411c))
+
+## [1.34.0-beta.6](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.34.0-beta.5...v1.34.0-beta.6) (2025-01-02)
+
+
+### Bug Fixes
+
+* release workflow ([a00f128](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/a00f128992e9fef88c870295c46b983b4286a3eb))
+
+## [1.34.0-beta.5](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.34.0-beta.4...v1.34.0-beta.5) (2025-01-02)
+
+
+### Bug Fixes
+
+* release workflow ([cb6d140](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/cb6d140042685bd419444d75ae7cab706cbcee38))
+* uv build ([1be6ffe](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/1be6ffe309124d55b8b3b66ded448f06dfd87b7e))
+* uv install workflow ([bcac20a](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/bcac20a7a8e65e2aa5760fb14e17b8054b4f4cf4))
+
+## [1.34.0-beta.4](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.34.0-beta.3...v1.34.0-beta.4) (2024-12-18)
+
+
+### Bug Fixes
+
+* build config ([b186a4f](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/b186a4f1c73fe29fa706158cc3c61812d6b16343))
+* build config ([46f5985](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/46f598546109067267d01ae7d8ea7609526ea4d4))
+* build config ([d2fc53f](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/d2fc53fc8414475c9bee7590144fe4251d56faf4))
+* last desperate attempt to restore automatic builds ([2538fe3](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/2538fe3db339014ef54e2c78269bce9259e284ea))
+* release config ([9cd0d31](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/9cd0d31882c22f347ebd9c58d8dd66b47d178c64))
+* release config ([62ee294](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/62ee294a864993a9414644c1547bafb96a43df20))
+* release config ([89863ee](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/89863ee166e09ee18287bfcc1b5475d894c9e8c6))
+* release config ([38e477c](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/38e477c540a3a50fc7ff6120da255d51798bfadd))
+
+## [1.34.0-beta.3](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.34.0-beta.2...v1.34.0-beta.3) (2024-12-18)
+
+
+### Bug Fixes
+
+* pyproject ([35a4907](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/35a490747cf6b8dad747a4af7f02d6f5aeb0d338))
+
+## [1.34.0-beta.2](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.34.0-beta.1...v1.34.0-beta.2) (2024-12-17)
+
+
+### Bug Fixes
+
+* context window ([ffdadae](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/ffdadaed6fe3f17da535e6eddb73851fce2f4bf2))
+* formatting ([d1b2104](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/d1b2104f28d84c5129edb29a5efdaf5bf7d22bfb))
+* pyproject ([76ac0a2](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/76ac0a2141d9d53af023a405e2c61849921e4f0e))
+* pyproject ([3dcfcd4](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/3dcfcd492e71297031a7df1dba9dd135f1fae60e))
+* pyproject ([bf6cb0a](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/bf6cb0a582004617724e11ed04ba617eb39abc0c))
+* uv.lock ([0a7fc39](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/0a7fc392dea2b62122b977d62f4d85b117fc8351))
+
+
+### CI
+
+* **release:** 1.33.3 [skip ci] ([488093a](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/488093a63fcc1dc01eabdab301d752416a025139))
+* **release:** 1.33.4 [skip ci] ([a789179](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/a78917997060edbd61df5279546587e4ef123ea1))
+* **release:** 1.33.5 [skip ci] ([7a6164f](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/7a6164f1dc6dbb8ff0b4f7fc653f3910445f0754))
+* **release:** 1.33.6 [skip ci] ([ca96c3d](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/ca96c3d4309bd2b92c87a2b0095578dda302ad92))
+* **release:** 1.33.7 [skip ci] ([7a5764e](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/7a5764e3fdbfea12b04ea0686a28025a9d89cb2f))
+* **release:** 1.33.8 [skip ci] ([bdd6a39](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/bdd6a392e2c18de8c3e4e47e2f91a4a366365ff2))
+
+
 ## [1.33.2](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.33.1...v1.33.2) (2024-12-06)
@@ -29,6 +156,7 @@
 ## [1.33.0](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.32.0...v1.33.0) (2024-12-05)
 
+
 ### Features
 
 * add api integration ([8aa9103](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/8aa9103f02af92d9e1a780450daa7bb303afc150))
diff --git a/examples/extras/chromium_selenium.py b/examples/extras/chromium_selenium.py
new file mode 100644
index 00000000..fba530d4
--- /dev/null
+++ b/examples/extras/chromium_selenium.py
@@ -0,0 +1,119 @@
+import asyncio
+import os
+import json
+from dotenv import load_dotenv
+from scrapegraphai.docloaders.chromium import ChromiumLoader  # ChromiumLoader wraps the Playwright/Selenium backends
+from scrapegraphai.graphs import SmartScraperGraph
+from scrapegraphai.utils import prettify_exec_info
+from aiohttp import ClientError
+
+# Load environment variables for API keys
+load_dotenv()
+
+# ************************************************
+# Define function to analyze content with ScrapegraphAI
+# ************************************************
+async def analyze_content_with_scrapegraph(content: str):
+    """
+    Analyze scraped content using ScrapegraphAI.
+
+    Args:
+        content (str): The scraped HTML or text content.
+
+    Returns:
+        dict: The result of the ScrapegraphAI analysis.
+    """
+    try:
+        # Initialize ScrapegraphAI SmartScraperGraph
+        smart_scraper = SmartScraperGraph(
+            prompt="Summarize the main content of this webpage and extract any contact information.",
+            source=content,  # Pass the content directly
+            config={
+                "llm": {
+                    "api_key": os.getenv("OPENAI_API_KEY"),
+                    "model": "openai/gpt-4o",
+                },
+                "verbose": True
+            }
+        )
+        result = smart_scraper.run()
+        return result
+    except Exception as e:
+        print(f"āŒ ScrapegraphAI analysis failed: {e}")
+        return {"error": str(e)}
+
+# ************************************************
+# Test scraper and ScrapegraphAI pipeline
+# ************************************************
+async def test_scraper_with_analysis(scraper: ChromiumLoader, urls: list):
+    """
+    Test the scraper for the given backend and URLs, then analyze the content with ScrapegraphAI.
+
+    Args:
+        scraper (ChromiumLoader): The ChromiumLoader instance.
+        urls (list): A list of URLs to scrape.
+    """
+    for url in urls:
+        try:
+            print(f"\nšŸ”Ž Scraping: {url} using {scraper.backend}...")
+            result = await scraper.scrape(url)
+
+            if "Error" in result or not result.strip():
+                print(f"āŒ Failed to scrape {url}: {result}")
+            else:
+                print(f"āœ… Successfully scraped {url}. Content (first 200 chars): {result[:200]}")
+
+                # Pass scraped content to ScrapegraphAI for analysis
+                print("šŸ¤– Analyzing content with ScrapegraphAI...")
+                analysis_result = await analyze_content_with_scrapegraph(result)
+                print("šŸ“ Analysis Result:")
+                print(json.dumps(analysis_result, indent=4))
+
+        except ClientError as ce:
+            print(f"āŒ Network error while scraping {url}: {ce}")
+        except Exception as e:
+            print(f"āŒ Unexpected error while scraping {url}: {e}")
+
+# ************************************************
+# Main Execution
+# ************************************************
+async def main():
+    urls_to_scrape = [
+        "https://example.com",
+        "https://www.python.org",
+        "https://invalid-url.test"
+    ]
+
+    # Test with Playwright backend
+    print("\n--- Testing Playwright Backend ---")
+    try:
+        scraper_playwright_chromium = ChromiumLoader(urls=urls_to_scrape, backend="playwright", headless=True, browser_name="chromium")
+        await test_scraper_with_analysis(scraper_playwright_chromium, urls_to_scrape)
+
+        scraper_playwright_firefox = ChromiumLoader(urls=urls_to_scrape, backend="playwright", headless=True, browser_name="firefox")
+        await test_scraper_with_analysis(scraper_playwright_firefox, urls_to_scrape)
+    except ImportError as ie:
+        print(f"āŒ Playwright ImportError: {ie}")
+    except Exception as e:
+        print(f"āŒ Error initializing Playwright ChromiumLoader: {e}")
+
+    # Test with Selenium backend
+    print("\n--- Testing Selenium Backend ---")
+    try:
+        scraper_selenium_chromium = ChromiumLoader(urls=urls_to_scrape, backend="selenium", headless=True, browser_name="chromium")
+        await test_scraper_with_analysis(scraper_selenium_chromium, urls_to_scrape)
+
+        scraper_selenium_firefox = ChromiumLoader(urls=urls_to_scrape, backend="selenium", headless=True, browser_name="firefox")
+        await test_scraper_with_analysis(scraper_selenium_firefox, urls_to_scrape)
+    except ImportError as ie:
+        print(f"āŒ Selenium ImportError: {ie}")
+    except Exception as e:
+        print(f"āŒ Error initializing Selenium ChromiumLoader: {e}")
+
+if __name__ == "__main__":
+    try:
+        asyncio.run(main())
+    except KeyboardInterrupt:
+        print("āŒ Program interrupted by user.")
+    except Exception as e:
+        print(f"āŒ Program crashed: {e}")
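A note on the example above: it awaits each URL in turn. Purely as an illustration (not part of this PR), the same `scrape()` dispatcher introduced in this diff can also drive the URLs concurrently:

```python
# Hypothetical variation, not part of the PR: scrape several URLs concurrently
# with asyncio.gather instead of the example's sequential loop.
import asyncio

from scrapegraphai.docloaders.chromium import ChromiumLoader

async def scrape_concurrently(urls: list) -> list:
    loader = ChromiumLoader(urls=urls, backend="playwright", headless=True, browser_name="chromium")
    # scrape() routes to the Playwright or Selenium path based on `backend`
    return await asyncio.gather(*(loader.scrape(url) for url in urls))

results = asyncio.run(scrape_concurrently(["https://example.com", "https://www.python.org"]))
```

Error handling is left to the caller here; `asyncio.gather(..., return_exceptions=True)` would mirror the example's per-URL try/except.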
diff --git a/pyproject.toml b/pyproject.toml
index 101bf26b..7b795ee8 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,12 +1,6 @@
 [project]
 name = "scrapegraphai"
-
-
-
-version = "1.33.11"
-
-
-
+version = "1.34.0b14"
 description = "A web scraping library based on LangChain which uses LLM and direct graph logic to create scraping pipelines."
 authors = [
@@ -48,7 +42,6 @@ dependencies = [
   "scrapegraph-py>=1.7.0"
 ]
 
-license = "MIT"
 readme = "README.md"
 homepage = "https://scrapegraphai.com/"
 repository = "https://github.com/ScrapeGraphAI/Scrapegraph-ai"
@@ -115,7 +108,8 @@ screenshot_scraper = [
 ]
 
 [build-system]
-requires = ["hatchling"]
+requires = ["hatchling==1.26.3"]
+
 build-backend = "hatchling.build"
 
 [dependency-groups]
diff --git a/requirements-dev.txt b/requirements-dev.txt
index e04399e9..9174e4a4 100644
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@@ -1,4 +1,5 @@
 pytest==8.0.0
+pytest-asyncio==0.25.0
 pytest-mock==3.14.0
 burr[start]==0.22.1
 sphinx==6.0
diff --git a/scrapegraphai/docloaders/chromium.py b/scrapegraphai/docloaders/chromium.py
index 942827ac..31043730 100644
--- a/scrapegraphai/docloaders/chromium.py
+++ b/scrapegraphai/docloaders/chromium.py
@@ -4,6 +4,9 @@
 from langchain_core.documents import Document
 import aiohttp
 import async_timeout
+from selenium import webdriver
+from selenium.webdriver.chrome.options import Options as ChromeOptions
+from typing import Union
 from ..utils import Proxy, dynamic_import, get_logger, parse_or_search_proxy
 
 logger = get_logger("web-loader")
@@ -35,6 +38,7 @@ def __init__(
         load_state: str = "domcontentloaded",
         requires_js_support: bool = False,
         storage_state: Optional[str] = None,
+        browser_name: str = "chromium",  # default: chromium
         **kwargs: Any,
     ):
         """Initialize the loader with a list of URL paths.
@@ -65,6 +69,16 @@ def __init__(
         self.load_state = load_state
         self.requires_js_support = requires_js_support
         self.storage_state = storage_state
+        self.browser_name = browser_name
+
+    async def scrape(self, url: str) -> str:
+        """Dispatch to the scraper that matches the configured backend."""
+        if self.backend == "playwright":
+            return await self.ascrape_playwright(url)
+        elif self.backend == "selenium":
+            return await self.ascrape_undetected_chromedriver(url)
+        else:
+            raise ValueError(f"Unsupported backend: {self.backend}")
 
     async def ascrape_undetected_chromedriver(self, url: str) -> str:
         """
@@ -85,11 +99,35 @@ async def ascrape_undetected_chromedriver(self, url: str) -> str:
         while attempt < self.RETRY_LIMIT:
             try:
                 async with async_timeout.timeout(self.TIMEOUT):
-                    driver = uc.Chrome(headless=self.headless)
-                    driver.get(url)
-                    results = driver.page_source
-                    logger.info(f"Successfully scraped {url}")
-                    break
+                    # Handle browser selection
+                    if self.backend == "selenium":
+                        if self.browser_name == "chromium":
+                            options = ChromeOptions()
+                            options.headless = self.headless
+                            # Initialize undetected chromedriver for Selenium
+                            driver = uc.Chrome(options=options)
+                            driver.get(url)
+                            results = driver.page_source
+                            logger.info(f"Successfully scraped {url} with {self.browser_name}")
+                            break
+                        elif self.browser_name == "firefox":
+                            from selenium.webdriver.firefox.options import Options as FirefoxOptions
+                            options = FirefoxOptions()
+                            options.headless = self.headless
+                            # Initialize a regular Firefox driver (undetected-chromedriver is Chrome-only)
+                            driver = webdriver.Firefox(options=options)
+                            driver.get(url)
+                            results = driver.page_source
+                            logger.info(f"Successfully scraped {url} with {self.browser_name}")
+                            break
+                        else:
+                            logger.error(f"Unsupported browser {self.browser_name} for Selenium.")
+                            results = f"Error: Unsupported browser {self.browser_name}."
+                            break
+                    else:
+                        logger.error(f"Unsupported backend {self.backend}.")
+                        results = f"Error: Unsupported backend {self.backend}."
+                        break
                 except (aiohttp.ClientError, asyncio.TimeoutError) as e:
                     attempt += 1
                     logger.error(f"Attempt {attempt} failed: {e}")
@@ -102,22 +140,169 @@ async def ascrape_undetected_chromedriver(self, url: str) -> str:
 
         return results
 
-    async def ascrape_playwright(self, url: str) -> str:
+    async def ascrape_playwright_scroll(
+        self,
+        url: str,
+        timeout: Union[int, None] = 30,
+        scroll: int = 15000,
+        sleep: float = 2,
+        scroll_to_bottom: bool = False,
+        browser_name: str = "chromium"  # default: chromium
+    ) -> str:
+        """
+        Asynchronously scrape the content of a given URL using Playwright's async API with scrolling.
+
+        Notes:
+        - The caller decides between scrolling to the bottom of the page or scrolling for a finite
+          amount of time.
+        - If scrolling to the bottom, the scraper stops when the page height stops changing or when
+          the timeout is reached; in that case, pick a timeout that is larger than usual.
+        - Sleep must be greater than 0 so that lazy-loaded content has time to load. Combined with
+          scroll_to_bottom=True, use a higher sleep value so the scrolling actually happens and the
+          page height can change.
+        - A good site to test against is https://www.reddit.com/, since it uses infinite scrolling.
+
+        Args:
+            url (str): The URL to scrape.
+            timeout (Union[int, None]): The maximum time to spend scrolling, separate from the global
+                timeout. If set, must be greater than 0; if None, the scraper stops only when the page
+                height stops changing.
+            scroll (int): The number of pixels to scroll down by. Defaults to 15000 and must be at
+                least 5000; anything smaller does not scroll far enough to reveal new content.
+            sleep (float): The number of seconds to sleep after each scroll so the page can load.
+                Defaults to 2 and must be greater than 0.
+            scroll_to_bottom (bool): Whether to keep scrolling until the page height stops changing.
+                Defaults to False.
+            browser_name (str): The Playwright browser to launch ("chromium" or "firefox").
+                Defaults to "chromium".
+
+        Returns:
+            str: The scraped HTML content.
+
+        Raises:
+            ValueError: If timeout is set but not greater than 0, if sleep is not greater than 0,
+                or if scroll is less than 5000.
+        """
+        # NB: using scrollHeight as a stop condition was tested, but it is not always reliable.
+        # On some sites, e.g. https://www.steelwood.amsterdam/, the page height never changes and
+        # the page does not scroll to the bottom (in a regular browser it scrolls vertically,
+        # while in Chromium it scrolls horizontally).
+
+        if timeout and timeout <= 0:
+            raise ValueError("If set, the timeout value for the scrolling scraper must be greater than 0.")
+
+        if sleep <= 0:
+            raise ValueError("The sleep value for the scrolling scraper must be greater than 0.")
+
+        if scroll < 5000:
+            raise ValueError("The scroll value for the scrolling scraper must be greater than or equal to 5000.")
+
+        from playwright.async_api import async_playwright
+        from undetected_playwright import Malenia
+        import time
+
+        logger.info(f"Starting scraping with scrolling support for {url}...")
+
+        results = ""
+        attempt = 0
+
+        while attempt < self.RETRY_LIMIT:
+            browser = None  # defined before the try so the finally-guard below is safe
+            try:
+                async with async_playwright() as p:
+                    if browser_name == "chromium":
+                        browser = await p.chromium.launch(
+                            headless=self.headless, proxy=self.proxy, **self.browser_config
+                        )
+                    elif browser_name == "firefox":
+                        browser = await p.firefox.launch(
+                            headless=self.headless, proxy=self.proxy, **self.browser_config
+                        )
+                    else:
+                        raise ValueError(f"Invalid browser name: {browser_name}")
+                    context = await browser.new_context()
+                    await Malenia.apply_stealth(context)
+                    page = await context.new_page()
+                    await page.goto(url, wait_until="domcontentloaded")
+                    await page.wait_for_load_state(self.load_state)
+
+                    previous_height = None
+                    start_time = time.time()
+
+                    # Store the height of the page after each scroll. Useful when scrolling with a
+                    # timer and wanting to stop shortly after reaching the bottom, or simply when
+                    # the page stops changing for some reason.
+                    heights = []
+
+                    while True:
+                        current_height = await page.evaluate("document.body.scrollHeight")
+                        heights.append(current_height)
+                        heights = heights[-5:]  # Keep only the last 5 heights, to bound memory use
+
+                        # Break once the bottom of the page is reached, i.e. scrolling makes no
+                        # more progress. Attention: this is not always reliable; the page might not
+                        # change due to lazy loading or other reasons. In such cases, set
+                        # scroll_to_bottom=False and use a timeout instead.
+                        if scroll_to_bottom and previous_height == current_height:
+                            logger.info(f"Reached bottom of page for url {url}")
+                            break
+
+                        previous_height = current_height
+
+                        await page.mouse.wheel(0, scroll)
+                        logger.debug(f"Scrolled {url} to current height {current_height}px...")
+                        # Allow time for lazy-loaded content to load; awaited rather than
+                        # time.sleep() so the event loop is not blocked
+                        await asyncio.sleep(sleep)
+
+                        elapsed_time = time.time() - start_time
+                        logger.debug(f"Elapsed time: {elapsed_time} seconds")
+
+                        if timeout:
+                            if elapsed_time >= timeout:
+                                logger.info(f"Reached timeout of {timeout} seconds for url {url}")
+                                break
+                        elif len(heights) == 5 and len(set(heights)) == 1:
+                            logger.info(f"Page height has not changed for url {url} for the last 5 scrolls. Stopping.")
+                            break
+
+                    results = await page.content()
+                    break
+
+            except (aiohttp.ClientError, asyncio.TimeoutError, Exception) as e:
+                attempt += 1
+                logger.error(f"Attempt {attempt} failed: {e}")
+                if attempt == self.RETRY_LIMIT:
+                    results = f"Error: Network error after {self.RETRY_LIMIT} attempts - {e}"
+            finally:
+                if browser:
+                    await browser.close()
+
+        return results
""" from playwright.async_api import async_playwright from undetected_playwright import Malenia logger.info(f"Starting scraping with {self.backend}...") + results = "" attempt = 0 while attempt < self.RETRY_LIMIT: try: async with async_playwright() as p, async_timeout.timeout(self.TIMEOUT): - browser = await p.chromium.launch( + browser = None + if browser_name == "chromium": + browser = await p.chromium.launch( headless=self.headless, proxy=self.proxy, **self.browser_config ) + elif browser_name == "firefox": + browser = await p.firefox.launch( + headless=self.headless, proxy=self.proxy, **self.browser_config + ) + else: + raise ValueError(f"Invalid browser name: {browser_name}") context = await browser.new_context( storage_state=self.storage_state ) @@ -127,20 +312,22 @@ async def ascrape_playwright(self, url: str) -> str: await page.wait_for_load_state(self.load_state) results = await page.content() logger.info("Content scraped") - return results + break except (aiohttp.ClientError, asyncio.TimeoutError, Exception) as e: attempt += 1 logger.error(f"Attempt {attempt} failed: {e}") if attempt == self.RETRY_LIMIT: - raise RuntimeError( - f"Failed to fetch {url} after {self.RETRY_LIMIT} attempts: {e}" - ) + results = f"Error: Network error after {self.RETRY_LIMIT} attempts - {e}" finally: if "browser" in locals(): await browser.close() - async def ascrape_with_js_support(self, url: str) -> str: + return results + + + + async def ascrape_with_js_support(self, url: str , browser_name:str = "chromium") -> str: """ Asynchronously scrape the content of a given URL by rendering JavaScript using Playwright. @@ -160,9 +347,17 @@ async def ascrape_with_js_support(self, url: str) -> str: while attempt < self.RETRY_LIMIT: try: async with async_playwright() as p, async_timeout.timeout(self.TIMEOUT): - browser = await p.chromium.launch( + browser = None + if browser_name == "chromium": + browser = await p.chromium.launch( headless=self.headless, proxy=self.proxy, **self.browser_config ) + elif browser_name == "firefox": + browser = await p.firefox.launch( + headless=self.headless, proxy=self.proxy, **self.browser_config + ) + else: + raise ValueError(f"Invalid browser name: {browser_name}") context = await browser.new_context( storage_state=self.storage_state ) diff --git a/scrapegraphai/graphs/abstract_graph.py b/scrapegraphai/graphs/abstract_graph.py index 1148cc29..476b4b5b 100644 --- a/scrapegraphai/graphs/abstract_graph.py +++ b/scrapegraphai/graphs/abstract_graph.py @@ -5,6 +5,7 @@ from abc import ABC, abstractmethod from typing import Optional import uuid +import asyncio import warnings from pydantic import BaseModel from langchain.chat_models import init_chat_model @@ -293,3 +294,14 @@ def run(self) -> str: """ Abstract method to execute the graph and return the result. """ + + async def run_safe_async(self) -> str: + """ + Executes the run process asynchronously safety. + + Returns: + str: The answer to the prompt. 
+ """ + + loop = asyncio.get_event_loop() + return await loop.run_in_executor(None, self.run) \ No newline at end of file diff --git a/scrapegraphai/helpers/models_tokens.py b/scrapegraphai/helpers/models_tokens.py index 705e2969..f36dfa05 100644 --- a/scrapegraphai/helpers/models_tokens.py +++ b/scrapegraphai/helpers/models_tokens.py @@ -254,4 +254,7 @@ "mixtral-moe-8x22B-instruct": 65536, "mixtral-moe-8x7B-instruct": 65536, }, + "togetherai" : { + "Meta-Llama-3.1-70B-Instruct-Turbo": 128000 + } } diff --git a/tests/graphs/abstract_graph_test.py b/tests/graphs/abstract_graph_test.py index 642868fb..27f5b660 100644 --- a/tests/graphs/abstract_graph_test.py +++ b/tests/graphs/abstract_graph_test.py @@ -96,4 +96,12 @@ def test_create_llm_unknown_provider(self): def test_create_llm_with_rate_limit(self, llm_config, expected_model): graph = TestGraph("Test prompt", {"llm": llm_config}) - assert isinstance(graph.llm_model, expected_model) \ No newline at end of file + assert isinstance(graph.llm_model, expected_model) + + @pytest.mark.asyncio + async def test_run_safe_async(self): + graph = TestGraph("Test prompt", {"llm": {"model": "openai/gpt-3.5-turbo", "openai_api_key": "sk-randomtest001"}}) + with patch.object(graph, 'run', return_value="Async result") as mock_run: + result = await graph.run_safe_async() + assert result == "Async result" + mock_run.assert_called_once() \ No newline at end of file diff --git a/uv.lock b/uv.lock index 3840481c..8e6093b4 100644 --- a/uv.lock +++ b/uv.lock @@ -1,21 +1,21 @@ version = 1 requires-python = ">=3.10, <4.0" resolution-markers = [ - "python_full_version < '3.11' and platform_system == 'Darwin'", - "python_full_version < '3.11' and platform_machine == 'aarch64' and platform_system == 'Linux'", - "(python_full_version < '3.11' and platform_machine != 'aarch64' and platform_system != 'Darwin') or (python_full_version < '3.11' and platform_system != 'Darwin' and platform_system != 'Linux')", - "python_full_version == '3.11.*' and platform_system == 'Darwin'", - "python_full_version == '3.11.*' and platform_machine == 'aarch64' and platform_system == 'Linux'", - "(python_full_version == '3.11.*' and platform_machine != 'aarch64' and platform_system != 'Darwin') or (python_full_version == '3.11.*' and platform_system != 'Darwin' and platform_system != 'Linux')", - "python_full_version >= '3.12' and python_full_version < '3.12.4' and platform_system == 'Darwin'", - "python_full_version >= '3.12' and python_full_version < '3.12.4' and platform_machine == 'aarch64' and platform_system == 'Linux'", - "(python_full_version >= '3.12' and python_full_version < '3.12.4' and platform_machine != 'aarch64' and platform_system != 'Darwin') or (python_full_version >= '3.12' and python_full_version < '3.12.4' and platform_system != 'Darwin' and platform_system != 'Linux')", - "python_full_version >= '3.12.4' and python_full_version < '3.13' and platform_system == 'Darwin'", - "python_full_version >= '3.13' and platform_system == 'Darwin'", - "python_full_version >= '3.12.4' and python_full_version < '3.13' and platform_machine == 'aarch64' and platform_system == 'Linux'", - "python_full_version >= '3.13' and platform_machine == 'aarch64' and platform_system == 'Linux'", - "(python_full_version >= '3.12.4' and python_full_version < '3.13' and platform_machine != 'aarch64' and platform_system != 'Darwin') or (python_full_version >= '3.12.4' and python_full_version < '3.13' and platform_system != 'Darwin' and platform_system != 'Linux')", - "(python_full_version >= '3.13' 
diff --git a/scrapegraphai/helpers/models_tokens.py b/scrapegraphai/helpers/models_tokens.py
index 705e2969..f36dfa05 100644
--- a/scrapegraphai/helpers/models_tokens.py
+++ b/scrapegraphai/helpers/models_tokens.py
@@ -254,4 +254,7 @@
         "mixtral-moe-8x22B-instruct": 65536,
         "mixtral-moe-8x7B-instruct": 65536,
     },
+    "togetherai": {
+        "Meta-Llama-3.1-70B-Instruct-Turbo": 128000
+    }
 }
diff --git a/tests/graphs/abstract_graph_test.py b/tests/graphs/abstract_graph_test.py
index 642868fb..27f5b660 100644
--- a/tests/graphs/abstract_graph_test.py
+++ b/tests/graphs/abstract_graph_test.py
@@ -96,4 +96,12 @@ def test_create_llm_unknown_provider(self):
     def test_create_llm_with_rate_limit(self, llm_config, expected_model):
         graph = TestGraph("Test prompt", {"llm": llm_config})
-        assert isinstance(graph.llm_model, expected_model)
\ No newline at end of file
+        assert isinstance(graph.llm_model, expected_model)
+
+    @pytest.mark.asyncio
+    async def test_run_safe_async(self):
+        graph = TestGraph("Test prompt", {"llm": {"model": "openai/gpt-3.5-turbo", "openai_api_key": "sk-randomtest001"}})
+        with patch.object(graph, 'run', return_value="Async result") as mock_run:
+            result = await graph.run_safe_async()
+            assert result == "Async result"
+            mock_run.assert_called_once()
\ No newline at end of file
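This new test is the reason `pytest-asyncio` is pinned in `requirements-dev.txt`; in the plugin's default strict mode, the `@pytest.mark.asyncio` marker is required on each coroutine test. A self-contained sketch of the executor-offload pattern the test exercises:

```python
# Self-contained sketch of the pattern test_run_safe_async exercises:
# a synchronous callable is offloaded to the default executor and awaited.
import asyncio

import pytest

@pytest.mark.asyncio  # required under pytest-asyncio's default "strict" mode
async def test_executor_offload():
    loop = asyncio.get_event_loop()
    result = await loop.run_in_executor(None, lambda: "sync result")
    assert result == "sync result"
```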
diff --git a/uv.lock b/uv.lock
index 3840481c..8e6093b4 100644
--- a/uv.lock
+++ b/uv.lock
@@ -1,21 +1,21 @@
 version = 1
 requires-python = ">=3.10, <4.0"
 resolution-markers = [
-    "python_full_version < '3.11' and platform_system == 'Darwin'",
-    "python_full_version < '3.11' and platform_machine == 'aarch64' and platform_system == 'Linux'",
-    "(python_full_version < '3.11' and platform_machine != 'aarch64' and platform_system != 'Darwin') or (python_full_version < '3.11' and platform_system != 'Darwin' and platform_system != 'Linux')",
-    "python_full_version == '3.11.*' and platform_system == 'Darwin'",
-    "python_full_version == '3.11.*' and platform_machine == 'aarch64' and platform_system == 'Linux'",
-    "(python_full_version == '3.11.*' and platform_machine != 'aarch64' and platform_system != 'Darwin') or (python_full_version == '3.11.*' and platform_system != 'Darwin' and platform_system != 'Linux')",
-    "python_full_version >= '3.12' and python_full_version < '3.12.4' and platform_system == 'Darwin'",
-    "python_full_version >= '3.12' and python_full_version < '3.12.4' and platform_machine == 'aarch64' and platform_system == 'Linux'",
-    "(python_full_version >= '3.12' and python_full_version < '3.12.4' and platform_machine != 'aarch64' and platform_system != 'Darwin') or (python_full_version >= '3.12' and python_full_version < '3.12.4' and platform_system != 'Darwin' and platform_system != 'Linux')",
-    "python_full_version >= '3.12.4' and python_full_version < '3.13' and platform_system == 'Darwin'",
-    "python_full_version >= '3.13' and platform_system == 'Darwin'",
-    "python_full_version >= '3.12.4' and python_full_version < '3.13' and platform_machine == 'aarch64' and platform_system == 'Linux'",
-    "python_full_version >= '3.13' and platform_machine == 'aarch64' and platform_system == 'Linux'",
-    "(python_full_version >= '3.12.4' and python_full_version < '3.13' and platform_machine != 'aarch64' and platform_system != 'Darwin') or (python_full_version >= '3.12.4' and python_full_version < '3.13' and platform_system != 'Darwin' and platform_system != 'Linux')",
-    "(python_full_version >= '3.13' and platform_machine != 'aarch64' and platform_system != 'Darwin') or (python_full_version >= '3.13' and platform_system != 'Darwin' and platform_system != 'Linux')",
+    "python_full_version < '3.11' and sys_platform == 'darwin'",
+    "python_full_version < '3.11' and platform_machine == 'aarch64' and sys_platform == 'linux'",
+    "(python_full_version < '3.11' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version < '3.11' and sys_platform != 'darwin' and sys_platform != 'linux')",
+    "python_full_version == '3.11.*' and sys_platform == 'darwin'",
+    "python_full_version == '3.11.*' and platform_machine == 'aarch64' and sys_platform == 'linux'",
+    "(python_full_version == '3.11.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.11.*' and sys_platform != 'darwin' and sys_platform != 'linux')",
+    "python_full_version >= '3.12' and python_full_version < '3.12.4' and sys_platform == 'darwin'",
+    "python_full_version >= '3.12' and python_full_version < '3.12.4' and platform_machine == 'aarch64' and sys_platform == 'linux'",
+    "(python_full_version >= '3.12' and python_full_version < '3.12.4' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version >= '3.12' and python_full_version < '3.12.4' and sys_platform != 'darwin' and sys_platform != 'linux')",
+    "python_full_version >= '3.12.4' and python_full_version < '3.13' and sys_platform == 'darwin'",
+    "python_full_version >= '3.13' and sys_platform == 'darwin'",
+    "python_full_version >= '3.12.4' and python_full_version < '3.13' and platform_machine == 'aarch64' and sys_platform == 'linux'",
+    "python_full_version >= '3.13' and platform_machine == 'aarch64' and sys_platform == 'linux'",
+    "(python_full_version >= '3.12.4' and python_full_version < '3.13' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version >= '3.12.4' and python_full_version < '3.13' and sys_platform != 'darwin' and sys_platform != 'linux')",
+    "(python_full_version >= '3.13' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version >= '3.13' and sys_platform != 'darwin' and sys_platform != 'linux')",
 ]
 
 [[package]]
@@ -222,12 +222,12 @@ name = "async-timeout"
 version = "4.0.3"
 source = { registry = "https://pypi.org/simple" }
 resolution-markers = [
-    "python_full_version < '3.11' and platform_system == 'Darwin'",
-    "python_full_version < '3.11' and platform_machine == 'aarch64' and platform_system == 'Linux'",
-    "(python_full_version < '3.11' and platform_machine != 'aarch64' and platform_system != 'Darwin') or (python_full_version < '3.11' and platform_system != 'Darwin' and platform_system != 'Linux')",
-    "python_full_version == '3.11.*' and platform_system == 'Darwin'",
-    "python_full_version == '3.11.*' and platform_machine == 'aarch64' and platform_system == 'Linux'",
-    "(python_full_version == '3.11.*' and platform_machine != 'aarch64' and platform_system != 'Darwin') or (python_full_version == '3.11.*' and platform_system != 'Darwin' and platform_system != 'Linux')",
+    "python_full_version < '3.11' and sys_platform == 'darwin'",
+    "python_full_version < '3.11' and platform_machine == 'aarch64' and sys_platform == 'linux'",
+    "(python_full_version < '3.11' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version < '3.11' and sys_platform != 'darwin' and sys_platform != 'linux')",
+    "python_full_version == '3.11.*' and sys_platform == 'darwin'",
+    "python_full_version == '3.11.*' and platform_machine == 'aarch64' and sys_platform == 'linux'",
+    "(python_full_version == '3.11.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.11.*' and sys_platform != 'darwin' and sys_platform != 'linux')",
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/87/d6/21b30a550dafea84b1b8eee21b5e23fa16d010ae006011221f33dcd8d7f8/async-timeout-4.0.3.tar.gz", hash = "sha256:4640d96be84d82d02ed59ea2b7105a0f7b33abe8703703cd0ab0bf87c427522f", size = 8345 }
 wheels = [
@@ -239,15 +239,15 @@ name = "async-timeout"
 version = "5.0.1"
 source = { registry = "https://pypi.org/simple" }
 resolution-markers = [
-    "python_full_version >= '3.12' and python_full_version < '3.12.4' and platform_system == 'Darwin'",
-    "python_full_version >= '3.12' and python_full_version < '3.12.4' and platform_machine == 'aarch64' and platform_system == 'Linux'",
-    "(python_full_version >= '3.12' and python_full_version < '3.12.4' and platform_machine != 'aarch64' and platform_system != 'Darwin') or (python_full_version >= '3.12' and python_full_version < '3.12.4' and platform_system != 'Darwin' and platform_system != 'Linux')",
-    "python_full_version >= '3.12.4' and python_full_version < '3.13' and platform_system == 'Darwin'",
-    "python_full_version >= '3.13' and platform_system == 'Darwin'",
-    "python_full_version >= '3.12.4' and python_full_version < '3.13' and platform_machine == 'aarch64' and platform_system == 'Linux'",
-    "python_full_version >= '3.13' and platform_machine == 'aarch64' and platform_system == 'Linux'",
-    "(python_full_version >= '3.12.4' and python_full_version < '3.13' and platform_machine != 'aarch64' and platform_system != 'Darwin') or (python_full_version >= '3.12.4' and python_full_version < '3.13' and platform_system != 'Darwin' and platform_system != 'Linux')",
-    "(python_full_version >= '3.13' and platform_machine != 'aarch64' and platform_system != 'Darwin') or (python_full_version >= '3.13' and platform_system != 'Darwin' and platform_system != 'Linux')",
+    "python_full_version >= '3.12' and python_full_version < '3.12.4' and sys_platform == 'darwin'",
+    "python_full_version >= '3.12' and python_full_version < '3.12.4' and platform_machine == 'aarch64' and sys_platform == 'linux'",
+    "(python_full_version >= '3.12' and python_full_version < '3.12.4' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version >= '3.12' and python_full_version < '3.12.4' and sys_platform != 'darwin' and sys_platform != 'linux')",
+    "python_full_version >= '3.12.4' and python_full_version < '3.13' and sys_platform == 'darwin'",
+    "python_full_version >= '3.13' and sys_platform == 'darwin'",
+    "python_full_version >= '3.12.4' and python_full_version < '3.13' and platform_machine == 'aarch64' and sys_platform == 'linux'",
+    "python_full_version >= '3.13' and platform_machine == 'aarch64' and sys_platform == 'linux'",
+    "(python_full_version >= '3.12.4' and python_full_version < '3.13' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version >= '3.12.4' and python_full_version < '3.13' and sys_platform != 'darwin' and sys_platform != 'linux')",
+    "(python_full_version >= '3.13' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version >= '3.13' and sys_platform != 'darwin' and sys_platform != 'linux')",
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/a5/ae/136395dfbfe00dfc94da3f3e136d0b13f394cba8f4841120e34226265780/async_timeout-5.0.1.tar.gz", hash = "sha256:d9321a7a3d5a6a5e187e824d2fa0793ce379a202935782d555d6e9d2735677d3", size = 9274 }
 wheels = [
@@ -457,7 +457,7 @@ name = "click"
 version = "8.1.7"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "colorama", marker = "platform_system == 'Windows'" },
+    { name = "colorama", marker = "sys_platform == 'win32'" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/96/d3/f04c7bfcf5c1862a2a5b845c6b2b360488cf47af55dfa79c98f6a6bf98b5/click-8.1.7.tar.gz", hash = "sha256:ca9853ad459e787e2192211578cc907e7594e294c7ccc834310722b41b9ca6de", size = 336121 }
 wheels = [
@@ -2461,7 +2461,7 @@ version = "2.10.2"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "pygments" },
-    { name = "pywin32", marker = "platform_system == 'Windows'" },
+    { name = "pywin32", marker = "sys_platform == 'win32'" },
     { name = "tqdm" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/3a/93/80ac75c20ce54c785648b4ed363c88f148bf22637e10c9863db4fbe73e74/mpire-2.10.2.tar.gz", hash = "sha256:f66a321e93fadff34585a4bfa05e95bd946cf714b442f51c529038eb45773d97", size = 271270 }
@@ -2675,7 +2675,7 @@ name = "nvidia-cudnn-cu12"
 version = "9.1.0.70"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "nvidia-cublas-cu12", marker = "(platform_machine != 'aarch64' and platform_system != 'Darwin') or (platform_system != 'Darwin' and platform_system != 'Linux')" },
+    { name = "nvidia-cublas-cu12", marker = "(platform_machine != 'aarch64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux')" },
 ]
 wheels = [
     { url = "https://files.pythonhosted.org/packages/9f/fd/713452cd72343f682b1c7b9321e23829f00b842ceaedcda96e742ea0b0b3/nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl", hash = "sha256:165764f44ef8c61fcdfdfdbe769d687e06374059fbb388b6c89ecb0e28793a6f", size = 664752741 },
@@ -2686,7 +2686,7 @@ name = "nvidia-cufft-cu12"
 version = "11.2.1.3"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "nvidia-nvjitlink-cu12", marker = "(platform_machine != 'aarch64' and platform_system != 'Darwin') or (platform_system != 'Darwin' and platform_system != 'Linux')" },
+    { name = "nvidia-nvjitlink-cu12", marker = "(platform_machine != 'aarch64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux')" },
 ]
 wheels = [
     { url = "https://files.pythonhosted.org/packages/7a/8a/0e728f749baca3fbeffad762738276e5df60851958be7783af121a7221e7/nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_aarch64.whl", hash = "sha256:5dad8008fc7f92f5ddfa2101430917ce2ffacd86824914c82e28990ad7f00399", size = 211422548 },
@@ -2707,9 +2707,9 @@ name = "nvidia-cusolver-cu12"
 version = "11.6.1.9"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "nvidia-cublas-cu12", marker = "(platform_machine != 'aarch64' and platform_system != 'Darwin') or (platform_system != 'Darwin' and platform_system != 'Linux')" },
-    { name = "nvidia-cusparse-cu12", marker = "(platform_machine != 'aarch64' and platform_system != 'Darwin') or (platform_system != 'Darwin' and platform_system != 'Linux')" },
-    { name = "nvidia-nvjitlink-cu12", marker = "(platform_machine != 'aarch64' and platform_system != 'Darwin') or (platform_system != 'Darwin' and platform_system != 'Linux')" },
+    { name = "nvidia-cublas-cu12", marker = "(platform_machine != 'aarch64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux')" },
+    { name = "nvidia-cusparse-cu12", marker = "(platform_machine != 'aarch64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux')" },
+    { name = "nvidia-nvjitlink-cu12", marker = "(platform_machine != 'aarch64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux')" },
 ]
 wheels = [
     { url = "https://files.pythonhosted.org/packages/46/6b/a5c33cf16af09166845345275c34ad2190944bcc6026797a39f8e0a282e0/nvidia_cusolver_cu12-11.6.1.9-py3-none-manylinux2014_aarch64.whl", hash = "sha256:d338f155f174f90724bbde3758b7ac375a70ce8e706d70b018dd3375545fc84e", size = 127634111 },
@@ -2721,7 +2721,7 @@ name = "nvidia-cusparse-cu12"
 version = "12.3.1.170"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "nvidia-nvjitlink-cu12", marker = "(platform_machine != 'aarch64' and platform_system != 'Darwin') or (platform_system != 'Darwin' and platform_system != 'Linux')" },
+    { name = "nvidia-nvjitlink-cu12", marker = "(platform_machine != 'aarch64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux')" },
 ]
 wheels = [
     { url = "https://files.pythonhosted.org/packages/96/a9/c0d2f83a53d40a4a41be14cea6a0bf9e668ffcf8b004bd65633f433050c0/nvidia_cusparse_cu12-12.3.1.170-py3-none-manylinux2014_aarch64.whl", hash = "sha256:9d32f62896231ebe0480efd8a7f702e143c98cfaa0e8a76df3386c1ba2b54df3", size = 207381987 },
@@ -3132,7 +3132,7 @@ name = "portalocker"
 version = "2.10.1"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "pywin32", marker = "platform_system == 'Windows'" },
+    { name = "pywin32", marker = "sys_platform == 'win32'" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/ed/d3/c6c64067759e87af98cc668c1cc75171347d0f1577fab7ca3749134e3cd4/portalocker-2.10.1.tar.gz", hash = "sha256:ef1bf844e878ab08aee7e40184156e1151f228f103aa5c6bd0724cc330960f8f", size = 40891 }
 wheels = [
@@ -4081,7 +4081,8 @@ wheels = [
 [[package]]
 name = "scrapegraphai"
-version = "1.33.0"
+version = "1.34.0b11"
+
 source = { editable = "." }
 dependencies = [
     { name = "async-timeout", version = "4.0.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.12'" },
@@ -4145,10 +4146,13 @@ screenshot-scraper = [
 
 [package.dev-dependencies]
 dev = [
+    { name = "burr", extra = ["start"] },
+    { name = "furo" },
     { name = "poethepoet" },
     { name = "pylint" },
     { name = "pytest" },
     { name = "pytest-mock" },
+    { name = "sphinx" },
 ]
 
 [package.metadata]
@@ -4199,10 +4203,13 @@ requires-dist = [
 
 [package.metadata.requires-dev]
 dev = [
+    { name = "burr", extras = ["start"], specifier = "==0.22.1" },
+    { name = "furo", specifier = "==2024.5.6" },
     { name = "poethepoet", specifier = ">=0.31.1" },
     { name = "pylint", specifier = ">=3.2.5" },
     { name = "pytest", specifier = "==8.0.0" },
     { name = "pytest-mock", specifier = "==3.14.0" },
+    { name = "sphinx", specifier = "==6.0" },
 ]
 
 [[package]]
@@ -4560,7 +4567,7 @@ dependencies = [
     { name = "toml" },
     { name = "tornado" },
     { name = "typing-extensions" },
-    { name = "watchdog", marker = "platform_system != 'Darwin'" },
+    { name = "watchdog", marker = "sys_platform != 'darwin'" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/b0/e5/2bf2daa9c98658f1474bb64e7de030cbc4182b5f2b2196536efedaef02cb/streamlit-1.40.2.tar.gz", hash = "sha256:0cc131fc9b18065feaff8f6f241c81164ad37d8d9e3a85499a0240aaaf6a6a61", size = 8265763 }
 wheels = [
@@ -4764,21 +4771,21 @@ dependencies = [
     { name = "fsspec" },
     { name = "jinja2" },
     { name = "networkx" },
-    { name = "nvidia-cublas-cu12", marker = "platform_machine == 'x86_64' and platform_system == 'Linux'" },
-    { name = "nvidia-cuda-cupti-cu12", marker = "platform_machine == 'x86_64' and platform_system == 'Linux'" },
-    { name = "nvidia-cuda-nvrtc-cu12", marker = "platform_machine == 'x86_64' and platform_system == 'Linux'" },
-    { name = "nvidia-cuda-runtime-cu12", marker = "platform_machine == 'x86_64' and platform_system == 'Linux'" },
-    { name = "nvidia-cudnn-cu12", marker = "platform_machine == 'x86_64' and platform_system == 'Linux'" },
-    { name = "nvidia-cufft-cu12", marker = "platform_machine == 'x86_64' and platform_system == 'Linux'" },
-    { name = "nvidia-curand-cu12", marker = "platform_machine == 'x86_64' and platform_system == 'Linux'" },
-    { name = "nvidia-cusolver-cu12", marker = "platform_machine == 'x86_64' and platform_system == 'Linux'" },
-    { name = "nvidia-cusparse-cu12", marker = "platform_machine == 'x86_64' and platform_system == 'Linux'" },
-    { name = "nvidia-nccl-cu12", marker = "platform_machine == 'x86_64' and platform_system == 'Linux'" },
-    { name = "nvidia-nvjitlink-cu12", marker = "platform_machine == 'x86_64' and platform_system == 'Linux'" },
-    { name = "nvidia-nvtx-cu12", marker = "platform_machine == 'x86_64' and platform_system == 'Linux'" },
+    { name = "nvidia-cublas-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
+    { name = "nvidia-cuda-cupti-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
+    { name = "nvidia-cuda-nvrtc-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
+    { name = "nvidia-cuda-runtime-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
+    { name = "nvidia-cudnn-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
+    { name = "nvidia-cufft-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
+    { name = "nvidia-curand-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
+    { name = "nvidia-cusolver-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
+    { name = "nvidia-cusparse-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
+    { name = "nvidia-nccl-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
+    { name = "nvidia-nvjitlink-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
+    { name = "nvidia-nvtx-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
     { name = "setuptools", marker = "python_full_version >= '3.12'" },
     { name = "sympy" },
-    { name = "triton", marker = "python_full_version < '3.13' and platform_machine == 'x86_64' and platform_system == 'Linux'" },
+    { name = "triton", marker = "python_full_version < '3.13' and platform_machine == 'x86_64' and sys_platform == 'linux'" },
     { name = "typing-extensions" },
 ]
 wheels = [
@@ -4820,7 +4827,7 @@ name = "tqdm"
 version = "4.67.1"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "colorama", marker = "platform_system == 'Windows'" },
+    { name = "colorama", marker = "sys_platform == 'win32'" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/a8/4b/29b4ef32e036bb34e4ab51796dd745cdba7ed47ad142a9f4a1eb8e0c744d/tqdm-4.67.1.tar.gz", hash = "sha256:f8aef9c52c08c13a65f30ea34f4e5aac3fd1a34959879d7e59e63027286627f2", size = 169737 }
 wheels = [
@@ -4862,7 +4869,7 @@ name = "triton"
 version = "3.1.0"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "filelock", marker = "(python_full_version < '3.13' and platform_machine != 'aarch64' and platform_system != 'Darwin') or (python_full_version < '3.13' and platform_system != 'Darwin' and platform_system != 'Linux')" },
+    { name = "filelock", marker = "(python_full_version < '3.13' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version < '3.13' and sys_platform != 'darwin' and sys_platform != 'linux')" },
 ]
 wheels = [
     { url = "https://files.pythonhosted.org/packages/98/29/69aa56dc0b2eb2602b553881e34243475ea2afd9699be042316842788ff5/triton-3.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6b0dd10a925263abbe9fa37dcde67a5e9b2383fc269fdf59f5657cac38c5d1d8", size = 209460013 },