From 1c8b910562112947a357277bca9dc81619b72e61 Mon Sep 17 00:00:00 2001
From: Alin Cristian Preda
Date: Tue, 3 Dec 2024 21:10:42 +0100
Subject: [PATCH 01/49] feat: added scrolling method to chromium docloader

---
 scrapegraphai/docloaders/chromium.py | 141 ++++++++++++++++++++++++++-
 1 file changed, 137 insertions(+), 4 deletions(-)

diff --git a/scrapegraphai/docloaders/chromium.py b/scrapegraphai/docloaders/chromium.py
index 3cc49e7f..db547cec 100644
--- a/scrapegraphai/docloaders/chromium.py
+++ b/scrapegraphai/docloaders/chromium.py
@@ -4,9 +4,11 @@
 from langchain_core.documents import Document
 import aiohttp
 import async_timeout
+from typing import Union
 from ..utils import Proxy, dynamic_import, get_logger, parse_or_search_proxy

 logger = get_logger("web-loader")
+logger.setLevel("INFO")

 class ChromiumLoader(BaseLoader):
     """Scrapes HTML pages from URLs using a (headless) instance of the
@@ -97,14 +99,144 @@ async def ascrape_undetected_chromedriver(self, url: str) -> str:

         return results

+    async def ascrape_playwright_scroll(
+        self,
+        url: str,
+        timeout: Union[int, None]=30,
+        scroll: int=15000,
+        sleep: float=2,
+        scroll_to_bottom: bool=False
+    ) -> str:
+        """
+        Asynchronously scrape the content of a given URL using Playwright's async API and scrolling.
+
+        Notes:
+        - The user can choose between scrolling to the bottom of the page or scrolling for a finite amount of time.
+        - If the user chooses to scroll to the bottom, the scraper will stop when the page height stops changing or when
+        the timeout is reached. In this case, the user should opt for an appropriate timeout value, i.e. larger than usual.
+        - Sleep needs to be set to a value greater than 0 to allow lazy-loaded content to load.
+        Additionally, if used with scroll_to_bottom=True, the sleep value should be set to a higher value, to
+        make sure that the scrolling actually happens, thereby allowing the page height to change.
+        - Probably the best website to test this is https://www.reddit.com/ as it has infinite scrolling.
+
+        Args:
+        - url (str): The URL to scrape.
+        - timeout (Union[int, None]): The maximum time to spend scrolling. This is separate from the global timeout. If set, must be greater than 0.
+        Can also be set to None, in which case the scraper will only stop when the page height stops changing.
+        - scroll (int): The number of pixels to scroll down by. Defaults to 15000. Cannot be less than 5000 pixels.
+        Less than this and we don't scroll enough to see any content change.
+        - sleep (float): The number of seconds to sleep after each scroll, to allow the page to load.
+        Defaults to 2. Must be greater than 0.
+
+        Returns:
+        str: The scraped HTML content
+
+        Raises:
+        - ValueError: If the timeout value is less than or equal to 0.
+        - ValueError: If the sleep value is less than or equal to 0.
+        - ValueError: If the scroll value is less than 5000.
+        """
+        # NB: I have tested using scrollHeight to determine when to stop scrolling
+        # but it doesn't always work as expected. The page height doesn't change on some sites like
+        # https://www.steelwood.amsterdam/. The site does not scroll to the bottom.
+        # In my browser I can scroll vertically, but in Chromium it scrolls horizontally.
+
+        if timeout and timeout <= 0:
+            raise ValueError("If set, timeout value for scrolling scraper must be greater than 0.")
+
+        if sleep <= 0:
+            raise ValueError("Sleep for scrolling scraper value must be greater than 0.")
+
+        if scroll < 5000:
+            raise ValueError("Scroll value for scrolling scraper must be greater than or equal to 5000.")
+
+        from playwright.async_api import async_playwright
+        from undetected_playwright import Malenia
+        import time
+
+        logger.info(f"Starting scraping with scrolling support for {url}...")
+
+        results = ""
+        attempt = 0
+
+        while attempt < self.RETRY_LIMIT:
+            try:
+                async with async_playwright() as p:
+                    browser = await p.chromium.launch(
+                        headless=self.headless, proxy=self.proxy, **self.browser_config
+                    )
+                    context = await browser.new_context()
+                    await Malenia.apply_stealth(context)
+                    page = await context.new_page()
+                    await page.goto(url, wait_until="domcontentloaded")
+                    await page.wait_for_load_state(self.load_state)
+
+                    previous_height = None
+                    start_time = time.time()
+
+                    # Store the heights of the page after each scroll
+                    # This is useful in case we scroll with a timer and want to stop shortly after reaching the bottom
+                    # or simply when the page stops changing for some reason.
+                    heights = []
+
+                    while True:
+                        current_height = await page.evaluate("document.body.scrollHeight")
+                        heights.append(current_height)
+                        heights = heights[-5:]  # Keep only the last 5 heights, to not run out of memory
+
+                        # Break if we've reached the bottom of the page i.e. if scrolling makes no more progress
+                        # Attention!!! This is not always reliable. Sometimes the page might not change due to lazy loading
+                        # or other reasons. In such cases, the user should set scroll_to_bottom=False and set a timeout.
+                        if scroll_to_bottom and previous_height == current_height:
+                            logger.info(f"Reached bottom of page for url {url}")
+                            break
+
+                        previous_height = current_height
+
+                        await page.mouse.wheel(0, scroll)
+                        logger.debug(f"Scrolled {url} to current height {current_height}px...")
+                        time.sleep(sleep)  # Allow some time for any lazy-loaded content to load
+
+                        current_time = time.time()
+                        elapsed_time = current_time - start_time
+                        logger.debug(f"Elapsed time: {elapsed_time} seconds")
+
+                        if timeout:
+                            if elapsed_time >= timeout:
+                                logger.info(f"Reached timeout of {timeout} seconds for url {url}")
+                                break
+                            elif len(heights) == 5 and len(set(heights)) == 1:
+                                logger.info(f"Page height has not changed for url {url} for the last 5 scrolls. Stopping.")
+                                break
+
+                    results = await page.content()
+                    break
+
+            except (aiohttp.ClientError, asyncio.TimeoutError, Exception) as e:
+                attempt += 1
+                logger.error(f"Attempt {attempt} failed: {e}")
+                if attempt == self.RETRY_LIMIT:
+                    results = f"Error: Network error after {self.RETRY_LIMIT} attempts - {e}"
+            finally:
+                await browser.close()
+
+        return results
+
     async def ascrape_playwright(self, url: str) -> str:
         """
         Asynchronously scrape the content of a given URL using Playwright's async API.
+
+        Args:
+            url (str): The URL to scrape.
+
+        Returns:
+            str: The scraped HTML content or an error message if an exception occurs.
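+
+        Example (illustrative sketch only, assuming Playwright and its browser
+        binaries are installed; the URL is an arbitrary placeholder):
+            >>> loader = ChromiumLoader(urls=["https://example.com"], backend="playwright")
+            >>> html = asyncio.run(loader.ascrape_playwright("https://example.com"))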
""" from playwright.async_api import async_playwright from undetected_playwright import Malenia logger.info(f"Starting scraping with {self.backend}...") + results = "" attempt = 0 while attempt < self.RETRY_LIMIT: @@ -120,15 +252,16 @@ async def ascrape_playwright(self, url: str) -> str: await page.wait_for_load_state(self.load_state) results = await page.content() logger.info("Content scraped") - return results + break except (aiohttp.ClientError, asyncio.TimeoutError, Exception) as e: attempt += 1 logger.error(f"Attempt {attempt} failed: {e}") if attempt == self.RETRY_LIMIT: - raise RuntimeError(f"Failed to fetch {url} after {self.RETRY_LIMIT} attempts: {e}") + results = f"Error: Network error after {self.RETRY_LIMIT} attempts - {e}" finally: - if 'browser' in locals(): - await browser.close() + await browser.close() + + return results async def ascrape_with_js_support(self, url: str) -> str: """ From 7eeca1b91ae3f00dc374a3a33e52fb00e604c6e2 Mon Sep 17 00:00:00 2001 From: Alin Cristian Preda Date: Tue, 3 Dec 2024 21:17:47 +0100 Subject: [PATCH 02/49] quickfix: removed unneccessary line --- scrapegraphai/docloaders/chromium.py | 1 - 1 file changed, 1 deletion(-) diff --git a/scrapegraphai/docloaders/chromium.py b/scrapegraphai/docloaders/chromium.py index db547cec..3bb901b7 100644 --- a/scrapegraphai/docloaders/chromium.py +++ b/scrapegraphai/docloaders/chromium.py @@ -8,7 +8,6 @@ from ..utils import Proxy, dynamic_import, get_logger, parse_or_search_proxy logger = get_logger("web-loader") -logger.setLevel("INFO") class ChromiumLoader(BaseLoader): """Scrapes HTML pages from URLs using a (headless) instance of the From 60e2fdff78e405e127ba8b10daa454d634bccf46 Mon Sep 17 00:00:00 2001 From: semantic-release-bot Date: Thu, 5 Dec 2024 21:52:39 +0000 Subject: [PATCH 03/49] ci(release): 1.33.0-beta.1 [skip ci] ## [1.33.0-beta.1](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.32.0...v1.33.0-beta.1) (2024-12-05) ### Features * add api integration ([8aa9103](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/8aa9103f02af92d9e1a780450daa7bb303afc150)) * add API integration ([ba6e931](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/ba6e931caf5f3d4a3b9c31ec4655fe7a9f0e214c)) * add sdk integration ([209b445](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/209b4456fd668d9d124fd5586b32a4be677d4bf8)) * revert search function ([faf0c01](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/faf0c0123b5e2e548cbd1917e9d1df22e1edb1c5)) ### Bug Fixes * error on fetching the code ([7285ab0](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/7285ab065bba9099ba2751c9d2f21ee13fed0d5f)) * improved links extraction for parse_node, resolves [#822](https://github.com/ScrapeGraphAI/Scrapegraph-ai/issues/822) ([7da7bfe](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/7da7bfe338a6ce53c83361a1f6cd9ea2d5bd797f)) ### chore * migrate from rye to uv ([5fe528a](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/5fe528a7e7a3e230d8f68fd83ce5ad6ede5adfef)) ### CI * **release:** 1.32.0-beta.1 [skip ci] ([b98dd39](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/b98dd39150947fb121cd726d343c9d6fb9a31d5f)) * **release:** 1.32.0-beta.2 [skip ci] ([8b17764](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/8b17764a53c4e16c7c0178925f9275282f5dba3c)) * **release:** 1.32.0-beta.3 [skip ci] ([0769fce](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/0769fce7d501692bd1135d6337b0aea4a397c8f1)), closes [#822](https://github.com/ScrapeGraphAI/Scrapegraph-ai/issues/822) * 
**release:** 1.32.0-beta.4 [skip ci] ([67c9859](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/67c9859c2078e7ec3b3ac99827deb346860f1a83)) * **release:** 1.32.0-beta.5 [skip ci] ([fbb4252](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/fbb42526320cd614684fe1092cac89cde86c27d4)) --- CHANGELOG.md | 30 ++++++++++++++++++++++++++++++ pyproject.toml | 2 +- 2 files changed, 31 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index a455f231..cefed740 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,33 @@ +## [1.33.0-beta.1](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.32.0...v1.33.0-beta.1) (2024-12-05) + + +### Features + +* add api integration ([8aa9103](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/8aa9103f02af92d9e1a780450daa7bb303afc150)) +* add API integration ([ba6e931](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/ba6e931caf5f3d4a3b9c31ec4655fe7a9f0e214c)) +* add sdk integration ([209b445](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/209b4456fd668d9d124fd5586b32a4be677d4bf8)) +* revert search function ([faf0c01](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/faf0c0123b5e2e548cbd1917e9d1df22e1edb1c5)) + + +### Bug Fixes + +* error on fetching the code ([7285ab0](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/7285ab065bba9099ba2751c9d2f21ee13fed0d5f)) +* improved links extraction for parse_node, resolves [#822](https://github.com/ScrapeGraphAI/Scrapegraph-ai/issues/822) ([7da7bfe](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/7da7bfe338a6ce53c83361a1f6cd9ea2d5bd797f)) + + +### chore + +* migrate from rye to uv ([5fe528a](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/5fe528a7e7a3e230d8f68fd83ce5ad6ede5adfef)) + + +### CI + +* **release:** 1.32.0-beta.1 [skip ci] ([b98dd39](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/b98dd39150947fb121cd726d343c9d6fb9a31d5f)) +* **release:** 1.32.0-beta.2 [skip ci] ([8b17764](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/8b17764a53c4e16c7c0178925f9275282f5dba3c)) +* **release:** 1.32.0-beta.3 [skip ci] ([0769fce](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/0769fce7d501692bd1135d6337b0aea4a397c8f1)), closes [#822](https://github.com/ScrapeGraphAI/Scrapegraph-ai/issues/822) +* **release:** 1.32.0-beta.4 [skip ci] ([67c9859](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/67c9859c2078e7ec3b3ac99827deb346860f1a83)) +* **release:** 1.32.0-beta.5 [skip ci] ([fbb4252](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/fbb42526320cd614684fe1092cac89cde86c27d4)) + ## [1.32.0-beta.5](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.32.0-beta.4...v1.32.0-beta.5) (2024-12-02) ## [1.32.0](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.31.1...v1.32.0) (2024-12-02) diff --git a/pyproject.toml b/pyproject.toml index 1cd4a7b5..529daf9b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -3,7 +3,7 @@ name = "scrapegraphai" -version = "1.32.0b5" +version = "1.33.0b1" From 09995cd56c96cfa709a68bea73113ab5debfcb97 Mon Sep 17 00:00:00 2001 From: semantic-release-bot Date: Fri, 6 Dec 2024 08:18:43 +0000 Subject: [PATCH 04/49] ci(release): 1.33.0-beta.2 [skip ci] ## [1.33.0-beta.2](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.33.0-beta.1...v1.33.0-beta.2) (2024-12-06) ### Features * added scrolling method to chromium docloader ([1c8b910](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/1c8b910562112947a357277bca9dc81619b72e61)) --- CHANGELOG.md | 7 +++++++ pyproject.toml | 2 +- 2 
files changed, 8 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index cefed740..b5cac9c1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,10 @@ +## [1.33.0-beta.2](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.33.0-beta.1...v1.33.0-beta.2) (2024-12-06) + + +### Features + +* added scrolling method to chromium docloader ([1c8b910](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/1c8b910562112947a357277bca9dc81619b72e61)) + ## [1.33.0-beta.1](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.32.0...v1.33.0-beta.1) (2024-12-05) diff --git a/pyproject.toml b/pyproject.toml index 529daf9b..f0487b2c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -3,7 +3,7 @@ name = "scrapegraphai" -version = "1.33.0b1" +version = "1.33.0b2" From 2a032d6d7cf18c435fba59764e7cb28707737f0c Mon Sep 17 00:00:00 2001 From: Marco Vinciguerra Date: Sun, 8 Dec 2024 15:57:51 +0100 Subject: [PATCH 05/49] feat: add new model token --- scrapegraphai/helpers/models_tokens.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/scrapegraphai/helpers/models_tokens.py b/scrapegraphai/helpers/models_tokens.py index 705e2969..f36dfa05 100644 --- a/scrapegraphai/helpers/models_tokens.py +++ b/scrapegraphai/helpers/models_tokens.py @@ -254,4 +254,7 @@ "mixtral-moe-8x22B-instruct": 65536, "mixtral-moe-8x7B-instruct": 65536, }, + "togetherai" : { + "Meta-Llama-3.1-70B-Instruct-Turbo": 128000 + } } From f97c45c447a3f45dd59dbeb5b70ff676cecdec3c Mon Sep 17 00:00:00 2001 From: semantic-release-bot Date: Sun, 8 Dec 2024 14:59:06 +0000 Subject: [PATCH 06/49] ci(release): 1.34.0-beta.1 [skip ci] ## [1.34.0-beta.1](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.33.2...v1.34.0-beta.1) (2024-12-08) ### Features * add new model token ([2a032d6](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/2a032d6d7cf18c435fba59764e7cb28707737f0c)) * added scrolling method to chromium docloader ([1c8b910](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/1c8b910562112947a357277bca9dc81619b72e61)) ### CI * **release:** 1.33.0-beta.1 [skip ci] ([60e2fdf](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/60e2fdff78e405e127ba8b10daa454d634bccf46)), closes [#822](https://github.com/ScrapeGraphAI/Scrapegraph-ai/issues/822) [#822](https://github.com/ScrapeGraphAI/Scrapegraph-ai/issues/822) * **release:** 1.33.0-beta.2 [skip ci] ([09995cd](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/09995cd56c96cfa709a68bea73113ab5debfcb97)) --- CHANGELOG.md | 14 ++++++++++++++ pyproject.toml | 2 +- 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 6beeb037..1b242b41 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,17 @@ +## [1.34.0-beta.1](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.33.2...v1.34.0-beta.1) (2024-12-08) + + +### Features + +* add new model token ([2a032d6](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/2a032d6d7cf18c435fba59764e7cb28707737f0c)) +* added scrolling method to chromium docloader ([1c8b910](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/1c8b910562112947a357277bca9dc81619b72e61)) + + +### CI + +* **release:** 1.33.0-beta.1 [skip ci] ([60e2fdf](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/60e2fdff78e405e127ba8b10daa454d634bccf46)), closes [#822](https://github.com/ScrapeGraphAI/Scrapegraph-ai/issues/822) [#822](https://github.com/ScrapeGraphAI/Scrapegraph-ai/issues/822) +* **release:** 1.33.0-beta.2 [skip ci] 
([09995cd](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/09995cd56c96cfa709a68bea73113ab5debfcb97)) + ## [1.33.2](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.33.1...v1.33.2) (2024-12-06) diff --git a/pyproject.toml b/pyproject.toml index 87c64441..02fca045 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -3,7 +3,7 @@ name = "scrapegraphai" -version = "1.33.2" +version = "1.34.0b1" From a4f0f5d16d7a2f218b51f198a1ad00a6edfcca7e Mon Sep 17 00:00:00 2001 From: SwapnilSonker Date: Mon, 16 Dec 2024 20:26:20 +0530 Subject: [PATCH 07/49] Add function to select backend (Selenium or other) for issue #171 --- scrapegraphai/docloaders/chromium.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/scrapegraphai/docloaders/chromium.py b/scrapegraphai/docloaders/chromium.py index 942827ac..54493887 100644 --- a/scrapegraphai/docloaders/chromium.py +++ b/scrapegraphai/docloaders/chromium.py @@ -65,6 +65,15 @@ def __init__( self.load_state = load_state self.requires_js_support = requires_js_support self.storage_state = storage_state + + async def scrape(self, url:str) -> str: + if self.backend == "playwright": + return await self.ascrape_playwright(url) + elif self.backend == "selenium": + return await self.ascrape_undetected_chromedriver(url) + else: + raise ValueError(f"Unsupported backend: {self.backend}") + async def ascrape_undetected_chromedriver(self, url: str) -> str: """ From 753737e655b89bcb7cbf15aa69215fdc58a5c3ae Mon Sep 17 00:00:00 2001 From: SwapnilSonker Date: Tue, 17 Dec 2024 08:08:10 +0530 Subject: [PATCH 08/49] added the example in example/extras #171 --- examples/extras/chromium_selenium.py | 57 ++++++++++++++++++++++++++++ 1 file changed, 57 insertions(+) create mode 100644 examples/extras/chromium_selenium.py diff --git a/examples/extras/chromium_selenium.py b/examples/extras/chromium_selenium.py new file mode 100644 index 00000000..e8ade7c0 --- /dev/null +++ b/examples/extras/chromium_selenium.py @@ -0,0 +1,57 @@ +import asyncio +from chromium import ChromiumLoader # Import the ChromiumLoader class from chromium.py +from aiohttp import ClientError + + +async def test_scraper(scraper: ChromiumLoader, urls: list): + """ + Test scraper for the given backend and URLs. + Args: + scraper (ChromiumLoader): The ChromiumLoader instance. + urls (list): A list of URLs to scrape. + """ + for url in urls: + try: + print(f"Scraping: {url} using {scraper.backend}...") + result = await scraper.scrape(url) + if "Error" in result or not result.strip(): + print(f"❌ Failed to scrape {url}: {result}") + else: + print(f"✅ Successfully scraped {url}. 
Content (first 200 chars): {result[:200]}") + except ClientError as ce: + print(f"❌ Network error while scraping {url}: {ce}") + except Exception as e: + print(f"❌ Unexpected error while scraping {url}: {e}") + + +async def main(): + urls_to_scrape = ["https://example.com", "https://www.python.org", "https://invalid-url.test"] + + # Test with Playwright backend + print("\n--- Testing Playwright Backend ---") + try: + scraper_playwright = ChromiumLoader(urls=urls_to_scrape, backend="playwright", headless=True) + await test_scraper(scraper_playwright, urls_to_scrape) + except ImportError as ie: + print(f"❌ Playwright ImportError: {ie}") + except Exception as e: + print(f"❌ Error initializing Playwright ChromiumLoader: {e}") + + # Test with Selenium backend + print("\n--- Testing Selenium Backend ---") + try: + scraper_selenium = ChromiumLoader(urls=urls_to_scrape, backend="selenium", headless=True) + await test_scraper(scraper_selenium, urls_to_scrape) + except ImportError as ie: + print(f"❌ Selenium ImportError: {ie}") + except Exception as e: + print(f"❌ Error initializing Selenium ChromiumLoader: {e}") + + +if __name__ == "__main__": + try: + asyncio.run(main()) + except KeyboardInterrupt: + print("Program interrupted by user.") + except Exception as e: + print(f"❌ Program crashed: {e}") From fe66e3d28dcab063d79c2b21bd8850adb31f2f50 Mon Sep 17 00:00:00 2001 From: SwapnilSonker Date: Tue, 17 Dec 2024 15:58:19 +0530 Subject: [PATCH 09/49] #171 used ScrapegraphAI library and created a more consistent example. --- examples/extras/chromium_selenium.py | 76 ++++++++++++++++++++++++---- 1 file changed, 66 insertions(+), 10 deletions(-) diff --git a/examples/extras/chromium_selenium.py b/examples/extras/chromium_selenium.py index e8ade7c0..fb7686a1 100644 --- a/examples/extras/chromium_selenium.py +++ b/examples/extras/chromium_selenium.py @@ -1,37 +1,94 @@ import asyncio -from chromium import ChromiumLoader # Import the ChromiumLoader class from chromium.py +import os +import json +from dotenv import load_dotenv +from chromium import ChromiumLoader # Import your ChromiumLoader class +from scrapegraphai.graphs import SmartScraperGraph +from scrapegraphai.utils import prettify_exec_info from aiohttp import ClientError +# Load environment variables for API keys +load_dotenv() -async def test_scraper(scraper: ChromiumLoader, urls: list): +# ************************************************ +# Define function to analyze content with ScrapegraphAI +# ************************************************ +async def analyze_content_with_scrapegraph(content: str): """ - Test scraper for the given backend and URLs. + Analyze scraped content using ScrapegraphAI. + + Args: + content (str): The scraped HTML or text content. + + Returns: + dict: The result from ScrapegraphAI analysis. 
+ """ + try: + # Initialize ScrapegraphAI SmartScraperGraph + smart_scraper = SmartScraperGraph( + prompt="Summarize the main content of this webpage and extract any contact information.", + source=content, # Pass the content directly + config={ + "llm": { + "api_key": os.getenv("OPENAI_API_KEY"), + "model": "openai/gpt-4o", + }, + "verbose": True + } + ) + result = smart_scraper.run() + return result + except Exception as e: + print(f"❌ ScrapegraphAI analysis failed: {e}") + return {"error": str(e)} + +# ************************************************ +# Test scraper and ScrapegraphAI pipeline +# ************************************************ +async def test_scraper_with_analysis(scraper: ChromiumLoader, urls: list): + """ + Test scraper for the given backend and URLs, then analyze content with ScrapegraphAI. + Args: scraper (ChromiumLoader): The ChromiumLoader instance. urls (list): A list of URLs to scrape. """ for url in urls: try: - print(f"Scraping: {url} using {scraper.backend}...") + print(f"\n🔎 Scraping: {url} using {scraper.backend}...") result = await scraper.scrape(url) + if "Error" in result or not result.strip(): print(f"❌ Failed to scrape {url}: {result}") else: print(f"✅ Successfully scraped {url}. Content (first 200 chars): {result[:200]}") + + # Pass scraped content to ScrapegraphAI for analysis + print("🤖 Analyzing content with ScrapegraphAI...") + analysis_result = await analyze_content_with_scrapegraph(result) + print("📝 Analysis Result:") + print(json.dumps(analysis_result, indent=4)) + except ClientError as ce: print(f"❌ Network error while scraping {url}: {ce}") except Exception as e: print(f"❌ Unexpected error while scraping {url}: {e}") - +# ************************************************ +# Main Execution +# ************************************************ async def main(): - urls_to_scrape = ["https://example.com", "https://www.python.org", "https://invalid-url.test"] + urls_to_scrape = [ + "https://example.com", + "https://www.python.org", + "https://invalid-url.test" + ] # Test with Playwright backend print("\n--- Testing Playwright Backend ---") try: scraper_playwright = ChromiumLoader(urls=urls_to_scrape, backend="playwright", headless=True) - await test_scraper(scraper_playwright, urls_to_scrape) + await test_scraper_with_analysis(scraper_playwright, urls_to_scrape) except ImportError as ie: print(f"❌ Playwright ImportError: {ie}") except Exception as e: @@ -41,17 +98,16 @@ async def main(): print("\n--- Testing Selenium Backend ---") try: scraper_selenium = ChromiumLoader(urls=urls_to_scrape, backend="selenium", headless=True) - await test_scraper(scraper_selenium, urls_to_scrape) + await test_scraper_with_analysis(scraper_selenium, urls_to_scrape) except ImportError as ie: print(f"❌ Selenium ImportError: {ie}") except Exception as e: print(f"❌ Error initializing Selenium ChromiumLoader: {e}") - if __name__ == "__main__": try: asyncio.run(main()) except KeyboardInterrupt: - print("Program interrupted by user.") + print("❌ Program interrupted by user.") except Exception as e: print(f"❌ Program crashed: {e}") From cbc75add0d92a64244c47165b9900354c2ab6221 Mon Sep 17 00:00:00 2001 From: SwapnilSonker Date: Tue, 17 Dec 2024 17:43:03 +0530 Subject: [PATCH 10/49] #171 imported ChromiumLoader from ScrapegraphAI --- examples/extras/chromium_selenium.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/extras/chromium_selenium.py b/examples/extras/chromium_selenium.py index fb7686a1..5e647bce 100644 --- 
a/examples/extras/chromium_selenium.py +++ b/examples/extras/chromium_selenium.py @@ -2,7 +2,7 @@ import os import json from dotenv import load_dotenv -from chromium import ChromiumLoader # Import your ChromiumLoader class +from scrapegraphai.docloaders.chromium import ChromiumLoader # Import your ChromiumLoader class from scrapegraphai.graphs import SmartScraperGraph from scrapegraphai.utils import prettify_exec_info from aiohttp import ClientError From caf941df25b116bece9d9142b5133d8d4e1db264 Mon Sep 17 00:00:00 2001 From: semantic-release-bot Date: Tue, 17 Dec 2024 12:33:22 +0000 Subject: [PATCH 11/49] ci(release): 1.34.0-beta.2 [skip ci] ## [1.34.0-beta.2](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.34.0-beta.1...v1.34.0-beta.2) (2024-12-17) ### Bug Fixes * context window ([ffdadae](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/ffdadaed6fe3f17da535e6eddb73851fce2f4bf2)) * formatting ([d1b2104](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/d1b2104f28d84c5129edb29a5efdaf5bf7d22bfb)) * pyproject ([76ac0a2](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/76ac0a2141d9d53af023a405e2c61849921e4f0e)) * pyproject ([3dcfcd4](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/3dcfcd492e71297031a7df1dba9dd135f1fae60e)) * pyproject ([bf6cb0a](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/bf6cb0a582004617724e11ed04ba617eb39abc0c)) * uv.lock ([0a7fc39](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/0a7fc392dea2b62122b977d62f4d85b117fc8351)) ### CI * **release:** 1.33.3 [skip ci] ([488093a](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/488093a63fcc1dc01eabdab301d752416a025139)) * **release:** 1.33.4 [skip ci] ([a789179](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/a78917997060edbd61df5279546587e4ef123ea1)) * **release:** 1.33.5 [skip ci] ([7a6164f](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/7a6164f1dc6dbb8ff0b4f7fc653f3910445f0754)) * **release:** 1.33.6 [skip ci] ([ca96c3d](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/ca96c3d4309bd2b92c87a2b0095578dda302ad92)) * **release:** 1.33.7 [skip ci] ([7a5764e](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/7a5764e3fdbfea12b04ea0686a28025a9d89cb2f)) * **release:** 1.33.8 [skip ci] ([bdd6a39](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/bdd6a392e2c18de8c3e4e47e2f91a4a366365ff2)) --- CHANGELOG.md | 22 ++++++++++++++++++++++ pyproject.toml | 2 +- 2 files changed, 23 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 6beeb037..d7eca23c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,25 @@ +## [1.34.0-beta.2](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.34.0-beta.1...v1.34.0-beta.2) (2024-12-17) + + +### Bug Fixes + +* context window ([ffdadae](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/ffdadaed6fe3f17da535e6eddb73851fce2f4bf2)) +* formatting ([d1b2104](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/d1b2104f28d84c5129edb29a5efdaf5bf7d22bfb)) +* pyproject ([76ac0a2](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/76ac0a2141d9d53af023a405e2c61849921e4f0e)) +* pyproject ([3dcfcd4](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/3dcfcd492e71297031a7df1dba9dd135f1fae60e)) +* pyproject ([bf6cb0a](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/bf6cb0a582004617724e11ed04ba617eb39abc0c)) +* uv.lock ([0a7fc39](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/0a7fc392dea2b62122b977d62f4d85b117fc8351)) + + +### CI + +* **release:** 1.33.3 [skip ci] 
([488093a](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/488093a63fcc1dc01eabdab301d752416a025139)) +* **release:** 1.33.4 [skip ci] ([a789179](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/a78917997060edbd61df5279546587e4ef123ea1)) +* **release:** 1.33.5 [skip ci] ([7a6164f](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/7a6164f1dc6dbb8ff0b4f7fc653f3910445f0754)) +* **release:** 1.33.6 [skip ci] ([ca96c3d](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/ca96c3d4309bd2b92c87a2b0095578dda302ad92)) +* **release:** 1.33.7 [skip ci] ([7a5764e](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/7a5764e3fdbfea12b04ea0686a28025a9d89cb2f)) +* **release:** 1.33.8 [skip ci] ([bdd6a39](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/bdd6a392e2c18de8c3e4e47e2f91a4a366365ff2)) + ## [1.33.2](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.33.1...v1.33.2) (2024-12-06) diff --git a/pyproject.toml b/pyproject.toml index 2fb301d0..2cedba46 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -3,7 +3,7 @@ name = "scrapegraphai" -version = "1.33.2" +version = "1.34.0b2" From 491492833977ba34ebdce573dca712f8675772fb Mon Sep 17 00:00:00 2001 From: SwapnilSonker Date: Wed, 18 Dec 2024 09:10:46 +0530 Subject: [PATCH 12/49] #772 added functionality to change browser to firefox --- examples/extras/chromium_selenium.py | 14 ++++-- scrapegraphai/docloaders/chromium.py | 75 ++++++++++++++++++++++++---- 2 files changed, 74 insertions(+), 15 deletions(-) diff --git a/examples/extras/chromium_selenium.py b/examples/extras/chromium_selenium.py index 5e647bce..fba530d4 100644 --- a/examples/extras/chromium_selenium.py +++ b/examples/extras/chromium_selenium.py @@ -87,8 +87,11 @@ async def main(): # Test with Playwright backend print("\n--- Testing Playwright Backend ---") try: - scraper_playwright = ChromiumLoader(urls=urls_to_scrape, backend="playwright", headless=True) - await test_scraper_with_analysis(scraper_playwright, urls_to_scrape) + scraper_playwright_chromium = ChromiumLoader(urls=urls_to_scrape, backend="playwright", headless=True, browser_name = "chromium") + await test_scraper_with_analysis(scraper_playwright_chromium, urls_to_scrape) + + scraper_playwright_firefox = ChromiumLoader(urls=urls_to_scrape, backend="playwright", headless=True, browser_name = "firefox") + await test_scraper_with_analysis(scraper_playwright_firefox, urls_to_scrape) except ImportError as ie: print(f"❌ Playwright ImportError: {ie}") except Exception as e: @@ -97,8 +100,11 @@ async def main(): # Test with Selenium backend print("\n--- Testing Selenium Backend ---") try: - scraper_selenium = ChromiumLoader(urls=urls_to_scrape, backend="selenium", headless=True) - await test_scraper_with_analysis(scraper_selenium, urls_to_scrape) + scraper_selenium_chromium = ChromiumLoader(urls=urls_to_scrape, backend="selenium", headless=True, browser_name = "chromium") + await test_scraper_with_analysis(scraper_selenium_chromium, urls_to_scrape) + + scraper_selenium_firefox = ChromiumLoader(urls=urls_to_scrape, backend="selenium", headless=True, browser_name = "firefox") + await test_scraper_with_analysis(scraper_selenium_firefox, urls_to_scrape) except ImportError as ie: print(f"❌ Selenium ImportError: {ie}") except Exception as e: diff --git a/scrapegraphai/docloaders/chromium.py b/scrapegraphai/docloaders/chromium.py index 94d57016..31043730 100644 --- a/scrapegraphai/docloaders/chromium.py +++ b/scrapegraphai/docloaders/chromium.py @@ -4,6 +4,8 @@ from langchain_core.documents import Document 
import aiohttp import async_timeout +from selenium import webdriver +from selenium.webdriver.chrome.options import Options as ChromeOptions from typing import Union from ..utils import Proxy, dynamic_import, get_logger, parse_or_search_proxy @@ -36,6 +38,7 @@ def __init__( load_state: str = "domcontentloaded", requires_js_support: bool = False, storage_state: Optional[str] = None, + browser_name: str = "chromium", #default chromium **kwargs: Any, ): """Initialize the loader with a list of URL paths. @@ -66,6 +69,7 @@ def __init__( self.load_state = load_state self.requires_js_support = requires_js_support self.storage_state = storage_state + self.browser_name = browser_name async def scrape(self, url:str) -> str: if self.backend == "playwright": @@ -95,11 +99,35 @@ async def ascrape_undetected_chromedriver(self, url: str) -> str: while attempt < self.RETRY_LIMIT: try: async with async_timeout.timeout(self.TIMEOUT): - driver = uc.Chrome(headless=self.headless) - driver.get(url) - results = driver.page_source - logger.info(f"Successfully scraped {url}") - break + # Handling browser selection + if self.backend == "selenium": + if self.browser_name == "chromium": + options = ChromeOptions() + options.headless = self.headless + # Initialize undetected chromedriver for Selenium + driver = uc.Chrome(options=options) + driver.get(url) + results = driver.page_source + logger.info(f"Successfully scraped {url} with {self.browser_name}") + break + elif self.browser_name == "firefox": + from selenium.webdriver.firefox.options import Options as FirefoxOptions + options = FirefoxOptions() + options.headless = self.headless + # Initialize undetected Firefox driver (if required) + driver = webdriver.Firefox(options=options) + driver.get(url) + results = driver.page_source + logger.info(f"Successfully scraped {url} with {self.browser_name}") + break + else: + logger.error(f"Unsupported browser {self.browser_name} for Selenium.") + results = f"Error: Unsupported browser {self.browser_name}." + break + else: + logger.error(f"Unsupported backend {self.backend}.") + results = f"Error: Unsupported backend {self.backend}." + break except (aiohttp.ClientError, asyncio.TimeoutError) as e: attempt += 1 logger.error(f"Attempt {attempt} failed: {e}") @@ -118,7 +146,8 @@ async def ascrape_playwright_scroll( timeout: Union[int, None]=30, scroll: int=15000, sleep: float=2, - scroll_to_bottom: bool=False + scroll_to_bottom: bool=False, + browser_name: str = "chromium" #default chrome is added ) -> str: """ Asynchronously scrape the content of a given URL using Playwright's sync API and scrolling. @@ -175,9 +204,17 @@ async def ascrape_playwright_scroll( while attempt < self.RETRY_LIMIT: try: async with async_playwright() as p: - browser = await p.chromium.launch( + browser = None + if browser_name == "chromium": + browser = await p.chromium.launch( headless=self.headless, proxy=self.proxy, **self.browser_config ) + elif browser_name == "firefox": + browser = await p.firefox.launch( + headless=self.headless, proxy=self.proxy, **self.browser_config + ) + else: + raise ValueError(f"Invalid browser name: {browser_name}") context = await browser.new_context() await Malenia.apply_stealth(context) page = await context.new_page() @@ -235,7 +272,7 @@ async def ascrape_playwright_scroll( return results - async def ascrape_playwright(self, url: str) -> str: + async def ascrape_playwright(self, url: str, browser_name: str = "chromium") -> str: """ Asynchronously scrape the content of a given URL using Playwright's async API. 
@@ -255,9 +292,17 @@ async def ascrape_playwright(self, url: str) -> str: while attempt < self.RETRY_LIMIT: try: async with async_playwright() as p, async_timeout.timeout(self.TIMEOUT): - browser = await p.chromium.launch( + browser = None + if browser_name == "chromium": + browser = await p.chromium.launch( headless=self.headless, proxy=self.proxy, **self.browser_config ) + elif browser_name == "firefox": + browser = await p.firefox.launch( + headless=self.headless, proxy=self.proxy, **self.browser_config + ) + else: + raise ValueError(f"Invalid browser name: {browser_name}") context = await browser.new_context( storage_state=self.storage_state ) @@ -282,7 +327,7 @@ async def ascrape_playwright(self, url: str) -> str: - async def ascrape_with_js_support(self, url: str) -> str: + async def ascrape_with_js_support(self, url: str , browser_name:str = "chromium") -> str: """ Asynchronously scrape the content of a given URL by rendering JavaScript using Playwright. @@ -302,9 +347,17 @@ async def ascrape_with_js_support(self, url: str) -> str: while attempt < self.RETRY_LIMIT: try: async with async_playwright() as p, async_timeout.timeout(self.TIMEOUT): - browser = await p.chromium.launch( + browser = None + if browser_name == "chromium": + browser = await p.chromium.launch( headless=self.headless, proxy=self.proxy, **self.browser_config ) + elif browser_name == "firefox": + browser = await p.firefox.launch( + headless=self.headless, proxy=self.proxy, **self.browser_config + ) + else: + raise ValueError(f"Invalid browser name: {browser_name}") context = await browser.new_context( storage_state=self.storage_state ) From 35a490747cf6b8dad747a4af7f02d6f5aeb0d338 Mon Sep 17 00:00:00 2001 From: Federico Aguzzi <62149513+f-aguzzi@users.noreply.github.com> Date: Wed, 18 Dec 2024 09:41:34 +0100 Subject: [PATCH 13/49] fix: pyproject --- pyproject.toml | 7 ------- 1 file changed, 7 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 2cedba46..a017a651 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,13 +1,6 @@ [project] name = "scrapegraphai" - - - version = "1.34.0b2" - - - - description = "A web scraping library based on LangChain which uses LLM and direct graph logic to create scraping pipelines." 
authors = [ { name = "Marco Vinciguerra", email = "mvincig11@gmail.com" }, From 7cd865b98d1b14446cf2959db04ad1b81728c5aa Mon Sep 17 00:00:00 2001 From: semantic-release-bot Date: Wed, 18 Dec 2024 08:42:50 +0000 Subject: [PATCH 14/49] ci(release): 1.34.0-beta.3 [skip ci] ## [1.34.0-beta.3](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.34.0-beta.2...v1.34.0-beta.3) (2024-12-18) ### Bug Fixes * pyproject ([35a4907](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/35a490747cf6b8dad747a4af7f02d6f5aeb0d338)) --- CHANGELOG.md | 7 +++++++ pyproject.toml | 2 +- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index d7eca23c..44e81337 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,10 @@ +## [1.34.0-beta.3](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.34.0-beta.2...v1.34.0-beta.3) (2024-12-18) + + +### Bug Fixes + +* pyproject ([35a4907](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/35a490747cf6b8dad747a4af7f02d6f5aeb0d338)) + ## [1.34.0-beta.2](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.34.0-beta.1...v1.34.0-beta.2) (2024-12-17) diff --git a/pyproject.toml b/pyproject.toml index a017a651..91e6c004 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "scrapegraphai" -version = "1.34.0b2" +version = "1.34.0b3" description = "A web scraping library based on LangChain which uses LLM and direct graph logic to create scraping pipelines." authors = [ { name = "Marco Vinciguerra", email = "mvincig11@gmail.com" }, From d2fc53fc8414475c9bee7590144fe4251d56faf4 Mon Sep 17 00:00:00 2001 From: Federico Aguzzi <62149513+f-aguzzi@users.noreply.github.com> Date: Wed, 18 Dec 2024 09:49:25 +0100 Subject: [PATCH 15/49] fix: build config --- .github/workflows/release.yml | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 4e6b9ee6..6159ce8c 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -64,13 +64,13 @@ jobs: with: semantic_version: 23 extra_plugins: | - semantic-release-pypi@3 - @semantic-release/git - @semantic-release/commit-analyzer@12 - @semantic-release/release-notes-generator@13 - @semantic-release/github@10 - @semantic-release/changelog@6 - conventional-changelog-conventionalcommits@7 + semantic-release-pypi@latest + @semantic-release/git@latest + @semantic-release/commit-analyzer@latest + @semantic-release/release-notes-generator@latest + @semantic-release/github@latest + @semantic-release/changelog@latest + conventional-changelog-conventionalcommits@latest env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} PYPI_TOKEN: ${{ secrets.PYPI_TOKEN }} From 46f598546109067267d01ae7d8ea7609526ea4d4 Mon Sep 17 00:00:00 2001 From: Federico Aguzzi <62149513+f-aguzzi@users.noreply.github.com> Date: Wed, 18 Dec 2024 09:49:25 +0100 Subject: [PATCH 16/49] fix: build config --- .github/workflows/release.yml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 4e6b9ee6..7544d512 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -64,13 +64,13 @@ jobs: with: semantic_version: 23 extra_plugins: | - semantic-release-pypi@3 - @semantic-release/git - @semantic-release/commit-analyzer@12 - @semantic-release/release-notes-generator@13 - @semantic-release/github@10 + semantic-release-pypi@4 + @semantic-release/git@10 + 
@semantic-release/commit-analyzer@13 + @semantic-release/release-notes-generator@14 + @semantic-release/github@11 @semantic-release/changelog@6 - conventional-changelog-conventionalcommits@7 + conventional-changelog-conventionalcommits@8 env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} PYPI_TOKEN: ${{ secrets.PYPI_TOKEN }} From 38e477c540a3a50fc7ff6120da255d51798bfadd Mon Sep 17 00:00:00 2001 From: Federico Aguzzi <62149513+f-aguzzi@users.noreply.github.com> Date: Wed, 18 Dec 2024 10:04:39 +0100 Subject: [PATCH 17/49] fix: release config --- .github/workflows/release.yml | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 7544d512..d26ef64d 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -32,7 +32,7 @@ jobs: id: build_cache if: success() - name: Cache build - uses: actions/cache@v2 + uses: actions/cache@latest with: path: ./dist key: ${{ runner.os }}-build-${{ hashFiles('dist/**') }} @@ -59,6 +59,10 @@ jobs: with: fetch-depth: 0 persist-credentials: false + - name: Install semantic release plugins + run: npm install semantic-release-pypi@4 @semantic-release/git@10 @semantic-release/commit-analyzer@13 @semantic-release/release-notes-generator@14 @semantic-release/github@11 @semantic-release/changelog@6 conventional-changelog-conventionalcommits@8 --no-audit + - name: Clear npm cache + run: npm cache clean --force - name: Semantic Release uses: cycjimmy/semantic-release-action@v4.1.0 with: From 89863ee166e09ee18287bfcc1b5475d894c9e8c6 Mon Sep 17 00:00:00 2001 From: Federico Aguzzi <62149513+f-aguzzi@users.noreply.github.com> Date: Wed, 18 Dec 2024 10:04:39 +0100 Subject: [PATCH 18/49] fix: release config --- .github/workflows/release.yml | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 7544d512..8e26f5f3 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -32,7 +32,7 @@ jobs: id: build_cache if: success() - name: Cache build - uses: actions/cache@v2 + uses: actions/cache@v4 with: path: ./dist key: ${{ runner.os }}-build-${{ hashFiles('dist/**') }} @@ -59,6 +59,10 @@ jobs: with: fetch-depth: 0 persist-credentials: false + - name: Install semantic release plugins + run: npm install semantic-release-pypi@4 @semantic-release/git@10 @semantic-release/commit-analyzer@13 @semantic-release/release-notes-generator@14 @semantic-release/github@11 @semantic-release/changelog@6 conventional-changelog-conventionalcommits@8 --no-audit + - name: Clear npm cache + run: npm cache clean --force - name: Semantic Release uses: cycjimmy/semantic-release-action@v4.1.0 with: From 62ee294a864993a9414644c1547bafb96a43df20 Mon Sep 17 00:00:00 2001 From: Federico Aguzzi <62149513+f-aguzzi@users.noreply.github.com> Date: Wed, 18 Dec 2024 10:28:40 +0100 Subject: [PATCH 19/49] fix: release config --- .github/workflows/release.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 8e26f5f3..b91cee0c 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -19,7 +19,7 @@ jobs: - name: Install Node Env uses: actions/setup-node@v4 with: - node-version: 20 + node-version: 22 - name: Checkout uses: actions/checkout@v4.1.1 with: From b186a4f1c73fe29fa706158cc3c61812d6b16343 Mon Sep 17 00:00:00 2001 From: Federico Aguzzi <62149513+f-aguzzi@users.noreply.github.com> Date: Wed, 18 Dec 2024 
10:35:36 +0100 Subject: [PATCH 20/49] fix: build config --- .github/workflows/release.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index b91cee0c..5532b3db 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -16,6 +16,8 @@ jobs: sudo apt install -y git - name: Install uv uses: astral-sh/setup-uv@v3 + - name: Clear Node.js cache + run: sudo apt-get purge -y nodejs - name: Install Node Env uses: actions/setup-node@v4 with: From 2538fe3db339014ef54e2c78269bce9259e284ea Mon Sep 17 00:00:00 2001 From: Federico Aguzzi <62149513+f-aguzzi@users.noreply.github.com> Date: Wed, 18 Dec 2024 10:39:26 +0100 Subject: [PATCH 21/49] fix: last desperate attempt to restore automatic builds --- .github/workflows/release.yml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 5532b3db..7b15ead4 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -65,6 +65,12 @@ jobs: run: npm install semantic-release-pypi@4 @semantic-release/git@10 @semantic-release/commit-analyzer@13 @semantic-release/release-notes-generator@14 @semantic-release/github@11 @semantic-release/changelog@6 conventional-changelog-conventionalcommits@8 --no-audit - name: Clear npm cache run: npm cache clean --force + - name: Clear Node.js cache + run: sudo apt-get purge -y nodejs + - name: Install Node Env + uses: actions/setup-node@v4 + with: + node-version: 22 - name: Semantic Release uses: cycjimmy/semantic-release-action@v4.1.0 with: From 9cd0d31882c22f347ebd9c58d8dd66b47d178c64 Mon Sep 17 00:00:00 2001 From: Federico Aguzzi <62149513+f-aguzzi@users.noreply.github.com> Date: Wed, 18 Dec 2024 10:28:40 +0100 Subject: [PATCH 22/49] fix: release config fix: last desperate attempt to restore automatic builds fix: build config --- .github/workflows/release.yml | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 8e26f5f3..a8557a9d 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -32,7 +32,7 @@ jobs: id: build_cache if: success() - name: Cache build - uses: actions/cache@v4 + uses: actions/cache@v2 with: path: ./dist key: ${{ runner.os }}-build-${{ hashFiles('dist/**') }} @@ -59,22 +59,18 @@ jobs: with: fetch-depth: 0 persist-credentials: false - - name: Install semantic release plugins - run: npm install semantic-release-pypi@4 @semantic-release/git@10 @semantic-release/commit-analyzer@13 @semantic-release/release-notes-generator@14 @semantic-release/github@11 @semantic-release/changelog@6 conventional-changelog-conventionalcommits@8 --no-audit - - name: Clear npm cache - run: npm cache clean --force - name: Semantic Release uses: cycjimmy/semantic-release-action@v4.1.0 with: semantic_version: 23 extra_plugins: | - semantic-release-pypi@4 - @semantic-release/git@10 - @semantic-release/commit-analyzer@13 - @semantic-release/release-notes-generator@14 - @semantic-release/github@11 + semantic-release-pypi@3 + @semantic-release/git + @semantic-release/commit-analyzer@12 + @semantic-release/release-notes-generator@13 + @semantic-release/github@10 @semantic-release/changelog@6 - conventional-changelog-conventionalcommits@8 + conventional-changelog-conventionalcommits@7 env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - PYPI_TOKEN: ${{ secrets.PYPI_TOKEN }} + PYPI_TOKEN: ${{ secrets.PYPI_TOKEN }} \ No newline at end of file 
From 9cba928cc4449acdb784649c5a804f1ef8c7a7a5 Mon Sep 17 00:00:00 2001 From: semantic-release-bot Date: Wed, 18 Dec 2024 09:44:37 +0000 Subject: [PATCH 23/49] ci(release): 1.34.0-beta.4 [skip ci] ## [1.34.0-beta.4](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.34.0-beta.3...v1.34.0-beta.4) (2024-12-18) ### Bug Fixes * build config ([b186a4f](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/b186a4f1c73fe29fa706158cc3c61812d6b16343)) * build config ([46f5985](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/46f598546109067267d01ae7d8ea7609526ea4d4)) * build config ([d2fc53f](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/d2fc53fc8414475c9bee7590144fe4251d56faf4)) * last desperate attempt to restore automatic builds ([2538fe3](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/2538fe3db339014ef54e2c78269bce9259e284ea)) * release config ([9cd0d31](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/9cd0d31882c22f347ebd9c58d8dd66b47d178c64)) * release config ([62ee294](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/62ee294a864993a9414644c1547bafb96a43df20)) * release config ([89863ee](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/89863ee166e09ee18287bfcc1b5475d894c9e8c6)) * release config ([38e477c](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/38e477c540a3a50fc7ff6120da255d51798bfadd)) --- CHANGELOG.md | 14 ++++++++++++++ pyproject.toml | 2 +- 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 44e81337..6c81a205 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,17 @@ +## [1.34.0-beta.4](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.34.0-beta.3...v1.34.0-beta.4) (2024-12-18) + + +### Bug Fixes + +* build config ([b186a4f](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/b186a4f1c73fe29fa706158cc3c61812d6b16343)) +* build config ([46f5985](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/46f598546109067267d01ae7d8ea7609526ea4d4)) +* build config ([d2fc53f](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/d2fc53fc8414475c9bee7590144fe4251d56faf4)) +* last desperate attempt to restore automatic builds ([2538fe3](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/2538fe3db339014ef54e2c78269bce9259e284ea)) +* release config ([9cd0d31](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/9cd0d31882c22f347ebd9c58d8dd66b47d178c64)) +* release config ([62ee294](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/62ee294a864993a9414644c1547bafb96a43df20)) +* release config ([89863ee](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/89863ee166e09ee18287bfcc1b5475d894c9e8c6)) +* release config ([38e477c](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/38e477c540a3a50fc7ff6120da255d51798bfadd)) + ## [1.34.0-beta.3](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.34.0-beta.2...v1.34.0-beta.3) (2024-12-18) diff --git a/pyproject.toml b/pyproject.toml index 91e6c004..ac50b84b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "scrapegraphai" -version = "1.34.0b3" +version = "1.34.0b4" description = "A web scraping library based on LangChain which uses LLM and direct graph logic to create scraping pipelines." 
authors = [ { name = "Marco Vinciguerra", email = "mvincig11@gmail.com" }, From 489adf7006d6ea2492a2e485dad4a11e2f865cfc Mon Sep 17 00:00:00 2001 From: Marcelo Busana Date: Sat, 21 Dec 2024 09:53:56 -0300 Subject: [PATCH 24/49] Add the AbstractGraph run_safe_async and its test --- requirements-dev.txt | 1 + scrapegraphai/graphs/abstract_graph.py | 12 ++++++++++++ tests/graphs/abstract_graph_test.py | 10 +++++++++- 3 files changed, 22 insertions(+), 1 deletion(-) diff --git a/requirements-dev.txt b/requirements-dev.txt index e04399e9..9174e4a4 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -1,4 +1,5 @@ pytest==8.0.0 +pytest-asyncio==0.25.0 pytest-mock==3.14.0 burr[start]==0.22.1 sphinx==6.0 diff --git a/scrapegraphai/graphs/abstract_graph.py b/scrapegraphai/graphs/abstract_graph.py index 1148cc29..476b4b5b 100644 --- a/scrapegraphai/graphs/abstract_graph.py +++ b/scrapegraphai/graphs/abstract_graph.py @@ -5,6 +5,7 @@ from abc import ABC, abstractmethod from typing import Optional import uuid +import asyncio import warnings from pydantic import BaseModel from langchain.chat_models import init_chat_model @@ -293,3 +294,14 @@ def run(self) -> str: """ Abstract method to execute the graph and return the result. """ + + async def run_safe_async(self) -> str: + """ + Executes the run process asynchronously safety. + + Returns: + str: The answer to the prompt. + """ + + loop = asyncio.get_event_loop() + return await loop.run_in_executor(None, self.run) \ No newline at end of file diff --git a/tests/graphs/abstract_graph_test.py b/tests/graphs/abstract_graph_test.py index 642868fb..27f5b660 100644 --- a/tests/graphs/abstract_graph_test.py +++ b/tests/graphs/abstract_graph_test.py @@ -96,4 +96,12 @@ def test_create_llm_unknown_provider(self): def test_create_llm_with_rate_limit(self, llm_config, expected_model): graph = TestGraph("Test prompt", {"llm": llm_config}) - assert isinstance(graph.llm_model, expected_model) \ No newline at end of file + assert isinstance(graph.llm_model, expected_model) + + @pytest.mark.asyncio + async def test_run_safe_async(self): + graph = TestGraph("Test prompt", {"llm": {"model": "openai/gpt-3.5-turbo", "openai_api_key": "sk-randomtest001"}}) + with patch.object(graph, 'run', return_value="Async result") as mock_run: + result = await graph.run_safe_async() + assert result == "Async result" + mock_run.assert_called_once() \ No newline at end of file From cb6d140042685bd419444d75ae7cab706cbcee38 Mon Sep 17 00:00:00 2001 From: PeriniM Date: Thu, 2 Jan 2025 17:11:40 +0100 Subject: [PATCH 25/49] fix: release workflow --- .github/workflows/release.yml | 47 +++++++++++++++++++++++++---------- 1 file changed, 34 insertions(+), 13 deletions(-) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index a8557a9d..0c2c2b51 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -14,40 +14,52 @@ jobs: run: | sudo apt update sudo apt install -y git + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: '3.10' + - name: Install uv uses: astral-sh/setup-uv@v3 + - name: Install Node Env uses: actions/setup-node@v4 with: node-version: 20 + - name: Checkout uses: actions/checkout@v4.1.1 with: fetch-depth: 0 persist-credentials: false - - name: Build app + + - name: Install dependencies run: | + uv pip install build hatchling hatch-vcs uv sync --frozen - uv build + + - name: Build package + run: python -m build id: build_cache - if: success() - - name: Cache build - uses: actions/cache@v2 + + - 
name: Cache build artifacts + uses: actions/cache@v3 with: - path: ./dist - key: ${{ runner.os }}-build-${{ hashFiles('dist/**') }} - if: steps.build_cache.outputs.id != '' + path: | + ./dist + ./build + key: ${{ runner.os }}-build-${{ github.sha }} release: name: Release runs-on: ubuntu-latest needs: build environment: development - if: | - github.event_name == 'push' && github.ref == 'refs/heads/main' || - github.event_name == 'push' && github.ref == 'refs/heads/pre/beta' || - github.event_name == 'pull_request' && github.event.action == 'closed' && github.event.pull_request.merged && github.event.pull_request.base.ref == 'main' || - github.event_name == 'pull_request' && github.event.action == 'closed' && github.event.pull_request.merged && github.event.pull_request.base.ref == 'pre/beta' + if: | + github.event_name == 'push' && (github.ref == 'refs/heads/main' || startsWith(github.ref, 'refs/heads/pre/')) || + github.event_name == 'pull_request' && github.event.action == 'closed' && github.event.pull_request.merged && + (github.event.pull_request.base.ref == 'main' || startsWith(github.event.pull_request.base.ref, 'pre/')) permissions: contents: write issues: write @@ -59,6 +71,15 @@ jobs: with: fetch-depth: 0 persist-credentials: false + + - name: Restore build artifacts + uses: actions/cache@v3 + with: + path: | + ./dist + ./build + key: ${{ runner.os }}-build-${{ github.sha }} + - name: Semantic Release uses: cycjimmy/semantic-release-action@v4.1.0 with: From bcac20a7a8e65e2aa5760fb14e17b8054b4f4cf4 Mon Sep 17 00:00:00 2001 From: PeriniM Date: Thu, 2 Jan 2025 17:17:13 +0100 Subject: [PATCH 26/49] fix: uv install workflow --- .github/workflows/release.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 0c2c2b51..fda1a13a 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -36,6 +36,8 @@ jobs: - name: Install dependencies run: | + uv venv + source .venv/bin/activate uv pip install build hatchling hatch-vcs uv sync --frozen From 1be6ffe309124d55b8b3b66ded448f06dfd87b7e Mon Sep 17 00:00:00 2001 From: PeriniM Date: Thu, 2 Jan 2025 17:20:22 +0100 Subject: [PATCH 27/49] fix: uv build --- .github/workflows/release.yml | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index fda1a13a..db445a86 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -34,17 +34,14 @@ jobs: fetch-depth: 0 persist-credentials: false - - name: Install dependencies + - name: Install dependencies and build package run: | uv venv - source .venv/bin/activate + . 
.venv/bin/activate uv pip install build hatchling hatch-vcs uv sync --frozen + uv build --no-sources - - name: Build package - run: python -m build - id: build_cache - - name: Cache build artifacts uses: actions/cache@v3 with: From ab50a613e854fab671597659b64296f8a37a462c Mon Sep 17 00:00:00 2001 From: semantic-release-bot Date: Thu, 2 Jan 2025 16:23:51 +0000 Subject: [PATCH 28/49] ci(release): 1.34.0-beta.5 [skip ci] ## [1.34.0-beta.5](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.34.0-beta.4...v1.34.0-beta.5) (2025-01-02) ### Bug Fixes * release workflow ([cb6d140](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/cb6d140042685bd419444d75ae7cab706cbcee38)) * uv build ([1be6ffe](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/1be6ffe309124d55b8b3b66ded448f06dfd87b7e)) * uv install workflow ([bcac20a](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/bcac20a7a8e65e2aa5760fb14e17b8054b4f4cf4)) --- CHANGELOG.md | 9 +++++++++ pyproject.toml | 2 +- 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 6c81a205..5d6e2b94 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,12 @@ +## [1.34.0-beta.5](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.34.0-beta.4...v1.34.0-beta.5) (2025-01-02) + + +### Bug Fixes + +* release workflow ([cb6d140](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/cb6d140042685bd419444d75ae7cab706cbcee38)) +* uv build ([1be6ffe](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/1be6ffe309124d55b8b3b66ded448f06dfd87b7e)) +* uv install workflow ([bcac20a](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/bcac20a7a8e65e2aa5760fb14e17b8054b4f4cf4)) + ## [1.34.0-beta.4](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.34.0-beta.3...v1.34.0-beta.4) (2024-12-18) diff --git a/pyproject.toml b/pyproject.toml index ac50b84b..0b0e4e25 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "scrapegraphai" -version = "1.34.0b4" +version = "1.34.0b5" description = "A web scraping library based on LangChain which uses LLM and direct graph logic to create scraping pipelines." authors = [ { name = "Marco Vinciguerra", email = "mvincig11@gmail.com" }, From a00f128992e9fef88c870295c46b983b4286a3eb Mon Sep 17 00:00:00 2001 From: PeriniM Date: Thu, 2 Jan 2025 17:34:23 +0100 Subject: [PATCH 29/49] fix: release workflow --- .github/workflows/release.yml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index db445a86..72213d72 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -38,9 +38,10 @@ jobs: run: | uv venv . .venv/bin/activate - uv pip install build hatchling hatch-vcs + uv pip install twine uv sync --frozen - uv build --no-sources + uv pip install -e . 
+ uv build - name: Cache build artifacts uses: actions/cache@v3 From 44524f3ac4ae72ef3813f7f2a26edbb54a7c524e Mon Sep 17 00:00:00 2001 From: semantic-release-bot Date: Thu, 2 Jan 2025 16:38:15 +0000 Subject: [PATCH 30/49] ci(release): 1.34.0-beta.6 [skip ci] ## [1.34.0-beta.6](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.34.0-beta.5...v1.34.0-beta.6) (2025-01-02) ### Bug Fixes * release workflow ([a00f128](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/a00f128992e9fef88c870295c46b983b4286a3eb)) --- CHANGELOG.md | 7 +++++++ pyproject.toml | 2 +- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5d6e2b94..0459c515 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,10 @@ +## [1.34.0-beta.6](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.34.0-beta.5...v1.34.0-beta.6) (2025-01-02) + + +### Bug Fixes + +* release workflow ([a00f128](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/a00f128992e9fef88c870295c46b983b4286a3eb)) + ## [1.34.0-beta.5](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.34.0-beta.4...v1.34.0-beta.5) (2025-01-02) diff --git a/pyproject.toml b/pyproject.toml index 0b0e4e25..bd6bf225 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "scrapegraphai" -version = "1.34.0b5" +version = "1.34.0b6" description = "A web scraping library based on LangChain which uses LLM and direct graph logic to create scraping pipelines." authors = [ { name = "Marco Vinciguerra", email = "mvincig11@gmail.com" }, From a0c0a7ff5c5dc9a107e7be8d5b5e1854886d411c Mon Sep 17 00:00:00 2001 From: PeriniM Date: Thu, 2 Jan 2025 17:48:54 +0100 Subject: [PATCH 31/49] fix: revert to d1b2104 --- .github/workflows/release.yml | 47 ++++++++++------------------------- pyproject.toml | 29 +-------------------- 2 files changed, 14 insertions(+), 62 deletions(-) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 72213d72..a8557a9d 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -14,52 +14,40 @@ jobs: run: | sudo apt update sudo apt install -y git - - - name: Set up Python - uses: actions/setup-python@v5 - with: - python-version: '3.10' - - name: Install uv uses: astral-sh/setup-uv@v3 - - name: Install Node Env uses: actions/setup-node@v4 with: node-version: 20 - - name: Checkout uses: actions/checkout@v4.1.1 with: fetch-depth: 0 persist-credentials: false - - - name: Install dependencies and build package + - name: Build app run: | - uv venv - . .venv/bin/activate - uv pip install twine uv sync --frozen - uv pip install -e . 
uv build - - - name: Cache build artifacts - uses: actions/cache@v3 + id: build_cache + if: success() + - name: Cache build + uses: actions/cache@v2 with: - path: | - ./dist - ./build - key: ${{ runner.os }}-build-${{ github.sha }} + path: ./dist + key: ${{ runner.os }}-build-${{ hashFiles('dist/**') }} + if: steps.build_cache.outputs.id != '' release: name: Release runs-on: ubuntu-latest needs: build environment: development - if: | - github.event_name == 'push' && (github.ref == 'refs/heads/main' || startsWith(github.ref, 'refs/heads/pre/')) || - github.event_name == 'pull_request' && github.event.action == 'closed' && github.event.pull_request.merged && - (github.event.pull_request.base.ref == 'main' || startsWith(github.event.pull_request.base.ref, 'pre/')) + if: | + github.event_name == 'push' && github.ref == 'refs/heads/main' || + github.event_name == 'push' && github.ref == 'refs/heads/pre/beta' || + github.event_name == 'pull_request' && github.event.action == 'closed' && github.event.pull_request.merged && github.event.pull_request.base.ref == 'main' || + github.event_name == 'pull_request' && github.event.action == 'closed' && github.event.pull_request.merged && github.event.pull_request.base.ref == 'pre/beta' permissions: contents: write issues: write @@ -71,15 +59,6 @@ jobs: with: fetch-depth: 0 persist-credentials: false - - - name: Restore build artifacts - uses: actions/cache@v3 - with: - path: | - ./dist - ./build - key: ${{ runner.os }}-build-${{ github.sha }} - - name: Semantic Release uses: cycjimmy/semantic-release-action@v4.1.0 with: diff --git a/pyproject.toml b/pyproject.toml index 0b0e4e25..f5192ff2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -108,36 +108,9 @@ screenshot_scraper = [ ] [build-system] -requires = ["hatchling>=1.0.0", "hatch-vcs"] +requires = ["hatchling"] build-backend = "hatchling.build" -[tool.hatch.build] -packages = ["scrapegraphai"] -exclude = [ - "tests/**", - "examples/**", -] - -[tool.hatch.version] -source = "vcs" - -[tool.hatch.build.hooks.vcs] -version-file = "scrapegraphai/_version.py" - -[tool.hatch.build.targets.wheel] -packages = ["scrapegraphai"] - -[tool.hatch.build.targets.sdist] -include = [ - "/scrapegraphai", - "pyproject.toml", - "README.md", - "LICENSE", -] - -[tool.hatch.metadata] -allow-direct-references = true - [dependency-groups] dev = [ "burr[start]==0.22.1", From 6f7547dee89b1e83fca0bccbb744c6d84b7cb64e Mon Sep 17 00:00:00 2001 From: semantic-release-bot Date: Thu, 2 Jan 2025 16:50:09 +0000 Subject: [PATCH 32/49] ci(release): 1.34.0-beta.7 [skip ci] ## [1.34.0-beta.7](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.34.0-beta.6...v1.34.0-beta.7) (2025-01-02) ### Bug Fixes * revert to d1b2104 ([a0c0a7f](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/a0c0a7ff5c5dc9a107e7be8d5b5e1854886d411c)) --- CHANGELOG.md | 7 +++++++ pyproject.toml | 2 +- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 0459c515..6b892258 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,10 @@ +## [1.34.0-beta.7](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.34.0-beta.6...v1.34.0-beta.7) (2025-01-02) + + +### Bug Fixes + +* revert to d1b2104 ([a0c0a7f](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/a0c0a7ff5c5dc9a107e7be8d5b5e1854886d411c)) + ## [1.34.0-beta.6](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.34.0-beta.5...v1.34.0-beta.6) (2025-01-02) diff --git a/pyproject.toml b/pyproject.toml index f8ef6db5..7a3b42aa 100644 --- 
a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "scrapegraphai" -version = "1.34.0b6" +version = "1.34.0b7" description = "A web scraping library based on LangChain which uses LLM and direct graph logic to create scraping pipelines." authors = [ { name = "Marco Vinciguerra", email = "mvincig11@gmail.com" }, From 95b8990a3649646972e12d78b11c7e1b7e707bf6 Mon Sep 17 00:00:00 2001 From: PeriniM Date: Thu, 2 Jan 2025 18:27:49 +0100 Subject: [PATCH 33/49] fix: version --- .github/workflows/release.yml | 27 +++++--- uv.lock | 118 ++++++++++++++++++---------------- 2 files changed, 81 insertions(+), 64 deletions(-) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index a8557a9d..9b945a95 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -25,17 +25,29 @@ jobs: with: fetch-depth: 0 persist-credentials: false + - name: Remove Hardcoded Version (if present) + run: | + sed -i '/^version =/d' pyproject.toml + - name: Update Dependencies + run: | + uv pip install --upgrade setuptools wheel hatchling twine - name: Build app run: | uv sync --frozen uv build id: build_cache if: success() + - name: Validate Metadata + run: | + uv run twine check dist/* + - name: Debug Dist Directory + run: | + ls -al dist - name: Cache build - uses: actions/cache@v2 + uses: actions/cache@v3 with: path: ./dist - key: ${{ runner.os }}-build-${{ hashFiles('dist/**') }} + key: ${{ runner.os }}-build-${{ github.sha }} if: steps.build_cache.outputs.id != '' release: @@ -43,11 +55,10 @@ jobs: runs-on: ubuntu-latest needs: build environment: development - if: | - github.event_name == 'push' && github.ref == 'refs/heads/main' || - github.event_name == 'push' && github.ref == 'refs/heads/pre/beta' || - github.event_name == 'pull_request' && github.event.action == 'closed' && github.event.pull_request.merged && github.event.pull_request.base.ref == 'main' || - github.event_name == 'pull_request' && github.event.action == 'closed' && github.event.pull_request.merged && github.event.pull_request.base.ref == 'pre/beta' + if: > + github.event_name == 'push' && (github.ref == 'refs/heads/main' || github.ref == 'refs/heads/pre/beta') || + (github.event_name == 'pull_request' && github.event.action == 'closed' && github.event.pull_request.merged && + (github.event.pull_request.base.ref == 'main' || github.event.pull_request.base.ref == 'pre/beta')) permissions: contents: write issues: write @@ -73,4 +84,4 @@ jobs: conventional-changelog-conventionalcommits@7 env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - PYPI_TOKEN: ${{ secrets.PYPI_TOKEN }} \ No newline at end of file + PYPI_TOKEN: ${{ secrets.PYPI_TOKEN }} diff --git a/uv.lock b/uv.lock index 57b7f611..1058327b 100644 --- a/uv.lock +++ b/uv.lock @@ -1,21 +1,21 @@ version = 1 requires-python = ">=3.10, <4.0" resolution-markers = [ - "python_full_version < '3.11' and platform_system == 'Darwin'", - "python_full_version < '3.11' and platform_machine == 'aarch64' and platform_system == 'Linux'", - "(python_full_version < '3.11' and platform_machine != 'aarch64' and platform_system != 'Darwin') or (python_full_version < '3.11' and platform_system != 'Darwin' and platform_system != 'Linux')", - "python_full_version == '3.11.*' and platform_system == 'Darwin'", - "python_full_version == '3.11.*' and platform_machine == 'aarch64' and platform_system == 'Linux'", - "(python_full_version == '3.11.*' and platform_machine != 'aarch64' and platform_system != 'Darwin') or (python_full_version == '3.11.*' and 
platform_system != 'Darwin' and platform_system != 'Linux')", - "python_full_version >= '3.12' and python_full_version < '3.12.4' and platform_system == 'Darwin'", - "python_full_version >= '3.12' and python_full_version < '3.12.4' and platform_machine == 'aarch64' and platform_system == 'Linux'", - "(python_full_version >= '3.12' and python_full_version < '3.12.4' and platform_machine != 'aarch64' and platform_system != 'Darwin') or (python_full_version >= '3.12' and python_full_version < '3.12.4' and platform_system != 'Darwin' and platform_system != 'Linux')", - "python_full_version >= '3.12.4' and python_full_version < '3.13' and platform_system == 'Darwin'", - "python_full_version >= '3.13' and platform_system == 'Darwin'", - "python_full_version >= '3.12.4' and python_full_version < '3.13' and platform_machine == 'aarch64' and platform_system == 'Linux'", - "python_full_version >= '3.13' and platform_machine == 'aarch64' and platform_system == 'Linux'", - "(python_full_version >= '3.12.4' and python_full_version < '3.13' and platform_machine != 'aarch64' and platform_system != 'Darwin') or (python_full_version >= '3.12.4' and python_full_version < '3.13' and platform_system != 'Darwin' and platform_system != 'Linux')", - "(python_full_version >= '3.13' and platform_machine != 'aarch64' and platform_system != 'Darwin') or (python_full_version >= '3.13' and platform_system != 'Darwin' and platform_system != 'Linux')", + "python_full_version < '3.11' and sys_platform == 'darwin'", + "python_full_version < '3.11' and platform_machine == 'aarch64' and sys_platform == 'linux'", + "(python_full_version < '3.11' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version < '3.11' and sys_platform != 'darwin' and sys_platform != 'linux')", + "python_full_version == '3.11.*' and sys_platform == 'darwin'", + "python_full_version == '3.11.*' and platform_machine == 'aarch64' and sys_platform == 'linux'", + "(python_full_version == '3.11.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.11.*' and sys_platform != 'darwin' and sys_platform != 'linux')", + "python_full_version >= '3.12' and python_full_version < '3.12.4' and sys_platform == 'darwin'", + "python_full_version >= '3.12' and python_full_version < '3.12.4' and platform_machine == 'aarch64' and sys_platform == 'linux'", + "(python_full_version >= '3.12' and python_full_version < '3.12.4' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version >= '3.12' and python_full_version < '3.12.4' and sys_platform != 'darwin' and sys_platform != 'linux')", + "python_full_version >= '3.12.4' and python_full_version < '3.13' and sys_platform == 'darwin'", + "python_full_version >= '3.13' and sys_platform == 'darwin'", + "python_full_version >= '3.12.4' and python_full_version < '3.13' and platform_machine == 'aarch64' and sys_platform == 'linux'", + "python_full_version >= '3.13' and platform_machine == 'aarch64' and sys_platform == 'linux'", + "(python_full_version >= '3.12.4' and python_full_version < '3.13' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version >= '3.12.4' and python_full_version < '3.13' and sys_platform != 'darwin' and sys_platform != 'linux')", + "(python_full_version >= '3.13' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version >= '3.13' and sys_platform != 'darwin' and sys_platform != 'linux')", ] [[package]] @@ -222,12 +222,12 @@ name = "async-timeout" 
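Almost all of the uv.lock churn in this patch is one mechanical rename: the
resolver now emits the PEP 508 marker variable sys_platform (values such as
'linux', 'darwin', 'win32', taken from sys.platform) where it previously
emitted platform_system (values such as 'Linux', 'Darwin', 'Windows', from
platform.system()). For readers who have not met environment markers before,
a minimal sketch of how they evaluate, using the packaging library; the marker
strings are lifted from the hunks around this note:

    from packaging.markers import Marker

    old = Marker("platform_system == 'Windows'")  # backed by platform.system()
    new = Marker("sys_platform == 'win32'")       # backed by sys.platform

    # Evaluated on the current interpreter: both True on Windows, both False elsewhere.
    print(old.evaluate(), new.evaluate())

    # Markers can also be evaluated against an explicit environment dict,
    # which is essentially what a resolver does for each target platform:
    print(new.evaluate({"sys_platform": "win32"}))  # True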
version = "4.0.3" source = { registry = "https://pypi.org/simple" } resolution-markers = [ - "python_full_version < '3.11' and platform_system == 'Darwin'", - "python_full_version < '3.11' and platform_machine == 'aarch64' and platform_system == 'Linux'", - "(python_full_version < '3.11' and platform_machine != 'aarch64' and platform_system != 'Darwin') or (python_full_version < '3.11' and platform_system != 'Darwin' and platform_system != 'Linux')", - "python_full_version == '3.11.*' and platform_system == 'Darwin'", - "python_full_version == '3.11.*' and platform_machine == 'aarch64' and platform_system == 'Linux'", - "(python_full_version == '3.11.*' and platform_machine != 'aarch64' and platform_system != 'Darwin') or (python_full_version == '3.11.*' and platform_system != 'Darwin' and platform_system != 'Linux')", + "python_full_version < '3.11' and sys_platform == 'darwin'", + "python_full_version < '3.11' and platform_machine == 'aarch64' and sys_platform == 'linux'", + "(python_full_version < '3.11' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version < '3.11' and sys_platform != 'darwin' and sys_platform != 'linux')", + "python_full_version == '3.11.*' and sys_platform == 'darwin'", + "python_full_version == '3.11.*' and platform_machine == 'aarch64' and sys_platform == 'linux'", + "(python_full_version == '3.11.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.11.*' and sys_platform != 'darwin' and sys_platform != 'linux')", ] sdist = { url = "https://files.pythonhosted.org/packages/87/d6/21b30a550dafea84b1b8eee21b5e23fa16d010ae006011221f33dcd8d7f8/async-timeout-4.0.3.tar.gz", hash = "sha256:4640d96be84d82d02ed59ea2b7105a0f7b33abe8703703cd0ab0bf87c427522f", size = 8345 } wheels = [ @@ -239,15 +239,15 @@ name = "async-timeout" version = "5.0.1" source = { registry = "https://pypi.org/simple" } resolution-markers = [ - "python_full_version >= '3.12' and python_full_version < '3.12.4' and platform_system == 'Darwin'", - "python_full_version >= '3.12' and python_full_version < '3.12.4' and platform_machine == 'aarch64' and platform_system == 'Linux'", - "(python_full_version >= '3.12' and python_full_version < '3.12.4' and platform_machine != 'aarch64' and platform_system != 'Darwin') or (python_full_version >= '3.12' and python_full_version < '3.12.4' and platform_system != 'Darwin' and platform_system != 'Linux')", - "python_full_version >= '3.12.4' and python_full_version < '3.13' and platform_system == 'Darwin'", - "python_full_version >= '3.13' and platform_system == 'Darwin'", - "python_full_version >= '3.12.4' and python_full_version < '3.13' and platform_machine == 'aarch64' and platform_system == 'Linux'", - "python_full_version >= '3.13' and platform_machine == 'aarch64' and platform_system == 'Linux'", - "(python_full_version >= '3.12.4' and python_full_version < '3.13' and platform_machine != 'aarch64' and platform_system != 'Darwin') or (python_full_version >= '3.12.4' and python_full_version < '3.13' and platform_system != 'Darwin' and platform_system != 'Linux')", - "(python_full_version >= '3.13' and platform_machine != 'aarch64' and platform_system != 'Darwin') or (python_full_version >= '3.13' and platform_system != 'Darwin' and platform_system != 'Linux')", + "python_full_version >= '3.12' and python_full_version < '3.12.4' and sys_platform == 'darwin'", + "python_full_version >= '3.12' and python_full_version < '3.12.4' and platform_machine == 'aarch64' and sys_platform == 'linux'", + 
"(python_full_version >= '3.12' and python_full_version < '3.12.4' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version >= '3.12' and python_full_version < '3.12.4' and sys_platform != 'darwin' and sys_platform != 'linux')", + "python_full_version >= '3.12.4' and python_full_version < '3.13' and sys_platform == 'darwin'", + "python_full_version >= '3.13' and sys_platform == 'darwin'", + "python_full_version >= '3.12.4' and python_full_version < '3.13' and platform_machine == 'aarch64' and sys_platform == 'linux'", + "python_full_version >= '3.13' and platform_machine == 'aarch64' and sys_platform == 'linux'", + "(python_full_version >= '3.12.4' and python_full_version < '3.13' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version >= '3.12.4' and python_full_version < '3.13' and sys_platform != 'darwin' and sys_platform != 'linux')", + "(python_full_version >= '3.13' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version >= '3.13' and sys_platform != 'darwin' and sys_platform != 'linux')", ] sdist = { url = "https://files.pythonhosted.org/packages/a5/ae/136395dfbfe00dfc94da3f3e136d0b13f394cba8f4841120e34226265780/async_timeout-5.0.1.tar.gz", hash = "sha256:d9321a7a3d5a6a5e187e824d2fa0793ce379a202935782d555d6e9d2735677d3", size = 9274 } wheels = [ @@ -457,7 +457,7 @@ name = "click" version = "8.1.7" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "colorama", marker = "platform_system == 'Windows'" }, + { name = "colorama", marker = "sys_platform == 'win32'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/96/d3/f04c7bfcf5c1862a2a5b845c6b2b360488cf47af55dfa79c98f6a6bf98b5/click-8.1.7.tar.gz", hash = "sha256:ca9853ad459e787e2192211578cc907e7594e294c7ccc834310722b41b9ca6de", size = 336121 } wheels = [ @@ -2461,7 +2461,7 @@ version = "2.10.2" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "pygments" }, - { name = "pywin32", marker = "platform_system == 'Windows'" }, + { name = "pywin32", marker = "sys_platform == 'win32'" }, { name = "tqdm" }, ] sdist = { url = "https://files.pythonhosted.org/packages/3a/93/80ac75c20ce54c785648b4ed363c88f148bf22637e10c9863db4fbe73e74/mpire-2.10.2.tar.gz", hash = "sha256:f66a321e93fadff34585a4bfa05e95bd946cf714b442f51c529038eb45773d97", size = 271270 } @@ -2675,7 +2675,7 @@ name = "nvidia-cudnn-cu12" version = "9.1.0.70" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "nvidia-cublas-cu12", marker = "(platform_machine != 'aarch64' and platform_system != 'Darwin') or (platform_system != 'Darwin' and platform_system != 'Linux')" }, + { name = "nvidia-cublas-cu12", marker = "(platform_machine != 'aarch64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux')" }, ] wheels = [ { url = "https://files.pythonhosted.org/packages/9f/fd/713452cd72343f682b1c7b9321e23829f00b842ceaedcda96e742ea0b0b3/nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl", hash = "sha256:165764f44ef8c61fcdfdfdbe769d687e06374059fbb388b6c89ecb0e28793a6f", size = 664752741 }, @@ -2686,7 +2686,7 @@ name = "nvidia-cufft-cu12" version = "11.2.1.3" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "nvidia-nvjitlink-cu12", marker = "(platform_machine != 'aarch64' and platform_system != 'Darwin') or (platform_system != 'Darwin' and platform_system != 'Linux')" }, + { name = "nvidia-nvjitlink-cu12", marker = "(platform_machine != 'aarch64' 
and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux')" }, ] wheels = [ { url = "https://files.pythonhosted.org/packages/7a/8a/0e728f749baca3fbeffad762738276e5df60851958be7783af121a7221e7/nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_aarch64.whl", hash = "sha256:5dad8008fc7f92f5ddfa2101430917ce2ffacd86824914c82e28990ad7f00399", size = 211422548 }, @@ -2707,9 +2707,9 @@ name = "nvidia-cusolver-cu12" version = "11.6.1.9" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "nvidia-cublas-cu12", marker = "(platform_machine != 'aarch64' and platform_system != 'Darwin') or (platform_system != 'Darwin' and platform_system != 'Linux')" }, - { name = "nvidia-cusparse-cu12", marker = "(platform_machine != 'aarch64' and platform_system != 'Darwin') or (platform_system != 'Darwin' and platform_system != 'Linux')" }, - { name = "nvidia-nvjitlink-cu12", marker = "(platform_machine != 'aarch64' and platform_system != 'Darwin') or (platform_system != 'Darwin' and platform_system != 'Linux')" }, + { name = "nvidia-cublas-cu12", marker = "(platform_machine != 'aarch64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux')" }, + { name = "nvidia-cusparse-cu12", marker = "(platform_machine != 'aarch64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux')" }, + { name = "nvidia-nvjitlink-cu12", marker = "(platform_machine != 'aarch64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux')" }, ] wheels = [ { url = "https://files.pythonhosted.org/packages/46/6b/a5c33cf16af09166845345275c34ad2190944bcc6026797a39f8e0a282e0/nvidia_cusolver_cu12-11.6.1.9-py3-none-manylinux2014_aarch64.whl", hash = "sha256:d338f155f174f90724bbde3758b7ac375a70ce8e706d70b018dd3375545fc84e", size = 127634111 }, @@ -2721,7 +2721,7 @@ name = "nvidia-cusparse-cu12" version = "12.3.1.170" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "nvidia-nvjitlink-cu12", marker = "(platform_machine != 'aarch64' and platform_system != 'Darwin') or (platform_system != 'Darwin' and platform_system != 'Linux')" }, + { name = "nvidia-nvjitlink-cu12", marker = "(platform_machine != 'aarch64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux')" }, ] wheels = [ { url = "https://files.pythonhosted.org/packages/96/a9/c0d2f83a53d40a4a41be14cea6a0bf9e668ffcf8b004bd65633f433050c0/nvidia_cusparse_cu12-12.3.1.170-py3-none-manylinux2014_aarch64.whl", hash = "sha256:9d32f62896231ebe0480efd8a7f702e143c98cfaa0e8a76df3386c1ba2b54df3", size = 207381987 }, @@ -3132,7 +3132,7 @@ name = "portalocker" version = "2.10.1" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "pywin32", marker = "platform_system == 'Windows'" }, + { name = "pywin32", marker = "sys_platform == 'win32'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/ed/d3/c6c64067759e87af98cc668c1cc75171347d0f1577fab7ca3749134e3cd4/portalocker-2.10.1.tar.gz", hash = "sha256:ef1bf844e878ab08aee7e40184156e1151f228f103aa5c6bd0724cc330960f8f", size = 40891 } wheels = [ @@ -4081,7 +4081,7 @@ wheels = [ [[package]] name = "scrapegraphai" -version = "1.33.3" +version = "1.34.0b6" source = { editable = "." 
} dependencies = [ { name = "async-timeout", version = "4.0.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.12'" }, @@ -4145,10 +4145,13 @@ screenshot-scraper = [ [package.dev-dependencies] dev = [ + { name = "burr", extra = ["start"] }, + { name = "furo" }, { name = "poethepoet" }, { name = "pylint" }, { name = "pytest" }, { name = "pytest-mock" }, + { name = "sphinx" }, ] [package.metadata] @@ -4199,10 +4202,13 @@ requires-dist = [ [package.metadata.requires-dev] dev = [ + { name = "burr", extras = ["start"], specifier = "==0.22.1" }, + { name = "furo", specifier = "==2024.5.6" }, { name = "poethepoet", specifier = ">=0.31.1" }, { name = "pylint", specifier = ">=3.2.5" }, { name = "pytest", specifier = "==8.0.0" }, { name = "pytest-mock", specifier = "==3.14.0" }, + { name = "sphinx", specifier = "==6.0" }, ] [[package]] @@ -4560,7 +4566,7 @@ dependencies = [ { name = "toml" }, { name = "tornado" }, { name = "typing-extensions" }, - { name = "watchdog", marker = "platform_system != 'Darwin'" }, + { name = "watchdog", marker = "sys_platform != 'darwin'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/b0/e5/2bf2daa9c98658f1474bb64e7de030cbc4182b5f2b2196536efedaef02cb/streamlit-1.40.2.tar.gz", hash = "sha256:0cc131fc9b18065feaff8f6f241c81164ad37d8d9e3a85499a0240aaaf6a6a61", size = 8265763 } wheels = [ @@ -4764,21 +4770,21 @@ dependencies = [ { name = "fsspec" }, { name = "jinja2" }, { name = "networkx" }, - { name = "nvidia-cublas-cu12", marker = "platform_machine == 'x86_64' and platform_system == 'Linux'" }, - { name = "nvidia-cuda-cupti-cu12", marker = "platform_machine == 'x86_64' and platform_system == 'Linux'" }, - { name = "nvidia-cuda-nvrtc-cu12", marker = "platform_machine == 'x86_64' and platform_system == 'Linux'" }, - { name = "nvidia-cuda-runtime-cu12", marker = "platform_machine == 'x86_64' and platform_system == 'Linux'" }, - { name = "nvidia-cudnn-cu12", marker = "platform_machine == 'x86_64' and platform_system == 'Linux'" }, - { name = "nvidia-cufft-cu12", marker = "platform_machine == 'x86_64' and platform_system == 'Linux'" }, - { name = "nvidia-curand-cu12", marker = "platform_machine == 'x86_64' and platform_system == 'Linux'" }, - { name = "nvidia-cusolver-cu12", marker = "platform_machine == 'x86_64' and platform_system == 'Linux'" }, - { name = "nvidia-cusparse-cu12", marker = "platform_machine == 'x86_64' and platform_system == 'Linux'" }, - { name = "nvidia-nccl-cu12", marker = "platform_machine == 'x86_64' and platform_system == 'Linux'" }, - { name = "nvidia-nvjitlink-cu12", marker = "platform_machine == 'x86_64' and platform_system == 'Linux'" }, - { name = "nvidia-nvtx-cu12", marker = "platform_machine == 'x86_64' and platform_system == 'Linux'" }, + { name = "nvidia-cublas-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-cuda-cupti-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-cuda-nvrtc-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-cuda-runtime-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-cudnn-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-cufft-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-curand-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = 
"nvidia-cusolver-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-cusparse-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-nccl-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-nvjitlink-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-nvtx-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, { name = "setuptools", marker = "python_full_version >= '3.12'" }, { name = "sympy" }, - { name = "triton", marker = "python_full_version < '3.13' and platform_machine == 'x86_64' and platform_system == 'Linux'" }, + { name = "triton", marker = "python_full_version < '3.13' and platform_machine == 'x86_64' and sys_platform == 'linux'" }, { name = "typing-extensions" }, ] wheels = [ @@ -4820,7 +4826,7 @@ name = "tqdm" version = "4.67.1" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "colorama", marker = "platform_system == 'Windows'" }, + { name = "colorama", marker = "sys_platform == 'win32'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/a8/4b/29b4ef32e036bb34e4ab51796dd745cdba7ed47ad142a9f4a1eb8e0c744d/tqdm-4.67.1.tar.gz", hash = "sha256:f8aef9c52c08c13a65f30ea34f4e5aac3fd1a34959879d7e59e63027286627f2", size = 169737 } wheels = [ @@ -4862,7 +4868,7 @@ name = "triton" version = "3.1.0" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "filelock", marker = "(python_full_version < '3.13' and platform_machine != 'aarch64' and platform_system != 'Darwin') or (python_full_version < '3.13' and platform_system != 'Darwin' and platform_system != 'Linux')" }, + { name = "filelock", marker = "(python_full_version < '3.13' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version < '3.13' and sys_platform != 'darwin' and sys_platform != 'linux')" }, ] wheels = [ { url = "https://files.pythonhosted.org/packages/98/29/69aa56dc0b2eb2602b553881e34243475ea2afd9699be042316842788ff5/triton-3.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6b0dd10a925263abbe9fa37dcde67a5e9b2383fc269fdf59f5657cac38c5d1d8", size = 209460013 }, From fce988687b3dc6fc36ce9244a8c2744f4a25d561 Mon Sep 17 00:00:00 2001 From: PeriniM Date: Thu, 2 Jan 2025 18:32:47 +0100 Subject: [PATCH 34/49] fix: uv virtual env --- .github/workflows/release.yml | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 9b945a95..bbf16129 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -25,20 +25,23 @@ jobs: with: fetch-depth: 0 persist-credentials: false - - name: Remove Hardcoded Version (if present) + - name: Create Virtual Environment run: | - sed -i '/^version =/d' pyproject.toml - - name: Update Dependencies + uv venv + - name: Activate Virtual Environment and Install Dependencies run: | + source .venv/bin/activate # For Linux/macOS. 
Use .venv\Scripts\activate for Windows + uv sync --frozen uv pip install --upgrade setuptools wheel hatchling twine - name: Build app run: | - uv sync --frozen + source .venv/bin/activate # Activate the virtual environment again if needed uv build id: build_cache if: success() - name: Validate Metadata run: | + source .venv/bin/activate uv run twine check dist/* - name: Debug Dist Directory run: | @@ -70,6 +73,9 @@ jobs: with: fetch-depth: 0 persist-credentials: false + - name: Restore Virtual Environment + run: | + source .venv/bin/activate - name: Semantic Release uses: cycjimmy/semantic-release-action@v4.1.0 with: From abe29457f2380932d070bfd607c8ab5f749627c3 Mon Sep 17 00:00:00 2001 From: PeriniM Date: Thu, 2 Jan 2025 18:38:47 +0100 Subject: [PATCH 35/49] fix: workflow --- .github/workflows/release.yml | 42 +++++++++++++++++++---------------- 1 file changed, 23 insertions(+), 19 deletions(-) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index bbf16129..a0fd18ac 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -14,44 +14,44 @@ jobs: run: | sudo apt update sudo apt install -y git + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: '3.10' + - name: Install uv uses: astral-sh/setup-uv@v3 + - name: Install Node Env uses: actions/setup-node@v4 with: node-version: 20 + - name: Checkout uses: actions/checkout@v4.1.1 with: fetch-depth: 0 persist-credentials: false - - name: Create Virtual Environment + + - name: Build and validate package run: | uv venv - - name: Activate Virtual Environment and Install Dependencies - run: | - source .venv/bin/activate # For Linux/macOS. Use .venv\Scripts\activate for Windows - uv sync --frozen + . .venv/bin/activate uv pip install --upgrade setuptools wheel hatchling twine - - name: Build app - run: | - source .venv/bin/activate # Activate the virtual environment again if needed + uv sync --frozen + uv pip install -e . uv build - id: build_cache - if: success() - - name: Validate Metadata - run: | - source .venv/bin/activate uv run twine check dist/* + - name: Debug Dist Directory - run: | - ls -al dist + run: ls -al dist + - name: Cache build uses: actions/cache@v3 with: path: ./dist key: ${{ runner.os }}-build-${{ github.sha }} - if: steps.build_cache.outputs.id != '' release: name: Release @@ -73,9 +73,13 @@ jobs: with: fetch-depth: 0 persist-credentials: false - - name: Restore Virtual Environment - run: | - source .venv/bin/activate + + - name: Restore build artifacts + uses: actions/cache@v3 + with: + path: ./dist + key: ${{ runner.os }}-build-${{ github.sha }} + - name: Semantic Release uses: cycjimmy/semantic-release-action@v4.1.0 with: From df07da9bcc59cbccf1c45d69e3a3e904eaed565b Mon Sep 17 00:00:00 2001 From: PeriniM Date: Thu, 2 Jan 2025 18:41:50 +0100 Subject: [PATCH 36/49] fix: added twine --- .github/workflows/release.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index a0fd18ac..587b22ee 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -42,7 +42,7 @@ jobs: uv sync --frozen uv pip install -e . 
uv build - uv run twine check dist/* + python -m twine check dist/* - name: Debug Dist Directory run: ls -al dist From eb36a2b630d62363f3c57e243f2b90cf530c0a3b Mon Sep 17 00:00:00 2001 From: PeriniM Date: Thu, 2 Jan 2025 18:44:22 +0100 Subject: [PATCH 37/49] fix: twine --- .github/workflows/release.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 587b22ee..99090609 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -38,10 +38,11 @@ jobs: run: | uv venv . .venv/bin/activate - uv pip install --upgrade setuptools wheel hatchling twine + uv pip install --upgrade setuptools wheel hatchling uv sync --frozen uv pip install -e . uv build + uv pip install twine # Explicitly install twine again python -m twine check dist/* - name: Debug Dist Directory From 5e85617ccaccf421c0736abecee62426c6140686 Mon Sep 17 00:00:00 2001 From: semantic-release-bot Date: Thu, 2 Jan 2025 17:47:50 +0000 Subject: [PATCH 38/49] ci(release): 1.34.0-beta.8 [skip ci] ## [1.34.0-beta.8](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.34.0-beta.7...v1.34.0-beta.8) (2025-01-02) ### Bug Fixes * added twine ([df07da9](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/df07da9bcc59cbccf1c45d69e3a3e904eaed565b)) * twine ([eb36a2b](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/eb36a2b630d62363f3c57e243f2b90cf530c0a3b)) * uv virtual env ([fce9886](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/fce988687b3dc6fc36ce9244a8c2744f4a25d561)) * version ([95b8990](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/95b8990a3649646972e12d78b11c7e1b7e707bf6)) * workflow ([abe2945](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/abe29457f2380932d070bfd607c8ab5f749627c3)) --- CHANGELOG.md | 11 +++++++++++ pyproject.toml | 2 +- 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 6b892258..e9e8a7f3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,14 @@ +## [1.34.0-beta.8](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.34.0-beta.7...v1.34.0-beta.8) (2025-01-02) + + +### Bug Fixes + +* added twine ([df07da9](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/df07da9bcc59cbccf1c45d69e3a3e904eaed565b)) +* twine ([eb36a2b](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/eb36a2b630d62363f3c57e243f2b90cf530c0a3b)) +* uv virtual env ([fce9886](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/fce988687b3dc6fc36ce9244a8c2744f4a25d561)) +* version ([95b8990](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/95b8990a3649646972e12d78b11c7e1b7e707bf6)) +* workflow ([abe2945](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/abe29457f2380932d070bfd607c8ab5f749627c3)) + ## [1.34.0-beta.7](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.34.0-beta.6...v1.34.0-beta.7) (2025-01-02) diff --git a/pyproject.toml b/pyproject.toml index 7a3b42aa..c669b6dd 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "scrapegraphai" -version = "1.34.0b7" +version = "1.34.0b8" description = "A web scraping library based on LangChain which uses LLM and direct graph logic to create scraping pipelines." 
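Before the next patch, a word on what twine check actually does: it hands the
built sdist and wheel to the pkginfo library and verifies that the required
metadata fields parse out. When pkginfo is older than the Metadata-Version a
build backend emits, those fields can come back empty and twine reports the
package as invalid even though the artifacts are fine, which is what the
following "fix: update pkginfo" and "fix: upgrade twine" commits work around.
A minimal sketch of the same parsing done directly; the wheel path is
illustrative:

    from pkginfo import Wheel  # pkginfo is the parser twine relies on

    w = Wheel("dist/scrapegraphai-1.34.0b9-py3-none-any.whl")  # illustrative path
    # Empty name/version here is the symptom twine reports as broken metadata.
    print(w.metadata_version, w.name, w.version)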
authors = [ { name = "Marco Vinciguerra", email = "mvincig11@gmail.com" }, From 9203ab9a4ab4400105fd34433684f9ac2453f35c Mon Sep 17 00:00:00 2001 From: PeriniM Date: Thu, 2 Jan 2025 18:55:40 +0100 Subject: [PATCH 39/49] fix: update pkginfo --- .github/workflows/release.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 99090609..c85f1969 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -42,7 +42,7 @@ jobs: uv sync --frozen uv pip install -e . uv build - uv pip install twine # Explicitly install twine again + uv pip install "pkginfo>=1.10.0" twine # Install newer pkginfo with twine python -m twine check dist/* - name: Debug Dist Directory From 9ff302a11db1c3a3fc5d8ec2739bd0f0df330461 Mon Sep 17 00:00:00 2001 From: semantic-release-bot Date: Thu, 2 Jan 2025 17:59:12 +0000 Subject: [PATCH 40/49] ci(release): 1.34.0-beta.9 [skip ci] ## [1.34.0-beta.9](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.34.0-beta.8...v1.34.0-beta.9) (2025-01-02) ### Bug Fixes * update pkginfo ([9203ab9](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/9203ab9a4ab4400105fd34433684f9ac2453f35c)) --- CHANGELOG.md | 7 +++++++ pyproject.toml | 2 +- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index e9e8a7f3..b51bc6d4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,10 @@ +## [1.34.0-beta.9](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.34.0-beta.8...v1.34.0-beta.9) (2025-01-02) + + +### Bug Fixes + +* update pkginfo ([9203ab9](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/9203ab9a4ab4400105fd34433684f9ac2453f35c)) + ## [1.34.0-beta.8](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.34.0-beta.7...v1.34.0-beta.8) (2025-01-02) diff --git a/pyproject.toml b/pyproject.toml index c669b6dd..c8ef9b33 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "scrapegraphai" -version = "1.34.0b8" +version = "1.34.0b9" description = "A web scraping library based on LangChain which uses LLM and direct graph logic to create scraping pipelines." authors = [ { name = "Marco Vinciguerra", email = "mvincig11@gmail.com" }, From 020e21123889c6483459e9db1c3c796cbc116140 Mon Sep 17 00:00:00 2001 From: PeriniM Date: Thu, 2 Jan 2025 19:06:27 +0100 Subject: [PATCH 41/49] fix: upgrade twine --- .github/workflows/release.yml | 2 +- uv.lock | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index c85f1969..0ad43347 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -42,7 +42,7 @@ jobs: uv sync --frozen uv pip install -e . uv build - uv pip install "pkginfo>=1.10.0" twine # Install newer pkginfo with twine + uv pip install --upgrade pkginfo==1.12.0 twine==6.0.1 # Upgrade pkginfo and install twine python -m twine check dist/* - name: Debug Dist Directory diff --git a/uv.lock b/uv.lock index 1058327b..59a2afe4 100644 --- a/uv.lock +++ b/uv.lock @@ -4081,7 +4081,7 @@ wheels = [ [[package]] name = "scrapegraphai" -version = "1.34.0b6" +version = "1.34.0b8" source = { editable = "." 
} dependencies = [ { name = "async-timeout", version = "4.0.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.12'" }, From 11177c68f3fb3c80dfb1e8f787371f93874f709c Mon Sep 17 00:00:00 2001 From: semantic-release-bot Date: Thu, 2 Jan 2025 18:10:03 +0000 Subject: [PATCH 42/49] ci(release): 1.34.0-beta.10 [skip ci] ## [1.34.0-beta.10](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.34.0-beta.9...v1.34.0-beta.10) (2025-01-02) ### Bug Fixes * upgrade twine ([020e211](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/020e21123889c6483459e9db1c3c796cbc116140)) --- CHANGELOG.md | 7 +++++++ pyproject.toml | 2 +- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index b51bc6d4..106dcbf0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,10 @@ +## [1.34.0-beta.10](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.34.0-beta.9...v1.34.0-beta.10) (2025-01-02) + + +### Bug Fixes + +* upgrade twine ([020e211](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/020e21123889c6483459e9db1c3c796cbc116140)) + ## [1.34.0-beta.9](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.34.0-beta.8...v1.34.0-beta.9) (2025-01-02) diff --git a/pyproject.toml b/pyproject.toml index c8ef9b33..2e326857 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "scrapegraphai" -version = "1.34.0b9" +version = "1.34.0b10" description = "A web scraping library based on LangChain which uses LLM and direct graph logic to create scraping pipelines." authors = [ { name = "Marco Vinciguerra", email = "mvincig11@gmail.com" }, From 9150e4c95fa468afe9ddda3f1278b5037a2d0f38 Mon Sep 17 00:00:00 2001 From: PeriniM Date: Thu, 2 Jan 2025 19:27:28 +0100 Subject: [PATCH 43/49] fix: added license-files = [ "LICENSE" ] --- pyproject.toml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pyproject.toml b/pyproject.toml index c8ef9b33..520cd9f5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -42,6 +42,9 @@ dependencies = [ ] license = "MIT" +license-files = [ + "LICENSE" +] readme = "README.md" homepage = "https://scrapegraphai.com/" repository = "https://github.com/ScrapeGraphAI/Scrapegraph-ai" From 16164d45c80a5267135ea8d899ea2cd75f6d80ad Mon Sep 17 00:00:00 2001 From: semantic-release-bot Date: Thu, 2 Jan 2025 18:31:19 +0000 Subject: [PATCH 44/49] ci(release): 1.34.0-beta.11 [skip ci] ## [1.34.0-beta.11](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.34.0-beta.10...v1.34.0-beta.11) (2025-01-02) ### Bug Fixes * added license-files = [ ([9150e4c](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/9150e4c95fa468afe9ddda3f1278b5037a2d0f38)) --- CHANGELOG.md | 7 +++++++ pyproject.toml | 2 +- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 106dcbf0..5a930738 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,10 @@ +## [1.34.0-beta.11](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.34.0-beta.10...v1.34.0-beta.11) (2025-01-02) + + +### Bug Fixes + +* added license-files = [ ([9150e4c](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/9150e4c95fa468afe9ddda3f1278b5037a2d0f38)) + ## [1.34.0-beta.10](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.34.0-beta.9...v1.34.0-beta.10) (2025-01-02) diff --git a/pyproject.toml b/pyproject.toml index 41af2800..d7a9ee17 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "scrapegraphai" -version = "1.34.0b10" +version = "1.34.0b11" description 
= "A web scraping library based on LangChain which uses LLM and direct graph logic to create scraping pipelines." authors = [ { name = "Marco Vinciguerra", email = "mvincig11@gmail.com" }, From b5acfb414321989c45f76fad82f0d720ec889274 Mon Sep 17 00:00:00 2001 From: PeriniM Date: Thu, 2 Jan 2025 19:40:23 +0100 Subject: [PATCH 45/49] fix: removed license for license-files --- pyproject.toml | 6 ++++-- uv.lock | 2 +- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 41af2800..594ad6d4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -41,10 +41,12 @@ dependencies = [ "scrapegraph-py>=1.7.0" ] -license = "MIT" +# License Information. +# See PEP-639 at https://peps.python.org/pep-0639/#add-license-files-key license-files = [ - "LICENSE" + "LICENSE", ] + readme = "README.md" homepage = "https://scrapegraphai.com/" repository = "https://github.com/ScrapeGraphAI/Scrapegraph-ai" diff --git a/uv.lock b/uv.lock index 59a2afe4..5d297abc 100644 --- a/uv.lock +++ b/uv.lock @@ -4081,7 +4081,7 @@ wheels = [ [[package]] name = "scrapegraphai" -version = "1.34.0b8" +version = "1.34.0b10" source = { editable = "." } dependencies = [ { name = "async-timeout", version = "4.0.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.12'" }, From cfea8266393bdf45aa4cc69ed1b4e976b968ee92 Mon Sep 17 00:00:00 2001 From: semantic-release-bot Date: Thu, 2 Jan 2025 18:44:29 +0000 Subject: [PATCH 46/49] ci(release): 1.34.0-beta.12 [skip ci] ## [1.34.0-beta.12](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.34.0-beta.11...v1.34.0-beta.12) (2025-01-02) ### Bug Fixes * removed license for license-files ([b5acfb4](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/b5acfb414321989c45f76fad82f0d720ec889274)) --- CHANGELOG.md | 7 +++++++ pyproject.toml | 2 +- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5a930738..5eb8a1b2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,10 @@ +## [1.34.0-beta.12](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.34.0-beta.11...v1.34.0-beta.12) (2025-01-02) + + +### Bug Fixes + +* removed license for license-files ([b5acfb4](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/b5acfb414321989c45f76fad82f0d720ec889274)) + ## [1.34.0-beta.11](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.34.0-beta.10...v1.34.0-beta.11) (2025-01-02) diff --git a/pyproject.toml b/pyproject.toml index 91c83578..aeaca7bd 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "scrapegraphai" -version = "1.34.0b11" +version = "1.34.0b12" description = "A web scraping library based on LangChain which uses LLM and direct graph logic to create scraping pipelines." authors = [ { name = "Marco Vinciguerra", email = "mvincig11@gmail.com" }, From 159ed329d2e8fa86015df1e59a7e2ebb439c6ec0 Mon Sep 17 00:00:00 2001 From: PeriniM Date: Fri, 3 Jan 2025 13:18:17 +0100 Subject: [PATCH 47/49] fix: bump hatchling version to 1.26.3 --- pyproject.toml | 8 +------- uv.lock | 2 +- 2 files changed, 2 insertions(+), 8 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 91c83578..99bfff19 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -41,12 +41,6 @@ dependencies = [ "scrapegraph-py>=1.7.0" ] -# License Information. 
-# See PEP-639 at https://peps.python.org/pep-0639/#add-license-files-key -license-files = [ - "LICENSE", -] - readme = "README.md" homepage = "https://scrapegraphai.com/" repository = "https://github.com/ScrapeGraphAI/Scrapegraph-ai" @@ -113,7 +107,7 @@ screenshot_scraper = [ ] [build-system] -requires = ["hatchling"] +requires = ["hatchling==1.26.3"] build-backend = "hatchling.build" [dependency-groups] diff --git a/uv.lock b/uv.lock index 5d297abc..ae5b7a16 100644 --- a/uv.lock +++ b/uv.lock @@ -4081,7 +4081,7 @@ wheels = [ [[package]] name = "scrapegraphai" -version = "1.34.0b10" +version = "1.34.0b11" source = { editable = "." } dependencies = [ { name = "async-timeout", version = "4.0.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.12'" }, From 8c7c231baa8f022018be26e18b338917401c51c9 Mon Sep 17 00:00:00 2001 From: semantic-release-bot Date: Fri, 3 Jan 2025 12:21:44 +0000 Subject: [PATCH 48/49] ci(release): 1.34.0-beta.13 [skip ci] ## [1.34.0-beta.13](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.34.0-beta.12...v1.34.0-beta.13) (2025-01-03) ### Bug Fixes * bump hatchling version to 1.26.3 ([159ed32](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/159ed329d2e8fa86015df1e59a7e2ebb439c6ec0)) --- CHANGELOG.md | 7 +++++++ pyproject.toml | 2 +- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5eb8a1b2..4aafebd7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,10 @@ +## [1.34.0-beta.13](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.34.0-beta.12...v1.34.0-beta.13) (2025-01-03) + + +### Bug Fixes + +* bump hatchling version to 1.26.3 ([159ed32](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/159ed329d2e8fa86015df1e59a7e2ebb439c6ec0)) + ## [1.34.0-beta.12](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.34.0-beta.11...v1.34.0-beta.12) (2025-01-02) diff --git a/pyproject.toml b/pyproject.toml index 56a78bf5..d53db985 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "scrapegraphai" -version = "1.34.0b12" +version = "1.34.0b13" description = "A web scraping library based on LangChain which uses LLM and direct graph logic to create scraping pipelines." 
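The hatchling==1.26.3 pin above rounds off the same metadata story: hatchling
releases immediately after this one began emitting Metadata-Version 2.4, which
carries the PEP 639 license keys, and at the time parts of the publish
toolchain rejected it, so pinning the backend keeps the wheels at a version
every tool in the release path understands. To see exactly what the validators
see, the METADATA file inside a wheel is plain RFC 822 text; a small
stdlib-only sketch with illustrative artifact names:

    import zipfile
    from email.parser import Parser

    # Illustrative paths; match them to whatever uv build put in dist/.
    with zipfile.ZipFile("dist/scrapegraphai-1.34.0b13-py3-none-any.whl") as wheel:
        raw = wheel.read("scrapegraphai-1.34.0b13.dist-info/METADATA").decode("utf-8")

    meta = Parser().parsestr(raw)
    # PEP 639 license keys surface here as License-File / License-Expression.
    print(meta["Metadata-Version"], meta.get_all("License-File"))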
authors = [ { name = "Marco Vinciguerra", email = "mvincig11@gmail.com" }, From a9569ac08ffbb81a08b7a93aab70de914047659f Mon Sep 17 00:00:00 2001 From: semantic-release-bot Date: Fri, 3 Jan 2025 12:31:31 +0000 Subject: [PATCH 49/49] ci(release): 1.34.0-beta.14 [skip ci] MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## [1.34.0-beta.14](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.34.0-beta.13...v1.34.0-beta.14) (2025-01-03) ### Bug Fixes * add model tokens ([9b16cb9](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/9b16cb987fd93132d814ebd933af1565eb166331)) * revert ([b312251](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/b312251cc56ee4c82554ecf116b5e6edd1560726)) * revert ([bb5de58](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/bb5de581c064a1d141f849081e52987500957d1c)) * validate URL only if the input type is a URL ([e2caee6](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/e2caee695ecce2d13aa5a82306097b1a80ba0e18)) ### Docs * added api reference 🔗 ([67038e1](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/67038e195224e1a721fe123ad1d5604b3592df20)) * added official cookbook reference ([98aa74f](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/98aa74ff2d35041884130be14efdf47ca5e716df)) * fixed missing import ([96064f2](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/96064f20ee8a849a2548f293419cf9028386c47b)) * updated documentation reference ([fe89ae2](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/fe89ae29e6dc5f4322c25c693e2c9f6ce958d6e2)) ### CI * **release:** 1.33.10 [skip ci] ([a44b74a](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/a44b74aa6f7be7cdb4bdbebebc3b51a6d54a51e6)) * **release:** 1.33.11 [skip ci] ([30f48b3](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/30f48b394f6eb8c7c9a1fa113bffabd2ac1ac585)) * **release:** 1.33.9 [skip ci] ([9b6d6c0](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/9b6d6c0efb2fd1af5bf87cf61a0ba3d79876d21d)) --- CHANGELOG.md | 25 +++++++++++++++++++++++++ pyproject.toml | 2 +- 2 files changed, 26 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index ee016a4f..dfc5d288 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,28 @@ +## [1.34.0-beta.14](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.34.0-beta.13...v1.34.0-beta.14) (2025-01-03) + + +### Bug Fixes + +* add model tokens ([9b16cb9](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/9b16cb987fd93132d814ebd933af1565eb166331)) +* revert ([b312251](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/b312251cc56ee4c82554ecf116b5e6edd1560726)) +* revert ([bb5de58](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/bb5de581c064a1d141f849081e52987500957d1c)) +* validate URL only if the input type is a URL ([e2caee6](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/e2caee695ecce2d13aa5a82306097b1a80ba0e18)) + + +### Docs + +* added api reference 🔗 ([67038e1](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/67038e195224e1a721fe123ad1d5604b3592df20)) +* added official cookbook reference ([98aa74f](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/98aa74ff2d35041884130be14efdf47ca5e716df)) +* fixed missing import ([96064f2](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/96064f20ee8a849a2548f293419cf9028386c47b)) +* updated documentation reference ([fe89ae2](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/fe89ae29e6dc5f4322c25c693e2c9f6ce958d6e2)) + + +### CI + +* **release:** 1.33.10 [skip ci] 
([a44b74a](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/a44b74aa6f7be7cdb4bdbebebc3b51a6d54a51e6)) +* **release:** 1.33.11 [skip ci] ([30f48b3](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/30f48b394f6eb8c7c9a1fa113bffabd2ac1ac585)) +* **release:** 1.33.9 [skip ci] ([9b6d6c0](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/9b6d6c0efb2fd1af5bf87cf61a0ba3d79876d21d)) + ## [1.34.0-beta.13](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.34.0-beta.12...v1.34.0-beta.13) (2025-01-03) diff --git a/pyproject.toml b/pyproject.toml index 76c5d75f..7b795ee8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "scrapegraphai" -version = "1.34.0b13" +version = "1.34.0b14" description = "A web scraping library based on LangChain which uses LLM and direct graph logic to create scraping pipelines." authors = [