From 38c6dd2aa1ce31b981eb8c35a56e9533d19df81b Mon Sep 17 00:00:00 2001
From: Marco Vinciguerra
Date: Mon, 4 Nov 2024 09:21:29 +0100
Subject: [PATCH 1/9] feat: update chromium

---
 scrapegraphai/docloaders/chromium.py | 23 ++++++++++++++---------
 1 file changed, 14 insertions(+), 9 deletions(-)

diff --git a/scrapegraphai/docloaders/chromium.py b/scrapegraphai/docloaders/chromium.py
index 48058436..cf784e95 100644
--- a/scrapegraphai/docloaders/chromium.py
+++ b/scrapegraphai/docloaders/chromium.py
@@ -1,6 +1,3 @@
-"""
-chromiumloader module
-"""
 import asyncio
 from typing import Any, AsyncIterator, Iterator, List, Optional
 from langchain_community.document_loaders.base import BaseLoader
@@ -12,15 +9,16 @@
 logger = get_logger("web-loader")
 
 class ChromiumLoader(BaseLoader):
-    """scrapes HTML pages from URLs using a (headless) instance of the
-    Chromium web driver with proxy protection
+    """Scrapes HTML pages from URLs using a (headless) instance of the
+    Chromium web driver with proxy protection.
 
     Attributes:
         backend: The web driver backend library; defaults to 'playwright'.
         browser_config: A dictionary containing additional browser kwargs.
-        headless: whether to run browser in headless mode.
+        headless: Whether to run browser in headless mode.
         proxy: A dictionary containing proxy settings; None disables protection.
         urls: A list of URLs to scrape content from.
+        requires_js_support: Flag to determine if JS rendering is required.
     """
 
     RETRY_LIMIT = 3
@@ -34,15 +32,17 @@ def __init__(
         headless: bool = True,
         proxy: Optional[Proxy] = None,
         load_state: str = "domcontentloaded",
+        requires_js_support: bool = False,
         **kwargs: Any,
     ):
         """Initialize the loader with a list of URL paths.
 
         Args:
             backend: The web driver backend library; defaults to 'playwright'.
-            headless: whether to run browser in headless mode.
+            headless: Whether to run browser in headless mode.
             proxy: A dictionary containing proxy information; None disables protection.
             urls: A list of URLs to scrape content from.
+            requires_js_support: Whether to use JS rendering for scraping.
             kwargs: A dictionary containing additional browser kwargs.
 
         Raises:
@@ -61,6 +61,7 @@ def __init__(
         self.proxy = parse_or_search_proxy(proxy) if proxy else None
         self.urls = urls
         self.load_state = load_state
+        self.requires_js_support = requires_js_support
 
     async def ascrape_undetected_chromedriver(self, url: str) -> str:
         """
@@ -186,7 +187,9 @@ def lazy_load(self) -> Iterator[Document]:
         Yields:
             Document: The scraped content encapsulated within a Document object.
         """
-        scraping_fn = getattr(self, f"ascrape_{self.backend}")
+        scraping_fn = (
+            self.ascrape_with_js_support if self.requires_js_support else getattr(self, f"ascrape_{self.backend}")
+        )
 
         for url in self.urls:
             html_content = asyncio.run(scraping_fn(url))
@@ -206,7 +209,9 @@ async def alazy_load(self) -> AsyncIterator[Document]:
             Document: A Document object containing the scraped content, along with its
                 source URL as metadata.
         """
-        scraping_fn = getattr(self, f"ascrape_{self.backend}")
+        scraping_fn = (
+            self.ascrape_with_js_support if self.requires_js_support else getattr(self, f"ascrape_{self.backend}")
+        )
 
         tasks = [scraping_fn(url) for url in self.urls]
         results = await asyncio.gather(*tasks)
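
A hedged usage sketch of the `requires_js_support` flag introduced above; the URL and printed fields are illustrative, and the flag works because `lazy_load`/`alazy_load` now route every request to `ascrape_with_js_support` instead of the backend-specific scraper:

```python
# Illustrative sketch only: the target URL is an assumption, and the flag
# takes effect because lazy_load() picks ascrape_with_js_support() when
# requires_js_support=True (see the diff above).
from scrapegraphai.docloaders.chromium import ChromiumLoader

loader = ChromiumLoader(
    urls=["https://example.com/spa-page"],  # a client-side-rendered page
    backend="playwright",
    headless=True,
    requires_js_support=True,  # force JS rendering regardless of backend
)

for doc in loader.lazy_load():
    # Each Document carries the source URL in its metadata.
    print(doc.metadata["source"], len(doc.page_content))
```
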
""" - scraping_fn = getattr(self, f"ascrape_{self.backend}") + scraping_fn = ( + self.ascrape_with_js_support if self.requires_js_support else getattr(self, f"ascrape_{self.backend}") + ) tasks = [scraping_fn(url) for url in self.urls] results = await asyncio.gather(*tasks) From 12fa3155b4ef746e6d4c52e433221a9815682d92 Mon Sep 17 00:00:00 2001 From: semantic-release-bot Date: Wed, 6 Nov 2024 08:23:03 +0000 Subject: [PATCH 2/9] ci(release): 1.30.0 [skip ci] ## [1.30.0](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.29.0...v1.30.0) (2024-11-06) ### Features * update chromium ([38c6dd2](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/38c6dd2aa1ce31b981eb8c35a56e9533d19df81b)) --- CHANGELOG.md | 7 +++++++ pyproject.toml | 2 +- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 3cba3b99..7449d264 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,10 @@ +## [1.30.0](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.29.0...v1.30.0) (2024-11-06) + + +### Features + +* update chromium ([38c6dd2](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/38c6dd2aa1ce31b981eb8c35a56e9533d19df81b)) + ## [1.29.0](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.28.0...v1.29.0) (2024-11-04) diff --git a/pyproject.toml b/pyproject.toml index 49158ab5..2cef768f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -2,7 +2,7 @@ name = "scrapegraphai" -version = "1.29.0" +version = "1.30.0" From 774df5410cda93c3f4d8a77f5efae2cbd694bffb Mon Sep 17 00:00:00 2001 From: Lorenzo Padoan Date: Thu, 7 Nov 2024 16:00:13 +0100 Subject: [PATCH 3/9] Update README.md --- README.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/README.md b/README.md index 94beb617..091624f2 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,8 @@ # 🕷️ ScrapeGraphAI: You Only Scrape Once +

+VinciGit00%2FScrapegraph-ai | Trendshift +

 [English](https://github.com/VinciGit00/Scrapegraph-ai/blob/main/README.md) | [中文](https://github.com/VinciGit00/Scrapegraph-ai/blob/main/docs/chinese.md) | [日本語](https://github.com/VinciGit00/Scrapegraph-ai/blob/main/docs/japanese.md) | [한국어](https://github.com/VinciGit00/Scrapegraph-ai/blob/main/docs/korean.md) | [Русский](https://github.com/VinciGit00/Scrapegraph-ai/blob/main/docs/russian.md)

From b400cc5a666e1670948cd50f284a2ccba3d136ee Mon Sep 17 00:00:00 2001
From: Lorenzo Padoan
Date: Thu, 7 Nov 2024 16:02:19 +0100
Subject: [PATCH 4/9] Update README.md

---
 README.md | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index 091624f2..690290ed 100644
--- a/README.md
+++ b/README.md
@@ -1,8 +1,5 @@
 # 🕷️ ScrapeGraphAI: You Only Scrape Once
-<div align="center">
-<a href="https://trendshift.io/repositories/9761" target="_blank"><img src="https://trendshift.io/api/badge/repositories/9761" alt="VinciGit00%2FScrapegraph-ai | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a>
-</div>
 
 [English](https://github.com/VinciGit00/Scrapegraph-ai/blob/main/README.md) | [中文](https://github.com/VinciGit00/Scrapegraph-ai/blob/main/docs/chinese.md) | [日本語](https://github.com/VinciGit00/Scrapegraph-ai/blob/main/docs/japanese.md) | [한국어](https://github.com/VinciGit00/Scrapegraph-ai/blob/main/docs/korean.md) | [Русский](https://github.com/VinciGit00/Scrapegraph-ai/blob/main/docs/russian.md)
@@ -15,6 +12,10 @@
 [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg?style=for-the-badge)](https://opensource.org/licenses/MIT)
 [![](https://dcbadge.vercel.app/api/server/gkxQDAjfeX)](https://discord.gg/gkxQDAjfeX)
+<div align="center">
+<a href="https://trendshift.io/repositories/9761" target="_blank"><img src="https://trendshift.io/api/badge/repositories/9761" alt="VinciGit00%2FScrapegraph-ai | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a>
+</div>
+
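 
 ScrapeGraphAI is a *web scraping* python library that uses LLM and direct graph logic to create scraping pipelines for websites and local documents (XML, HTML, JSON, Markdown, etc.).
 
 Just say which information you want to extract and the library will do it for you!

To make the README claim above concrete, a hedged quick-start sketch; the config keys, model name, and target page are illustrative assumptions and are not taken from these patches:

```python
# Illustrative quick-start: "just say which information you want".
# The API key placeholder, model name, and source URL are assumptions.
from scrapegraphai.graphs import SmartScraperGraph

graph_config = {
    "llm": {"api_key": "YOUR_OPENAI_KEY", "model": "gpt-4o-mini"},
    "verbose": False,
    "headless": True,
}

smart_scraper = SmartScraperGraph(
    prompt="List me all the projects with their descriptions",
    source="https://example.com/projects",
    config=graph_config,
)

# The graph builds the scraping pipeline and returns structured data.
result = smart_scraper.run()
print(result)
```
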
From ebe0b0d56e365d2cad28a6c512850847b34efb9c Mon Sep 17 00:00:00 2001
From: bezineb5
Date: Sat, 9 Nov 2024 10:18:21 +0100
Subject: [PATCH 5/9] fix: pipe the output parser into the chain

---
 scrapegraphai/nodes/generate_answer_node.py | 16 ++--------------
 1 file changed, 2 insertions(+), 14 deletions(-)

diff --git a/scrapegraphai/nodes/generate_answer_node.py b/scrapegraphai/nodes/generate_answer_node.py
index 56d57d09..8e89ff39 100644
--- a/scrapegraphai/nodes/generate_answer_node.py
+++ b/scrapegraphai/nodes/generate_answer_node.py
@@ -122,22 +122,10 @@ def execute(self, state: dict) -> dict:
             partial_variables={"context": doc, "format_instructions": format_instructions}
         )
         chain = prompt | self.llm_model
-        raw_response = chain.invoke({"question": user_prompt})
-
         if output_parser:
-            try:
-                answer = output_parser.parse(raw_response.content)
-            except JSONDecodeError:
-                lines = raw_response.split('\n')
-                if lines[0].strip().startswith('```'):
-                    lines = lines[1:]
-                if lines[-1].strip().endswith('```'):
-                    lines = lines[:-1]
-                cleaned_response = '\n'.join(lines)
-                answer = output_parser.parse(cleaned_response)
+            chain = chain | output_parser
 
-        else:
-            answer = raw_response.content
+        answer = chain.invoke({"question": user_prompt})
 
         state.update({self.output[0]: answer})
         return state
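
The replacement above leans on LangChain's pipe composition, letting the parser handle fenced output instead of hand-stripping ``` markers. A hedged standalone sketch of the same pattern; the model, prompt, and question are illustrative assumptions:

```python
# Standalone sketch of the chain-composition pattern adopted in PATCH 5/9.
# Model name, prompt wording, and question are assumptions for illustration.
from langchain_core.output_parsers import JsonOutputParser
from langchain_core.prompts import PromptTemplate
from langchain_openai import ChatOpenAI

llm = ChatOpenAI(model="gpt-4o-mini")
parser = JsonOutputParser()

prompt = PromptTemplate(
    template="Answer as JSON.\n{format_instructions}\n{question}",
    input_variables=["question"],
    partial_variables={"format_instructions": parser.get_format_instructions()},
)

# Piping the parser into the chain replaces the manual try/except
# JSONDecodeError block that stripped code fences from raw LLM output.
chain = prompt | llm | parser
answer = chain.invoke({"question": "Name three web scraping backends."})
print(answer)
```
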
From 5100fbb01746379395a3500eae7eeeb4870be373 Mon Sep 17 00:00:00 2001
From: saied71
Date: Mon, 11 Nov 2024 14:04:52 +0330
Subject: [PATCH 6/9] add HTML source support to FetchNode

---
 scrapegraphai/nodes/fetch_node.py | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/scrapegraphai/nodes/fetch_node.py b/scrapegraphai/nodes/fetch_node.py
index 06842ca4..55f05ab6 100644
--- a/scrapegraphai/nodes/fetch_node.py
+++ b/scrapegraphai/nodes/fetch_node.py
@@ -170,8 +170,9 @@ def handle_file(self, state, input_type, source):
 
         compressed_document = self.load_file_content(source, input_type)
 
-        return self.update_state(state, compressed_document)
-
+        # return self.update_state(state, compressed_document)
+        state.update({self.output[0]: compressed_document})
+        return state
 
     def load_file_content(self, source, input_type):
         """
         Loads the content of a file based on its input type.
@@ -230,8 +231,9 @@ def handle_local_source(self, state, source):
             Document(page_content=parsed_content, metadata={"source": "local_dir"})
         ]
 
-        return self.update_state(state, compressed_document)
-
+        # return self.update_state(state, compressed_document)
+        state.update({self.output[0]: compressed_document})
+        return state
 
     def handle_web_source(self, state, source):
         """
         Handles the web source by fetching HTML content from a URL,
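
After this change each handler writes its result straight into the graph state under the node's first output key and returns the mutated dict. A hedged sketch of the resulting contract; the constructor arguments and state keys are illustrative assumptions:

```python
# Illustrative sketch of the state contract after PATCH 6/9; node wiring
# and key names are assumptions, not taken verbatim from the repo.
from scrapegraphai.nodes import FetchNode

fetch_node = FetchNode(
    input="url | local_dir",  # the node resolves its source from these keys
    output=["doc"],
)

state = {"url": "https://example.com"}
state = fetch_node.execute(state)

# The loaded document is now in the state for downstream nodes to consume.
docs = state["doc"]
```
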
From a6269395a8c2b02c3cbda92055a3b39d64cdda82 Mon Sep 17 00:00:00 2001
From: Marco Vinciguerra
Date: Mon, 18 Nov 2024 14:28:32 +0100
Subject: [PATCH 7/9] removed unused files

---
 extract_data.py   | 27 ---------------------------
 extracted_data.py | 28 ----------------------------
 2 files changed, 55 deletions(-)
 delete mode 100644 extract_data.py
 delete mode 100644 extracted_data.py

diff --git a/extract_data.py b/extract_data.py
deleted file mode 100644
index df3babc2..00000000
--- a/extract_data.py
+++ /dev/null
@@ -1,27 +0,0 @@
-def extract_data(html: str) -> dict:
-    from bs4 import BeautifulSoup
-
-    # Parse the HTML content using BeautifulSoup
-    soup = BeautifulSoup(html, 'html.parser')
-
-    # Initialize an empty list to hold project data
-    projects = []
-
-    # Find all project entries in the HTML
-    project_entries = soup.find_all('div', class_='grid-item')
-
-    # Iterate over each project entry to extract title and description
-    for entry in project_entries:
-        # Extract the title from the h4 element
-        title = entry.find('h4', class_='card-title').get_text(strip=True)
-        # Extract the description from the p element
-        description = entry.find('p', class_='card-text').get_text(strip=True)
-
-        # Append the extracted data as a dictionary to the projects list
-        projects.append({
-            'title': title,
-            'description': description
-        })
-
-    # Return the structured data as a dictionary matching the desired JSON schema
-    return {'projects': projects}
\ No newline at end of file
diff --git a/extracted_data.py b/extracted_data.py
deleted file mode 100644
index 45da5e49..00000000
--- a/extracted_data.py
+++ /dev/null
@@ -1,28 +0,0 @@
-def extract_data(html: str) -> dict:
-    from bs4 import BeautifulSoup
-
-    # Parse the HTML content using BeautifulSoup
-    soup = BeautifulSoup(html, 'html.parser')
-
-    # Initialize an empty list to hold project data
-    projects = []
-
-    # Find all project entries in the HTML
-    project_entries = soup.find_all('div', class_='grid-item')
-
-    # Iterate over each project entry to extract title and description
-    for entry in project_entries:
-        # Extract the title from the card-title class
-        title = entry.find('h4', class_='card-title').get_text(strip=True)
-
-        # Extract the description from the card-text class
-        description = entry.find('p', class_='card-text').get_text(strip=True)
-
-        # Append the extracted data as a dictionary to the projects list
-        projects.append({
-            'title': title,
-            'description': description
-        })
-
-    # Return the structured data as a dictionary matching the desired JSON schema
-    return {'projects': projects}
\ No newline at end of file

From e99ddda6eb731853536100e500454791f7b4c4b8 Mon Sep 17 00:00:00 2001
From: Lorenzo Padoan
Date: Mon, 18 Nov 2024 17:56:54 +0100
Subject: [PATCH 8/9] Update FUNDING.yml

Add open collective
---
 .github/FUNDING.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/FUNDING.yml b/.github/FUNDING.yml
index 4c4dbecc..59b38648 100644
--- a/.github/FUNDING.yml
+++ b/.github/FUNDING.yml
@@ -2,7 +2,7 @@
 
 github: ScrapeGraphAI
 patreon: # Replace with a single Patreon username
-open_collective:
+open_collective: open_collective: https://opencollective.com/scrapegraphai
 ko_fi: # Replace with a single Ko-fi username
 tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel
 community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry

From 5dfda11cef262ffdc64833870786b6d6cd8873f9 Mon Sep 17 00:00:00 2001
From: Lorenzo Padoan
Date: Mon, 18 Nov 2024 17:59:18 +0100
Subject: [PATCH 9/9] Update FUNDING.yml

---
 .github/FUNDING.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/FUNDING.yml b/.github/FUNDING.yml
index 59b38648..0a2af007 100644
--- a/.github/FUNDING.yml
+++ b/.github/FUNDING.yml
@@ -2,7 +2,7 @@
 
 github: ScrapeGraphAI
 patreon: # Replace with a single Patreon username
-open_collective: open_collective: https://opencollective.com/scrapegraphai
+open_collective: https://opencollective.com/scrapegraphai
 ko_fi: # Replace with a single Ko-fi username
 tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel
 community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry