From 38c6dd2aa1ce31b981eb8c35a56e9533d19df81b Mon Sep 17 00:00:00 2001
From: Marco Vinciguerra
Date: Mon, 4 Nov 2024 09:21:29 +0100
Subject: [PATCH 1/9] feat: update chromium

---
 scrapegraphai/docloaders/chromium.py | 23 ++++++++++++++---------
 1 file changed, 14 insertions(+), 9 deletions(-)

diff --git a/scrapegraphai/docloaders/chromium.py b/scrapegraphai/docloaders/chromium.py
index 48058436..cf784e95 100644
--- a/scrapegraphai/docloaders/chromium.py
+++ b/scrapegraphai/docloaders/chromium.py
@@ -1,6 +1,3 @@
-"""
-chromiumloader module
-"""
 import asyncio
 from typing import Any, AsyncIterator, Iterator, List, Optional
 from langchain_community.document_loaders.base import BaseLoader
@@ -12,15 +9,16 @@
 logger = get_logger("web-loader")
 
 class ChromiumLoader(BaseLoader):
-    """scrapes HTML pages from URLs using a (headless) instance of the
-    Chromium web driver with proxy protection
+    """Scrapes HTML pages from URLs using a (headless) instance of the
+    Chromium web driver with proxy protection.
 
     Attributes:
         backend: The web driver backend library; defaults to 'playwright'.
         browser_config: A dictionary containing additional browser kwargs.
-        headless: whether to run browser in headless mode.
+        headless: Whether to run browser in headless mode.
         proxy: A dictionary containing proxy settings; None disables protection.
         urls: A list of URLs to scrape content from.
+        requires_js_support: Flag to determine if JS rendering is required.
     """
 
     RETRY_LIMIT = 3
@@ -34,15 +32,17 @@ def __init__(
         headless: bool = True,
         proxy: Optional[Proxy] = None,
         load_state: str = "domcontentloaded",
+        requires_js_support: bool = False,
         **kwargs: Any,
     ):
         """Initialize the loader with a list of URL paths.
 
         Args:
             backend: The web driver backend library; defaults to 'playwright'.
-            headless: whether to run browser in headless mode.
+            headless: Whether to run browser in headless mode.
             proxy: A dictionary containing proxy information; None disables protection.
             urls: A list of URLs to scrape content from.
+            requires_js_support: Whether to use JS rendering for scraping.
             kwargs: A dictionary containing additional browser kwargs.
 
         Raises:
@@ -61,6 +61,7 @@ def __init__(
         self.proxy = parse_or_search_proxy(proxy) if proxy else None
         self.urls = urls
         self.load_state = load_state
+        self.requires_js_support = requires_js_support
 
     async def ascrape_undetected_chromedriver(self, url: str) -> str:
         """
@@ -186,7 +187,9 @@ def lazy_load(self) -> Iterator[Document]:
         Yields:
             Document: The scraped content encapsulated within a Document object.
         """
-        scraping_fn = getattr(self, f"ascrape_{self.backend}")
+        scraping_fn = (
+            self.ascrape_with_js_support if self.requires_js_support else getattr(self, f"ascrape_{self.backend}")
+        )
 
         for url in self.urls:
             html_content = asyncio.run(scraping_fn(url))
@@ -206,7 +209,9 @@ async def alazy_load(self) -> AsyncIterator[Document]:
             Document: A Document object containing the scraped content, along with its
                 source URL as metadata.
         """
-        scraping_fn = getattr(self, f"ascrape_{self.backend}")
+        scraping_fn = (
+            self.ascrape_with_js_support if self.requires_js_support else getattr(self, f"ascrape_{self.backend}")
+        )
 
         tasks = [scraping_fn(url) for url in self.urls]
         results = await asyncio.gather(*tasks)
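
A hedged usage sketch of the `requires_js_support` flag introduced above; the URL and printed fields are illustrative, and the flag works because `lazy_load`/`alazy_load` now route every request to `ascrape_with_js_support` instead of the backend-specific scraper:

```python
# Illustrative sketch only: the target URL is an assumption, and the flag
# takes effect because lazy_load() picks ascrape_with_js_support() when
# requires_js_support=True (see the diff above).
from scrapegraphai.docloaders.chromium import ChromiumLoader

loader = ChromiumLoader(
    urls=["https://example.com/spa-page"],  # a client-side-rendered page
    backend="playwright",
    headless=True,
    requires_js_support=True,  # force JS rendering regardless of backend
)

for doc in loader.lazy_load():
    # Each Document carries the source URL in its metadata.
    print(doc.metadata["source"], len(doc.page_content))
```
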
""" - scraping_fn = getattr(self, f"ascrape_{self.backend}") + scraping_fn = ( + self.ascrape_with_js_support if self.requires_js_support else getattr(self, f"ascrape_{self.backend}") + ) tasks = [scraping_fn(url) for url in self.urls] results = await asyncio.gather(*tasks) From 12fa3155b4ef746e6d4c52e433221a9815682d92 Mon Sep 17 00:00:00 2001 From: semantic-release-bot Date: Wed, 6 Nov 2024 08:23:03 +0000 Subject: [PATCH 2/9] ci(release): 1.30.0 [skip ci] ## [1.30.0](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.29.0...v1.30.0) (2024-11-06) ### Features * update chromium ([38c6dd2](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/38c6dd2aa1ce31b981eb8c35a56e9533d19df81b)) --- CHANGELOG.md | 7 +++++++ pyproject.toml | 2 +- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 3cba3b99..7449d264 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,10 @@ +## [1.30.0](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.29.0...v1.30.0) (2024-11-06) + + +### Features + +* update chromium ([38c6dd2](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/38c6dd2aa1ce31b981eb8c35a56e9533d19df81b)) + ## [1.29.0](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.28.0...v1.29.0) (2024-11-04) diff --git a/pyproject.toml b/pyproject.toml index 49158ab5..2cef768f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -2,7 +2,7 @@ name = "scrapegraphai" -version = "1.29.0" +version = "1.30.0" From 774df5410cda93c3f4d8a77f5efae2cbd694bffb Mon Sep 17 00:00:00 2001 From: Lorenzo Padoan Date: Thu, 7 Nov 2024 16:00:13 +0100 Subject: [PATCH 3/9] Update README.md --- README.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/README.md b/README.md index 94beb617..091624f2 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,8 @@ # 🕷️ ScrapeGraphAI: You Only Scrape Once +

+VinciGit00%2FScrapegraph-ai | Trendshift +

 [English](https://github.com/VinciGit00/Scrapegraph-ai/blob/main/README.md) | [中文](https://github.com/VinciGit00/Scrapegraph-ai/blob/main/docs/chinese.md) | [日本語](https://github.com/VinciGit00/Scrapegraph-ai/blob/main/docs/japanese.md) | [한국어](https://github.com/VinciGit00/Scrapegraph-ai/blob/main/docs/korean.md) | [Русский](https://github.com/VinciGit00/Scrapegraph-ai/blob/main/docs/russian.md)

From b400cc5a666e1670948cd50f284a2ccba3d136ee Mon Sep 17 00:00:00 2001
From: Lorenzo Padoan
Date: Thu, 7 Nov 2024 16:02:19 +0100
Subject: [PATCH 4/9] Update README.md

---
 README.md | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index 091624f2..690290ed 100644
--- a/README.md
+++ b/README.md
@@ -1,8 +1,5 @@
 # 🕷️ ScrapeGraphAI: You Only Scrape Once
-<div align="center">
-<a href="https://trendshift.io/repositories/9761" target="_blank"><img src="https://trendshift.io/api/badge/repositories/9761" alt="VinciGit00%2FScrapegraph-ai | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a>
-</div>
 
 [English](https://github.com/VinciGit00/Scrapegraph-ai/blob/main/README.md) | [中文](https://github.com/VinciGit00/Scrapegraph-ai/blob/main/docs/chinese.md) | [日本語](https://github.com/VinciGit00/Scrapegraph-ai/blob/main/docs/japanese.md) | [한국어](https://github.com/VinciGit00/Scrapegraph-ai/blob/main/docs/korean.md) | [Русский](https://github.com/VinciGit00/Scrapegraph-ai/blob/main/docs/russian.md)
@@ -15,6 +12,10 @@
 [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg?style=for-the-badge)](https://opensource.org/licenses/MIT)
 [![](https://dcbadge.vercel.app/api/server/gkxQDAjfeX)](https://discord.gg/gkxQDAjfeX)
+<div align="center">
+<a href="https://trendshift.io/repositories/9761" target="_blank"><img src="https://trendshift.io/api/badge/repositories/9761" alt="VinciGit00%2FScrapegraph-ai | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a>
+</div>
+
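 
 ScrapeGraphAI is a *web scraping* python library that uses LLM and direct graph logic to create scraping pipelines for websites and local documents (XML, HTML, JSON, Markdown, etc.).
 
 Just say which information you want to extract and the library will do it for you!

To make the README claim above concrete, a hedged quick-start sketch; the config keys, model name, and target page are illustrative assumptions and are not taken from these patches:

```python
# Illustrative quick-start: "just say which information you want".
# The API key placeholder, model name, and source URL are assumptions.
from scrapegraphai.graphs import SmartScraperGraph

graph_config = {
    "llm": {"api_key": "YOUR_OPENAI_KEY", "model": "gpt-4o-mini"},
    "verbose": False,
    "headless": True,
}

smart_scraper = SmartScraperGraph(
    prompt="List me all the projects with their descriptions",
    source="https://example.com/projects",
    config=graph_config,
)

# The graph builds the scraping pipeline and returns structured data.
result = smart_scraper.run()
print(result)
```
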
From ebe0b0d56e365d2cad28a6c512850847b34efb9c Mon Sep 17 00:00:00 2001
From: bezineb5
Date: Sat, 9 Nov 2024 10:18:21 +0100
Subject: [PATCH 5/9] fix: pipe the output parser into the chain

---
 scrapegraphai/nodes/generate_answer_node.py | 16 ++--------------
 1 file changed, 2 insertions(+), 14 deletions(-)

diff --git a/scrapegraphai/nodes/generate_answer_node.py b/scrapegraphai/nodes/generate_answer_node.py
index 56d57d09..8e89ff39 100644
--- a/scrapegraphai/nodes/generate_answer_node.py
+++ b/scrapegraphai/nodes/generate_answer_node.py
@@ -122,22 +122,10 @@ def execute(self, state: dict) -> dict:
             partial_variables={"context": doc, "format_instructions": format_instructions}
         )
         chain = prompt | self.llm_model
-        raw_response = chain.invoke({"question": user_prompt})
-
         if output_parser:
-            try:
-                answer = output_parser.parse(raw_response.content)
-            except JSONDecodeError:
-                lines = raw_response.split('\n')
-                if lines[0].strip().startswith('```'):
-                    lines = lines[1:]
-                if lines[-1].strip().endswith('```'):
-                    lines = lines[:-1]
-                cleaned_response = '\n'.join(lines)
-                answer = output_parser.parse(cleaned_response)
+            chain = chain | output_parser
 
-        else:
-            answer = raw_response.content
+        answer = chain.invoke({"question": user_prompt})
 
         state.update({self.output[0]: answer})
         return state
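
The replacement above leans on LangChain's pipe composition, letting the parser handle fenced output instead of hand-stripping ``` markers. A hedged standalone sketch of the same pattern; the model, prompt, and question are illustrative assumptions:

```python
# Standalone sketch of the chain-composition pattern adopted in PATCH 5/9.
# Model name, prompt wording, and question are assumptions for illustration.
from langchain_core.output_parsers import JsonOutputParser
from langchain_core.prompts import PromptTemplate
from langchain_openai import ChatOpenAI

llm = ChatOpenAI(model="gpt-4o-mini")
parser = JsonOutputParser()

prompt = PromptTemplate(
    template="Answer as JSON.\n{format_instructions}\n{question}",
    input_variables=["question"],
    partial_variables={"format_instructions": parser.get_format_instructions()},
)

# Piping the parser into the chain replaces the manual try/except
# JSONDecodeError block that stripped code fences from raw LLM output.
chain = prompt | llm | parser
answer = chain.invoke({"question": "Name three web scraping backends."})
print(answer)
```
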
From 5100fbb01746379395a3500eae7eeeb4870be373 Mon Sep 17 00:00:00 2001
From: saied71
Date: Mon, 11 Nov 2024 14:04:52 +0330
Subject: [PATCH 6/9] add HTML source support to FetchNode

---
 scrapegraphai/nodes/fetch_node.py | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/scrapegraphai/nodes/fetch_node.py b/scrapegraphai/nodes/fetch_node.py
index 06842ca4..55f05ab6 100644
--- a/scrapegraphai/nodes/fetch_node.py
+++ b/scrapegraphai/nodes/fetch_node.py
@@ -170,8 +170,9 @@ def handle_file(self, state, input_type, source):
 
         compressed_document = self.load_file_content(source, input_type)
 
-        return self.update_state(state, compressed_document)
-
+        # return self.update_state(state, compressed_document)
+        state.update({self.output[0]: compressed_document})
+        return state
 
     def load_file_content(self, source, input_type):
         """
         Loads the content of a file based on its input type.
@@ -230,8 +231,9 @@ def handle_local_source(self, state, source):
             Document(page_content=parsed_content, metadata={"source": "local_dir"})
         ]
 
-        return self.update_state(state, compressed_document)
-
+        # return self.update_state(state, compressed_document)
+        state.update({self.output[0]: compressed_document})
+        return state
 
     def handle_web_source(self, state, source):
         """
         Handles the web source by fetching HTML content from a URL,
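
After this change each handler writes its result straight into the graph state under the node's first output key and returns the mutated dict. A hedged sketch of the resulting contract; the constructor arguments and state keys are illustrative assumptions:

```python
# Illustrative sketch of the state contract after PATCH 6/9; node wiring
# and key names are assumptions, not taken verbatim from the repo.
from scrapegraphai.nodes import FetchNode

fetch_node = FetchNode(
    input="url | local_dir",  # the node resolves its source from these keys
    output=["doc"],
)

state = {"url": "https://example.com"}
state = fetch_node.execute(state)

# The loaded document is now in the state for downstream nodes to consume.
docs = state["doc"]
```
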
From a6269395a8c2b02c3cbda92055a3b39d64cdda82 Mon Sep 17 00:00:00 2001
From: Marco Vinciguerra
Date: Mon, 18 Nov 2024 14:28:32 +0100
Subject: [PATCH 7/9] removed unused files

---
 extract_data.py   | 27 ---------------------------
 extracted_data.py | 28 ----------------------------
 2 files changed, 55 deletions(-)
 delete mode 100644 extract_data.py
 delete mode 100644 extracted_data.py

diff --git a/extract_data.py b/extract_data.py
deleted file mode 100644
index df3babc2..00000000
--- a/extract_data.py
+++ /dev/null
@@ -1,27 +0,0 @@
-def extract_data(html: str) -> dict:
-    from bs4 import BeautifulSoup
-
-    # Parse the HTML content using BeautifulSoup
-    soup = BeautifulSoup(html, 'html.parser')
-
-    # Initialize an empty list to hold project data
-    projects = []
-
-    # Find all project entries in the HTML
-    project_entries = soup.find_all('div', class_='grid-item')
-
-    # Iterate over each project entry to extract title and description
-    for entry in project_entries:
-        # Extract the title from the h4 element
-        title = entry.find('h4', class_='card-title').get_text(strip=True)
-        # Extract the description from the p element
-        description = entry.find('p', class_='card-text').get_text(strip=True)
-
-        # Append the extracted data as a dictionary to the projects list
-        projects.append({
-            'title': title,
-            'description': description
-        })
-
-    # Return the structured data as a dictionary matching the desired JSON schema
-    return {'projects': projects}
\ No newline at end of file
diff --git a/extracted_data.py b/extracted_data.py
deleted file mode 100644
index 45da5e49..00000000
--- a/extracted_data.py
+++ /dev/null
@@ -1,28 +0,0 @@
-def extract_data(html: str) -> dict:
-    from bs4 import BeautifulSoup
-
-    # Parse the HTML content using BeautifulSoup
-    soup = BeautifulSoup(html, 'html.parser')
-
-    # Initialize an empty list to hold project data
-    projects = []
-
-    # Find all project entries in the HTML
-    project_entries = soup.find_all('div', class_='grid-item')
-
-    # Iterate over each project entry to extract title and description
-    for entry in project_entries:
-        # Extract the title from the card-title class
-        title = entry.find('h4', class_='card-title').get_text(strip=True)
-
-        # Extract the description from the card-text class
-        description = entry.find('p', class_='card-text').get_text(strip=True)
-
-        # Append the extracted data as a dictionary to the projects list
-        projects.append({
-            'title': title,
-            'description': description
-        })
-
-    # Return the structured data as a dictionary matching the desired JSON schema
-    return {'projects': projects}
\ No newline at end of file

From e99ddda6eb731853536100e500454791f7b4c4b8 Mon Sep 17 00:00:00 2001
From: Lorenzo Padoan
Date: Mon, 18 Nov 2024 17:56:54 +0100
Subject: [PATCH 8/9] Update FUNDING.yml

Add open collective
---
 .github/FUNDING.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/FUNDING.yml b/.github/FUNDING.yml
index 4c4dbecc..59b38648 100644
--- a/.github/FUNDING.yml
+++ b/.github/FUNDING.yml
@@ -2,7 +2,7 @@
 
 github: ScrapeGraphAI
 patreon: # Replace with a single Patreon username
-open_collective:
+open_collective: open_collective: https://opencollective.com/scrapegraphai
 ko_fi: # Replace with a single Ko-fi username
 tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel
 community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry

From 5dfda11cef262ffdc64833870786b6d6cd8873f9 Mon Sep 17 00:00:00 2001
From: Lorenzo Padoan
Date: Mon, 18 Nov 2024 17:59:18 +0100
Subject: [PATCH 9/9] Update FUNDING.yml

---
 .github/FUNDING.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/FUNDING.yml b/.github/FUNDING.yml
index 59b38648..0a2af007 100644
--- a/.github/FUNDING.yml
+++ b/.github/FUNDING.yml
@@ -2,7 +2,7 @@
 
 github: ScrapeGraphAI
 patreon: # Replace with a single Patreon username
-open_collective: open_collective: https://opencollective.com/scrapegraphai
+open_collective: https://opencollective.com/scrapegraphai
 ko_fi: # Replace with a single Ko-fi username
 tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel
 community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry