From fb87d01ced72c0912be86ae01d93ceefa5d2df08 Mon Sep 17 00:00:00 2001
From: Marco Vinciguerra
Date: Thu, 1 Aug 2024 11:27:10 +0200
Subject: [PATCH 1/9] Create browser_base.py

---
 examples/extras/browser_base.py | 47 +++++++++++++++++++++++++++++++++
 1 file changed, 47 insertions(+)
 create mode 100644 examples/extras/browser_base.py

diff --git a/examples/extras/browser_base.py b/examples/extras/browser_base.py
new file mode 100644
index 00000000..465c80ba
--- /dev/null
+++ b/examples/extras/browser_base.py
@@ -0,0 +1,47 @@
+"""
+Basic example of scraping pipeline using SmartScraper
+"""
+
+import os, json
+from scrapegraphai.graphs import SmartScraperGraph
+from scrapegraphai.utils import prettify_exec_info
+from dotenv import load_dotenv
+load_dotenv()
+
+# ************************************************
+# Define the configuration for the graph
+# ************************************************
+
+
+graph_config = {
+    "llm": {
+        "api_key": os.getenv("OPENAI_API_KEY"),
+        "model": "gpt-3.5-turbo",
+    },
+    "browser_base": {
+        "api_key": os.getenv("BROWSER_BASE_API_KEY"),
+        "project_id": os.getenv("BROWSER_BASE_PROJECT_ID"),
+    },
+    "verbose": True,
+    "headless": False,
+}
+
+# ************************************************
+# Create the SmartScraperGraph instance and run it
+# ************************************************
+
+smart_scraper_graph = SmartScraperGraph(
+    prompt="List me what does the company do, the name and a contact email.",
+    source="https://scrapegraphai.com/",
+    config=graph_config
+)
+
+result = smart_scraper_graph.run()
+print(json.dumps(result, indent=4))
+
+# ************************************************
+# Get graph execution info
+# ************************************************
+
+graph_exec_info = smart_scraper_graph.get_execution_info()
+print(prettify_exec_info(graph_exec_info))
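The example reads every credential from the environment via python-dotenv. A small preflight check, offered here as a sketch (it assumes the three variable names used in the config above are the ones defined in your .env file), fails fast before any scraping starts:

```
# Sketch: fail fast if the environment variables the example expects are missing.
import os
from dotenv import load_dotenv

load_dotenv()

required = ("OPENAI_API_KEY", "BROWSER_BASE_API_KEY", "BROWSER_BASE_PROJECT_ID")
missing = [name for name in required if not os.getenv(name)]
if missing:
    raise EnvironmentError(f"Missing environment variables: {', '.join(missing)}")
```
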
From 7076ab12d3e07d02a96ca00375454385303ae004 Mon Sep 17 00:00:00 2001
From: Marco Vinciguerra
Date: Thu, 1 Aug 2024 11:31:27 +0200
Subject: [PATCH 2/9] alignment

---
 pyproject.toml                           |  1 +
 requirements-dev.lock                    |  5 ++++
 requirements.lock                        |  5 ++++
 scrapegraphai/docloaders/__init__.py     |  1 +
 scrapegraphai/docloaders/browser_base.py | 38 ++++++++++++++++++++++++
 5 files changed, 50 insertions(+)
 create mode 100644 scrapegraphai/docloaders/browser_base.py

diff --git a/pyproject.toml b/pyproject.toml
index 77d48e36..2738bfd6 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -36,6 +36,7 @@ dependencies = [
   "langchain-fireworks>=0.1.3",
   "langchain-community>=0.2.9",
   "langchain-huggingface>=0.0.3",
+  "browserbase==0.3.0"
 ]
 
 license = "MIT"
diff --git a/requirements-dev.lock b/requirements-dev.lock
index 6bbbd4b9..24b7156d 100644
--- a/requirements-dev.lock
+++ b/requirements-dev.lock
@@ -54,6 +54,8 @@ boto3==1.34.146
 botocore==1.34.146
     # via boto3
     # via s3transfer
+browserbase==0.3.0
+    # via scrapegraphai
 burr==0.22.1
     # via scrapegraphai
 cachetools==5.4.0
@@ -208,6 +210,7 @@ httptools==0.6.1
     # via uvicorn
 httpx==0.27.0
     # via anthropic
+    # via browserbase
     # via fastapi
     # via fireworks-ai
     # via groq
@@ -383,6 +386,7 @@ pillow==10.4.0
 platformdirs==4.2.2
     # via pylint
 playwright==1.45.0
+    # via browserbase
     # via scrapegraphai
     # via undetected-playwright
 pluggy==1.5.0
@@ -412,6 +416,7 @@ pyasn1-modules==0.4.0
     # via google-auth
 pydantic==2.8.2
     # via anthropic
+    # via browserbase
     # via burr
     # via fastapi
     # via fastapi-pagination
diff --git a/requirements.lock b/requirements.lock
index b4d1015d..0e8bb930 100644
--- a/requirements.lock
+++ b/requirements.lock
@@ -37,6 +37,8 @@ boto3==1.34.146
 botocore==1.34.146
     # via boto3
     # via s3transfer
+browserbase==0.3.0
+    # via scrapegraphai
 cachetools==5.4.0
     # via google-auth
 certifi==2024.7.4
@@ -153,6 +155,7 @@ httplib2==0.22.0
     # via google-auth-httplib2
 httpx==0.27.0
     # via anthropic
+    # via browserbase
     # via fireworks-ai
     # via groq
     # via openai
@@ -275,6 +278,7 @@ pillow==10.4.0
     # via langchain-nvidia-ai-endpoints
     # via sentence-transformers
 playwright==1.45.0
+    # via browserbase
     # via scrapegraphai
     # via undetected-playwright
 proto-plus==1.24.0
@@ -299,6 +303,7 @@ pyasn1-modules==0.4.0
     # via google-auth
 pydantic==2.8.2
     # via anthropic
+    # via browserbase
     # via fireworks-ai
     # via google-cloud-aiplatform
     # via google-generativeai
diff --git a/scrapegraphai/docloaders/__init__.py b/scrapegraphai/docloaders/__init__.py
index a9e45407..51561a42 100644
--- a/scrapegraphai/docloaders/__init__.py
+++ b/scrapegraphai/docloaders/__init__.py
@@ -1,3 +1,4 @@
 """__init__.py file for docloaders folder"""
 
 from .chromium import ChromiumLoader
+from .broswer_base import browser_base_fetch
diff --git a/scrapegraphai/docloaders/browser_base.py b/scrapegraphai/docloaders/browser_base.py
new file mode 100644
index 00000000..8f2a0b8e
--- /dev/null
+++ b/scrapegraphai/docloaders/browser_base.py
@@ -0,0 +1,38 @@
+"""
+browserbase integration module
+"""
+from browserbase import Browserbase
+
+def browser_base_fetch(api_key: str, project_id: str, link: str) -> object:
+    """
+    BrowserBase Fetch
+    This module provides an interface to the BrowserBase API.
+    The `browser_base_fetch` function takes three arguments:
+    - `api_key`: The API key provided by BrowserBase.
+    - `project_id`: The ID of the project on BrowserBase where you want to fetch data from.
+    - `link`: The URL or link that you want to fetch data from.
+    It initializes a Browserbase object with the given API key and project ID,
+    then uses this object to load the specified link.
+    It returns the result of the loading operation.
+    Example usage:
+    ```
+    from browser_base_fetch import browser_base_fetch
+    result = browser_base_fetch(api_key="your_api_key",
+                                project_id="your_project_id", link="https://example.com")
+    print(result)
+    ```
+    Please note that you need to replace "your_api_key" and "your_project_id"
+    with your actual BrowserBase API key and project ID.
+    Args:
+        api_key (str): The API key provided by BrowserBase.
+        project_id (str): The ID of the project on BrowserBase where you want to fetch data from.
+        link (str): The URL or link that you want to fetch data from.
+    Returns:
+        object: The result of the loading operation.
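The new docloader can be exercised on its own before it is wired into any graph. A minimal smoke test, as a sketch (it assumes valid Browserbase credentials; the URL and placeholder strings are illustrative only):

```
# Sketch: call the docloader directly, outside any graph.
from scrapegraphai.docloaders.browser_base import browser_base_fetch

result = browser_base_fetch(
    api_key="your_api_key",          # placeholder credentials
    project_id="your_project_id",
    link="https://example.com",
)
print(result)
```
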
+ """ + + browserbase = Browserbase(api_key=api_key, project_id=project_id) + + result = browserbase.load(link) + + return result From 5ecdbe715f4bb223fa1be834fda07ccea2a51cb9 Mon Sep 17 00:00:00 2001 From: Marco Vinciguerra Date: Thu, 1 Aug 2024 12:51:18 +0200 Subject: [PATCH 3/9] feat: add integration in the abstract grapgh --- ...ser_base.py => browser_base_integration.py} | 6 ++++-- scrapegraphai/docloaders/__init__.py | 2 +- scrapegraphai/graphs/abstract_graph.py | 15 ++++++++------- scrapegraphai/nodes/fetch_node.py | 18 ++++++++++++++---- 4 files changed, 27 insertions(+), 14 deletions(-) rename examples/extras/{browser_base.py => browser_base_integration.py} (98%) diff --git a/examples/extras/browser_base.py b/examples/extras/browser_base_integration.py similarity index 98% rename from examples/extras/browser_base.py rename to examples/extras/browser_base_integration.py index 465c80ba..97529879 100644 --- a/examples/extras/browser_base.py +++ b/examples/extras/browser_base_integration.py @@ -2,10 +2,12 @@ Basic example of scraping pipeline using SmartScraper """ -import os, json +import os +import json +from dotenv import load_dotenv from scrapegraphai.graphs import SmartScraperGraph from scrapegraphai.utils import prettify_exec_info -from dotenv import load_dotenv + load_dotenv() # ************************************************ diff --git a/scrapegraphai/docloaders/__init__.py b/scrapegraphai/docloaders/__init__.py index 51561a42..45a3783d 100644 --- a/scrapegraphai/docloaders/__init__.py +++ b/scrapegraphai/docloaders/__init__.py @@ -1,4 +1,4 @@ """__init__.py file for docloaders folder""" from .chromium import ChromiumLoader -from .broswer_base import browser_base_fetch +from .browser_base import browser_base_fetch diff --git a/scrapegraphai/graphs/abstract_graph.py b/scrapegraphai/graphs/abstract_graph.py index 50de0a94..2ccc988b 100644 --- a/scrapegraphai/graphs/abstract_graph.py +++ b/scrapegraphai/graphs/abstract_graph.py @@ -72,15 +72,16 @@ def __init__(self, prompt: str, config: dict, self.source = source self.config = config self.schema = schema - self.llm_model = self._create_llm(config["llm"], chat=True) - self.embedder_model = self._create_default_embedder(llm_config=config["llm"]) if "embeddings" not in config else self._create_embedder( - config["embeddings"]) - self.verbose = False if config is None else config.get( + self.llm_model = self._create_llm(self.config["llm"], chat=True) + self.embedder_model = self._create_default_embedder(llm_config=self.config["llm"]) if "embeddings" not in self.config else self._create_embedder( + self.config["embeddings"]) + self.verbose = False if self.config is None else self.config.get( "verbose", False) - self.headless = True if config is None else config.get( + self.headless = True if self.config is None else config.get( "headless", True) - self.loader_kwargs = config.get("loader_kwargs", {}) - self.cache_path = config.get("cache_path", False) + self.loader_kwargs = self.config.get("loader_kwargs", {}) + self.cache_path = self.config.get("cache_path", False) + self.browser_base = self.config.get("browser_base") # Create the graph self.graph = self._create_graph() diff --git a/scrapegraphai/nodes/fetch_node.py b/scrapegraphai/nodes/fetch_node.py index 64a80cfe..95561a66 100644 --- a/scrapegraphai/nodes/fetch_node.py +++ b/scrapegraphai/nodes/fetch_node.py @@ -11,6 +11,7 @@ from langchain_core.documents import Document from ..utils.cleanup_html import cleanup_html from ..docloaders import ChromiumLoader +from 
 from ..utils.convert_to_md import convert_to_md
 from ..utils.logging import get_logger
 from .base_node import BaseNode
@@ -74,6 +75,8 @@ def __init__(
             False if node_config is None else node_config.get("cut", True)
         )
 
+        self.browser_base = node_config.get("browser_base")
+
     def execute(self, state):
         """
         Executes the node's logic to fetch HTML content from a specified URL and
@@ -164,7 +167,7 @@ def execute(self, state):
 
         parsed_content = source
 
-        if isinstance(self.llm_model, ChatOpenAI) and not self.script_creator or self.force and not self.script_creator:
+        if isinstance(self.llm_model, ChatOpenAI) and not self.script_creator or self.force and not self.script_creator:
 
             parsed_content = convert_to_md(source)
         compressed_document = [
@@ -177,7 +180,7 @@ def execute(self, state):
         if response.status_code == 200:
             if not response.text.strip():
                 raise ValueError("No HTML body content found in the response.")
-
+
             parsed_content = response
 
         if not self.cut:
@@ -198,8 +201,15 @@ def execute(self, state):
         if self.node_config is not None:
             loader_kwargs = self.node_config.get("loader_kwargs", {})
 
-        loader = ChromiumLoader([source], headless=self.headless, **loader_kwargs)
-        document = loader.load()
+        if self.browser_base is not None:
+            document = [
+                Document(page_content= browser_base_fetch(self.browser_base.get("api_key"),
+                                       self.browser_base.get("project_id"), source),
+                         metadata={})
+            ]
+        else:
+            loader = ChromiumLoader([source], headless=self.headless, **loader_kwargs)
+            document = loader.load()
 
         if not document or not document[0].page_content.strip():
             raise ValueError("No HTML body content found in the document fetched by ChromiumLoader.")

From 65f9e3a24c8f192d42fb467c03a33fd4b1f64588 Mon Sep 17 00:00:00 2001
From: Marco Vinciguerra
Date: Thu, 1 Aug 2024 12:58:35 +0200
Subject: [PATCH 4/9] Delete browser_base.py

---
 scrapegraphai/docloaders/browser_base.py | 38 ------------------------
 1 file changed, 38 deletions(-)
 delete mode 100644 scrapegraphai/docloaders/browser_base.py

diff --git a/scrapegraphai/docloaders/browser_base.py b/scrapegraphai/docloaders/browser_base.py
deleted file mode 100644
index 8f2a0b8e..00000000
--- a/scrapegraphai/docloaders/browser_base.py
+++ /dev/null
@@ -1,38 +0,0 @@
-"""
-browserbase integration module
-"""
-from browserbase import Browserbase
-
-def browser_base_fetch(api_key: str, project_id: str, link: str) -> object:
-    """
-    BrowserBase Fetch
-    This module provides an interface to the BrowserBase API.
-    The `browser_base_fetch` function takes three arguments:
-    - `api_key`: The API key provided by BrowserBase.
-    - `project_id`: The ID of the project on BrowserBase where you want to fetch data from.
-    - `link`: The URL or link that you want to fetch data from.
-    It initializes a Browserbase object with the given API key and project ID,
-    then uses this object to load the specified link.
-    It returns the result of the loading operation.
-    Example usage:
-    ```
-    from browser_base_fetch import browser_base_fetch
-    result = browser_base_fetch(api_key="your_api_key",
-                                project_id="your_project_id", link="https://example.com")
-    print(result)
-    ```
-    Please note that you need to replace "your_api_key" and "your_project_id"
-    with your actual BrowserBase API key and project ID.
-    Args:
-        api_key (str): The API key provided by BrowserBase.
-        project_id (str): The ID of the project on BrowserBase where you want to fetch data from.
-        link (str): The URL or link that you want to fetch data from.
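`ChromiumLoader` remains the default local backend that `FetchNode` falls back to when no `browser_base` config is given. A direct-use sketch mirroring the call in fetch_node.py (it assumes Playwright's Chromium is installed locally):

```
# Sketch: the local fallback path, mirroring the call in fetch_node.py.
from scrapegraphai.docloaders import ChromiumLoader

loader = ChromiumLoader(["https://example.com"], headless=True)
document = loader.load()  # a list of langchain Documents, one per source
print(document[0].page_content[:200])
```
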
-    Returns:
-        object: The result of the loading operation.
-    """
-
-    browserbase = Browserbase(api_key=api_key, project_id=project_id)
-
-    result = browserbase.load(link)
-
-    return result

From d03eedccd718379f267fa305165ad61a295112f8 Mon Sep 17 00:00:00 2001
From: Marco Vinciguerra
Date: Thu, 1 Aug 2024 13:05:13 +0200
Subject: [PATCH 5/9] Update chromium.py

---
 scrapegraphai/docloaders/chromium.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/scrapegraphai/docloaders/chromium.py b/scrapegraphai/docloaders/chromium.py
index 474c22de..cb0cfd9a 100644
--- a/scrapegraphai/docloaders/chromium.py
+++ b/scrapegraphai/docloaders/chromium.py
@@ -1,3 +1,6 @@
+"""
+Chromium module
+"""
 import asyncio
 from typing import Any, AsyncIterator, Iterator, List, Optional
 

From e21d461710e036eb3f71382a2d0d832bf1863c39 Mon Sep 17 00:00:00 2001
From: Marco Vinciguerra
Date: Thu, 1 Aug 2024 13:16:49 +0200
Subject: [PATCH 6/9] push

---
 .../docloaders/{broswer_base.py => browser_base.py} | 8 +++++---
 scrapegraphai/nodes/fetch_node.py                   | 9 ++++-----
 2 files changed, 9 insertions(+), 8 deletions(-)
 rename scrapegraphai/docloaders/{broswer_base.py => browser_base.py} (83%)

diff --git a/scrapegraphai/docloaders/broswer_base.py b/scrapegraphai/docloaders/browser_base.py
similarity index 83%
rename from scrapegraphai/docloaders/broswer_base.py
rename to scrapegraphai/docloaders/browser_base.py
index 6127c097..47798e29 100644
--- a/scrapegraphai/docloaders/broswer_base.py
+++ b/scrapegraphai/docloaders/browser_base.py
@@ -2,8 +2,9 @@
 browserbase integration module
 """
 from browserbase import Browserbase
+from typing import List
 
-def browser_base_fetch(api_key: str, project_id: str, link: str) -> object:
+def browser_base_fetch(api_key: str, project_id: str, link: List[str]) -> List[str]:
     """
     BrowserBase Fetch
 
@@ -15,7 +16,8 @@ def browser_base_fetch(api_key: str, project_id: str, link: List[str]) -> List[s
     - `link`: The URL or link that you want to fetch data from.
 
     It initializes a Browserbase object with the given API key and project ID,
-    then uses this object to load the specified link. It returns the result of the loading operation.
+    then uses this object to load the specified link.
+    It returns the result of the loading operation.
 
     Example usage:
@@ -41,6 +43,6 @@ def browser_base_fetch(api_key: str, project_id: str, link: List[str]) -> List[s
 
     browserbase = Browserbase(api_key=api_key, project_id=project_id)
 
-    result = browserbase.load(link)
+    result = browserbase.load([link])
 
     return result
diff --git a/scrapegraphai/nodes/fetch_node.py b/scrapegraphai/nodes/fetch_node.py
index 95561a66..741f6a22 100644
--- a/scrapegraphai/nodes/fetch_node.py
+++ b/scrapegraphai/nodes/fetch_node.py
@@ -202,11 +202,10 @@ def execute(self, state):
             loader_kwargs = self.node_config.get("loader_kwargs", {})
 
         if self.browser_base is not None:
-            document = [
-                Document(page_content= browser_base_fetch(self.browser_base.get("api_key"),
-                                       self.browser_base.get("project_id"), source),
-                         metadata={})
-            ]
+            data = browser_base_fetch(self.browser_base.get("api_key"),
+                                      self.browser_base.get("project_id"), source)
+
+            document = [Document(page_content= data, metadata={"source": "html file"})]
         else:
             loader = ChromiumLoader([source], headless=self.headless, **loader_kwargs)
             document = loader.load()

From 968c69e217d9c180b9b8c2aa52ca59b9a1733525 Mon Sep 17 00:00:00 2001
From: Marco Vinciguerra
Date: Thu, 1 Aug 2024 13:23:54 +0200
Subject: [PATCH 7/9] fix: fixed bug on fetch_node

Co-Authored-By: Federico Minutoli <40361744+DiTo97@users.noreply.github.com>
---
 scrapegraphai/docloaders/browser_base.py | 2 +-
 scrapegraphai/nodes/fetch_node.py        | 7 ++++---
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/scrapegraphai/docloaders/browser_base.py b/scrapegraphai/docloaders/browser_base.py
index 47798e29..dd290d2d 100644
--- a/scrapegraphai/docloaders/browser_base.py
+++ b/scrapegraphai/docloaders/browser_base.py
@@ -43,6 +43,6 @@ def browser_base_fetch(api_key: str, project_id: str, link: List[str]) -> List[s
 
     browserbase = Browserbase(api_key=api_key, project_id=project_id)
 
-    result = browserbase.load([link])
+    result = browserbase.load(link)
 
     return result
diff --git a/scrapegraphai/nodes/fetch_node.py b/scrapegraphai/nodes/fetch_node.py
index 741f6a22..86b02bf6 100644
--- a/scrapegraphai/nodes/fetch_node.py
+++ b/scrapegraphai/nodes/fetch_node.py
@@ -202,10 +202,11 @@ def execute(self, state):
             loader_kwargs = self.node_config.get("loader_kwargs", {})
 
         if self.browser_base is not None:
-            data = browser_base_fetch(self.browser_base.get("api_key"),
-                                      self.browser_base.get("project_id"), source)
+            if self.browser_base is not None:
+                data = browser_base_fetch(self.browser_base.get("api_key"),
+                                          self.browser_base.get("project_id"), [source])
 
-            document = [Document(page_content= data, metadata={"source": "html file"})]
+                document = [Document(page_content=content, metadata={"source": source}) for content in data]
         else:
             loader = ChromiumLoader([source], headless=self.headless, **loader_kwargs)
             document = loader.load()

From 6d8e02cd62ecf213cfff6e8258b79564db8eeb55 Mon Sep 17 00:00:00 2001
From: Marco Vinciguerra
Date: Thu, 1 Aug 2024 13:24:32 +0200
Subject: [PATCH 8/9] Update browser_base.py

Co-Authored-By: Federico Minutoli <40361744+DiTo97@users.noreply.github.com>
---
 scrapegraphai/docloaders/browser_base.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/scrapegraphai/docloaders/browser_base.py b/scrapegraphai/docloaders/browser_base.py
index dd290d2d..77628bc5 100644
--- a/scrapegraphai/docloaders/browser_base.py
+++ b/scrapegraphai/docloaders/browser_base.py
@@ -1,8 +1,8 @@
 """
 browserbase integration module
 """
-from browserbase import Browserbase
 from typing import List
+from browserbase import Browserbase
 
 def browser_base_fetch(api_key: str, project_id: str, link: List[str]) -> List[str]:
     """
@@ -43,6 +43,6 @@ def browser_base_fetch(api_key: str, project_id: str, link: List[str]) -> List[s
 
     browserbase = Browserbase(api_key=api_key, project_id=project_id)
 
-    result = browserbase.load(link)
+    result = browserbase.load([link])
 
     return result

From be870a43161cb2ed7f0f60553c2f3742c6b939eb Mon Sep 17 00:00:00 2001
From: Marco Vinciguerra
Date: Thu, 1 Aug 2024 13:24:48 +0200
Subject: [PATCH 9/9] Update fetch_node.py

Co-Authored-By: Federico Minutoli <40361744+DiTo97@users.noreply.github.com>
---
 scrapegraphai/nodes/fetch_node.py | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/scrapegraphai/nodes/fetch_node.py b/scrapegraphai/nodes/fetch_node.py
index 86b02bf6..4971ddb3 100644
--- a/scrapegraphai/nodes/fetch_node.py
+++ b/scrapegraphai/nodes/fetch_node.py
@@ -202,11 +202,10 @@ def execute(self, state):
             loader_kwargs = self.node_config.get("loader_kwargs", {})
 
         if self.browser_base is not None:
-            if self.browser_base is not None:
-                data = browser_base_fetch(self.browser_base.get("api_key"),
-                                          self.browser_base.get("project_id"), [source])
+            data = browser_base_fetch(self.browser_base.get("api_key"),
+                                      self.browser_base.get("project_id"), [source])
 
-                document = [Document(page_content=content, metadata={"source": source}) for content in data]
+            document = [Document(page_content=content, metadata={"source": source}) for content in data]
         else:
             loader = ChromiumLoader([source], headless=self.headless, **loader_kwargs)
             document = loader.load()
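With the last two patches applied, the loader receives a list of links and `FetchNode` wraps each returned page in its own `Document`. An end-to-end sketch of the final fetch path (placeholder credentials; it mirrors the call `FetchNode` now makes and assumes `browser_base_fetch` returns one content string per link):

```
# Sketch: the fetch path as it stands at the end of the series.
from langchain_core.documents import Document
from scrapegraphai.docloaders import browser_base_fetch

source = "https://example.com"
data = browser_base_fetch("your_api_key", "your_project_id", [source])  # placeholders
documents = [Document(page_content=content, metadata={"source": source}) for content in data]
print(f"fetched {len(documents)} document(s)")
```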