diff --git a/examples/extras/browser_base_integration.py b/examples/extras/browser_base_integration.py
new file mode 100644
index 00000000..97529879
--- /dev/null
+++ b/examples/extras/browser_base_integration.py
@@ -0,0 +1,49 @@
+"""
+Basic example of a scraping pipeline using SmartScraperGraph with Browserbase
+"""
+
+import os
+import json
+from dotenv import load_dotenv
+from scrapegraphai.graphs import SmartScraperGraph
+from scrapegraphai.utils import prettify_exec_info
+
+load_dotenv()
+
+# ************************************************
+# Define the configuration for the graph
+# ************************************************
+
+
+graph_config = {
+    "llm": {
+        "api_key": os.getenv("OPENAI_API_KEY"),
+        "model": "gpt-3.5-turbo",
+    },
+    "browser_base": {
+        "api_key": os.getenv("BROWSER_BASE_API_KEY"),
+        "project_id": os.getenv("BROWSER_BASE_PROJECT_ID"),
+    },
+    "verbose": True,
+    "headless": False,
+}
+
+# ************************************************
+# Create the SmartScraperGraph instance and run it
+# ************************************************
+
+smart_scraper_graph = SmartScraperGraph(
+    prompt="List what the company does, its name, and a contact email.",
+    source="https://scrapegraphai.com/",
+    config=graph_config
+)
+
+result = smart_scraper_graph.run()
+print(json.dumps(result, indent=4))
+
+# ************************************************
+# Get graph execution info
+# ************************************************
+
+graph_exec_info = smart_scraper_graph.get_execution_info()
+print(prettify_exec_info(graph_exec_info))
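The example above reads three credentials from the environment (or a .env file) via load_dotenv(). As a minimal, illustrative pre-flight check — not part of this PR; the variable names are the ones the script itself reads — you could fail fast before constructing the graph:

import os

# Hypothetical helper: abort early if any credential the example expects is unset.
REQUIRED_VARS = ("OPENAI_API_KEY", "BROWSER_BASE_API_KEY", "BROWSER_BASE_PROJECT_ID")
missing = [name for name in REQUIRED_VARS if not os.getenv(name)]
if missing:
    raise EnvironmentError(f"Missing environment variables: {', '.join(missing)}")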
diff --git a/requirements-dev.lock b/requirements-dev.lock
index 6bbbd4b9..24b7156d 100644
--- a/requirements-dev.lock
+++ b/requirements-dev.lock
@@ -54,6 +54,8 @@ boto3==1.34.146
 botocore==1.34.146
     # via boto3
     # via s3transfer
+browserbase==0.3.0
+    # via scrapegraphai
 burr==0.22.1
     # via scrapegraphai
 cachetools==5.4.0
@@ -208,6 +210,7 @@ httptools==0.6.1
     # via uvicorn
 httpx==0.27.0
     # via anthropic
+    # via browserbase
     # via fastapi
     # via fireworks-ai
     # via groq
@@ -383,6 +386,7 @@ pillow==10.4.0
 platformdirs==4.2.2
     # via pylint
 playwright==1.45.0
+    # via browserbase
     # via scrapegraphai
     # via undetected-playwright
 pluggy==1.5.0
@@ -412,6 +416,7 @@ pyasn1-modules==0.4.0
     # via google-auth
 pydantic==2.8.2
     # via anthropic
+    # via browserbase
     # via burr
     # via fastapi
     # via fastapi-pagination
diff --git a/requirements.lock b/requirements.lock
index b4d1015d..0e8bb930 100644
--- a/requirements.lock
+++ b/requirements.lock
@@ -37,6 +37,8 @@ boto3==1.34.146
 botocore==1.34.146
     # via boto3
     # via s3transfer
+browserbase==0.3.0
+    # via scrapegraphai
 cachetools==5.4.0
     # via google-auth
 certifi==2024.7.4
@@ -153,6 +155,7 @@ httplib2==0.22.0
     # via google-auth-httplib2
 httpx==0.27.0
     # via anthropic
+    # via browserbase
     # via fireworks-ai
     # via groq
     # via openai
@@ -275,6 +278,7 @@ pillow==10.4.0
     # via langchain-nvidia-ai-endpoints
     # via sentence-transformers
 playwright==1.45.0
+    # via browserbase
     # via scrapegraphai
     # via undetected-playwright
 proto-plus==1.24.0
@@ -299,6 +303,7 @@ pyasn1-modules==0.4.0
     # via google-auth
 pydantic==2.8.2
     # via anthropic
+    # via browserbase
     # via fireworks-ai
     # via google-cloud-aiplatform
     # via google-generativeai
diff --git a/scrapegraphai/docloaders/__init__.py b/scrapegraphai/docloaders/__init__.py
index 0efdc879..45a3783d 100644
--- a/scrapegraphai/docloaders/__init__.py
+++ b/scrapegraphai/docloaders/__init__.py
@@ -1,4 +1,4 @@
 """__init__.py file for docloaders folder"""
 
 from .chromium import ChromiumLoader
-from .broswer_base import browser_base_fetch
\ No newline at end of file
+from .browser_base import browser_base_fetch
diff --git a/scrapegraphai/docloaders/broswer_base.py b/scrapegraphai/docloaders/browser_base.py
similarity index 83%
rename from scrapegraphai/docloaders/broswer_base.py
rename to scrapegraphai/docloaders/browser_base.py
index 6127c097..77628bc5 100644
--- a/scrapegraphai/docloaders/broswer_base.py
+++ b/scrapegraphai/docloaders/browser_base.py
@@ -1,9 +1,10 @@
 """
 browserbase integration module
 """
+from typing import List
 from browserbase import Browserbase
 
-def browser_base_fetch(api_key: str, project_id: str, link: str) -> object:
+def browser_base_fetch(api_key: str, project_id: str, link: List[str]) -> List[str]:
     """
     BrowserBase Fetch
 
@@ -15,7 +16,8 @@ def browser_base_fetch(api_key: str, project_id: str, link: str) -> object:
     - `link`: The URL or link that you want to fetch data from.
 
     It initializes a Browserbase object with the given API key and project ID,
-    then uses this object to load the specified link. It returns the result of the loading operation.
+    then uses this object to load the specified link.
+    It returns the result of the loading operation.
 
     Example usage:
 
diff --git a/scrapegraphai/docloaders/chromium.py b/scrapegraphai/docloaders/chromium.py
index 474c22de..cb0cfd9a 100644
--- a/scrapegraphai/docloaders/chromium.py
+++ b/scrapegraphai/docloaders/chromium.py
@@ -1,3 +1,6 @@
+"""
+Chromium module
+"""
 import asyncio
 from typing import Any, AsyncIterator, Iterator, List, Optional
 
diff --git a/scrapegraphai/graphs/abstract_graph.py b/scrapegraphai/graphs/abstract_graph.py
index 0348b3cc..a7493351 100644
--- a/scrapegraphai/graphs/abstract_graph.py
+++ b/scrapegraphai/graphs/abstract_graph.py
@@ -59,10 +59,11 @@ def __init__(self, prompt: str, config: dict,
         self.llm_model = self._create_llm(config["llm"])
         self.verbose = False if config is None else config.get(
             "verbose", False)
-        self.headless = True if config is None else config.get(
+        self.headless = True if self.config is None else self.config.get(
             "headless", True)
-        self.loader_kwargs = config.get("loader_kwargs", {})
-        self.cache_path = config.get("cache_path", False)
+        self.loader_kwargs = self.config.get("loader_kwargs", {})
+        self.cache_path = self.config.get("cache_path", False)
+        self.browser_base = self.config.get("browser_base")
 
         # Create the graph
         self.graph = self._create_graph()
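With the new signature, browser_base_fetch takes a list of URLs and, per its List[str] annotation, returns one page-content string per URL. A usage sketch follows; the credentials are placeholders, and the return shape assumes the annotation holds for browserbase 0.3.0's load:

from scrapegraphai.docloaders.browser_base import browser_base_fetch

# Placeholder credentials; substitute real Browserbase values.
pages = browser_base_fetch(
    api_key="your-browserbase-api-key",
    project_id="your-browserbase-project-id",
    link=["https://scrapegraphai.com/"],
)

# One content string per requested URL, assuming the annotated return shape.
for page in pages:
    print(page[:200])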
diff --git a/scrapegraphai/nodes/fetch_node.py b/scrapegraphai/nodes/fetch_node.py
index 64a80cfe..4971ddb3 100644
--- a/scrapegraphai/nodes/fetch_node.py
+++ b/scrapegraphai/nodes/fetch_node.py
@@ -11,6 +11,7 @@
 from langchain_core.documents import Document
 from ..utils.cleanup_html import cleanup_html
 from ..docloaders import ChromiumLoader
+from ..docloaders.browser_base import browser_base_fetch
 from ..utils.convert_to_md import convert_to_md
 from ..utils.logging import get_logger
 from .base_node import BaseNode
@@ -74,6 +75,8 @@ def __init__(
             False if node_config is None else node_config.get("cut", True)
         )
 
+        self.browser_base = None if node_config is None else node_config.get("browser_base")
+
     def execute(self, state):
         """
         Executes the node's logic to fetch HTML content from a specified URL and
@@ -164,7 +167,7 @@ def execute(self, state):
 
         parsed_content = source
 
-        if isinstance(self.llm_model, ChatOpenAI) and not self.script_creator or self.force and not self.script_creator:
+        if (isinstance(self.llm_model, ChatOpenAI) and not self.script_creator) or (self.force and not self.script_creator):
             parsed_content = convert_to_md(source)
 
         compressed_document = [
@@ -177,7 +180,7 @@ def execute(self, state):
             if response.status_code == 200:
                 if not response.text.strip():
                     raise ValueError("No HTML body content found in the response.")
-                
+
                 parsed_content = response
 
                 if not self.cut:
@@ -198,8 +201,14 @@ def execute(self, state):
         if self.node_config is not None:
             loader_kwargs = self.node_config.get("loader_kwargs", {})
 
-        loader = ChromiumLoader([source], headless=self.headless, **loader_kwargs)
-        document = loader.load()
+        if self.browser_base is not None:
+            data = browser_base_fetch(self.browser_base.get("api_key"),
+                                      self.browser_base.get("project_id"), [source])
+
+            document = [Document(page_content=content, metadata={"source": source}) for content in data]
+        else:
+            loader = ChromiumLoader([source], headless=self.headless, **loader_kwargs)
+            document = loader.load()
 
         if not document or not document[0].page_content.strip():
             raise ValueError("No HTML body content found in the document fetched by ChromiumLoader.")
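For readers skimming the hunks, here is a condensed, hypothetical sketch of the dispatch FetchNode.execute now performs for remote URLs — Browserbase when a browser_base config is present, ChromiumLoader otherwise. It is a simplification, not the node's actual code; the real method also handles local sources and markdown conversion, as the surrounding hunks show:

from typing import Optional

from langchain_core.documents import Document

from scrapegraphai.docloaders import ChromiumLoader
from scrapegraphai.docloaders.browser_base import browser_base_fetch


def fetch_documents(source: str, browser_base: Optional[dict] = None,
                    headless: bool = True, **loader_kwargs):
    # Remote fetch dispatch, mirroring the branch added in fetch_node.py.
    if browser_base is not None:
        data = browser_base_fetch(browser_base.get("api_key"),
                                  browser_base.get("project_id"), [source])
        return [Document(page_content=content, metadata={"source": source})
                for content in data]
    loader = ChromiumLoader([source], headless=headless, **loader_kwargs)
    return loader.load()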