
Commit 8e3d5de

add screenshot scraper
1 parent 8b8d8f0 commit 8e3d5de

File tree

7 files changed, +248 -4 lines changed


examples/openai/screenshot_scraper.py

Lines changed: 38 additions & 0 deletions
@@ -0,0 +1,38 @@
"""
Basic example of scraping pipeline using ScreenshotScraperGraph
"""

import os
import json
from dotenv import load_dotenv
from scrapegraphai.graphs import ScreenshotScraperGraph
from scrapegraphai.utils import prettify_exec_info

load_dotenv()

# ************************************************
# Define the configuration for the graph
# ************************************************

graph_config = {
    "llm": {
        "api_key": os.getenv("OPENAI_API_KEY"),
        "model": "gpt-4o",
    },
    "verbose": True,
    "headless": False,
}

# ************************************************
# Create the ScreenshotScraperGraph instance and run it
# ************************************************

screenshot_scraper_graph = ScreenshotScraperGraph(
    prompt="List me the email of the company",
    source="https://scrapegraphai.com/",
    config=graph_config
)

result = screenshot_scraper_graph.run()
print(json.dumps(result, indent=4))
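
Running this example assumes an OPENAI_API_KEY entry in a local .env file (picked up by load_dotenv()) and a Playwright-managed Chromium build, since the FetchScreenNode introduced further down drives the page through playwright.sync_api. The value printed at the end is whatever ScreenshotScraperGraph.run() finds under the answer key of the final state; given the nodes wired up in this commit, a sketch of its shape, with purely illustrative values:

# Illustrative shape of `result`; the actual text depends on the page and the model.
[
    {"analysis": "The contact email shown in the first screenshot is ..."},
    {"analysis": "No additional email addresses are visible in the second screenshot."}
]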

examples/openai/smart_scraper_openai.py

Lines changed: 5 additions & 3 deletions
@@ -2,10 +2,12 @@
 Basic example of scraping pipeline using SmartScraper
 """
 
-import os, json
+import os
+import json
+from dotenv import load_dotenv
 from scrapegraphai.graphs import SmartScraperGraph
 from scrapegraphai.utils import prettify_exec_info
-from dotenv import load_dotenv
+
 load_dotenv()
 
 # ************************************************
@@ -16,7 +18,7 @@
 graph_config = {
     "llm": {
         "api_key": os.getenv("OPENAI_API_KEY"),
-        "model": "gpt-3.5-turbo",
+        "model": "gpt-4o",
     },
     "verbose": True,
     "headless": False,

scrapegraphai/graphs/__init__.py

Lines changed: 1 addition & 0 deletions
@@ -24,3 +24,4 @@
 from .markdown_scraper_graph import MDScraperGraph
 from .markdown_scraper_multi_graph import MDScraperMultiGraph
 from .search_link_graph import SearchLinkGraph
+from .screenshot_scraper_graph import ScreenshotScraperGraph
scrapegraphai/graphs/screenshot_scraper_graph.py

Lines changed: 71 additions & 0 deletions
@@ -0,0 +1,71 @@
"""
ScreenshotScraperGraph Module
"""

from typing import Optional
import logging
from pydantic import BaseModel
from .base_graph import BaseGraph
from .abstract_graph import AbstractGraph

from ..nodes import (
    FetchScreenNode,
    GenerateAnswerFromImageNode,
)

class ScreenshotScraperGraph(AbstractGraph):
    """
    ScreenshotScraperGraph is a scraping pipeline that captures screenshots of the target
    page and asks a vision-capable LLM to answer the user prompt from those screenshots.

    Example:
        >>> screenshot_scraper = ScreenshotScraperGraph(
        ...     prompt="List me the email of the company",
        ...     source="https://scrapegraphai.com/",
        ...     config=graph_config
        ... )
        >>> result = screenshot_scraper.run()
    """

    def __init__(self, prompt: str, source: str, config: dict, schema: Optional[BaseModel] = None):
        super().__init__(prompt, config, source, schema)

    def _create_graph(self) -> BaseGraph:
        """
        Creates the graph of nodes representing the workflow for web scraping.

        Returns:
            BaseGraph: A graph instance representing the web scraping workflow.
        """
        fetch_screen_node = FetchScreenNode(
            input="url",
            output=["imgs"],
            node_config={
                "link": self.source
            }
        )
        generate_answer_from_image_node = GenerateAnswerFromImageNode(
            input="doc",
            output=["parsed_doc"],
            node_config={
                "config": self.config
            }
        )

        return BaseGraph(
            nodes=[
                fetch_screen_node,
                generate_answer_from_image_node,
            ],
            edges=[
                (fetch_screen_node, generate_answer_from_image_node),
            ],
            entry_point=fetch_screen_node,
            graph_name=self.__class__.__name__
        )

    def run(self) -> str:
        """
        Executes the scraping process and returns the answer to the prompt.

        Returns:
            str: The answer to the prompt.
        """
        inputs = {"user_prompt": self.prompt}
        self.final_state, self.execution_info = self.graph.execute(inputs)

        return self.final_state.get("answer", "No answer found.")
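
For orientation, the only state keys the two nodes below actually read or write are user_prompt, link, screenshots and answer; a sketch of how the state evolves as graph.execute() walks the two-node chain (values are illustrative):

# State flow through the graph (illustrative values):
state = {"user_prompt": "List me the email of the company"}   # passed to graph.execute()
# after FetchScreenNode.execute():
#   state["link"]        == "https://scrapegraphai.com/"
#   state["screenshots"] == [b"...png bytes, top of page...", b"...png bytes, one viewport down..."]
# after GenerateAnswerFromImageNode.execute():
#   state["answer"]      == [{"analysis": "..."}, {"analysis": "..."}]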

scrapegraphai/nodes/__init__.py

Lines changed: 3 additions & 1 deletion
@@ -19,4 +19,6 @@
 from .graph_iterator_node import GraphIteratorNode
 from .merge_answers_node import MergeAnswersNode
 from .generate_answer_omni_node import GenerateAnswerOmniNode
-from .merge_generated_scripts import MergeGeneratedScriptsNode
+from .merge_generated_scripts import MergeGeneratedScriptsNode
+from .fetch_screen_node import FetchScreenNode
+from .generate_answer_from_image_node import GenerateAnswerFromImageNode
scrapegraphai/nodes/fetch_screen_node.py

Lines changed: 56 additions & 0 deletions
@@ -0,0 +1,56 @@
from typing import List, Optional
from playwright.sync_api import sync_playwright
from .base_node import BaseNode

class FetchScreenNode(BaseNode):
    """
    FetchScreenNode captures screenshots from a given URL and stores the image data as bytes.
    """

    def __init__(
        self,
        input: str,
        output: List[str],
        node_config: Optional[dict] = None,
        node_name: str = "FetchScreenNode",
    ):
        super().__init__(node_name, "node", input, output, 2, node_config)
        self.url = node_config.get("link")

    def execute(self, state: dict) -> dict:
        """Captures screenshots from the input URL and stores them in the state dictionary as bytes."""

        screenshots = []

        with sync_playwright() as p:
            browser = p.chromium.launch()
            page = browser.new_page()
            page.goto(self.url)

            viewport_height = page.viewport_size["height"]

            # Initialize screenshot counter
            screenshot_counter = 1

            # List to keep track of screenshot data
            screenshot_data_list = []

            # Function to capture a screenshot at a given scroll position
            def capture_screenshot(scroll_position, counter):
                page.evaluate(f"window.scrollTo(0, {scroll_position});")
                screenshot_data = page.screenshot()
                screenshot_data_list.append(screenshot_data)

            # Capture screenshots of the first two viewport heights
            capture_screenshot(0, screenshot_counter)  # First screenshot
            screenshot_counter += 1
            capture_screenshot(viewport_height, screenshot_counter)  # Second screenshot

            browser.close()

        # Store screenshot data as bytes in the state dictionary
        for screenshot_data in screenshot_data_list:
            screenshots.append(screenshot_data)

        state["link"] = self.url
        state["screenshots"] = screenshots
        return state
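
The node can also be exercised outside the graph, since it only needs the link entry of its node_config; a minimal sketch, assuming Playwright's Chromium build is installed and using a placeholder URL and filename:

# Standalone use of FetchScreenNode (illustrative; in the pipeline the graph drives it).
from scrapegraphai.nodes import FetchScreenNode

node = FetchScreenNode(
    input="url",
    output=["screenshots"],
    node_config={"link": "https://example.com"},  # placeholder URL
)
state = node.execute({})  # starting from an empty state
print(len(state["screenshots"]), "screenshots captured as PNG bytes")

# Optionally dump the first capture to disk to inspect it:
with open("first_viewport.png", "wb") as f:  # hypothetical filename
    f.write(state["screenshots"][0])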
scrapegraphai/nodes/generate_answer_from_image_node.py

Lines changed: 74 additions & 0 deletions
@@ -0,0 +1,74 @@
from typing import List, Optional
import base64
import requests
from .base_node import BaseNode

class GenerateAnswerFromImageNode(BaseNode):
    """
    GenerateAnswerFromImageNode analyzes images from the state dictionary using the OpenAI API
    and updates the state with the generated answers.
    """

    def __init__(
        self,
        input: str,
        output: List[str],
        node_config: Optional[dict] = None,
        node_name: str = "GenerateAnswerFromImageNode",
    ):
        super().__init__(node_name, "node", input, output, 2, node_config)

    def execute(self, state: dict) -> dict:
        """Processes images from the state, generates answers, and updates the state."""
        # Retrieve the image data from the state dictionary
        images = state.get('screenshots', [])
        results = []

        # OpenAI API key, taken from the graph configuration
        api_key = self.node_config.get("config").get("llm").get("api_key")

        for image_data in images:
            # Encode the image data to base64
            base64_image = base64.b64encode(image_data).decode('utf-8')

            # Prepare API request
            headers = {
                "Content-Type": "application/json",
                "Authorization": f"Bearer {api_key}"
            }

            payload = {
                "model": "gpt-4o-mini",
                "messages": [
                    {
                        "role": "user",
                        "content": [
                            {
                                "type": "text",
                                "text": state.get("user_prompt", "Extract information from the image")
                            },
                            {
                                "type": "image_url",
                                "image_url": {
                                    "url": f"data:image/jpeg;base64,{base64_image}"
                                }
                            }
                        ]
                    }
                ],
                "max_tokens": 300
            }

            # Make the API request
            response = requests.post("https://api.openai.com/v1/chat/completions",
                                     headers=headers, json=payload)
            result = response.json()

            # Extract the response text
            response_text = result.get('choices', [{}])[0].get('message', {}).get('content', 'No response')

            # Append the result to the results list
            results.append({
                "analysis": response_text
            })

        # Update the state dictionary with the results
        state['answer'] = results
        return state
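
Likewise, this node can be tried in isolation by handing it pre-captured image bytes; a minimal sketch, where page.png and the .env-based key loading are placeholder assumptions:

# Standalone use of GenerateAnswerFromImageNode (illustrative).
import os
from dotenv import load_dotenv
from scrapegraphai.nodes import GenerateAnswerFromImageNode

load_dotenv()

with open("page.png", "rb") as f:  # hypothetical screenshot on disk
    image_bytes = f.read()

node = GenerateAnswerFromImageNode(
    input="screenshots",
    output=["answer"],
    node_config={"config": {"llm": {"api_key": os.getenv("OPENAI_API_KEY")}}},
)
state = node.execute({
    "user_prompt": "List me the email of the company",
    "screenshots": [image_bytes],
})
print(state["answer"])  # -> [{"analysis": "..."}]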
