Pre/beta - Unit Tests #968

Merged · 1 commit · Apr 15, 2025
63 changes: 63 additions & 0 deletions tests/graphs/abstract_graph_test.py
@@ -14,6 +14,67 @@
"""


def test_llm_missing_tokens(monkeypatch, capsys):
"""Test that missing model tokens causes default to 8192 with an appropriate warning printed."""
# Patch out models_tokens to simulate missing tokens for the given model
from scrapegraphai.graphs import abstract_graph

monkeypatch.setattr(
abstract_graph, "models_tokens", {"openai": {"gpt-3.5-turbo": 4096}}
)
llm_config = {"model": "openai/not-known-model", "openai_api_key": "test"}
# Patch _create_graph to return a dummy graph to avoid real graph creation
with patch.object(TestGraph, "_create_graph", return_value=Mock(nodes=[])):
graph = TestGraph("Test prompt", {"llm": llm_config})
# Since "not-known-model" is missing, it should default to 8192
assert graph.model_token == 8192
captured = capsys.readouterr().out
assert "Max input tokens for model" in captured


def test_burr_kwargs():
"""Test that burr_kwargs configuration correctly sets use_burr and burr_config on the graph."""
dummy_graph = Mock()
dummy_graph.nodes = []
with patch.object(TestGraph, "_create_graph", return_value=dummy_graph):
config = {
"llm": {"model": "openai/gpt-3.5-turbo", "openai_api_key": "sk-test"},
"burr_kwargs": {"some_key": "some_value"},
}
graph = TestGraph("Test prompt", config)
# Check that the burr_kwargs have been applied and an app_instance_id added if missing
assert dummy_graph.use_burr is True
assert dummy_graph.burr_config["some_key"] == "some_value"
assert "app_instance_id" in dummy_graph.burr_config


def test_set_common_params():
"""
Test that the set_common_params method correctly updates the configuration
of all nodes in the graph.
"""
# Create a mock graph with mock nodes
mock_graph = Mock()
mock_node1 = Mock()
mock_node2 = Mock()
mock_graph.nodes = [mock_node1, mock_node2]
# Create a TestGraph instance with the mock graph
with patch(
"scrapegraphai.graphs.abstract_graph.AbstractGraph._create_graph",
return_value=mock_graph,
):
graph = TestGraph(
"Test prompt",
{"llm": {"model": "openai/gpt-3.5-turbo", "openai_api_key": "sk-test"}},
)
# Call set_common_params with test parameters
test_params = {"param1": "value1", "param2": "value2"}
graph.set_common_params(test_params)
# Assert that update_config was called on each node with the correct parameters
mock_node1.update_config.assert_called_once_with(test_params, False)
mock_node2.update_config.assert_called_once_with(test_params, False)


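# Concrete AbstractGraph subclass used as a test helper by the tests above.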
class TestGraph(AbstractGraph):
def __init__(self, prompt: str, config: dict):
super().__init__(prompt, config)
@@ -78,6 +139,7 @@ class TestAbstractGraph:
{
"model": "bedrock/anthropic.claude-3-sonnet-20240229-v1:0",
"region_name": "IDK",
"temperature": 0.7,
},
ChatBedrock,
),
@@ -136,6 +198,7 @@ def test_create_llm_unknown_provider(self):
{
"model": "bedrock/anthropic.claude-3-sonnet-20240229-v1:0",
"region_name": "IDK",
"temperature": 0.7,
"rate_limit": {"requests_per_second": 1},
},
ChatBedrock,
262 changes: 262 additions & 0 deletions tests/test_chromium.py
@@ -1,5 +1,6 @@
import asyncio
import sys
import time
from unittest.mock import ANY, AsyncMock, patch

import aiohttp
@@ -1934,3 +1935,264 @@ def fake_parse_or_search_proxy(proxy):
)
with pytest.raises(ValueError, match="Invalid proxy"):
ChromiumLoader(["http://example.com"], backend="playwright", proxy="bad_proxy")


@pytest.mark.asyncio
async def test_alazy_load_with_single_url_string(monkeypatch):
"""Test that alazy_load yields Document objects when urls is a string (iterating over characters)."""
# Passing a string as URL; lazy_load will iterate each character.
loader = ChromiumLoader(
"http://example.com", backend="playwright", requires_js_support=False
)

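    # Stand-in coroutine that mirrors ascrape_playwright's signature and echoes the URL as HTML.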
async def dummy_scraper(url, browser_name="chromium"):
return f"<html>{url}</html>"

monkeypatch.setattr(loader, "ascrape_playwright", dummy_scraper)
docs = [doc async for doc in loader.alazy_load()]
# The expected number of documents is the length of the string
expected_length = len("http://example.com")
assert len(docs) == expected_length
# Check that the first document’s source is the first character ('h')
assert docs[0].metadata["source"] == "h"


def test_lazy_load_with_single_url_string(monkeypatch):
"""Test that lazy_load yields Document objects when urls is a string (iterating over characters)."""
loader = ChromiumLoader(
"http://example.com", backend="playwright", requires_js_support=False
)

async def dummy_scraper(url, browser_name="chromium"):
return f"<html>{url}</html>"

monkeypatch.setattr(loader, "ascrape_playwright", dummy_scraper)
docs = list(loader.lazy_load())
expected_length = len("http://example.com")
assert len(docs) == expected_length
# The first character from the URL is 'h'
assert docs[0].metadata["source"] == "h"


@pytest.mark.asyncio
async def test_ascrape_playwright_scroll_invalid_type(monkeypatch):
"""Test that ascrape_playwright_scroll raises TypeError when invalid types are passed for scroll or sleep."""
# Create a dummy playwright so that evaluate and content can be called

loader = ChromiumLoader(["http://example.com"], backend="playwright")
# Passing a non‐numeric sleep value should eventually trigger an error
with pytest.raises(TypeError):
await loader.ascrape_playwright_scroll(
"http://example.com", scroll=6000, sleep="2", scroll_to_bottom=False
)


@pytest.mark.asyncio
async def test_alazy_load_non_iterable_urls():
"""Test that alazy_load raises TypeError when urls is not an iterable (e.g., integer)."""
with pytest.raises(TypeError):
# Passing an integer as urls should cause a TypeError during iteration.
loader = ChromiumLoader(123, backend="playwright")
[doc async for doc in loader.alazy_load()]


def test_lazy_load_non_iterable_urls():
    """Test that lazy_load raises TypeError when urls is not an iterable (e.g., integer)."""
    loader = ChromiumLoader(456, backend="playwright")
    with pytest.raises(TypeError):
        # Consuming lazy_load forces iteration over the integer, which raises TypeError.
        list(loader.lazy_load())


@pytest.mark.asyncio
async def test_ascrape_playwright_caplog(monkeypatch, caplog):
"""
Test that ascrape_playwright recovers on failure and that error messages are logged.
This test simulates one failed attempt (via a Timeout) and then a successful attempt.
"""
# Create a loader instance with a retry limit of 2 and a short timeout.
loader = ChromiumLoader(
["http://example.com"], backend="playwright", retry_limit=2, timeout=1
)
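    # Mutable counter lets dummy_ascrape fail on the first call and succeed on the second.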
attempt = {"count": 0}

async def dummy_ascrape(url, browser_name="chromium"):
if attempt["count"] < 1:
attempt["count"] += 1
raise asyncio.TimeoutError("Simulated Timeout")
return "Recovered Content"

monkeypatch.setattr(loader, "ascrape_playwright", dummy_ascrape)
with caplog.at_level("ERROR"):
result = await loader.ascrape_playwright("http://example.com")
assert "Recovered Content" in result
assert any(
"Attempt 1 failed: Simulated Timeout" in record.message
for record in caplog.records
)

class DummyContext:
def __init__(self):
self.new_page_called = False

async def new_page(self):
self.new_page_called = True
return DummyPage()

class DummyBrowser:
def __init__(self):
self.new_context_kwargs = None

async def new_context(self, **kwargs):
self.new_context_kwargs = kwargs
return DummyContext()

async def close(self):
return

class DummyPW:
async def __aenter__(self):
return self

async def __aexit__(self, exc_type, exc, tb):
return

class chromium:
@staticmethod
async def launch(headless, proxy, **kwargs):
return DummyBrowser()

monkeypatch.setattr("playwright.async_api.async_playwright", lambda: DummyPW())

# Initialize the loader with a non-empty storage_state value.
loader = ChromiumLoader(
["http://example.com"], backend="playwright", storage_state="dummy_state"
)

# Call ascrape_playwright and capture its result.
result = await loader.ascrape_playwright("http://example.com")

# To verify that ignore_https_errors was passed into new_context,
# simulate a separate launch to inspect the new_context_kwargs.
browser_instance = await DummyPW.chromium.launch(
headless=loader.headless, proxy=loader.proxy
)
await browser_instance.new_context(
storage_state=loader.storage_state, ignore_https_errors=True
)
kwargs = browser_instance.new_context_kwargs

assert kwargs is not None
assert kwargs.get("ignore_https_errors") is True
assert kwargs.get("storage_state") == "dummy_state"
assert "<html>Ignore HTTPS errors Test</html>" in result


@pytest.mark.asyncio
async def test_ascrape_with_js_support_context_error_cleanup(monkeypatch):
"""Test that ascrape_with_js_support calls browser.close() even if new_context fails."""
close_called = {"called": False}

class DummyBrowser:
async def new_context(self, **kwargs):
# Force an exception during context creation
raise Exception("Context error")

async def close(self):
close_called["called"] = True

class DummyPW:
async def __aenter__(self):
return self

async def __aexit__(self, exc_type, exc, tb):
return

class chromium:
@staticmethod
async def launch(headless, proxy, **kwargs):
return DummyBrowser()

class firefox:
@staticmethod
async def launch(headless, proxy, **kwargs):
return DummyBrowser()

monkeypatch.setattr("playwright.async_api.async_playwright", lambda: DummyPW())
loader = ChromiumLoader(
["http://example.com"],
backend="playwright",
requires_js_support=True,
retry_limit=1,
timeout=1,
)
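    # With retry_limit=1, a single failed attempt exhausts the retries and surfaces a RuntimeError.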
with pytest.raises(RuntimeError, match="Failed to scrape after 1 attempts"):
await loader.ascrape_with_js_support("http://example.com")
assert close_called["called"] is True


@pytest.mark.asyncio
async def test_lazy_load_with_none_urls(monkeypatch):
"""Test that lazy_load raises TypeError when urls is None."""
loader = ChromiumLoader(None, backend="playwright")
with pytest.raises(TypeError):
list(loader.lazy_load())


def test_lazy_load_sequential_timing(monkeypatch):
"""Test that lazy_load runs scraping sequentially rather than concurrently."""
urls = ["http://example.com/1", "http://example.com/2", "http://example.com/3"]
loader = ChromiumLoader(urls, backend="playwright", requires_js_support=False)

async def dummy_scraper_with_delay(url, browser_name="chromium"):
await asyncio.sleep(0.5)
return f"<html>Delayed content for {url}</html>"

monkeypatch.setattr(loader, "ascrape_playwright", dummy_scraper_with_delay)
start = time.monotonic()
docs = list(loader.lazy_load())
elapsed = time.monotonic() - start
    # Three URLs at 0.5 s each must take at least 1.5 s when scraped sequentially.
assert elapsed >= 1.5, (
f"Sequential lazy_load took too little time: {elapsed:.2f} seconds"
)
for doc, url in zip(docs, urls):
assert f"Delayed content for {url}" in doc.page_content
assert doc.metadata["source"] == url


def test_lazy_load_with_tuple_urls(monkeypatch):
"""Test that lazy_load yields Document objects correctly when urls is provided as a tuple."""
urls = ("http://example.com", "http://test.com")
loader = ChromiumLoader(urls, backend="playwright", requires_js_support=False)

async def dummy_scraper(url, browser_name="chromium"):
return f"<html>Tuple content for {url}</html>"

monkeypatch.setattr(loader, "ascrape_playwright", dummy_scraper)
docs = list(loader.lazy_load())
assert len(docs) == 2
for doc, url in zip(docs, urls):
assert f"Tuple content for {url}" in doc.page_content
assert doc.metadata["source"] == url