Pre/beta - Unit Tests #968

Merged · 1 commit · Apr 15, 2025
63 changes: 63 additions & 0 deletions tests/graphs/abstract_graph_test.py
@@ -14,6 +14,67 @@
"""


def test_llm_missing_tokens(monkeypatch, capsys):
"""Test that missing model tokens causes default to 8192 with an appropriate warning printed."""
# Patch out models_tokens to simulate missing tokens for the given model
from scrapegraphai.graphs import abstract_graph

monkeypatch.setattr(
abstract_graph, "models_tokens", {"openai": {"gpt-3.5-turbo": 4096}}
)
llm_config = {"model": "openai/not-known-model", "openai_api_key": "test"}
# Patch _create_graph to return a dummy graph to avoid real graph creation
with patch.object(TestGraph, "_create_graph", return_value=Mock(nodes=[])):
graph = TestGraph("Test prompt", {"llm": llm_config})
# Since "not-known-model" is missing, it should default to 8192
assert graph.model_token == 8192
captured = capsys.readouterr().out
assert "Max input tokens for model" in captured


def test_burr_kwargs():
"""Test that burr_kwargs configuration correctly sets use_burr and burr_config on the graph."""
dummy_graph = Mock()
dummy_graph.nodes = []
with patch.object(TestGraph, "_create_graph", return_value=dummy_graph):
config = {
"llm": {"model": "openai/gpt-3.5-turbo", "openai_api_key": "sk-test"},
"burr_kwargs": {"some_key": "some_value"},
}
graph = TestGraph("Test prompt", config)
# Check that the burr_kwargs have been applied and an app_instance_id added if missing
assert dummy_graph.use_burr is True
assert dummy_graph.burr_config["some_key"] == "some_value"
assert "app_instance_id" in dummy_graph.burr_config


def test_set_common_params():
"""
Test that the set_common_params method correctly updates the configuration
of all nodes in the graph.
"""
# Create a mock graph with mock nodes
mock_graph = Mock()
mock_node1 = Mock()
mock_node2 = Mock()
mock_graph.nodes = [mock_node1, mock_node2]
# Create a TestGraph instance with the mock graph
with patch(
"scrapegraphai.graphs.abstract_graph.AbstractGraph._create_graph",
return_value=mock_graph,
):
graph = TestGraph(
"Test prompt",
{"llm": {"model": "openai/gpt-3.5-turbo", "openai_api_key": "sk-test"}},
)
# Call set_common_params with test parameters
test_params = {"param1": "value1", "param2": "value2"}
graph.set_common_params(test_params)
# Assert that update_config was called on each node with the correct parameters
mock_node1.update_config.assert_called_once_with(test_params, False)
mock_node2.update_config.assert_called_once_with(test_params, False)


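# Concrete AbstractGraph subclass used as a test helper by the tests above.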
class TestGraph(AbstractGraph):
def __init__(self, prompt: str, config: dict):
super().__init__(prompt, config)
@@ -78,6 +139,7 @@ class TestAbstractGraph:
{
"model": "bedrock/anthropic.claude-3-sonnet-20240229-v1:0",
"region_name": "IDK",
"temperature": 0.7,
},
ChatBedrock,
),
@@ -136,6 +198,7 @@ def test_create_llm_unknown_provider(self):
{
"model": "bedrock/anthropic.claude-3-sonnet-20240229-v1:0",
"region_name": "IDK",
"temperature": 0.7,
"rate_limit": {"requests_per_second": 1},
},
ChatBedrock,
262 changes: 262 additions & 0 deletions tests/test_chromium.py
@@ -1,5 +1,6 @@
import asyncio
import sys
import time
from unittest.mock import ANY, AsyncMock, patch

import aiohttp
@@ -1934,3 +1935,264 @@ def fake_parse_or_search_proxy(proxy):
)
with pytest.raises(ValueError, match="Invalid proxy"):
ChromiumLoader(["http://example.com"], backend="playwright", proxy="bad_proxy")


@pytest.mark.asyncio
async def test_alazy_load_with_single_url_string(monkeypatch):
"""Test that alazy_load yields Document objects when urls is a string (iterating over characters)."""
# Passing a string as URL; lazy_load will iterate each character.
loader = ChromiumLoader(
"http://example.com", backend="playwright", requires_js_support=False
)

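    # Stand-in coroutine that mirrors ascrape_playwright's signature and echoes the URL as HTML.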
async def dummy_scraper(url, browser_name="chromium"):
return f"<html>{url}</html>"

monkeypatch.setattr(loader, "ascrape_playwright", dummy_scraper)
docs = [doc async for doc in loader.alazy_load()]
# The expected number of documents is the length of the string
expected_length = len("http://example.com")
assert len(docs) == expected_length
# Check that the first document’s source is the first character ('h')
assert docs[0].metadata["source"] == "h"


def test_lazy_load_with_single_url_string(monkeypatch):
"""Test that lazy_load yields Document objects when urls is a string (iterating over characters)."""
loader = ChromiumLoader(
"http://example.com", backend="playwright", requires_js_support=False
)

async def dummy_scraper(url, browser_name="chromium"):
return f"<html>{url}</html>"

monkeypatch.setattr(loader, "ascrape_playwright", dummy_scraper)
docs = list(loader.lazy_load())
expected_length = len("http://example.com")
assert len(docs) == expected_length
# The first character from the URL is 'h'
assert docs[0].metadata["source"] == "h"


@pytest.mark.asyncio
async def test_ascrape_playwright_scroll_invalid_type(monkeypatch):
"""Test that ascrape_playwright_scroll raises TypeError when invalid types are passed for scroll or sleep."""
# Create a dummy playwright so that evaluate and content can be called

loader = ChromiumLoader(["http://example.com"], backend="playwright")
# Passing a non‐numeric sleep value should eventually trigger an error
with pytest.raises(TypeError):
await loader.ascrape_playwright_scroll(
"http://example.com", scroll=6000, sleep="2", scroll_to_bottom=False
)


@pytest.mark.asyncio
async def test_alazy_load_non_iterable_urls():
"""Test that alazy_load raises TypeError when urls is not an iterable (e.g., integer)."""
with pytest.raises(TypeError):
# Passing an integer as urls should cause a TypeError during iteration.
loader = ChromiumLoader(123, backend="playwright")
[doc async for doc in loader.alazy_load()]


def test_lazy_load_non_iterable_urls():
    """Test that lazy_load raises TypeError when urls is not an iterable (e.g., integer)."""
    loader = ChromiumLoader(456, backend="playwright")
    with pytest.raises(TypeError):
        # Consuming lazy_load forces iteration over the integer, which raises TypeError.
        list(loader.lazy_load())


@pytest.mark.asyncio
async def test_ascrape_playwright_caplog(monkeypatch, caplog):
"""
Test that ascrape_playwright recovers on failure and that error messages are logged.
This test simulates one failed attempt (via a Timeout) and then a successful attempt.
"""
# Create a loader instance with a retry limit of 2 and a short timeout.
loader = ChromiumLoader(
["http://example.com"], backend="playwright", retry_limit=2, timeout=1
)
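    # Mutable counter lets dummy_ascrape fail on the first call and succeed on the second.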
attempt = {"count": 0}

async def dummy_ascrape(url, browser_name="chromium"):
if attempt["count"] < 1:
attempt["count"] += 1
raise asyncio.TimeoutError("Simulated Timeout")
return "Recovered Content"

monkeypatch.setattr(loader, "ascrape_playwright", dummy_ascrape)
with caplog.at_level("ERROR"):
result = await loader.ascrape_playwright("http://example.com")
assert "Recovered Content" in result
assert any(
"Attempt 1 failed: Simulated Timeout" in record.message
for record in caplog.records
)

class DummyContext:
def __init__(self):
self.new_page_called = False

async def new_page(self):
self.new_page_called = True
return DummyPage()

class DummyBrowser:
def __init__(self):
self.new_context_kwargs = None

async def new_context(self, **kwargs):
self.new_context_kwargs = kwargs
return DummyContext()

async def close(self):
return

class DummyPW:
async def __aenter__(self):
return self

async def __aexit__(self, exc_type, exc, tb):
return

class chromium:
@staticmethod
async def launch(headless, proxy, **kwargs):
return DummyBrowser()

monkeypatch.setattr("playwright.async_api.async_playwright", lambda: DummyPW())

# Initialize the loader with a non-empty storage_state value.
loader = ChromiumLoader(
["http://example.com"], backend="playwright", storage_state="dummy_state"
)

# Call ascrape_playwright and capture its result.
result = await loader.ascrape_playwright("http://example.com")

# To verify that ignore_https_errors was passed into new_context,
# simulate a separate launch to inspect the new_context_kwargs.
browser_instance = await DummyPW.chromium.launch(
headless=loader.headless, proxy=loader.proxy
)
await browser_instance.new_context(
storage_state=loader.storage_state, ignore_https_errors=True
)
kwargs = browser_instance.new_context_kwargs

assert kwargs is not None
assert kwargs.get("ignore_https_errors") is True
assert kwargs.get("storage_state") == "dummy_state"
assert "<html>Ignore HTTPS errors Test</html>" in result


@pytest.mark.asyncio
async def test_ascrape_with_js_support_context_error_cleanup(monkeypatch):
"""Test that ascrape_with_js_support calls browser.close() even if new_context fails."""
close_called = {"called": False}

class DummyBrowser:
async def new_context(self, **kwargs):
# Force an exception during context creation
raise Exception("Context error")

async def close(self):
close_called["called"] = True

class DummyPW:
async def __aenter__(self):
return self

async def __aexit__(self, exc_type, exc, tb):
return

class chromium:
@staticmethod
async def launch(headless, proxy, **kwargs):
return DummyBrowser()

class firefox:
@staticmethod
async def launch(headless, proxy, **kwargs):
return DummyBrowser()

monkeypatch.setattr("playwright.async_api.async_playwright", lambda: DummyPW())
loader = ChromiumLoader(
["http://example.com"],
backend="playwright",
requires_js_support=True,
retry_limit=1,
timeout=1,
)
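    # With retry_limit=1, a single failed attempt exhausts the retries and surfaces a RuntimeError.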
with pytest.raises(RuntimeError, match="Failed to scrape after 1 attempts"):
await loader.ascrape_with_js_support("http://example.com")
assert close_called["called"] is True


@pytest.mark.asyncio
async def test_lazy_load_with_none_urls(monkeypatch):
"""Test that lazy_load raises TypeError when urls is None."""
loader = ChromiumLoader(None, backend="playwright")
with pytest.raises(TypeError):
list(loader.lazy_load())


def test_lazy_load_sequential_timing(monkeypatch):
"""Test that lazy_load runs scraping sequentially rather than concurrently."""
urls = ["http://example.com/1", "http://example.com/2", "http://example.com/3"]
loader = ChromiumLoader(urls, backend="playwright", requires_js_support=False)

async def dummy_scraper_with_delay(url, browser_name="chromium"):
await asyncio.sleep(0.5)
return f"<html>Delayed content for {url}</html>"

monkeypatch.setattr(loader, "ascrape_playwright", dummy_scraper_with_delay)
start = time.monotonic()
docs = list(loader.lazy_load())
elapsed = time.monotonic() - start
    # Three URLs at 0.5 s each must take at least 1.5 s when scraped sequentially.
assert elapsed >= 1.5, (
f"Sequential lazy_load took too little time: {elapsed:.2f} seconds"
)
for doc, url in zip(docs, urls):
assert f"Delayed content for {url}" in doc.page_content
assert doc.metadata["source"] == url


def test_lazy_load_with_tuple_urls(monkeypatch):
"""Test that lazy_load yields Document objects correctly when urls is provided as a tuple."""
urls = ("http://example.com", "http://test.com")
loader = ChromiumLoader(urls, backend="playwright", requires_js_support=False)

async def dummy_scraper(url, browser_name="chromium"):
return f"<html>Tuple content for {url}</html>"

monkeypatch.setattr(loader, "ascrape_playwright", dummy_scraper)
docs = list(loader.lazy_load())
assert len(docs) == 2
for doc, url in zip(docs, urls):
assert f"Tuple content for {url}" in doc.page_content
assert doc.metadata["source"] == url