From bb16aa13a6ec8f42b85b621570f5fae55d788625 Mon Sep 17 00:00:00 2001
From: CodeBeaver
Date: Tue, 15 Apr 2025 10:37:49 +0000
Subject: [PATCH] codebeaver/pre/beta-963 - .

---
 tests/test_chromium.py                | 532 ++++++++++++++++++++++++++
 tests/test_csv_scraper_multi_graph.py | 161 ++++++++
 2 files changed, 693 insertions(+)

diff --git a/tests/test_chromium.py b/tests/test_chromium.py
index e1ffa126..e6d79b52 100644
--- a/tests/test_chromium.py
+++ b/tests/test_chromium.py
@@ -1402,3 +1402,535 @@ async def dummy_playwright(url, browser_name="chromium"):
     monkeypatch.setattr(loader, "ascrape_playwright", dummy_playwright)
     result = await loader.scrape("http://example.com")
     assert "JS flag content" in result
+
+
+@pytest.mark.asyncio
+async def test_ascrape_playwright_calls_apply_stealth(monkeypatch):
+    """Test that ascrape_playwright calls Malenia.apply_stealth on the browser context."""
+    flag = {"applied": False}
+
+    async def dummy_apply_stealth(context):
+        flag["applied"] = True
+
+    monkeypatch.setattr(
+        "scrapegraphai.docloaders.chromium.Malenia.apply_stealth", dummy_apply_stealth
+    )
+
+    class DummyPage:
+        async def goto(self, url, wait_until):
+            return
+
+        async def wait_for_load_state(self, state):
+            return
+
+        async def content(self):
+            return "Stealth Applied Content"
+
+    class DummyContext:
+        async def new_page(self):
+            return DummyPage()
+
+    class DummyBrowser:
+        async def new_context(self, **kwargs):
+            return DummyContext()
+
+        async def close(self):
+            return
+
+    class DummyPW:
+        async def __aenter__(self):
+            return self
+
+        async def __aexit__(self, exc_type, exc, tb):
+            return
+
+        class chromium:
+            @staticmethod
+            async def launch(headless, proxy, **kwargs):
+                return DummyBrowser()
+
+        class firefox:
+            @staticmethod
+            async def launch(headless, proxy, **kwargs):
+                return DummyBrowser()
+
+    monkeypatch.setattr("playwright.async_api.async_playwright", lambda: DummyPW())
+
+    loader = ChromiumLoader(["http://example.com"], backend="playwright")
+    result = await loader.ascrape_playwright("http://example.com")
+    assert flag["applied"] is True
+    assert "Stealth Applied Content" in result
+
+
+@pytest.mark.asyncio
+async def test_lazy_load_non_string_scraper(monkeypatch):
+    """Test that lazy_load yields Document objects even if the scraping function returns a non-string value."""
+    urls = ["http://example.com"]
+    loader = ChromiumLoader(urls, backend="playwright", requires_js_support=False)
+
+    async def dummy_non_string(url):
+        # Return an integer instead of an HTML string
+        return 12345
+
+    monkeypatch.setattr(loader, "ascrape_playwright", dummy_non_string)
+    docs = list(loader.lazy_load())
+    # Check that we get one Document and its page_content is the non-string value returned by the scraper
+    from langchain_core.documents import Document
+
+    assert len(docs) == 1
+    for doc in docs:
+        assert isinstance(doc, Document)
+        assert doc.page_content == 12345
+        assert doc.metadata["source"] in urls
+
+
+@pytest.mark.asyncio
+async def test_alazy_load_non_string_scraper(monkeypatch):
+    """Test that alazy_load yields Document objects with a non-string page_content when the JS scraping function returns a non-string value."""
+    urls = ["http://nonstring.com"]
+    # Instantiate loader with requires_js_support True so that alazy_load calls ascrape_with_js_support
+    loader = ChromiumLoader(urls, backend="playwright", requires_js_support=True)
+
+    # Define a dummy scraper that returns an integer (non-string)
+    async def dummy_non_string(url, browser_name="chromium"):
+        return 54321
+
+    monkeypatch.setattr(loader, "ascrape_with_js_support", dummy_non_string)
+    docs = [doc async for doc in loader.alazy_load()]
+    from langchain_core.documents import Document
+
+    assert len(docs) == 1
+    assert isinstance(docs[0], Document)
+    assert docs[0].page_content == 54321
+    assert docs[0].metadata["source"] == "http://nonstring.com"
+
+
+@pytest.mark.asyncio
+async def test_ascrape_playwright_scroll_timeout_none(monkeypatch, mock_playwright):
+    """Test ascrape_playwright_scroll when timeout is None and scroll_to_bottom is True.
+    The test uses a dummy page.evaluate sequence to simulate an increasing, then constant, page height.
+    """
+    mock_pw, mock_browser, mock_context, mock_page = mock_playwright
+    # Simulate a first scroll height of 1000, then 2000, then a constant height (2000)
+    mock_page.evaluate.side_effect = [1000, 2000, 2000, 2000, 2000]
+    # When scrolling is done the final content is returned
+    mock_page.content.return_value = "Timeout None Content"
+    loader = ChromiumLoader(["http://example.com"], backend="playwright")
+    result = await loader.ascrape_playwright_scroll(
+        "http://example.com",
+        timeout=None,
+        scroll=6000,
+        sleep=0.1,
+        scroll_to_bottom=True,
+    )
+    assert "timeout none content" in result.lower()
+
+
+@pytest.mark.asyncio
+async def test_ascrape_with_js_support_browser_error_cleanup(monkeypatch):
+    """Test ascrape_with_js_support to ensure that browser.close() is always called even if an exception occurs.
+    This simulates a navigation error and checks that on exception the browser is properly closed.
+    """
+    close_called = {"called": False}
+
+    class DummyPage:
+        async def goto(self, url, wait_until):
+            raise aiohttp.ClientError("Navigation error")
+
+        async def wait_for_load_state(self, state):
+            return
+
+        async def content(self):
+            return "Error Content"
+
+    class DummyContext:
+        async def new_page(self):
+            return DummyPage()
+
+    class DummyBrowser:
+        async def new_context(self, **kwargs):
+            return DummyContext()
+
+        async def close(self):
+            close_called["called"] = True
+
+    class DummyPW:
+        async def __aenter__(self):
+            return self
+
+        async def __aexit__(self, exc_type, exc, tb):
+            return
+
+        class chromium:
+            @staticmethod
+            async def launch(headless, proxy, **kwargs):
+                return DummyBrowser()
+
+        class firefox:
+            @staticmethod
+            async def launch(headless, proxy, **kwargs):
+                return DummyBrowser()
+
+    monkeypatch.setattr("playwright.async_api.async_playwright", lambda: DummyPW())
+    loader = ChromiumLoader(
+        ["http://example.com"],
+        backend="playwright",
+        requires_js_support=True,
+        retry_limit=1,
+        timeout=1,
+    )
+    with pytest.raises(RuntimeError):
+        await loader.ascrape_with_js_support("http://example.com")
+    assert close_called["called"] is True
+
+
+def dummy_non_async_scraper(url):
+    """A dummy scraper function that is not asynchronous."""
+    return "non-async result"
+
+
+def test_lazy_load_with_non_async_scraper(monkeypatch, loader_with_dummy):
+    """Test that lazy_load raises a ValueError when a non-async function is used as the scraper.
+    Using a non-async function in place of an async scraper should lead to a ValueError.
+ """ + monkeypatch.setattr( + loader_with_dummy, "ascrape_playwright", dummy_non_async_scraper + ) + with pytest.raises( + ValueError, match="a coroutine was expected, got 'non-async result'" + ): + list(loader_with_dummy.lazy_load()) + + +@pytest.mark.asyncio +async def test_ascrape_playwright_stealth_exception_cleanup(monkeypatch): + """Test that ascrape_playwright calls browser.close() even if Malenia.apply_stealth fails.""" + fail_flag = {"closed": False} + + class DummyPage: + async def goto(self, url, wait_until): + return + + async def wait_for_load_state(self, state): + return + + async def content(self): + return "Content" + + class DummyContext: + async def new_page(self): + return DummyPage() + + class DummyBrowser: + async def new_context(self, **kwargs): + return DummyContext() + + async def close(self): + fail_flag["closed"] = True + + class DummyPW: + async def __aenter__(self): + return self + + async def __aexit__(self, exc_type, exc, tb): + return + + class chromium: + @staticmethod + async def launch(headless, proxy, **kwargs): + return DummyBrowser() + + class firefox: + @staticmethod + async def launch(headless, proxy, **kwargs): + return DummyBrowser() + + monkeypatch.setattr("playwright.async_api.async_playwright", lambda: DummyPW()) + + async def fail_apply_stealth(context): + raise ValueError("Stealth failed") + + monkeypatch.setattr( + "scrapegraphai.docloaders.chromium.Malenia.apply_stealth", fail_apply_stealth + ) + + loader = ChromiumLoader( + ["http://example.com"], backend="playwright", retry_limit=1, timeout=1 + ) + with pytest.raises(RuntimeError, match="Failed to scrape after 1 attempts"): + await loader.ascrape_playwright("http://example.com") + assert fail_flag["closed"] is True + + +@pytest.mark.asyncio +async def test_ascrape_with_js_support_value_error_success(monkeypatch): + """Test that ascrape_with_js_support retries on ValueError and eventually succeeds.""" + attempt_count = {"count": 0} + + class DummyPage: + async def goto(self, url, wait_until): + if attempt_count["count"] < 1: + attempt_count["count"] += 1 + raise ValueError("Test value error") + return + + async def wait_for_load_state(self, state): + return + + async def content(self): + return "Success after ValueError" + + class DummyContext: + async def new_page(self): + return DummyPage() + + class DummyBrowser: + async def new_context(self, **kwargs): + return DummyContext() + + async def close(self): + return + + class DummyPW: + async def __aenter__(self): + return self + + async def __aexit__(self, exc_type, exc, tb): + return + + class chromium: + @staticmethod + async def launch(headless, proxy, **kwargs): + return DummyBrowser() + + class firefox: + @staticmethod + async def launch(headless, proxy, **kwargs): + return DummyBrowser() + + monkeypatch.setattr("playwright.async_api.async_playwright", lambda: DummyPW()) + loader = ChromiumLoader( + ["http://example.com"], + backend="playwright", + requires_js_support=True, + retry_limit=2, + timeout=1, + ) + result = await loader.ascrape_with_js_support("http://example.com") + assert "Success after ValueError" in result + + +@pytest.mark.asyncio +async def test_ascrape_with_js_support_value_error_failure(monkeypatch): + """Test that ascrape_with_js_support raises RuntimeError after exhausting retries on persistent ValueError.""" + + class DummyPage: + async def goto(self, url, wait_until): + raise ValueError("Persistent value error") + + async def wait_for_load_state(self, state): + return + + async def content(self): + return "Should 
not reach here" + + class DummyContext: + async def new_page(self): + return DummyPage() + + class DummyBrowser: + async def new_context(self, **kwargs): + return DummyContext() + + async def close(self): + return + + class DummyPW: + async def __aenter__(self): + return self + + async def __aexit__(self, exc_type, exc, tb): + return + + class chromium: + @staticmethod + async def launch(headless, proxy, **kwargs): + return DummyBrowser() + + class firefox: + @staticmethod + async def launch(headless, proxy, **kwargs): + return DummyBrowser() + + monkeypatch.setattr("playwright.async_api.async_playwright", lambda: DummyPW()) + loader = ChromiumLoader( + ["http://example.com"], + backend="playwright", + requires_js_support=True, + retry_limit=1, + timeout=1, + ) + with pytest.raises(RuntimeError, match="Failed to scrape after 1 attempts"): + await loader.ascrape_with_js_support("http://example.com") + + +@pytest.mark.asyncio +async def test_ascrape_playwright_scroll_scroll_to_bottom_false( + monkeypatch, mock_playwright +): + """Test ascrape_playwright_scroll with scroll_to_bottom=False. + Simulate a page whose scroll height increases initially then remains constant; + with a short timeout the function should break and return the page content. + """ + mock_pw, mock_browser, mock_context, mock_page = mock_playwright + # simulate a sequence of scroll heights: first increases then remains constant + mock_page.evaluate.side_effect = [1000, 1500, 1500, 1500, 1500] + mock_page.content.return_value = ( + "Timeout reached without scrolling bottom" + ) + + # Create loader with default load_state and short timeout such that the loop terminates + loader = ChromiumLoader( + ["http://example.com"], backend="playwright", load_state="domcontentloaded" + ) + result = await loader.ascrape_playwright_scroll( + "http://example.com", timeout=1, scroll=6000, sleep=0.1, scroll_to_bottom=False + ) + assert "Timeout reached" in result + + +@pytest.mark.asyncio +async def test_ascrape_with_js_support_browser_name_override_new(monkeypatch): + """Test that ascrape_with_js_support calls the firefox branch correctly when browser_name is set to "firefox". + This simulates a dummy playwright that returns a DummyBrowser and content when using firefox. 
+ """ + + class DummyPage: + async def goto(self, url, wait_until): + return + + async def wait_for_load_state(self, state): + return + + async def content(self): + return "Firefox content" + + class DummyContext: + async def new_page(self): + return DummyPage() + + class DummyBrowser: + async def new_context(self, **kwargs): + self.context_kwargs = kwargs + return DummyContext() + + async def close(self): + return + + class DummyPW: + async def __aenter__(self): + return self + + async def __aexit__(self, exc_type, exc, tb): + return + + class firefox: + @staticmethod + async def launch(headless, proxy, **kwargs): + return DummyBrowser() + + class chromium: + @staticmethod + async def launch(headless, proxy, **kwargs): + raise Exception("Chromium branch not used for this test") + + monkeypatch.setattr("playwright.async_api.async_playwright", lambda: DummyPW()) + loader = ChromiumLoader( + ["http://example.com"], backend="playwright", requires_js_support=True + ) + result = await loader.ascrape_with_js_support( + "http://example.com", browser_name="firefox" + ) + assert "Firefox content" in result + + +@pytest.mark.asyncio +async def test_ascrape_playwright_scroll_load_state(mock_playwright): + """Test that ascrape_playwright_scroll waits for the custom load_state value.""" + mock_pw, mock_browser, mock_context, mock_page = mock_playwright + url = "http://example.com" + # Instantiate the loader with a non-default load_state ("custom_state") + loader = ChromiumLoader([url], backend="playwright", load_state="custom_state") + + # Simulate constant page height so that scrolling stops. + # First call returns 1000 then remains constant. + mock_page.evaluate.side_effect = [1000, 1000] + mock_page.content.return_value = "Done" + + result = await loader.ascrape_playwright_scroll( + url, timeout=1, scroll=5000, sleep=0.1, scroll_to_bottom=True + ) + + # Check that wait_for_load_state was called with the custom load_state value. + mock_page.wait_for_load_state.assert_called_with("custom_state") + assert "Done" in result + + +@pytest.mark.asyncio +async def test_alazy_load_concurrency(monkeypatch): + """Test that alazy_load runs tasks concurrently by measuring elapsed time. + Each dummy task sleeps for 0.5 seconds. If run sequentially the total time + would be at least 1.5 seconds for three URLs. Running concurrently should be + significantly faster. + """ + import time + + urls = ["http://example.com/1", "http://example.com/2", "http://example.com/3"] + loader = ChromiumLoader(urls, backend="playwright") + + async def dummy_delay(url): + await asyncio.sleep(0.5) + return f"Content for {url}" + + monkeypatch.setattr(loader, "ascrape_playwright", dummy_delay) + start_time = time.monotonic() + docs = [doc async for doc in loader.alazy_load()] + elapsed = time.monotonic() - start_time + # In sequential execution elapsed time would be at least 1.5s; + # if tasks run concurrently it should be considerably less. 
+ assert elapsed < 1.0, f"Expected concurrent execution but took {elapsed} seconds" + for doc, url in zip(docs, urls): + assert url in doc.metadata["source"] + assert f"Content for {url}" in doc.page_content + + +@pytest.mark.asyncio +async def test_scrape_playwright_value_error_retry_failure(monkeypatch): + """Test that ascrape_playwright retries on ValueError and ultimately raises RuntimeError after exhausting retries.""" + + async def always_value_error(url, browser_name="chromium"): + raise ValueError("Forced value error") + + urls = ["http://example.com"] + # requires_js_support is False so that scraper calls ascrape_playwright. + loader = ChromiumLoader( + urls, backend="playwright", requires_js_support=False, retry_limit=2, timeout=1 + ) + monkeypatch.setattr(loader, "ascrape_playwright", always_value_error) + with pytest.raises(RuntimeError, match="Failed to scrape after 2 attempts"): + await loader.scrape("http://example.com") + + +@pytest.mark.asyncio +async def test_invalid_proxy_raises_error(monkeypatch): + """Test that providing an invalid proxy causes a ValueError during initialization (via parse_or_search_proxy).""" + + def fake_parse_or_search_proxy(proxy): + raise ValueError("Invalid proxy") + + monkeypatch.setattr( + "scrapegraphai.docloaders.chromium.parse_or_search_proxy", + fake_parse_or_search_proxy, + ) + with pytest.raises(ValueError, match="Invalid proxy"): + ChromiumLoader(["http://example.com"], backend="playwright", proxy="bad_proxy") diff --git a/tests/test_csv_scraper_multi_graph.py b/tests/test_csv_scraper_multi_graph.py index e69de29b..c4af1ffd 100644 --- a/tests/test_csv_scraper_multi_graph.py +++ b/tests/test_csv_scraper_multi_graph.py @@ -0,0 +1,161 @@ +from copy import deepcopy + +import pytest + +from scrapegraphai.graphs.csv_scraper_multi_graph import CSVScraperMultiGraph + +# Monkey-patch _create_llm to avoid unsupported provider error during tests +CSVScraperMultiGraph._create_llm = lambda self, llm_config: llm_config + + +# Dummy graph classes to simulate behavior during tests +class DummyGraph: + """Dummy graph that returns a predefined answer.""" + + def __init__(self, answer): + self.answer = answer + + def execute(self, inputs): + # Returns a tuple of (final_state, execution_info) + return ({"answer": self.answer}, {}) + + +class DummyGraphNoAnswer: + """Dummy graph that simulates absence of answer in final_state.""" + + def execute(self, inputs): + # Returns an empty final_state + return ({}, {}) + + +class DummyBaseGraph: + """Dummy BaseGraph to test _create_graph method without side effects.""" + + def __init__(self, nodes, edges, entry_point, graph_name): + self.nodes = nodes + self.edges = edges + self.entry_point = entry_point + self.graph_name = graph_name + + config = { + "llm": {"model": "dummy_model", "model_provider": "dummy_provider"}, + "key": "value", + } + """Test that CSVScraperMultiGraph.run returns the expected answer when provided by the graph.""" + prompt = "Test prompt" + source = ["url1", "url2"] + + # Instantiate the graph + multi_graph = CSVScraperMultiGraph(prompt, source, config) + + # Override the graph attribute with a dummy graph returning an expected answer + multi_graph.graph = DummyGraph("expected answer") + + result = multi_graph.run() + assert result == "expected answer" + + +def test_run_no_answer(): + """Test that CSVScraperMultiGraph.run returns a fallback message when no answer is provided.""" + prompt = "Another test prompt" + source = ["url3"] + config = { + "llm": {"model": "dummy_model", 
"model_provider": "dummy_provider"}, + "another_key": "another_value", + } + + multi_graph = CSVScraperMultiGraph(prompt, source, config) + multi_graph.graph = DummyGraphNoAnswer() + + result = multi_graph.run() + assert result == "No answer found." + + +def test_create_graph_structure(monkeypatch): + """Test that _create_graph constructs a graph with the expected structure.""" + prompt = "Structure test" + source = ["url4"] + config = { + "llm": {"model": "dummy_model", "model_provider": "dummy_provider"}, + "struct_key": "struct_value", + } + + multi_graph = CSVScraperMultiGraph(prompt, source, config) + + # Monkey-patch the _create_graph method to avoid dependencies on external nodes + monkeypatch.setattr( + multi_graph, + "_create_graph", + lambda: DummyBaseGraph( + nodes=["graph_iterator_node", "merge_answers_node"], + edges=[("graph_iterator_node", "merge_answers_node")], + entry_point="graph_iterator_node", + graph_name=multi_graph.__class__.__name__, + ), + ) + + graph = multi_graph._create_graph() + assert graph.graph_name == "CSVScraperMultiGraph" + assert len(graph.nodes) == 2 + assert len(graph.edges) == 1 + + +def test_config_deepcopy(): + """Test that the configuration dictionary is deep-copied. + Modifying the original config after instantiation should not affect the multi_graph copy. + """ + config = { + "llm": {"model": "dummy_model", "provider": "provider1"}, + "nested": {"a": [1, 2]}, + } + original_config = deepcopy(config) + multi_graph = CSVScraperMultiGraph("Deep copy test", ["url_deep"], config) + # Modify the original config after instantiation + config["nested"]["a"].append(3) + # The multi_graph.copy_config should remain unchanged. + assert multi_graph.copy_config["nested"]["a"] == original_config["nested"]["a"] + + +def test_run_argument_passing(): + """Test that CSVScraperMultiGraph.run passes the correct input arguments + to the graph's execute method and returns the expected answer.""" + + class DummyGraphCapture: + def __init__(self): + self.captured_inputs = None + + def execute(self, inputs): + self.captured_inputs = inputs + return ({"answer": "captured answer"}, {}) + + prompt = "Argument test prompt" + source = ["url_arg1", "url_arg2"] + config = {"llm": {"model": "dummy_model", "provider": "dummy_provider"}} + + multi_graph = CSVScraperMultiGraph(prompt, source, config) + dummy_graph = DummyGraphCapture() + multi_graph.graph = dummy_graph + + result = multi_graph.run() + # Check that the dummy graph captured the inputs as expected + expected_inputs = {"user_prompt": prompt, "jsons": source} + assert dummy_graph.captured_inputs == expected_inputs + assert result == "captured answer" + + +def test_run_with_exception_in_execute(): + """Test that CSVScraperMultiGraph.run propagates exceptions from the graph's execute method.""" + + class DummyGraphException: + def execute(self, inputs): + raise Exception("Test exception") + + prompt = "Exception test prompt" + source = ["url_exception"] + config = {"llm": {"model": "dummy_model", "provider": "dummy_provider"}} + + multi_graph = CSVScraperMultiGraph(prompt, source, config) + multi_graph.graph = DummyGraphException() + + with pytest.raises(Exception, match="Test exception"): + multi_graph.run()