Commit b4d1a7e

Run unit tests with real LLM calls (#8486)
* use real LLM for unit tests
* use ollama
* use Llama 3.2 3b
* add verbose option
* split test into a separate job
* remove LLM pulling
* fix option name
* rename env var
1 parent f5a75e5 commit b4d1a7e

5 files changed: +78 −39 lines changed

.github/workflows/run_tests.yml

Lines changed: 39 additions & 1 deletion
Lines changed: 39 additions & 1 deletion

@@ -82,11 +82,49 @@ jobs:
         with:
           args: check --fix-only
       - name: Run tests with pytest
-        run: uv run -p .venv pytest tests/
+        run: uv run -p .venv pytest -vv tests/
       - name: Install optional dependencies
        run: uv sync -p .venv --extra dev --extra test_extras
      - name: Run extra tests
        run: uv run -p .venv pytest tests/ -m extra --extra
+
+  llm_call_test:
+    name: Run Tests with Real LM
+    runs-on: ubuntu-latest
+    services:
+      ollama:
+        image: ollama/ollama:latest
+        ports:
+          - 11434:11434
+    steps:
+      - uses: actions/checkout@v4
+      - uses: actions/setup-python@v5
+        with:
+          python-version: 3.11
+      - name: Install uv with caching
+        uses: astral-sh/setup-uv@v5
+        with:
+          enable-cache: true
+          cache-dependency-glob: |
+            **/pyproject.toml
+            **/uv.lock
+      - name: Create and activate virtual environment
+        run: |
+          uv venv .venv
+          echo "${{ github.workspace }}/.venv/bin" >> $GITHUB_PATH
+      - name: Install dependencies
+        run: |
+          uv sync --dev -p .venv --extra dev
+          uv pip list
+      - name: Pull LLM
+        run: |
+          timeout 60 bash -c 'until curl -f http://localhost:11434/api/version; do sleep 2; done'
+          curl -X POST http://localhost:11434/api/pull \
+            -H "Content-Type: application/json" \
+            -d '{"name": "llama3.2:3b"}'
+          echo "LM_FOR_TEST=ollama/llama3.2:3b" >> $GITHUB_ENV
+      - name: Run tests
+        run: uv run -p .venv pytest -m llm_call --llm_call -vv --durations=5 tests/
 
   build_package:
     name: Build Package
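
The new `llm_call_test` job talks to the Ollama service container with plain curl. For local debugging, the same two endpoints (`GET /api/version` and `POST /api/pull`) can be exercised from Python; the sketch below is a hypothetical helper that only mirrors the "Pull LLM" step above and assumes an Ollama server is already listening on localhost:11434.

import json
import time
import urllib.request

OLLAMA_URL = "http://localhost:11434"  # port published by the ollama service container


def wait_for_ollama(timeout_s: int = 60) -> None:
    """Poll /api/version until Ollama answers, mirroring the curl retry loop in the workflow."""
    deadline = time.time() + timeout_s
    while time.time() < deadline:
        try:
            with urllib.request.urlopen(f"{OLLAMA_URL}/api/version") as resp:
                if resp.status == 200:
                    return
        except OSError:
            pass
        time.sleep(2)
    raise TimeoutError("Ollama did not become ready within the timeout")


def pull_model(name: str = "llama3.2:3b") -> None:
    """POST /api/pull with the model name, as the workflow does with curl."""
    req = urllib.request.Request(
        f"{OLLAMA_URL}/api/pull",
        data=json.dumps({"name": name}).encode("utf-8"),
        headers={"Content-Type": "application/json"},
        method="POST",
    )
    # Reading the response to the end blocks until the pull has finished.
    with urllib.request.urlopen(req) as resp:
        resp.read()


if __name__ == "__main__":
    wait_for_ollama()
    pull_model()
    # Then, as in the workflow: export LM_FOR_TEST=ollama/llama3.2:3b and run
    #   uv run -p .venv pytest -m llm_call --llm_call -vv --durations=5 tests/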

tests/conftest.py

Lines changed: 10 additions & 1 deletion
@@ -1,10 +1,11 @@
 import copy
+import os
 
 import pytest
 
 from tests.test_utils.server import litellm_test_server, read_litellm_test_server_request_logs  # noqa: F401
 
-SKIP_DEFAULT_FLAGS = ["reliability", "extra"]
+SKIP_DEFAULT_FLAGS = ["reliability", "extra", "llm_call"]
 
 
 @pytest.fixture(autouse=True)
@@ -49,3 +50,11 @@ def pytest_collection_modifyitems(config, items):
         for item in items:
             if flag in item.keywords:
                 item.add_marker(skip_mark)
+
+
+@pytest.fixture
+def lm_for_test():
+    model = os.environ.get("LM_FOR_TEST", None)
+    if model is None:
+        pytest.skip("LM_FOR_TEST is not set in the environment variables")
+    return model
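
Taken together, the marker and the fixture gate the real-LM tests twice: `llm_call` is added to `SKIP_DEFAULT_FLAGS`, so these tests are skipped unless explicitly selected, and `lm_for_test` skips again if `LM_FOR_TEST` is unset. A minimal sketch of how a test opts in, following the pattern used in the test files below (the test name here is hypothetical):

import pytest

import dspy


@pytest.mark.llm_call  # skipped by default because "llm_call" is in SKIP_DEFAULT_FLAGS
def test_example_with_real_lm(lm_for_test):  # fixture skips unless LM_FOR_TEST is set
    # LM_FOR_TEST holds a LiteLLM-style model id, e.g. "ollama/llama3.2:3b" in CI.
    with dspy.context(lm=dspy.LM(lm_for_test, cache=False)):
        prediction = dspy.Predict("question -> answer")(question="What is the capital of France?")
    assert prediction.answer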

tests/primitives/test_base_module.py

Lines changed: 15 additions & 14 deletions
@@ -230,30 +230,30 @@ def emit(self, record):
     logger.removeHandler(handler)
 
 
-@pytest.mark.skipif(not os.getenv("OPENAI_API_KEY"), reason="Skip the test if OPENAI_API_KEY is not set.")
-def test_single_module_call_with_usage_tracker():
-    dspy.settings.configure(lm=dspy.LM("openai/gpt-4o-mini", cache=False), track_usage=True)
+@pytest.mark.llm_call
+def test_single_module_call_with_usage_tracker(lm_for_test):
+    dspy.settings.configure(lm=dspy.LM(lm_for_test, cache=False), track_usage=True)
 
     predict = dspy.ChainOfThought("question -> answer")
     output = predict(question="What is the capital of France?")
 
     lm_usage = output.get_lm_usage()
     assert len(lm_usage) == 1
-    assert lm_usage["openai/gpt-4o-mini"]["prompt_tokens"] > 0
-    assert lm_usage["openai/gpt-4o-mini"]["completion_tokens"] > 0
-    assert lm_usage["openai/gpt-4o-mini"]["total_tokens"] > 0
+    assert lm_usage[lm_for_test]["prompt_tokens"] > 0
+    assert lm_usage[lm_for_test]["completion_tokens"] > 0
+    assert lm_usage[lm_for_test]["total_tokens"] > 0
 
     # Test no usage being tracked when cache is enabled
-    dspy.settings.configure(lm=dspy.LM("openai/gpt-4o-mini", cache=True), track_usage=True)
+    dspy.settings.configure(lm=dspy.LM(lm_for_test, cache=True), track_usage=True)
     for _ in range(2):
         output = predict(question="What is the capital of France?")
 
     assert len(output.get_lm_usage()) == 0
 
 
-@pytest.mark.skipif(not os.getenv("OPENAI_API_KEY"), reason="Skip the test if OPENAI_API_KEY is not set.")
-def test_multi_module_call_with_usage_tracker():
-    dspy.settings.configure(lm=dspy.LM("openai/gpt-4o-mini", cache=False), track_usage=True)
+@pytest.mark.llm_call
+def test_multi_module_call_with_usage_tracker(lm_for_test):
+    dspy.settings.configure(lm=dspy.LM(lm_for_test, cache=False), track_usage=True)
 
     class MyProgram(dspy.Module):
         def __init__(self):
@@ -270,12 +270,13 @@ def __call__(self, question: str) -> str:
 
     lm_usage = output.get_lm_usage()
     assert len(lm_usage) == 1
-    assert lm_usage["openai/gpt-4o-mini"]["prompt_tokens"] > 0
-    assert lm_usage["openai/gpt-4o-mini"]["prompt_tokens"] > 0
-    assert lm_usage["openai/gpt-4o-mini"]["completion_tokens"] > 0
-    assert lm_usage["openai/gpt-4o-mini"]["total_tokens"] > 0
+    assert lm_usage[lm_for_test]["prompt_tokens"] > 0
+    assert lm_usage[lm_for_test]["prompt_tokens"] > 0
+    assert lm_usage[lm_for_test]["completion_tokens"] > 0
+    assert lm_usage[lm_for_test]["total_tokens"] > 0
 
 
+# TODO: prepare second model for testing this unit test in ci
 @pytest.mark.skipif(not os.getenv("OPENAI_API_KEY"), reason="Skip the test if OPENAI_API_KEY is not set.")
 def test_usage_tracker_in_parallel():
     class MyProgram(dspy.Module):

tests/streaming/test_streaming.py

Lines changed: 9 additions & 10 deletions
@@ -1,5 +1,4 @@
 import asyncio
-import os
 import time
 from unittest import mock
 from unittest.mock import AsyncMock
@@ -131,9 +130,9 @@ def module_start_status_message(self, instance, inputs):
     assert status_messages[2].message == "Predict starting!"
 
 
-@pytest.mark.skipif(not os.getenv("OPENAI_API_KEY"), reason="OpenAI API key not found in environment variables")
+@pytest.mark.llm_call
 @pytest.mark.anyio
-async def test_stream_listener_chat_adapter():
+async def test_stream_listener_chat_adapter(lm_for_test):
     class MyProgram(dspy.Module):
         def __init__(self):
             self.predict1 = dspy.Predict("question->answer")
@@ -154,7 +153,7 @@ def __call__(self, x: str, **kwargs):
         include_final_prediction_in_output_stream=False,
     )
     # Turn off the cache to ensure the stream is produced.
-    with dspy.context(lm=dspy.LM("openai/gpt-4o-mini", cache=False)):
+    with dspy.context(lm=dspy.LM(lm_for_test, cache=False)):
         output = program(x="why did a chicken cross the kitchen?")
     all_chunks = []
     async for value in output:
@@ -194,9 +193,9 @@ async def acall(self, x: str):
     assert status_messages[1].message == "Tool calling finished! Querying the LLM with tool calling results..."
 
 
-@pytest.mark.skipif(not os.getenv("OPENAI_API_KEY"), reason="OpenAI API key not found in environment variables")
+@pytest.mark.llm_call
 @pytest.mark.anyio
-async def test_stream_listener_json_adapter():
+async def test_stream_listener_json_adapter(lm_for_test):
     class MyProgram(dspy.Module):
         def __init__(self):
             self.predict1 = dspy.Predict("question->answer")
@@ -217,7 +216,7 @@ def __call__(self, x: str, **kwargs):
         include_final_prediction_in_output_stream=False,
     )
     # Turn off the cache to ensure the stream is produced.
-    with dspy.context(lm=dspy.LM("openai/gpt-4o-mini", cache=False), adapter=dspy.JSONAdapter()):
+    with dspy.context(lm=dspy.LM(lm_for_test, cache=False), adapter=dspy.JSONAdapter()):
         output = program(x="why did a chicken cross the kitchen?")
     all_chunks = []
     async for value in output:
@@ -261,8 +260,8 @@ async def gpt_4o_mini_stream(*args, **kwargs):
     assert all_chunks[0].chunk == "How are you doing?"
 
 
-@pytest.mark.skipif(not os.getenv("OPENAI_API_KEY"), reason="OpenAI API key not found in environment variables")
-def test_sync_streaming():
+@pytest.mark.llm_call
+def test_sync_streaming(lm_for_test):
     class MyProgram(dspy.Module):
         def __init__(self):
             self.predict1 = dspy.Predict("question->answer")
@@ -284,7 +283,7 @@ def __call__(self, x: str, **kwargs):
         async_streaming=False,
     )
     # Turn off the cache to ensure the stream is produced.
-    with dspy.context(lm=dspy.LM("openai/gpt-4o-mini", cache=False)):
+    with dspy.context(lm=dspy.LM(lm_for_test, cache=False)):
         output = program(x="why did a chicken cross the kitchen?")
     all_chunks = []
     for value in output:

tests/utils/test_usage_tracker.py

Lines changed: 5 additions & 13 deletions
@@ -1,7 +1,3 @@
-import os
-
-import pytest
-
 import dspy
 from dspy.utils.usage_tracker import UsageTracker, track_usage
 
@@ -137,12 +133,8 @@ def test_track_usage_with_multiple_models():
     assert total_usage["gpt-3.5-turbo"]["total_tokens"] == 900
 
 
-@pytest.mark.skipif(
-    not os.getenv("OPENAI_API_KEY"),
-    reason="Skip the test if OPENAI_API_KEY is not set.",
-)
-def test_track_usage_context_manager():
-    lm = dspy.LM("openai/gpt-4o-mini", cache=False)
+def test_track_usage_context_manager(lm_for_test):
+    lm = dspy.LM(lm_for_test, cache=False)
     dspy.settings.configure(lm=lm)
 
     predict = dspy.ChainOfThought("question -> answer")
@@ -151,12 +143,12 @@ def test_track_usage_context_manager():
         predict(question="What is the capital of Italy?")
 
     assert len(tracker.usage_data) > 0
-    assert len(tracker.usage_data["openai/gpt-4o-mini"]) == 2
+    assert len(tracker.usage_data[lm_for_test]) == 2
 
     total_usage = tracker.get_total_tokens()
-    assert "openai/gpt-4o-mini" in total_usage
+    assert lm_for_test in total_usage
     assert len(total_usage.keys()) == 1
-    assert isinstance(total_usage["openai/gpt-4o-mini"], dict)
+    assert isinstance(total_usage[lm_for_test], dict)
 
 
 def test_merge_usage_entries_with_new_keys():
