Skip to content

Commit dd8cf1c

Browse files
Merge async settings change into main (#8361)
* Make dspy.settings and dspy.context safe in async setup (#8203) (#8324) * Make dspy.settings and dspy.context safe in async setup (#8203) * init * increment * test * track usage async test * better-test * revert wrong changes * fix tests * fix tests * fix * allow ipython to run configure * remove Ipython imports * relax the constraint * remove TaskGroup usage, which is only available after 3.10 --------- Co-authored-by: Omar Khattab <okhat@users.noreply.github.com> * more robust usage tracker in async (#8329) * merge from main and fix lint --------- Co-authored-by: Chen Qian <chen.qian@databricks.com> * fix databricks notebook case (#8359) --------- Co-authored-by: Omar Khattab <okhat@users.noreply.github.com>
1 parent 716e82c commit dd8cf1c

File tree

14 files changed

+438
-135
lines changed

14 files changed

+438
-135
lines changed

dspy/dsp/utils/settings.py

Lines changed: 54 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
import asyncio
2+
import contextvars
13
import copy
24
import threading
35
from contextlib import contextmanager
@@ -38,13 +40,7 @@
3840
# Global lock for settings configuration
3941
global_lock = threading.Lock()
4042

41-
42-
class ThreadLocalOverrides(threading.local):
43-
def __init__(self):
44-
self.overrides = dotdict()
45-
46-
47-
thread_local_overrides = ThreadLocalOverrides()
43+
thread_local_overrides = contextvars.ContextVar("context_overrides", default=dotdict())
4844

4945

5046
class Settings:
@@ -75,7 +71,7 @@ def lock(self):
7571
return global_lock
7672

7773
def __getattr__(self, name):
78-
overrides = getattr(thread_local_overrides, "overrides", dotdict())
74+
overrides = thread_local_overrides.get()
7975
if name in overrides:
8076
return overrides[name]
8177
elif name in main_thread_config:
@@ -96,7 +92,7 @@ def __setitem__(self, key, value):
9692
self.__setattr__(key, value)
9793

9894
def __contains__(self, key):
99-
overrides = getattr(thread_local_overrides, "overrides", dotdict())
95+
overrides = thread_local_overrides.get()
10096
return key in overrides or key in main_thread_config
10197

10298
def get(self, key, default=None):
@@ -106,23 +102,60 @@ def get(self, key, default=None):
106102
return default
107103

108104
def copy(self):
109-
overrides = getattr(thread_local_overrides, "overrides", dotdict())
105+
overrides = thread_local_overrides.get()
110106
return dotdict({**main_thread_config, **overrides})
111107

112108
@property
113109
def config(self):
114110
return self.copy()
115111

116-
def configure(self, **kwargs):
112+
def _ensure_configure_allowed(self):
117113
global main_thread_config, config_owner_thread_id
118114
current_thread_id = threading.get_ident()
119115

120-
with self.lock:
121-
# First configuration: establish ownership. If ownership established, only that thread can configure.
122-
if config_owner_thread_id in [None, current_thread_id]:
123-
config_owner_thread_id = current_thread_id
124-
else:
125-
raise RuntimeError("dspy.settings can only be changed by the thread that initially configured it.")
116+
if config_owner_thread_id is None:
117+
# First `configure` call is always allowed.
118+
config_owner_thread_id = current_thread_id
119+
return
120+
121+
if config_owner_thread_id != current_thread_id:
122+
# Disallow a second `configure` call from other threads.
123+
raise RuntimeError("dspy.settings can only be changed by the thread that initially configured it.")
124+
125+
# Async task doesn't allow a second `configure` call, must use dspy.context(...) instead.
126+
is_async_task = False
127+
try:
128+
if asyncio.current_task() is not None:
129+
is_async_task = True
130+
except RuntimeError:
131+
# This exception (e.g., "no current task") means we are not in an async loop/task,
132+
# or asyncio module itself is not fully functional in this specific sub-thread context.
133+
is_async_task = False
134+
135+
if not is_async_task:
136+
return
137+
138+
# We are in an async task. Now check for IPython and allow calling `configure` from IPython.
139+
in_ipython = False
140+
try:
141+
from IPython import get_ipython
142+
143+
# get_ipython is a global injected by IPython environments.
144+
# We check its existence and type to be more robust.
145+
in_ipython = get_ipython() is not None
146+
except Exception:
147+
# If `IPython` is not installed or `get_ipython` failed, we are not in an IPython environment.
148+
in_ipython = False
149+
150+
if not in_ipython:
151+
raise RuntimeError(
152+
"dspy.settings.configure(...) cannot be called a second time from an async task. Use "
153+
"`dspy.context(...)` instead."
154+
)
155+
156+
def configure(self, **kwargs):
157+
# If no exception is raised, the `configure` call is allowed.
158+
self._ensure_configure_allowed()
126159

127160
# Update global config
128161
for k, v in kwargs.items():
@@ -136,17 +169,17 @@ def context(self, **kwargs):
136169
If threads are spawned inside this block using ParallelExecutor, they will inherit these overrides.
137170
"""
138171

139-
original_overrides = getattr(thread_local_overrides, "overrides", dotdict()).copy()
172+
original_overrides = thread_local_overrides.get().copy()
140173
new_overrides = dotdict({**main_thread_config, **original_overrides, **kwargs})
141-
thread_local_overrides.overrides = new_overrides
174+
token = thread_local_overrides.set(new_overrides)
142175

143176
try:
144177
yield
145178
finally:
146-
thread_local_overrides.overrides = original_overrides
179+
thread_local_overrides.reset(token)
147180

148181
def __repr__(self):
149-
overrides = getattr(thread_local_overrides, "overrides", dotdict())
182+
overrides = thread_local_overrides.get()
150183
combined_config = {**main_thread_config, **overrides}
151184
return repr(combined_config)
152185

dspy/primitives/program.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22

33
import magicattr
44

5-
from dspy.dsp.utils.settings import settings
5+
from dspy.dsp.utils.settings import settings, thread_local_overrides
66
from dspy.predict.parallel import Parallel
77
from dspy.primitives.module import BaseModule
88
from dspy.utils.callback import with_callbacks
@@ -51,7 +51,7 @@ def __call__(self, *args, **kwargs):
5151
caller_modules.append(self)
5252

5353
with settings.context(caller_modules=caller_modules):
54-
if settings.track_usage and settings.usage_tracker is None:
54+
if settings.track_usage and thread_local_overrides.get().get("usage_tracker") is None:
5555
with track_usage() as usage_tracker:
5656
output = self.forward(*args, **kwargs)
5757
output.set_lm_usage(usage_tracker.get_total_tokens())
@@ -66,7 +66,7 @@ async def acall(self, *args, **kwargs):
6666
caller_modules.append(self)
6767

6868
with settings.context(caller_modules=caller_modules):
69-
if settings.track_usage and settings.usage_tracker is None:
69+
if settings.track_usage and thread_local_overrides.get().get("usage_tracker") is None:
7070
with track_usage() as usage_tracker:
7171
output = await self.aforward(*args, **kwargs)
7272
output.set_lm_usage(usage_tracker.get_total_tokens())

dspy/streaming/streamify.py

Lines changed: 0 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -222,16 +222,10 @@ def apply_sync_streaming(async_generator: AsyncGenerator) -> Generator:
222222

223223
# To propagate prediction request ID context to the child thread
224224
context = contextvars.copy_context()
225-
from dspy.dsp.utils.settings import thread_local_overrides
226-
227-
parent_overrides = thread_local_overrides.overrides.copy()
228225

229226
def producer():
230227
"""Runs in a background thread to fetch items asynchronously."""
231228

232-
original_overrides = thread_local_overrides.overrides
233-
thread_local_overrides.overrides = parent_overrides.copy()
234-
235229
async def runner():
236230
try:
237231
async for item in async_generator:
@@ -241,7 +235,6 @@ async def runner():
241235
queue.put(stop_sentinel)
242236

243237
context.run(asyncio.run, runner())
244-
thread_local_overrides.overrides = original_overrides
245238

246239
# Start the producer in a background thread
247240
thread = threading.Thread(target=producer, daemon=True)

dspy/utils/asyncify.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -46,17 +46,17 @@ async def async_program(*args, **kwargs) -> Any:
4646
# Capture the current overrides at call-time.
4747
from dspy.dsp.utils.settings import thread_local_overrides
4848

49-
parent_overrides = thread_local_overrides.overrides.copy()
49+
parent_overrides = thread_local_overrides.get().copy()
5050

5151
def wrapped_program(*a, **kw):
5252
from dspy.dsp.utils.settings import thread_local_overrides
5353

54-
original_overrides = thread_local_overrides.overrides
55-
thread_local_overrides.overrides = parent_overrides.copy()
54+
original_overrides = thread_local_overrides.get()
55+
token = thread_local_overrides.set({**original_overrides, **parent_overrides.copy()})
5656
try:
5757
return program(*a, **kw)
5858
finally:
59-
thread_local_overrides.overrides = original_overrides
59+
thread_local_overrides.reset(token)
6060

6161
# Create a fresh asyncified callable each time, ensuring the latest context is used.
6262
call_async = asyncer.asyncify(wrapped_program, abandon_on_cancel=True, limiter=get_limiter())

dspy/utils/parallelizer.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -86,16 +86,16 @@ def worker(parent_overrides, submission_id, index, item):
8686
# Apply parent's thread-local overrides
8787
from dspy.dsp.utils.settings import thread_local_overrides
8888

89-
original = thread_local_overrides.overrides
90-
thread_local_overrides.overrides = parent_overrides.copy()
89+
original = thread_local_overrides.get()
90+
token = thread_local_overrides.set({**original, **parent_overrides.copy()})
9191
if parent_overrides.get("usage_tracker"):
9292
# Usage tracker needs to be deep copied across threads so that each thread tracks its own usage
9393
thread_local_overrides.overrides["usage_tracker"] = copy.deepcopy(parent_overrides["usage_tracker"])
9494

9595
try:
9696
return index, function(item)
9797
finally:
98-
thread_local_overrides.overrides = original
98+
thread_local_overrides.reset(token)
9999

100100
# Handle Ctrl-C in the main thread
101101
@contextlib.contextmanager
@@ -121,7 +121,7 @@ def handler(sig, frame):
121121
with interrupt_manager():
122122
from dspy.dsp.utils.settings import thread_local_overrides
123123

124-
parent_overrides = thread_local_overrides.overrides.copy()
124+
parent_overrides = thread_local_overrides.get().copy()
125125

126126
futures_map = {}
127127
futures_set = set()

tests/adapters/test_two_step_adapter.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -94,9 +94,8 @@ class TestSignature(dspy.Signature):
9494
mock_extraction_lm.kwargs = {"temperature": 1.0}
9595
mock_extraction_lm.model = "openai/gpt-4o"
9696

97-
dspy.configure(lm=mock_main_lm, adapter=dspy.TwoStepAdapter(extraction_model=mock_extraction_lm))
98-
99-
result = await program.acall(question="What is 5 + 7?")
97+
with dspy.context(lm=mock_main_lm, adapter=dspy.TwoStepAdapter(extraction_model=mock_extraction_lm)):
98+
result = await program.acall(question="What is 5 + 7?")
10099

101100
assert result.answer == 12
102101

tests/callback/test_callback.py

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -189,13 +189,12 @@ def test_callback_complex_module():
189189
@pytest.mark.asyncio
190190
async def test_callback_async_module():
191191
callback = MyCallback()
192-
dspy.settings.configure(
192+
with dspy.context(
193193
lm=DummyLM({"How are you?": {"answer": "test output", "reasoning": "No more responses"}}),
194194
callbacks=[callback],
195-
)
196-
197-
cot = dspy.ChainOfThought("question -> answer", n=3)
198-
result = await cot.acall(question="How are you?")
195+
):
196+
cot = dspy.ChainOfThought("question -> answer", n=3)
197+
result = await cot.acall(question="How are you?")
199198
assert result["answer"] == "test output"
200199
assert result["reasoning"] == "No more responses"
201200

tests/predict/test_chain_of_thought.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ def test_initialization_with_string_signature():
1919
@pytest.mark.asyncio
2020
async def test_async_chain_of_thought():
2121
lm = DummyLM([{"reasoning": "find the number after 1", "answer": "2"}])
22-
dspy.settings.configure(lm=lm)
23-
program = ChainOfThought("question -> answer")
24-
result = await program.acall(question="What is 1+1?")
25-
assert result.answer == "2"
22+
with dspy.context(lm=lm):
23+
program = ChainOfThought("question -> answer")
24+
result = await program.acall(question="What is 1+1?")
25+
assert result.answer == "2"

tests/predict/test_predict.py

Lines changed: 69 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,8 @@
1+
import asyncio
12
import copy
23
import enum
4+
import time
5+
import types
36
from datetime import datetime
47
from unittest.mock import patch
58

@@ -506,6 +509,69 @@ def test_lm_usage():
506509
assert result.get_lm_usage()["openai/gpt-4o-mini"]["total_tokens"] == 10
507510

508511

512+
def test_lm_usage_with_parallel():
513+
program = Predict("question -> answer")
514+
515+
def program_wrapper(question):
516+
# Sleep to make it possible to cause a race condition
517+
time.sleep(0.5)
518+
return program(question=question)
519+
520+
dspy.settings.configure(lm=dspy.LM("openai/gpt-4o-mini", cache=False), track_usage=True)
521+
with patch(
522+
"dspy.clients.lm.litellm_completion",
523+
return_value=ModelResponse(
524+
choices=[{"message": {"content": "[[ ## answer ## ]]\nParis"}}],
525+
usage={"total_tokens": 10},
526+
),
527+
):
528+
parallelizer = dspy.Parallel()
529+
input_pairs = [
530+
(program_wrapper, {"question": "What is the capital of France?"}),
531+
(program_wrapper, {"question": "What is the capital of France?"}),
532+
]
533+
results = parallelizer(input_pairs)
534+
assert results[0].answer == "Paris"
535+
assert results[1].answer == "Paris"
536+
assert results[0].get_lm_usage()["openai/gpt-4o-mini"]["total_tokens"] == 10
537+
assert results[1].get_lm_usage()["openai/gpt-4o-mini"]["total_tokens"] == 10
538+
539+
540+
@pytest.mark.asyncio
541+
async def test_lm_usage_with_async():
542+
program = Predict("question -> answer")
543+
544+
original_aforward = program.aforward
545+
546+
async def patched_aforward(self, **kwargs):
547+
await asyncio.sleep(1)
548+
return await original_aforward(**kwargs)
549+
550+
program.aforward = types.MethodType(patched_aforward, program)
551+
552+
with dspy.context(lm=dspy.LM("openai/gpt-4o-mini", cache=False), track_usage=True):
553+
with patch(
554+
"litellm.acompletion",
555+
return_value=ModelResponse(
556+
choices=[{"message": {"content": "[[ ## answer ## ]]\nParis"}}],
557+
usage={"total_tokens": 10},
558+
),
559+
):
560+
coroutines = [
561+
program.acall(question="What is the capital of France?"),
562+
program.acall(question="What is the capital of France?"),
563+
program.acall(question="What is the capital of France?"),
564+
program.acall(question="What is the capital of France?"),
565+
]
566+
results = await asyncio.gather(*coroutines)
567+
assert results[0].answer == "Paris"
568+
assert results[1].answer == "Paris"
569+
assert results[0].get_lm_usage()["openai/gpt-4o-mini"]["total_tokens"] == 10
570+
assert results[1].get_lm_usage()["openai/gpt-4o-mini"]["total_tokens"] == 10
571+
assert results[2].get_lm_usage()["openai/gpt-4o-mini"]["total_tokens"] == 10
572+
assert results[3].get_lm_usage()["openai/gpt-4o-mini"]["total_tokens"] == 10
573+
574+
509575
def test_positional_arguments():
510576
program = Predict("question -> answer")
511577
with pytest.raises(ValueError) as e:
@@ -569,9 +635,9 @@ class ConstrainedSignature(dspy.Signature):
569635
@pytest.mark.asyncio
570636
async def test_async_predict():
571637
program = Predict("question -> answer")
572-
dspy.settings.configure(lm=DummyLM([{"answer": "Paris"}]))
573-
result = await program.acall(question="What is the capital of France?")
574-
assert result.answer == "Paris"
638+
with dspy.context(lm=DummyLM([{"answer": "Paris"}])):
639+
result = await program.acall(question="What is the capital of France?")
640+
assert result.answer == "Paris"
575641

576642

577643
def test_predicted_outputs_piped_from_predict_to_lm_call():

0 commit comments

Comments
 (0)