Raise temperature when doing multiple rollouts (and warn otherwise) (#8748)

okhat · web-flow · commit 03543dfda3fd · 2025-08-31T14:19:22.000-04:00
* Warn once per LM instance for zero-temp rollout

* Remove duplicate warnings import in test_lm.py
diff --git a/docs/docs/cheatsheet.md b/docs/docs/cheatsheet.md
@@ -10,12 +10,13 @@ This page will contain snippets for frequent usage patterns.
 
 ### Forcing fresh LM outputs
 
-DSPy caches LM calls. Provide a unique ``rollout_id`` to bypass an existing
-cache entry while still caching the new result:
+DSPy caches LM calls. Provide a unique ``rollout_id`` to bypass an existing cache
+entry while still caching the new result, and set a non-zero ``temperature``
+(e.g., 1.0) so that the fresh request can return a different output:
 
 ```python
 predict = dspy.Predict("question -> answer")
-predict(question="1+1", config={"rollout_id": 1})
+predict(question="1+1", config={"rollout_id": 1, "temperature": 1.0})
 ```
 
 ### dspy.Signature
diff --git a/docs/docs/learn/programming/language_models.md b/docs/docs/learn/programming/language_models.md
@@ -167,28 +167,29 @@ gpt_4o_mini = dspy.LM('openai/gpt-4o-mini', temperature=0.9, max_tokens=3000, st
 By default LMs in DSPy are cached. If you repeat the same call, you will get the same outputs. But you can turn off caching by setting `cache=False`.
 
 If you want to keep caching enabled but force a new request (for example, to obtain diverse outputs),
-pass a unique `rollout_id` in your call. DSPy hashes both the inputs and the `rollout_id` when
-looking up a cache entry, so different values force a new LM request while
+pass a unique `rollout_id` and set a non-zero `temperature` in your call. DSPy hashes both the inputs
+and the `rollout_id` when looking up a cache entry, so different values force a new LM request while
 still caching future calls with the same inputs and `rollout_id`. The ID is also recorded in
-`lm.history`, which makes it easy to track or compare different rollouts during experiments.
+`lm.history`, which makes it easy to track or compare different rollouts during experiments. Changing
+only the `rollout_id` while keeping `temperature=0` will not affect the LM's output.
 
 ```python linenums="1"
-lm("Say this is a test!", rollout_id=1)
+lm("Say this is a test!", rollout_id=1, temperature=1.0)
 ```
 
 You can pass these LM kwargs directly to DSPy modules as well. Supplying them at
 initialization sets the defaults for every call:
 
 ```python linenums="1"
-predict = dspy.Predict("question -> answer", rollout_id=1)
+predict = dspy.Predict("question -> answer", rollout_id=1, temperature=1.0)
 ```
 
 To override them for a single invocation, provide a ``config`` dictionary when
 calling the module:
 
 ```python linenums="1"
 predict = dspy.Predict("question -> answer")
-predict(question="What is 1 + 52?", config={"rollout_id": 5})
+predict(question="What is 1 + 52?", config={"rollout_id": 5, "temperature": 1.0})
 ```
 
 In both cases, ``rollout_id`` is forwarded to the underlying LM, affects
diff --git a/dspy/clients/base_lm.py b/dspy/clients/base_lm.py
@@ -113,8 +113,8 @@ def copy(self, **kwargs):
         """Returns a copy of the language model with possibly updated parameters.
 
         Any provided keyword arguments update the corresponding attributes or LM kwargs of
-        the copy. For example, ``lm.copy(rollout_id=1)`` returns an LM whose requests use a
-        different rollout ID to bypass cache collisions.
+        the copy. For example, ``lm.copy(rollout_id=1, temperature=1.0)`` returns an LM whose
+        requests use a different rollout ID at non-zero temperature to bypass cache collisions.
         """
 
         import copy
@@ -130,6 +130,8 @@ def copy(self, **kwargs):
                     new_instance.kwargs.pop(key, None)
                 else:
                     new_instance.kwargs[key] = value
+        if hasattr(new_instance, "_warned_zero_temp_rollout"):
+            new_instance._warned_zero_temp_rollout = False
 
         return new_instance
 
diff --git a/dspy/clients/lm.py b/dspy/clients/lm.py
@@ -2,6 +2,7 @@
 import os
 import re
 import threading
+import warnings
 from typing import Any, Literal, cast
 
 import litellm
@@ -61,8 +62,9 @@ def __init__(
                 from the models available for inference.
             rollout_id: Optional integer used to differentiate cache entries for otherwise
                 identical requests. Different values bypass DSPy's caches while still caching
-                future calls with the same inputs and rollout ID. This argument is stripped
-                before sending requests to the provider.
+                future calls with the same inputs and rollout ID. Note that `rollout_id`
+                only affects generation when `temperature` is non-zero. This argument is
+                stripped before sending requests to the provider.
         """
         # Remember to update LM.copy() if you modify the constructor!
         self.model = model
@@ -75,6 +77,7 @@ def __init__(
         self.finetuning_model = finetuning_model
         self.launch_kwargs = launch_kwargs or {}
         self.train_kwargs = train_kwargs or {}
+        self._warned_zero_temp_rollout = False
 
         # Handle model-specific configuration for different model families
         model_family = model.split("/")[-1].lower() if "/" in model else model.lower()
@@ -96,6 +99,20 @@ def __init__(
             if self.kwargs.get("rollout_id") is None:
                 self.kwargs.pop("rollout_id", None)
 
+        self._warn_zero_temp_rollout(self.kwargs.get("temperature"), self.kwargs.get("rollout_id"))
+
+    def _warn_zero_temp_rollout(self, temperature: float | None, rollout_id):
+        if (
+            not self._warned_zero_temp_rollout  # warn at most once per instance
+            and rollout_id is not None
+            and (temperature is None or temperature == 0)  # an unset temperature is handled like 0
+        ):
+            warnings.warn(
+                "rollout_id has no effect when temperature=0; set temperature>0 to bypass the cache.",
+                stacklevel=3,
+            )
+            self._warned_zero_temp_rollout = True  # remember that this instance already warned
+
     def _get_cached_completion_fn(self, completion_fn, cache):
         ignored_args_for_cache_key = ["api_key", "api_base", "base_url"]
         if cache:
@@ -115,6 +132,7 @@ def forward(self, prompt=None, messages=None, **kwargs):
 
         messages = messages or [{"role": "user", "content": prompt}]
         kwargs = {**self.kwargs, **kwargs}
+        self._warn_zero_temp_rollout(kwargs.get("temperature"), kwargs.get("rollout_id"))
         if kwargs.get("rollout_id") is None:
             kwargs.pop("rollout_id", None)
 
@@ -145,6 +163,7 @@ async def aforward(self, prompt=None, messages=None, **kwargs):
 
         messages = messages or [{"role": "user", "content": prompt}]
         kwargs = {**self.kwargs, **kwargs}
+        self._warn_zero_temp_rollout(kwargs.get("temperature"), kwargs.get("rollout_id"))
         if kwargs.get("rollout_id") is None:
             kwargs.pop("rollout_id", None)
 
diff --git a/dspy/predict/best_of_n.py b/dspy/predict/best_of_n.py
@@ -14,8 +14,9 @@ def __init__(
         fail_count: int | None = None,
     ):
         """
-        Runs a module up to `N` times with different rollout IDs and returns the best prediction
-        out of `N` attempts or the first prediction that passes the `threshold`.
+        Runs a module up to `N` times with different rollout IDs at `temperature=1.0` and
+        returns the best prediction out of `N` attempts or the first prediction that passes the
+        `threshold`.
 
         Args:
             module (Module): The module to run.
@@ -53,14 +54,12 @@ def one_word_answer(args, pred):
 
     def forward(self, **kwargs):
         lm = self.module.get_lm() or dspy.settings.lm
-        base_rollout = lm.kwargs.get("rollout_id")
-        start = 0 if base_rollout is None else base_rollout
+        start = lm.kwargs.get("rollout_id", 0)
         rollout_ids = [start + i for i in range(self.N)]
-        rollout_ids = list(dict.fromkeys(rollout_ids))[: self.N]
         best_pred, best_trace, best_reward = None, None, -float("inf")
 
         for idx, rid in enumerate(rollout_ids):
-            lm_ = lm.copy(rollout_id=rid)
+            lm_ = lm.copy(rollout_id=rid, temperature=1.0)
             mod = self.module.deepcopy()
             mod.set_lm(lm_)
 
diff --git a/dspy/predict/predict.py b/dspy/predict/predict.py
@@ -27,8 +27,8 @@ class Predict(Module, Parameter):
             invocation by passing a ``config`` dictionary when calling the
             module. For example::
 
-                predict = dspy.Predict("q -> a", rollout_id=1)
-                predict(q="What is 1 + 52?", config={"rollout_id": 2})
+                predict = dspy.Predict("q -> a", rollout_id=1, temperature=1.0)
+                predict(q="What is 1 + 52?", config={"rollout_id": 2, "temperature": 1.0})
     """
 
     def __init__(self, signature: str | type[Signature], callbacks: list[BaseCallback] | None = None, **config):
diff --git a/dspy/predict/refine.py b/dspy/predict/refine.py
@@ -48,7 +48,8 @@ def __init__(
         fail_count: int | None = None,
     ):
         """
-        Refines a module by running it up to N times with different rollout IDs and returns the best prediction.
+        Refines a module by running it up to N times with different rollout IDs at `temperature=1.0`
+        and returns the best prediction.
 
         This module runs the provided module multiple times with varying rollout identifiers and selects
         either the first prediction that exceeds the specified threshold or the one with the highest reward.
@@ -96,16 +97,14 @@ def one_word_answer(args, pred):
 
     def forward(self, **kwargs):
         lm = self.module.get_lm() or dspy.settings.lm
-        base_rollout = lm.kwargs.get("rollout_id")
-        start = 0 if base_rollout is None else base_rollout
+        start = lm.kwargs.get("rollout_id", 0)
         rollout_ids = [start + i for i in range(self.N)]
-        rollout_ids = list(dict.fromkeys(rollout_ids))[: self.N]
         best_pred, best_trace, best_reward = None, None, -float("inf")
         advice = None
         adapter = dspy.settings.adapter or dspy.ChatAdapter()
 
         for idx, rid in enumerate(rollout_ids):
-            lm_ = lm.copy(rollout_id=rid)
+            lm_ = lm.copy(rollout_id=rid, temperature=1.0)
             mod = self.module.deepcopy()
             mod.set_lm(lm_)
 
diff --git a/dspy/propose/grounded_proposer.py b/dspy/propose/grounded_proposer.py
@@ -284,7 +284,8 @@ def __init__(
         set_tip_randomly=True,
         set_history_randomly=True,
         verbose=False,
-        rng=None
+        rng=None,
+        init_temperature: float = 1.0,
     ):
         super().__init__()
         self.program_aware = program_aware
@@ -299,6 +300,7 @@ def __init__(
         self.rng = rng or random
 
         self.prompt_model = get_prompt_model(prompt_model)
+        self.init_temperature = init_temperature
 
         self.program_code_string = None
         if self.program_aware:
@@ -412,7 +414,10 @@ def propose_instruction_for_predictor(
         )
 
         # Generate a new instruction for our predictor using a unique rollout id to bypass cache
-        rollout_lm = self.prompt_model.copy(rollout_id=self.rng.randint(0, 10**9))
+        rollout_lm = self.prompt_model.copy(
+            rollout_id=self.rng.randint(0, 10**9),
+            temperature=self.init_temperature,
+        )
 
         with dspy.settings.context(lm=rollout_lm):
             proposed_instruction = instruction_generator(
diff --git a/dspy/teleprompt/bootstrap.py b/dspy/teleprompt/bootstrap.py
@@ -47,6 +47,9 @@ def __init__(
         """A Teleprompter class that composes a set of demos/examples to go into a predictor's prompt.
         These demos come from a combination of labeled examples in the training set, and bootstrapped demos.
 
+        Each bootstrap round after the first copies the LM with a new ``rollout_id`` at
+        ``temperature=1.0`` to bypass caches and gather diverse traces.
+
         Args:
             metric (Callable): A function that compares an expected value and predicted value,
                 outputting the result of that comparison.
@@ -181,7 +184,8 @@ def _bootstrap_one_example(self, example, round_idx=0):
         try:
             with dspy.settings.context(trace=[], **self.teacher_settings):
                 lm = dspy.settings.lm
-                lm = lm.copy(rollout_id=round_idx) if round_idx > 0 else lm
+                # Use a fresh rollout with temperature=1.0 to bypass caches.
+                lm = lm.copy(rollout_id=round_idx, temperature=1.0) if round_idx > 0 else lm
                 new_settings = {"lm": lm} if round_idx > 0 else {}
 
                 with dspy.settings.context(**new_settings):
diff --git a/dspy/teleprompt/infer_rules.py b/dspy/teleprompt/infer_rules.py
@@ -143,7 +143,10 @@ class CustomRulesInduction(dspy.Signature):
 
     def forward(self, examples_text):
         with dspy.settings.context(**self.teacher_settings):
-            lm = dspy.settings.lm.copy(rollout_id=self.rng.randint(0, 10**9))
+            # Generate rules with a fresh rollout and non-zero temperature.
+            lm = dspy.settings.lm.copy(
+                rollout_id=self.rng.randint(0, 10**9), temperature=1.0
+            )
             with dspy.settings.context(lm=lm):
                 rules = self.rules_induction(examples_text=examples_text).natural_language_rules
 
diff --git a/dspy/teleprompt/mipro_optimizer_v2.py b/dspy/teleprompt/mipro_optimizer_v2.py
@@ -58,7 +58,7 @@ def __init__(
         num_threads: int | None = None,
         max_errors: int | None = None,
         seed: int = 9,
-        init_temperature: float = 0.5,
+        init_temperature: float = 1.0,
         verbose: bool = False,
         track_stats: bool = True,
         log_dir: str | None = None,
@@ -400,6 +400,7 @@ def _propose_instructions(
             set_history_randomly=False,
             verbose=self.verbose,
             rng=self.rng,
+            init_temperature=self.init_temperature,
         )
 
         logger.info(f"\nProposing N={self.num_instruct_candidates} instructions...\n")
diff --git a/dspy/teleprompt/simba_utils.py b/dspy/teleprompt/simba_utils.py
@@ -14,11 +14,9 @@
 
 def prepare_models_for_resampling(program: dspy.Module, n: int):
     lm = program.get_lm() or dspy.settings.lm
-    base_rollout = lm.kwargs.get("rollout_id")
-    start = 0 if base_rollout is None else base_rollout
+    start = lm.kwargs.get("rollout_id") or 0  # a missing or explicitly-None rollout_id starts at 0
     rollout_ids = [start + i for i in range(n)]
-    rollout_ids = list(dict.fromkeys(rollout_ids))[:n]
-    return [lm.copy(rollout_id=r) for r in rollout_ids]
+    return [lm.copy(rollout_id=r, temperature=1.0) for r in rollout_ids]
 
 
 def wrap_program(program: dspy.Module, metric: Callable):
diff --git a/tests/clients/test_lm.py b/tests/clients/test_lm.py
@@ -1,5 +1,6 @@
 import json
 import time
+import warnings
 from unittest import mock
 from unittest.mock import patch
 
@@ -142,6 +143,25 @@ def fake_completion(*, cache, num_retries, retry_strategy, **request):
     dspy.cache = original_cache
 
 
+def test_zero_temperature_rollout_warns_once(monkeypatch):
+    def fake_completion(*, cache, num_retries, retry_strategy, **request):  # stub for litellm.completion
+        return ModelResponse(
+            choices=[Choices(message=Message(role="assistant", content="Hi!"))],
+            usage={"prompt_tokens": 1, "completion_tokens": 1, "total_tokens": 2},
+            model="openai/dspy-test-model",
+        )
+
+    monkeypatch.setattr(litellm, "completion", fake_completion)
+
+    lm = dspy.LM(model="openai/dspy-test-model", model_type="chat")
+    with pytest.warns(UserWarning, match="rollout_id has no effect"):  # no temperature is passed here
+        lm("Query", rollout_id=1)
+    with warnings.catch_warnings(record=True) as record:
+        warnings.simplefilter("always")  # record every warning so a repeat would be captured
+        lm("Query", rollout_id=2)
+        assert len(record) == 0  # the same instance must stay silent the second time
+
+
 def test_text_lms_can_be_queried(litellm_test_server):
     api_base, _ = litellm_test_server
     expected_response = ["Hi!"]
diff --git a/tests/propose/test_grounded_proposer.py b/tests/propose/test_grounded_proposer.py
@@ -37,10 +37,21 @@ def test_propose_instructions_for_program(demo_candidates):
     ],
 )
 def test_propose_instruction_for_predictor(demo_candidates):
-    prompt_model = DummyLM([{"proposed_instruction": "instruction"}] * 10)
+    class TrackingDummyLM(DummyLM):
+        def copy(self, **kwargs):
+            self.last_copy_kwargs = kwargs
+            return super().copy(**kwargs)
+
+    prompt_model = TrackingDummyLM([{"proposed_instruction": "instruction"}] * 10)
     program = Predict("question -> answer")
 
-    proposer = GroundedProposer(prompt_model=prompt_model, program=program, trainset=[], verbose=False)
+    proposer = GroundedProposer(
+        prompt_model=prompt_model,
+        program=program,
+        trainset=[],
+        verbose=False,
+        init_temperature=0.7,
+    )
     result = proposer.propose_instruction_for_predictor(
         program=program,
         predictor=None,
@@ -51,3 +62,4 @@ def test_propose_instruction_for_predictor(demo_candidates):
         tip=None,
     )
     assert result == "instruction"
+    assert prompt_model.last_copy_kwargs["temperature"] == 0.7