Added post processing (for reasoning tokens) to pipeline #882

Merged (19 commits) on Aug 4, 2025

Changes from all commits
25 changes: 22 additions & 3 deletions docs/source/quicktour.mdx
@@ -30,7 +30,7 @@ lighteval accelerate \
"leaderboard|truthfulqa:mc|0|0"
```

Here, we first choose a backend (either `accelerate`, `nanotron`, or `vllm`), and then specify the model and task(s) to run.
Here, we first choose a backend (either `accelerate`, `nanotron`, `endpoint`, or `vllm`), and then specify the model and task(s) to run.

The syntax for the model arguments is `key1=value1,key2=value2,etc`.
Valid key-value pairs correspond with the backend configuration, and are detailed [below](#Model Arguments).
@@ -104,13 +104,32 @@ GPUs.

## Backend configuration

#### General information

The `model-args` argument takes a string representing a comma-separated list of model
arguments. The allowed arguments vary depending on the backend you use and
correspond to the fields of the model configs.

The model config can be found [here](./package_reference/models).
The model configurations can be found [here](./package_reference/models).
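
For illustration, here is a minimal, hypothetical sketch of how such a
`key=value` string could be split into fields. The real lighteval parser is
richer (typed config fields, validation, YAML support), so `parse_model_args`
below is an assumption, not the library's API:

```python
# Hypothetical helper: split a model-args string into a flat dict.
# lighteval's actual parsing is more involved; illustration only.
def parse_model_args(raw: str) -> dict[str, str]:
    return dict(pair.split("=", 1) for pair in raw.split(","))


print(parse_model_args("model_name=openai-community/gpt2,dtype=float16"))
# {'model_name': 'openai-community/gpt2', 'dtype': 'float16'}
```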

All models allow you to post-process reasoning model predictions, removing the
thinking tokens from the trace used to compute the metrics: pass
`--remove-reasoning-tags` to enable the removal, and `--reasoning-tags` to
specify which tag pairs to strip (defaults to `<think>` and `</think>`).

Here's an example with `mistralai/Magistral-Small-2507`, which uses custom
thinking tags:

```bash
lighteval vllm \
"model_name=mistralai/Magistral-Small-2507,dtype=float16,data_parallel_size=4" \
"lighteval|aime24|0|0" \
--remove-reasoning-tags \
--reasoning-tags="[('[THINK]','[/THINK]')]"
```
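
Conceptually, the post-processing removes everything between each tag pair
before metrics are computed. A minimal sketch of the idea, assuming simple
regex stripping (`strip_reasoning` is illustrative, not lighteval's actual
implementation):

```python
# Minimal sketch of reasoning-tag removal, assuming regex-based stripping;
# the actual lighteval implementation may differ.
import re


def strip_reasoning(text: str, tags: list[tuple[str, str]]) -> str:
    for open_tag, close_tag in tags:
        pattern = re.escape(open_tag) + r".*?" + re.escape(close_tag)
        text = re.sub(pattern, "", text, flags=re.DOTALL)
    return text.lstrip()


print(strip_reasoning("[THINK]step by step...[/THINK]The answer is 42.",
                      [("[THINK]", "[/THINK]")]))
# The answer is 42.
```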


## Nanotron
#### Nanotron

To evaluate a model trained with nanotron on a single GPU:

25 changes: 4 additions & 21 deletions src/lighteval/logging/info_loggers.py
@@ -170,27 +170,10 @@ class Detail:
"""Experiment details of one single example of one task.

Attributes:
example (str): Current task example query
instruction (str): Instruction prepended to the example and few shots.
For example "In this task, you are given information of type x. You need to predict y."
full_prompt (str): Expanded full prompt (instruction if present, then prompt)
num_effective_few_shots (int): Number of actual few shots used for the example.
This depends on the model context length and few-shots samples size: when using effective few-shots,
only `num_effective_few_shots` few-shot samples are kept, allowing
1) each of the used few-shot examples and the prompt to not be truncated
2) this context still allows the model to predict up to the requested max numbers of tokens within its remaining context size.
num_asked_few_shots (int): Initially asked number of few-shot samples.
predictions (list): List of the actual model predictions
input_tokens (list): List of the input tokens given to the model
cont_tokens (list): List of the continuation tokens predicted by the model
truncated (list): Size of the truncations (if it was needed to fit the prompt in the model context length)
padded (list): Size of the padding (if it was needed for the current example)
gold (list): Example gold targets (for generative evaluations)
pred_logits (list): List of the actual model predicted logits
choices (list): List of the possible choices (for multichoice/loglikelihood evaluations)
gold_index (list): Indices of the gold targets among the [`choices`]
metrics (dict): Metric name to current example score

doc (Doc): The [`Doc`] object containing the current example information.
> **Member (Author):** Unrelated to PR; the incorrect doc was updated.

model_response (ModelResponse): The [`ModelResponse`] object containing the model response for the current example.
metric (dict): The metric scores for the current example.
Example: {"accuracy": 0.5, "f1": 0.7, "exact_match": 0.6}
"""

doc: Doc
12 changes: 12 additions & 0 deletions src/lighteval/main_accelerate.py
@@ -60,6 +60,16 @@ def accelerate( # noqa C901
load_responses_from_details_date_id: Annotated[
Optional[str], Option(help="Load responses from details directory.", rich_help_panel=HELP_PANEL_NAME_1)
] = None,
remove_reasoning_tags: Annotated[
bool, Option(help="Remove reasoning tags from responses.", rich_help_panel=HELP_PANEL_NAME_1)
] = True,
reasoning_tags: Annotated[
str | None,
Option(
help="List of reasoning tags (as pairs) to remove from responses. Default is [('<think>', '</think>')].",
rich_help_panel=HELP_PANEL_NAME_1,
),
] = None,
# === saving ===
output_dir: Annotated[
str, Option(help="Output directory for evaluation results.", rich_help_panel=HELP_PANEL_NAME_2)
@@ -131,6 +141,8 @@ def accelerate( # noqa C901
custom_tasks_directory=custom_tasks,
num_fewshot_seeds=num_fewshot_seeds,
max_samples=max_samples,
remove_reasoning_tags=remove_reasoning_tags,
reasoning_tags=reasoning_tags,
load_responses_from_details_date_id=load_responses_from_details_date_id,
)

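Note that `--reasoning-tags` arrives as a string literal such as
`"[('<think>', '</think>')]"`, so it presumably has to be evaluated into a list
of pairs before use. A hedged sketch of one way to do that with
`ast.literal_eval` (`parse_reasoning_tags` is hypothetical, not necessarily the
pipeline's actual parser):

```python
# Hypothetical parsing of the --reasoning-tags string; illustration only,
# not necessarily lighteval's actual implementation.
import ast


def parse_reasoning_tags(raw: str | None) -> list[tuple[str, str]]:
    if raw is None:
        return [("<think>", "</think>")]  # documented default
    tags = ast.literal_eval(raw)  # e.g. "[('[THINK]', '[/THINK]')]"
    return [(str(start), str(end)) for start, end in tags]
```
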
44 changes: 28 additions & 16 deletions src/lighteval/main_custom.py
@@ -31,10 +31,10 @@
app = typer.Typer()


HELP_PANNEL_NAME_1 = "Common Parameters"
HELP_PANNEL_NAME_2 = "Logging Parameters"
HELP_PANNEL_NAME_3 = "Debug Parameters"
HELP_PANNEL_NAME_4 = "Modeling Parameters"
HELP_PANEL_NAME_1 = "Common Parameters"
> **Member (Author):** nit, unrelated to the PR.

HELP_PANEL_NAME_2 = "Logging Parameters"
HELP_PANEL_NAME_3 = "Debug Parameters"
HELP_PANEL_NAME_4 = "Modeling Parameters"


@app.command(rich_help_panel="Evaluation Backends")
@@ -45,46 +45,56 @@ def custom(
tasks: Annotated[str, Argument(help="Comma-separated list of tasks to evaluate on.")],
# === Common parameters ===
dataset_loading_processes: Annotated[
int, Option(help="Number of processes to use for dataset loading.", rich_help_panel=HELP_PANNEL_NAME_1)
int, Option(help="Number of processes to use for dataset loading.", rich_help_panel=HELP_PANEL_NAME_1)
] = 1,
custom_tasks: Annotated[
Optional[str], Option(help="Path to custom tasks directory.", rich_help_panel=HELP_PANNEL_NAME_1)
Optional[str], Option(help="Path to custom tasks directory.", rich_help_panel=HELP_PANEL_NAME_1)
] = None,
num_fewshot_seeds: Annotated[
int, Option(help="Number of seeds to use for few-shot evaluation.", rich_help_panel=HELP_PANNEL_NAME_1)
int, Option(help="Number of seeds to use for few-shot evaluation.", rich_help_panel=HELP_PANEL_NAME_1)
] = 1,
remove_reasoning_tags: Annotated[
bool, Option(help="Remove reasoning tags from responses.", rich_help_panel=HELP_PANEL_NAME_1)
] = True,
reasoning_tags: Annotated[
str | None,
Option(
help="List of reasoning tags (provided as pairs) to remove from responses. Default is [('<think>', '</think>')].",
rich_help_panel=HELP_PANEL_NAME_1,
),
] = None,
# === saving ===
output_dir: Annotated[
str, Option(help="Output directory for evaluation results.", rich_help_panel=HELP_PANNEL_NAME_2)
str, Option(help="Output directory for evaluation results.", rich_help_panel=HELP_PANEL_NAME_2)
] = "results",
results_path_template: Annotated[
str | None,
Option(
help="Template path for where to save the results, you have access to 3 variables, `output_dir`, `org` and `model`. for example a template can be `'{output_dir}/1234/{org}+{model}'`",
rich_help_panel=HELP_PANNEL_NAME_2,
rich_help_panel=HELP_PANEL_NAME_2,
),
] = None,
push_to_hub: Annotated[
bool, Option(help="Push results to the huggingface hub.", rich_help_panel=HELP_PANNEL_NAME_2)
bool, Option(help="Push results to the huggingface hub.", rich_help_panel=HELP_PANEL_NAME_2)
] = False,
push_to_tensorboard: Annotated[
bool, Option(help="Push results to tensorboard.", rich_help_panel=HELP_PANNEL_NAME_2)
bool, Option(help="Push results to tensorboard.", rich_help_panel=HELP_PANEL_NAME_2)
] = False,
public_run: Annotated[
bool, Option(help="Push results and details to a public repo.", rich_help_panel=HELP_PANNEL_NAME_2)
bool, Option(help="Push results and details to a public repo.", rich_help_panel=HELP_PANEL_NAME_2)
] = False,
results_org: Annotated[
Optional[str], Option(help="Organization to push results to.", rich_help_panel=HELP_PANNEL_NAME_2)
Optional[str], Option(help="Organization to push results to.", rich_help_panel=HELP_PANEL_NAME_2)
] = None,
save_details: Annotated[
bool, Option(help="Save detailed, sample per sample, results.", rich_help_panel=HELP_PANNEL_NAME_2)
bool, Option(help="Save detailed, sample per sample, results.", rich_help_panel=HELP_PANEL_NAME_2)
] = False,
# === debug ===
max_samples: Annotated[
Optional[int], Option(help="Maximum number of samples to evaluate on.", rich_help_panel=HELP_PANNEL_NAME_3)
Optional[int], Option(help="Maximum number of samples to evaluate on.", rich_help_panel=HELP_PANEL_NAME_3)
] = None,
job_id: Annotated[
int, Option(help="Optional job id for future refenrence.", rich_help_panel=HELP_PANNEL_NAME_3)
int, Option(help="Optional job id for future refenrence.", rich_help_panel=HELP_PANEL_NAME_3)
] = 0,
):
"""
@@ -113,6 +123,8 @@ def custom(
custom_tasks_directory=custom_tasks,
num_fewshot_seeds=num_fewshot_seeds,
max_samples=max_samples,
remove_reasoning_tags=remove_reasoning_tags,
reasoning_tags=reasoning_tags,
)
pipeline = Pipeline(
tasks=tasks,
48 changes: 48 additions & 0 deletions src/lighteval/main_endpoint.py
@@ -62,6 +62,16 @@ def inference_endpoint(
load_responses_from_details_date_id: Annotated[
Optional[str], Option(help="Load responses from details directory.", rich_help_panel=HELP_PANEL_NAME_1)
] = None,
remove_reasoning_tags: Annotated[
bool, Option(help="Remove reasoning tags from responses.", rich_help_panel=HELP_PANEL_NAME_1)
] = True,
reasoning_tags: Annotated[
str | None,
Option(
help="List of reasoning tags (provided as pairs) to remove from responses. Default is [('<think>', '</think>')].",
rich_help_panel=HELP_PANEL_NAME_1,
),
] = None,
# === saving ===
output_dir: Annotated[
str, Option(help="Output directory for evaluation results.", rich_help_panel=HELP_PANEL_NAME_2)
@@ -136,6 +146,8 @@ def inference_endpoint(
num_fewshot_seeds=num_fewshot_seeds,
max_samples=max_samples,
load_responses_from_details_date_id=load_responses_from_details_date_id,
remove_reasoning_tags=remove_reasoning_tags,
reasoning_tags=reasoning_tags,
)
pipeline = Pipeline(
tasks=tasks,
@@ -175,6 +187,16 @@ def tgi(
load_responses_from_details_date_id: Annotated[
Optional[str], Option(help="Load responses from details directory.", rich_help_panel=HELP_PANEL_NAME_1)
] = None,
remove_reasoning_tags: Annotated[
bool, Option(help="Remove reasoning tags from responses.", rich_help_panel=HELP_PANEL_NAME_1)
] = True,
reasoning_tags: Annotated[
str | None,
Option(
help="List of reasoning tags (provided as pairs) to remove from responses. Default is [('<think>', '</think>')].",
rich_help_panel=HELP_PANEL_NAME_1,
),
] = None,
# === saving ===
output_dir: Annotated[
str, Option(help="Output directory for evaluation results.", rich_help_panel=HELP_PANEL_NAME_2)
@@ -253,6 +275,8 @@ def tgi(
num_fewshot_seeds=num_fewshot_seeds,
max_samples=max_samples,
load_responses_from_details_date_id=load_responses_from_details_date_id,
remove_reasoning_tags=remove_reasoning_tags,
reasoning_tags=reasoning_tags,
)
pipeline = Pipeline(
tasks=tasks,
@@ -295,6 +319,16 @@ def litellm(
load_responses_from_details_date_id: Annotated[
Optional[str], Option(help="Load responses from details directory.", rich_help_panel=HELP_PANEL_NAME_1)
] = None,
remove_reasoning_tags: Annotated[
bool, Option(help="Remove reasoning tags from responses.", rich_help_panel=HELP_PANEL_NAME_1)
] = True,
reasoning_tags: Annotated[
str | None,
Option(
help="List of reasoning tags (provided as pairs) to remove from responses. Default is [('<think>', '</think>')].",
rich_help_panel=HELP_PANEL_NAME_1,
),
] = None,
# === saving ===
output_dir: Annotated[
str, Option(help="Output directory for evaluation results.", rich_help_panel=HELP_PANEL_NAME_2)
@@ -376,6 +410,8 @@ def litellm(
num_fewshot_seeds=num_fewshot_seeds,
max_samples=max_samples,
load_responses_from_details_date_id=load_responses_from_details_date_id,
remove_reasoning_tags=remove_reasoning_tags,
reasoning_tags=reasoning_tags,
)
pipeline = Pipeline(
tasks=tasks,
@@ -449,6 +485,16 @@ def inference_providers(
rich_help_panel=HELP_PANEL_NAME_2,
),
] = False,
remove_reasoning_tags: Annotated[
bool, Option(help="Remove reasoning tags from responses.", rich_help_panel=HELP_PANEL_NAME_1)
] = True,
reasoning_tags: Annotated[
str | None,
Option(
help="List of reasoning tags (provided as pairs) to remove from responses. Default is [('<think>', '</think>')].",
rich_help_panel=HELP_PANEL_NAME_1,
),
] = None,
# === debug ===
max_samples: Annotated[
Optional[int], Option(help="Maximum number of samples to evaluate on.", rich_help_panel=HELP_PANEL_NAME_3)
@@ -493,6 +539,8 @@ def inference_providers(
num_fewshot_seeds=num_fewshot_seeds,
max_samples=max_samples,
load_responses_from_details_date_id=None,
remove_reasoning_tags=remove_reasoning_tags,
reasoning_tags=reasoning_tags,
)
pipeline = Pipeline(
tasks=tasks,
12 changes: 12 additions & 0 deletions src/lighteval/main_nanotron.py
@@ -43,6 +43,16 @@ def nanotron(
str, Option(help="Path to the nanotron checkpoint YAML or python config file, potentially on s3.")
],
lighteval_config_path: Annotated[str, Option(help="Path to a YAML config to be used for the evaluation.")],
remove_reasoning_tags: Annotated[
bool, Option(help="Remove reasoning tags from responses.", rich_help_panel=HELP_PANEL_NAME_1)
] = True,
reasoning_tags: Annotated[
str | None,
Option(
help="List of reasoning tags (provided as pairs) to remove from responses. Default is [('<think>', '</think>')].",
rich_help_panel=HELP_PANEL_NAME_1,
),
] = None,
):
"""
Evaluate models using nanotron as backend.
@@ -101,6 +111,8 @@
custom_tasks_directory=lighteval_config.tasks.custom_tasks,
num_fewshot_seeds=1,
max_samples=lighteval_config.tasks.max_samples,
remove_reasoning_tags=remove_reasoning_tags,
reasoning_tags=reasoning_tags,
)

pipeline = Pipeline(
12 changes: 12 additions & 0 deletions src/lighteval/main_sglang.py
@@ -53,6 +53,16 @@ def sglang(
load_responses_from_details_date_id: Annotated[
Optional[str], Option(help="Load responses from details directory.", rich_help_panel=HELP_PANEL_NAME_1)
] = None,
remove_reasoning_tags: Annotated[
bool, Option(help="Remove reasoning tags from responses.", rich_help_panel=HELP_PANEL_NAME_1)
] = True,
reasoning_tags: Annotated[
str | None,
Option(
help="List of reasoning tags (provided as pairs) to remove from responses. Default is [('<think>', '</think>')].",
rich_help_panel=HELP_PANEL_NAME_1,
),
] = None,
# === saving ===
output_dir: Annotated[
str, Option(help="Output directory for evaluation results.", rich_help_panel=HELP_PANEL_NAME_2)
@@ -122,6 +132,8 @@
num_fewshot_seeds=num_fewshot_seeds,
max_samples=max_samples,
load_responses_from_details_date_id=load_responses_from_details_date_id,
remove_reasoning_tags=remove_reasoning_tags,
reasoning_tags=reasoning_tags,
)

if model_args.endswith(".yaml"):
12 changes: 12 additions & 0 deletions src/lighteval/main_vllm.py
@@ -56,6 +56,16 @@ def vllm(
load_responses_from_details_date_id: Annotated[
Optional[str], Option(help="Load responses from details directory.", rich_help_panel=HELP_PANEL_NAME_1)
] = None,
remove_reasoning_tags: Annotated[
bool, Option(help="Remove reasoning tags from responses.", rich_help_panel=HELP_PANEL_NAME_1)
] = True,
reasoning_tags: Annotated[
str | None,
Option(
help="List of reasoning tags (provided as pairs) to remove from responses. Default is [('<think>', '</think>')].",
rich_help_panel=HELP_PANEL_NAME_1,
),
] = None,
# === saving ===
output_dir: Annotated[
str, Option(help="Output directory for evaluation results.", rich_help_panel=HELP_PANEL_NAME_2)
@@ -126,6 +136,8 @@
max_samples=max_samples,
cot_prompt=cot_prompt,
load_responses_from_details_date_id=load_responses_from_details_date_id,
remove_reasoning_tags=remove_reasoning_tags,
reasoning_tags=reasoning_tags,
)

if model_args.endswith(".yaml"):
2 changes: 1 addition & 1 deletion src/lighteval/metrics/dynamic_metrics.py
@@ -236,7 +236,7 @@ def add_to_specifics_with_timeout(

def sample_level_fn(doc: Doc, model_response: ModelResponse) -> float:
golds = doc.get_golds()
predictions = model_response.text
predictions = model_response.final_text

gold_extraction_regexes = get_extraction_regexes(doc, gold_extraction_target, language)
pred_extraction_regexes = get_extraction_regexes(doc, pred_extraction_target, language)
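
The metric now reads `final_text` instead of the raw `text`. A plausible
reading, sketched under the assumption that `final_text` is the post-processed
generation with reasoning tags removed (the class name and fields below are
illustrative, not the real `ModelResponse`):

```python
# Hedged sketch of the assumed text / final_text relationship;
# the real ModelResponse has more fields and may compute this differently.
import re
from dataclasses import dataclass, field


@dataclass
class ModelResponseSketch:
    text: list[str]  # raw generations, reasoning tags included
    reasoning_tags: list[tuple[str, str]] = field(
        default_factory=lambda: [("<think>", "</think>")]
    )

    @property
    def final_text(self) -> list[str]:
        # Post-processed generations with reasoning spans stripped.
        cleaned = []
        for t in self.text:
            for start, end in self.reasoning_tags:
                t = re.sub(re.escape(start) + r".*?" + re.escape(end), "", t, flags=re.DOTALL)
            cleaned.append(t.strip())
        return cleaned


resp = ModelResponseSketch(text=["<think>reasoning...</think>The answer is 42."])
print(resp.final_text)  # ['The answer is 42.']
```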