1 change: 1 addition & 0 deletions examples/rl/environment_configs/gsm8k.yaml
@@ -1,5 +1,6 @@
- agent_type: examples.rl.environments.math.gsm8k_agent.GSM8KAgent
agent_args:
answer_format: "boxed"
format_reward: 0.5
weight: 1.0
evaluation_only: false
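For reference, the `agent_args` block is the set of keyword arguments handed to the agent constructor, so this config corresponds to something like the direct instantiation below. This is a minimal sketch assuming `GSM8KAgent` accepts the `MathAgent` kwargs; the exact plumbing lives in the RL harness.

```python
# Hypothetical illustration of how agent_args in gsm8k.yaml map onto the
# agent constructor. weight/evaluation_only are env-level keys, not kwargs.
from examples.rl.environments.math.gsm8k_agent import GSM8KAgent

agent = GSM8KAgent(
    answer_format="boxed",  # request \boxed{} answers instead of <answer> tags
    format_reward=0.5,      # reward for well-formatted but incorrect answers
)
```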
91 changes: 65 additions & 26 deletions examples/rl/environments/math/math_agent.py
@@ -22,7 +22,7 @@
), "math_verify is not installed but now required. Install it using `pip install math-verify` to continue."

NEGATIVE_REWARD = 0.0

PARTIAL_END_REWARD = 0.75

class MathAgent(RewardOnlyAgent):
def __init__(self, format_reward: float = 0.0, answer_format: str = "tagged", **kwargs):
@@ -37,32 +37,70 @@ def compute_score(self, response: str, golden: dict, golden_key: str = "answer")
Uses the final answer in the response string to compute the score.
"""
        # Allow <answer> tags or \boxed{} tags (arguably a bit of cheating in favor of DeepSeek-distilled models)
for pattern in [
r'<answer>(.*?)</answer>',
r"\\boxed\{((?:[^{}]|\{(?:[^{}]|\{[^{}]*\})*\})*)\}",
]:
match = re.finditer(pattern, response, re.DOTALL)
matches = list(match)
if matches:
final_answer = matches[-1].group(1).strip()
break
else:
# Did not format the answer correctly
return NEGATIVE_REWARD
matched_format = None
end_tokens = ["<|end_of_text|>", "<|endoftext|>", "</s>"]

try:
parsed_answer = parse(final_answer)
except ValueError as e:
print("Failed to parse the answer.")
traceback.print_stack()
return NEGATIVE_REWARD
# Only an answer immediately followed by a known end token yields 1.0 reward.
answer_tag_pattern = r'<answer>(.*?)</answer>'
answer_tag_match = list(re.finditer(answer_tag_pattern, response, re.DOTALL))
if answer_tag_match:
# Only consider the last occurrence
last_match = answer_tag_match[-1]
final_answer = last_match.group(1).strip()
after = response[last_match.end():].lstrip() # strip whitespace between </answer> and token

correct_answer = verify(str(golden[golden_key]), parsed_answer)
if correct_answer:
return 1.0
try:
parsed_answer = parse(final_answer)
except ValueError as e:
print("Failed to parse the answer.")
                traceback.print_exc()
return NEGATIVE_REWARD

correct_answer = verify(str(golden[golden_key]), parsed_answer)
if correct_answer:
                # Accept any of the known end tokens as a valid terminator, for flexibility.
for token in end_tokens:
if after.startswith(token):
return 1.0
# If the end token is present later (extra text before it), give partial credit.
for token in end_tokens:
if token in after:
return PARTIAL_END_REWARD
                # If the answer is correct but the immediate end token is missing, give the format reward (not NEGATIVE_REWARD).
return self.format_reward
else:
# Incorrect answer, regardless of format/end-of-text
return self.format_reward
else:
# Formatting is correct but the answer is incorrect
return self.format_reward
# Fallback: check boxed answer format for diagnostic/format reward as before
boxed_pattern = r"\\boxed\{((?:[^{}]|\{(?:[^{}]|\{[^{}]*\})*\})*)\}"
boxed_match = list(re.finditer(boxed_pattern, response, re.DOTALL))
if boxed_match:
last_match = boxed_match[-1]
final_answer = last_match.group(1).strip()
after = response[last_match.end():].lstrip()
try:
parsed_answer = parse(final_answer)
except ValueError as e:
print("Failed to parse the answer.")
                traceback.print_exc()
return NEGATIVE_REWARD

correct_answer = verify(str(golden[golden_key]), parsed_answer)
if correct_answer:
for token in end_tokens:
if after.startswith(token):
return 1.0
for token in end_tokens:
if token in after:
return PARTIAL_END_REWARD
return self.format_reward
else:
# Formatting is correct but the answer is incorrect
return self.format_reward
else:
# Did not format the answer correctly
return NEGATIVE_REWARD

def make_prefix(self, problem_key: str = "problem", **kwargs) -> str:
"""Take a string math problem and return the prompt. Supports requesting tagged or boxed answers. Supports chat mode prompts."""
@@ -80,6 +118,7 @@ def make_prefix(self, problem_key: str = "problem", **kwargs) -> str:
The question will be a word math problem. Show your work in <think> </think> tags.
{answer_format}
User: {kwargs[problem_key]}
Assistant: Let me solve this step by step.
<think>"""
Assistant: """
# Assistant: Let me solve this step by step.
# <think>"""
return prefix
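To summarize the reward schedule this diff introduces, here is a standalone sketch of the end-token tiers. Correctness checking is stubbed out as a boolean (the real code uses math_verify's `parse`/`verify`); `END_TOKENS` and `PARTIAL_END_REWARD` are taken from the diff above.

```python
# Minimal sketch of the new end-token reward tiers, with correctness assumed
# already known. Not the actual implementation; parsing is stubbed out.
END_TOKENS = ["<|end_of_text|>", "<|endoftext|>", "</s>"]
PARTIAL_END_REWARD = 0.75

def end_token_reward(after_answer: str, is_correct: bool, format_reward: float) -> float:
    if not is_correct:
        return format_reward       # wrong answer, regardless of termination
    after = after_answer.lstrip()
    if any(after.startswith(tok) for tok in END_TOKENS):
        return 1.0                 # correct and immediately terminated
    if any(tok in after for tok in END_TOKENS):
        return PARTIAL_END_REWARD  # correct, but extra text before the end token
    return format_reward           # correct, but no end token at all

assert end_token_reward("<|endoftext|>", True, 0.5) == 1.0
assert end_token_reward("Hope that helps! </s>", True, 0.5) == 0.75
assert end_token_reward("", True, 0.5) == 0.5
```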
115 changes: 115 additions & 0 deletions examples/rl/model_configs/dsv2_lite_moe.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
#!/bin/bash
Contributor:
It might be helpful to clarify that this is only architecturally identical to DSv2-lite. The model weights are different.

Contributor Author:
We're not actually pointing to any weights here. If someone did have the genuine dsv2_lite weights converted to mcore format, though, would we expect them to work correctly?

TP=${TP:-2}
PP=${PP:-1}
EP=${EP:-1}
NODES_REQUIRED=${NODES_REQUIRED:-1}
LLM="dsv2_lite"

echo "Using Deepseek-v2-lite model checkpoint (not the exact model weights..)"
SCRIPT_PATH="${BASH_SOURCE[0]}"
source $(dirname $SCRIPT_PATH)/common.sh

# All of these values can be overridden from the environment.
# Running without overrides gives good out-of-the-box
# performance for the established env configs.
if [ "$(basename "$ENV_CONFIG")" = "dapo.yaml" ]; then
echo "Using DAPO environment config"
GRPO_CLAMP_EPS_LOWER=${GRPO_CLAMP_EPS_LOWER:-0.2}
GRPO_CLAMP_EPS_UPPER=${GRPO_CLAMP_EPS_UPPER:-0.28}
MAX_INFERENCE_BS=${MAX_INFERENCE_BS:-32}
GRPO_GROUP_SIZE=${GRPO_GROUP_SIZE:-16}
GRPO_PROMPTS_PER_STEP=${GRPO_PROMPTS_PER_STEP:-64}
GRPO_ITERATIONS=${GRPO_ITERATIONS:-1}
GRPO_KL_BETA=${GRPO_KL_BETA:-"0.0"}
TRAINING_BATCH_SIZE=${TRAINING_BATCH_SIZE:-1024}
MICRO_BATCH_SIZE=${MICRO_BATCH_SIZE:-1}
MAX_SEQ_LENGTH=${MAX_SEQ_LENGTH:-11999}
EXIT_INTERVAL=${EXIT_INTERVAL:-16}
CHKPT_SAVE_INTERVAL=${CHKPT_SAVE_INTERVAL:-16}
else
# Some default values if config is unsupported.
echo "Undected environment config, using default values"
GRPO_CLAMP_EPS_LOWER=${GRPO_CLAMP_EPS_LOWER:-0.2}
GRPO_CLAMP_EPS_UPPER=${GRPO_CLAMP_EPS_UPPER:-0.4}
MAX_INFERENCE_BS=${MAX_INFERENCE_BS:-64}
GRPO_GROUP_SIZE=${GRPO_GROUP_SIZE:-8}
GRPO_PROMPTS_PER_STEP=${GRPO_PROMPTS_PER_STEP:-8}
GRPO_ITERATIONS=${GRPO_ITERATIONS:-1}
GRPO_KL_BETA=${GRPO_KL_BETA:-"0.0"}
TRAINING_BATCH_SIZE=${TRAINING_BATCH_SIZE:-64}
MICRO_BATCH_SIZE=${MICRO_BATCH_SIZE:-1}
MAX_SEQ_LENGTH=${MAX_SEQ_LENGTH:-8192}
EXIT_INTERVAL=${EXIT_INTERVAL:-20}
CHKPT_SAVE_INTERVAL=${CHKPT_SAVE_INTERVAL:-20}
fi

ENV_DEPENDENT="\
--micro-batch-size $MICRO_BATCH_SIZE \
--global-batch-size $TRAINING_BATCH_SIZE \
--grpo-group-size $GRPO_GROUP_SIZE \
--grpo-prompts-per-step $GRPO_PROMPTS_PER_STEP \
--grpo-iterations $GRPO_ITERATIONS \
--grpo-clamp-eps-lower $GRPO_CLAMP_EPS_LOWER \
--grpo-clamp-eps-upper $GRPO_CLAMP_EPS_UPPER \
--grpo-kl-beta $GRPO_KL_BETA \
--langrl-env-config $ENV_CONFIG "


MODEL_OPTIONS="\
--use-checkpoint-args \
--enable-experimental \
--cross-entropy-loss-fusion \
--cross-entropy-fusion-impl native \
--moe-aux-loss-coeff 0.0 \
--moe-router-dtype fp64 \
--moe-router-load-balancing-type none \
--moe-token-dispatcher-type alltoall \
--attention-backend flash \
--disable-gloo-process-groups \
--grpo-default-temperature 1.2 \
--grpo-default-top-p 0.95 \
--disable-chunked-prefill \
--calculate-per-token-loss \
--seq-length $MAX_SEQ_LENGTH \
--inference-max-seq-length $MAX_SEQ_LENGTH \
--inference-max-batch-size $MAX_INFERENCE_BS \
--pretrained-checkpoint $CHECKPOINT \
--distributed-timeout-minutes 60 \
--use-mcore-models \
--no-mmap-bin-files \
--disable-bias-linear \
--norm-epsilon 1e-5 \
--init-method-std 0.014 \
--exit-duration-in-mins 5750 \
--max-position-embeddings 8192 \
--tensor-model-parallel-size $TP \
--pipeline-model-parallel-size $PP \
--expert-model-parallel-size $EP \
--no-masked-softmax-fusion \
--attention-softmax-in-fp32 \
--weight-decay 0.01 \
--clip-grad 0.1 \
--tiktoken-pattern v2 \
--tokenizer-type TikTokenizer \
--tokenizer-model ${TOKENIZER_MODEL} \
--no-use-tokenizer-model-from-checkpoint-args \
--dist-ckpt-strictness log_unexpected \
--ckpt-format torch_dist \
--ckpt-fully-parallel-save \
--ckpt-fully-parallel-load \
--use-distributed-optimizer \
--overlap-grad-reduce \
--overlap-param-gather \
--no-create-attention-mask-in-dataloader \
--lr 1e-7 \
--lr-warmup-samples 0 \
--no-load-optim \
--decode-only-cuda-graphs \
--rl-inference-logprobs-is-correction \
--rl-importance-sampling-truncation-coef 5.0 \
"

# 1. remove importance sampling


# 2. removed any form of load balancing loss
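For context on the two `--rl-*` flags above, here is a rough sketch of what a truncated importance-sampling correction between inference-engine and training-time logprobs typically computes. This is my reading of the flag names, not the Megatron implementation, which may differ.

```python
import torch

# Rough sketch (assumed semantics, not the actual implementation): correct for
# the mismatch between inference logprobs and training logprobs by weighting
# each token with an importance ratio, truncated at truncation_coef.
def truncated_is_weights(train_logprobs: torch.Tensor,
                         inference_logprobs: torch.Tensor,
                         truncation_coef: float = 5.0) -> torch.Tensor:
    ratio = torch.exp(train_logprobs - inference_logprobs)
    return ratio.clamp(max=truncation_coef)
```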
128 changes: 128 additions & 0 deletions examples/rl/model_configs/nemotron6_3b_moe.sh
@@ -0,0 +1,128 @@
#!/bin/bash
TP=${TP:-2}
PP=${PP:-1}
EP=${EP:-32}
NODES_REQUIRED=${NODES_REQUIRED:-4}
LLM="nemotron6_3b_moe"

ROOT_DIR="/lustre/fsw/portfolios/llmservice/projects/llmservice_nlp_fm/nemotron6"

CHECKPOINT="${ROOT_DIR}/3b_hybrid_moe/checkpoints/phase2_lc_reinit_emb/"

TOKENIZER_MODEL="${ROOT_DIR}/tokenizers/multiMixV8.gpt4o_nc_sd.500000.128k.vocab.json"

echo "Using Nemotron6 3B MOE model checkpoint"
SCRIPT_PATH="${BASH_SOURCE[0]}"
source $(dirname $SCRIPT_PATH)/common.sh

# All of these values can be overridden from the environment.
# Running without overrides gives good out-of-the-box
# performance for the established env configs.
if [ "$(basename "$ENV_CONFIG")" = "dapo.yaml" ]; then
echo "Using DAPO environment config"
GRPO_CLAMP_EPS_LOWER=${GRPO_CLAMP_EPS_LOWER:-0.2}
GRPO_CLAMP_EPS_UPPER=${GRPO_CLAMP_EPS_UPPER:-0.28}
MAX_INFERENCE_BS=${MAX_INFERENCE_BS:-32}
GRPO_GROUP_SIZE=${GRPO_GROUP_SIZE:-16}
GRPO_PROMPTS_PER_STEP=${GRPO_PROMPTS_PER_STEP:-64}
GRPO_ITERATIONS=${GRPO_ITERATIONS:-1}
GRPO_KL_BETA=${GRPO_KL_BETA:-"0.0"}
TRAINING_BATCH_SIZE=${TRAINING_BATCH_SIZE:-1024}
MICRO_BATCH_SIZE=${MICRO_BATCH_SIZE:-1}
MAX_SEQ_LENGTH=${MAX_SEQ_LENGTH:-11999}
EXIT_INTERVAL=${EXIT_INTERVAL:-20}
CHKPT_SAVE_INTERVAL=${CHKPT_SAVE_INTERVAL:-20}
else
# Some default values if config is unsupported.
echo "Undected environment config, using default values"
GRPO_CLAMP_EPS_LOWER=${GRPO_CLAMP_EPS_LOWER:-0.2}
GRPO_CLAMP_EPS_UPPER=${GRPO_CLAMP_EPS_UPPER:-0.28}
MAX_INFERENCE_BS=${MAX_INFERENCE_BS:-64}
GRPO_GROUP_SIZE=${GRPO_GROUP_SIZE:-2}
GRPO_PROMPTS_PER_STEP=${GRPO_PROMPTS_PER_STEP:-16}
GRPO_ITERATIONS=${GRPO_ITERATIONS:-1}
GRPO_KL_BETA=${GRPO_KL_BETA:-"0.0"}
TRAINING_BATCH_SIZE=${TRAINING_BATCH_SIZE:-32}
MICRO_BATCH_SIZE=${MICRO_BATCH_SIZE:-1}
MAX_SEQ_LENGTH=${MAX_SEQ_LENGTH:-1024}
EXIT_INTERVAL=${EXIT_INTERVAL:-20}
CHKPT_SAVE_INTERVAL=${CHKPT_SAVE_INTERVAL:-20}
fi

ENV_DEPENDENT="\
--micro-batch-size $MICRO_BATCH_SIZE \
--global-batch-size $TRAINING_BATCH_SIZE \
--grpo-group-size $GRPO_GROUP_SIZE \
--grpo-prompts-per-step $GRPO_PROMPTS_PER_STEP \
--grpo-iterations $GRPO_ITERATIONS \
--grpo-clamp-eps-lower $GRPO_CLAMP_EPS_LOWER \
--grpo-clamp-eps-upper $GRPO_CLAMP_EPS_UPPER \
--grpo-kl-beta $GRPO_KL_BETA \
--langrl-env-config $ENV_CONFIG "

MODEL_OPTIONS="\
--rl-skip-bos-token \
--no-rl-use-sequence-packing \
--rl-partial-rollouts \
--rl-offload-optimizer-during-inference \
--moe-pad-experts-for-cuda-graph-inference \
--inference-dynamic-batching-max-tokens 8192 \
--inference-dynamic-batching-max-requests 128 \
--inference-dynamic-batching-num-cuda-graphs 2 \
--decode-only-cuda-graphs \
--cuda-graph-impl local \
--cuda-graph-scope full \
--use-checkpoint-args \
--enable-experimental \
--cross-entropy-loss-fusion \
--cross-entropy-fusion-impl native \
--moe-aux-loss-coeff 0.0 \
--moe-router-dtype fp64 \
--moe-router-load-balancing-type aux_loss \
--moe-router-score-function sigmoid \
--moe-token-dispatcher-type alltoall \
--moe-router-enable-expert-bias \
--moe-router-topk-scaling-factor 2.5 \
--disable-gloo-process-groups \
--grpo-default-top-k -1 \
--grpo-default-temperature 1.0 \
--grpo-default-top-p 1.0 \
--rl-inference-logprobs-is-correction \
--rl-importance-sampling-truncation-coef 10.0 \
--seq-length $MAX_SEQ_LENGTH \
--inference-max-seq-length $MAX_SEQ_LENGTH \
--inference-max-batch-size $MAX_INFERENCE_BS \
--pretrained-checkpoint $CHECKPOINT \
--distributed-timeout-minutes 60 \
--use-mcore-models \
--no-mmap-bin-files \
--disable-bias-linear \
--norm-epsilon 1e-5 \
--init-method-std 0.014 \
--exit-duration-in-mins 5750 \
--max-position-embeddings $MAX_SEQ_LENGTH \
--tensor-model-parallel-size $TP \
--pipeline-model-parallel-size $PP \
--expert-model-parallel-size $EP \
--expert-tensor-parallel-size 1 \
--weight-decay 0.01 \
--clip-grad 1.0 \
--tiktoken-pattern v2 \
--tokenizer-type TikTokenizer \
--tokenizer-model ${TOKENIZER_MODEL} \
--dist-ckpt-strictness log_unexpected \
--ckpt-format torch_dist \
--ckpt-fully-parallel-save \
--ckpt-fully-parallel-load \
--use-distributed-optimizer \
--overlap-grad-reduce \
--overlap-param-gather \
--no-create-attention-mask-in-dataloader \
--lr 3e-6 \
--min-lr 3e-6 \
--lr-decay-style constant \
--lr-warmup-samples 640 \
--lr-warmup-init 0.3e-7 \
--no-load-optim \
--no-load-rng \
"