1 change: 1 addition & 0 deletions examples/rl/environment_configs/gsm8k.yaml
@@ -1,5 +1,6 @@
- agent_type: examples.rl.environments.math.gsm8k_agent.GSM8KAgent
agent_args:
answer_format: "boxed"
format_reward: 0.5
weight: 1.0
evaluation_only: false
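For reference, the `agent_args` block is the set of keyword arguments handed to the agent constructor, so this config corresponds to something like the direct instantiation below. This is a minimal sketch assuming `GSM8KAgent` accepts the `MathAgent` kwargs; the exact plumbing lives in the RL harness.

```python
# Hypothetical illustration of how agent_args in gsm8k.yaml map onto the
# agent constructor. weight/evaluation_only are env-level keys, not kwargs.
from examples.rl.environments.math.gsm8k_agent import GSM8KAgent

agent = GSM8KAgent(
    answer_format="boxed",  # request \boxed{} answers instead of <answer> tags
    format_reward=0.5,      # reward for well-formatted but incorrect answers
)
```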
91 changes: 65 additions & 26 deletions examples/rl/environments/math/math_agent.py
@@ -22,7 +22,7 @@
), "math_verify is not installed but now required. Install it using `pip install math-verify` to continue."

NEGATIVE_REWARD = 0.0

PARTIAL_END_REWARD = 0.75

class MathAgent(RewardOnlyAgent):
def __init__(self, format_reward: float = 0.0, answer_format: str = "tagged", **kwargs):
@@ -37,32 +37,70 @@ def compute_score(self, response: str, golden: dict, golden_key: str = "answer")
Uses the final answer in the response string to compute the score.
"""
        # Allow <answer> tags or \boxed{} tags (arguably a bit of cheating in favor of DeepSeek-distilled models)
for pattern in [
r'<answer>(.*?)</answer>',
r"\\boxed\{((?:[^{}]|\{(?:[^{}]|\{[^{}]*\})*\})*)\}",
]:
match = re.finditer(pattern, response, re.DOTALL)
matches = list(match)
if matches:
final_answer = matches[-1].group(1).strip()
break
else:
# Did not format the answer correctly
return NEGATIVE_REWARD
matched_format = None
end_tokens = ["<|end_of_text|>", "<|endoftext|>", "</s>"]

try:
parsed_answer = parse(final_answer)
except ValueError as e:
print("Failed to parse the answer.")
traceback.print_stack()
return NEGATIVE_REWARD
# Only an answer immediately followed by a known end token yields 1.0 reward.
answer_tag_pattern = r'<answer>(.*?)</answer>'
answer_tag_match = list(re.finditer(answer_tag_pattern, response, re.DOTALL))
if answer_tag_match:
# Only consider the last occurrence
last_match = answer_tag_match[-1]
final_answer = last_match.group(1).strip()
after = response[last_match.end():].lstrip() # strip whitespace between </answer> and token

correct_answer = verify(str(golden[golden_key]), parsed_answer)
if correct_answer:
return 1.0
try:
parsed_answer = parse(final_answer)
except ValueError as e:
print("Failed to parse the answer.")
                traceback.print_exc()
return NEGATIVE_REWARD

correct_answer = verify(str(golden[golden_key]), parsed_answer)
if correct_answer:
                # Accept any of the known end tokens as a valid terminator, for flexibility.
for token in end_tokens:
if after.startswith(token):
return 1.0
# If the end token is present later (extra text before it), give partial credit.
for token in end_tokens:
if token in after:
return PARTIAL_END_REWARD
                # If the answer is correct but the immediate end token is missing, give the format reward (not NEGATIVE_REWARD).
return self.format_reward
else:
# Incorrect answer, regardless of format/end-of-text
return self.format_reward
else:
# Formatting is correct but the answer is incorrect
return self.format_reward
# Fallback: check boxed answer format for diagnostic/format reward as before
boxed_pattern = r"\\boxed\{((?:[^{}]|\{(?:[^{}]|\{[^{}]*\})*\})*)\}"
boxed_match = list(re.finditer(boxed_pattern, response, re.DOTALL))
if boxed_match:
last_match = boxed_match[-1]
final_answer = last_match.group(1).strip()
after = response[last_match.end():].lstrip()
try:
parsed_answer = parse(final_answer)
except ValueError as e:
print("Failed to parse the answer.")
                traceback.print_exc()
return NEGATIVE_REWARD

correct_answer = verify(str(golden[golden_key]), parsed_answer)
if correct_answer:
for token in end_tokens:
if after.startswith(token):
return 1.0
for token in end_tokens:
if token in after:
return PARTIAL_END_REWARD
return self.format_reward
else:
# Formatting is correct but the answer is incorrect
return self.format_reward
else:
# Did not format the answer correctly
return NEGATIVE_REWARD

def make_prefix(self, problem_key: str = "problem", **kwargs) -> str:
"""Take a string math problem and return the prompt. Supports requesting tagged or boxed answers. Supports chat mode prompts."""
@@ -80,6 +118,7 @@ def make_prefix(self, problem_key: str = "problem", **kwargs) -> str:
The question will be a word math problem. Show your work in <think> </think> tags.
{answer_format}
User: {kwargs[problem_key]}
Assistant: Let me solve this step by step.
<think>"""
Assistant: """
# Assistant: Let me solve this step by step.
# <think>"""
return prefix
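To summarize the reward schedule this diff introduces, here is a standalone sketch of the end-token tiers. Correctness checking is stubbed out as a boolean (the real code uses math_verify's `parse`/`verify`); `END_TOKENS` and `PARTIAL_END_REWARD` are taken from the diff above.

```python
# Minimal sketch of the new end-token reward tiers, with correctness assumed
# already known. Not the actual implementation; parsing is stubbed out.
END_TOKENS = ["<|end_of_text|>", "<|endoftext|>", "</s>"]
PARTIAL_END_REWARD = 0.75

def end_token_reward(after_answer: str, is_correct: bool, format_reward: float) -> float:
    if not is_correct:
        return format_reward       # wrong answer, regardless of termination
    after = after_answer.lstrip()
    if any(after.startswith(tok) for tok in END_TOKENS):
        return 1.0                 # correct and immediately terminated
    if any(tok in after for tok in END_TOKENS):
        return PARTIAL_END_REWARD  # correct, but extra text before the end token
    return format_reward           # correct, but no end token at all

assert end_token_reward("<|endoftext|>", True, 0.5) == 1.0
assert end_token_reward("Hope that helps! </s>", True, 0.5) == 0.75
assert end_token_reward("", True, 0.5) == 0.5
```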
115 changes: 115 additions & 0 deletions examples/rl/model_configs/dsv2_lite_moe.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
#!/bin/bash
Contributor:
It might be helpful to clarify that this is only architecturally identical to DSv2-lite. The model weights are different.

Contributor Author:
We're not actually pointing to any weights here. If someone did have the genuine dsv2_lite weights converted to mcore format, though, would we expect them to work correctly?

TP=${TP:-2}
PP=${PP:-1}
EP=${EP:-1}
NODES_REQUIRED=${NODES_REQUIRED:-1}
LLM="dsv2_lite"

echo "Using Deepseek-v2-lite model checkpoint (not the exact model weights..)"
SCRIPT_PATH="${BASH_SOURCE[0]}"
source $(dirname $SCRIPT_PATH)/common.sh

# All of these values can be overridden from the environment.
# Running without overrides gives good out-of-the-box
# performance for the established env configs.
if [ "$(basename "$ENV_CONFIG")" = "dapo.yaml" ]; then
echo "Using DAPO environment config"
GRPO_CLAMP_EPS_LOWER=${GRPO_CLAMP_EPS_LOWER:-0.2}
GRPO_CLAMP_EPS_UPPER=${GRPO_CLAMP_EPS_UPPER:-0.28}
MAX_INFERENCE_BS=${MAX_INFERENCE_BS:-32}
GRPO_GROUP_SIZE=${GRPO_GROUP_SIZE:-16}
GRPO_PROMPTS_PER_STEP=${GRPO_PROMPTS_PER_STEP:-64}
GRPO_ITERATIONS=${GRPO_ITERATIONS:-1}
GRPO_KL_BETA=${GRPO_KL_BETA:-"0.0"}
TRAINING_BATCH_SIZE=${TRAINING_BATCH_SIZE:-1024}
MICRO_BATCH_SIZE=${MICRO_BATCH_SIZE:-1}
MAX_SEQ_LENGTH=${MAX_SEQ_LENGTH:-11999}
EXIT_INTERVAL=${EXIT_INTERVAL:-16}
CHKPT_SAVE_INTERVAL=${CHKPT_SAVE_INTERVAL:-16}
else
# Some default values if config is unsupported.
echo "Undected environment config, using default values"
GRPO_CLAMP_EPS_LOWER=${GRPO_CLAMP_EPS_LOWER:-0.2}
GRPO_CLAMP_EPS_UPPER=${GRPO_CLAMP_EPS_UPPER:-0.4}
MAX_INFERENCE_BS=${MAX_INFERENCE_BS:-64}
GRPO_GROUP_SIZE=${GRPO_GROUP_SIZE:-8}
GRPO_PROMPTS_PER_STEP=${GRPO_PROMPTS_PER_STEP:-8}
GRPO_ITERATIONS=${GRPO_ITERATIONS:-1}
GRPO_KL_BETA=${GRPO_KL_BETA:-"0.0"}
TRAINING_BATCH_SIZE=${TRAINING_BATCH_SIZE:-64}
MICRO_BATCH_SIZE=${MICRO_BATCH_SIZE:-1}
MAX_SEQ_LENGTH=${MAX_SEQ_LENGTH:-8192}
EXIT_INTERVAL=${EXIT_INTERVAL:-20}
CHKPT_SAVE_INTERVAL=${CHKPT_SAVE_INTERVAL:-20}
fi

ENV_DEPENDENT="\
--micro-batch-size $MICRO_BATCH_SIZE \
--global-batch-size $TRAINING_BATCH_SIZE \
--grpo-group-size $GRPO_GROUP_SIZE \
--grpo-prompts-per-step $GRPO_PROMPTS_PER_STEP \
--grpo-iterations $GRPO_ITERATIONS \
--grpo-clamp-eps-lower $GRPO_CLAMP_EPS_LOWER \
--grpo-clamp-eps-upper $GRPO_CLAMP_EPS_UPPER \
--grpo-kl-beta $GRPO_KL_BETA \
--langrl-env-config $ENV_CONFIG "


MODEL_OPTIONS="\
--use-checkpoint-args \
--enable-experimental \
--cross-entropy-loss-fusion \
--cross-entropy-fusion-impl native \
--moe-aux-loss-coeff 0.0 \
--moe-router-dtype fp64 \
--moe-router-load-balancing-type none \
--moe-token-dispatcher-type alltoall \
--attention-backend flash \
--disable-gloo-process-groups \
--grpo-default-temperature 1.2 \
--grpo-default-top-p 0.95 \
--disable-chunked-prefill \
--calculate-per-token-loss \
--seq-length $MAX_SEQ_LENGTH \
--inference-max-seq-length $MAX_SEQ_LENGTH \
--inference-max-batch-size $MAX_INFERENCE_BS \
--pretrained-checkpoint $CHECKPOINT \
--distributed-timeout-minutes 60 \
--use-mcore-models \
--no-mmap-bin-files \
--disable-bias-linear \
--norm-epsilon 1e-5 \
--init-method-std 0.014 \
--exit-duration-in-mins 5750 \
--max-position-embeddings 8192 \
--tensor-model-parallel-size $TP \
--pipeline-model-parallel-size $PP \
--expert-model-parallel-size $EP \
--no-masked-softmax-fusion \
--attention-softmax-in-fp32 \
--weight-decay 0.01 \
--clip-grad 0.1 \
--tiktoken-pattern v2 \
--tokenizer-type TikTokenizer \
--tokenizer-model ${TOKENIZER_MODEL} \
--no-use-tokenizer-model-from-checkpoint-args \
--dist-ckpt-strictness log_unexpected \
--ckpt-format torch_dist \
--ckpt-fully-parallel-save \
--ckpt-fully-parallel-load \
--use-distributed-optimizer \
--overlap-grad-reduce \
--overlap-param-gather \
--no-create-attention-mask-in-dataloader \
--lr 1e-7 \
--lr-warmup-samples 0 \
--no-load-optim \
--decode-only-cuda-graphs \
--rl-inference-logprobs-is-correction \
--rl-importance-sampling-truncation-coef 5.0 \
"

# 1. remove importance sampling


# 2. removed any form of load balancing loss
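For context on the two `--rl-*` flags above, here is a rough sketch of what a truncated importance-sampling correction between inference-engine and training-time logprobs typically computes. This is my reading of the flag names, not the Megatron implementation, which may differ.

```python
import torch

# Rough sketch (assumed semantics, not the actual implementation): correct for
# the mismatch between inference logprobs and training logprobs by weighting
# each token with an importance ratio, truncated at truncation_coef.
def truncated_is_weights(train_logprobs: torch.Tensor,
                         inference_logprobs: torch.Tensor,
                         truncation_coef: float = 5.0) -> torch.Tensor:
    ratio = torch.exp(train_logprobs - inference_logprobs)
    return ratio.clamp(max=truncation_coef)
```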
128 changes: 128 additions & 0 deletions examples/rl/model_configs/nemotron6_3b_moe.sh
@@ -0,0 +1,128 @@
#!/bin/bash
TP=${TP:-2}
PP=${PP:-1}
EP=${EP:-32}
NODES_REQUIRED=${NODES_REQUIRED:-4}
LLM="nemotron6_3b_moe"

ROOT_DIR="/lustre/fsw/portfolios/llmservice/projects/llmservice_nlp_fm/nemotron6"

CHECKPOINT="${ROOT_DIR}/3b_hybrid_moe/checkpoints/phase2_lc_reinit_emb/"

TOKENIZER_MODEL="${ROOT_DIR}/tokenizers/multiMixV8.gpt4o_nc_sd.500000.128k.vocab.json"

echo "Using Nemotron6 3B MOE model checkpoint"
SCRIPT_PATH="${BASH_SOURCE[0]}"
source $(dirname $SCRIPT_PATH)/common.sh

# All of these values can be overridden from the environment.
# Running without overrides gives good out-of-the-box
# performance for the established env configs.
if [ "$(basename "$ENV_CONFIG")" = "dapo.yaml" ]; then
echo "Using DAPO environment config"
GRPO_CLAMP_EPS_LOWER=${GRPO_CLAMP_EPS_LOWER:-0.2}
GRPO_CLAMP_EPS_UPPER=${GRPO_CLAMP_EPS_UPPER:-0.28}
MAX_INFERENCE_BS=${MAX_INFERENCE_BS:-32}
GRPO_GROUP_SIZE=${GRPO_GROUP_SIZE:-16}
GRPO_PROMPTS_PER_STEP=${GRPO_PROMPTS_PER_STEP:-64}
GRPO_ITERATIONS=${GRPO_ITERATIONS:-1}
GRPO_KL_BETA=${GRPO_KL_BETA:-"0.0"}
TRAINING_BATCH_SIZE=${TRAINING_BATCH_SIZE:-1024}
MICRO_BATCH_SIZE=${MICRO_BATCH_SIZE:-1}
MAX_SEQ_LENGTH=${MAX_SEQ_LENGTH:-11999}
EXIT_INTERVAL=${EXIT_INTERVAL:-20}
CHKPT_SAVE_INTERVAL=${CHKPT_SAVE_INTERVAL:-20}
else
# Some default values if config is unsupported.
echo "Undected environment config, using default values"
GRPO_CLAMP_EPS_LOWER=${GRPO_CLAMP_EPS_LOWER:-0.2}
GRPO_CLAMP_EPS_UPPER=${GRPO_CLAMP_EPS_UPPER:-0.28}
MAX_INFERENCE_BS=${MAX_INFERENCE_BS:-64}
GRPO_GROUP_SIZE=${GRPO_GROUP_SIZE:-2}
GRPO_PROMPTS_PER_STEP=${GRPO_PROMPTS_PER_STEP:-16}
GRPO_ITERATIONS=${GRPO_ITERATIONS:-1}
GRPO_KL_BETA=${GRPO_KL_BETA:-"0.0"}
TRAINING_BATCH_SIZE=${TRAINING_BATCH_SIZE:-32}
MICRO_BATCH_SIZE=${MICRO_BATCH_SIZE:-1}
MAX_SEQ_LENGTH=${MAX_SEQ_LENGTH:-1024}
EXIT_INTERVAL=${EXIT_INTERVAL:-20}
CHKPT_SAVE_INTERVAL=${CHKPT_SAVE_INTERVAL:-20}
fi

ENV_DEPENDENT="\
--micro-batch-size $MICRO_BATCH_SIZE \
--global-batch-size $TRAINING_BATCH_SIZE \
--grpo-group-size $GRPO_GROUP_SIZE \
--grpo-prompts-per-step $GRPO_PROMPTS_PER_STEP \
--grpo-iterations $GRPO_ITERATIONS \
--grpo-clamp-eps-lower $GRPO_CLAMP_EPS_LOWER \
--grpo-clamp-eps-upper $GRPO_CLAMP_EPS_UPPER \
--grpo-kl-beta $GRPO_KL_BETA \
--langrl-env-config $ENV_CONFIG "

MODEL_OPTIONS="\
--rl-skip-bos-token \
--no-rl-use-sequence-packing \
--rl-partial-rollouts \
--rl-offload-optimizer-during-inference \
--moe-pad-experts-for-cuda-graph-inference \
--inference-dynamic-batching-max-tokens 8192 \
--inference-dynamic-batching-max-requests 128 \
--inference-dynamic-batching-num-cuda-graphs 2 \
--decode-only-cuda-graphs \
--cuda-graph-impl local \
--cuda-graph-scope full \
--use-checkpoint-args \
--enable-experimental \
--cross-entropy-loss-fusion \
--cross-entropy-fusion-impl native \
--moe-aux-loss-coeff 0.0 \
--moe-router-dtype fp64 \
--moe-router-load-balancing-type aux_loss \
--moe-router-score-function sigmoid \
--moe-token-dispatcher-type alltoall \
--moe-router-enable-expert-bias \
--moe-router-topk-scaling-factor 2.5 \
--disable-gloo-process-groups \
--grpo-default-top-k -1 \
--grpo-default-temperature 1.0 \
--grpo-default-top-p 1.0 \
--rl-inference-logprobs-is-correction \
--rl-importance-sampling-truncation-coef 10.0 \
--seq-length $MAX_SEQ_LENGTH \
--inference-max-seq-length $MAX_SEQ_LENGTH \
--inference-max-batch-size $MAX_INFERENCE_BS \
--pretrained-checkpoint $CHECKPOINT \
--distributed-timeout-minutes 60 \
--use-mcore-models \
--no-mmap-bin-files \
--disable-bias-linear \
--norm-epsilon 1e-5 \
--init-method-std 0.014 \
--exit-duration-in-mins 5750 \
--max-position-embeddings $MAX_SEQ_LENGTH \
--tensor-model-parallel-size $TP \
--pipeline-model-parallel-size $PP \
--expert-model-parallel-size $EP \
--expert-tensor-parallel-size 1 \
--weight-decay 0.01 \
--clip-grad 1.0 \
--tiktoken-pattern v2 \
--tokenizer-type TikTokenizer \
--tokenizer-model ${TOKENIZER_MODEL} \
--dist-ckpt-strictness log_unexpected \
--ckpt-format torch_dist \
--ckpt-fully-parallel-save \
--ckpt-fully-parallel-load \
--use-distributed-optimizer \
--overlap-grad-reduce \
--overlap-param-gather \
--no-create-attention-mask-in-dataloader \
--lr 3e-6 \
--min-lr 3e-6 \
--lr-decay-style constant \
--lr-warmup-samples 640 \
--lr-warmup-init 0.3e-7 \
--no-load-optim \
--no-load-rng \
"