Compare commits

...

51 commits

Author SHA1 Message Date
lewtun
a401d64a70
Merge branch 'main' into r1-zero 2025-05-25 12:04:07 +02:00
edbeeching
0ed9ea370b Merge branch 'main' into r1-zero 2025-05-10 08:01:02 +00:00
edbeeching
b43069370d Merge branch 'main' into r1-zero 2025-05-09 20:09:15 +00:00
Lewis Tunstall
464d9511f2 Merge branch 'main' into r1-zero 2025-05-08 16:16:29 +02:00
edbeeching
d9a6c086f1 add latest configs 2025-05-07 08:00:23 +00:00
edbeeching
a82c1fd00a adds weighted code reward 2025-05-07 07:59:24 +00:00
edbeeching
8d993d54e9 add gen batch exp config 2025-05-05 08:12:23 +00:00
Lewis Tunstall
2c0cac5da2 Merge branch 'main' into r1-zero 2025-04-26 12:03:15 +02:00
Lewis Tunstall
46c16569a1 Pin transformers 2025-04-26 06:39:02 +00:00
Lewis Tunstall
5f0b8f80c5 Remove hf-transfer in favour of hf-xet 2025-04-24 06:56:30 +00:00
Lewis Tunstall
f27c7327e0 Fix 2025-04-23 18:12:25 +00:00
Lewis Tunstall
2f4b0daba9 Revert slurm 2025-04-23 18:06:51 +00:00
Lewis Tunstall
cebaad59f8 Wait 2025-04-23 09:34:13 +00:00
Lewis Tunstall
2715d3174a Hack 2025-04-23 08:51:49 +00:00
Lewis Tunstall
c24ffd78db Fix attempt on Slurm 2025-04-23 08:09:05 +00:00
Lewis Tunstall
0df1654ef9 Tune recipe 2025-04-23 07:20:33 +00:00
Lewis Tunstall
be72ce6513 Fix sharding in Slurm 2025-04-23 07:20:27 +00:00
Lewis Tunstall
2f0b983cc9 Add 32B recipe 2025-04-22 09:42:33 +00:00
Lewis Tunstall
06bdd50334 Merge branch 'main' into r1-zero 2025-04-17 19:56:35 +02:00
Lewis Tunstall
f3920f8890 Pin TRL 2025-04-17 11:41:37 +00:00
lewtun
a5f3baafb6
Merge branch 'main' into r1-zero 2025-04-17 11:12:01 +02:00
Lewis Tunstall
b6a73c07f3 Merge branch 'main' into r1-zero 2025-04-16 14:44:25 +02:00
Lewis Tunstall
3c312f883c Add hack for lighteval 2025-04-14 07:00:35 +00:00
Lewis Tunstall
8500f41c69 Parse GAS 2025-04-12 18:16:14 +00:00
Lewis Tunstall
c1d2352772 Add q3 2025-04-11 18:39:51 +00:00
Lewis Tunstall
2d74588a9a Merge branch 'main' into r1-zero 2025-04-11 20:18:42 +02:00
Lewis Tunstall
7a8dead7ad Fix 2025-04-11 14:30:09 +00:00
Lewis Tunstall
b29e672c18 Add level configs and DAPO 2025-04-10 15:21:31 +00:00
Lewis Tunstall
9bed487d67 Add v01 2025-04-09 05:46:57 +00:00
Lewis Tunstall
939c74c446 Fix liger 2025-04-09 05:12:09 +00:00
Lewis Tunstall
f62e42af5c Pin TRL for overlong masking 2025-04-08 19:03:43 +00:00
lewtun
23b7b69f79
Merge branch 'main' into r1-zero 2025-04-08 20:53:51 +02:00
Lewis Tunstall
0f98a5ac74 Fix soft reward to be really soft 2025-04-07 12:58:25 +00:00
Lewis Tunstall
10a555b4bb Add soft format reward 2025-04-07 12:11:36 +00:00
Lewis Tunstall
1d7d66a095 Merge branch 'main' into r1-zero 2025-04-04 13:44:29 +02:00
Lewis Tunstall
995beb88fc Clean up 2025-04-04 07:12:14 +00:00
Lewis Tunstall
f1832c5cb6 Pin TRL 2025-04-03 07:37:13 +00:00
Lewis Tunstall
d51de45bb2 Use proper dataset 2025-04-02 13:11:10 +00:00
Lewis Tunstall
2897519b8e Revert config 2025-04-01 18:59:35 +00:00
Lewis Tunstall
82a1167186 Log unique only 2025-04-01 18:59:35 +00:00
Lewis Tunstall
f22657b7ed Set defaults 2025-04-01 18:59:35 +00:00
Lewis Tunstall
5fe41f0839 Pin trl 2025-04-01 18:59:35 +00:00
lewtun
8f26046af4
Merge branch 'main' into r1-zero 2025-04-01 12:06:33 +02:00
Lewis Tunstall
d9c8cd8557 Use None for unferified 2025-04-01 08:37:54 +00:00
Lewis Tunstall
1078b73d51 Fix order of inputs 2025-04-01 07:39:58 +00:00
Lewis Tunstall
5747cfc095 Return None for invalid samples 2025-03-31 17:59:55 +00:00
Lewis Tunstall
1d6c0bbc4b Fix accuracy rewards 2025-03-31 15:31:04 +00:00
Lewis Tunstall
b35213cd64 Add medium, hard, ultra hard recipes 2025-03-31 15:30:44 +00:00
Lewis Tunstall
9e0e47806c Add new difficulty levels 2025-03-29 19:18:22 +00:00
Lewis Tunstall
8a4af61829 Fix chat template 2025-03-29 16:33:34 +00:00
Lewis Tunstall
b5e6f9c384 Add R1 Zero 7B 2025-03-29 12:05:54 +00:00
15 changed files with 945 additions and 23 deletions

View file

@ -0,0 +1,68 @@
# Config for 4 + 1 nodes
# Model arguments
model_name_or_path: Qwen/Qwen2.5-32B
model_revision: main
torch_dtype: bfloat16
attn_implementation: flash_attention_2
# Data training arguments
chat_template: "{%- if messages[0]['role'] == 'system' %}\n{{- messages[0]['content'] }}\n{%- else %}\n{{- 'A conversation between User and Assistant. The user asks a question, and the Assistant solves it. The assistant first thinks about the reasoning process in the mind and then provides the user with the answer. The reasoning process and answer are enclosed within <think>...</think> and <answer>...</answer> tags, respectively, i.e., \\n<think>\\nreasoning process here\\n</think>\\n<answer>\\nanswer here\\n</answer>.' }}\n{%- endif %}\n{%- for message in messages %}\n {%- if message['role'] == 'user' %}\n {{- '\\n\\nUser: ' + message['content'].strip() }}\n {%- elif message['role'] == 'system' %}\n {{- message['content'] }}\n {%- elif message['role'] == 'assistant' %}\n {{- '\\n\\nAssistant: ' + message['content'] }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '\\n\\nAssistant: ' }}\n{%- endif %}"
dataset_name: open-r1/DAPO-Math-17k-Processed
dataset_config: all
# GRPO trainer config
callbacks:
- push_to_hub_revision
benchmarks:
- math_500
- aime24
beta: 0.0
bf16: true
do_eval: false
eval_strategy: "no"
use_vllm: true
gradient_accumulation_steps: 16
gradient_checkpointing: true
# gradient_checkpointing_kwargs:
# use_reentrant: false
hub_model_id: open-r1/R1-Zero-Qwen-32B-Math
hub_model_revision: v00.00
hub_strategy: every_save
learning_rate: 1.0e-06
log_completions: true
log_level: info
logging_first_step: true
logging_steps: 1
logging_strategy: steps
lr_scheduler_type: constant_with_warmup
mask_truncated_completions: true
max_grad_norm: 0.2
max_prompt_length: 1024
max_completion_length: 8192
max_steps: -1
num_generations: 16
num_iterations: 1
num_train_epochs: 1
output_dir: data/R1-Zero-Qwen-32B-Math-v00.00
overwrite_output_dir: true
per_device_train_batch_size: 1
push_to_hub: true
report_to:
- wandb
reward_funcs:
- accuracy
- format
- soft_format
reward_weights:
- 1.0
- 0.25
- 0.25
save_strategy: "steps"
save_steps: 0.1
save_total_limit: 1
sync_ref_model: false
seed: 42
temperature: 1.0
vllm_server_timeout: 1200
warmup_ratio: 0.1

View file

@ -0,0 +1,70 @@
# Config for 1 + 1 nodes
# Model arguments
model_name_or_path: Qwen/Qwen2.5-7B
model_revision: main
torch_dtype: bfloat16
attn_implementation: flash_attention_2
# Data training arguments
chat_template: "{%- if messages[0]['role'] == 'system' %}\n{{- messages[0]['content'] }}\n{%- else %}\n{{- 'A conversation between User and Assistant. The user asks a question, and the Assistant solves it. The assistant first thinks about the reasoning process in the mind and then provides the user with the answer. The reasoning process and answer are enclosed within <think>...</think> and <answer>...</answer> tags, respectively, i.e., \\n<think>\\nreasoning process here\\n</think>\\n<answer>\\nanswer here\\n</answer>.' }}\n{%- endif %}\n{%- for message in messages %}\n {%- if message['role'] == 'user' %}\n {{- '\\n\\nUser: ' + message['content'].strip() }}\n {%- elif message['role'] == 'system' %}\n {{- message['content'] }}\n {%- elif message['role'] == 'assistant' %}\n {{- '\\n\\nAssistant: ' + message['content'] }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '\\n\\nAssistant: ' }}\n{%- endif %}"
dataset_name: open-r1/verifiable-coding-problems-python_decontaminated-tested-shuffled
dataset_prompt_column: problem
# GRPO trainer config
callbacks:
- push_to_hub_revision
benchmarks:
- math_500
- aime24
beta: 0.0
bf16: true
do_eval: false
eval_strategy: "no"
use_vllm: true
gradient_accumulation_steps: 16
generation_batch_size: 512
gradient_checkpointing: true
gradient_checkpointing_kwargs:
  use_reentrant: false
hub_model_id: open-r1/R1-Zero-Qwen-7B-Code
hub_model_revision: v00.00
hub_strategy: every_save
learning_rate: 1.0e-06
log_completions: true
log_level: info
logging_first_step: true
logging_steps: 1
logging_strategy: steps
lr_scheduler_type: constant_with_warmup
mask_truncated_completions: true
max_grad_norm: 0.2
max_prompt_length: 1024
max_completion_length: 8192
max_steps: -1
num_generations: 16
num_iterations: 1
num_train_epochs: 1
output_dir: data/R1-Zero-Qwen-7B-Code-v00.00
overwrite_output_dir: true
per_device_train_batch_size: 4
push_to_hub: true
report_to:
- wandb
reward_funcs:
- weighted_binary_code_reward
- format
- soft_format
reward_weights:
- 1.0
- 0.25
- 0.25
e2b_router_url: ip-10-53-83-71:8000
save_strategy: "steps"
save_steps: 0.1
save_total_limit: 1
sync_ref_model: false
seed: 42
temperature: 1.0
warmup_ratio: 0.0
epsilon: 0.2

View file

@ -0,0 +1,64 @@
# Config for 1 + 1 nodes
# Model arguments
model_name_or_path: Qwen/Qwen2.5-7B
model_revision: main
torch_dtype: bfloat16
attn_implementation: flash_attention_2
# Data training arguments
chat_template: "{%- if messages[0]['role'] == 'system' %}\n{{- messages[0]['content'] }}\n{%- else %}\n{{- 'A conversation between User and Assistant. The user asks a question, and the Assistant solves it. The assistant first thinks about the reasoning process in the mind and then provides the user with the answer. The reasoning process and answer are enclosed within <think>...</think> and <answer>...</answer> tags, respectively, i.e., \\n<think>\\nreasoning process here\\n</think>\\n<answer>\\nanswer here\\n</answer>.' }}\n{%- endif %}\n{%- for message in messages %}\n {%- if message['role'] == 'user' %}\n {{- '\\n\\nUser: ' + message['content'].strip() }}\n {%- elif message['role'] == 'system' %}\n {{- message['content'] }}\n {%- elif message['role'] == 'assistant' %}\n {{- '\\n\\nAssistant: ' + message['content'] }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '\\n\\nAssistant: ' }}\n{%- endif %}"
dataset_name: open-r1/Big-Math-RL-Verified-Processed
dataset_config: all
# GRPO trainer config
callbacks:
- push_to_hub_revision
benchmarks:
- math_500
beta: 0.001
bf16: true
do_eval: false
eval_strategy: "no"
use_vllm: true
gradient_accumulation_steps: 16
gradient_checkpointing: true
gradient_checkpointing_kwargs:
  use_reentrant: false
hub_model_id: open-r1/R1-Zero-Qwen-7B-Math
hub_model_revision: v00.00
hub_strategy: every_save
learning_rate: 1.0e-06
log_completions: true
log_level: info
logging_first_step: true
logging_steps: 1
logging_strategy: steps
lr_scheduler_type: constant_with_warmup
max_grad_norm: 0.2
max_prompt_length: 1024
max_completion_length: 8192
max_steps: -1
num_generations: 16
num_train_epochs: 0.1 # 21.6k prompts
output_dir: data/R1-Zero-Qwen-7B-Math-v00.00
overwrite_output_dir: true
per_device_train_batch_size: 4
push_to_hub: true
report_to:
- wandb
reward_funcs:
- accuracy
- format
reward_weights:
- 1.0
- 0.2
save_strategy: "steps"
save_steps: 0.1
save_total_limit: 1
sync_ref_model: true
ref_model_sync_steps: 100
ref_model_mixup_alpha: 1.0
seed: 42
temperature: 1.0
warmup_ratio: 0.1

View file

@ -0,0 +1,66 @@
# Config for 1 + 1 nodes
# Model arguments
model_name_or_path: Qwen/Qwen2.5-7B
model_revision: main
torch_dtype: bfloat16
attn_implementation: flash_attention_2
# Data training arguments
chat_template: "{%- if messages[0]['role'] == 'system' %}\n{{- messages[0]['content'] }}\n{%- else %}\n{{- 'A conversation between User and Assistant. The user asks a question, and the Assistant solves it. The assistant first thinks about the reasoning process in the mind and then provides the user with the answer. The reasoning process and answer are enclosed within <think>...</think> and <answer>...</answer> tags, respectively, i.e., \\n<think>\\nreasoning process here\\n</think>\\n<answer>\\nanswer here\\n</answer>.' }}\n{%- endif %}\n{%- for message in messages %}\n {%- if message['role'] == 'user' %}\n {{- '\\n\\nUser: ' + message['content'].strip() }}\n {%- elif message['role'] == 'system' %}\n {{- message['content'] }}\n {%- elif message['role'] == 'assistant' %}\n {{- '\\n\\nAssistant: ' + message['content'] }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '\\n\\nAssistant: ' }}\n{%- endif %}"
dataset_name: open-r1/Big-Math-RL-Verified-Processed
dataset_config: all
# GRPO trainer config
callbacks:
- push_to_hub_revision
benchmarks:
- math_500
- aime24
beta: 0.0
bf16: true
do_eval: false
eval_strategy: "no"
use_vllm: true
gradient_accumulation_steps: 16
gradient_checkpointing: true
gradient_checkpointing_kwargs:
  use_reentrant: false
hub_model_id: open-r1/R1-Zero-Qwen-7B-Math
hub_model_revision: v01.00
hub_strategy: every_save
learning_rate: 1.0e-06
log_completions: true
log_level: info
logging_first_step: true
logging_steps: 1
logging_strategy: steps
lr_scheduler_type: constant_with_warmup
mask_truncated_completions: true
max_grad_norm: 0.2
max_prompt_length: 1024
max_completion_length: 8192
max_steps: -1
num_generations: 16
num_train_epochs: 0.1 # 21.6k prompts
output_dir: data/R1-Zero-Qwen-7B-Math-v01.00
overwrite_output_dir: true
per_device_train_batch_size: 4
push_to_hub: true
report_to:
- wandb
reward_funcs:
- accuracy
- format
- soft_format
reward_weights:
- 1.0
- 0.25
- 0.25
save_strategy: "steps"
save_steps: 0.1
save_total_limit: 1
sync_ref_model: false
seed: 42
temperature: 1.0
warmup_ratio: 0.1

View file

@ -0,0 +1,67 @@
# Config for 1 + 1 nodes
# Model arguments
model_name_or_path: Qwen/Qwen2.5-7B
model_revision: main
torch_dtype: bfloat16
attn_implementation: flash_attention_2
# Data training arguments
chat_template: "{%- if messages[0]['role'] == 'system' %}\n{{- messages[0]['content'] }}\n{%- else %}\n{{- 'A conversation between User and Assistant. The user asks a question, and the Assistant solves it. The assistant first thinks about the reasoning process in the mind and then provides the user with the answer. The reasoning process and answer are enclosed within <think>...</think> and <answer>...</answer> tags, respectively, i.e., \\n<think>\\nreasoning process here\\n</think>\\n<answer>\\nanswer here\\n</answer>.' }}\n{%- endif %}\n{%- for message in messages %}\n {%- if message['role'] == 'user' %}\n {{- '\\n\\nUser: ' + message['content'].strip() }}\n {%- elif message['role'] == 'system' %}\n {{- message['content'] }}\n {%- elif message['role'] == 'assistant' %}\n {{- '\\n\\nAssistant: ' + message['content'] }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '\\n\\nAssistant: ' }}\n{%- endif %}"
dataset_name: open-r1/Big-Math-RL-Verified-Processed
dataset_config: level_2_3_4_5
# GRPO trainer config
callbacks:
- push_to_hub_revision
benchmarks:
- math_500
- aime24
beta: 0.0
bf16: true
do_eval: false
eval_strategy: "no"
use_vllm: true
gradient_accumulation_steps: 16
gradient_checkpointing: true
gradient_checkpointing_kwargs:
  use_reentrant: false
hub_model_id: open-r1/R1-Zero-Qwen-7B-Math
hub_model_revision: v02.00
hub_strategy: every_save
learning_rate: 1.0e-06
log_completions: true
log_level: info
logging_first_step: true
logging_steps: 1
logging_strategy: steps
lr_scheduler_type: constant_with_warmup
mask_truncated_completions: true
max_grad_norm: 0.2
max_prompt_length: 1024
max_completion_length: 8192
max_steps: -1
num_generations: 16
num_iterations: 1
num_train_epochs: 0.12 # 19.9k prompts
output_dir: data/R1-Zero-Qwen-7B-Math-v02.00
overwrite_output_dir: true
per_device_train_batch_size: 4
push_to_hub: true
report_to:
- wandb
reward_funcs:
- accuracy
- format
- soft_format
reward_weights:
- 1.0
- 0.25
- 0.25
save_strategy: "steps"
save_steps: 0.1
save_total_limit: 1
sync_ref_model: false
seed: 42
temperature: 1.0
warmup_ratio: 0.1

View file

@ -0,0 +1,67 @@
# Config for 1 + 1 nodes
# Model arguments
model_name_or_path: Qwen/Qwen2.5-7B
model_revision: main
torch_dtype: bfloat16
attn_implementation: flash_attention_2
# Data training arguments
chat_template: "{%- if messages[0]['role'] == 'system' %}\n{{- messages[0]['content'] }}\n{%- else %}\n{{- 'A conversation between User and Assistant. The user asks a question, and the Assistant solves it. The assistant first thinks about the reasoning process in the mind and then provides the user with the answer. The reasoning process and answer are enclosed within <think>...</think> and <answer>...</answer> tags, respectively, i.e., \\n<think>\\nreasoning process here\\n</think>\\n<answer>\\nanswer here\\n</answer>.' }}\n{%- endif %}\n{%- for message in messages %}\n {%- if message['role'] == 'user' %}\n {{- '\\n\\nUser: ' + message['content'].strip() }}\n {%- elif message['role'] == 'system' %}\n {{- message['content'] }}\n {%- elif message['role'] == 'assistant' %}\n {{- '\\n\\nAssistant: ' + message['content'] }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '\\n\\nAssistant: ' }}\n{%- endif %}"
dataset_name: open-r1/Big-Math-RL-Verified-Processed
dataset_config: level_3_4_5
# GRPO trainer config
callbacks:
- push_to_hub_revision
benchmarks:
- math_500
- aime24
beta: 0.0
bf16: true
do_eval: false
eval_strategy: "no"
use_vllm: true
gradient_accumulation_steps: 16
gradient_checkpointing: true
gradient_checkpointing_kwargs:
  use_reentrant: false
hub_model_id: open-r1/R1-Zero-Qwen-7B-Math
hub_model_revision: v03.00
hub_strategy: every_save
learning_rate: 1.0e-06
log_completions: true
log_level: info
logging_first_step: true
logging_steps: 1
logging_strategy: steps
lr_scheduler_type: constant_with_warmup
mask_truncated_completions: true
max_grad_norm: 0.2
max_prompt_length: 1024
max_completion_length: 8192
max_steps: -1
num_generations: 16
num_iterations: 1
num_train_epochs: 0.16 # 19.5k prompts
output_dir: data/R1-Zero-Qwen-7B-Math-v03.00
overwrite_output_dir: true
per_device_train_batch_size: 4
push_to_hub: true
report_to:
- wandb
reward_funcs:
- accuracy
- format
- soft_format
reward_weights:
- 1.0
- 0.25
- 0.25
save_strategy: "steps"
save_steps: 0.1
save_total_limit: 1
sync_ref_model: false
seed: 42
temperature: 1.0
warmup_ratio: 0.1

View file

@ -0,0 +1,67 @@
# Config for 1 + 1 nodes
# Model arguments
model_name_or_path: Qwen/Qwen2.5-7B
model_revision: main
torch_dtype: bfloat16
attn_implementation: flash_attention_2
# Data training arguments
chat_template: "{%- if messages[0]['role'] == 'system' %}\n{{- messages[0]['content'] }}\n{%- else %}\n{{- 'A conversation between User and Assistant. The user asks a question, and the Assistant solves it. The assistant first thinks about the reasoning process in the mind and then provides the user with the answer. The reasoning process and answer are enclosed within <think>...</think> and <answer>...</answer> tags, respectively, i.e., \\n<think>\\nreasoning process here\\n</think>\\n<answer>\\nanswer here\\n</answer>.' }}\n{%- endif %}\n{%- for message in messages %}\n {%- if message['role'] == 'user' %}\n {{- '\\n\\nUser: ' + message['content'].strip() }}\n {%- elif message['role'] == 'system' %}\n {{- message['content'] }}\n {%- elif message['role'] == 'assistant' %}\n {{- '\\n\\nAssistant: ' + message['content'] }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '\\n\\nAssistant: ' }}\n{%- endif %}"
dataset_name: open-r1/Big-Math-RL-Verified-Processed
dataset_config: level_4_5
# GRPO trainer config
callbacks:
- push_to_hub_revision
benchmarks:
- math_500
- aime24
beta: 0.0
bf16: true
do_eval: false
eval_strategy: "no"
use_vllm: true
gradient_accumulation_steps: 16
gradient_checkpointing: true
gradient_checkpointing_kwargs:
  use_reentrant: false
hub_model_id: open-r1/R1-Zero-Qwen-7B-Math
hub_model_revision: v04.00
hub_strategy: every_save
learning_rate: 1.0e-06
log_completions: true
log_level: info
logging_first_step: true
logging_steps: 1
logging_strategy: steps
lr_scheduler_type: constant_with_warmup
mask_truncated_completions: true
max_grad_norm: 0.2
max_prompt_length: 1024
max_completion_length: 8192
max_steps: -1
num_generations: 16
num_iterations: 1
num_train_epochs: 0.25 # 19.8k prompts
output_dir: data/R1-Zero-Qwen-7B-Math-v04.00
overwrite_output_dir: true
per_device_train_batch_size: 4
push_to_hub: true
report_to:
- wandb
reward_funcs:
- accuracy
- format
- soft_format
reward_weights:
- 1.0
- 0.25
- 0.25
save_strategy: "steps"
save_steps: 0.1
save_total_limit: 1
sync_ref_model: false
seed: 42
temperature: 1.0
warmup_ratio: 0.1

View file

@ -0,0 +1,66 @@
# Config for 1 + 1 nodes
# Model arguments
model_name_or_path: open-r1/R1-Zero-Qwen-7B-Math
model_revision: v04.00-step-000000310
torch_dtype: bfloat16
attn_implementation: flash_attention_2
# Data training arguments
dataset_name: open-r1/Big-Math-RL-Verified-Processed
dataset_config: level_5
# GRPO trainer config
callbacks:
- push_to_hub_revision
benchmarks:
- math_500
- aime24
beta: 0.0
bf16: true
do_eval: false
eval_strategy: "no"
use_vllm: true
gradient_accumulation_steps: 16
gradient_checkpointing: true
gradient_checkpointing_kwargs:
  use_reentrant: false
hub_model_id: open-r1/R1-Zero-Qwen-7B-Math
hub_model_revision: v04.10
hub_strategy: every_save
learning_rate: 1.0e-06
log_completions: true
log_level: info
logging_first_step: true
logging_steps: 1
logging_strategy: steps
lr_scheduler_type: constant_with_warmup
mask_truncated_completions: true
max_grad_norm: 0.2
max_prompt_length: 1024
max_completion_length: 8192
max_steps: -1
num_generations: 16
num_iterations: 1
num_train_epochs: 0.53 # 19.9k prompts
output_dir: data/R1-Zero-Qwen-7B-Math-v04.10
overwrite_output_dir: true
per_device_train_batch_size: 4
push_to_hub: true
report_to:
- wandb
reward_funcs:
- accuracy
- format
- soft_format
reward_weights:
- 1.0
- 0.25
- 0.25
save_strategy: "steps"
save_steps: 0.1
save_total_limit: 1
sync_ref_model: false
seed: 42
temperature: 1.0
warmup_ratio: 0.1

View file

@ -0,0 +1,67 @@
# Config for 1 + 1 nodes
# Model arguments
model_name_or_path: Qwen/Qwen2.5-7B
model_revision: main
torch_dtype: bfloat16
attn_implementation: flash_attention_2
# Data training arguments
chat_template: "{%- if messages[0]['role'] == 'system' %}\n{{- messages[0]['content'] }}\n{%- else %}\n{{- 'A conversation between User and Assistant. The user asks a question, and the Assistant solves it. The assistant first thinks about the reasoning process in the mind and then provides the user with the answer. The reasoning process and answer are enclosed within <think>...</think> and <answer>...</answer> tags, respectively, i.e., \\n<think>\\nreasoning process here\\n</think>\\n<answer>\\nanswer here\\n</answer>.' }}\n{%- endif %}\n{%- for message in messages %}\n {%- if message['role'] == 'user' %}\n {{- '\\n\\nUser: ' + message['content'].strip() }}\n {%- elif message['role'] == 'system' %}\n {{- message['content'] }}\n {%- elif message['role'] == 'assistant' %}\n {{- '\\n\\nAssistant: ' + message['content'] }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '\\n\\nAssistant: ' }}\n{%- endif %}"
dataset_name: open-r1/DAPO-Math-17k-Processed
dataset_config: all
# GRPO trainer config
callbacks:
- push_to_hub_revision
benchmarks:
- math_500
- aime24
beta: 0.0
bf16: true
do_eval: false
eval_strategy: "no"
use_vllm: true
gradient_accumulation_steps: 16
gradient_checkpointing: true
gradient_checkpointing_kwargs:
  use_reentrant: false
hub_model_id: open-r1/R1-Zero-Qwen-7B-Math
hub_model_revision: v05.00
hub_strategy: every_save
learning_rate: 1.0e-06
log_completions: true
log_level: info
logging_first_step: true
logging_steps: 1
logging_strategy: steps
lr_scheduler_type: constant_with_warmup
mask_truncated_completions: true
max_grad_norm: 0.2
max_prompt_length: 1024
max_completion_length: 8192
max_steps: -1
num_generations: 16
num_iterations: 1
num_train_epochs: 1
output_dir: data/R1-Zero-Qwen-7B-Math-v05.00
overwrite_output_dir: true
per_device_train_batch_size: 4
push_to_hub: true
report_to:
- wandb
reward_funcs:
- accuracy
- format
- soft_format
reward_weights:
- 1.0
- 0.25
- 0.25
save_strategy: "steps"
save_steps: 0.1
save_total_limit: 1
sync_ref_model: false
seed: 42
temperature: 1.0
warmup_ratio: 0.1

View file

@ -0,0 +1,66 @@
# Config for 1 + 1 nodes
# Model arguments
model_name_or_path: Qwen/Qwen2.5-7B
model_revision: main
torch_dtype: bfloat16
attn_implementation: flash_attention_2
# Data training arguments
dataset_name: open-r1/Big-Math-RL-Verified-Processed
dataset_config: quintile_3
# GRPO trainer config
callbacks:
- push_to_hub_revision
benchmarks:
- math_500
- aime24
beta: 0.0
bf16: true
do_eval: false
eval_strategy: "no"
use_vllm: true
gradient_accumulation_steps: 16
gradient_checkpointing: true
gradient_checkpointing_kwargs:
  use_reentrant: false
hub_model_id: open-r1/R1-Zero-Qwen-7B-Math
hub_model_revision: v06.00
hub_strategy: every_save
learning_rate: 1.0e-06
log_completions: true
log_level: info
logging_first_step: true
logging_steps: 1
logging_strategy: steps
lr_scheduler_type: constant_with_warmup
mask_truncated_completions: true
max_grad_norm: 0.2
max_prompt_length: 1024
max_completion_length: 8192
max_steps: -1
num_generations: 16
num_iterations: 1
num_train_epochs: 0.897 # 20k prompts
output_dir: data/R1-Zero-Qwen-7B-Math-v06.00
overwrite_output_dir: true
per_device_train_batch_size: 4
push_to_hub: true
report_to:
- wandb
reward_funcs:
- accuracy
- format
- soft_format
reward_weights:
- 1.0
- 0.5
- 0.5
save_strategy: "steps"
save_steps: 0.1
save_total_limit: 1
sync_ref_model: false
seed: 42
temperature: 1.0
warmup_ratio: 0.1

View file

@ -0,0 +1,69 @@
# Config for 1 + 1 nodes
# Model arguments
model_name_or_path: Qwen/Qwen2.5-7B
model_revision: main
torch_dtype: bfloat16
attn_implementation: flash_attention_2
# Data training arguments
chat_template: "{%- if messages[0]['role'] == 'system' %}\n{{- messages[0]['content'] }}\n{%- else %}\n{{- 'A conversation between User and Assistant. The user asks a question, and the Assistant solves it. The assistant first thinks about the reasoning process in the mind and then provides the user with the answer. The reasoning process and answer are enclosed within <think>...</think> and <answer>...</answer> tags, respectively, i.e., \\n<think>\\nreasoning process here\\n</think>\\n<answer>\\nanswer here\\n</answer>.' }}\n{%- endif %}\n{%- for message in messages %}\n {%- if message['role'] == 'user' %}\n {{- '\\n\\nUser: ' + message['content'].strip() }}\n {%- elif message['role'] == 'system' %}\n {{- message['content'] }}\n {%- elif message['role'] == 'assistant' %}\n {{- '\\n\\nAssistant: ' + message['content'] }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '\\n\\nAssistant: ' }}\n{%- endif %}"
dataset_name: open-r1/DAPO-Math-17k-Processed
dataset_config: all
# GRPO trainer config
callbacks:
- push_to_hub_revision
benchmarks:
- math_500
- aime24
beta: 0.0
bf16: true
do_eval: false
eval_strategy: "no"
use_vllm: true
gradient_accumulation_steps: 16
generation_batch_size: 8192
gradient_checkpointing: true
gradient_checkpointing_kwargs:
  use_reentrant: false
hub_model_id: open-r1/R1-Zero-Qwen-7B-Math
hub_model_revision: v07.00
hub_strategy: every_save
learning_rate: 1.0e-06
log_completions: true
log_level: info
logging_first_step: true
logging_steps: 1
logging_strategy: steps
lr_scheduler_type: constant_with_warmup
mask_truncated_completions: true
max_grad_norm: 0.2
max_prompt_length: 1024
max_completion_length: 8192
max_steps: -1
num_generations: 16
num_iterations: 1
num_train_epochs: 1
output_dir: data/R1-Zero-Qwen-7B-Math-v07.00
overwrite_output_dir: true
per_device_train_batch_size: 4
push_to_hub: true
report_to:
- wandb
reward_funcs:
- accuracy
- format
- soft_format
reward_weights:
- 1.0
- 0.25
- 0.25
save_strategy: "steps"
save_steps: 0.1
save_total_limit: 1
sync_ref_model: false
seed: 42
temperature: 1.0
warmup_ratio: 0.1
epsilon: 0.2

View file

@ -0,0 +1,69 @@
# Config for 1 + 1 nodes
# Model arguments
model_name_or_path: Qwen/Qwen2.5-7B
model_revision: main
torch_dtype: bfloat16
attn_implementation: flash_attention_2
# Data training arguments
chat_template: "{%- if messages[0]['role'] == 'system' %}\n{{- messages[0]['content'] }}\n{%- else %}\n{{- 'A conversation between User and Assistant. The user asks a question, and the Assistant solves it. The assistant first thinks about the reasoning process in the mind and then provides the user with the answer. The reasoning process and answer are enclosed within <think>...</think> and <answer>...</answer> tags, respectively, i.e., \\n<think>\\nreasoning process here\\n</think>\\n<answer>\\nanswer here\\n</answer>.' }}\n{%- endif %}\n{%- for message in messages %}\n {%- if message['role'] == 'user' %}\n {{- '\\n\\nUser: ' + message['content'].strip() }}\n {%- elif message['role'] == 'system' %}\n {{- message['content'] }}\n {%- elif message['role'] == 'assistant' %}\n {{- '\\n\\nAssistant: ' + message['content'] }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '\\n\\nAssistant: ' }}\n{%- endif %}"
dataset_name: open-r1/DAPO-Math-17k-Processed
dataset_config: all
# GRPO trainer config
callbacks:
- push_to_hub_revision
benchmarks:
- math_500
- aime24
beta: 0.0
bf16: true
do_eval: false
eval_strategy: "no"
use_vllm: true
gradient_accumulation_steps: 16
generation_batch_size: 512
gradient_checkpointing: true
gradient_checkpointing_kwargs:
  use_reentrant: false
hub_model_id: open-r1/R1-Zero-Qwen-7B-Math
hub_model_revision: v08.00
hub_strategy: every_save
learning_rate: 1.0e-06
log_completions: true
log_level: info
logging_first_step: true
logging_steps: 1
logging_strategy: steps
lr_scheduler_type: constant_with_warmup
mask_truncated_completions: true
max_grad_norm: 0.2
max_prompt_length: 1024
max_completion_length: 8192
max_steps: -1
num_generations: 16
num_iterations: 1
num_train_epochs: 1
output_dir: data/R1-Zero-Qwen-7B-Math-v08.00
overwrite_output_dir: true
per_device_train_batch_size: 4
push_to_hub: true
report_to:
- wandb
reward_funcs:
- accuracy
- format
- soft_format
reward_weights:
- 1.0
- 0.25
- 0.25
save_strategy: "steps"
save_steps: 0.1
save_total_limit: 1
sync_ref_model: false
seed: 42
temperature: 1.0
warmup_ratio: 0.1
epsilon: 0.2

View file

@ -143,6 +143,7 @@ class GRPOConfig(trl.GRPOConfig):
hub_model_revision: Optional[str] = field(
default="main", metadata={"help": "The Hub model branch to push the model to."}
)
num_completions_to_print: int = field(default=0, metadata={"help": "Number of completions to print."})
overwrite_hub_revision: bool = field(default=False, metadata={"help": "Whether to overwrite the Hub revision."})
push_to_hub_revision: bool = field(default=False, metadata={"help": "Whether to push to a Hub revision/branch."})
wandb_entity: Optional[str] = field(
@ -157,6 +158,12 @@ class GRPOConfig(trl.GRPOConfig):
default=None,
metadata={"help": ("The group to store runs under.")},
)
wandb_log_unique_prompts: bool = field(
default=True,
metadata={
"help": ("Whether to log the unique prompts to wandb. This will create a new run for each unique prompt.")
},
)
@dataclass

View file

@ -82,7 +82,7 @@ def accuracy_reward(completions: list[list[dict[str, str]]], solution: list[str]
return rewards
def format_reward(completions, **kwargs):
def format_reward(completions, **kwargs) -> list[float]:
"""Reward function that checks if the reasoning process is enclosed within <think> and </think> tags, while the final answer is enclosed within <answer> and </answer> tags."""
pattern = r"^<think>\n.*?\n</think>\n<answer>\n.*?\n</answer>$"
completion_contents = [completion[0]["content"] for completion in completions]
@ -90,6 +90,34 @@ def format_reward(completions, **kwargs):
return [1.0 if match else 0.0 for match in matches]
def soft_format_reward(completions, **kwargs) -> list[float]:
    """Score completions on a lenient <think>/<answer> layout check.

    A completion earns 1.0 when its text yields exactly one
    ``<think>...</think>`` regex match and exactly one
    ``<answer>...</answer>`` regex match, and the think block starts before
    the answer block. Every other completion earns 0.0. Text surrounding the
    two blocks (including newlines inside them) is not otherwise constrained.

    Args:
        completions: Sequence of completions; the text is read from
            ``completion[0]["content"]`` of each entry.
        **kwargs: Accepted for interface compatibility; unused here.

    Returns:
        One float per completion, in input order.
    """
    # DOTALL lets the lazy ``.*?`` span line breaks inside a block.
    think_re = re.compile(r"<think>.*?</think>", re.DOTALL)
    answer_re = re.compile(r"<answer>.*?</answer>", re.DOTALL)

    def _score(text: str) -> float:
        think_blocks = think_re.findall(text)
        answer_blocks = answer_re.findall(text)
        # Anything other than a single block of each kind scores zero.
        if len(think_blocks) != 1 or len(answer_blocks) != 1:
            return 0.0
        # Ordering check: the think block must begin before the answer block.
        think_pos = text.find(think_blocks[0])
        answer_pos = text.find(answer_blocks[0])
        return 1.0 if think_pos < answer_pos else 0.0

    texts = [completion[0]["content"] for completion in completions]
    return [_score(text) for text in texts]
def tag_count_reward(completions, **kwargs) -> list[float]:
"""Reward function that checks if we produce the desired number of think and answer tags associated with `format_reward()`.
@ -507,6 +535,21 @@ def binary_code_reward(
return output
def weighted_binary_code_reward(
    completions,
    num_parallel: int = 2,
    e2b_router_url=None,
    binary_threshold: float = 0.99,
    non_binary_weight: float = 0.1,
    **kwargs,
) -> list[float]:
    """Combine a thresholded (binary) code reward with a weighted share of the raw code reward.

    Each reward produced by `code_reward()` is mapped to
    `(1.0 if reward > binary_threshold else 0.0) + non_binary_weight * reward`.

    Args:
        completions: Completions to score; forwarded unchanged to `code_reward()`.
        num_parallel: Forwarded to `code_reward()`.
        e2b_router_url: Forwarded to `code_reward()`.
        binary_threshold: A raw reward strictly above this value earns the
            binary bonus of 1.0. Defaults to 0.99.
        non_binary_weight: Multiplier applied to the raw reward before it is
            added to the binary part. Defaults to 0.1.
        **kwargs: Forwarded to `code_reward()`.

    Returns:
        One entry per reward returned by `code_reward()`. An entry is None
        wherever `code_reward()` returned None.
    """
    rewards = code_reward(completions, num_parallel=num_parallel, e2b_router_url=e2b_router_url, **kwargs)

    output = []
    for reward in rewards:
        if reward is None:
            # Pass missing rewards through untouched instead of scoring them.
            output.append(None)
            continue
        binary_reward = 1.0 if reward > binary_threshold else 0.0
        output.append(binary_reward + non_binary_weight * reward)
    return output
def code_reward(
completions,
@ -647,6 +690,7 @@ def get_reward_funcs(script_args) -> list[Callable]:
REWARD_FUNCS_REGISTRY = {
"accuracy": accuracy_reward,
"format": format_reward,
"soft_format": soft_format_reward,
"reasoning_steps": reasoning_steps_reward,
"cosine": get_cosine_scaled_reward(
min_value_wrong=script_args.cosine_min_value_wrong,
@ -678,6 +722,14 @@ def get_reward_funcs(script_args) -> list[Callable]:
),
binary_code_reward,
),
"weighted_binary_code_reward": update_wrapper(
partial(
weighted_binary_code_reward,
num_parallel=script_args.parallel_code_exec_per_proc,
e2b_router_url=script_args.e2b_router_url,
),
weighted_binary_code_reward,
),
"ioi_code": update_wrapper(
partial(
ioi_code_reward,

View file

@ -27,6 +27,7 @@ from open_r1.rewards import (
get_soft_overlong_punishment,
len_reward,
reasoning_steps_reward,
soft_format_reward,
tag_count_reward,
)
@ -75,28 +76,7 @@ class TestGetRewardFuncs(unittest.TestCase):
self.assertEqual(func_name, func.__name__)
class TestRewards(unittest.TestCase):
    """Unit tests for `accuracy_reward` on boxed answers."""

    @staticmethod
    def _first_reward(boxed_answer, gold):
        """Score a single completion against a single gold solution and return the first reward."""
        predictions = [[{"content": boxed_answer}]]
        return accuracy_reward(predictions, [gold])[0]

    def test_accuracy_reward_correct_answer(self):
        """accuracy_reward yields 1.0 when the boxed fraction equals the gold fraction."""
        self.assertEqual(self._first_reward(r"\boxed{\frac{63}{400}}", r"\frac{63}{400}"), 1.0)

    def test_accuracy_reward_wrong_answer(self):
        """accuracy_reward yields 0.0 when the boxed fraction differs from the gold fraction."""
        self.assertEqual(self._first_reward(r"\boxed{\frac{64}{400}}", r"\frac{63}{400}"), 0.0)

    def test_accuracy_reward_wrong_answer_no_latex(self):
        """accuracy_reward yields 0.0 for a wrong answer when the gold solution has no latex."""
        self.assertEqual(self._first_reward(r"\boxed{3}", "6"), 0.0)
class TestFormatRewards(unittest.TestCase):
def test_format_reward_correct(self):
"""Test format_reward with correct format."""
completion = [[{"content": "<think>\nSome reasoning\n</think>\n<answer>\nThe answer\n</answer>"}]]
@ -118,6 +98,60 @@ class TestRewards(unittest.TestCase):
rewards = format_reward(completion)
self.assertEqual(rewards[0], 0.0)
class TestSoftFormatReward(unittest.TestCase):
    """Unit tests for `soft_format_reward`, which tolerates loose whitespace around the tags."""

    @staticmethod
    def _first_reward(text):
        """Wrap `text` as a single completion and return the first reward for it."""
        return soft_format_reward([[{"content": text}]])[0]

    def test_correct_with_newlines(self):
        """Leading free text plus newline-separated blocks is accepted."""
        text = "Here is my reasoning: <think>\nSome reasoning\n</think>\n<answer>\nThe answer\n</answer>"
        self.assertEqual(self._first_reward(text), 1.0)

    def test_correct_without_newlines(self):
        """Blocks written back to back without any newlines are accepted."""
        text = "Here is my reasoning: <think>Some reasoning</think><answer>The answer</answer>"
        self.assertEqual(self._first_reward(text), 1.0)

    def test_correct_with_extra_spaces(self):
        """Spaces inside and between the blocks are accepted."""
        text = "Here is my reasoning: <think> Some reasoning </think> <answer> The answer </answer>"
        self.assertEqual(self._first_reward(text), 1.0)

    def test_correct_with_strict_format(self):
        """The strict newline-delimited layout is accepted as well."""
        text = "<think>\nSome reasoning\n</think>\n<answer>\nThe answer\n</answer>"
        self.assertEqual(self._first_reward(text), 1.0)

    def test_incorrect_with_multiple_reasoning_block(self):
        """A second think/answer pair makes the completion invalid."""
        text = "Here is my reasoning: <think> Some reasoning </think> <answer> The answer </answer> New rambling <think> Some reasoning </think> <answer> The answer </answer>"
        self.assertEqual(self._first_reward(text), 0.0)

    def test_incorrect_with_answer_before_think(self):
        """An answer block placed before the think block is rejected."""
        text = "<answer>The answer</answer><think>Some reasoning</think>"
        self.assertEqual(self._first_reward(text), 0.0)

    def test_incorrect_missing_think_block(self):
        """A completion without a think block is rejected."""
        text = "Here is my reasoning: <answer>The answer</answer>"
        self.assertEqual(self._first_reward(text), 0.0)

    def test_incorrect_missing_answer_block(self):
        """A completion without an answer block is rejected."""
        text = "Here is my reasoning: <think>Some reasoning</think>"
        self.assertEqual(self._first_reward(text), 0.0)
class TestReasoningStepsReward(unittest.TestCase):
def test_reasoning_steps_reward(self):
"""Test reasoning_steps_reward with various formats."""
test_cases = [
@ -136,6 +170,29 @@ class TestRewards(unittest.TestCase):
rewards = reasoning_steps_reward(completion)
self.assertAlmostEqual(rewards[0], expected_reward)
class TestRewards(unittest.TestCase):
def test_accuracy_reward_correct_answer(self):
    """accuracy_reward should give 1.0 when the boxed fraction equals the gold fraction."""
    gold_answers = [r"\frac{63}{400}"]
    model_outputs = [[{"content": r"\boxed{\frac{63}{400}}"}]]
    scores = accuracy_reward(model_outputs, gold_answers)
    self.assertEqual(scores[0], 1.0)
def test_accuracy_reward_wrong_answer(self):
    """accuracy_reward should give 0.0 when the boxed fraction differs from the gold fraction."""
    gold_answers = [r"\frac{63}{400}"]
    model_outputs = [[{"content": r"\boxed{\frac{64}{400}}"}]]
    scores = accuracy_reward(model_outputs, gold_answers)
    self.assertEqual(scores[0], 0.0)
def test_accuracy_reward_wrong_answer_no_latex(self):
    """accuracy_reward should give 0.0 for a wrong answer when the gold solution has no latex."""
    gold_answers = ["6"]
    model_outputs = [[{"content": r"\boxed{3}"}]]
    scores = accuracy_reward(model_outputs, gold_answers)
    self.assertEqual(scores[0], 0.0)
def test_multiple_completions(self):
"""Test handling multiple completions at once."""
completions = [