mirror of
https://github.com/huggingface/open-r1.git
synced 2026-06-24 01:54:06 +00:00
Compare commits
52 commits
main
...
grpo-math-
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
aa6cefcd40 | ||
|
|
4b73342048 | ||
|
|
0ed9ea370b | ||
|
|
b43069370d | ||
|
|
464d9511f2 | ||
|
|
d9a6c086f1 | ||
|
|
a82c1fd00a | ||
|
|
8d993d54e9 | ||
|
|
2c0cac5da2 | ||
|
|
46c16569a1 | ||
|
|
5f0b8f80c5 | ||
|
|
f27c7327e0 | ||
|
|
2f4b0daba9 | ||
|
|
cebaad59f8 | ||
|
|
2715d3174a | ||
|
|
c24ffd78db | ||
|
|
0df1654ef9 | ||
|
|
be72ce6513 | ||
|
|
2f0b983cc9 | ||
|
|
06bdd50334 | ||
|
|
f3920f8890 | ||
|
|
a5f3baafb6 |
||
|
|
b6a73c07f3 | ||
|
|
3c312f883c | ||
|
|
8500f41c69 | ||
|
|
c1d2352772 | ||
|
|
2d74588a9a | ||
|
|
7a8dead7ad | ||
|
|
b29e672c18 | ||
|
|
9bed487d67 | ||
|
|
939c74c446 | ||
|
|
f62e42af5c | ||
|
|
23b7b69f79 |
||
|
|
0f98a5ac74 | ||
|
|
10a555b4bb | ||
|
|
1d7d66a095 | ||
|
|
995beb88fc | ||
|
|
f1832c5cb6 | ||
|
|
d51de45bb2 | ||
|
|
2897519b8e | ||
|
|
82a1167186 | ||
|
|
f22657b7ed | ||
|
|
5fe41f0839 | ||
|
|
8f26046af4 |
||
|
|
d9c8cd8557 | ||
|
|
1078b73d51 | ||
|
|
5747cfc095 | ||
|
|
1d6c0bbc4b | ||
|
|
b35213cd64 | ||
|
|
9e0e47806c | ||
|
|
8a4af61829 | ||
|
|
b5e6f9c384 |
25 changed files with 1640 additions and 23 deletions
68
recipes/OpenR1-Zero-32B-Math/grpo/config_v00.00.yaml
Normal file
68
recipes/OpenR1-Zero-32B-Math/grpo/config_v00.00.yaml
Normal file
|
|
@ -0,0 +1,68 @@
|
|||
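# Config for 4 + 1 nodes (presumably 4 training nodes + 1 vLLM server node, since use_vllm is true; confirm)
|
||||
# NOTE(review): do_eval is set twice in this config (both false); remove one of the duplicate keys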
|
||||
# Model arguments
|
||||
model_name_or_path: Qwen/Qwen2.5-32B
|
||||
model_revision: main
|
||||
torch_dtype: bfloat16
|
||||
attn_implementation: flash_attention_2
|
||||
|
||||
# Data training arguments
|
||||
chat_template: "{%- if messages[0]['role'] == 'system' %}\n{{- messages[0]['content'] }}\n{%- else %}\n{{- 'A conversation between User and Assistant. The user asks a question, and the Assistant solves it. The assistant first thinks about the reasoning process in the mind and then provides the user with the answer. The reasoning process and answer are enclosed within <think>...</think> and <answer>...</answer> tags, respectively, i.e., \\n<think>\\nreasoning process here\\n</think>\\n<answer>\\nanswer here\\n</answer>.' }}\n{%- endif %}\n{%- for message in messages %}\n {%- if message['role'] == 'user' %}\n {{- '\\n\\nUser: ' + message['content'].strip() }}\n {%- elif message['role'] == 'system' %}\n {{- message['content'] }}\n {%- elif message['role'] == 'assistant' %}\n {{- '\\n\\nAssistant: ' + message['content'] }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '\\n\\nAssistant: ' }}\n{%- endif %}"
|
||||
dataset_name: open-r1/DAPO-Math-17k-Processed
|
||||
dataset_config: all
|
||||
|
||||
# GRPO trainer config
|
||||
callbacks:
|
||||
- push_to_hub_revision
|
||||
benchmarks:
|
||||
- math_500
|
||||
- aime24
|
||||
beta: 0.0
|
||||
bf16: true
|
||||
do_eval: false
|
||||
eval_strategy: "no"
|
||||
use_vllm: true
|
||||
do_eval: false
|
||||
gradient_accumulation_steps: 16
|
||||
gradient_checkpointing: true
|
||||
# gradient_checkpointing_kwargs:
|
||||
# use_reentrant: false
|
||||
hub_model_id: open-r1/R1-Zero-Qwen-32B-Math
|
||||
hub_model_revision: v00.00
|
||||
hub_strategy: every_save
|
||||
learning_rate: 1.0e-06
|
||||
log_completions: true
|
||||
log_level: info
|
||||
logging_first_step: true
|
||||
logging_steps: 1
|
||||
logging_strategy: steps
|
||||
lr_scheduler_type: constant_with_warmup
|
||||
mask_truncated_completions: true
|
||||
max_grad_norm: 0.2
|
||||
max_prompt_length: 1024
|
||||
max_completion_length: 8192
|
||||
max_steps: -1
|
||||
num_generations: 16
|
||||
num_iterations: 1
|
||||
num_train_epochs: 1
|
||||
output_dir: data/R1-Zero-Qwen-32B-Math-v00.00
|
||||
overwrite_output_dir: true
|
||||
per_device_train_batch_size: 1
|
||||
push_to_hub: true
|
||||
report_to:
|
||||
- wandb
|
||||
reward_funcs:
|
||||
- accuracy
|
||||
- format
|
||||
- soft_format
|
||||
reward_weights:
|
||||
- 1.0
|
||||
- 0.25
|
||||
- 0.25
|
||||
save_strategy: "steps"
|
||||
save_steps: 0.1
|
||||
save_total_limit: 1
|
||||
sync_ref_model: false
|
||||
seed: 42
|
||||
temperature: 1.0
|
||||
vllm_server_timeout: 1200
|
||||
warmup_ratio: 0.1
|
||||
70
recipes/OpenR1-Zero-7B-Code/grpo/config_v00.00.yaml
Normal file
70
recipes/OpenR1-Zero-7B-Code/grpo/config_v00.00.yaml
Normal file
|
|
@ -0,0 +1,70 @@
|
|||
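# Config for 1 + 1 nodes (presumably 1 training node + 1 vLLM server node, since use_vllm is true; confirm)
|
||||
# NOTE(review): do_eval is set twice in this config (both false); remove one of the duplicate keys
|
||||
# NOTE(review): benchmarks lists math_500 / aime24 although this recipe trains on coding problems; confirm this is intended
|
||||
# NOTE(review): e2b_router_url is a hard-coded host name; confirm it is reachable from the cluster this config runs on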
|
||||
# Model arguments
|
||||
model_name_or_path: Qwen/Qwen2.5-7B
|
||||
model_revision: main
|
||||
torch_dtype: bfloat16
|
||||
attn_implementation: flash_attention_2
|
||||
|
||||
# Data training arguments
|
||||
chat_template: "{%- if messages[0]['role'] == 'system' %}\n{{- messages[0]['content'] }}\n{%- else %}\n{{- 'A conversation between User and Assistant. The user asks a question, and the Assistant solves it. The assistant first thinks about the reasoning process in the mind and then provides the user with the answer. The reasoning process and answer are enclosed within <think>...</think> and <answer>...</answer> tags, respectively, i.e., \\n<think>\\nreasoning process here\\n</think>\\n<answer>\\nanswer here\\n</answer>.' }}\n{%- endif %}\n{%- for message in messages %}\n {%- if message['role'] == 'user' %}\n {{- '\\n\\nUser: ' + message['content'].strip() }}\n {%- elif message['role'] == 'system' %}\n {{- message['content'] }}\n {%- elif message['role'] == 'assistant' %}\n {{- '\\n\\nAssistant: ' + message['content'] }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '\\n\\nAssistant: ' }}\n{%- endif %}"
|
||||
dataset_name: open-r1/verifiable-coding-problems-python_decontaminated-tested-shuffled
|
||||
dataset_prompt_column: problem
|
||||
|
||||
# GRPO trainer config
|
||||
callbacks:
|
||||
- push_to_hub_revision
|
||||
benchmarks:
|
||||
- math_500
|
||||
- aime24
|
||||
beta: 0.0
|
||||
bf16: true
|
||||
do_eval: false
|
||||
eval_strategy: "no"
|
||||
use_vllm: true
|
||||
do_eval: false
|
||||
gradient_accumulation_steps: 16
|
||||
generation_batch_size: 512
|
||||
gradient_checkpointing: true
|
||||
gradient_checkpointing_kwargs:
|
||||
use_reentrant: false
|
||||
hub_model_id: open-r1/R1-Zero-Qwen-7B-Code
|
||||
hub_model_revision: v00.00
|
||||
hub_strategy: every_save
|
||||
learning_rate: 1.0e-06
|
||||
log_completions: true
|
||||
log_level: info
|
||||
logging_first_step: true
|
||||
logging_steps: 1
|
||||
logging_strategy: steps
|
||||
lr_scheduler_type: constant_with_warmup
|
||||
mask_truncated_completions: true
|
||||
max_grad_norm: 0.2
|
||||
max_prompt_length: 1024
|
||||
max_completion_length: 8192
|
||||
max_steps: -1
|
||||
num_generations: 16
|
||||
num_iterations: 1
|
||||
num_train_epochs: 1
|
||||
output_dir: data/R1-Zero-Qwen-7B-Code-v00.00
|
||||
overwrite_output_dir: true
|
||||
per_device_train_batch_size: 4
|
||||
push_to_hub: true
|
||||
report_to:
|
||||
- wandb
|
||||
reward_funcs:
|
||||
- weighted_binary_code_reward
|
||||
- format
|
||||
- soft_format
|
||||
reward_weights:
|
||||
- 1.0
|
||||
- 0.25
|
||||
- 0.25
|
||||
e2b_router_url: ip-10-53-83-71:8000
|
||||
save_strategy: "steps"
|
||||
save_steps: 0.1
|
||||
save_total_limit: 1
|
||||
sync_ref_model: false
|
||||
seed: 42
|
||||
temperature: 1.0
|
||||
warmup_ratio: 0.0
|
||||
epsilon: 0.2
|
||||
64
recipes/OpenR1-Zero-7B-Math/grpo/config_v00.00.yaml
Normal file
64
recipes/OpenR1-Zero-7B-Math/grpo/config_v00.00.yaml
Normal file
|
|
@ -0,0 +1,64 @@
|
|||
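# Config for 1 + 1 nodes (presumably 1 training node + 1 vLLM server node, since use_vllm is true; confirm)
|
||||
# NOTE(review): do_eval is set twice in this config (both false); remove one of the duplicate keys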
|
||||
# Model arguments
|
||||
model_name_or_path: Qwen/Qwen2.5-7B
|
||||
model_revision: main
|
||||
torch_dtype: bfloat16
|
||||
attn_implementation: flash_attention_2
|
||||
|
||||
# Data training arguments
|
||||
chat_template: "{%- if messages[0]['role'] == 'system' %}\n{{- messages[0]['content'] }}\n{%- else %}\n{{- 'A conversation between User and Assistant. The user asks a question, and the Assistant solves it. The assistant first thinks about the reasoning process in the mind and then provides the user with the answer. The reasoning process and answer are enclosed within <think>...</think> and <answer>...</answer> tags, respectively, i.e., \\n<think>\\nreasoning process here\\n</think>\\n<answer>\\nanswer here\\n</answer>.' }}\n{%- endif %}\n{%- for message in messages %}\n {%- if message['role'] == 'user' %}\n {{- '\\n\\nUser: ' + message['content'].strip() }}\n {%- elif message['role'] == 'system' %}\n {{- message['content'] }}\n {%- elif message['role'] == 'assistant' %}\n {{- '\\n\\nAssistant: ' + message['content'] }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '\\n\\nAssistant: ' }}\n{%- endif %}"
|
||||
dataset_name: open-r1/Big-Math-RL-Verified-Processed
|
||||
dataset_config: all
|
||||
|
||||
# GRPO trainer config
|
||||
callbacks:
|
||||
- push_to_hub_revision
|
||||
benchmarks:
|
||||
- math_500
|
||||
beta: 0.001
|
||||
bf16: true
|
||||
do_eval: false
|
||||
eval_strategy: "no"
|
||||
use_vllm: true
|
||||
do_eval: false
|
||||
gradient_accumulation_steps: 16
|
||||
gradient_checkpointing: true
|
||||
gradient_checkpointing_kwargs:
|
||||
use_reentrant: false
|
||||
hub_model_id: open-r1/R1-Zero-Qwen-7B-Math
|
||||
hub_model_revision: v00.00
|
||||
hub_strategy: every_save
|
||||
learning_rate: 1.0e-06
|
||||
log_completions: true
|
||||
log_level: info
|
||||
logging_first_step: true
|
||||
logging_steps: 1
|
||||
logging_strategy: steps
|
||||
lr_scheduler_type: constant_with_warmup
|
||||
max_grad_norm: 0.2
|
||||
max_prompt_length: 1024
|
||||
max_completion_length: 8192
|
||||
max_steps: -1
|
||||
num_generations: 16
|
||||
num_train_epochs: 0.1 # 21.6k prompts
|
||||
output_dir: data/R1-Zero-Qwen-7B-Math-v00.00
|
||||
overwrite_output_dir: true
|
||||
per_device_train_batch_size: 4
|
||||
push_to_hub: true
|
||||
report_to:
|
||||
- wandb
|
||||
reward_funcs:
|
||||
- accuracy
|
||||
- format
|
||||
reward_weights:
|
||||
- 1.0
|
||||
- 0.2
|
||||
save_strategy: "steps"
|
||||
save_steps: 0.1
|
||||
save_total_limit: 1
|
||||
sync_ref_model: true
|
||||
ref_model_sync_steps: 100
|
||||
ref_model_mixup_alpha: 1.0
|
||||
seed: 42
|
||||
temperature: 1.0
|
||||
warmup_ratio: 0.1
|
||||
66
recipes/OpenR1-Zero-7B-Math/grpo/config_v01.00.yaml
Normal file
66
recipes/OpenR1-Zero-7B-Math/grpo/config_v01.00.yaml
Normal file
|
|
@ -0,0 +1,66 @@
|
|||
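# Config for 1 + 1 nodes (presumably 1 training node + 1 vLLM server node, since use_vllm is true; confirm)
|
||||
# NOTE(review): do_eval is set twice in this config (both false); remove one of the duplicate keys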
|
||||
# Model arguments
|
||||
model_name_or_path: Qwen/Qwen2.5-7B
|
||||
model_revision: main
|
||||
torch_dtype: bfloat16
|
||||
attn_implementation: flash_attention_2
|
||||
|
||||
# Data training arguments
|
||||
chat_template: "{%- if messages[0]['role'] == 'system' %}\n{{- messages[0]['content'] }}\n{%- else %}\n{{- 'A conversation between User and Assistant. The user asks a question, and the Assistant solves it. The assistant first thinks about the reasoning process in the mind and then provides the user with the answer. The reasoning process and answer are enclosed within <think>...</think> and <answer>...</answer> tags, respectively, i.e., \\n<think>\\nreasoning process here\\n</think>\\n<answer>\\nanswer here\\n</answer>.' }}\n{%- endif %}\n{%- for message in messages %}\n {%- if message['role'] == 'user' %}\n {{- '\\n\\nUser: ' + message['content'].strip() }}\n {%- elif message['role'] == 'system' %}\n {{- message['content'] }}\n {%- elif message['role'] == 'assistant' %}\n {{- '\\n\\nAssistant: ' + message['content'] }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '\\n\\nAssistant: ' }}\n{%- endif %}"
|
||||
dataset_name: open-r1/Big-Math-RL-Verified-Processed
|
||||
dataset_config: all
|
||||
|
||||
# GRPO trainer config
|
||||
callbacks:
|
||||
- push_to_hub_revision
|
||||
benchmarks:
|
||||
- math_500
|
||||
- aime24
|
||||
beta: 0.0
|
||||
bf16: true
|
||||
do_eval: false
|
||||
eval_strategy: "no"
|
||||
use_vllm: true
|
||||
do_eval: false
|
||||
gradient_accumulation_steps: 16
|
||||
gradient_checkpointing: true
|
||||
gradient_checkpointing_kwargs:
|
||||
use_reentrant: false
|
||||
hub_model_id: open-r1/R1-Zero-Qwen-7B-Math
|
||||
hub_model_revision: v01.00
|
||||
hub_strategy: every_save
|
||||
learning_rate: 1.0e-06
|
||||
log_completions: true
|
||||
log_level: info
|
||||
logging_first_step: true
|
||||
logging_steps: 1
|
||||
logging_strategy: steps
|
||||
lr_scheduler_type: constant_with_warmup
|
||||
mask_truncated_completions: true
|
||||
max_grad_norm: 0.2
|
||||
max_prompt_length: 1024
|
||||
max_completion_length: 8192
|
||||
max_steps: -1
|
||||
num_generations: 16
|
||||
num_train_epochs: 0.1 # 21.6k prompts
|
||||
output_dir: data/R1-Zero-Qwen-7B-Math-v01.00
|
||||
overwrite_output_dir: true
|
||||
per_device_train_batch_size: 4
|
||||
push_to_hub: true
|
||||
report_to:
|
||||
- wandb
|
||||
reward_funcs:
|
||||
- accuracy
|
||||
- format
|
||||
- soft_format
|
||||
reward_weights:
|
||||
- 1.0
|
||||
- 0.25
|
||||
- 0.25
|
||||
save_strategy: "steps"
|
||||
save_steps: 0.1
|
||||
save_total_limit: 1
|
||||
sync_ref_model: false
|
||||
seed: 42
|
||||
temperature: 1.0
|
||||
warmup_ratio: 0.1
|
||||
67
recipes/OpenR1-Zero-7B-Math/grpo/config_v02.00.yaml
Normal file
67
recipes/OpenR1-Zero-7B-Math/grpo/config_v02.00.yaml
Normal file
|
|
@ -0,0 +1,67 @@
|
|||
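# Config for 1 + 1 nodes (presumably 1 training node + 1 vLLM server node, since use_vllm is true; confirm)
|
||||
# NOTE(review): do_eval is set twice in this config (both false); remove one of the duplicate keys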
|
||||
# Model arguments
|
||||
model_name_or_path: Qwen/Qwen2.5-7B
|
||||
model_revision: main
|
||||
torch_dtype: bfloat16
|
||||
attn_implementation: flash_attention_2
|
||||
|
||||
# Data training arguments
|
||||
chat_template: "{%- if messages[0]['role'] == 'system' %}\n{{- messages[0]['content'] }}\n{%- else %}\n{{- 'A conversation between User and Assistant. The user asks a question, and the Assistant solves it. The assistant first thinks about the reasoning process in the mind and then provides the user with the answer. The reasoning process and answer are enclosed within <think>...</think> and <answer>...</answer> tags, respectively, i.e., \\n<think>\\nreasoning process here\\n</think>\\n<answer>\\nanswer here\\n</answer>.' }}\n{%- endif %}\n{%- for message in messages %}\n {%- if message['role'] == 'user' %}\n {{- '\\n\\nUser: ' + message['content'].strip() }}\n {%- elif message['role'] == 'system' %}\n {{- message['content'] }}\n {%- elif message['role'] == 'assistant' %}\n {{- '\\n\\nAssistant: ' + message['content'] }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '\\n\\nAssistant: ' }}\n{%- endif %}"
|
||||
dataset_name: open-r1/Big-Math-RL-Verified-Processed
|
||||
dataset_config: level_2_3_4_5
|
||||
|
||||
# GRPO trainer config
|
||||
callbacks:
|
||||
- push_to_hub_revision
|
||||
benchmarks:
|
||||
- math_500
|
||||
- aime24
|
||||
beta: 0.0
|
||||
bf16: true
|
||||
do_eval: false
|
||||
eval_strategy: "no"
|
||||
use_vllm: true
|
||||
do_eval: false
|
||||
gradient_accumulation_steps: 16
|
||||
gradient_checkpointing: true
|
||||
gradient_checkpointing_kwargs:
|
||||
use_reentrant: false
|
||||
hub_model_id: open-r1/R1-Zero-Qwen-7B-Math
|
||||
hub_model_revision: v02.00
|
||||
hub_strategy: every_save
|
||||
learning_rate: 1.0e-06
|
||||
log_completions: true
|
||||
log_level: info
|
||||
logging_first_step: true
|
||||
logging_steps: 1
|
||||
logging_strategy: steps
|
||||
lr_scheduler_type: constant_with_warmup
|
||||
mask_truncated_completions: true
|
||||
max_grad_norm: 0.2
|
||||
max_prompt_length: 1024
|
||||
max_completion_length: 8192
|
||||
max_steps: -1
|
||||
num_generations: 16
|
||||
num_iterations: 1
|
||||
num_train_epochs: 0.12 # 19.9k prompts
|
||||
output_dir: data/R1-Zero-Qwen-7B-Math-v02.00
|
||||
overwrite_output_dir: true
|
||||
per_device_train_batch_size: 4
|
||||
push_to_hub: true
|
||||
report_to:
|
||||
- wandb
|
||||
reward_funcs:
|
||||
- accuracy
|
||||
- format
|
||||
- soft_format
|
||||
reward_weights:
|
||||
- 1.0
|
||||
- 0.25
|
||||
- 0.25
|
||||
save_strategy: "steps"
|
||||
save_steps: 0.1
|
||||
save_total_limit: 1
|
||||
sync_ref_model: false
|
||||
seed: 42
|
||||
temperature: 1.0
|
||||
warmup_ratio: 0.1
|
||||
67
recipes/OpenR1-Zero-7B-Math/grpo/config_v03.00.yaml
Normal file
67
recipes/OpenR1-Zero-7B-Math/grpo/config_v03.00.yaml
Normal file
|
|
@ -0,0 +1,67 @@
|
|||
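# Config for 1 + 1 nodes (presumably 1 training node + 1 vLLM server node, since use_vllm is true; confirm)
|
||||
# NOTE(review): do_eval is set twice in this config (both false); remove one of the duplicate keys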
|
||||
# Model arguments
|
||||
model_name_or_path: Qwen/Qwen2.5-7B
|
||||
model_revision: main
|
||||
torch_dtype: bfloat16
|
||||
attn_implementation: flash_attention_2
|
||||
|
||||
# Data training arguments
|
||||
chat_template: "{%- if messages[0]['role'] == 'system' %}\n{{- messages[0]['content'] }}\n{%- else %}\n{{- 'A conversation between User and Assistant. The user asks a question, and the Assistant solves it. The assistant first thinks about the reasoning process in the mind and then provides the user with the answer. The reasoning process and answer are enclosed within <think>...</think> and <answer>...</answer> tags, respectively, i.e., \\n<think>\\nreasoning process here\\n</think>\\n<answer>\\nanswer here\\n</answer>.' }}\n{%- endif %}\n{%- for message in messages %}\n {%- if message['role'] == 'user' %}\n {{- '\\n\\nUser: ' + message['content'].strip() }}\n {%- elif message['role'] == 'system' %}\n {{- message['content'] }}\n {%- elif message['role'] == 'assistant' %}\n {{- '\\n\\nAssistant: ' + message['content'] }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '\\n\\nAssistant: ' }}\n{%- endif %}"
|
||||
dataset_name: open-r1/Big-Math-RL-Verified-Processed
|
||||
dataset_config: level_3_4_5
|
||||
|
||||
# GRPO trainer config
|
||||
callbacks:
|
||||
- push_to_hub_revision
|
||||
benchmarks:
|
||||
- math_500
|
||||
- aime24
|
||||
beta: 0.0
|
||||
bf16: true
|
||||
do_eval: false
|
||||
eval_strategy: "no"
|
||||
use_vllm: true
|
||||
do_eval: false
|
||||
gradient_accumulation_steps: 16
|
||||
gradient_checkpointing: true
|
||||
gradient_checkpointing_kwargs:
|
||||
use_reentrant: false
|
||||
hub_model_id: open-r1/R1-Zero-Qwen-7B-Math
|
||||
hub_model_revision: v03.00
|
||||
hub_strategy: every_save
|
||||
learning_rate: 1.0e-06
|
||||
log_completions: true
|
||||
log_level: info
|
||||
logging_first_step: true
|
||||
logging_steps: 1
|
||||
logging_strategy: steps
|
||||
lr_scheduler_type: constant_with_warmup
|
||||
mask_truncated_completions: true
|
||||
max_grad_norm: 0.2
|
||||
max_prompt_length: 1024
|
||||
max_completion_length: 8192
|
||||
max_steps: -1
|
||||
num_generations: 16
|
||||
num_iterations: 1
|
||||
num_train_epochs: 0.16 # 19.5k prompts
|
||||
output_dir: data/R1-Zero-Qwen-7B-Math-v03.00
|
||||
overwrite_output_dir: true
|
||||
per_device_train_batch_size: 4
|
||||
push_to_hub: true
|
||||
report_to:
|
||||
- wandb
|
||||
reward_funcs:
|
||||
- accuracy
|
||||
- format
|
||||
- soft_format
|
||||
reward_weights:
|
||||
- 1.0
|
||||
- 0.25
|
||||
- 0.25
|
||||
save_strategy: "steps"
|
||||
save_steps: 0.1
|
||||
save_total_limit: 1
|
||||
sync_ref_model: false
|
||||
seed: 42
|
||||
temperature: 1.0
|
||||
warmup_ratio: 0.1
|
||||
67
recipes/OpenR1-Zero-7B-Math/grpo/config_v04.00.yaml
Normal file
67
recipes/OpenR1-Zero-7B-Math/grpo/config_v04.00.yaml
Normal file
|
|
@ -0,0 +1,67 @@
|
|||
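# Config for 1 + 1 nodes (presumably 1 training node + 1 vLLM server node, since use_vllm is true; confirm)
|
||||
# NOTE(review): do_eval is set twice in this config (both false); remove one of the duplicate keys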
|
||||
# Model arguments
|
||||
model_name_or_path: Qwen/Qwen2.5-7B
|
||||
model_revision: main
|
||||
torch_dtype: bfloat16
|
||||
attn_implementation: flash_attention_2
|
||||
|
||||
# Data training arguments
|
||||
chat_template: "{%- if messages[0]['role'] == 'system' %}\n{{- messages[0]['content'] }}\n{%- else %}\n{{- 'A conversation between User and Assistant. The user asks a question, and the Assistant solves it. The assistant first thinks about the reasoning process in the mind and then provides the user with the answer. The reasoning process and answer are enclosed within <think>...</think> and <answer>...</answer> tags, respectively, i.e., \\n<think>\\nreasoning process here\\n</think>\\n<answer>\\nanswer here\\n</answer>.' }}\n{%- endif %}\n{%- for message in messages %}\n {%- if message['role'] == 'user' %}\n {{- '\\n\\nUser: ' + message['content'].strip() }}\n {%- elif message['role'] == 'system' %}\n {{- message['content'] }}\n {%- elif message['role'] == 'assistant' %}\n {{- '\\n\\nAssistant: ' + message['content'] }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '\\n\\nAssistant: ' }}\n{%- endif %}"
|
||||
dataset_name: open-r1/Big-Math-RL-Verified-Processed
|
||||
dataset_config: level_4_5
|
||||
|
||||
# GRPO trainer config
|
||||
callbacks:
|
||||
- push_to_hub_revision
|
||||
benchmarks:
|
||||
- math_500
|
||||
- aime24
|
||||
beta: 0.0
|
||||
bf16: true
|
||||
do_eval: false
|
||||
eval_strategy: "no"
|
||||
use_vllm: true
|
||||
do_eval: false
|
||||
gradient_accumulation_steps: 16
|
||||
gradient_checkpointing: true
|
||||
gradient_checkpointing_kwargs:
|
||||
use_reentrant: false
|
||||
hub_model_id: open-r1/R1-Zero-Qwen-7B-Math
|
||||
hub_model_revision: v04.00
|
||||
hub_strategy: every_save
|
||||
learning_rate: 1.0e-06
|
||||
log_completions: true
|
||||
log_level: info
|
||||
logging_first_step: true
|
||||
logging_steps: 1
|
||||
logging_strategy: steps
|
||||
lr_scheduler_type: constant_with_warmup
|
||||
mask_truncated_completions: true
|
||||
max_grad_norm: 0.2
|
||||
max_prompt_length: 1024
|
||||
max_completion_length: 8192
|
||||
max_steps: -1
|
||||
num_generations: 16
|
||||
num_iterations: 1
|
||||
num_train_epochs: 0.25 # 19.8k prompts
|
||||
output_dir: data/R1-Zero-Qwen-7B-Math-v04.00
|
||||
overwrite_output_dir: true
|
||||
per_device_train_batch_size: 4
|
||||
push_to_hub: true
|
||||
report_to:
|
||||
- wandb
|
||||
reward_funcs:
|
||||
- accuracy
|
||||
- format
|
||||
- soft_format
|
||||
reward_weights:
|
||||
- 1.0
|
||||
- 0.25
|
||||
- 0.25
|
||||
save_strategy: "steps"
|
||||
save_steps: 0.1
|
||||
save_total_limit: 1
|
||||
sync_ref_model: false
|
||||
seed: 42
|
||||
temperature: 1.0
|
||||
warmup_ratio: 0.1
|
||||
66
recipes/OpenR1-Zero-7B-Math/grpo/config_v04.10.yaml
Normal file
66
recipes/OpenR1-Zero-7B-Math/grpo/config_v04.10.yaml
Normal file
|
|
@ -0,0 +1,66 @@
|
|||
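# Config for 1 + 1 nodes (presumably 1 training node + 1 vLLM server node, since use_vllm is true; confirm)
|
||||
# NOTE(review): do_eval is set twice in this config (both false); remove one of the duplicate keys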
|
||||
# Model arguments
|
||||
model_name_or_path: open-r1/R1-Zero-Qwen-7B-Math
|
||||
model_revision: v04.00-step-000000310
|
||||
torch_dtype: bfloat16
|
||||
attn_implementation: flash_attention_2
|
||||
|
||||
# Data training arguments
|
||||
dataset_name: open-r1/Big-Math-RL-Verified-Processed
|
||||
dataset_config: level_5
|
||||
|
||||
# GRPO trainer config
|
||||
callbacks:
|
||||
- push_to_hub_revision
|
||||
benchmarks:
|
||||
- math_500
|
||||
- aime24
|
||||
beta: 0.0
|
||||
bf16: true
|
||||
do_eval: false
|
||||
eval_strategy: "no"
|
||||
use_vllm: true
|
||||
do_eval: false
|
||||
gradient_accumulation_steps: 16
|
||||
gradient_checkpointing: true
|
||||
gradient_checkpointing_kwargs:
|
||||
use_reentrant: false
|
||||
hub_model_id: open-r1/R1-Zero-Qwen-7B-Math
|
||||
hub_model_revision: v04.10
|
||||
hub_strategy: every_save
|
||||
learning_rate: 1.0e-06
|
||||
log_completions: true
|
||||
log_level: info
|
||||
logging_first_step: true
|
||||
logging_steps: 1
|
||||
logging_strategy: steps
|
||||
lr_scheduler_type: constant_with_warmup
|
||||
mask_truncated_completions: true
|
||||
max_grad_norm: 0.2
|
||||
max_prompt_length: 1024
|
||||
max_completion_length: 8192
|
||||
max_steps: -1
|
||||
num_generations: 16
|
||||
num_iterations: 1
|
||||
num_train_epochs: 0.53 # 19.9k prompts
|
||||
output_dir: data/R1-Zero-Qwen-7B-Math-v04.10
|
||||
overwrite_output_dir: true
|
||||
per_device_train_batch_size: 4
|
||||
push_to_hub: true
|
||||
report_to:
|
||||
- wandb
|
||||
reward_funcs:
|
||||
- accuracy
|
||||
- format
|
||||
- soft_format
|
||||
reward_weights:
|
||||
- 1.0
|
||||
- 0.25
|
||||
- 0.25
|
||||
save_strategy: "steps"
|
||||
save_steps: 0.1
|
||||
save_total_limit: 1
|
||||
sync_ref_model: false
|
||||
seed: 42
|
||||
temperature: 1.0
|
||||
warmup_ratio: 0.1
|
||||
67
recipes/OpenR1-Zero-7B-Math/grpo/config_v05.00.yaml
Normal file
67
recipes/OpenR1-Zero-7B-Math/grpo/config_v05.00.yaml
Normal file
|
|
@ -0,0 +1,67 @@
|
|||
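# Config for 1 + 1 nodes (presumably 1 training node + 1 vLLM server node, since use_vllm is true; confirm)
|
||||
# NOTE(review): do_eval is set twice in this config (both false); remove one of the duplicate keys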
|
||||
# Model arguments
|
||||
model_name_or_path: Qwen/Qwen2.5-7B
|
||||
model_revision: main
|
||||
torch_dtype: bfloat16
|
||||
attn_implementation: flash_attention_2
|
||||
|
||||
# Data training arguments
|
||||
chat_template: "{%- if messages[0]['role'] == 'system' %}\n{{- messages[0]['content'] }}\n{%- else %}\n{{- 'A conversation between User and Assistant. The user asks a question, and the Assistant solves it. The assistant first thinks about the reasoning process in the mind and then provides the user with the answer. The reasoning process and answer are enclosed within <think>...</think> and <answer>...</answer> tags, respectively, i.e., \\n<think>\\nreasoning process here\\n</think>\\n<answer>\\nanswer here\\n</answer>.' }}\n{%- endif %}\n{%- for message in messages %}\n {%- if message['role'] == 'user' %}\n {{- '\\n\\nUser: ' + message['content'].strip() }}\n {%- elif message['role'] == 'system' %}\n {{- message['content'] }}\n {%- elif message['role'] == 'assistant' %}\n {{- '\\n\\nAssistant: ' + message['content'] }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '\\n\\nAssistant: ' }}\n{%- endif %}"
|
||||
dataset_name: open-r1/DAPO-Math-17k-Processed
|
||||
dataset_config: all
|
||||
|
||||
# GRPO trainer config
|
||||
callbacks:
|
||||
- push_to_hub_revision
|
||||
benchmarks:
|
||||
- math_500
|
||||
- aime24
|
||||
beta: 0.0
|
||||
bf16: true
|
||||
do_eval: false
|
||||
eval_strategy: "no"
|
||||
use_vllm: true
|
||||
do_eval: false
|
||||
gradient_accumulation_steps: 16
|
||||
gradient_checkpointing: true
|
||||
gradient_checkpointing_kwargs:
|
||||
use_reentrant: false
|
||||
hub_model_id: open-r1/R1-Zero-Qwen-7B-Math
|
||||
hub_model_revision: v05.00
|
||||
hub_strategy: every_save
|
||||
learning_rate: 1.0e-06
|
||||
log_completions: true
|
||||
log_level: info
|
||||
logging_first_step: true
|
||||
logging_steps: 1
|
||||
logging_strategy: steps
|
||||
lr_scheduler_type: constant_with_warmup
|
||||
mask_truncated_completions: true
|
||||
max_grad_norm: 0.2
|
||||
max_prompt_length: 1024
|
||||
max_completion_length: 8192
|
||||
max_steps: -1
|
||||
num_generations: 16
|
||||
num_iterations: 1
|
||||
num_train_epochs: 1
|
||||
output_dir: data/R1-Zero-Qwen-7B-Math-v05.00
|
||||
overwrite_output_dir: true
|
||||
per_device_train_batch_size: 4
|
||||
push_to_hub: true
|
||||
report_to:
|
||||
- wandb
|
||||
reward_funcs:
|
||||
- accuracy
|
||||
- format
|
||||
- soft_format
|
||||
reward_weights:
|
||||
- 1.0
|
||||
- 0.25
|
||||
- 0.25
|
||||
save_strategy: "steps"
|
||||
save_steps: 0.1
|
||||
save_total_limit: 1
|
||||
sync_ref_model: false
|
||||
seed: 42
|
||||
temperature: 1.0
|
||||
warmup_ratio: 0.1
|
||||
66
recipes/OpenR1-Zero-7B-Math/grpo/config_v06.00.yaml
Normal file
66
recipes/OpenR1-Zero-7B-Math/grpo/config_v06.00.yaml
Normal file
|
|
@ -0,0 +1,66 @@
|
|||
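# Config for 1 + 1 nodes (presumably 1 training node + 1 vLLM server node, since use_vllm is true; confirm)
|
||||
# NOTE(review): do_eval is set twice in this config (both false); remove one of the duplicate keys
|
||||
# NOTE(review): no chat_template is set here although the model is the base Qwen/Qwen2.5-7B; confirm the omission is intentional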
|
||||
# Model arguments
|
||||
model_name_or_path: Qwen/Qwen2.5-7B
|
||||
model_revision: main
|
||||
torch_dtype: bfloat16
|
||||
attn_implementation: flash_attention_2
|
||||
|
||||
# Data training arguments
|
||||
dataset_name: open-r1/Big-Math-RL-Verified-Processed
|
||||
dataset_config: quintile_3
|
||||
|
||||
# GRPO trainer config
|
||||
callbacks:
|
||||
- push_to_hub_revision
|
||||
benchmarks:
|
||||
- math_500
|
||||
- aime24
|
||||
beta: 0.0
|
||||
bf16: true
|
||||
do_eval: false
|
||||
eval_strategy: "no"
|
||||
use_vllm: true
|
||||
do_eval: false
|
||||
gradient_accumulation_steps: 16
|
||||
gradient_checkpointing: true
|
||||
gradient_checkpointing_kwargs:
|
||||
use_reentrant: false
|
||||
hub_model_id: open-r1/R1-Zero-Qwen-7B-Math
|
||||
hub_model_revision: v06.00
|
||||
hub_strategy: every_save
|
||||
learning_rate: 1.0e-06
|
||||
log_completions: true
|
||||
log_level: info
|
||||
logging_first_step: true
|
||||
logging_steps: 1
|
||||
logging_strategy: steps
|
||||
lr_scheduler_type: constant_with_warmup
|
||||
mask_truncated_completions: true
|
||||
max_grad_norm: 0.2
|
||||
max_prompt_length: 1024
|
||||
max_completion_length: 8192
|
||||
max_steps: -1
|
||||
num_generations: 16
|
||||
num_iterations: 1
|
||||
num_train_epochs: 0.897 # 20k prompts
|
||||
output_dir: data/R1-Zero-Qwen-7B-Math-v06.00
|
||||
overwrite_output_dir: true
|
||||
per_device_train_batch_size: 4
|
||||
push_to_hub: true
|
||||
report_to:
|
||||
- wandb
|
||||
reward_funcs:
|
||||
- accuracy
|
||||
- format
|
||||
- soft_format
|
||||
reward_weights:
|
||||
- 1.0
|
||||
- 0.5
|
||||
- 0.5
|
||||
save_strategy: "steps"
|
||||
save_steps: 0.1
|
||||
save_total_limit: 1
|
||||
sync_ref_model: false
|
||||
seed: 42
|
||||
temperature: 1.0
|
||||
warmup_ratio: 0.1
|
||||
69
recipes/OpenR1-Zero-7B-Math/grpo/config_v07.00.yaml
Normal file
69
recipes/OpenR1-Zero-7B-Math/grpo/config_v07.00.yaml
Normal file
|
|
@ -0,0 +1,69 @@
|
|||
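# Config for 1 + 1 nodes (presumably 1 training node + 1 vLLM server node, since use_vllm is true; confirm)
|
||||
# NOTE(review): do_eval is set twice in this config (both false); remove one of the duplicate keys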
|
||||
# Model arguments
|
||||
model_name_or_path: Qwen/Qwen2.5-7B
|
||||
model_revision: main
|
||||
torch_dtype: bfloat16
|
||||
attn_implementation: flash_attention_2
|
||||
|
||||
# Data training arguments
|
||||
chat_template: "{%- if messages[0]['role'] == 'system' %}\n{{- messages[0]['content'] }}\n{%- else %}\n{{- 'A conversation between User and Assistant. The user asks a question, and the Assistant solves it. The assistant first thinks about the reasoning process in the mind and then provides the user with the answer. The reasoning process and answer are enclosed within <think>...</think> and <answer>...</answer> tags, respectively, i.e., \\n<think>\\nreasoning process here\\n</think>\\n<answer>\\nanswer here\\n</answer>.' }}\n{%- endif %}\n{%- for message in messages %}\n {%- if message['role'] == 'user' %}\n {{- '\\n\\nUser: ' + message['content'].strip() }}\n {%- elif message['role'] == 'system' %}\n {{- message['content'] }}\n {%- elif message['role'] == 'assistant' %}\n {{- '\\n\\nAssistant: ' + message['content'] }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '\\n\\nAssistant: ' }}\n{%- endif %}"
|
||||
dataset_name: open-r1/DAPO-Math-17k-Processed
|
||||
dataset_config: all
|
||||
|
||||
# GRPO trainer config
|
||||
callbacks:
|
||||
- push_to_hub_revision
|
||||
benchmarks:
|
||||
- math_500
|
||||
- aime24
|
||||
beta: 0.0
|
||||
bf16: true
|
||||
do_eval: false
|
||||
eval_strategy: "no"
|
||||
use_vllm: true
|
||||
do_eval: false
|
||||
gradient_accumulation_steps: 16
|
||||
generation_batch_size: 8192
|
||||
gradient_checkpointing: true
|
||||
gradient_checkpointing_kwargs:
|
||||
use_reentrant: false
|
||||
hub_model_id: open-r1/R1-Zero-Qwen-7B-Math
|
||||
hub_model_revision: v07.00
|
||||
hub_strategy: every_save
|
||||
learning_rate: 1.0e-06
|
||||
log_completions: true
|
||||
log_level: info
|
||||
logging_first_step: true
|
||||
logging_steps: 1
|
||||
logging_strategy: steps
|
||||
lr_scheduler_type: constant_with_warmup
|
||||
mask_truncated_completions: true
|
||||
max_grad_norm: 0.2
|
||||
max_prompt_length: 1024
|
||||
max_completion_length: 8192
|
||||
max_steps: -1
|
||||
num_generations: 16
|
||||
num_iterations: 1
|
||||
num_train_epochs: 1
|
||||
output_dir: data/R1-Zero-Qwen-7B-Math-v07.00
|
||||
overwrite_output_dir: true
|
||||
per_device_train_batch_size: 4
|
||||
push_to_hub: true
|
||||
report_to:
|
||||
- wandb
|
||||
reward_funcs:
|
||||
- accuracy
|
||||
- format
|
||||
- soft_format
|
||||
reward_weights:
|
||||
- 1.0
|
||||
- 0.25
|
||||
- 0.25
|
||||
save_strategy: "steps"
|
||||
save_steps: 0.1
|
||||
save_total_limit: 1
|
||||
sync_ref_model: false
|
||||
seed: 42
|
||||
temperature: 1.0
|
||||
warmup_ratio: 0.1
|
||||
epsilon: 0.2
|
||||
69
recipes/OpenR1-Zero-7B-Math/grpo/config_v08.00.yaml
Normal file
69
recipes/OpenR1-Zero-7B-Math/grpo/config_v08.00.yaml
Normal file
|
|
@ -0,0 +1,69 @@
|
|||
# Config for 1 + 1 nodes
|
||||
# Model arguments
|
||||
model_name_or_path: Qwen/Qwen2.5-7B
|
||||
model_revision: main
|
||||
torch_dtype: bfloat16
|
||||
attn_implementation: flash_attention_2
|
||||
|
||||
# Data training arguments
|
||||
chat_template: "{%- if messages[0]['role'] == 'system' %}\n{{- messages[0]['content'] }}\n{%- else %}\n{{- 'A conversation between User and Assistant. The user asks a question, and the Assistant solves it. The assistant first thinks about the reasoning process in the mind and then provides the user with the answer. The reasoning process and answer are enclosed within <think>...</think> and <answer>...</answer> tags, respectively, i.e., \\n<think>\\nreasoning process here\\n</think>\\n<answer>\\nanswer here\\n</answer>.' }}\n{%- endif %}\n{%- for message in messages %}\n {%- if message['role'] == 'user' %}\n {{- '\\n\\nUser: ' + message['content'].strip() }}\n {%- elif message['role'] == 'system' %}\n {{- message['content'] }}\n {%- elif message['role'] == 'assistant' %}\n {{- '\\n\\nAssistant: ' + message['content'] }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '\\n\\nAssistant: ' }}\n{%- endif %}"
|
||||
dataset_name: open-r1/DAPO-Math-17k-Processed
|
||||
dataset_config: all
|
||||
|
||||
# GRPO trainer config
|
||||
callbacks:
|
||||
- push_to_hub_revision
|
||||
benchmarks:
|
||||
- math_500
|
||||
- aime24
|
||||
beta: 0.0
|
||||
bf16: true
|
||||
do_eval: false
|
||||
eval_strategy: "no"
|
||||
use_vllm: true
|
||||
# do_eval: false  # duplicate key (already set above); commented out because YAML mapping keys must be unique
|
||||
gradient_accumulation_steps: 16
|
||||
generation_batch_size: 512
|
||||
gradient_checkpointing: true
|
||||
gradient_checkpointing_kwargs:
|
||||
use_reentrant: false
|
||||
hub_model_id: open-r1/R1-Zero-Qwen-7B-Math
|
||||
hub_model_revision: v08.00
|
||||
hub_strategy: every_save
|
||||
learning_rate: 1.0e-06
|
||||
log_completions: true
|
||||
log_level: info
|
||||
logging_first_step: true
|
||||
logging_steps: 1
|
||||
logging_strategy: steps
|
||||
lr_scheduler_type: constant_with_warmup
|
||||
mask_truncated_completions: true
|
||||
max_grad_norm: 0.2
|
||||
max_prompt_length: 1024
|
||||
max_completion_length: 8192
|
||||
max_steps: -1
|
||||
num_generations: 16
|
||||
num_iterations: 1
|
||||
num_train_epochs: 1
|
||||
output_dir: data/R1-Zero-Qwen-7B-Math-v08.00
|
||||
overwrite_output_dir: true
|
||||
per_device_train_batch_size: 4
|
||||
push_to_hub: true
|
||||
report_to:
|
||||
- wandb
|
||||
reward_funcs:
|
||||
- accuracy
|
||||
- format
|
||||
- soft_format
|
||||
reward_weights:
|
||||
- 1.0
|
||||
- 0.25
|
||||
- 0.25
|
||||
save_strategy: "steps"
|
||||
save_steps: 0.1
|
||||
save_total_limit: 1
|
||||
sync_ref_model: false
|
||||
seed: 42
|
||||
temperature: 1.0
|
||||
warmup_ratio: 0.1
|
||||
epsilon: 0.2
|
||||
|
|
@ -0,0 +1,66 @@
|
|||
# Config for 1 + 1 nodes
|
||||
# Model arguments
|
||||
model_name_or_path: open-r1/R1-Distill-Qwen-Math-7B-Merges
|
||||
model_revision: v00.00-step-000003660_v01.00-step-000002600_weights-0.50-0.50
|
||||
torch_dtype: bfloat16
|
||||
attn_implementation: flash_attention_2
|
||||
|
||||
# Data training arguments
|
||||
# chat_template: "{%- if messages[0]['role'] == 'system' %}\n{{- messages[0]['content'] }}\n{%- else %}\n{{- 'A conversation between User and Assistant. The user asks a question, and the Assistant solves it. The assistant first thinks about the reasoning process in the mind and then provides the user with the answer. The reasoning process and answer are enclosed within <think>...</think> and <answer>...</answer> tags, respectively, i.e., \\n<think>\\nreasoning process here\\n</think>\\n<answer>\\nanswer here\\n</answer>.' }}\n{%- endif %}\n{%- for message in messages %}\n {%- if message['role'] == 'user' %}\n {{- '\\n\\nUser: ' + message['content'].strip() }}\n {%- elif message['role'] == 'system' %}\n {{- message['content'] }}\n {%- elif message['role'] == 'assistant' %}\n {{- '\\n\\nAssistant: ' + message['content'] }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '\\n\\nAssistant: ' }}\n{%- endif %}"
|
||||
dataset_name: open-r1/DAPO-Math-17k-Processed
|
||||
dataset_config: all
|
||||
|
||||
# GRPO trainer config
|
||||
callbacks:
|
||||
- push_to_hub_revision
|
||||
benchmarks:
|
||||
- gpqa
|
||||
- aime24
|
||||
- lcb_v04
|
||||
beta: 0.0
|
||||
bf16: true
|
||||
do_eval: false
|
||||
eval_strategy: "no"
|
||||
use_vllm: true
|
||||
# do_eval: false  # duplicate key (already set above); commented out because YAML mapping keys must be unique
|
||||
gradient_accumulation_steps: 16
|
||||
generation_batch_size: 512
|
||||
gradient_checkpointing: true
|
||||
gradient_checkpointing_kwargs:
|
||||
use_reentrant: false
|
||||
hub_model_id: open-r1/R1-Distill-Qwen-Math-7B-Merges-GRPO
|
||||
hub_model_revision: v00.00
|
||||
hub_strategy: every_save
|
||||
learning_rate: 1.0e-06
|
||||
log_completions: true
|
||||
log_level: info
|
||||
logging_first_step: true
|
||||
logging_steps: 1
|
||||
logging_strategy: steps
|
||||
lr_scheduler_type: constant_with_warmup
|
||||
mask_truncated_completions: true
|
||||
max_grad_norm: 0.2
|
||||
max_prompt_length: 1024
|
||||
max_completion_length: 8192
|
||||
max_steps: -1
|
||||
num_generations: 16
|
||||
num_iterations: 1
|
||||
num_train_epochs: 1
|
||||
output_dir: data/R1-Distill-Qwen-Math-7B-Merges-GRPO-v00.00
|
||||
overwrite_output_dir: true
|
||||
per_device_train_batch_size: 4
|
||||
push_to_hub: true
|
||||
report_to:
|
||||
- wandb
|
||||
reward_funcs:
|
||||
- accuracy
|
||||
reward_weights:
|
||||
- 1.0
|
||||
save_strategy: "steps"
|
||||
save_steps: 0.1
|
||||
save_total_limit: 1
|
||||
sync_ref_model: false
|
||||
seed: 42
|
||||
temperature: 1.0
|
||||
warmup_ratio: 0.0
|
||||
epsilon: 0.2
|
||||
|
|
@ -0,0 +1,67 @@
|
|||
# Config for 1 + 1 nodes
|
||||
# Model arguments
|
||||
model_name_or_path: open-r1/R1-Distill-Qwen-Math-7B-Merges
|
||||
model_revision: v00.00-step-000003660_v01.00-step-000002600_weights-0.50-0.50
|
||||
torch_dtype: bfloat16
|
||||
attn_implementation: flash_attention_2
|
||||
|
||||
# Data training arguments
|
||||
# chat_template: "{%- if messages[0]['role'] == 'system' %}\n{{- messages[0]['content'] }}\n{%- else %}\n{{- 'A conversation between User and Assistant. The user asks a question, and the Assistant solves it. The assistant first thinks about the reasoning process in the mind and then provides the user with the answer. The reasoning process and answer are enclosed within <think>...</think> and <answer>...</answer> tags, respectively, i.e., \\n<think>\\nreasoning process here\\n</think>\\n<answer>\\nanswer here\\n</answer>.' }}\n{%- endif %}\n{%- for message in messages %}\n {%- if message['role'] == 'user' %}\n {{- '\\n\\nUser: ' + message['content'].strip() }}\n {%- elif message['role'] == 'system' %}\n {{- message['content'] }}\n {%- elif message['role'] == 'assistant' %}\n {{- '\\n\\nAssistant: ' + message['content'] }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '\\n\\nAssistant: ' }}\n{%- endif %}"
|
||||
dataset_name: open-r1/verifiable-coding-problems-python_decontaminated-tested-shuffled
|
||||
dataset_prompt_column: problem
|
||||
|
||||
# GRPO trainer config
|
||||
callbacks:
|
||||
- push_to_hub_revision
|
||||
benchmarks:
|
||||
- gpqa
|
||||
- aime24
|
||||
- lcb_v04
|
||||
beta: 0.0
|
||||
bf16: true
|
||||
do_eval: false
|
||||
eval_strategy: "no"
|
||||
use_vllm: true
|
||||
# do_eval: false  # duplicate key (already set above); commented out because YAML mapping keys must be unique
|
||||
gradient_accumulation_steps: 16
|
||||
generation_batch_size: 512
|
||||
gradient_checkpointing: true
|
||||
gradient_checkpointing_kwargs:
|
||||
use_reentrant: false
|
||||
hub_model_id: open-r1/R1-Distill-Qwen-Math-7B-Merges-GRPO
|
||||
hub_model_revision: v01.00
|
||||
hub_strategy: every_save
|
||||
learning_rate: 1.0e-06
|
||||
log_completions: true
|
||||
log_level: info
|
||||
logging_first_step: true
|
||||
logging_steps: 1
|
||||
logging_strategy: steps
|
||||
lr_scheduler_type: constant_with_warmup
|
||||
mask_truncated_completions: true
|
||||
max_grad_norm: 0.2
|
||||
max_prompt_length: 1024
|
||||
max_completion_length: 8192
|
||||
max_steps: -1
|
||||
num_generations: 16
|
||||
num_iterations: 1
|
||||
num_train_epochs: 1
|
||||
output_dir: data/R1-Distill-Qwen-Math-7B-Merges-GRPO-v01.00
|
||||
overwrite_output_dir: true
|
||||
per_device_train_batch_size: 4
|
||||
push_to_hub: true
|
||||
report_to:
|
||||
- wandb
|
||||
reward_funcs:
|
||||
- weighed_binary_code_reward
|
||||
reward_weights:
|
||||
- 1.0
|
||||
e2b_router_url: ip-10-53-85-92:8000
|
||||
save_strategy: "steps"
|
||||
save_steps: 0.1
|
||||
save_total_limit: 1
|
||||
sync_ref_model: false
|
||||
seed: 42
|
||||
temperature: 1.0
|
||||
warmup_ratio: 0.0
|
||||
epsilon: 0.2
|
||||
|
|
@ -0,0 +1,66 @@
|
|||
# Config for 1 + 1 nodes
|
||||
# Model arguments
|
||||
model_name_or_path: open-r1/R1-Distill-Qwen-Math-7B-Merges
|
||||
model_revision: v00.02-step-000003660_v01.02-step-000002600_weights-0.5-0.5
|
||||
torch_dtype: bfloat16
|
||||
attn_implementation: flash_attention_2
|
||||
|
||||
# Data training arguments
|
||||
# chat_template: "{%- if messages[0]['role'] == 'system' %}\n{{- messages[0]['content'] }}\n{%- else %}\n{{- 'A conversation between User and Assistant. The user asks a question, and the Assistant solves it. The assistant first thinks about the reasoning process in the mind and then provides the user with the answer. The reasoning process and answer are enclosed within <think>...</think> and <answer>...</answer> tags, respectively, i.e., \\n<think>\\nreasoning process here\\n</think>\\n<answer>\\nanswer here\\n</answer>.' }}\n{%- endif %}\n{%- for message in messages %}\n {%- if message['role'] == 'user' %}\n {{- '\\n\\nUser: ' + message['content'].strip() }}\n {%- elif message['role'] == 'system' %}\n {{- message['content'] }}\n {%- elif message['role'] == 'assistant' %}\n {{- '\\n\\nAssistant: ' + message['content'] }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '\\n\\nAssistant: ' }}\n{%- endif %}"
|
||||
dataset_name: open-r1/DAPO-Math-17k-Processed
|
||||
dataset_config: all
|
||||
|
||||
# GRPO trainer config
|
||||
callbacks:
|
||||
- push_to_hub_revision
|
||||
benchmarks:
|
||||
- gpqa
|
||||
- aime24
|
||||
- lcb_v04
|
||||
beta: 0.0
|
||||
bf16: true
|
||||
do_eval: false
|
||||
eval_strategy: "no"
|
||||
use_vllm: true
|
||||
# do_eval: false  # duplicate key (already set above); commented out because YAML mapping keys must be unique
|
||||
gradient_accumulation_steps: 32
|
||||
generation_batch_size: 512
|
||||
gradient_checkpointing: true
|
||||
gradient_checkpointing_kwargs:
|
||||
use_reentrant: false
|
||||
hub_model_id: open-r1/R1-Distill-Qwen-Math-7B-Merges-GRPO
|
||||
hub_model_revision: v02.00
|
||||
hub_strategy: every_save
|
||||
learning_rate: 1.0e-06
|
||||
log_completions: true
|
||||
log_level: info
|
||||
logging_first_step: true
|
||||
logging_steps: 1
|
||||
logging_strategy: steps
|
||||
lr_scheduler_type: constant_with_warmup
|
||||
mask_truncated_completions: true
|
||||
max_grad_norm: 0.2
|
||||
max_prompt_length: 1024
|
||||
max_completion_length: 16384
|
||||
max_steps: -1
|
||||
num_generations: 16
|
||||
num_iterations: 1
|
||||
num_train_epochs: 1
|
||||
output_dir: data/R1-Distill-Qwen-Math-7B-Merges-GRPO-v02.00
|
||||
overwrite_output_dir: true
|
||||
per_device_train_batch_size: 2
|
||||
push_to_hub: true
|
||||
report_to:
|
||||
- wandb
|
||||
reward_funcs:
|
||||
- accuracy
|
||||
reward_weights:
|
||||
- 1.0
|
||||
save_strategy: "steps"
|
||||
save_steps: 0.1
|
||||
save_total_limit: 1
|
||||
sync_ref_model: false
|
||||
seed: 42
|
||||
temperature: 1.0
|
||||
warmup_ratio: 0.0
|
||||
epsilon: 0.2
|
||||
|
|
@ -0,0 +1,70 @@
|
|||
# Config for 1 + 1 nodes
|
||||
# Model arguments
|
||||
model_name_or_path: open-r1/R1-Distill-Qwen-Math-7B-Merges
|
||||
model_revision: v00.02-step-000003660_v01.02-step-000002600_weights-0.5-0.5
|
||||
torch_dtype: bfloat16
|
||||
attn_implementation: flash_attention_2
|
||||
|
||||
# Data training arguments
|
||||
# chat_template: "{%- if messages[0]['role'] == 'system' %}\n{{- messages[0]['content'] }}\n{%- else %}\n{{- 'A conversation between User and Assistant. The user asks a question, and the Assistant solves it. The assistant first thinks about the reasoning process in the mind and then provides the user with the answer. The reasoning process and answer are enclosed within <think>...</think> and <answer>...</answer> tags, respectively, i.e., \\n<think>\\nreasoning process here\\n</think>\\n<answer>\\nanswer here\\n</answer>.' }}\n{%- endif %}\n{%- for message in messages %}\n {%- if message['role'] == 'user' %}\n {{- '\\n\\nUser: ' + message['content'].strip() }}\n {%- elif message['role'] == 'system' %}\n {{- message['content'] }}\n {%- elif message['role'] == 'assistant' %}\n {{- '\\n\\nAssistant: ' + message['content'] }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '\\n\\nAssistant: ' }}\n{%- endif %}"
|
||||
dataset_name: open-r1/DAPO-Math-17k-Processed
|
||||
dataset_config: all
|
||||
|
||||
# GRPO trainer config
|
||||
callbacks:
|
||||
- push_to_hub_revision
|
||||
benchmarks:
|
||||
- gpqa
|
||||
- aime24
|
||||
- lcb_v04
|
||||
beta: 0.0
|
||||
bf16: true
|
||||
do_eval: false
|
||||
eval_strategy: "no"
|
||||
use_vllm: true
|
||||
# do_eval: false  # duplicate key (already set above); commented out because YAML mapping keys must be unique
|
||||
gradient_accumulation_steps: 32
|
||||
generation_batch_size: 512
|
||||
gradient_checkpointing: true
|
||||
gradient_checkpointing_kwargs:
|
||||
use_reentrant: false
|
||||
hub_model_id: open-r1/R1-Distill-Qwen-Math-7B-Merges-GRPO
|
||||
hub_model_revision: v03.00
|
||||
hub_strategy: every_save
|
||||
learning_rate: 1.0e-06
|
||||
log_completions: true
|
||||
log_level: info
|
||||
logging_first_step: true
|
||||
logging_steps: 1
|
||||
logging_strategy: steps
|
||||
lr_scheduler_type: constant_with_warmup
|
||||
mask_truncated_completions: true
|
||||
max_grad_norm: 0.2
|
||||
max_prompt_length: 1024
|
||||
max_completion_length: 16384
|
||||
max_steps: -1
|
||||
num_generations: 16
|
||||
num_iterations: 1
|
||||
num_train_epochs: 1
|
||||
output_dir: data/R1-Distill-Qwen-Math-7B-Merges-GRPO-v03.00
|
||||
overwrite_output_dir: true
|
||||
per_device_train_batch_size: 2
|
||||
push_to_hub: true
|
||||
report_to:
|
||||
- wandb
|
||||
reward_funcs:
|
||||
- accuracy
|
||||
- soft_overlong_punishment
|
||||
reward_weights:
|
||||
- 1.0
|
||||
- 1.0
|
||||
save_strategy: "steps"
|
||||
save_steps: 0.1
|
||||
save_total_limit: 1
|
||||
sync_ref_model: false
|
||||
seed: 42
|
||||
temperature: 1.0
|
||||
warmup_ratio: 0.0
|
||||
epsilon: 0.2
|
||||
max_completion_len: 16384
|
||||
soft_punish_cache: 4196
|
||||
|
|
@ -0,0 +1,70 @@
|
|||
# Config for 1 + 1 nodes
|
||||
# Model arguments
|
||||
model_name_or_path: open-r1/R1-Distill-Qwen-Math-7B-Merges
|
||||
model_revision: v00.02-step-000003660_v01.02-step-000002600_weights-0.5-0.5
|
||||
torch_dtype: bfloat16
|
||||
attn_implementation: flash_attention_2
|
||||
|
||||
# Data training arguments
|
||||
# chat_template: "{%- if messages[0]['role'] == 'system' %}\n{{- messages[0]['content'] }}\n{%- else %}\n{{- 'A conversation between User and Assistant. The user asks a question, and the Assistant solves it. The assistant first thinks about the reasoning process in the mind and then provides the user with the answer. The reasoning process and answer are enclosed within <think>...</think> and <answer>...</answer> tags, respectively, i.e., \\n<think>\\nreasoning process here\\n</think>\\n<answer>\\nanswer here\\n</answer>.' }}\n{%- endif %}\n{%- for message in messages %}\n {%- if message['role'] == 'user' %}\n {{- '\\n\\nUser: ' + message['content'].strip() }}\n {%- elif message['role'] == 'system' %}\n {{- message['content'] }}\n {%- elif message['role'] == 'assistant' %}\n {{- '\\n\\nAssistant: ' + message['content'] }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '\\n\\nAssistant: ' }}\n{%- endif %}"
|
||||
dataset_name: open-r1/DAPO-Math-17k-Processed-R1-Distill-Qwen-Math-7B-Merges-v00.01-filtering
|
||||
dataset_config: default
|
||||
|
||||
# GRPO trainer config
|
||||
callbacks:
|
||||
- push_to_hub_revision
|
||||
benchmarks:
|
||||
- gpqa
|
||||
- aime24
|
||||
- lcb_v04
|
||||
beta: 0.0
|
||||
bf16: true
|
||||
do_eval: false
|
||||
eval_strategy: "no"
|
||||
use_vllm: true
|
||||
# do_eval: false  # duplicate key (already set above); commented out because YAML mapping keys must be unique
|
||||
gradient_accumulation_steps: 32
|
||||
generation_batch_size: 512
|
||||
gradient_checkpointing: true
|
||||
gradient_checkpointing_kwargs:
|
||||
use_reentrant: false
|
||||
hub_model_id: open-r1/R1-Distill-Qwen-Math-7B-Merges-GRPO
|
||||
hub_model_revision: v04.00
|
||||
hub_strategy: every_save
|
||||
learning_rate: 1.0e-06
|
||||
log_completions: true
|
||||
log_level: info
|
||||
logging_first_step: true
|
||||
logging_steps: 1
|
||||
logging_strategy: steps
|
||||
lr_scheduler_type: constant_with_warmup
|
||||
mask_truncated_completions: true
|
||||
max_grad_norm: 0.2
|
||||
max_prompt_length: 1024
|
||||
max_completion_length: 20480
|
||||
max_steps: -1
|
||||
num_generations: 16
|
||||
num_iterations: 1
|
||||
num_train_epochs: 1
|
||||
output_dir: data/R1-Distill-Qwen-Math-7B-Merges-GRPO-v04.00
|
||||
overwrite_output_dir: true
|
||||
per_device_train_batch_size: 2
|
||||
push_to_hub: true
|
||||
report_to:
|
||||
- wandb
|
||||
reward_funcs:
|
||||
- accuracy
|
||||
reward_weights:
|
||||
- 1.0
|
||||
save_strategy: "steps"
|
||||
save_steps: 0.1
|
||||
save_total_limit: 1
|
||||
sync_ref_model: false
|
||||
seed: 42
|
||||
temperature: 1.0
|
||||
warmup_ratio: 0.0
|
||||
epsilon: 0.2
|
||||
max_completion_len: 20480
|
||||
soft_punish_cache: 4196
|
||||
use_liger_loss: false
|
||||
use_liger_kernel: true
|
||||
|
|
@ -0,0 +1,72 @@
|
|||
# Config for 1 + 1 nodes
|
||||
# Model arguments
|
||||
model_name_or_path: open-r1/R1-Distill-Qwen-Math-7B-Merges
|
||||
model_revision: v00.02-step-000003660_v01.02-step-000002600_weights-0.5-0.5
|
||||
torch_dtype: bfloat16
|
||||
attn_implementation: flash_attention_2
|
||||
|
||||
# Data training arguments
|
||||
# chat_template: "{%- if messages[0]['role'] == 'system' %}\n{{- messages[0]['content'] }}\n{%- else %}\n{{- 'A conversation between User and Assistant. The user asks a question, and the Assistant solves it. The assistant first thinks about the reasoning process in the mind and then provides the user with the answer. The reasoning process and answer are enclosed within <think>...</think> and <answer>...</answer> tags, respectively, i.e., \\n<think>\\nreasoning process here\\n</think>\\n<answer>\\nanswer here\\n</answer>.' }}\n{%- endif %}\n{%- for message in messages %}\n {%- if message['role'] == 'user' %}\n {{- '\\n\\nUser: ' + message['content'].strip() }}\n {%- elif message['role'] == 'system' %}\n {{- message['content'] }}\n {%- elif message['role'] == 'assistant' %}\n {{- '\\n\\nAssistant: ' + message['content'] }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '\\n\\nAssistant: ' }}\n{%- endif %}"
|
||||
dataset_name: open-r1/DAPO-Math-17k-Processed-R1-Distill-Qwen-Math-7B-Merges-v00.01-filtering
|
||||
dataset_config: default
|
||||
|
||||
# GRPO trainer config
|
||||
callbacks:
|
||||
- push_to_hub_revision
|
||||
benchmarks:
|
||||
- gpqa
|
||||
- aime24
|
||||
- lcb_v04
|
||||
beta: 0.0
|
||||
bf16: true
|
||||
do_eval: false
|
||||
eval_strategy: "no"
|
||||
use_vllm: true
|
||||
# do_eval: false  # duplicate key (already set above); commented out because YAML mapping keys must be unique
|
||||
gradient_accumulation_steps: 32
|
||||
generation_batch_size: 512
|
||||
gradient_checkpointing: true
|
||||
gradient_checkpointing_kwargs:
|
||||
use_reentrant: false
|
||||
hub_model_id: open-r1/R1-Distill-Qwen-Math-7B-Merges-GRPO
|
||||
hub_model_revision: v05.00
|
||||
hub_strategy: every_save
|
||||
learning_rate: 1.0e-06
|
||||
log_completions: true
|
||||
log_level: info
|
||||
logging_first_step: true
|
||||
logging_steps: 1
|
||||
logging_strategy: steps
|
||||
lr_scheduler_type: constant_with_warmup
|
||||
mask_truncated_completions: true
|
||||
max_grad_norm: 0.2
|
||||
max_prompt_length: 1024
|
||||
max_completion_length: 20480
|
||||
max_steps: -1
|
||||
num_generations: 16
|
||||
num_iterations: 1
|
||||
num_train_epochs: 1
|
||||
output_dir: data/R1-Distill-Qwen-Math-7B-Merges-GRPO-v05.00
|
||||
overwrite_output_dir: true
|
||||
per_device_train_batch_size: 2
|
||||
push_to_hub: true
|
||||
report_to:
|
||||
- wandb
|
||||
reward_funcs:
|
||||
- accuracy
|
||||
- soft_overlong_punishment
|
||||
reward_weights:
|
||||
- 1.0
|
||||
- 1.0
|
||||
save_strategy: "steps"
|
||||
save_steps: 0.1
|
||||
save_total_limit: 1
|
||||
sync_ref_model: false
|
||||
seed: 42
|
||||
temperature: 1.0
|
||||
warmup_ratio: 0.0
|
||||
epsilon: 0.2
|
||||
max_completion_len: 20480
|
||||
soft_punish_cache: 4196
|
||||
use_liger_loss: false
|
||||
use_liger_kernel: true
|
||||
|
|
@ -0,0 +1,70 @@
|
|||
# Config for 1 + 1 nodes
|
||||
# Model arguments
|
||||
model_name_or_path: open-r1/R1-Distill-Qwen-Math-7B-Merges
|
||||
model_revision: v00.02-step-000003660_v01.02-step-000002600_weights-0.3-0.7
|
||||
torch_dtype: bfloat16
|
||||
attn_implementation: flash_attention_2
|
||||
|
||||
# Data training arguments
|
||||
# chat_template: "{%- if messages[0]['role'] == 'system' %}\n{{- messages[0]['content'] }}\n{%- else %}\n{{- 'A conversation between User and Assistant. The user asks a question, and the Assistant solves it. The assistant first thinks about the reasoning process in the mind and then provides the user with the answer. The reasoning process and answer are enclosed within <think>...</think> and <answer>...</answer> tags, respectively, i.e., \\n<think>\\nreasoning process here\\n</think>\\n<answer>\\nanswer here\\n</answer>.' }}\n{%- endif %}\n{%- for message in messages %}\n {%- if message['role'] == 'user' %}\n {{- '\\n\\nUser: ' + message['content'].strip() }}\n {%- elif message['role'] == 'system' %}\n {{- message['content'] }}\n {%- elif message['role'] == 'assistant' %}\n {{- '\\n\\nAssistant: ' + message['content'] }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '\\n\\nAssistant: ' }}\n{%- endif %}"
|
||||
dataset_name: open-r1/DAPO-Math-17k-Processed-R1-Distill-Qwen-Math-7B-Merges-v00.02-v01.02-filtering
|
||||
dataset_config: default
|
||||
|
||||
# GRPO trainer config
|
||||
callbacks:
|
||||
- push_to_hub_revision
|
||||
benchmarks:
|
||||
- gpqa
|
||||
- aime24
|
||||
- lcb_v04
|
||||
beta: 0.0
|
||||
bf16: true
|
||||
do_eval: false
|
||||
eval_strategy: "no"
|
||||
use_vllm: true
|
||||
# do_eval: false  # duplicate key (already set above); commented out because YAML mapping keys must be unique
|
||||
gradient_accumulation_steps: 64
|
||||
generation_batch_size: 512
|
||||
gradient_checkpointing: true
|
||||
gradient_checkpointing_kwargs:
|
||||
use_reentrant: false
|
||||
hub_model_id: open-r1/R1-Distill-Qwen-Math-7B-Merges-GRPO
|
||||
hub_model_revision: v06.00
|
||||
hub_strategy: every_save
|
||||
learning_rate: 1.0e-06
|
||||
log_completions: true
|
||||
log_level: info
|
||||
logging_first_step: true
|
||||
logging_steps: 1
|
||||
logging_strategy: steps
|
||||
lr_scheduler_type: constant_with_warmup
|
||||
mask_truncated_completions: true
|
||||
max_grad_norm: 0.2
|
||||
max_prompt_length: 1024
|
||||
max_completion_length: 20480
|
||||
max_steps: -1
|
||||
num_generations: 16
|
||||
num_iterations: 1
|
||||
num_train_epochs: 1
|
||||
output_dir: data/R1-Distill-Qwen-Math-7B-Merges-GRPO-v06.00
|
||||
overwrite_output_dir: true
|
||||
per_device_train_batch_size: 1
|
||||
push_to_hub: true
|
||||
report_to:
|
||||
- wandb
|
||||
reward_funcs:
|
||||
- accuracy
|
||||
reward_weights:
|
||||
- 1.0
|
||||
save_strategy: "steps"
|
||||
save_steps: 0.1
|
||||
save_total_limit: 1
|
||||
sync_ref_model: false
|
||||
seed: 42
|
||||
temperature: 1.0
|
||||
warmup_ratio: 0.0
|
||||
epsilon: 0.2
|
||||
max_completion_len: 20480
|
||||
soft_punish_cache: 4196
|
||||
use_liger_loss: false
|
||||
use_liger_kernel: true
|
||||
|
|
@ -0,0 +1,72 @@
|
|||
# Config for 1 + 1 nodes
|
||||
# Model arguments
|
||||
model_name_or_path: open-r1/R1-Distill-Qwen-Math-7B-Merges
|
||||
model_revision: v00.02-step-000003660_v01.02-step-000002600_weights-0.3-0.7
|
||||
torch_dtype: bfloat16
|
||||
attn_implementation: flash_attention_2
|
||||
|
||||
# Data training arguments
|
||||
# chat_template: "{%- if messages[0]['role'] == 'system' %}\n{{- messages[0]['content'] }}\n{%- else %}\n{{- 'A conversation between User and Assistant. The user asks a question, and the Assistant solves it. The assistant first thinks about the reasoning process in the mind and then provides the user with the answer. The reasoning process and answer are enclosed within <think>...</think> and <answer>...</answer> tags, respectively, i.e., \\n<think>\\nreasoning process here\\n</think>\\n<answer>\\nanswer here\\n</answer>.' }}\n{%- endif %}\n{%- for message in messages %}\n {%- if message['role'] == 'user' %}\n {{- '\\n\\nUser: ' + message['content'].strip() }}\n {%- elif message['role'] == 'system' %}\n {{- message['content'] }}\n {%- elif message['role'] == 'assistant' %}\n {{- '\\n\\nAssistant: ' + message['content'] }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '\\n\\nAssistant: ' }}\n{%- endif %}"
|
||||
dataset_name: open-r1/DAPO-Math-17k-Processed-R1-Distill-Qwen-Math-7B-Merges-v00.02-v01.02-filtering
|
||||
dataset_config: default
|
||||
|
||||
# GRPO trainer config
|
||||
callbacks:
|
||||
- push_to_hub_revision
|
||||
benchmarks:
|
||||
- gpqa
|
||||
- aime24
|
||||
- lcb_v04
|
||||
beta: 0.0
|
||||
bf16: true
|
||||
do_eval: false
|
||||
eval_strategy: "no"
|
||||
use_vllm: true
|
||||
# do_eval: false  # duplicate key (already set above); commented out because YAML mapping keys must be unique
|
||||
gradient_accumulation_steps: 64
|
||||
generation_batch_size: 512
|
||||
gradient_checkpointing: true
|
||||
gradient_checkpointing_kwargs:
|
||||
use_reentrant: false
|
||||
hub_model_id: open-r1/R1-Distill-Qwen-Math-7B-Merges-GRPO
|
||||
hub_model_revision: v07.00
|
||||
hub_strategy: every_save
|
||||
learning_rate: 1.0e-06
|
||||
log_completions: true
|
||||
log_level: info
|
||||
logging_first_step: true
|
||||
logging_steps: 1
|
||||
logging_strategy: steps
|
||||
lr_scheduler_type: constant_with_warmup
|
||||
mask_truncated_completions: true
|
||||
max_grad_norm: 0.2
|
||||
max_prompt_length: 1024
|
||||
max_completion_length: 20480
|
||||
max_steps: -1
|
||||
num_generations: 16
|
||||
num_iterations: 1
|
||||
num_train_epochs: 1
|
||||
output_dir: data/R1-Distill-Qwen-Math-7B-Merges-GRPO-v07.00
|
||||
overwrite_output_dir: true
|
||||
per_device_train_batch_size: 1
|
||||
push_to_hub: true
|
||||
report_to:
|
||||
- wandb
|
||||
reward_funcs:
|
||||
- accuracy
|
||||
- soft_overlong_punishment
|
||||
reward_weights:
|
||||
- 1.0
|
||||
- 1.0
|
||||
save_strategy: "steps"
|
||||
save_steps: 0.1
|
||||
save_total_limit: 1
|
||||
sync_ref_model: false
|
||||
seed: 42
|
||||
temperature: 1.0
|
||||
warmup_ratio: 0.0
|
||||
epsilon: 0.2
|
||||
max_completion_len: 20480
|
||||
soft_punish_cache: 4196
|
||||
use_liger_loss: false
|
||||
use_liger_kernel: true
|
||||
|
|
@ -0,0 +1,70 @@
|
|||
# Config for 1 + 1 nodes
|
||||
# Model arguments
|
||||
model_name_or_path: open-r1/R1-Distill-Qwen-Math-7B-Merges
|
||||
model_revision: v00.02-step-000003660_v01.02-step-000002600_weights-0.3-0.7
|
||||
torch_dtype: bfloat16
|
||||
attn_implementation: flash_attention_2
|
||||
|
||||
# Data training arguments
|
||||
# chat_template: "{%- if messages[0]['role'] == 'system' %}\n{{- messages[0]['content'] }}\n{%- else %}\n{{- 'A conversation between User and Assistant. The user asks a question, and the Assistant solves it. The assistant first thinks about the reasoning process in the mind and then provides the user with the answer. The reasoning process and answer are enclosed within <think>...</think> and <answer>...</answer> tags, respectively, i.e., \\n<think>\\nreasoning process here\\n</think>\\n<answer>\\nanswer here\\n</answer>.' }}\n{%- endif %}\n{%- for message in messages %}\n {%- if message['role'] == 'user' %}\n {{- '\\n\\nUser: ' + message['content'].strip() }}\n {%- elif message['role'] == 'system' %}\n {{- message['content'] }}\n {%- elif message['role'] == 'assistant' %}\n {{- '\\n\\nAssistant: ' + message['content'] }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '\\n\\nAssistant: ' }}\n{%- endif %}"
|
||||
dataset_name: open-r1/DAPO-Math-17k-Processed-R1-Distill-Qwen-Math-7B-Merges-v00.02-v01.02-0.3-0.7-filter
|
||||
dataset_config: default
|
||||
|
||||
# GRPO trainer config
|
||||
callbacks:
|
||||
- push_to_hub_revision
|
||||
benchmarks:
|
||||
- gpqa
|
||||
- aime24
|
||||
- lcb_v04
|
||||
beta: 0.0
|
||||
bf16: true
|
||||
do_eval: false
|
||||
eval_strategy: "no"
|
||||
use_vllm: true
|
||||
# do_eval: false  # duplicate key (already set above); commented out because YAML mapping keys must be unique
|
||||
gradient_accumulation_steps: 64
|
||||
generation_batch_size: 512
|
||||
gradient_checkpointing: true
|
||||
gradient_checkpointing_kwargs:
|
||||
use_reentrant: false
|
||||
hub_model_id: open-r1/R1-Distill-Qwen-Math-7B-Merges-GRPO
|
||||
hub_model_revision: v08.00
|
||||
hub_strategy: every_save
|
||||
learning_rate: 1.0e-06
|
||||
log_completions: true
|
||||
log_level: info
|
||||
logging_first_step: true
|
||||
logging_steps: 1
|
||||
logging_strategy: steps
|
||||
lr_scheduler_type: constant_with_warmup
|
||||
mask_truncated_completions: true
|
||||
max_grad_norm: 0.2
|
||||
max_prompt_length: 1024
|
||||
max_completion_length: 32768
|
||||
max_steps: -1
|
||||
num_generations: 16
|
||||
num_iterations: 1
|
||||
num_train_epochs: 1
|
||||
output_dir: data/R1-Distill-Qwen-Math-7B-Merges-GRPO-v08.00
|
||||
overwrite_output_dir: true
|
||||
per_device_train_batch_size: 1
|
||||
push_to_hub: true
|
||||
report_to:
|
||||
- wandb
|
||||
reward_funcs:
|
||||
- accuracy
|
||||
reward_weights:
|
||||
- 1.0
|
||||
save_strategy: "steps"
|
||||
save_steps: 0.1
|
||||
save_total_limit: 1
|
||||
sync_ref_model: false
|
||||
seed: 42
|
||||
temperature: 1.0
|
||||
warmup_ratio: 0.0
|
||||
epsilon: 0.2
|
||||
max_completion_len: 32768
|
||||
soft_punish_cache: 4196
|
||||
use_liger_loss: false
|
||||
use_liger_kernel: true
|
||||
|
|
@ -0,0 +1,72 @@
|
|||
# Config for 1 + 1 nodes
|
||||
# Model arguments
|
||||
model_name_or_path: open-r1/R1-Distill-Qwen-Math-7B-Merges
|
||||
model_revision: v00.02-step-000003660_v01.02-step-000002600_weights-0.3-0.7
|
||||
torch_dtype: bfloat16
|
||||
attn_implementation: flash_attention_2
|
||||
|
||||
# Data training arguments
|
||||
# chat_template: "{%- if messages[0]['role'] == 'system' %}\n{{- messages[0]['content'] }}\n{%- else %}\n{{- 'A conversation between User and Assistant. The user asks a question, and the Assistant solves it. The assistant first thinks about the reasoning process in the mind and then provides the user with the answer. The reasoning process and answer are enclosed within <think>...</think> and <answer>...</answer> tags, respectively, i.e., \\n<think>\\nreasoning process here\\n</think>\\n<answer>\\nanswer here\\n</answer>.' }}\n{%- endif %}\n{%- for message in messages %}\n {%- if message['role'] == 'user' %}\n {{- '\\n\\nUser: ' + message['content'].strip() }}\n {%- elif message['role'] == 'system' %}\n {{- message['content'] }}\n {%- elif message['role'] == 'assistant' %}\n {{- '\\n\\nAssistant: ' + message['content'] }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '\\n\\nAssistant: ' }}\n{%- endif %}"
|
||||
dataset_name: open-r1/DAPO-Math-17k-Processed-R1-Distill-Qwen-Math-7B-Merges-v00.02-v01.02-0.3-0.7-filter
|
||||
dataset_config: default
|
||||
|
||||
# GRPO trainer config
|
||||
callbacks:
|
||||
- push_to_hub_revision
|
||||
benchmarks:
|
||||
- gpqa
|
||||
- aime24
|
||||
- lcb_v04
|
||||
beta: 0.0
|
||||
bf16: true
|
||||
do_eval: false
|
||||
eval_strategy: "no"
|
||||
use_vllm: true
|
||||
# do_eval: false  # duplicate key (already set above); commented out because YAML mapping keys must be unique
|
||||
gradient_accumulation_steps: 64
|
||||
generation_batch_size: 512
|
||||
gradient_checkpointing: true
|
||||
gradient_checkpointing_kwargs:
|
||||
use_reentrant: false
|
||||
hub_model_id: open-r1/R1-Distill-Qwen-Math-7B-Merges-GRPO
|
||||
hub_model_revision: v09.00
|
||||
hub_strategy: every_save
|
||||
learning_rate: 1.0e-06
|
||||
log_completions: true
|
||||
log_level: info
|
||||
logging_first_step: true
|
||||
logging_steps: 1
|
||||
logging_strategy: steps
|
||||
lr_scheduler_type: constant_with_warmup
|
||||
mask_truncated_completions: true
|
||||
max_grad_norm: 0.2
|
||||
max_prompt_length: 1024
|
||||
max_completion_length: 32768
|
||||
max_steps: -1
|
||||
num_generations: 16
|
||||
num_iterations: 1
|
||||
num_train_epochs: 1
|
||||
output_dir: data/R1-Distill-Qwen-Math-7B-Merges-GRPO-v09.00
|
||||
overwrite_output_dir: true
|
||||
per_device_train_batch_size: 1
|
||||
push_to_hub: true
|
||||
report_to:
|
||||
- wandb
|
||||
reward_funcs:
|
||||
- accuracy
|
||||
- soft_overlong_punishment
|
||||
reward_weights:
|
||||
- 1.0
|
||||
- 1.0
|
||||
save_strategy: "steps"
|
||||
save_steps: 0.1
|
||||
save_total_limit: 1
|
||||
sync_ref_model: false
|
||||
seed: 42
|
||||
temperature: 1.0
|
||||
warmup_ratio: 0.0
|
||||
epsilon: 0.2
|
||||
max_completion_len: 32768
|
||||
soft_punish_cache: 4196
|
||||
use_liger_loss: false
|
||||
use_liger_kernel: true
|
||||
|
|
@ -42,6 +42,7 @@ class GRPOConfig(trl.GRPOConfig):
|
|||
hub_model_revision: Optional[str] = field(
|
||||
default="main", metadata={"help": "The Hub model branch to push the model to."}
|
||||
)
|
||||
num_completions_to_print: int = field(default=0, metadata={"help": "Number of completions to print."})
|
||||
overwrite_hub_revision: bool = field(default=False, metadata={"help": "Whether to overwrite the Hub revision."})
|
||||
push_to_hub_revision: bool = field(default=False, metadata={"help": "Whether to push to a Hub revision/branch."})
|
||||
wandb_entity: Optional[str] = field(
|
||||
|
|
@ -56,6 +57,12 @@ class GRPOConfig(trl.GRPOConfig):
|
|||
default=None,
|
||||
metadata={"help": ("The group to store runs under.")},
|
||||
)
|
||||
wandb_log_unique_prompts: bool = field(
|
||||
default=True,
|
||||
metadata={
|
||||
"help": ("Whether to log the unique prompts to wandb. This will create a new run for each unique prompt.")
|
||||
},
|
||||
)
|
||||
|
||||
|
||||
@dataclass
|
||||
|
|
|
|||
|
|
@ -80,7 +80,7 @@ def accuracy_reward(completions: list[list[dict[str, str]]], solution: list[str]
|
|||
return rewards
|
||||
|
||||
|
||||
def format_reward(completions, **kwargs):
|
||||
def format_reward(completions, **kwargs) -> list[float]:
|
||||
"""Reward function that checks if the reasoning process is enclosed within <think> and </think> tags, while the final answer is enclosed within <answer> and </answer> tags."""
|
||||
pattern = r"^<think>\n.*?\n</think>\n<answer>\n.*?\n</answer>$"
|
||||
completion_contents = [completion[0]["content"] for completion in completions]
|
||||
|
|
@ -88,6 +88,34 @@ def format_reward(completions, **kwargs):
|
|||
return [1.0 if match else 0.0 for match in matches]
|
||||
|
||||
|
||||
def soft_format_reward(completions, **kwargs) -> list[float]:
    """Reward completions that contain one think block followed by one answer block.

    A completion scores 1.0 only if its content contains exactly one
    `<think>...</think>` match and exactly one `<answer>...</answer>` match,
    and the think block ends before the answer block starts. Text before,
    between, or after the two blocks is allowed, as is any whitespace inside
    them. Every other completion scores 0.0.

    Args:
        completions: One entry per completion; the text is read from
            `completion[0]["content"]`.
        **kwargs: Accepted and ignored.

    Returns:
        One float (1.0 or 0.0) per completion, in input order.
    """
    # DOTALL lets the lazy `.*?` span the newlines inside a block.
    think_re = re.compile(r"<think>.*?</think>", re.DOTALL)
    answer_re = re.compile(r"<answer>.*?</answer>", re.DOTALL)

    rewards = []
    for completion in completions:
        content = completion[0]["content"]
        think_spans = [match.span() for match in think_re.finditer(content)]
        answer_spans = [match.span() for match in answer_re.finditer(content)]

        # Compare the END of the think block with the START of the answer block.
        # Comparing start offsets alone would also accept an answer block that is
        # nested inside (or overlaps) the think block.
        well_formed = (
            len(think_spans) == 1
            and len(answer_spans) == 1
            and think_spans[0][1] <= answer_spans[0][0]
        )
        rewards.append(1.0 if well_formed else 0.0)

    return rewards
|
||||
|
||||
|
||||
def tag_count_reward(completions, **kwargs) -> list[float]:
|
||||
"""Reward function that checks if we produce the desired number of think and answer tags associated with `format_reward()`.
|
||||
|
||||
|
|
@ -447,6 +475,21 @@ def binary_code_reward(
|
|||
|
||||
return output
|
||||
|
||||
def weighted_binary_code_reward(
    completions,
    num_parallel: int = 2,
    e2b_router_url=None,
    *,
    binary_threshold: float = 0.99,
    non_binary_weight: float = 0.1,
    **kwargs,
) -> list[float]:
    """Combine a thresholded (binary) code reward with a down-weighted raw code reward.

    Each score returned by `code_reward()` is mapped to
    `(1.0 if score > binary_threshold else 0.0) + non_binary_weight * score`.
    Entries for which `code_reward()` returned `None` are passed through as `None`.

    Args:
        completions: Forwarded unchanged to `code_reward()`.
        num_parallel: Forwarded to `code_reward()`.
        e2b_router_url: Forwarded to `code_reward()`.
        binary_threshold: A raw score must be strictly greater than this value
            to earn the binary bonus of 1.0.
        non_binary_weight: Multiplier applied to the raw score before it is
            added to the binary part.
        **kwargs: Forwarded to `code_reward()`.

    Returns:
        One combined reward (or `None`) per entry returned by `code_reward()`,
        in the same order.
    """
    rewards = code_reward(completions, num_parallel=num_parallel, e2b_router_url=e2b_router_url, **kwargs)

    output = []
    for reward in rewards:
        if reward is None:
            # Keep missing scores as None rather than turning them into a number.
            output.append(None)
        else:
            binary_reward = 1.0 if reward > binary_threshold else 0.0
            output.append(binary_reward + non_binary_weight * reward)

    return output
|
||||
|
||||
def code_reward(
|
||||
completions,
|
||||
|
|
@ -578,6 +621,7 @@ def get_reward_funcs(script_args) -> list[Callable]:
|
|||
REWARD_FUNCS_REGISTRY = {
|
||||
"accuracy": accuracy_reward,
|
||||
"format": format_reward,
|
||||
"soft_format": soft_format_reward,
|
||||
"reasoning_steps": reasoning_steps_reward,
|
||||
"cosine": get_cosine_scaled_reward(
|
||||
min_value_wrong=script_args.cosine_min_value_wrong,
|
||||
|
|
@ -609,6 +653,14 @@ def get_reward_funcs(script_args) -> list[Callable]:
|
|||
),
|
||||
binary_code_reward,
|
||||
),
|
||||
"weighted_binary_code_reward": update_wrapper(
|
||||
partial(
|
||||
weighted_binary_code_reward,
|
||||
num_parallel=script_args.parallel_code_exec_per_proc,
|
||||
e2b_router_url=script_args.e2b_router_url,
|
||||
),
|
||||
weighted_binary_code_reward,
|
||||
),
|
||||
"ioi_code": update_wrapper(
|
||||
partial(
|
||||
ioi_code_reward,
|
||||
|
|
|
|||
|
|
@ -27,6 +27,7 @@ from open_r1.rewards import (
|
|||
get_soft_overlong_punishment,
|
||||
len_reward,
|
||||
reasoning_steps_reward,
|
||||
soft_format_reward,
|
||||
tag_count_reward,
|
||||
)
|
||||
|
||||
|
|
@ -75,28 +76,7 @@ class TestGetRewardFuncs(unittest.TestCase):
|
|||
self.assertEqual(func_name, func.__name__)
|
||||
|
||||
|
||||
class TestRewards(unittest.TestCase):
|
||||
def test_accuracy_reward_correct_answer(self):
|
||||
"""Test accuracy_reward with a correct answer."""
|
||||
completion = [[{"content": r"\boxed{\frac{63}{400}}"}]]
|
||||
solution = [r"\frac{63}{400}"]
|
||||
rewards = accuracy_reward(completion, solution)
|
||||
self.assertEqual(rewards[0], 1.0)
|
||||
|
||||
def test_accuracy_reward_wrong_answer(self):
|
||||
"""Test accuracy_reward with an incorrect answer."""
|
||||
completion = [[{"content": r"\boxed{\frac{64}{400}}"}]]
|
||||
solution = [r"\frac{63}{400}"]
|
||||
rewards = accuracy_reward(completion, solution)
|
||||
self.assertEqual(rewards[0], 0.0)
|
||||
|
||||
def test_accuracy_reward_wrong_answer_no_latex(self):
|
||||
"""Test accuracy_reward with an incorrect answer and gold solution with no latex."""
|
||||
completion = [[{"content": r"\boxed{3}"}]]
|
||||
solution = ["6"]
|
||||
rewards = accuracy_reward(completion, solution)
|
||||
self.assertEqual(rewards[0], 0.0)
|
||||
|
||||
class TestFormatRewards(unittest.TestCase):
|
||||
def test_format_reward_correct(self):
|
||||
"""Test format_reward with correct format."""
|
||||
completion = [[{"content": "<think>\nSome reasoning\n</think>\n<answer>\nThe answer\n</answer>"}]]
|
||||
|
|
@ -118,6 +98,60 @@ class TestRewards(unittest.TestCase):
|
|||
rewards = format_reward(completion)
|
||||
self.assertEqual(rewards[0], 0.0)
|
||||
|
||||
|
||||
class TestSoftFormatReward(unittest.TestCase):
    """Exercises `soft_format_reward` on accepted and rejected think/answer layouts."""

    def _first_reward(self, text):
        """Wrap `text` as a single completion and return the reward assigned to it."""
        return soft_format_reward([[{"content": text}]])[0]

    def test_correct_with_newlines(self):
        """Leading free text plus newline-separated tags is rewarded."""
        text = "Here is my reasoning: <think>\nSome reasoning\n</think>\n<answer>\nThe answer\n</answer>"
        self.assertEqual(self._first_reward(text), 1.0)

    def test_correct_without_newlines(self):
        """Tags with no whitespace between or inside them are rewarded."""
        text = "Here is my reasoning: <think>Some reasoning</think><answer>The answer</answer>"
        self.assertEqual(self._first_reward(text), 1.0)

    def test_correct_with_extra_spaces(self):
        """Spaces inside and between the two blocks are rewarded."""
        text = "Here is my reasoning: <think> Some reasoning </think> <answer> The answer </answer>"
        self.assertEqual(self._first_reward(text), 1.0)

    def test_correct_with_strict_format(self):
        """The strict newline-delimited layout with no leading text is rewarded."""
        text = "<think>\nSome reasoning\n</think>\n<answer>\nThe answer\n</answer>"
        self.assertEqual(self._first_reward(text), 1.0)

    def test_incorrect_with_multiple_reasoning_block(self):
        """Two think blocks and two answer blocks earn no reward."""
        text = (
            "Here is my reasoning: <think> Some reasoning </think> <answer> The answer </answer>"
            " New rambling <think> Some reasoning </think> <answer> The answer </answer>"
        )
        self.assertEqual(self._first_reward(text), 0.0)

    def test_incorrect_with_answer_before_think(self):
        """An answer block placed before the think block earns no reward."""
        text = "<answer>The answer</answer><think>Some reasoning</think>"
        self.assertEqual(self._first_reward(text), 0.0)

    def test_incorrect_missing_think_block(self):
        """A completion without a think block earns no reward."""
        text = "Here is my reasoning: <answer>The answer</answer>"
        self.assertEqual(self._first_reward(text), 0.0)

    def test_incorrect_missing_answer_block(self):
        """A completion without an answer block earns no reward."""
        text = "Here is my reasoning: <think>Some reasoning</think>"
        self.assertEqual(self._first_reward(text), 0.0)
|
||||
|
||||
|
||||
class TestReasoningStepsReward(unittest.TestCase):
|
||||
def test_reasoning_steps_reward(self):
|
||||
"""Test reasoning_steps_reward with various formats."""
|
||||
test_cases = [
|
||||
|
|
@ -136,6 +170,29 @@ class TestRewards(unittest.TestCase):
|
|||
rewards = reasoning_steps_reward(completion)
|
||||
self.assertAlmostEqual(rewards[0], expected_reward)
|
||||
|
||||
|
||||
class TestRewards(unittest.TestCase):
|
||||
def test_accuracy_reward_correct_answer(self):
|
||||
"""Test accuracy_reward with a correct answer."""
|
||||
completion = [[{"content": r"\boxed{\frac{63}{400}}"}]]
|
||||
solution = [r"\frac{63}{400}"]
|
||||
rewards = accuracy_reward(completion, solution)
|
||||
self.assertEqual(rewards[0], 1.0)
|
||||
|
||||
def test_accuracy_reward_wrong_answer(self):
|
||||
"""Test accuracy_reward with an incorrect answer."""
|
||||
completion = [[{"content": r"\boxed{\frac{64}{400}}"}]]
|
||||
solution = [r"\frac{63}{400}"]
|
||||
rewards = accuracy_reward(completion, solution)
|
||||
self.assertEqual(rewards[0], 0.0)
|
||||
|
||||
def test_accuracy_reward_wrong_answer_no_latex(self):
|
||||
"""Test accuracy_reward with an incorrect answer and gold solution with no latex."""
|
||||
completion = [[{"content": r"\boxed{3}"}]]
|
||||
solution = ["6"]
|
||||
rewards = accuracy_reward(completion, solution)
|
||||
self.assertEqual(rewards[0], 0.0)
|
||||
|
||||
def test_multiple_completions(self):
|
||||
"""Test handling multiple completions at once."""
|
||||
completions = [
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue