mirror of
https://github.com/huggingface/open-r1.git
synced 2026-06-24 01:54:06 +00:00
Compare commits
8 commits
main
...
code-grpo-
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
1bab913b88 | ||
|
|
b3c7971ebb | ||
|
|
9b6c9704da | ||
|
|
0662164248 | ||
|
|
7ddc0282cd | ||
|
|
243db805c2 | ||
|
|
3684ab2575 | ||
|
|
98cbed7596 |
33 changed files with 2057 additions and 6 deletions
67
recipes/DeepSeek-R1-Distill-Qwen-7B/grpo/config_v03.00.yaml
Normal file
67
recipes/DeepSeek-R1-Distill-Qwen-7B/grpo/config_v03.00.yaml
Normal file
|
|
@@ -0,0 +1,67 @@
|
|||
# Model arguments
|
||||
model_name_or_path: deepseek-ai/DeepSeek-R1-Distill-Qwen-7B
|
||||
model_revision: main
|
||||
torch_dtype: bfloat16
|
||||
attn_implementation: flash_attention_2
|
||||
# Data training arguments
|
||||
dataset_name: open-r1/verifiable-coding-problems-python_decontaminated-tested
|
||||
dataset_prompt_column: problem
|
||||
|
||||
system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. Respond in the following format: <think>\n...\n</think>\n<answer>\n...\n</answer>"
|
||||
|
||||
# GRPO trainer config
|
||||
callbacks:
|
||||
- push_to_hub_revision
|
||||
benchmarks:
|
||||
- lcb_v4
|
||||
beta: 0.000
|
||||
bf16: true
|
||||
do_eval: false
|
||||
eval_strategy: "no"
|
||||
use_vllm: true
|
||||
vllm_device: auto
|
||||
vllm_gpu_memory_utilization: 0.7
|
||||
do_eval: false
|
||||
gradient_accumulation_steps: 64
|
||||
gradient_checkpointing: true
|
||||
gradient_checkpointing_kwargs:
|
||||
use_reentrant: false
|
||||
hub_model_id: open-r1/DeepSeek-R1-Distill-Qwen-7B-GRPO
|
||||
hub_model_revision: v03.00
|
||||
hub_strategy: every_save
|
||||
learning_rate: 1.0e-06
|
||||
log_completions: true
|
||||
log_level: info
|
||||
logging_first_step: true
|
||||
logging_steps: 1
|
||||
logging_strategy: steps
|
||||
lr_scheduler_type: constant_with_warmup
|
||||
max_grad_norm: 0.2
|
||||
max_prompt_length: 1024
|
||||
max_completion_length: 24000
|
||||
max_steps: -1
|
||||
num_generations: 8
|
||||
num_iterations: 4
|
||||
num_train_epochs: 1.0
|
||||
output_dir: data/DeepSeek-R1-Distill-Qwen-7B_v03.00
|
||||
overwrite_output_dir: true
|
||||
per_device_train_batch_size: 1
|
||||
push_to_hub: true
|
||||
report_to:
|
||||
- wandb
|
||||
reward_funcs:
|
||||
- binary_code
|
||||
- code_format
|
||||
e2b_router_url: "ip-10-53-85-124:8000"
|
||||
reward_weights:
|
||||
- 1.0
|
||||
- 0.1
|
||||
save_strategy: "steps"
|
||||
save_steps: 0.05
|
||||
save_total_limit: 1
|
||||
seed: 42
|
||||
temperature: 0.7
|
||||
wandb_entity: huggingface
|
||||
wandb_project: open-r1
|
||||
warmup_ratio: 0.1
|
||||
parallel_code_exec_per_proc: 10
|
||||
66
recipes/DeepSeek-R1-Distill-Qwen-7B/grpo/config_v04.00.yaml
Normal file
66
recipes/DeepSeek-R1-Distill-Qwen-7B/grpo/config_v04.00.yaml
Normal file
|
|
@@ -0,0 +1,66 @@
|
|||
# Model arguments
|
||||
model_name_or_path: deepseek-ai/DeepSeek-R1-Distill-Qwen-7B
|
||||
model_revision: main
|
||||
torch_dtype: bfloat16
|
||||
attn_implementation: flash_attention_2
|
||||
# Data training arguments
|
||||
dataset_name: open-r1/verifiable-coding-problems-python_decontaminated-tested-shuffled
|
||||
dataset_prompt_column: problem
|
||||
|
||||
# system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. Respond in the following format: <think>\n...\n</think>\n<answer>\n...\n</answer>"
|
||||
|
||||
# GRPO trainer config
|
||||
callbacks:
|
||||
- push_to_hub_revision
|
||||
benchmarks:
|
||||
- lcb_v4
|
||||
beta: 0.000
|
||||
bf16: true
|
||||
do_eval: false
|
||||
eval_strategy: "no"
|
||||
use_vllm: true
|
||||
vllm_device: auto
|
||||
vllm_gpu_memory_utilization: 0.7
|
||||
do_eval: false
|
||||
gradient_accumulation_steps: 128
|
||||
gradient_checkpointing: true
|
||||
gradient_checkpointing_kwargs:
|
||||
use_reentrant: false
|
||||
hub_model_id: open-r1/DeepSeek-R1-Distill-Qwen-7B-GRPO
|
||||
hub_model_revision: v04.00
|
||||
hub_strategy: every_save
|
||||
learning_rate: 1.0e-06
|
||||
log_completions: true
|
||||
log_level: info
|
||||
logging_first_step: true
|
||||
logging_steps: 1
|
||||
logging_strategy: steps
|
||||
lr_scheduler_type: constant_with_warmup
|
||||
max_grad_norm: 0.2
|
||||
max_prompt_length: 1024
|
||||
max_completion_length: 24000
|
||||
max_steps: -1
|
||||
num_generations: 8
|
||||
num_iterations: 1
|
||||
num_train_epochs: 1.0
|
||||
output_dir: data/DeepSeek-R1-Distill-Qwen-7B_v04.00
|
||||
overwrite_output_dir: true
|
||||
per_device_train_batch_size: 1
|
||||
push_to_hub: true
|
||||
report_to:
|
||||
- wandb
|
||||
reward_funcs:
|
||||
- binary_code
|
||||
e2b_router_url: ip-10-53-86-47:8000
|
||||
reward_weights:
|
||||
- 1.0
|
||||
save_strategy: "steps"
|
||||
save_steps: 0.05
|
||||
save_total_limit: 1
|
||||
seed: 42
|
||||
temperature: 0.7
|
||||
wandb_entity: huggingface
|
||||
wandb_project: open-r1
|
||||
warmup_ratio: 0.1
|
||||
mask_truncated_completions: true
|
||||
loss_type: dr_grpo
|
||||
66
recipes/DeepSeek-R1-Distill-Qwen-7B/grpo/config_v05.00.yaml
Normal file
66
recipes/DeepSeek-R1-Distill-Qwen-7B/grpo/config_v05.00.yaml
Normal file
|
|
@@ -0,0 +1,66 @@
|
|||
# Model arguments
|
||||
model_name_or_path: deepseek-ai/DeepSeek-R1-Distill-Qwen-7B
|
||||
model_revision: main
|
||||
torch_dtype: bfloat16
|
||||
attn_implementation: flash_attention_2
|
||||
# Data training arguments
|
||||
dataset_name: open-r1/verifiable-coding-problems-python_decontaminated-tested-shuffled
|
||||
dataset_prompt_column: problem
|
||||
|
||||
# system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. Respond in the following format: <think>\n...\n</think>\n<answer>\n...\n</answer>"
|
||||
|
||||
# GRPO trainer config
|
||||
callbacks:
|
||||
- push_to_hub_revision
|
||||
benchmarks:
|
||||
- lcb_v4
|
||||
beta: 0.000
|
||||
bf16: true
|
||||
do_eval: false
|
||||
eval_strategy: "no"
|
||||
use_vllm: true
|
||||
vllm_device: auto
|
||||
vllm_gpu_memory_utilization: 0.7
|
||||
do_eval: false
|
||||
gradient_accumulation_steps: 16
|
||||
gradient_checkpointing: true
|
||||
gradient_checkpointing_kwargs:
|
||||
use_reentrant: false
|
||||
hub_model_id: open-r1/DeepSeek-R1-Distill-Qwen-7B-GRPO
|
||||
hub_model_revision: v05.00
|
||||
hub_strategy: every_save
|
||||
learning_rate: 1.0e-06
|
||||
log_completions: true
|
||||
log_level: info
|
||||
logging_first_step: true
|
||||
logging_steps: 1
|
||||
logging_strategy: steps
|
||||
lr_scheduler_type: constant_with_warmup
|
||||
max_grad_norm: 0.2
|
||||
max_prompt_length: 1024
|
||||
max_completion_length: 24000
|
||||
max_steps: -1
|
||||
num_generations: 16
|
||||
num_iterations: 1
|
||||
num_train_epochs: 1.0
|
||||
output_dir: data/DeepSeek-R1-Distill-Qwen-7B_v05.00
|
||||
overwrite_output_dir: true
|
||||
per_device_train_batch_size: 1
|
||||
push_to_hub: true
|
||||
report_to:
|
||||
- wandb
|
||||
reward_funcs:
|
||||
- binary_code
|
||||
e2b_router_url: ip-10-53-86-47:8000
|
||||
reward_weights:
|
||||
- 1.0
|
||||
save_strategy: "steps"
|
||||
save_steps: 0.05
|
||||
save_total_limit: 1
|
||||
seed: 42
|
||||
temperature: 0.7
|
||||
wandb_entity: huggingface
|
||||
wandb_project: open-r1
|
||||
warmup_ratio: 0.1
|
||||
mask_truncated_completions: true
|
||||
loss_type: dr_grpo
|
||||
66
recipes/DeepSeek-R1-Distill-Qwen-7B/grpo/config_v06.00.yaml
Normal file
66
recipes/DeepSeek-R1-Distill-Qwen-7B/grpo/config_v06.00.yaml
Normal file
|
|
@@ -0,0 +1,66 @@
|
|||
# Model arguments
|
||||
model_name_or_path: deepseek-ai/DeepSeek-R1-Distill-Qwen-7B
|
||||
model_revision: main
|
||||
torch_dtype: bfloat16
|
||||
attn_implementation: flash_attention_2
|
||||
# Data training arguments
|
||||
dataset_name: open-r1/verifiable-coding-problems-python_decontaminated-tested-shuffled
|
||||
dataset_prompt_column: problem
|
||||
|
||||
# system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. Respond in the following format: <think>\n...\n</think>\n<answer>\n...\n</answer>"
|
||||
|
||||
# GRPO trainer config
|
||||
callbacks:
|
||||
- push_to_hub_revision
|
||||
benchmarks:
|
||||
- lcb_v4
|
||||
beta: 0.000
|
||||
bf16: true
|
||||
do_eval: false
|
||||
eval_strategy: "no"
|
||||
use_vllm: true
|
||||
vllm_device: auto
|
||||
vllm_gpu_memory_utilization: 0.7
|
||||
do_eval: false
|
||||
gradient_accumulation_steps: 16
|
||||
gradient_checkpointing: true
|
||||
gradient_checkpointing_kwargs:
|
||||
use_reentrant: false
|
||||
hub_model_id: open-r1/DeepSeek-R1-Distill-Qwen-7B-GRPO
|
||||
hub_model_revision: v06.00
|
||||
hub_strategy: every_save
|
||||
learning_rate: 1.0e-06
|
||||
log_completions: true
|
||||
log_level: info
|
||||
logging_first_step: true
|
||||
logging_steps: 1
|
||||
logging_strategy: steps
|
||||
lr_scheduler_type: constant_with_warmup
|
||||
max_grad_norm: 0.2
|
||||
max_prompt_length: 1024
|
||||
max_completion_length: 24000
|
||||
max_steps: -1
|
||||
num_generations: 16
|
||||
num_iterations: 1
|
||||
num_train_epochs: 1.0
|
||||
output_dir: data/DeepSeek-R1-Distill-Qwen-7B_v06.00
|
||||
overwrite_output_dir: true
|
||||
per_device_train_batch_size: 1
|
||||
push_to_hub: true
|
||||
report_to:
|
||||
- wandb
|
||||
reward_funcs:
|
||||
- weighted_binary_code_reward
|
||||
e2b_router_url: ip-10-53-86-47:8000
|
||||
reward_weights:
|
||||
- 1.0
|
||||
save_strategy: "steps"
|
||||
save_steps: 0.05
|
||||
save_total_limit: 1
|
||||
seed: 42
|
||||
temperature: 0.7
|
||||
wandb_entity: huggingface
|
||||
wandb_project: open-r1
|
||||
warmup_ratio: 0.1
|
||||
mask_truncated_completions: true
|
||||
loss_type: dr_grpo
|
||||
|
|
@@ -14,7 +14,7 @@ dataset_num_proc: 48
|
|||
bf16: true
|
||||
do_eval: false
|
||||
eval_strategy: 'no'
|
||||
-gradient_accumulation_steps: 8
|
||||
+gradient_accumulation_steps: 2
|
||||
gradient_checkpointing: true
|
||||
gradient_checkpointing_kwargs:
|
||||
use_reentrant: false
|
||||
|
|
@@ -27,20 +27,20 @@ logging_strategy: steps
|
|||
lr_scheduler_type: cosine_with_min_lr
|
||||
lr_scheduler_kwargs:
|
||||
min_lr_rate: 0.1
|
||||
-packing: false
|
||||
+packing: true
|
||||
max_grad_norm: 0.2
|
||||
-max_length: 32768
|
||||
+max_length: 16000
|
||||
max_steps: -1
|
||||
num_train_epochs: 10
|
||||
output_dir: data/OlympicCoder-7B
|
||||
overwrite_output_dir: true
|
||||
per_device_eval_batch_size: 1
|
||||
-per_device_train_batch_size: 2
|
||||
+per_device_train_batch_size: 1
|
||||
push_to_hub: true
|
||||
report_to:
|
||||
- wandb
|
||||
save_strategy: epoch
|
||||
save_total_limit: 1
|
||||
seed: 42
|
||||
-use_liger_kernel: true
|
||||
+use_liger_kernel: false
|
||||
warmup_ratio: 0.03
|
||||
58
recipes/Qwen2.5-7B-Instruct/grpo/config_v01.05.yaml
Normal file
58
recipes/Qwen2.5-7B-Instruct/grpo/config_v01.05.yaml
Normal file
|
|
@@ -0,0 +1,58 @@
|
|||
|
||||
# Model arguments
|
||||
model_name_or_path: Qwen/Qwen2.5-7B-Instruct
|
||||
model_revision: main
|
||||
torch_dtype: bfloat16
|
||||
attn_implementation: flash_attention_2
|
||||
|
||||
# Data training arguments
|
||||
dataset_name: open-r1/OpenR1-Math-cn_k12-86k
|
||||
system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. Respond in the following format: <think>\n...\n</think>\n<answer>\n...\n</answer>"
|
||||
|
||||
# GRPO trainer config
|
||||
beta: 0.001
|
||||
bf16: true
|
||||
do_eval: false
|
||||
eval_strategy: "no"
|
||||
use_vllm: true
|
||||
|
||||
do_eval: false
|
||||
gradient_accumulation_steps: 64
|
||||
gradient_checkpointing: true
|
||||
gradient_checkpointing_kwargs:
|
||||
use_reentrant: false
|
||||
hub_model_id: Qwen2.5-7B-Instruct-GRPO
|
||||
hub_model_revision: v01.05
|
||||
hub_strategy: every_save
|
||||
learning_rate: 1.0e-06
|
||||
log_completions: true
|
||||
log_level: info
|
||||
logging_first_step: true
|
||||
logging_steps: 1
|
||||
logging_strategy: steps
|
||||
lr_scheduler_type: constant_with_warmup
|
||||
max_grad_norm: 0.2
|
||||
max_prompt_length: 1024
|
||||
max_completion_length: 4096
|
||||
max_steps: -1
|
||||
num_generations: 8
|
||||
num_train_epochs: 1
|
||||
output_dir: data/Qwen2.5-7B-Instruct-GRPO_v01.05
|
||||
overwrite_output_dir: true
|
||||
per_device_train_batch_size: 1
|
||||
push_to_hub: true
|
||||
use_liger_kernel: true
|
||||
report_to:
|
||||
- wandb
|
||||
reward_funcs:
|
||||
- accuracy
|
||||
- format
|
||||
reward_weights:
|
||||
- 1.0
|
||||
- 0.2
|
||||
save_strategy: "steps"
|
||||
save_steps: 0.1
|
||||
save_total_limit: 1
|
||||
seed: 42
|
||||
temperature: 0.7
|
||||
warmup_ratio: 0.1
|
||||
60
recipes/Qwen2.5-7B-Instruct/grpo/config_v01.06.yaml
Normal file
60
recipes/Qwen2.5-7B-Instruct/grpo/config_v01.06.yaml
Normal file
|
|
@@ -0,0 +1,60 @@
|
|||
|
||||
# Model arguments
|
||||
model_name_or_path: Qwen/Qwen2.5-7B-Instruct
|
||||
model_revision: main
|
||||
torch_dtype: bfloat16
|
||||
attn_implementation: flash_attention_2
|
||||
|
||||
# Data training arguments
|
||||
dataset_name: open-r1/OpenR1-Math-cn_k12-86k
|
||||
system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. Respond in the following format: <think>\n...\n</think>\n<answer>\n...\n</answer>"
|
||||
|
||||
# GRPO trainer config
|
||||
beta: 0.001
|
||||
bf16: true
|
||||
do_eval: false
|
||||
eval_strategy: "no"
|
||||
use_vllm: true
|
||||
|
||||
do_eval: false
|
||||
gradient_accumulation_steps: 16
|
||||
gradient_checkpointing: true
|
||||
gradient_checkpointing_kwargs:
|
||||
use_reentrant: false
|
||||
hub_model_id: Qwen2.5-7B-Instruct-GRPO
|
||||
hub_model_revision: v01.06
|
||||
hub_strategy: every_save
|
||||
learning_rate: 1.0e-06
|
||||
log_completions: true
|
||||
log_level: info
|
||||
logging_first_step: true
|
||||
logging_steps: 1
|
||||
logging_strategy: steps
|
||||
lr_scheduler_type: constant_with_warmup
|
||||
max_grad_norm: 0.2
|
||||
max_prompt_length: 1024
|
||||
max_completion_length: 4096
|
||||
max_steps: -1
|
||||
num_generations: 16
|
||||
num_train_epochs: 1
|
||||
output_dir: data/Qwen2.5-7B-Instruct-GRPO_v01.06
|
||||
overwrite_output_dir: true
|
||||
per_device_train_batch_size: 4
|
||||
push_to_hub: true
|
||||
use_liger_kernel: true
|
||||
report_to:
|
||||
- wandb
|
||||
reward_funcs:
|
||||
- accuracy
|
||||
- format
|
||||
reward_weights:
|
||||
- 1.0
|
||||
- 0.2
|
||||
save_strategy: "steps"
|
||||
save_steps: 0.1
|
||||
save_total_limit: 1
|
||||
seed: 42
|
||||
temperature: 0.7
|
||||
warmup_ratio: 0.1
|
||||
wandb_entity: huggingface
|
||||
wandb_project: open-r1
|
||||
61
recipes/Qwen2.5-7B-Instruct/grpo/config_v01.07.yaml
Normal file
61
recipes/Qwen2.5-7B-Instruct/grpo/config_v01.07.yaml
Normal file
|
|
@@ -0,0 +1,61 @@
|
|||
|
||||
# Model arguments
|
||||
model_name_or_path: Qwen/Qwen2.5-7B-Instruct
|
||||
model_revision: main
|
||||
torch_dtype: bfloat16
|
||||
attn_implementation: flash_attention_2
|
||||
|
||||
# Data training arguments
|
||||
dataset_name: open-r1/OpenR1-Math-cn_k12-86k
|
||||
system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. Respond in the following format: <think>\n...\n</think>\n<answer>\n...\n</answer>"
|
||||
|
||||
# GRPO trainer config
|
||||
beta: 0.001
|
||||
bf16: true
|
||||
do_eval: false
|
||||
eval_strategy: "no"
|
||||
use_vllm: true
|
||||
|
||||
do_eval: false
|
||||
gradient_accumulation_steps: 16
|
||||
gradient_checkpointing: true
|
||||
gradient_checkpointing_kwargs:
|
||||
use_reentrant: false
|
||||
hub_model_id: Qwen2.5-7B-Instruct-GRPO
|
||||
hub_model_revision: v01.07
|
||||
hub_strategy: every_save
|
||||
learning_rate: 1.0e-06
|
||||
log_completions: true
|
||||
log_level: info
|
||||
logging_first_step: true
|
||||
logging_steps: 1
|
||||
logging_strategy: steps
|
||||
lr_scheduler_type: constant_with_warmup
|
||||
max_grad_norm: 0.2
|
||||
max_prompt_length: 1024
|
||||
max_completion_length: 4096
|
||||
max_steps: -1
|
||||
num_generations: 16
|
||||
num_train_epochs: 1
|
||||
output_dir: data/Qwen2.5-7B-Instruct-GRPO_v01.07
|
||||
overwrite_output_dir: true
|
||||
per_device_train_batch_size: 4
|
||||
push_to_hub: true
|
||||
use_liger_kernel: true
|
||||
report_to:
|
||||
- wandb
|
||||
reward_funcs:
|
||||
- accuracy
|
||||
- format
|
||||
reward_weights:
|
||||
- 1.0
|
||||
- 0.2
|
||||
save_strategy: "steps"
|
||||
save_steps: 0.1
|
||||
save_total_limit: 1
|
||||
seed: 42
|
||||
temperature: 0.7
|
||||
warmup_ratio: 0.1
|
||||
wandb_entity: huggingface
|
||||
wandb_project: open-r1
|
||||
scale_rewards: false
|
||||
62
recipes/Qwen2.5-7B-Instruct/grpo/config_v01.08.yaml
Normal file
62
recipes/Qwen2.5-7B-Instruct/grpo/config_v01.08.yaml
Normal file
|
|
@@ -0,0 +1,62 @@
|
|||
|
||||
# Model arguments
|
||||
model_name_or_path: Qwen/Qwen2.5-7B-Instruct
|
||||
model_revision: main
|
||||
torch_dtype: bfloat16
|
||||
attn_implementation: flash_attention_2
|
||||
|
||||
# Data training arguments
|
||||
dataset_name: open-r1/OpenR1-Math-cn_k12-86k
|
||||
system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. Respond in the following format: <think>\n...\n</think>\n<answer>\n...\n</answer>"
|
||||
|
||||
# GRPO trainer config
|
||||
beta: 0.0
|
||||
bf16: true
|
||||
do_eval: false
|
||||
eval_strategy: "no"
|
||||
use_vllm: true
|
||||
|
||||
do_eval: false
|
||||
gradient_accumulation_steps: 16
|
||||
gradient_checkpointing: true
|
||||
gradient_checkpointing_kwargs:
|
||||
use_reentrant: false
|
||||
hub_model_id: Qwen2.5-7B-Instruct-GRPO
|
||||
hub_model_revision: v01.08
|
||||
hub_strategy: every_save
|
||||
learning_rate: 1.0e-06
|
||||
log_completions: true
|
||||
log_level: info
|
||||
logging_first_step: true
|
||||
logging_steps: 1
|
||||
logging_strategy: steps
|
||||
lr_scheduler_type: constant_with_warmup
|
||||
max_grad_norm: 0.2
|
||||
max_prompt_length: 1024
|
||||
max_completion_length: 4096
|
||||
max_steps: -1
|
||||
num_generations: 16
|
||||
num_train_epochs: 1
|
||||
output_dir: data/Qwen2.5-7B-Instruct-GRPO_v01.08
|
||||
overwrite_output_dir: true
|
||||
per_device_train_batch_size: 8
|
||||
push_to_hub: true
|
||||
use_liger_kernel: true
|
||||
report_to:
|
||||
- wandb
|
||||
reward_funcs:
|
||||
- accuracy
|
||||
- format
|
||||
reward_weights:
|
||||
- 1.0
|
||||
- 0.2
|
||||
save_strategy: "steps"
|
||||
save_steps: 0.1
|
||||
save_total_limit: 1
|
||||
seed: 42
|
||||
temperature: 0.7
|
||||
warmup_ratio: 0.1
|
||||
wandb_entity: huggingface
|
||||
wandb_project: open-r1
|
||||
num_iterations: 4
|
||||
scale_rewards: false
|
||||
62
recipes/Qwen2.5-7B-Instruct/grpo/config_v01.09.yaml
Normal file
62
recipes/Qwen2.5-7B-Instruct/grpo/config_v01.09.yaml
Normal file
|
|
@@ -0,0 +1,62 @@
|
|||
|
||||
# Model arguments
|
||||
model_name_or_path: Qwen/Qwen2.5-7B-Instruct
|
||||
model_revision: main
|
||||
torch_dtype: bfloat16
|
||||
attn_implementation: flash_attention_2
|
||||
|
||||
# Data training arguments
|
||||
dataset_name: open-r1/OpenR1-Math-cn_k12-86k
|
||||
system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. Respond in the following format: <think>\n...\n</think>\n<answer>\n...\n</answer>"
|
||||
|
||||
# GRPO trainer config
|
||||
beta: 0.0
|
||||
bf16: true
|
||||
do_eval: false
|
||||
eval_strategy: "no"
|
||||
use_vllm: true
|
||||
|
||||
do_eval: false
|
||||
gradient_accumulation_steps: 16
|
||||
gradient_checkpointing: true
|
||||
gradient_checkpointing_kwargs:
|
||||
use_reentrant: false
|
||||
hub_model_id: Qwen2.5-7B-Instruct-GRPO
|
||||
hub_model_revision: v01.09
|
||||
hub_strategy: every_save
|
||||
learning_rate: 1.0e-06
|
||||
log_completions: true
|
||||
log_level: info
|
||||
logging_first_step: true
|
||||
logging_steps: 1
|
||||
logging_strategy: steps
|
||||
lr_scheduler_type: constant_with_warmup
|
||||
max_grad_norm: 0.2
|
||||
max_prompt_length: 1024
|
||||
max_completion_length: 4096
|
||||
max_steps: -1
|
||||
num_generations: 16
|
||||
num_train_epochs: 1
|
||||
output_dir: data/Qwen2.5-7B-Instruct-GRPO_v01.09
|
||||
overwrite_output_dir: true
|
||||
per_device_train_batch_size: 8
|
||||
push_to_hub: true
|
||||
use_liger_kernel: true
|
||||
report_to:
|
||||
- wandb
|
||||
reward_funcs:
|
||||
- accuracy
|
||||
- format
|
||||
reward_weights:
|
||||
- 1.0
|
||||
- 0.2
|
||||
save_strategy: "steps"
|
||||
save_steps: 0.1
|
||||
save_total_limit: 1
|
||||
seed: 42
|
||||
temperature: 0.7
|
||||
warmup_ratio: 0.1
|
||||
wandb_entity: huggingface
|
||||
wandb_project: open-r1
|
||||
num_iterations: 4
|
||||
scale_rewards: true
|
||||
65
recipes/Qwen2.5-7B-Instruct/grpo/config_v05.00.yaml
Normal file
65
recipes/Qwen2.5-7B-Instruct/grpo/config_v05.00.yaml
Normal file
|
|
@@ -0,0 +1,65 @@
|
|||
# Model arguments
|
||||
model_name_or_path: Qwen/Qwen2.5-7B-Instruct
|
||||
model_revision: main
|
||||
torch_dtype: bfloat16
|
||||
attn_implementation: flash_attention_2
|
||||
# Data training arguments
|
||||
dataset_name: open-r1/verifiable-coding-problems-python_decontaminated-tested
|
||||
|
||||
system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. Respond in the following format: <think>\n...\n</think>\n<answer>\n...\n</answer>"
|
||||
|
||||
# GRPO trainer config
|
||||
callbacks:
|
||||
- push_to_hub_revision
|
||||
benchmarks:
|
||||
- math_500
|
||||
- aime24
|
||||
beta: 0.000
|
||||
bf16: true
|
||||
do_eval: false
|
||||
eval_strategy: "no"
|
||||
use_vllm: true
|
||||
vllm_device: auto
|
||||
vllm_gpu_memory_utilization: 0.7
|
||||
do_eval: false
|
||||
gradient_accumulation_steps: 16
|
||||
gradient_checkpointing: true
|
||||
gradient_checkpointing_kwargs:
|
||||
use_reentrant: false
|
||||
hub_model_id: open-r1/Qwen2.5-7B-Instruct-GRPO
|
||||
hub_model_revision: v05.00
|
||||
hub_strategy: every_save
|
||||
learning_rate: 1.0e-06
|
||||
log_completions: true
|
||||
log_level: info
|
||||
logging_first_step: true
|
||||
logging_steps: 1
|
||||
logging_strategy: steps
|
||||
lr_scheduler_type: constant_with_warmup
|
||||
max_grad_norm: 0.2
|
||||
max_prompt_length: 1024
|
||||
max_completion_length: 4096
|
||||
max_steps: -1
|
||||
num_generations: 16
|
||||
num_iterations: 4
|
||||
num_train_epochs: 1.0
|
||||
output_dir: data/Qwen2.5-7B-Instruct-GRPO_v05.00
|
||||
overwrite_output_dir: true
|
||||
per_device_train_batch_size: 8
|
||||
push_to_hub: true
|
||||
report_to:
|
||||
- wandb
|
||||
reward_funcs:
|
||||
- binary_code
|
||||
- code_format
|
||||
reward_weights:
|
||||
- 1.0
|
||||
- 0.1
|
||||
save_strategy: "steps"
|
||||
save_steps: 0.1
|
||||
save_total_limit: 1
|
||||
seed: 42
|
||||
temperature: 0.7
|
||||
wandb_entity: huggingface
|
||||
wandb_project: open-r1
|
||||
warmup_ratio: 0.1
|
||||
65
recipes/Qwen2.5-7B-Instruct/grpo/config_v05.02.yaml
Normal file
65
recipes/Qwen2.5-7B-Instruct/grpo/config_v05.02.yaml
Normal file
|
|
@@ -0,0 +1,65 @@
|
|||
# Model arguments
|
||||
model_name_or_path: Qwen/Qwen2.5-7B-Instruct
|
||||
model_revision: main
|
||||
torch_dtype: bfloat16
|
||||
attn_implementation: flash_attention_2
|
||||
# Data training arguments
|
||||
dataset_name: open-r1/verifiable-coding-problems-python_decontaminated-tested
|
||||
|
||||
system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. Respond in the following format: <think>\n...\n</think>\n<answer>\n...\n</answer>"
|
||||
|
||||
# GRPO trainer config
|
||||
callbacks:
|
||||
- push_to_hub_revision
|
||||
benchmarks:
|
||||
- lcb_v4
|
||||
beta: 0.000
|
||||
bf16: true
|
||||
do_eval: false
|
||||
eval_strategy: "no"
|
||||
use_vllm: true
|
||||
vllm_device: auto
|
||||
vllm_gpu_memory_utilization: 0.7
|
||||
do_eval: false
|
||||
gradient_accumulation_steps: 16
|
||||
gradient_checkpointing: true
|
||||
gradient_checkpointing_kwargs:
|
||||
use_reentrant: false
|
||||
hub_model_id: open-r1/Qwen2.5-7B-Instruct-GRPO
|
||||
hub_model_revision: v05.02
|
||||
hub_strategy: every_save
|
||||
learning_rate: 1.0e-06
|
||||
log_completions: true
|
||||
log_level: info
|
||||
logging_first_step: true
|
||||
logging_steps: 1
|
||||
logging_strategy: steps
|
||||
lr_scheduler_type: constant_with_warmup
|
||||
max_grad_norm: 0.2
|
||||
max_prompt_length: 1024
|
||||
max_completion_length: 4096
|
||||
max_steps: -1
|
||||
num_generations: 16
|
||||
num_iterations: 4
|
||||
num_train_epochs: 1.0
|
||||
output_dir: data/Qwen2.5-7B-Instruct-GRPO_v05.02
|
||||
overwrite_output_dir: true
|
||||
per_device_train_batch_size: 8
|
||||
push_to_hub: true
|
||||
report_to:
|
||||
- wandb
|
||||
reward_funcs:
|
||||
- code
|
||||
- code_format
|
||||
reward_weights:
|
||||
- 1.0
|
||||
- 0.1
|
||||
save_strategy: "steps"
|
||||
save_steps: 0.05
|
||||
save_total_limit: 1
|
||||
seed: 42
|
||||
temperature: 0.7
|
||||
wandb_entity: huggingface
|
||||
wandb_project: open-r1
|
||||
warmup_ratio: 0.1
|
||||
parallel_code_exec_per_proc: 10
|
||||
65
recipes/Qwen2.5-7B-Instruct/grpo/config_v05.03.yaml
Normal file
65
recipes/Qwen2.5-7B-Instruct/grpo/config_v05.03.yaml
Normal file
|
|
@@ -0,0 +1,65 @@
|
|||
# Model arguments
|
||||
model_name_or_path: Qwen/Qwen2.5-7B-Instruct
|
||||
model_revision: main
|
||||
torch_dtype: bfloat16
|
||||
attn_implementation: flash_attention_2
|
||||
# Data training arguments
|
||||
dataset_name: open-r1/verifiable-coding-problems-python_decontaminated-tested
|
||||
|
||||
system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. Respond in the following format: <think>\n...\n</think>\n<answer>\n...\n</answer>"
|
||||
|
||||
# GRPO trainer config
|
||||
callbacks:
|
||||
- push_to_hub_revision
|
||||
benchmarks:
|
||||
- lcb_v4
|
||||
beta: 0.000
|
||||
bf16: true
|
||||
do_eval: false
|
||||
eval_strategy: "no"
|
||||
use_vllm: true
|
||||
vllm_device: auto
|
||||
vllm_gpu_memory_utilization: 0.7
|
||||
do_eval: false
|
||||
gradient_accumulation_steps: 16
|
||||
gradient_checkpointing: true
|
||||
gradient_checkpointing_kwargs:
|
||||
use_reentrant: false
|
||||
hub_model_id: open-r1/Qwen2.5-7B-Instruct-GRPO
|
||||
hub_model_revision: v05.03
|
||||
hub_strategy: every_save
|
||||
learning_rate: 1.0e-06
|
||||
log_completions: true
|
||||
log_level: info
|
||||
logging_first_step: true
|
||||
logging_steps: 1
|
||||
logging_strategy: steps
|
||||
lr_scheduler_type: constant_with_warmup
|
||||
max_grad_norm: 0.2
|
||||
max_prompt_length: 1024
|
||||
max_completion_length: 4096
|
||||
max_steps: -1
|
||||
num_generations: 16
|
||||
num_iterations: 4
|
||||
num_train_epochs: 1.0
|
||||
output_dir: data/Qwen2.5-7B-Instruct-GRPO_v05.03
|
||||
overwrite_output_dir: true
|
||||
per_device_train_batch_size: 8
|
||||
push_to_hub: true
|
||||
report_to:
|
||||
- wandb
|
||||
reward_funcs:
|
||||
- binary_code
|
||||
- code_format
|
||||
reward_weights:
|
||||
- 1.0
|
||||
- 0.1
|
||||
save_strategy: "steps"
|
||||
save_steps: 0.05
|
||||
save_total_limit: 1
|
||||
seed: 42
|
||||
temperature: 0.7
|
||||
wandb_entity: huggingface
|
||||
wandb_project: open-r1
|
||||
warmup_ratio: 0.1
|
||||
parallel_code_exec_per_proc: 10
|
||||
65
recipes/Qwen2.5-7B-Instruct/grpo/config_v05.04.yaml
Normal file
65
recipes/Qwen2.5-7B-Instruct/grpo/config_v05.04.yaml
Normal file
|
|
@@ -0,0 +1,65 @@
|
|||
# Model arguments
|
||||
model_name_or_path: Qwen/Qwen2.5-7B-Instruct
|
||||
model_revision: main
|
||||
torch_dtype: bfloat16
|
||||
attn_implementation: flash_attention_2
|
||||
# Data training arguments
|
||||
dataset_name: open-r1/verifiable-coding-problems-python_decontaminated-tested
|
||||
|
||||
system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. Respond in the following format: <think>\n...\n</think>\n<answer>\n...\n</answer>"
|
||||
|
||||
# GRPO trainer config
|
||||
callbacks:
|
||||
- push_to_hub_revision
|
||||
benchmarks:
|
||||
- lcb_v4
|
||||
beta: 0.000
|
||||
bf16: true
|
||||
do_eval: false
|
||||
eval_strategy: "no"
|
||||
use_vllm: true
|
||||
vllm_device: auto
|
||||
vllm_gpu_memory_utilization: 0.7
|
||||
do_eval: false
|
||||
gradient_accumulation_steps: 64
|
||||
gradient_checkpointing: true
|
||||
gradient_checkpointing_kwargs:
|
||||
use_reentrant: false
|
||||
hub_model_id: open-r1/Qwen2.5-7B-Instruct-GRPO
|
||||
hub_model_revision: v05.04
|
||||
hub_strategy: every_save
|
||||
learning_rate: 1.0e-06
|
||||
log_completions: true
|
||||
log_level: info
|
||||
logging_first_step: true
|
||||
logging_steps: 1
|
||||
logging_strategy: steps
|
||||
lr_scheduler_type: constant_with_warmup
|
||||
max_grad_norm: 0.2
|
||||
max_prompt_length: 1024
|
||||
max_completion_length: 4096
|
||||
max_steps: -1
|
||||
num_generations: 16
|
||||
num_iterations: 4
|
||||
num_train_epochs: 1.0
|
||||
output_dir: data/Qwen2.5-7B-Instruct-GRPO_v05.04
|
||||
overwrite_output_dir: true
|
||||
per_device_train_batch_size: 8
|
||||
push_to_hub: true
|
||||
report_to:
|
||||
- wandb
|
||||
reward_funcs:
|
||||
- binary_code
|
||||
- code_format
|
||||
reward_weights:
|
||||
- 1.0
|
||||
- 0.1
|
||||
save_strategy: "steps"
|
||||
save_steps: 0.05
|
||||
save_total_limit: 1
|
||||
seed: 42
|
||||
temperature: 0.7
|
||||
wandb_entity: huggingface
|
||||
wandb_project: open-r1
|
||||
warmup_ratio: 0.1
|
||||
parallel_code_exec_per_proc: 10
|
||||
67
recipes/Qwen2.5-7B-Instruct/grpo/config_v05.05.yaml
Normal file
67
recipes/Qwen2.5-7B-Instruct/grpo/config_v05.05.yaml
Normal file
|
|
@ -0,0 +1,67 @@
|
|||
# Model arguments
|
||||
model_name_or_path: Qwen/Qwen2.5-7B-Instruct
|
||||
model_revision: main
|
||||
torch_dtype: bfloat16
|
||||
attn_implementation: flash_attention_2
|
||||
# Data training arguments
|
||||
dataset_name: open-r1/verifiable-coding-problems-python_decontaminated-tested
|
||||
|
||||
system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. Respond in the following format: <think>\n...\n</think>\n<answer>\n...\n</answer>"
|
||||
|
||||
# GRPO trainer config
|
||||
callbacks:
|
||||
- push_to_hub_revision
|
||||
benchmarks:
|
||||
- lcb_v4
|
||||
beta: 0.000
|
||||
bf16: true
|
||||
do_eval: false
|
||||
eval_strategy: "no"
|
||||
use_vllm: true
|
||||
vllm_device: auto
|
||||
vllm_gpu_memory_utilization: 0.7
|
||||
do_eval: false
|
||||
gradient_accumulation_steps: 16
|
||||
gradient_checkpointing: true
|
||||
gradient_checkpointing_kwargs:
|
||||
use_reentrant: false
|
||||
hub_model_id: open-r1/Qwen2.5-7B-Instruct-GRPO
|
||||
hub_model_revision: v05.05
|
||||
hub_strategy: every_save
|
||||
learning_rate: 1.0e-06
|
||||
log_completions: true
|
||||
log_level: info
|
||||
logging_first_step: true
|
||||
logging_steps: 1
|
||||
logging_strategy: steps
|
||||
lr_scheduler_type: cosine_with_min_lr
|
||||
lr_scheduler_kwargs:
|
||||
min_lr_rate: 0.1
|
||||
max_grad_norm: 0.2
|
||||
max_prompt_length: 1024
|
||||
max_completion_length: 4096
|
||||
max_steps: -1
|
||||
num_generations: 16
|
||||
num_iterations: 4
|
||||
num_train_epochs: 1.0
|
||||
output_dir: data/Qwen2.5-7B-Instruct-GRPO_v05.05
|
||||
overwrite_output_dir: true
|
||||
per_device_train_batch_size: 8
|
||||
push_to_hub: true
|
||||
report_to:
|
||||
- wandb
|
||||
reward_funcs:
|
||||
- binary_code
|
||||
- code_format
|
||||
reward_weights:
|
||||
- 1.0
|
||||
- 0.1
|
||||
save_strategy: "steps"
|
||||
save_steps: 0.05
|
||||
save_total_limit: 1
|
||||
seed: 42
|
||||
temperature: 0.7
|
||||
wandb_entity: huggingface
|
||||
wandb_project: open-r1
|
||||
warmup_ratio: 0.1
|
||||
parallel_code_exec_per_proc: 10
|
||||
65
recipes/Qwen2.5-7B-Instruct/grpo/config_v05.06.yaml
Normal file
65
recipes/Qwen2.5-7B-Instruct/grpo/config_v05.06.yaml
Normal file
|
|
@ -0,0 +1,65 @@
|
|||
# Model arguments
|
||||
model_name_or_path: Qwen/Qwen2.5-7B-Instruct
|
||||
model_revision: main
|
||||
torch_dtype: bfloat16
|
||||
attn_implementation: flash_attention_2
|
||||
# Data training arguments
|
||||
dataset_name: open-r1/verifiable-coding-problems-python_decontaminated-tested
|
||||
|
||||
system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. Respond in the following format: <think>\n...\n</think>\n<answer>\n...\n</answer>"
|
||||
|
||||
# GRPO trainer config
|
||||
callbacks:
|
||||
- push_to_hub_revision
|
||||
benchmarks:
|
||||
- lcb_v4
|
||||
beta: 0.000
|
||||
bf16: true
|
||||
do_eval: false
|
||||
eval_strategy: "no"
|
||||
use_vllm: true
|
||||
vllm_device: auto
|
||||
vllm_gpu_memory_utilization: 0.7
|
||||
do_eval: false
|
||||
gradient_accumulation_steps: 64
|
||||
gradient_checkpointing: true
|
||||
gradient_checkpointing_kwargs:
|
||||
use_reentrant: false
|
||||
hub_model_id: open-r1/Qwen2.5-7B-Instruct-GRPO
|
||||
hub_model_revision: v05.06
|
||||
hub_strategy: every_save
|
||||
learning_rate: 1.0e-06
|
||||
log_completions: true
|
||||
log_level: info
|
||||
logging_first_step: true
|
||||
logging_steps: 1
|
||||
logging_strategy: steps
|
||||
lr_scheduler_type: constant_with_warmup
|
||||
max_grad_norm: 0.2
|
||||
max_prompt_length: 1024
|
||||
max_completion_length: 4096
|
||||
max_steps: -1
|
||||
num_generations: 64
|
||||
num_iterations: 4
|
||||
num_train_epochs: 1.0
|
||||
output_dir: data/Qwen2.5-7B-Instruct-GRPO_v05.06
|
||||
overwrite_output_dir: true
|
||||
per_device_train_batch_size: 8
|
||||
push_to_hub: true
|
||||
report_to:
|
||||
- wandb
|
||||
reward_funcs:
|
||||
- binary_code
|
||||
- code_format
|
||||
reward_weights:
|
||||
- 1.0
|
||||
- 0.1
|
||||
save_strategy: "steps"
|
||||
save_steps: 0.05
|
||||
save_total_limit: 1
|
||||
seed: 42
|
||||
temperature: 0.7
|
||||
wandb_entity: huggingface
|
||||
wandb_project: open-r1
|
||||
warmup_ratio: 0.1
|
||||
parallel_code_exec_per_proc: 10
|
||||
67
recipes/Qwen2.5-7B-Instruct/grpo/config_v05.07.yaml
Normal file
67
recipes/Qwen2.5-7B-Instruct/grpo/config_v05.07.yaml
Normal file
|
|
@ -0,0 +1,67 @@
|
|||
# Model arguments
|
||||
model_name_or_path: Qwen/Qwen2.5-7B-Instruct
|
||||
model_revision: main
|
||||
torch_dtype: bfloat16
|
||||
attn_implementation: flash_attention_2
|
||||
# Data training arguments
|
||||
dataset_name: open-r1/verifiable-coding-problems-python_decontaminated-tested
|
||||
|
||||
system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. Respond in the following format: <think>\n...\n</think>\n<answer>\n...\n</answer>"
|
||||
|
||||
# GRPO trainer config
|
||||
callbacks:
|
||||
- push_to_hub_revision
|
||||
benchmarks:
|
||||
- lcb_v4
|
||||
beta: 0.000
|
||||
bf16: true
|
||||
do_eval: false
|
||||
eval_strategy: "no"
|
||||
use_vllm: true
|
||||
vllm_device: auto
|
||||
vllm_gpu_memory_utilization: 0.7
|
||||
do_eval: false
|
||||
gradient_accumulation_steps: 64
|
||||
gradient_checkpointing: true
|
||||
gradient_checkpointing_kwargs:
|
||||
use_reentrant: false
|
||||
hub_model_id: open-r1/Qwen2.5-7B-Instruct-GRPO
|
||||
hub_model_revision: v05.07
|
||||
hub_strategy: every_save
|
||||
learning_rate: 1.0e-06
|
||||
log_completions: true
|
||||
log_level: info
|
||||
logging_first_step: true
|
||||
logging_steps: 1
|
||||
logging_strategy: steps
|
||||
lr_scheduler_type: cosine_with_min_lr
|
||||
lr_scheduler_kwargs:
|
||||
min_lr_rate: 0.1
|
||||
max_grad_norm: 0.2
|
||||
max_prompt_length: 1024
|
||||
max_completion_length: 4096
|
||||
max_steps: -1
|
||||
num_generations: 64
|
||||
num_iterations: 4
|
||||
num_train_epochs: 1.0
|
||||
output_dir: data/Qwen2.5-7B-Instruct-GRPO_v05.07
|
||||
overwrite_output_dir: true
|
||||
per_device_train_batch_size: 8
|
||||
push_to_hub: true
|
||||
report_to:
|
||||
- wandb
|
||||
reward_funcs:
|
||||
- binary_code
|
||||
- code_format
|
||||
reward_weights:
|
||||
- 1.0
|
||||
- 0.1
|
||||
save_strategy: "steps"
|
||||
save_steps: 0.05
|
||||
save_total_limit: 1
|
||||
seed: 42
|
||||
temperature: 0.7
|
||||
wandb_entity: huggingface
|
||||
wandb_project: open-r1
|
||||
warmup_ratio: 0.1
|
||||
parallel_code_exec_per_proc: 10
|
||||
67
recipes/Qwen2.5-7B-Instruct/grpo/config_v05.08.yaml
Normal file
67
recipes/Qwen2.5-7B-Instruct/grpo/config_v05.08.yaml
Normal file
|
|
@ -0,0 +1,67 @@
|
|||
# Model arguments
|
||||
model_name_or_path: Qwen/Qwen2.5-7B-Instruct
|
||||
model_revision: main
|
||||
torch_dtype: bfloat16
|
||||
attn_implementation: flash_attention_2
|
||||
# Data training arguments
|
||||
dataset_name: open-r1/verifiable-coding-problems-python_decontaminated-tested-shuffled
|
||||
|
||||
system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. Respond in the following format: <think>\n...\n</think>\n<answer>\n...\n</answer>"
|
||||
|
||||
# GRPO trainer config
|
||||
callbacks:
|
||||
- push_to_hub_revision
|
||||
benchmarks:
|
||||
- lcb_v4
|
||||
beta: 0.000
|
||||
bf16: true
|
||||
do_eval: false
|
||||
eval_strategy: "no"
|
||||
use_vllm: true
|
||||
vllm_device: auto
|
||||
vllm_gpu_memory_utilization: 0.7
|
||||
do_eval: false
|
||||
gradient_accumulation_steps: 16
|
||||
gradient_checkpointing: true
|
||||
gradient_checkpointing_kwargs:
|
||||
use_reentrant: false
|
||||
hub_model_id: open-r1/Qwen2.5-7B-Instruct-GRPO
|
||||
hub_model_revision: v05.08
|
||||
hub_strategy: every_save
|
||||
learning_rate: 1.0e-06
|
||||
log_completions: true
|
||||
log_level: info
|
||||
logging_first_step: true
|
||||
logging_steps: 1
|
||||
logging_strategy: steps
|
||||
lr_scheduler_type: cosine_with_min_lr
|
||||
lr_scheduler_kwargs:
|
||||
min_lr_rate: 0.1
|
||||
max_grad_norm: 0.2
|
||||
max_prompt_length: 1024
|
||||
max_completion_length: 4096
|
||||
max_steps: -1
|
||||
num_generations: 16
|
||||
num_iterations: 4
|
||||
num_train_epochs: 1.0
|
||||
output_dir: data/Qwen2.5-7B-Instruct-GRPO_v05.08
|
||||
overwrite_output_dir: true
|
||||
per_device_train_batch_size: 8
|
||||
push_to_hub: true
|
||||
report_to:
|
||||
- wandb
|
||||
reward_funcs:
|
||||
- binary_code
|
||||
- code_format
|
||||
reward_weights:
|
||||
- 1.0
|
||||
- 0.1
|
||||
save_strategy: "steps"
|
||||
save_steps: 0.05
|
||||
save_total_limit: 1
|
||||
seed: 42
|
||||
temperature: 0.7
|
||||
wandb_entity: huggingface
|
||||
wandb_project: open-r1
|
||||
warmup_ratio: 0.1
|
||||
parallel_code_exec_per_proc: 4
|
||||
67
recipes/Qwen2.5-7B-Instruct/grpo/config_v05.09.yaml
Normal file
67
recipes/Qwen2.5-7B-Instruct/grpo/config_v05.09.yaml
Normal file
|
|
@ -0,0 +1,67 @@
|
|||
# Model arguments
|
||||
model_name_or_path: Qwen/Qwen2.5-7B-Instruct
|
||||
model_revision: main
|
||||
torch_dtype: bfloat16
|
||||
attn_implementation: flash_attention_2
|
||||
# Data training arguments
|
||||
dataset_name: open-r1/verifiable-coding-problems-python_decontaminated-tested-shuffled
|
||||
|
||||
system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. Respond in the following format: <think>\n...\n</think>\n<answer>\n...\n</answer>"
|
||||
|
||||
# GRPO trainer config
|
||||
callbacks:
|
||||
- push_to_hub_revision
|
||||
benchmarks:
|
||||
- lcb_v4
|
||||
beta: 0.000
|
||||
bf16: true
|
||||
do_eval: false
|
||||
eval_strategy: "no"
|
||||
use_vllm: true
|
||||
vllm_device: auto
|
||||
vllm_gpu_memory_utilization: 0.7
|
||||
do_eval: false
|
||||
gradient_accumulation_steps: 16
|
||||
gradient_checkpointing: true
|
||||
gradient_checkpointing_kwargs:
|
||||
use_reentrant: false
|
||||
hub_model_id: open-r1/Qwen2.5-7B-Instruct-GRPO
|
||||
hub_model_revision: v05.09
|
||||
hub_strategy: every_save
|
||||
learning_rate: 5.0e-06
|
||||
log_completions: true
|
||||
log_level: info
|
||||
logging_first_step: true
|
||||
logging_steps: 1
|
||||
logging_strategy: steps
|
||||
lr_scheduler_type: cosine_with_min_lr
|
||||
lr_scheduler_kwargs:
|
||||
min_lr_rate: 0.1
|
||||
max_grad_norm: 0.2
|
||||
max_prompt_length: 1024
|
||||
max_completion_length: 4096
|
||||
max_steps: -1
|
||||
num_generations: 16
|
||||
num_iterations: 4
|
||||
num_train_epochs: 1.0
|
||||
output_dir: data/Qwen2.5-7B-Instruct-GRPO_v05.09
|
||||
overwrite_output_dir: true
|
||||
per_device_train_batch_size: 8
|
||||
push_to_hub: true
|
||||
report_to:
|
||||
- wandb
|
||||
reward_funcs:
|
||||
- binary_code
|
||||
- code_format
|
||||
reward_weights:
|
||||
- 1.0
|
||||
- 0.1
|
||||
save_strategy: "steps"
|
||||
save_steps: 0.05
|
||||
save_total_limit: 1
|
||||
seed: 42
|
||||
temperature: 0.7
|
||||
wandb_entity: huggingface
|
||||
wandb_project: open-r1
|
||||
warmup_ratio: 0.1
|
||||
parallel_code_exec_per_proc: 4
|
||||
67
recipes/Qwen2.5-7B-Instruct/grpo/config_v05.10.yaml
Normal file
67
recipes/Qwen2.5-7B-Instruct/grpo/config_v05.10.yaml
Normal file
|
|
@ -0,0 +1,67 @@
|
|||
# Model arguments
|
||||
model_name_or_path: Qwen/Qwen2.5-7B-Instruct
|
||||
model_revision: main
|
||||
torch_dtype: bfloat16
|
||||
attn_implementation: flash_attention_2
|
||||
# Data training arguments
|
||||
dataset_name: open-r1/verifiable-coding-problems-python_decontaminated-tested-shuffled
|
||||
|
||||
system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. Respond in the following format: <think>\n...\n</think>\n<answer>\n...\n</answer>"
|
||||
|
||||
# GRPO trainer config
|
||||
callbacks:
|
||||
- push_to_hub_revision
|
||||
benchmarks:
|
||||
- lcb_v4
|
||||
beta: 0.000
|
||||
bf16: true
|
||||
do_eval: false
|
||||
eval_strategy: "no"
|
||||
use_vllm: true
|
||||
vllm_device: auto
|
||||
vllm_gpu_memory_utilization: 0.7
|
||||
do_eval: false
|
||||
gradient_accumulation_steps: 16
|
||||
gradient_checkpointing: true
|
||||
gradient_checkpointing_kwargs:
|
||||
use_reentrant: false
|
||||
hub_model_id: open-r1/Qwen2.5-7B-Instruct-GRPO
|
||||
hub_model_revision: v05.10
|
||||
hub_strategy: every_save
|
||||
learning_rate: 1.0e-05
|
||||
log_completions: true
|
||||
log_level: info
|
||||
logging_first_step: true
|
||||
logging_steps: 1
|
||||
logging_strategy: steps
|
||||
lr_scheduler_type: cosine_with_min_lr
|
||||
lr_scheduler_kwargs:
|
||||
min_lr_rate: 0.1
|
||||
max_grad_norm: 0.2
|
||||
max_prompt_length: 1024
|
||||
max_completion_length: 4096
|
||||
max_steps: -1
|
||||
num_generations: 16
|
||||
num_iterations: 4
|
||||
num_train_epochs: 1.0
|
||||
output_dir: data/Qwen2.5-7B-Instruct-GRPO_v05.10
|
||||
overwrite_output_dir: true
|
||||
per_device_train_batch_size: 8
|
||||
push_to_hub: true
|
||||
report_to:
|
||||
- wandb
|
||||
reward_funcs:
|
||||
- binary_code
|
||||
- code_format
|
||||
reward_weights:
|
||||
- 1.0
|
||||
- 0.1
|
||||
save_strategy: "steps"
|
||||
save_steps: 0.05
|
||||
save_total_limit: 1
|
||||
seed: 42
|
||||
temperature: 0.7
|
||||
wandb_entity: huggingface
|
||||
wandb_project: open-r1
|
||||
warmup_ratio: 0.1
|
||||
parallel_code_exec_per_proc: 4
|
||||
67
recipes/Qwen2.5-7B-Instruct/grpo/config_v05.11.yaml
Normal file
67
recipes/Qwen2.5-7B-Instruct/grpo/config_v05.11.yaml
Normal file
|
|
@ -0,0 +1,67 @@
|
|||
# Model arguments
|
||||
model_name_or_path: Qwen/Qwen2.5-7B-Instruct
|
||||
model_revision: main
|
||||
torch_dtype: bfloat16
|
||||
attn_implementation: flash_attention_2
|
||||
# Data training arguments
|
||||
dataset_name: open-r1/verifiable-coding-problems-python_decontaminated-tested-shuffled
|
||||
|
||||
system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. Respond in the following format: <think>\n...\n</think>\n<answer>\n...\n</answer>"
|
||||
|
||||
# GRPO trainer config
|
||||
callbacks:
|
||||
- push_to_hub_revision
|
||||
benchmarks:
|
||||
- lcb_v4
|
||||
beta: 0.000
|
||||
bf16: true
|
||||
do_eval: false
|
||||
eval_strategy: "no"
|
||||
use_vllm: true
|
||||
vllm_device: auto
|
||||
vllm_gpu_memory_utilization: 0.7
|
||||
do_eval: false
|
||||
gradient_accumulation_steps: 16
|
||||
gradient_checkpointing: true
|
||||
gradient_checkpointing_kwargs:
|
||||
use_reentrant: false
|
||||
hub_model_id: open-r1/Qwen2.5-7B-Instruct-GRPO
|
||||
hub_model_revision: v05.11
|
||||
hub_strategy: every_save
|
||||
learning_rate: 4.0e-05
|
||||
log_completions: true
|
||||
log_level: info
|
||||
logging_first_step: true
|
||||
logging_steps: 1
|
||||
logging_strategy: steps
|
||||
lr_scheduler_type: cosine_with_min_lr
|
||||
lr_scheduler_kwargs:
|
||||
min_lr_rate: 0.1
|
||||
max_grad_norm: 0.2
|
||||
max_prompt_length: 1024
|
||||
max_completion_length: 4096
|
||||
max_steps: -1
|
||||
num_generations: 16
|
||||
num_iterations: 4
|
||||
num_train_epochs: 1.0
|
||||
output_dir: data/Qwen2.5-7B-Instruct-GRPO_v05.11
|
||||
overwrite_output_dir: true
|
||||
per_device_train_batch_size: 8
|
||||
push_to_hub: true
|
||||
report_to:
|
||||
- wandb
|
||||
reward_funcs:
|
||||
- binary_code
|
||||
- code_format
|
||||
reward_weights:
|
||||
- 1.0
|
||||
- 0.1
|
||||
save_strategy: "steps"
|
||||
save_steps: 0.05
|
||||
save_total_limit: 1
|
||||
seed: 42
|
||||
temperature: 0.7
|
||||
wandb_entity: huggingface
|
||||
wandb_project: open-r1
|
||||
warmup_ratio: 0.1
|
||||
parallel_code_exec_per_proc: 4
|
||||
67
recipes/Qwen2.5-7B-Instruct/grpo/config_v05.12.yaml
Normal file
67
recipes/Qwen2.5-7B-Instruct/grpo/config_v05.12.yaml
Normal file
|
|
@ -0,0 +1,67 @@
|
|||
# Model arguments
|
||||
model_name_or_path: Qwen/Qwen2.5-7B-Instruct
|
||||
model_revision: main
|
||||
torch_dtype: bfloat16
|
||||
attn_implementation: flash_attention_2
|
||||
# Data training arguments
|
||||
dataset_name: open-r1/verifiable-coding-problems-python_decontaminated-tested-shuffled
|
||||
|
||||
system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. Respond in the following format: <think>\n...\n</think>\n<answer>\n...\n</answer>"
|
||||
|
||||
# GRPO trainer config
|
||||
callbacks:
|
||||
- push_to_hub_revision
|
||||
benchmarks:
|
||||
- lcb_v4
|
||||
beta: 0.000
|
||||
bf16: true
|
||||
do_eval: false
|
||||
eval_strategy: "no"
|
||||
use_vllm: true
|
||||
vllm_device: auto
|
||||
vllm_gpu_memory_utilization: 0.7
|
||||
do_eval: false
|
||||
gradient_accumulation_steps: 16
|
||||
gradient_checkpointing: true
|
||||
gradient_checkpointing_kwargs:
|
||||
use_reentrant: false
|
||||
hub_model_id: open-r1/Qwen2.5-7B-Instruct-GRPO
|
||||
hub_model_revision: v05.12
|
||||
hub_strategy: every_save
|
||||
learning_rate: 5.0e-07
|
||||
log_completions: true
|
||||
log_level: info
|
||||
logging_first_step: true
|
||||
logging_steps: 1
|
||||
logging_strategy: steps
|
||||
lr_scheduler_type: cosine_with_min_lr
|
||||
lr_scheduler_kwargs:
|
||||
min_lr_rate: 0.1
|
||||
max_grad_norm: 0.2
|
||||
max_prompt_length: 1024
|
||||
max_completion_length: 4096
|
||||
max_steps: -1
|
||||
num_generations: 16
|
||||
num_iterations: 4
|
||||
num_train_epochs: 1.0
|
||||
output_dir: data/Qwen2.5-7B-Instruct-GRPO_v05.12
|
||||
overwrite_output_dir: true
|
||||
per_device_train_batch_size: 8
|
||||
push_to_hub: true
|
||||
report_to:
|
||||
- wandb
|
||||
reward_funcs:
|
||||
- binary_code
|
||||
- code_format
|
||||
reward_weights:
|
||||
- 1.0
|
||||
- 0.1
|
||||
save_strategy: "steps"
|
||||
save_steps: 0.05
|
||||
save_total_limit: 1
|
||||
seed: 42
|
||||
temperature: 0.7
|
||||
wandb_entity: huggingface
|
||||
wandb_project: open-r1
|
||||
warmup_ratio: 0.1
|
||||
parallel_code_exec_per_proc: 4
|
||||
66
recipes/Qwen2.5-7B-Instruct/grpo/config_v05.13.yaml
Normal file
66
recipes/Qwen2.5-7B-Instruct/grpo/config_v05.13.yaml
Normal file
|
|
@ -0,0 +1,66 @@
|
|||
# Model arguments
|
||||
model_name_or_path: Qwen/Qwen2.5-7B-Instruct
|
||||
model_revision: main
|
||||
torch_dtype: bfloat16
|
||||
attn_implementation: flash_attention_2
|
||||
# Data training arguments
|
||||
dataset_name: open-r1/verifiable-coding-problems-python_decontaminated-tested-shuffled
|
||||
dataset_prompt_column: problem
|
||||
|
||||
system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. Respond in the following format: <think>\n...\n</think>\n<answer>\n...\n</answer>"
|
||||
|
||||
# GRPO trainer config
|
||||
callbacks:
|
||||
- push_to_hub_revision
|
||||
benchmarks:
|
||||
- lcb_v4
|
||||
beta: 0.000
|
||||
bf16: true
|
||||
do_eval: false
|
||||
eval_strategy: "no"
|
||||
use_vllm: true
|
||||
vllm_device: auto
|
||||
vllm_gpu_memory_utilization: 0.7
|
||||
do_eval: false
|
||||
gradient_accumulation_steps: 16
|
||||
gradient_checkpointing: true
|
||||
gradient_checkpointing_kwargs:
|
||||
use_reentrant: false
|
||||
hub_model_id: open-r1/Qwen2.5-7B-Instruct-GRPO
|
||||
hub_model_revision: v05.13
|
||||
hub_strategy: every_save
|
||||
learning_rate: 5.0e-07
|
||||
log_completions: true
|
||||
log_level: info
|
||||
logging_first_step: true
|
||||
logging_steps: 1
|
||||
logging_strategy: steps
|
||||
lr_scheduler_type: constant_with_warmup
|
||||
max_grad_norm: 0.2
|
||||
max_prompt_length: 1024
|
||||
max_completion_length: 4096
|
||||
max_steps: -1
|
||||
num_generations: 16
|
||||
num_iterations: 4
|
||||
num_train_epochs: 1.0
|
||||
output_dir: data/Qwen2.5-7B-Instruct-GRPO_v05.13
|
||||
overwrite_output_dir: true
|
||||
per_device_train_batch_size: 8
|
||||
push_to_hub: true
|
||||
report_to:
|
||||
- wandb
|
||||
reward_funcs:
|
||||
- binary_code
|
||||
- code_format
|
||||
reward_weights:
|
||||
- 1.0
|
||||
- 0.1
|
||||
save_strategy: "steps"
|
||||
save_steps: 0.05
|
||||
save_total_limit: 1
|
||||
seed: 42
|
||||
temperature: 0.7
|
||||
wandb_entity: huggingface
|
||||
wandb_project: open-r1
|
||||
warmup_ratio: 0.1
|
||||
parallel_code_exec_per_proc: 10
|
||||
66
recipes/Qwen2.5-7B-Instruct/grpo/config_v05.14.yaml
Normal file
66
recipes/Qwen2.5-7B-Instruct/grpo/config_v05.14.yaml
Normal file
|
|
@ -0,0 +1,66 @@
|
|||
# Model arguments
|
||||
model_name_or_path: Qwen/Qwen2.5-7B-Instruct
|
||||
model_revision: main
|
||||
torch_dtype: bfloat16
|
||||
attn_implementation: flash_attention_2
|
||||
# Data training arguments
|
||||
dataset_name: open-r1/verifiable-coding-problems-python_decontaminated-tested-shuffled
|
||||
dataset_prompt_column: problem
|
||||
|
||||
system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. Respond in the following format: <think>\n...\n</think>\n<answer>\n...\n</answer>"
|
||||
|
||||
# GRPO trainer config
|
||||
callbacks:
|
||||
- push_to_hub_revision
|
||||
benchmarks:
|
||||
- lcb_v4
|
||||
beta: 0.000
|
||||
bf16: true
|
||||
do_eval: false
|
||||
eval_strategy: "no"
|
||||
use_vllm: true
|
||||
vllm_device: auto
|
||||
vllm_gpu_memory_utilization: 0.7
|
||||
do_eval: false
|
||||
gradient_accumulation_steps: 16
|
||||
gradient_checkpointing: true
|
||||
gradient_checkpointing_kwargs:
|
||||
use_reentrant: false
|
||||
hub_model_id: open-r1/Qwen2.5-7B-Instruct-GRPO
|
||||
hub_model_revision: v05.14
|
||||
hub_strategy: every_save
|
||||
learning_rate: 5.0e-07
|
||||
log_completions: true
|
||||
log_level: info
|
||||
logging_first_step: true
|
||||
logging_steps: 1
|
||||
logging_strategy: steps
|
||||
lr_scheduler_type: constant_with_warmup
|
||||
max_grad_norm: 0.2
|
||||
max_prompt_length: 1024
|
||||
max_completion_length: 4096
|
||||
max_steps: -1
|
||||
num_generations: 16
|
||||
num_iterations: 4
|
||||
num_train_epochs: 1.0
|
||||
output_dir: data/Qwen2.5-7B-Instruct-GRPO_v05.14
|
||||
overwrite_output_dir: true
|
||||
per_device_train_batch_size: 8
|
||||
push_to_hub: true
|
||||
report_to:
|
||||
- wandb
|
||||
reward_funcs:
|
||||
- binary_code
|
||||
- code_format
|
||||
reward_weights:
|
||||
- 1.0
|
||||
- 0.1
|
||||
save_strategy: "steps"
|
||||
save_steps: 0.05
|
||||
save_total_limit: 1
|
||||
seed: 42
|
||||
temperature: 0.7
|
||||
wandb_entity: huggingface
|
||||
wandb_project: open-r1
|
||||
warmup_ratio: 0.01
|
||||
parallel_code_exec_per_proc: 10
|
||||
65
recipes/Qwen2.5-7B-Instruct/grpo/config_v05.15.yaml
Normal file
65
recipes/Qwen2.5-7B-Instruct/grpo/config_v05.15.yaml
Normal file
|
|
@ -0,0 +1,65 @@
|
|||
# Model arguments
|
||||
model_name_or_path: Qwen/Qwen2.5-7B-Instruct
|
||||
model_revision: main
|
||||
torch_dtype: bfloat16
|
||||
attn_implementation: flash_attention_2
|
||||
# Data training arguments
|
||||
dataset_name: open-r1/verifiable-coding-problems-python_decontaminated-tested-shuffled
|
||||
dataset_prompt_column: problem
|
||||
system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. Respond in the following format: <think>\n...\n</think>\n<answer>\n...\n</answer>"
|
||||
|
||||
# GRPO trainer config
|
||||
callbacks:
|
||||
- push_to_hub_revision
|
||||
benchmarks:
|
||||
- lcb_v4
|
||||
beta: 0.000
|
||||
bf16: true
|
||||
do_eval: false
|
||||
eval_strategy: "no"
|
||||
use_vllm: true
|
||||
vllm_device: auto
|
||||
vllm_gpu_memory_utilization: 0.7
|
||||
do_eval: false
|
||||
gradient_accumulation_steps: 16
|
||||
gradient_checkpointing: true
|
||||
gradient_checkpointing_kwargs:
|
||||
use_reentrant: false
|
||||
hub_model_id: open-r1/Qwen2.5-7B-Instruct-GRPO
|
||||
hub_model_revision: v05.15
|
||||
hub_strategy: every_save
|
||||
learning_rate: 5.0e-07
|
||||
log_completions: true
|
||||
log_level: info
|
||||
logging_first_step: true
|
||||
logging_steps: 1
|
||||
logging_strategy: steps
|
||||
lr_scheduler_type: constant_with_warmup
|
||||
max_grad_norm: 1.0
|
||||
max_prompt_length: 1024
|
||||
max_completion_length: 4096
|
||||
max_steps: -1
|
||||
num_generations: 16
|
||||
num_iterations: 4
|
||||
num_train_epochs: 1.0
|
||||
output_dir: data/Qwen2.5-7B-Instruct-GRPO_v05.15
|
||||
overwrite_output_dir: true
|
||||
per_device_train_batch_size: 8
|
||||
push_to_hub: true
|
||||
report_to:
|
||||
- wandb
|
||||
reward_funcs:
|
||||
- binary_code
|
||||
- code_format
|
||||
reward_weights:
|
||||
- 1.0
|
||||
- 0.1
|
||||
save_strategy: "steps"
|
||||
save_steps: 0.05
|
||||
save_total_limit: 1
|
||||
seed: 42
|
||||
temperature: 0.7
|
||||
wandb_entity: huggingface
|
||||
wandb_project: open-r1
|
||||
warmup_ratio: 0.01
|
||||
parallel_code_exec_per_proc: 10
|
||||
67
recipes/Qwen2.5-7B-Instruct/grpo/config_v05.16.yaml
Normal file
67
recipes/Qwen2.5-7B-Instruct/grpo/config_v05.16.yaml
Normal file
|
|
@ -0,0 +1,67 @@
|
|||
# Model arguments
|
||||
model_name_or_path: Qwen/Qwen2.5-7B-Instruct
|
||||
model_revision: main
|
||||
torch_dtype: bfloat16
|
||||
attn_implementation: flash_attention_2
|
||||
# Data training arguments
|
||||
dataset_name: open-r1/verifiable-coding-problems-python_decontaminated-tested
|
||||
dataset_prompt_column: problem
|
||||
|
||||
system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. Respond in the following format: <think>\n...\n</think>\n<answer>\n...\n</answer>"
|
||||
|
||||
# GRPO trainer config
|
||||
callbacks:
|
||||
- push_to_hub_revision
|
||||
benchmarks:
|
||||
- lcb_v4
|
||||
beta: 0.000
|
||||
bf16: true
|
||||
do_eval: false
|
||||
eval_strategy: "no"
|
||||
use_vllm: true
|
||||
vllm_device: auto
|
||||
vllm_gpu_memory_utilization: 0.7
|
||||
do_eval: false
|
||||
gradient_accumulation_steps: 16
|
||||
gradient_checkpointing: true
|
||||
gradient_checkpointing_kwargs:
|
||||
use_reentrant: false
|
||||
hub_model_id: open-r1/Qwen2.5-7B-Instruct-GRPO
|
||||
hub_model_revision: v05.16
|
||||
hub_strategy: every_save
|
||||
learning_rate: 1.0e-06
|
||||
log_completions: true
|
||||
log_level: info
|
||||
logging_first_step: true
|
||||
logging_steps: 1
|
||||
logging_strategy: steps
|
||||
lr_scheduler_type: constant_with_warmup
|
||||
max_grad_norm: 0.2
|
||||
max_prompt_length: 1024
|
||||
max_completion_length: 4096
|
||||
max_steps: -1
|
||||
num_generations: 16
|
||||
num_iterations: 4
|
||||
num_train_epochs: 1.0
|
||||
output_dir: data/Qwen2.5-7B-Instruct-GRPO_v05.16
|
||||
overwrite_output_dir: true
|
||||
per_device_train_batch_size: 8
|
||||
push_to_hub: true
|
||||
report_to:
|
||||
- wandb
|
||||
reward_funcs:
|
||||
- binary_code
|
||||
- code_format
|
||||
e2b_router_url: "ip-10-53-85-124:8000"
|
||||
reward_weights:
|
||||
- 1.0
|
||||
- 0.1
|
||||
save_strategy: "steps"
|
||||
save_steps: 0.05
|
||||
save_total_limit: 1
|
||||
seed: 42
|
||||
temperature: 0.7
|
||||
wandb_entity: huggingface
|
||||
wandb_project: open-r1
|
||||
warmup_ratio: 0.1
|
||||
parallel_code_exec_per_proc: 10
|
||||
67
recipes/Qwen2.5-7B-Instruct/grpo/config_v05.17.yaml
Normal file
67
recipes/Qwen2.5-7B-Instruct/grpo/config_v05.17.yaml
Normal file
|
|
@ -0,0 +1,67 @@
|
|||
# Model arguments
|
||||
model_name_or_path: Qwen/Qwen2.5-7B-Instruct
|
||||
model_revision: main
|
||||
torch_dtype: bfloat16
|
||||
attn_implementation: flash_attention_2
|
||||
# Data training arguments
|
||||
dataset_name: open-r1/verifiable-coding-problems-python_decontaminated-tested
|
||||
dataset_prompt_column: problem
|
||||
|
||||
system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. Respond in the following format: <think>\n...\n</think>\n<answer>\n...\n</answer>"
|
||||
|
||||
# GRPO trainer config
|
||||
callbacks:
|
||||
- push_to_hub_revision
|
||||
benchmarks:
|
||||
- lcb_v4
|
||||
beta: 0.000
|
||||
bf16: true
|
||||
do_eval: false
|
||||
eval_strategy: "no"
|
||||
use_vllm: true
|
||||
vllm_device: auto
|
||||
vllm_gpu_memory_utilization: 0.7
|
||||
do_eval: false
|
||||
gradient_accumulation_steps: 16
|
||||
gradient_checkpointing: true
|
||||
gradient_checkpointing_kwargs:
|
||||
use_reentrant: false
|
||||
hub_model_id: open-r1/Qwen2.5-7B-Instruct-GRPO
|
||||
hub_model_revision: v05.17
|
||||
hub_strategy: every_save
|
||||
learning_rate: 1.0e-06
|
||||
log_completions: true
|
||||
log_level: info
|
||||
logging_first_step: true
|
||||
logging_steps: 1
|
||||
logging_strategy: steps
|
||||
lr_scheduler_type: constant_with_warmup
|
||||
max_grad_norm: 0.2
|
||||
max_prompt_length: 1024
|
||||
max_completion_length: 4096
|
||||
max_steps: -1
|
||||
num_generations: 16
|
||||
num_iterations: 4
|
||||
num_train_epochs: 1.0
|
||||
output_dir: data/Qwen2.5-7B-Instruct-GRPO_v05.17
|
||||
overwrite_output_dir: true
|
||||
per_device_train_batch_size: 8
|
||||
push_to_hub: true
|
||||
report_to:
|
||||
- wandb
|
||||
reward_funcs:
|
||||
- binary_code
|
||||
- code_format
|
||||
e2b_router_url: "ip-10-53-85-124:8000"
|
||||
reward_weights:
|
||||
- 1.0
|
||||
- 0.1
|
||||
save_strategy: "steps"
|
||||
save_steps: 0.05
|
||||
save_total_limit: 1
|
||||
seed: 42
|
||||
temperature: 0.4
|
||||
wandb_entity: huggingface
|
||||
wandb_project: open-r1
|
||||
warmup_ratio: 0.1
|
||||
parallel_code_exec_per_proc: 10
|
||||
67
recipes/Qwen2.5-7B-Instruct/grpo/config_v05.18.yaml
Normal file
67
recipes/Qwen2.5-7B-Instruct/grpo/config_v05.18.yaml
Normal file
|
|
@ -0,0 +1,67 @@
|
|||
# Model arguments
|
||||
model_name_or_path: Qwen/Qwen2.5-7B-Instruct
|
||||
model_revision: main
|
||||
torch_dtype: bfloat16
|
||||
attn_implementation: flash_attention_2
|
||||
# Data training arguments
|
||||
dataset_name: open-r1/verifiable-coding-problems-python_decontaminated-tested
|
||||
dataset_prompt_column: problem
|
||||
|
||||
system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. Respond in the following format: <think>\n...\n</think>\n<answer>\n...\n</answer>"
|
||||
|
||||
# GRPO trainer config
|
||||
callbacks:
|
||||
- push_to_hub_revision
|
||||
benchmarks:
|
||||
- lcb_v4
|
||||
beta: 0.000
|
||||
bf16: true
|
||||
do_eval: false
|
||||
eval_strategy: "no"
|
||||
use_vllm: true
|
||||
vllm_device: auto
|
||||
vllm_gpu_memory_utilization: 0.7
|
||||
do_eval: false
|
||||
gradient_accumulation_steps: 16
|
||||
gradient_checkpointing: true
|
||||
gradient_checkpointing_kwargs:
|
||||
use_reentrant: false
|
||||
hub_model_id: open-r1/Qwen2.5-7B-Instruct-GRPO
|
||||
hub_model_revision: v05.18
|
||||
hub_strategy: every_save
|
||||
learning_rate: 1.0e-06
|
||||
log_completions: true
|
||||
log_level: info
|
||||
logging_first_step: true
|
||||
logging_steps: 1
|
||||
logging_strategy: steps
|
||||
lr_scheduler_type: constant_with_warmup
|
||||
max_grad_norm: 0.2
|
||||
max_prompt_length: 1024
|
||||
max_completion_length: 4096
|
||||
max_steps: -1
|
||||
num_generations: 16
|
||||
num_iterations: 4
|
||||
num_train_epochs: 1.0
|
||||
output_dir: data/Qwen2.5-7B-Instruct-GRPO_v05.18
|
||||
overwrite_output_dir: true
|
||||
per_device_train_batch_size: 8
|
||||
push_to_hub: true
|
||||
report_to:
|
||||
- wandb
|
||||
reward_funcs:
|
||||
- binary_code
|
||||
- code_format
|
||||
e2b_router_url: "ip-10-53-85-124:8000"
|
||||
reward_weights:
|
||||
- 1.0
|
||||
- 0.1
|
||||
save_strategy: "steps"
|
||||
save_steps: 0.05
|
||||
save_total_limit: 1
|
||||
seed: 42
|
||||
temperature: 1.0
|
||||
wandb_entity: huggingface
|
||||
wandb_project: open-r1
|
||||
warmup_ratio: 0.1
|
||||
parallel_code_exec_per_proc: 10
|
||||
69
recipes/Qwen2.5-7B-Instruct/grpo/config_v05.20.yaml
Normal file
69
recipes/Qwen2.5-7B-Instruct/grpo/config_v05.20.yaml
Normal file
|
|
@ -0,0 +1,69 @@
|
|||
# Model arguments
|
||||
model_name_or_path: Qwen/Qwen2.5-7B-Instruct
|
||||
model_revision: main
|
||||
torch_dtype: bfloat16
|
||||
attn_implementation: flash_attention_2
|
||||
# Data training arguments
|
||||
dataset_name: open-r1/verifiable-coding-problems-python_decontaminated-tested
|
||||
dataset_prompt_column: problem
|
||||
|
||||
system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. Respond in the following format: <think>\n...\n</think>\n<answer>\n...\n</answer>"
|
||||
|
||||
# GRPO trainer config
|
||||
callbacks:
|
||||
- push_to_hub_revision
|
||||
benchmarks:
|
||||
- lcb_v4
|
||||
beta: 0.000
|
||||
bf16: true
|
||||
do_eval: false
|
||||
eval_strategy: "no"
|
||||
use_vllm: true
|
||||
vllm_device: auto
|
||||
vllm_gpu_memory_utilization: 0.7
|
||||
do_eval: false
|
||||
gradient_accumulation_steps: 16
|
||||
gradient_checkpointing: true
|
||||
gradient_checkpointing_kwargs:
|
||||
use_reentrant: false
|
||||
hub_model_id: open-r1/Qwen2.5-7B-Instruct-GRPO
|
||||
hub_model_revision: v05.20
|
||||
hub_strategy: every_save
|
||||
learning_rate: 1.0e-06
|
||||
log_completions: true
|
||||
log_level: info
|
||||
logging_first_step: true
|
||||
logging_steps: 1
|
||||
logging_strategy: steps
|
||||
lr_scheduler_type: constant_with_warmup
|
||||
max_grad_norm: 0.2
|
||||
max_prompt_length: 1024
|
||||
max_completion_length: 4096
|
||||
max_steps: -1
|
||||
num_generations: 16
|
||||
num_iterations: 4
|
||||
num_train_epochs: 1.0
|
||||
output_dir: data/Qwen2.5-7B-Instruct-GRPO_v05.20
|
||||
overwrite_output_dir: true
|
||||
per_device_train_batch_size: 8
|
||||
push_to_hub: true
|
||||
report_to:
|
||||
- wandb
|
||||
reward_funcs:
|
||||
- binary_code
|
||||
- code_format
|
||||
e2b_router_url: ip-10-53-86-47:8000
|
||||
reward_weights:
|
||||
- 1.0
|
||||
- 0.1
|
||||
save_strategy: "steps"
|
||||
save_steps: 0.1
|
||||
save_total_limit: 1
|
||||
seed: 42
|
||||
temperature: 0.7
|
||||
wandb_entity: huggingface
|
||||
wandb_project: open-r1
|
||||
warmup_ratio: 0.1
|
||||
parallel_code_exec_per_proc: 10
|
||||
|
||||
mask_truncated_completions: true
|
||||
70
recipes/Qwen2.5-7B-Instruct/grpo/config_v05.30.yaml
Normal file
70
recipes/Qwen2.5-7B-Instruct/grpo/config_v05.30.yaml
Normal file
|
|
@ -0,0 +1,70 @@
|
|||
# Model arguments
|
||||
model_name_or_path: Qwen/Qwen2.5-7B-Instruct
|
||||
model_revision: main
|
||||
torch_dtype: bfloat16
|
||||
attn_implementation: flash_attention_2
|
||||
# Data training arguments
|
||||
dataset_name: open-r1/verifiable-coding-problems-python_decontaminated-tested-shuffled
|
||||
dataset_prompt_column: problem
|
||||
|
||||
system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. Respond in the following format: <think>\n...\n</think>\n<answer>\n...\n</answer>"
|
||||
|
||||
# GRPO trainer config
|
||||
callbacks:
|
||||
- push_to_hub_revision
|
||||
benchmarks:
|
||||
- lcb_v4
|
||||
beta: 0.000
|
||||
bf16: true
|
||||
do_eval: false
|
||||
eval_strategy: "no"
|
||||
use_vllm: true
|
||||
vllm_device: auto
|
||||
vllm_gpu_memory_utilization: 0.7
|
||||
do_eval: false
|
||||
gradient_accumulation_steps: 16
|
||||
gradient_checkpointing: true
|
||||
gradient_checkpointing_kwargs:
|
||||
use_reentrant: false
|
||||
hub_model_id: open-r1/Qwen2.5-7B-Instruct-GRPO
|
||||
hub_model_revision: v05.30
|
||||
hub_strategy: every_save
|
||||
learning_rate: 1.0e-06
|
||||
log_completions: true
|
||||
log_level: info
|
||||
logging_first_step: true
|
||||
logging_steps: 1
|
||||
logging_strategy: steps
|
||||
lr_scheduler_type: constant_with_warmup
|
||||
max_grad_norm: 0.2
|
||||
max_prompt_length: 1024
|
||||
max_completion_length: 4096
|
||||
max_steps: -1
|
||||
num_generations: 16
|
||||
num_iterations: 4
|
||||
num_train_epochs: 1.0
|
||||
output_dir: data/Qwen2.5-7B-Instruct-GRPO_v05.30
|
||||
overwrite_output_dir: true
|
||||
per_device_train_batch_size: 8
|
||||
push_to_hub: true
|
||||
report_to:
|
||||
- wandb
|
||||
reward_funcs:
|
||||
- binary_code
|
||||
- code_format
|
||||
e2b_router_url: ip-10-53-86-47:8000
|
||||
reward_weights:
|
||||
- 1.0
|
||||
- 0.1
|
||||
save_strategy: "steps"
|
||||
save_steps: 0.1
|
||||
save_total_limit: 1
|
||||
seed: 42
|
||||
temperature: 0.7
|
||||
wandb_entity: huggingface
|
||||
wandb_project: open-r1
|
||||
warmup_ratio: 0.1
|
||||
parallel_code_exec_per_proc: 10
|
||||
|
||||
mask_truncated_completions: true
|
||||
loss_type: dr_grpo
|
||||
64
recipes/Qwen2.5-Coder-7B-Instruct/grpo/config_v02.00.yaml
Normal file
64
recipes/Qwen2.5-Coder-7B-Instruct/grpo/config_v02.00.yaml
Normal file
|
|
@ -0,0 +1,64 @@
|
|||
# Model arguments
|
||||
model_name_or_path: Qwen/Qwen2.5-Coder-7B-Instruct
|
||||
model_revision: main
|
||||
torch_dtype: bfloat16
|
||||
attn_implementation: flash_attention_2
|
||||
# Data training arguments
|
||||
dataset_name: open-r1/verifiable-coding-problems-python_decontaminated-tested
|
||||
|
||||
system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. Respond in the following format: <think>\n...\n</think>\n<answer>\n...\n</answer>"
|
||||
|
||||
# GRPO trainer config
|
||||
callbacks:
|
||||
- push_to_hub_revision
|
||||
benchmarks:
|
||||
- math_500
|
||||
- aime24
|
||||
beta: 0.001
|
||||
bf16: true
|
||||
do_eval: false
|
||||
eval_strategy: "no"
|
||||
use_vllm: true
|
||||
vllm_device: auto
|
||||
vllm_gpu_memory_utilization: 0.7
|
||||
do_eval: false
|
||||
gradient_accumulation_steps: 14
|
||||
gradient_checkpointing: true
|
||||
gradient_checkpointing_kwargs:
|
||||
use_reentrant: false
|
||||
hub_model_id: open-r1/Qwen2.5-Coder-7B-Instruct-GRPO
|
||||
hub_model_revision: v02.00
|
||||
hub_strategy: every_save
|
||||
learning_rate: 1.0e-06
|
||||
log_completions: true
|
||||
log_level: info
|
||||
logging_first_step: true
|
||||
logging_steps: 1
|
||||
logging_strategy: steps
|
||||
lr_scheduler_type: constant_with_warmup
|
||||
max_grad_norm: 0.2
|
||||
max_prompt_length: 1024
|
||||
max_completion_length: 4096
|
||||
max_steps: -1
|
||||
num_generations: 14
|
||||
num_train_epochs: 0.1
|
||||
output_dir: data/Qwen2.5-Coder-7B-Instruct-GRPO_v02.00
|
||||
overwrite_output_dir: true
|
||||
per_device_train_batch_size: 4
|
||||
push_to_hub: true
|
||||
report_to:
|
||||
- wandb
|
||||
reward_funcs:
|
||||
- code
|
||||
- code_format
|
||||
reward_weights:
|
||||
- 1.0
|
||||
- 0.1
|
||||
save_strategy: "steps"
|
||||
save_steps: 0.1
|
||||
save_total_limit: 1
|
||||
seed: 42
|
||||
temperature: 0.7
|
||||
wandb_entity: huggingface
|
||||
wandb_project: open-r1
|
||||
warmup_ratio: 0.1
|
||||
64
recipes/Qwen2.5-Coder-7B-Instruct/grpo/config_v03.00.yaml
Normal file
64
recipes/Qwen2.5-Coder-7B-Instruct/grpo/config_v03.00.yaml
Normal file
|
|
@ -0,0 +1,64 @@
|
|||
# Model arguments
|
||||
model_name_or_path: Qwen/Qwen2.5-7B-Instruct
|
||||
model_revision: main
|
||||
torch_dtype: bfloat16
|
||||
attn_implementation: flash_attention_2
|
||||
# Data training arguments
|
||||
dataset_name: open-r1/verifiable-coding-problems-python_decontaminated-tested
|
||||
|
||||
system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. Respond in the following format: <think>\n...\n</think>\n<answer>\n...\n</answer>"
|
||||
|
||||
# GRPO trainer config
|
||||
callbacks:
|
||||
- push_to_hub_revision
|
||||
benchmarks:
|
||||
- math_500
|
||||
- aime24
|
||||
beta: 0.001
|
||||
bf16: true
|
||||
do_eval: false
|
||||
eval_strategy: "no"
|
||||
use_vllm: true
|
||||
vllm_device: auto
|
||||
vllm_gpu_memory_utilization: 0.7
|
||||
do_eval: false
|
||||
gradient_accumulation_steps: 14
|
||||
gradient_checkpointing: true
|
||||
gradient_checkpointing_kwargs:
|
||||
use_reentrant: false
|
||||
hub_model_id: open-r1/Qwen2.5-Coder-7B-Instruct-GRPO
|
||||
hub_model_revision: v03.00
|
||||
hub_strategy: every_save
|
||||
learning_rate: 1.0e-06
|
||||
log_completions: true
|
||||
log_level: info
|
||||
logging_first_step: true
|
||||
logging_steps: 1
|
||||
logging_strategy: steps
|
||||
lr_scheduler_type: constant_with_warmup
|
||||
max_grad_norm: 0.2
|
||||
max_prompt_length: 1024
|
||||
max_completion_length: 4096
|
||||
max_steps: -1
|
||||
num_generations: 14
|
||||
num_train_epochs: 0.1
|
||||
output_dir: data/Qwen2.5-Coder-7B-Instruct-GRPO_v03.00
|
||||
overwrite_output_dir: true
|
||||
per_device_train_batch_size: 4
|
||||
push_to_hub: true
|
||||
report_to:
|
||||
- wandb
|
||||
reward_funcs:
|
||||
- code
|
||||
- code_format
|
||||
reward_weights:
|
||||
- 1.0
|
||||
- 0.1
|
||||
save_strategy: "steps"
|
||||
save_steps: 0.1
|
||||
save_total_limit: 1
|
||||
seed: 42
|
||||
temperature: 0.7
|
||||
wandb_entity: huggingface
|
||||
wandb_project: open-r1
|
||||
warmup_ratio: 0.1
|
||||
|
|
@ -387,14 +387,30 @@ def extract_code(completion: str, language: str = "python") -> str:
|
|||
def binary_code_reward(completions, num_parallel: int = 2, e2b_router_url=None, **kwargs) -> list[float]:
    """Turn the scores produced by ``code_reward`` into all-or-nothing rewards.

    Args:
        completions: Forwarded unchanged to ``code_reward``.
        num_parallel: Forwarded to ``code_reward`` as ``num_parallel``.
        e2b_router_url: Forwarded to ``code_reward`` as ``e2b_router_url``.
        **kwargs: Any extra keyword arguments, forwarded to ``code_reward``.

    Returns:
        One entry per score returned by ``code_reward``: ``1.0`` when the score
        is strictly greater than ``BINARY_THRESHOLD``, ``0.0`` otherwise, and
        ``None`` wherever ``code_reward`` itself returned ``None``.
    """
    BINARY_THRESHOLD = 0.99

    raw_scores = code_reward(completions, num_parallel=num_parallel, e2b_router_url=e2b_router_url, **kwargs)

    # ``None`` scores are passed through untouched rather than coerced to 0.0.
    return [
        None if score is None else (1.0 if score > BINARY_THRESHOLD else 0.0)
        for score in raw_scores
    ]
|
||||
|
||||
def weighted_binary_code_reward(completions, num_parallel: int = 2, e2b_router_url=None, **kwargs) -> list[float]:
    """Combine a pass/fail reward with a down-weighted copy of the raw code score.

    For every score returned by ``code_reward`` the result is
    ``binary + NON_BINARY_WEIGHT * score``, where ``binary`` is ``1.0`` when the
    score is strictly greater than ``BINARY_THRESHOLD`` and ``0.0`` otherwise.

    Args:
        completions: Forwarded unchanged to ``code_reward``.
        num_parallel: Forwarded to ``code_reward`` as ``num_parallel``.
        e2b_router_url: Forwarded to ``code_reward`` as ``e2b_router_url``.
        **kwargs: Any extra keyword arguments, forwarded to ``code_reward``.

    Returns:
        One combined reward per score, with ``None`` kept wherever
        ``code_reward`` returned ``None``.
    """
    BINARY_THRESHOLD = 0.99
    NON_BINARY_WEIGHT = 0.1

    def _combine(score):
        # ``None`` scores are passed through untouched.
        if score is None:
            return None
        passed = 1.0 if score > BINARY_THRESHOLD else 0.0
        return passed + NON_BINARY_WEIGHT * score

    raw_scores = code_reward(completions, num_parallel=num_parallel, e2b_router_url=e2b_router_url, **kwargs)
    return [_combine(score) for score in raw_scores]
|
||||
|
||||
|
||||
|
|
@ -596,6 +612,14 @@ def get_reward_funcs(script_args) -> list[Callable]:
|
|||
),
|
||||
binary_code_reward,
|
||||
),
|
||||
"weighted_binary_code_reward": update_wrapper(
|
||||
partial(
|
||||
weighted_binary_code_reward,
|
||||
num_parallel=script_args.parallel_code_exec_per_proc,
|
||||
e2b_router_url=script_args.e2b_router_url,
|
||||
),
|
||||
weighted_binary_code_reward,
|
||||
),
|
||||
"ioi_code": update_wrapper(
|
||||
partial(ioi_code_reward, test_batch_size=script_args.code_eval_test_batch_size), ioi_code_reward
|
||||
),
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue