Compare commits

...

8 commits

Author SHA1 Message Date
Lewis Tunstall
1bab913b88 Merge branch 'main' into code-grpo-configs 2025-04-26 12:01:21 +02:00
edbeeching
b3c7971ebb Merge branch 'main' into code-grpo-configs 2025-04-18 11:41:27 +00:00
edbeeching
9b6c9704da update trl version in setup 2025-04-18 11:38:38 +00:00
edbeeching
0662164248 add new reward, configs 2025-04-18 11:38:17 +00:00
edbeeching
7ddc0282cd save wip 2025-04-17 12:08:12 +00:00
edbeeching
243db805c2 bin reward 2025-04-11 15:38:22 +00:00
edbeeching
3684ab2575 Merge branch 'main' into code-grpo-configs 2025-04-11 15:12:50 +00:00
edbeeching
98cbed7596 add WIP code GRPO configs 2025-04-11 09:26:05 +00:00
33 changed files with 2057 additions and 6 deletions

View file

@ -0,0 +1,67 @@
# Model arguments
model_name_or_path: deepseek-ai/DeepSeek-R1-Distill-Qwen-7B
model_revision: main
torch_dtype: bfloat16
attn_implementation: flash_attention_2
# Data training arguments
dataset_name: open-r1/verifiable-coding-problems-python_decontaminated-tested
dataset_prompt_column: problem
system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. Respond in the following format: <think>\n...\n</think>\n<answer>\n...\n</answer>"
# GRPO trainer config
callbacks:
- push_to_hub_revision
benchmarks:
- lcb_v4
beta: 0.000
bf16: true
do_eval: false
eval_strategy: "no"
use_vllm: true
vllm_device: auto
vllm_gpu_memory_utilization: 0.7
do_eval: false
gradient_accumulation_steps: 64
gradient_checkpointing: true
gradient_checkpointing_kwargs:
use_reentrant: false
hub_model_id: open-r1/DeepSeek-R1-Distill-Qwen-7B-GRPO
hub_model_revision: v03.00
hub_strategy: every_save
learning_rate: 1.0e-06
log_completions: true
log_level: info
logging_first_step: true
logging_steps: 1
logging_strategy: steps
lr_scheduler_type: constant_with_warmup
max_grad_norm: 0.2
max_prompt_length: 1024
max_completion_length: 24000
max_steps: -1
num_generations: 8
num_iterations: 4
num_train_epochs: 1.0
output_dir: data/DeepSeek-R1-Distill-Qwen-7B_v03.00
overwrite_output_dir: true
per_device_train_batch_size: 1
push_to_hub: true
report_to:
- wandb
reward_funcs:
- binary_code
- code_format
e2b_router_url: "ip-10-53-85-124:8000"
reward_weights:
- 1.0
- 0.1
save_strategy: "steps"
save_steps: 0.05
save_total_limit: 1
seed: 42
temperature: 0.7
wandb_entity: huggingface
wandb_project: open-r1
warmup_ratio: 0.1
parallel_code_exec_per_proc: 10

View file

@ -0,0 +1,66 @@
# Model arguments
model_name_or_path: deepseek-ai/DeepSeek-R1-Distill-Qwen-7B
model_revision: main
torch_dtype: bfloat16
attn_implementation: flash_attention_2
# Data training arguments
dataset_name: open-r1/verifiable-coding-problems-python_decontaminated-tested-shuffled
dataset_prompt_column: problem
# system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. Respond in the following format: <think>\n...\n</think>\n<answer>\n...\n</answer>"
# GRPO trainer config
callbacks:
- push_to_hub_revision
benchmarks:
- lcb_v4
beta: 0.000
bf16: true
do_eval: false
eval_strategy: "no"
use_vllm: true
vllm_device: auto
vllm_gpu_memory_utilization: 0.7
do_eval: false
gradient_accumulation_steps: 128
gradient_checkpointing: true
gradient_checkpointing_kwargs:
use_reentrant: false
hub_model_id: open-r1/DeepSeek-R1-Distill-Qwen-7B-GRPO
hub_model_revision: v04.00
hub_strategy: every_save
learning_rate: 1.0e-06
log_completions: true
log_level: info
logging_first_step: true
logging_steps: 1
logging_strategy: steps
lr_scheduler_type: constant_with_warmup
max_grad_norm: 0.2
max_prompt_length: 1024
max_completion_length: 24000
max_steps: -1
num_generations: 8
num_iterations: 1
num_train_epochs: 1.0
output_dir: data/DeepSeek-R1-Distill-Qwen-7B_v04.00
overwrite_output_dir: true
per_device_train_batch_size: 1
push_to_hub: true
report_to:
- wandb
reward_funcs:
- binary_code
e2b_router_url: ip-10-53-86-47:8000
reward_weights:
- 1.0
save_strategy: "steps"
save_steps: 0.05
save_total_limit: 1
seed: 42
temperature: 0.7
wandb_entity: huggingface
wandb_project: open-r1
warmup_ratio: 0.1
mask_truncated_completions: true
loss_type: dr_grpo

View file

@ -0,0 +1,66 @@
# Model arguments
model_name_or_path: deepseek-ai/DeepSeek-R1-Distill-Qwen-7B
model_revision: main
torch_dtype: bfloat16
attn_implementation: flash_attention_2
# Data training arguments
dataset_name: open-r1/verifiable-coding-problems-python_decontaminated-tested-shuffled
dataset_prompt_column: problem
# system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. Respond in the following format: <think>\n...\n</think>\n<answer>\n...\n</answer>"
# GRPO trainer config
callbacks:
- push_to_hub_revision
benchmarks:
- lcb_v4
beta: 0.000
bf16: true
do_eval: false
eval_strategy: "no"
use_vllm: true
vllm_device: auto
vllm_gpu_memory_utilization: 0.7
do_eval: false
gradient_accumulation_steps: 16
gradient_checkpointing: true
gradient_checkpointing_kwargs:
use_reentrant: false
hub_model_id: open-r1/DeepSeek-R1-Distill-Qwen-7B-GRPO
hub_model_revision: v05.00
hub_strategy: every_save
learning_rate: 1.0e-06
log_completions: true
log_level: info
logging_first_step: true
logging_steps: 1
logging_strategy: steps
lr_scheduler_type: constant_with_warmup
max_grad_norm: 0.2
max_prompt_length: 1024
max_completion_length: 24000
max_steps: -1
num_generations: 16
num_iterations: 1
num_train_epochs: 1.0
output_dir: data/DeepSeek-R1-Distill-Qwen-7B_v05.00
overwrite_output_dir: true
per_device_train_batch_size: 1
push_to_hub: true
report_to:
- wandb
reward_funcs:
- binary_code
e2b_router_url: ip-10-53-86-47:8000
reward_weights:
- 1.0
save_strategy: "steps"
save_steps: 0.05
save_total_limit: 1
seed: 42
temperature: 0.7
wandb_entity: huggingface
wandb_project: open-r1
warmup_ratio: 0.1
mask_truncated_completions: true
loss_type: dr_grpo

View file

@ -0,0 +1,66 @@
# Model arguments
model_name_or_path: deepseek-ai/DeepSeek-R1-Distill-Qwen-7B
model_revision: main
torch_dtype: bfloat16
attn_implementation: flash_attention_2
# Data training arguments
dataset_name: open-r1/verifiable-coding-problems-python_decontaminated-tested-shuffled
dataset_prompt_column: problem
# system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. Respond in the following format: <think>\n...\n</think>\n<answer>\n...\n</answer>"
# GRPO trainer config
callbacks:
- push_to_hub_revision
benchmarks:
- lcb_v4
beta: 0.000
bf16: true
do_eval: false
eval_strategy: "no"
use_vllm: true
vllm_device: auto
vllm_gpu_memory_utilization: 0.7
do_eval: false
gradient_accumulation_steps: 16
gradient_checkpointing: true
gradient_checkpointing_kwargs:
use_reentrant: false
hub_model_id: open-r1/DeepSeek-R1-Distill-Qwen-7B-GRPO
hub_model_revision: v06.00
hub_strategy: every_save
learning_rate: 1.0e-06
log_completions: true
log_level: info
logging_first_step: true
logging_steps: 1
logging_strategy: steps
lr_scheduler_type: constant_with_warmup
max_grad_norm: 0.2
max_prompt_length: 1024
max_completion_length: 24000
max_steps: -1
num_generations: 16
num_iterations: 1
num_train_epochs: 1.0
output_dir: data/DeepSeek-R1-Distill-Qwen-7B_v06.00
overwrite_output_dir: true
per_device_train_batch_size: 1
push_to_hub: true
report_to:
- wandb
reward_funcs:
- weighted_binary_code_reward
e2b_router_url: ip-10-53-86-47:8000
reward_weights:
- 1.0
save_strategy: "steps"
save_steps: 0.05
save_total_limit: 1
seed: 42
temperature: 0.7
wandb_entity: huggingface
wandb_project: open-r1
warmup_ratio: 0.1
mask_truncated_completions: true
loss_type: dr_grpo

View file

@ -14,7 +14,7 @@ dataset_num_proc: 48
bf16: true
do_eval: false
eval_strategy: 'no'
gradient_accumulation_steps: 8
gradient_accumulation_steps: 2
gradient_checkpointing: true
gradient_checkpointing_kwargs:
use_reentrant: false
@ -27,20 +27,20 @@ logging_strategy: steps
lr_scheduler_type: cosine_with_min_lr
lr_scheduler_kwargs:
min_lr_rate: 0.1
packing: false
packing: true
max_grad_norm: 0.2
max_length: 32768
max_length: 16000
max_steps: -1
num_train_epochs: 10
output_dir: data/OlympicCoder-7B
overwrite_output_dir: true
per_device_eval_batch_size: 1
per_device_train_batch_size: 2
per_device_train_batch_size: 1
push_to_hub: true
report_to:
- wandb
save_strategy: epoch
save_total_limit: 1
seed: 42
use_liger_kernel: true
use_liger_kernel: false
warmup_ratio: 0.03

View file

@ -0,0 +1,58 @@
# Model arguments
model_name_or_path: Qwen/Qwen2.5-7B-Instruct
model_revision: main
torch_dtype: bfloat16
attn_implementation: flash_attention_2
# Data training arguments
dataset_name: open-r1/OpenR1-Math-cn_k12-86k
system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. Respond in the following format: <think>\n...\n</think>\n<answer>\n...\n</answer>"
# GRPO trainer config
beta: 0.001
bf16: true
do_eval: false
eval_strategy: "no"
use_vllm: true
do_eval: false
gradient_accumulation_steps: 64
gradient_checkpointing: true
gradient_checkpointing_kwargs:
use_reentrant: false
hub_model_id: Qwen2.5-7B-Instruct-GRPO
hub_model_revision: v01.05
hub_strategy: every_save
learning_rate: 1.0e-06
log_completions: true
log_level: info
logging_first_step: true
logging_steps: 1
logging_strategy: steps
lr_scheduler_type: constant_with_warmup
max_grad_norm: 0.2
max_prompt_length: 1024
max_completion_length: 4096
max_steps: -1
num_generations: 8
num_train_epochs: 1
output_dir: data/Qwen2.5-7B-Instruct-GRPO_v01.05
overwrite_output_dir: true
per_device_train_batch_size: 1
push_to_hub: true
use_liger_kernel: true
report_to:
- wandb
reward_funcs:
- accuracy
- format
reward_weights:
- 1.0
- 0.2
save_strategy: "steps"
save_steps: 0.1
save_total_limit: 1
seed: 42
temperature: 0.7
warmup_ratio: 0.1

View file

@ -0,0 +1,60 @@
# Model arguments
model_name_or_path: Qwen/Qwen2.5-7B-Instruct
model_revision: main
torch_dtype: bfloat16
attn_implementation: flash_attention_2
# Data training arguments
dataset_name: open-r1/OpenR1-Math-cn_k12-86k
system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. Respond in the following format: <think>\n...\n</think>\n<answer>\n...\n</answer>"
# GRPO trainer config
beta: 0.001
bf16: true
do_eval: false
eval_strategy: "no"
use_vllm: true
do_eval: false
gradient_accumulation_steps: 16
gradient_checkpointing: true
gradient_checkpointing_kwargs:
use_reentrant: false
hub_model_id: Qwen2.5-7B-Instruct-GRPO
hub_model_revision: v01.06
hub_strategy: every_save
learning_rate: 1.0e-06
log_completions: true
log_level: info
logging_first_step: true
logging_steps: 1
logging_strategy: steps
lr_scheduler_type: constant_with_warmup
max_grad_norm: 0.2
max_prompt_length: 1024
max_completion_length: 4096
max_steps: -1
num_generations: 16
num_train_epochs: 1
output_dir: data/Qwen2.5-7B-Instruct-GRPO_v01.06
overwrite_output_dir: true
per_device_train_batch_size: 4
push_to_hub: true
use_liger_kernel: true
report_to:
- wandb
reward_funcs:
- accuracy
- format
reward_weights:
- 1.0
- 0.2
save_strategy: "steps"
save_steps: 0.1
save_total_limit: 1
seed: 42
temperature: 0.7
warmup_ratio: 0.1
wandb_entity: huggingface
wandb_project: open-r1

View file

@ -0,0 +1,61 @@
# Model arguments
model_name_or_path: Qwen/Qwen2.5-7B-Instruct
model_revision: main
torch_dtype: bfloat16
attn_implementation: flash_attention_2
# Data training arguments
dataset_name: open-r1/OpenR1-Math-cn_k12-86k
system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. Respond in the following format: <think>\n...\n</think>\n<answer>\n...\n</answer>"
# GRPO trainer config
beta: 0.001
bf16: true
do_eval: false
eval_strategy: "no"
use_vllm: true
do_eval: false
gradient_accumulation_steps: 16
gradient_checkpointing: true
gradient_checkpointing_kwargs:
use_reentrant: false
hub_model_id: Qwen2.5-7B-Instruct-GRPO
hub_model_revision: v01.07
hub_strategy: every_save
learning_rate: 1.0e-06
log_completions: true
log_level: info
logging_first_step: true
logging_steps: 1
logging_strategy: steps
lr_scheduler_type: constant_with_warmup
max_grad_norm: 0.2
max_prompt_length: 1024
max_completion_length: 4096
max_steps: -1
num_generations: 16
num_train_epochs: 1
output_dir: data/Qwen2.5-7B-Instruct-GRPO_v01.07
overwrite_output_dir: true
per_device_train_batch_size: 4
push_to_hub: true
use_liger_kernel: true
report_to:
- wandb
reward_funcs:
- accuracy
- format
reward_weights:
- 1.0
- 0.2
save_strategy: "steps"
save_steps: 0.1
save_total_limit: 1
seed: 42
temperature: 0.7
warmup_ratio: 0.1
wandb_entity: huggingface
wandb_project: open-r1
scale_rewards: false

View file

@ -0,0 +1,62 @@
# Model arguments
model_name_or_path: Qwen/Qwen2.5-7B-Instruct
model_revision: main
torch_dtype: bfloat16
attn_implementation: flash_attention_2
# Data training arguments
dataset_name: open-r1/OpenR1-Math-cn_k12-86k
system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. Respond in the following format: <think>\n...\n</think>\n<answer>\n...\n</answer>"
# GRPO trainer config
beta: 0.0
bf16: true
do_eval: false
eval_strategy: "no"
use_vllm: true
do_eval: false
gradient_accumulation_steps: 16
gradient_checkpointing: true
gradient_checkpointing_kwargs:
use_reentrant: false
hub_model_id: Qwen2.5-7B-Instruct-GRPO
hub_model_revision: v01.08
hub_strategy: every_save
learning_rate: 1.0e-06
log_completions: true
log_level: info
logging_first_step: true
logging_steps: 1
logging_strategy: steps
lr_scheduler_type: constant_with_warmup
max_grad_norm: 0.2
max_prompt_length: 1024
max_completion_length: 4096
max_steps: -1
num_generations: 16
num_train_epochs: 1
output_dir: data/Qwen2.5-7B-Instruct-GRPO_v01.08
overwrite_output_dir: true
per_device_train_batch_size: 8
push_to_hub: true
use_liger_kernel: true
report_to:
- wandb
reward_funcs:
- accuracy
- format
reward_weights:
- 1.0
- 0.2
save_strategy: "steps"
save_steps: 0.1
save_total_limit: 1
seed: 42
temperature: 0.7
warmup_ratio: 0.1
wandb_entity: huggingface
wandb_project: open-r1
num_iterations: 4
scale_rewards: false

View file

@ -0,0 +1,62 @@
# Model arguments
model_name_or_path: Qwen/Qwen2.5-7B-Instruct
model_revision: main
torch_dtype: bfloat16
attn_implementation: flash_attention_2
# Data training arguments
dataset_name: open-r1/OpenR1-Math-cn_k12-86k
system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. Respond in the following format: <think>\n...\n</think>\n<answer>\n...\n</answer>"
# GRPO trainer config
beta: 0.0
bf16: true
do_eval: false
eval_strategy: "no"
use_vllm: true
do_eval: false
gradient_accumulation_steps: 16
gradient_checkpointing: true
gradient_checkpointing_kwargs:
use_reentrant: false
hub_model_id: Qwen2.5-7B-Instruct-GRPO
hub_model_revision: v01.09
hub_strategy: every_save
learning_rate: 1.0e-06
log_completions: true
log_level: info
logging_first_step: true
logging_steps: 1
logging_strategy: steps
lr_scheduler_type: constant_with_warmup
max_grad_norm: 0.2
max_prompt_length: 1024
max_completion_length: 4096
max_steps: -1
num_generations: 16
num_train_epochs: 1
output_dir: data/Qwen2.5-7B-Instruct-GRPO_v01.09
overwrite_output_dir: true
per_device_train_batch_size: 8
push_to_hub: true
use_liger_kernel: true
report_to:
- wandb
reward_funcs:
- accuracy
- format
reward_weights:
- 1.0
- 0.2
save_strategy: "steps"
save_steps: 0.1
save_total_limit: 1
seed: 42
temperature: 0.7
warmup_ratio: 0.1
wandb_entity: huggingface
wandb_project: open-r1
num_iterations: 4
scale_rewards: true

View file

@ -0,0 +1,65 @@
# Model arguments
model_name_or_path: Qwen/Qwen2.5-7B-Instruct
model_revision: main
torch_dtype: bfloat16
attn_implementation: flash_attention_2
# Data training arguments
dataset_name: open-r1/verifiable-coding-problems-python_decontaminated-tested
system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. Respond in the following format: <think>\n...\n</think>\n<answer>\n...\n</answer>"
# GRPO trainer config
callbacks:
- push_to_hub_revision
benchmarks:
- math_500
- aime24
beta: 0.000
bf16: true
do_eval: false
eval_strategy: "no"
use_vllm: true
vllm_device: auto
vllm_gpu_memory_utilization: 0.7
do_eval: false
gradient_accumulation_steps: 16
gradient_checkpointing: true
gradient_checkpointing_kwargs:
use_reentrant: false
hub_model_id: open-r1/Qwen2.5-7B-Instruct-GRPO
hub_model_revision: v05.00
hub_strategy: every_save
learning_rate: 1.0e-06
log_completions: true
log_level: info
logging_first_step: true
logging_steps: 1
logging_strategy: steps
lr_scheduler_type: constant_with_warmup
max_grad_norm: 0.2
max_prompt_length: 1024
max_completion_length: 4096
max_steps: -1
num_generations: 16
num_iterations: 4
num_train_epochs: 1.0
output_dir: data/Qwen2.5-7B-Instruct-GRPO_v05.00
overwrite_output_dir: true
per_device_train_batch_size: 8
push_to_hub: true
report_to:
- wandb
reward_funcs:
- binary_code
- code_format
reward_weights:
- 1.0
- 0.1
save_strategy: "steps"
save_steps: 0.1
save_total_limit: 1
seed: 42
temperature: 0.7
wandb_entity: huggingface
wandb_project: open-r1
warmup_ratio: 0.1

View file

@ -0,0 +1,65 @@
# Model arguments
model_name_or_path: Qwen/Qwen2.5-7B-Instruct
model_revision: main
torch_dtype: bfloat16
attn_implementation: flash_attention_2
# Data training arguments
dataset_name: open-r1/verifiable-coding-problems-python_decontaminated-tested
system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. Respond in the following format: <think>\n...\n</think>\n<answer>\n...\n</answer>"
# GRPO trainer config
callbacks:
- push_to_hub_revision
benchmarks:
- lcb_v4
beta: 0.000
bf16: true
do_eval: false
eval_strategy: "no"
use_vllm: true
vllm_device: auto
vllm_gpu_memory_utilization: 0.7
do_eval: false
gradient_accumulation_steps: 16
gradient_checkpointing: true
gradient_checkpointing_kwargs:
use_reentrant: false
hub_model_id: open-r1/Qwen2.5-7B-Instruct-GRPO
hub_model_revision: v05.02
hub_strategy: every_save
learning_rate: 1.0e-06
log_completions: true
log_level: info
logging_first_step: true
logging_steps: 1
logging_strategy: steps
lr_scheduler_type: constant_with_warmup
max_grad_norm: 0.2
max_prompt_length: 1024
max_completion_length: 4096
max_steps: -1
num_generations: 16
num_iterations: 4
num_train_epochs: 1.0
output_dir: data/Qwen2.5-7B-Instruct-GRPO_v05.02
overwrite_output_dir: true
per_device_train_batch_size: 8
push_to_hub: true
report_to:
- wandb
reward_funcs:
- code
- code_format
reward_weights:
- 1.0
- 0.1
save_strategy: "steps"
save_steps: 0.05
save_total_limit: 1
seed: 42
temperature: 0.7
wandb_entity: huggingface
wandb_project: open-r1
warmup_ratio: 0.1
parallel_code_exec_per_proc: 10

View file

@ -0,0 +1,65 @@
# Model arguments
model_name_or_path: Qwen/Qwen2.5-7B-Instruct
model_revision: main
torch_dtype: bfloat16
attn_implementation: flash_attention_2
# Data training arguments
dataset_name: open-r1/verifiable-coding-problems-python_decontaminated-tested
system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. Respond in the following format: <think>\n...\n</think>\n<answer>\n...\n</answer>"
# GRPO trainer config
callbacks:
- push_to_hub_revision
benchmarks:
- lcb_v4
beta: 0.000
bf16: true
do_eval: false
eval_strategy: "no"
use_vllm: true
vllm_device: auto
vllm_gpu_memory_utilization: 0.7
do_eval: false
gradient_accumulation_steps: 16
gradient_checkpointing: true
gradient_checkpointing_kwargs:
use_reentrant: false
hub_model_id: open-r1/Qwen2.5-7B-Instruct-GRPO
hub_model_revision: v05.03
hub_strategy: every_save
learning_rate: 1.0e-06
log_completions: true
log_level: info
logging_first_step: true
logging_steps: 1
logging_strategy: steps
lr_scheduler_type: constant_with_warmup
max_grad_norm: 0.2
max_prompt_length: 1024
max_completion_length: 4096
max_steps: -1
num_generations: 16
num_iterations: 4
num_train_epochs: 1.0
output_dir: data/Qwen2.5-7B-Instruct-GRPO_v05.03
overwrite_output_dir: true
per_device_train_batch_size: 8
push_to_hub: true
report_to:
- wandb
reward_funcs:
- binary_code
- code_format
reward_weights:
- 1.0
- 0.1
save_strategy: "steps"
save_steps: 0.05
save_total_limit: 1
seed: 42
temperature: 0.7
wandb_entity: huggingface
wandb_project: open-r1
warmup_ratio: 0.1
parallel_code_exec_per_proc: 10

View file

@ -0,0 +1,65 @@
# Model arguments
model_name_or_path: Qwen/Qwen2.5-7B-Instruct
model_revision: main
torch_dtype: bfloat16
attn_implementation: flash_attention_2
# Data training arguments
dataset_name: open-r1/verifiable-coding-problems-python_decontaminated-tested
system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. Respond in the following format: <think>\n...\n</think>\n<answer>\n...\n</answer>"
# GRPO trainer config
callbacks:
- push_to_hub_revision
benchmarks:
- lcb_v4
beta: 0.000
bf16: true
do_eval: false
eval_strategy: "no"
use_vllm: true
vllm_device: auto
vllm_gpu_memory_utilization: 0.7
do_eval: false
gradient_accumulation_steps: 64
gradient_checkpointing: true
gradient_checkpointing_kwargs:
use_reentrant: false
hub_model_id: open-r1/Qwen2.5-7B-Instruct-GRPO
hub_model_revision: v05.04
hub_strategy: every_save
learning_rate: 1.0e-06
log_completions: true
log_level: info
logging_first_step: true
logging_steps: 1
logging_strategy: steps
lr_scheduler_type: constant_with_warmup
max_grad_norm: 0.2
max_prompt_length: 1024
max_completion_length: 4096
max_steps: -1
num_generations: 16
num_iterations: 4
num_train_epochs: 1.0
output_dir: data/Qwen2.5-7B-Instruct-GRPO_v05.04
overwrite_output_dir: true
per_device_train_batch_size: 8
push_to_hub: true
report_to:
- wandb
reward_funcs:
- binary_code
- code_format
reward_weights:
- 1.0
- 0.1
save_strategy: "steps"
save_steps: 0.05
save_total_limit: 1
seed: 42
temperature: 0.7
wandb_entity: huggingface
wandb_project: open-r1
warmup_ratio: 0.1
parallel_code_exec_per_proc: 10

View file

@ -0,0 +1,67 @@
# Model arguments
model_name_or_path: Qwen/Qwen2.5-7B-Instruct
model_revision: main
torch_dtype: bfloat16
attn_implementation: flash_attention_2
# Data training arguments
dataset_name: open-r1/verifiable-coding-problems-python_decontaminated-tested
system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. Respond in the following format: <think>\n...\n</think>\n<answer>\n...\n</answer>"
# GRPO trainer config
callbacks:
- push_to_hub_revision
benchmarks:
- lcb_v4
beta: 0.000
bf16: true
do_eval: false
eval_strategy: "no"
use_vllm: true
vllm_device: auto
vllm_gpu_memory_utilization: 0.7
do_eval: false
gradient_accumulation_steps: 16
gradient_checkpointing: true
gradient_checkpointing_kwargs:
use_reentrant: false
hub_model_id: open-r1/Qwen2.5-7B-Instruct-GRPO
hub_model_revision: v05.05
hub_strategy: every_save
learning_rate: 1.0e-06
log_completions: true
log_level: info
logging_first_step: true
logging_steps: 1
logging_strategy: steps
lr_scheduler_type: cosine_with_min_lr
lr_scheduler_kwargs:
min_lr_rate: 0.1
max_grad_norm: 0.2
max_prompt_length: 1024
max_completion_length: 4096
max_steps: -1
num_generations: 16
num_iterations: 4
num_train_epochs: 1.0
output_dir: data/Qwen2.5-7B-Instruct-GRPO_v05.05
overwrite_output_dir: true
per_device_train_batch_size: 8
push_to_hub: true
report_to:
- wandb
reward_funcs:
- binary_code
- code_format
reward_weights:
- 1.0
- 0.1
save_strategy: "steps"
save_steps: 0.05
save_total_limit: 1
seed: 42
temperature: 0.7
wandb_entity: huggingface
wandb_project: open-r1
warmup_ratio: 0.1
parallel_code_exec_per_proc: 10

View file

@ -0,0 +1,65 @@
# Model arguments
model_name_or_path: Qwen/Qwen2.5-7B-Instruct
model_revision: main
torch_dtype: bfloat16
attn_implementation: flash_attention_2
# Data training arguments
dataset_name: open-r1/verifiable-coding-problems-python_decontaminated-tested
system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. Respond in the following format: <think>\n...\n</think>\n<answer>\n...\n</answer>"
# GRPO trainer config
callbacks:
- push_to_hub_revision
benchmarks:
- lcb_v4
beta: 0.000
bf16: true
do_eval: false
eval_strategy: "no"
use_vllm: true
vllm_device: auto
vllm_gpu_memory_utilization: 0.7
do_eval: false
gradient_accumulation_steps: 64
gradient_checkpointing: true
gradient_checkpointing_kwargs:
use_reentrant: false
hub_model_id: open-r1/Qwen2.5-7B-Instruct-GRPO
hub_model_revision: v05.06
hub_strategy: every_save
learning_rate: 1.0e-06
log_completions: true
log_level: info
logging_first_step: true
logging_steps: 1
logging_strategy: steps
lr_scheduler_type: constant_with_warmup
max_grad_norm: 0.2
max_prompt_length: 1024
max_completion_length: 4096
max_steps: -1
num_generations: 64
num_iterations: 4
num_train_epochs: 1.0
output_dir: data/Qwen2.5-7B-Instruct-GRPO_v05.06
overwrite_output_dir: true
per_device_train_batch_size: 8
push_to_hub: true
report_to:
- wandb
reward_funcs:
- binary_code
- code_format
reward_weights:
- 1.0
- 0.1
save_strategy: "steps"
save_steps: 0.05
save_total_limit: 1
seed: 42
temperature: 0.7
wandb_entity: huggingface
wandb_project: open-r1
warmup_ratio: 0.1
parallel_code_exec_per_proc: 10

View file

@ -0,0 +1,67 @@
# Model arguments
model_name_or_path: Qwen/Qwen2.5-7B-Instruct
model_revision: main
torch_dtype: bfloat16
attn_implementation: flash_attention_2
# Data training arguments
dataset_name: open-r1/verifiable-coding-problems-python_decontaminated-tested
system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. Respond in the following format: <think>\n...\n</think>\n<answer>\n...\n</answer>"
# GRPO trainer config
callbacks:
- push_to_hub_revision
benchmarks:
- lcb_v4
beta: 0.000
bf16: true
do_eval: false
eval_strategy: "no"
use_vllm: true
vllm_device: auto
vllm_gpu_memory_utilization: 0.7
do_eval: false
gradient_accumulation_steps: 64
gradient_checkpointing: true
gradient_checkpointing_kwargs:
use_reentrant: false
hub_model_id: open-r1/Qwen2.5-7B-Instruct-GRPO
hub_model_revision: v05.07
hub_strategy: every_save
learning_rate: 1.0e-06
log_completions: true
log_level: info
logging_first_step: true
logging_steps: 1
logging_strategy: steps
lr_scheduler_type: cosine_with_min_lr
lr_scheduler_kwargs:
min_lr_rate: 0.1
max_grad_norm: 0.2
max_prompt_length: 1024
max_completion_length: 4096
max_steps: -1
num_generations: 64
num_iterations: 4
num_train_epochs: 1.0
output_dir: data/Qwen2.5-7B-Instruct-GRPO_v05.07
overwrite_output_dir: true
per_device_train_batch_size: 8
push_to_hub: true
report_to:
- wandb
reward_funcs:
- binary_code
- code_format
reward_weights:
- 1.0
- 0.1
save_strategy: "steps"
save_steps: 0.05
save_total_limit: 1
seed: 42
temperature: 0.7
wandb_entity: huggingface
wandb_project: open-r1
warmup_ratio: 0.1
parallel_code_exec_per_proc: 10

View file

@ -0,0 +1,67 @@
# Model arguments
model_name_or_path: Qwen/Qwen2.5-7B-Instruct
model_revision: main
torch_dtype: bfloat16
attn_implementation: flash_attention_2
# Data training arguments
dataset_name: open-r1/verifiable-coding-problems-python_decontaminated-tested-shuffled
system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. Respond in the following format: <think>\n...\n</think>\n<answer>\n...\n</answer>"
# GRPO trainer config
callbacks:
- push_to_hub_revision
benchmarks:
- lcb_v4
beta: 0.000
bf16: true
do_eval: false
eval_strategy: "no"
use_vllm: true
vllm_device: auto
vllm_gpu_memory_utilization: 0.7
do_eval: false
gradient_accumulation_steps: 16
gradient_checkpointing: true
gradient_checkpointing_kwargs:
use_reentrant: false
hub_model_id: open-r1/Qwen2.5-7B-Instruct-GRPO
hub_model_revision: v05.08
hub_strategy: every_save
learning_rate: 1.0e-06
log_completions: true
log_level: info
logging_first_step: true
logging_steps: 1
logging_strategy: steps
lr_scheduler_type: cosine_with_min_lr
lr_scheduler_kwargs:
min_lr_rate: 0.1
max_grad_norm: 0.2
max_prompt_length: 1024
max_completion_length: 4096
max_steps: -1
num_generations: 16
num_iterations: 4
num_train_epochs: 1.0
output_dir: data/Qwen2.5-7B-Instruct-GRPO_v05.08
overwrite_output_dir: true
per_device_train_batch_size: 8
push_to_hub: true
report_to:
- wandb
reward_funcs:
- binary_code
- code_format
reward_weights:
- 1.0
- 0.1
save_strategy: "steps"
save_steps: 0.05
save_total_limit: 1
seed: 42
temperature: 0.7
wandb_entity: huggingface
wandb_project: open-r1
warmup_ratio: 0.1
parallel_code_exec_per_proc: 4

View file

@ -0,0 +1,67 @@
# Model arguments
model_name_or_path: Qwen/Qwen2.5-7B-Instruct
model_revision: main
torch_dtype: bfloat16
attn_implementation: flash_attention_2
# Data training arguments
dataset_name: open-r1/verifiable-coding-problems-python_decontaminated-tested-shuffled
system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. Respond in the following format: <think>\n...\n</think>\n<answer>\n...\n</answer>"
# GRPO trainer config
callbacks:
- push_to_hub_revision
benchmarks:
- lcb_v4
beta: 0.000
bf16: true
do_eval: false
eval_strategy: "no"
use_vllm: true
vllm_device: auto
vllm_gpu_memory_utilization: 0.7
do_eval: false
gradient_accumulation_steps: 16
gradient_checkpointing: true
gradient_checkpointing_kwargs:
use_reentrant: false
hub_model_id: open-r1/Qwen2.5-7B-Instruct-GRPO
hub_model_revision: v05.09
hub_strategy: every_save
learning_rate: 5.0e-06
log_completions: true
log_level: info
logging_first_step: true
logging_steps: 1
logging_strategy: steps
lr_scheduler_type: cosine_with_min_lr
lr_scheduler_kwargs:
min_lr_rate: 0.1
max_grad_norm: 0.2
max_prompt_length: 1024
max_completion_length: 4096
max_steps: -1
num_generations: 16
num_iterations: 4
num_train_epochs: 1.0
output_dir: data/Qwen2.5-7B-Instruct-GRPO_v05.09
overwrite_output_dir: true
per_device_train_batch_size: 8
push_to_hub: true
report_to:
- wandb
reward_funcs:
- binary_code
- code_format
reward_weights:
- 1.0
- 0.1
save_strategy: "steps"
save_steps: 0.05
save_total_limit: 1
seed: 42
temperature: 0.7
wandb_entity: huggingface
wandb_project: open-r1
warmup_ratio: 0.1
parallel_code_exec_per_proc: 4

View file

@ -0,0 +1,67 @@
# Model arguments
model_name_or_path: Qwen/Qwen2.5-7B-Instruct
model_revision: main
torch_dtype: bfloat16
attn_implementation: flash_attention_2
# Data training arguments
dataset_name: open-r1/verifiable-coding-problems-python_decontaminated-tested-shuffled
system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. Respond in the following format: <think>\n...\n</think>\n<answer>\n...\n</answer>"
# GRPO trainer config
callbacks:
- push_to_hub_revision
benchmarks:
- lcb_v4
beta: 0.000
bf16: true
do_eval: false
eval_strategy: "no"
use_vllm: true
vllm_device: auto
vllm_gpu_memory_utilization: 0.7
do_eval: false
gradient_accumulation_steps: 16
gradient_checkpointing: true
gradient_checkpointing_kwargs:
use_reentrant: false
hub_model_id: open-r1/Qwen2.5-7B-Instruct-GRPO
hub_model_revision: v05.10
hub_strategy: every_save
learning_rate: 1.0e-05
log_completions: true
log_level: info
logging_first_step: true
logging_steps: 1
logging_strategy: steps
lr_scheduler_type: cosine_with_min_lr
lr_scheduler_kwargs:
min_lr_rate: 0.1
max_grad_norm: 0.2
max_prompt_length: 1024
max_completion_length: 4096
max_steps: -1
num_generations: 16
num_iterations: 4
num_train_epochs: 1.0
output_dir: data/Qwen2.5-7B-Instruct-GRPO_v05.10
overwrite_output_dir: true
per_device_train_batch_size: 8
push_to_hub: true
report_to:
- wandb
reward_funcs:
- binary_code
- code_format
reward_weights:
- 1.0
- 0.1
save_strategy: "steps"
save_steps: 0.05
save_total_limit: 1
seed: 42
temperature: 0.7
wandb_entity: huggingface
wandb_project: open-r1
warmup_ratio: 0.1
parallel_code_exec_per_proc: 4

View file

@ -0,0 +1,67 @@
# Model arguments
model_name_or_path: Qwen/Qwen2.5-7B-Instruct
model_revision: main
torch_dtype: bfloat16
attn_implementation: flash_attention_2
# Data training arguments
dataset_name: open-r1/verifiable-coding-problems-python_decontaminated-tested-shuffled
system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. Respond in the following format: <think>\n...\n</think>\n<answer>\n...\n</answer>"
# GRPO trainer config
callbacks:
- push_to_hub_revision
benchmarks:
- lcb_v4
beta: 0.000
bf16: true
do_eval: false
eval_strategy: "no"
use_vllm: true
vllm_device: auto
vllm_gpu_memory_utilization: 0.7
do_eval: false
gradient_accumulation_steps: 16
gradient_checkpointing: true
gradient_checkpointing_kwargs:
use_reentrant: false
hub_model_id: open-r1/Qwen2.5-7B-Instruct-GRPO
hub_model_revision: v05.11
hub_strategy: every_save
learning_rate: 4.0e-05
log_completions: true
log_level: info
logging_first_step: true
logging_steps: 1
logging_strategy: steps
lr_scheduler_type: cosine_with_min_lr
lr_scheduler_kwargs:
min_lr_rate: 0.1
max_grad_norm: 0.2
max_prompt_length: 1024
max_completion_length: 4096
max_steps: -1
num_generations: 16
num_iterations: 4
num_train_epochs: 1.0
output_dir: data/Qwen2.5-7B-Instruct-GRPO_v05.11
overwrite_output_dir: true
per_device_train_batch_size: 8
push_to_hub: true
report_to:
- wandb
reward_funcs:
- binary_code
- code_format
reward_weights:
- 1.0
- 0.1
save_strategy: "steps"
save_steps: 0.05
save_total_limit: 1
seed: 42
temperature: 0.7
wandb_entity: huggingface
wandb_project: open-r1
warmup_ratio: 0.1
parallel_code_exec_per_proc: 4

View file

@ -0,0 +1,67 @@
# Model arguments
model_name_or_path: Qwen/Qwen2.5-7B-Instruct
model_revision: main
torch_dtype: bfloat16
attn_implementation: flash_attention_2
# Data training arguments
dataset_name: open-r1/verifiable-coding-problems-python_decontaminated-tested-shuffled
system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. Respond in the following format: <think>\n...\n</think>\n<answer>\n...\n</answer>"
# GRPO trainer config
callbacks:
- push_to_hub_revision
benchmarks:
- lcb_v4
beta: 0.000
bf16: true
do_eval: false
eval_strategy: "no"
use_vllm: true
vllm_device: auto
vllm_gpu_memory_utilization: 0.7
do_eval: false
gradient_accumulation_steps: 16
gradient_checkpointing: true
gradient_checkpointing_kwargs:
use_reentrant: false
hub_model_id: open-r1/Qwen2.5-7B-Instruct-GRPO
hub_model_revision: v05.12
hub_strategy: every_save
learning_rate: 5.0e-07
log_completions: true
log_level: info
logging_first_step: true
logging_steps: 1
logging_strategy: steps
lr_scheduler_type: cosine_with_min_lr
lr_scheduler_kwargs:
min_lr_rate: 0.1
max_grad_norm: 0.2
max_prompt_length: 1024
max_completion_length: 4096
max_steps: -1
num_generations: 16
num_iterations: 4
num_train_epochs: 1.0
output_dir: data/Qwen2.5-7B-Instruct-GRPO_v05.12
overwrite_output_dir: true
per_device_train_batch_size: 8
push_to_hub: true
report_to:
- wandb
reward_funcs:
- binary_code
- code_format
reward_weights:
- 1.0
- 0.1
save_strategy: "steps"
save_steps: 0.05
save_total_limit: 1
seed: 42
temperature: 0.7
wandb_entity: huggingface
wandb_project: open-r1
warmup_ratio: 0.1
parallel_code_exec_per_proc: 4

View file

@ -0,0 +1,66 @@
# Model arguments
model_name_or_path: Qwen/Qwen2.5-7B-Instruct
model_revision: main
torch_dtype: bfloat16
attn_implementation: flash_attention_2
# Data training arguments
dataset_name: open-r1/verifiable-coding-problems-python_decontaminated-tested-shuffled
dataset_prompt_column: problem
system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. Respond in the following format: <think>\n...\n</think>\n<answer>\n...\n</answer>"
# GRPO trainer config
callbacks:
- push_to_hub_revision
benchmarks:
- lcb_v4
beta: 0.000
bf16: true
do_eval: false
eval_strategy: "no"
use_vllm: true
vllm_device: auto
vllm_gpu_memory_utilization: 0.7
do_eval: false
gradient_accumulation_steps: 16
gradient_checkpointing: true
gradient_checkpointing_kwargs:
use_reentrant: false
hub_model_id: open-r1/Qwen2.5-7B-Instruct-GRPO
hub_model_revision: v05.13
hub_strategy: every_save
learning_rate: 5.0e-07
log_completions: true
log_level: info
logging_first_step: true
logging_steps: 1
logging_strategy: steps
lr_scheduler_type: constant_with_warmup
max_grad_norm: 0.2
max_prompt_length: 1024
max_completion_length: 4096
max_steps: -1
num_generations: 16
num_iterations: 4
num_train_epochs: 1.0
output_dir: data/Qwen2.5-7B-Instruct-GRPO_v05.13
overwrite_output_dir: true
per_device_train_batch_size: 8
push_to_hub: true
report_to:
- wandb
reward_funcs:
- binary_code
- code_format
reward_weights:
- 1.0
- 0.1
save_strategy: "steps"
save_steps: 0.05
save_total_limit: 1
seed: 42
temperature: 0.7
wandb_entity: huggingface
wandb_project: open-r1
warmup_ratio: 0.1
parallel_code_exec_per_proc: 10

View file

@ -0,0 +1,66 @@
# Model arguments
model_name_or_path: Qwen/Qwen2.5-7B-Instruct
model_revision: main
torch_dtype: bfloat16
attn_implementation: flash_attention_2
# Data training arguments
dataset_name: open-r1/verifiable-coding-problems-python_decontaminated-tested-shuffled
dataset_prompt_column: problem
system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. Respond in the following format: <think>\n...\n</think>\n<answer>\n...\n</answer>"
# GRPO trainer config
callbacks:
- push_to_hub_revision
benchmarks:
- lcb_v4
beta: 0.000
bf16: true
do_eval: false
eval_strategy: "no"
use_vllm: true
vllm_device: auto
vllm_gpu_memory_utilization: 0.7
do_eval: false
gradient_accumulation_steps: 16
gradient_checkpointing: true
gradient_checkpointing_kwargs:
use_reentrant: false
hub_model_id: open-r1/Qwen2.5-7B-Instruct-GRPO
hub_model_revision: v05.14
hub_strategy: every_save
learning_rate: 5.0e-07
log_completions: true
log_level: info
logging_first_step: true
logging_steps: 1
logging_strategy: steps
lr_scheduler_type: constant_with_warmup
max_grad_norm: 0.2
max_prompt_length: 1024
max_completion_length: 4096
max_steps: -1
num_generations: 16
num_iterations: 4
num_train_epochs: 1.0
output_dir: data/Qwen2.5-7B-Instruct-GRPO_v05.14
overwrite_output_dir: true
per_device_train_batch_size: 8
push_to_hub: true
report_to:
- wandb
reward_funcs:
- binary_code
- code_format
reward_weights:
- 1.0
- 0.1
save_strategy: "steps"
save_steps: 0.05
save_total_limit: 1
seed: 42
temperature: 0.7
wandb_entity: huggingface
wandb_project: open-r1
warmup_ratio: 0.01
parallel_code_exec_per_proc: 10

View file

@ -0,0 +1,65 @@
# Model arguments
model_name_or_path: Qwen/Qwen2.5-7B-Instruct
model_revision: main
torch_dtype: bfloat16
attn_implementation: flash_attention_2
# Data training arguments
dataset_name: open-r1/verifiable-coding-problems-python_decontaminated-tested-shuffled
dataset_prompt_column: problem
system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. Respond in the following format: <think>\n...\n</think>\n<answer>\n...\n</answer>"
# GRPO trainer config
callbacks:
- push_to_hub_revision
benchmarks:
- lcb_v4
beta: 0.000
bf16: true
do_eval: false
eval_strategy: "no"
use_vllm: true
vllm_device: auto
vllm_gpu_memory_utilization: 0.7
do_eval: false
gradient_accumulation_steps: 16
gradient_checkpointing: true
gradient_checkpointing_kwargs:
use_reentrant: false
hub_model_id: open-r1/Qwen2.5-7B-Instruct-GRPO
hub_model_revision: v05.15
hub_strategy: every_save
learning_rate: 5.0e-07
log_completions: true
log_level: info
logging_first_step: true
logging_steps: 1
logging_strategy: steps
lr_scheduler_type: constant_with_warmup
max_grad_norm: 1.0
max_prompt_length: 1024
max_completion_length: 4096
max_steps: -1
num_generations: 16
num_iterations: 4
num_train_epochs: 1.0
output_dir: data/Qwen2.5-7B-Instruct-GRPO_v05.15
overwrite_output_dir: true
per_device_train_batch_size: 8
push_to_hub: true
report_to:
- wandb
reward_funcs:
- binary_code
- code_format
reward_weights:
- 1.0
- 0.1
save_strategy: "steps"
save_steps: 0.05
save_total_limit: 1
seed: 42
temperature: 0.7
wandb_entity: huggingface
wandb_project: open-r1
warmup_ratio: 0.01
parallel_code_exec_per_proc: 10

View file

@ -0,0 +1,67 @@
# Model arguments
model_name_or_path: Qwen/Qwen2.5-7B-Instruct
model_revision: main
torch_dtype: bfloat16
attn_implementation: flash_attention_2
# Data training arguments
dataset_name: open-r1/verifiable-coding-problems-python_decontaminated-tested
dataset_prompt_column: problem
system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. Respond in the following format: <think>\n...\n</think>\n<answer>\n...\n</answer>"
# GRPO trainer config
callbacks:
- push_to_hub_revision
benchmarks:
- lcb_v4
beta: 0.000
bf16: true
do_eval: false
eval_strategy: "no"
use_vllm: true
vllm_device: auto
vllm_gpu_memory_utilization: 0.7
do_eval: false
gradient_accumulation_steps: 16
gradient_checkpointing: true
gradient_checkpointing_kwargs:
use_reentrant: false
hub_model_id: open-r1/Qwen2.5-7B-Instruct-GRPO
hub_model_revision: v05.16
hub_strategy: every_save
learning_rate: 1.0e-06
log_completions: true
log_level: info
logging_first_step: true
logging_steps: 1
logging_strategy: steps
lr_scheduler_type: constant_with_warmup
max_grad_norm: 0.2
max_prompt_length: 1024
max_completion_length: 4096
max_steps: -1
num_generations: 16
num_iterations: 4
num_train_epochs: 1.0
output_dir: data/Qwen2.5-7B-Instruct-GRPO_v05.16
overwrite_output_dir: true
per_device_train_batch_size: 8
push_to_hub: true
report_to:
- wandb
reward_funcs:
- binary_code
- code_format
e2b_router_url: "ip-10-53-85-124:8000"
reward_weights:
- 1.0
- 0.1
save_strategy: "steps"
save_steps: 0.05
save_total_limit: 1
seed: 42
temperature: 0.7
wandb_entity: huggingface
wandb_project: open-r1
warmup_ratio: 0.1
parallel_code_exec_per_proc: 10

View file

@ -0,0 +1,67 @@
# Model arguments
model_name_or_path: Qwen/Qwen2.5-7B-Instruct
model_revision: main
torch_dtype: bfloat16
attn_implementation: flash_attention_2
# Data training arguments
dataset_name: open-r1/verifiable-coding-problems-python_decontaminated-tested
dataset_prompt_column: problem
system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. Respond in the following format: <think>\n...\n</think>\n<answer>\n...\n</answer>"
# GRPO trainer config
callbacks:
- push_to_hub_revision
benchmarks:
- lcb_v4
beta: 0.000
bf16: true
do_eval: false
eval_strategy: "no"
use_vllm: true
vllm_device: auto
vllm_gpu_memory_utilization: 0.7
do_eval: false
gradient_accumulation_steps: 16
gradient_checkpointing: true
gradient_checkpointing_kwargs:
use_reentrant: false
hub_model_id: open-r1/Qwen2.5-7B-Instruct-GRPO
hub_model_revision: v05.17
hub_strategy: every_save
learning_rate: 1.0e-06
log_completions: true
log_level: info
logging_first_step: true
logging_steps: 1
logging_strategy: steps
lr_scheduler_type: constant_with_warmup
max_grad_norm: 0.2
max_prompt_length: 1024
max_completion_length: 4096
max_steps: -1
num_generations: 16
num_iterations: 4
num_train_epochs: 1.0
output_dir: data/Qwen2.5-7B-Instruct-GRPO_v05.17
overwrite_output_dir: true
per_device_train_batch_size: 8
push_to_hub: true
report_to:
- wandb
reward_funcs:
- binary_code
- code_format
e2b_router_url: "ip-10-53-85-124:8000"
reward_weights:
- 1.0
- 0.1
save_strategy: "steps"
save_steps: 0.05
save_total_limit: 1
seed: 42
temperature: 0.4
wandb_entity: huggingface
wandb_project: open-r1
warmup_ratio: 0.1
parallel_code_exec_per_proc: 10

View file

@ -0,0 +1,67 @@
# Model arguments
model_name_or_path: Qwen/Qwen2.5-7B-Instruct
model_revision: main
torch_dtype: bfloat16
attn_implementation: flash_attention_2
# Data training arguments
dataset_name: open-r1/verifiable-coding-problems-python_decontaminated-tested
dataset_prompt_column: problem
system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. Respond in the following format: <think>\n...\n</think>\n<answer>\n...\n</answer>"
# GRPO trainer config
callbacks:
- push_to_hub_revision
benchmarks:
- lcb_v4
beta: 0.000
bf16: true
do_eval: false
eval_strategy: "no"
use_vllm: true
vllm_device: auto
vllm_gpu_memory_utilization: 0.7
do_eval: false
gradient_accumulation_steps: 16
gradient_checkpointing: true
gradient_checkpointing_kwargs:
use_reentrant: false
hub_model_id: open-r1/Qwen2.5-7B-Instruct-GRPO
hub_model_revision: v05.18
hub_strategy: every_save
learning_rate: 1.0e-06
log_completions: true
log_level: info
logging_first_step: true
logging_steps: 1
logging_strategy: steps
lr_scheduler_type: constant_with_warmup
max_grad_norm: 0.2
max_prompt_length: 1024
max_completion_length: 4096
max_steps: -1
num_generations: 16
num_iterations: 4
num_train_epochs: 1.0
output_dir: data/Qwen2.5-7B-Instruct-GRPO_v05.18
overwrite_output_dir: true
per_device_train_batch_size: 8
push_to_hub: true
report_to:
- wandb
reward_funcs:
- binary_code
- code_format
e2b_router_url: "ip-10-53-85-124:8000"
reward_weights:
- 1.0
- 0.1
save_strategy: "steps"
save_steps: 0.05
save_total_limit: 1
seed: 42
temperature: 1.0
wandb_entity: huggingface
wandb_project: open-r1
warmup_ratio: 0.1
parallel_code_exec_per_proc: 10

View file

@ -0,0 +1,69 @@
# Model arguments
model_name_or_path: Qwen/Qwen2.5-7B-Instruct
model_revision: main
torch_dtype: bfloat16
attn_implementation: flash_attention_2
# Data training arguments
dataset_name: open-r1/verifiable-coding-problems-python_decontaminated-tested
dataset_prompt_column: problem
system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. Respond in the following format: <think>\n...\n</think>\n<answer>\n...\n</answer>"
# GRPO trainer config
callbacks:
- push_to_hub_revision
benchmarks:
- lcb_v4
beta: 0.000
bf16: true
do_eval: false
eval_strategy: "no"
use_vllm: true
vllm_device: auto
vllm_gpu_memory_utilization: 0.7
do_eval: false
gradient_accumulation_steps: 16
gradient_checkpointing: true
gradient_checkpointing_kwargs:
use_reentrant: false
hub_model_id: open-r1/Qwen2.5-7B-Instruct-GRPO
hub_model_revision: v05.20
hub_strategy: every_save
learning_rate: 1.0e-06
log_completions: true
log_level: info
logging_first_step: true
logging_steps: 1
logging_strategy: steps
lr_scheduler_type: constant_with_warmup
max_grad_norm: 0.2
max_prompt_length: 1024
max_completion_length: 4096
max_steps: -1
num_generations: 16
num_iterations: 4
num_train_epochs: 1.0
output_dir: data/Qwen2.5-7B-Instruct-GRPO_v05.20
overwrite_output_dir: true
per_device_train_batch_size: 8
push_to_hub: true
report_to:
- wandb
reward_funcs:
- binary_code
- code_format
e2b_router_url: ip-10-53-86-47:8000
reward_weights:
- 1.0
- 0.1
save_strategy: "steps"
save_steps: 0.1
save_total_limit: 1
seed: 42
temperature: 0.7
wandb_entity: huggingface
wandb_project: open-r1
warmup_ratio: 0.1
parallel_code_exec_per_proc: 10
mask_truncated_completions: true

View file

@ -0,0 +1,70 @@
# Model arguments
model_name_or_path: Qwen/Qwen2.5-7B-Instruct
model_revision: main
torch_dtype: bfloat16
attn_implementation: flash_attention_2
# Data training arguments
dataset_name: open-r1/verifiable-coding-problems-python_decontaminated-tested-shuffled
dataset_prompt_column: problem
system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. Respond in the following format: <think>\n...\n</think>\n<answer>\n...\n</answer>"
# GRPO trainer config
callbacks:
- push_to_hub_revision
benchmarks:
- lcb_v4
beta: 0.000
bf16: true
do_eval: false
eval_strategy: "no"
use_vllm: true
vllm_device: auto
vllm_gpu_memory_utilization: 0.7
do_eval: false
gradient_accumulation_steps: 16
gradient_checkpointing: true
gradient_checkpointing_kwargs:
use_reentrant: false
hub_model_id: open-r1/Qwen2.5-7B-Instruct-GRPO
hub_model_revision: v05.30
hub_strategy: every_save
learning_rate: 1.0e-06
log_completions: true
log_level: info
logging_first_step: true
logging_steps: 1
logging_strategy: steps
lr_scheduler_type: constant_with_warmup
max_grad_norm: 0.2
max_prompt_length: 1024
max_completion_length: 4096
max_steps: -1
num_generations: 16
num_iterations: 4
num_train_epochs: 1.0
output_dir: data/Qwen2.5-7B-Instruct-GRPO_v05.30
overwrite_output_dir: true
per_device_train_batch_size: 8
push_to_hub: true
report_to:
- wandb
reward_funcs:
- binary_code
- code_format
e2b_router_url: ip-10-53-86-47:8000
reward_weights:
- 1.0
- 0.1
save_strategy: "steps"
save_steps: 0.1
save_total_limit: 1
seed: 42
temperature: 0.7
wandb_entity: huggingface
wandb_project: open-r1
warmup_ratio: 0.1
parallel_code_exec_per_proc: 10
mask_truncated_completions: true
loss_type: dr_grpo

View file

@ -0,0 +1,64 @@
# Model arguments
model_name_or_path: Qwen/Qwen2.5-Coder-7B-Instruct
model_revision: main
torch_dtype: bfloat16
attn_implementation: flash_attention_2
# Data training arguments
dataset_name: open-r1/verifiable-coding-problems-python_decontaminated-tested
system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. Respond in the following format: <think>\n...\n</think>\n<answer>\n...\n</answer>"
# GRPO trainer config
callbacks:
- push_to_hub_revision
benchmarks:
- math_500
- aime24
beta: 0.001
bf16: true
do_eval: false
eval_strategy: "no"
use_vllm: true
vllm_device: auto
vllm_gpu_memory_utilization: 0.7
do_eval: false
gradient_accumulation_steps: 14
gradient_checkpointing: true
gradient_checkpointing_kwargs:
use_reentrant: false
hub_model_id: open-r1/Qwen2.5-Coder-7B-Instruct-GRPO
hub_model_revision: v02.00
hub_strategy: every_save
learning_rate: 1.0e-06
log_completions: true
log_level: info
logging_first_step: true
logging_steps: 1
logging_strategy: steps
lr_scheduler_type: constant_with_warmup
max_grad_norm: 0.2
max_prompt_length: 1024
max_completion_length: 4096
max_steps: -1
num_generations: 14
num_train_epochs: 0.1
output_dir: data/Qwen2.5-Coder-7B-Instruct-GRPO_v02.00
overwrite_output_dir: true
per_device_train_batch_size: 4
push_to_hub: true
report_to:
- wandb
reward_funcs:
- code
- code_format
reward_weights:
- 1.0
- 0.1
save_strategy: "steps"
save_steps: 0.1
save_total_limit: 1
seed: 42
temperature: 0.7
wandb_entity: huggingface
wandb_project: open-r1
warmup_ratio: 0.1

View file

@ -0,0 +1,64 @@
# Model arguments
model_name_or_path: Qwen/Qwen2.5-7B-Instruct
model_revision: main
torch_dtype: bfloat16
attn_implementation: flash_attention_2
# Data training arguments
dataset_name: open-r1/verifiable-coding-problems-python_decontaminated-tested
system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. Respond in the following format: <think>\n...\n</think>\n<answer>\n...\n</answer>"
# GRPO trainer config
callbacks:
- push_to_hub_revision
benchmarks:
- math_500
- aime24
beta: 0.001
bf16: true
do_eval: false
eval_strategy: "no"
use_vllm: true
vllm_device: auto
vllm_gpu_memory_utilization: 0.7
do_eval: false
gradient_accumulation_steps: 14
gradient_checkpointing: true
gradient_checkpointing_kwargs:
use_reentrant: false
hub_model_id: open-r1/Qwen2.5-Coder-7B-Instruct-GRPO
hub_model_revision: v03.00
hub_strategy: every_save
learning_rate: 1.0e-06
log_completions: true
log_level: info
logging_first_step: true
logging_steps: 1
logging_strategy: steps
lr_scheduler_type: constant_with_warmup
max_grad_norm: 0.2
max_prompt_length: 1024
max_completion_length: 4096
max_steps: -1
num_generations: 14
num_train_epochs: 0.1
output_dir: data/Qwen2.5-Coder-7B-Instruct-GRPO_v03.00
overwrite_output_dir: true
per_device_train_batch_size: 4
push_to_hub: true
report_to:
- wandb
reward_funcs:
- code
- code_format
reward_weights:
- 1.0
- 0.1
save_strategy: "steps"
save_steps: 0.1
save_total_limit: 1
seed: 42
temperature: 0.7
wandb_entity: huggingface
wandb_project: open-r1
warmup_ratio: 0.1

View file

@ -387,14 +387,30 @@ def extract_code(completion: str, language: str = "python") -> str:
def binary_code_reward(completions, num_parallel: int = 2, e2b_router_url=None, **kwargs) -> list[float]:
rewards = code_reward(completions, num_parallel=num_parallel, e2b_router_url=e2b_router_url, **kwargs)
BINARY_THRESHOLD = 0.99
output = []
for reward in rewards:
if reward is None:
output.append(None)
else:
output.append(1.0 if reward > BINARY_THRESHOLD else 0.0)
return output
def weighted_binary_code_reward(completions, num_parallel: int = 2, e2b_router_url=None, **kwargs) -> list[float]:
# combines binary reward with a weighted reward code reward
rewards = code_reward(completions, num_parallel=num_parallel, e2b_router_url=e2b_router_url, **kwargs)
BINARY_THRESHOLD = 0.99
NON_BINARY_WEIGHT = 0.1
output = []
for reward in rewards:
if reward is None:
output.append(None)
else:
binary_reward = 1.0 if reward > BINARY_THRESHOLD else 0.0
output.append(binary_reward + NON_BINARY_WEIGHT * reward)
return output
@ -596,6 +612,14 @@ def get_reward_funcs(script_args) -> list[Callable]:
),
binary_code_reward,
),
"weighted_binary_code_reward": update_wrapper(
partial(
weighted_binary_code_reward,
num_parallel=script_args.parallel_code_exec_per_proc,
e2b_router_url=script_args.e2b_router_url,
),
weighted_binary_code_reward,
),
"ioi_code": update_wrapper(
partial(ioi_code_reward, test_batch_size=script_args.code_eval_test_batch_size), ioi_code_reward
),