Merge branch 'main' into code-grpo-configs

update trl version in setup
2026-06-24 01:54:06 +00:00 · 2025-04-26 12:01:21 +02:00 · 2025-04-18 11:41:27 +00:00 · 2025-04-18 11:38:38 +00:00 · 2025-04-18 11:38:17 +00:00 · 2025-04-17 12:08:12 +00:00
33 changed files with 2057 additions and 6 deletions
--- a/recipes/DeepSeek-R1-Distill-Qwen-7B/grpo/config_v03.00.yaml
+++ b/recipes/DeepSeek-R1-Distill-Qwen-7B/grpo/config_v03.00.yaml
@ -0,0 +1,67 @@
+# Model arguments
+model_name_or_path: deepseek-ai/DeepSeek-R1-Distill-Qwen-7B
+model_revision: main
+torch_dtype: bfloat16
+attn_implementation: flash_attention_2
+# Data training arguments
+dataset_name: open-r1/verifiable-coding-problems-python_decontaminated-tested
+dataset_prompt_column: problem
+
+system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. Respond in the following format: <think>\n...\n</think>\n<answer>\n...\n</answer>"
+
+# GRPO trainer config
+callbacks:
+- push_to_hub_revision
+benchmarks:
+- lcb_v4
+beta: 0.000
+bf16: true
+do_eval: false
+eval_strategy: "no"
+use_vllm: true
+vllm_device: auto
+vllm_gpu_memory_utilization: 0.7
+
+gradient_accumulation_steps: 64
+gradient_checkpointing: true
+gradient_checkpointing_kwargs:
+  use_reentrant: false
+hub_model_id: open-r1/DeepSeek-R1-Distill-Qwen-7B-GRPO
+hub_model_revision: v03.00
+hub_strategy: every_save
+learning_rate: 1.0e-06
+log_completions: true
+log_level: info
+logging_first_step: true
+logging_steps: 1
+logging_strategy: steps
+lr_scheduler_type: constant_with_warmup
+max_grad_norm: 0.2
+max_prompt_length: 1024
+max_completion_length: 24000
+max_steps: -1
+num_generations: 8
+num_iterations: 4
+num_train_epochs: 1.0
+output_dir: data/DeepSeek-R1-Distill-Qwen-7B_v03.00
+overwrite_output_dir: true
+per_device_train_batch_size: 1
+push_to_hub: true
+report_to:
+- wandb
+reward_funcs:
+- binary_code
+- code_format
+e2b_router_url: "ip-10-53-85-124:8000"
+reward_weights:
+- 1.0
+- 0.1
+save_strategy: "steps"
+save_steps: 0.05
+save_total_limit: 1
+seed: 42
+temperature: 0.7
+wandb_entity: huggingface 
+wandb_project: open-r1
+warmup_ratio: 0.1
+parallel_code_exec_per_proc: 10
--- a/recipes/DeepSeek-R1-Distill-Qwen-7B/grpo/config_v04.00.yaml
+++ b/recipes/DeepSeek-R1-Distill-Qwen-7B/grpo/config_v04.00.yaml
@ -0,0 +1,66 @@
+# Model arguments
+model_name_or_path: deepseek-ai/DeepSeek-R1-Distill-Qwen-7B
+model_revision: main
+torch_dtype: bfloat16
+attn_implementation: flash_attention_2
+# Data training arguments
+dataset_name: open-r1/verifiable-coding-problems-python_decontaminated-tested-shuffled
+dataset_prompt_column: problem
+
+# system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. Respond in the following format: <think>\n...\n</think>\n<answer>\n...\n</answer>"
+
+# GRPO trainer config
+callbacks:
+- push_to_hub_revision
+benchmarks:
+- lcb_v4
+beta: 0.000
+bf16: true
+do_eval: false
+eval_strategy: "no"
+use_vllm: true
+vllm_device: auto
+vllm_gpu_memory_utilization: 0.7
+
+gradient_accumulation_steps: 128
+gradient_checkpointing: true
+gradient_checkpointing_kwargs:
+  use_reentrant: false
+hub_model_id: open-r1/DeepSeek-R1-Distill-Qwen-7B-GRPO
+hub_model_revision: v04.00
+hub_strategy: every_save
+learning_rate: 1.0e-06
+log_completions: true
+log_level: info
+logging_first_step: true
+logging_steps: 1
+logging_strategy: steps
+lr_scheduler_type: constant_with_warmup
+max_grad_norm: 0.2
+max_prompt_length: 1024
+max_completion_length: 24000
+max_steps: -1
+num_generations: 8
+num_iterations: 1
+num_train_epochs: 1.0
+output_dir: data/DeepSeek-R1-Distill-Qwen-7B_v04.00
+overwrite_output_dir: true
+per_device_train_batch_size: 1
+push_to_hub: true
+report_to:
+- wandb
+reward_funcs:
+- binary_code
+e2b_router_url: ip-10-53-86-47:8000
+reward_weights:
+- 1.0
+save_strategy: "steps"
+save_steps: 0.05
+save_total_limit: 1
+seed: 42
+temperature: 0.7
+wandb_entity: huggingface 
+wandb_project: open-r1
+warmup_ratio: 0.1
+mask_truncated_completions: true
+loss_type: dr_grpo
--- a/recipes/DeepSeek-R1-Distill-Qwen-7B/grpo/config_v05.00.yaml
+++ b/recipes/DeepSeek-R1-Distill-Qwen-7B/grpo/config_v05.00.yaml
@ -0,0 +1,66 @@
+# Model arguments
+model_name_or_path: deepseek-ai/DeepSeek-R1-Distill-Qwen-7B
+model_revision: main
+torch_dtype: bfloat16
+attn_implementation: flash_attention_2
+# Data training arguments
+dataset_name: open-r1/verifiable-coding-problems-python_decontaminated-tested-shuffled
+dataset_prompt_column: problem
+
+# system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. Respond in the following format: <think>\n...\n</think>\n<answer>\n...\n</answer>"
+
+# GRPO trainer config
+callbacks:
+- push_to_hub_revision
+benchmarks:
+- lcb_v4
+beta: 0.000
+bf16: true
+do_eval: false
+eval_strategy: "no"
+use_vllm: true
+vllm_device: auto
+vllm_gpu_memory_utilization: 0.7
+
+gradient_accumulation_steps: 16
+gradient_checkpointing: true
+gradient_checkpointing_kwargs:
+  use_reentrant: false
+hub_model_id: open-r1/DeepSeek-R1-Distill-Qwen-7B-GRPO
+hub_model_revision: v05.00
+hub_strategy: every_save
+learning_rate: 1.0e-06
+log_completions: true
+log_level: info
+logging_first_step: true
+logging_steps: 1
+logging_strategy: steps
+lr_scheduler_type: constant_with_warmup
+max_grad_norm: 0.2
+max_prompt_length: 1024
+max_completion_length: 24000
+max_steps: -1
+num_generations: 16
+num_iterations: 1
+num_train_epochs: 1.0
+output_dir: data/DeepSeek-R1-Distill-Qwen-7B_v05.00
+overwrite_output_dir: true
+per_device_train_batch_size: 1
+push_to_hub: true
+report_to:
+- wandb
+reward_funcs:
+- binary_code
+e2b_router_url: ip-10-53-86-47:8000
+reward_weights:
+- 1.0
+save_strategy: "steps"
+save_steps: 0.05
+save_total_limit: 1
+seed: 42
+temperature: 0.7
+wandb_entity: huggingface 
+wandb_project: open-r1
+warmup_ratio: 0.1
+mask_truncated_completions: true
+loss_type: dr_grpo
--- a/recipes/DeepSeek-R1-Distill-Qwen-7B/grpo/config_v06.00.yaml
+++ b/recipes/DeepSeek-R1-Distill-Qwen-7B/grpo/config_v06.00.yaml
@ -0,0 +1,66 @@
+# Model arguments
+model_name_or_path: deepseek-ai/DeepSeek-R1-Distill-Qwen-7B
+model_revision: main
+torch_dtype: bfloat16
+attn_implementation: flash_attention_2
+# Data training arguments
+dataset_name: open-r1/verifiable-coding-problems-python_decontaminated-tested-shuffled
+dataset_prompt_column: problem
+
+# system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. Respond in the following format: <think>\n...\n</think>\n<answer>\n...\n</answer>"
+
+# GRPO trainer config
+callbacks:
+- push_to_hub_revision
+benchmarks:
+- lcb_v4
+beta: 0.000
+bf16: true
+do_eval: false
+eval_strategy: "no"
+use_vllm: true
+vllm_device: auto
+vllm_gpu_memory_utilization: 0.7
+
+gradient_accumulation_steps: 16
+gradient_checkpointing: true
+gradient_checkpointing_kwargs:
+  use_reentrant: false
+hub_model_id: open-r1/DeepSeek-R1-Distill-Qwen-7B-GRPO
+hub_model_revision: v06.00
+hub_strategy: every_save
+learning_rate: 1.0e-06
+log_completions: true
+log_level: info
+logging_first_step: true
+logging_steps: 1
+logging_strategy: steps
+lr_scheduler_type: constant_with_warmup
+max_grad_norm: 0.2
+max_prompt_length: 1024
+max_completion_length: 24000
+max_steps: -1
+num_generations: 16
+num_iterations: 1
+num_train_epochs: 1.0
+output_dir: data/DeepSeek-R1-Distill-Qwen-7B_v06.00
+overwrite_output_dir: true
+per_device_train_batch_size: 1
+push_to_hub: true
+report_to:
+- wandb
+reward_funcs:
+- weighted_binary_code_reward
+e2b_router_url: ip-10-53-86-47:8000
+reward_weights:
+- 1.0
+save_strategy: "steps"
+save_steps: 0.05
+save_total_limit: 1
+seed: 42
+temperature: 0.7
+wandb_entity: huggingface 
+wandb_project: open-r1
+warmup_ratio: 0.1
+mask_truncated_completions: true
+loss_type: dr_grpo
--- a/recipes/OlympicCoder-7B/sft/config_v00.00.yaml
+++ b/recipes/OlympicCoder-7B/sft/config_v00.00.yaml
@ -14,7 +14,7 @@ dataset_num_proc: 48
 bf16: true
 do_eval: false
 eval_strategy: 'no'
-gradient_accumulation_steps: 8
+gradient_accumulation_steps: 2
 gradient_checkpointing: true
 gradient_checkpointing_kwargs:
  use_reentrant: false
@ -27,20 +27,20 @@ logging_strategy: steps
 lr_scheduler_type: cosine_with_min_lr
 lr_scheduler_kwargs:
  min_lr_rate: 0.1
-packing: false
+packing: true
 max_grad_norm: 0.2
-max_length: 32768
+max_length: 16000
 max_steps: -1
 num_train_epochs: 10
 output_dir: data/OlympicCoder-7B
 overwrite_output_dir: true
 per_device_eval_batch_size: 1
-per_device_train_batch_size: 2
+per_device_train_batch_size: 1
 push_to_hub: true
 report_to:
 - wandb
 save_strategy: epoch
 save_total_limit: 1
 seed: 42
-use_liger_kernel: true
+use_liger_kernel: false
 warmup_ratio: 0.03
--- a/recipes/Qwen2.5-7B-Instruct/grpo/config_v01.05.yaml
+++ b/recipes/Qwen2.5-7B-Instruct/grpo/config_v01.05.yaml
@ -0,0 +1,58 @@
+
+# Model arguments
+model_name_or_path: Qwen/Qwen2.5-7B-Instruct
+model_revision: main
+torch_dtype: bfloat16
+attn_implementation: flash_attention_2
+
+# Data training arguments
+dataset_name: open-r1/OpenR1-Math-cn_k12-86k
+system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. Respond in the following format: <think>\n...\n</think>\n<answer>\n...\n</answer>"
+
+# GRPO trainer config
+beta: 0.001
+bf16: true
+do_eval: false
+eval_strategy: "no"
+use_vllm: true
+
+
+gradient_accumulation_steps: 64
+gradient_checkpointing: true
+gradient_checkpointing_kwargs:
+  use_reentrant: false
+hub_model_id: Qwen2.5-7B-Instruct-GRPO
+hub_model_revision: v01.05
+hub_strategy: every_save
+learning_rate: 1.0e-06
+log_completions: true
+log_level: info
+logging_first_step: true
+logging_steps: 1
+logging_strategy: steps
+lr_scheduler_type: constant_with_warmup
+max_grad_norm: 0.2
+max_prompt_length: 1024
+max_completion_length: 4096
+max_steps: -1
+num_generations: 8
+num_train_epochs: 1
+output_dir: data/Qwen2.5-7B-Instruct-GRPO_v01.05
+overwrite_output_dir: true
+per_device_train_batch_size: 1
+push_to_hub: true
+use_liger_kernel: true
+report_to:
+- wandb
+reward_funcs:
+- accuracy
+- format
+reward_weights:
+- 1.0
+- 0.2
+save_strategy: "steps"
+save_steps: 0.1
+save_total_limit: 1
+seed: 42
+temperature: 0.7
+warmup_ratio: 0.1
--- a/recipes/Qwen2.5-7B-Instruct/grpo/config_v01.06.yaml
+++ b/recipes/Qwen2.5-7B-Instruct/grpo/config_v01.06.yaml
@ -0,0 +1,60 @@
+
+# Model arguments
+model_name_or_path: Qwen/Qwen2.5-7B-Instruct
+model_revision: main
+torch_dtype: bfloat16
+attn_implementation: flash_attention_2
+
+# Data training arguments
+dataset_name: open-r1/OpenR1-Math-cn_k12-86k
+system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. Respond in the following format: <think>\n...\n</think>\n<answer>\n...\n</answer>"
+
+# GRPO trainer config
+beta: 0.001
+bf16: true
+do_eval: false
+eval_strategy: "no"
+use_vllm: true
+
+
+gradient_accumulation_steps: 16
+gradient_checkpointing: true
+gradient_checkpointing_kwargs:
+  use_reentrant: false
+hub_model_id: Qwen2.5-7B-Instruct-GRPO
+hub_model_revision: v01.06
+hub_strategy: every_save
+learning_rate: 1.0e-06
+log_completions: true
+log_level: info
+logging_first_step: true
+logging_steps: 1
+logging_strategy: steps
+lr_scheduler_type: constant_with_warmup
+max_grad_norm: 0.2
+max_prompt_length: 1024
+max_completion_length: 4096
+max_steps: -1
+num_generations: 16
+num_train_epochs: 1
+output_dir: data/Qwen2.5-7B-Instruct-GRPO_v01.06
+overwrite_output_dir: true
+per_device_train_batch_size: 4
+push_to_hub: true
+use_liger_kernel: true
+report_to:
+- wandb
+reward_funcs:
+- accuracy
+- format
+reward_weights:
+- 1.0
+- 0.2
+save_strategy: "steps"
+save_steps: 0.1
+save_total_limit: 1
+seed: 42
+temperature: 0.7
+warmup_ratio: 0.1
+wandb_entity: huggingface 
+wandb_project: open-r1
--- a/recipes/Qwen2.5-7B-Instruct/grpo/config_v01.07.yaml
+++ b/recipes/Qwen2.5-7B-Instruct/grpo/config_v01.07.yaml
@ -0,0 +1,61 @@
+
+# Model arguments
+model_name_or_path: Qwen/Qwen2.5-7B-Instruct
+model_revision: main
+torch_dtype: bfloat16
+attn_implementation: flash_attention_2
+
+# Data training arguments
+dataset_name: open-r1/OpenR1-Math-cn_k12-86k
+system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. Respond in the following format: <think>\n...\n</think>\n<answer>\n...\n</answer>"
+
+# GRPO trainer config
+beta: 0.001
+bf16: true
+do_eval: false
+eval_strategy: "no"
+use_vllm: true
+
+
+gradient_accumulation_steps: 16
+gradient_checkpointing: true
+gradient_checkpointing_kwargs:
+  use_reentrant: false
+hub_model_id: Qwen2.5-7B-Instruct-GRPO
+hub_model_revision: v01.07
+hub_strategy: every_save
+learning_rate: 1.0e-06
+log_completions: true
+log_level: info
+logging_first_step: true
+logging_steps: 1
+logging_strategy: steps
+lr_scheduler_type: constant_with_warmup
+max_grad_norm: 0.2
+max_prompt_length: 1024
+max_completion_length: 4096
+max_steps: -1
+num_generations: 16
+num_train_epochs: 1
+output_dir: data/Qwen2.5-7B-Instruct-GRPO_v01.07
+overwrite_output_dir: true
+per_device_train_batch_size: 4
+push_to_hub: true
+use_liger_kernel: true
+report_to:
+- wandb
+reward_funcs:
+- accuracy
+- format
+reward_weights:
+- 1.0
+- 0.2
+save_strategy: "steps"
+save_steps: 0.1
+save_total_limit: 1
+seed: 42
+temperature: 0.7
+warmup_ratio: 0.1
+wandb_entity: huggingface 
+wandb_project: open-r1
+scale_rewards: false
--- a/recipes/Qwen2.5-7B-Instruct/grpo/config_v01.08.yaml
+++ b/recipes/Qwen2.5-7B-Instruct/grpo/config_v01.08.yaml
@ -0,0 +1,62 @@
+
+# Model arguments
+model_name_or_path: Qwen/Qwen2.5-7B-Instruct
+model_revision: main
+torch_dtype: bfloat16
+attn_implementation: flash_attention_2
+
+# Data training arguments
+dataset_name: open-r1/OpenR1-Math-cn_k12-86k
+system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. Respond in the following format: <think>\n...\n</think>\n<answer>\n...\n</answer>"
+
+# GRPO trainer config
+beta: 0.0
+bf16: true
+do_eval: false
+eval_strategy: "no"
+use_vllm: true
+
+
+gradient_accumulation_steps: 16
+gradient_checkpointing: true
+gradient_checkpointing_kwargs:
+  use_reentrant: false
+hub_model_id: Qwen2.5-7B-Instruct-GRPO
+hub_model_revision: v01.08
+hub_strategy: every_save
+learning_rate: 1.0e-06
+log_completions: true
+log_level: info
+logging_first_step: true
+logging_steps: 1
+logging_strategy: steps
+lr_scheduler_type: constant_with_warmup
+max_grad_norm: 0.2
+max_prompt_length: 1024
+max_completion_length: 4096
+max_steps: -1
+num_generations: 16
+num_train_epochs: 1
+output_dir: data/Qwen2.5-7B-Instruct-GRPO_v01.08
+overwrite_output_dir: true
+per_device_train_batch_size: 8
+push_to_hub: true
+use_liger_kernel: true
+report_to:
+- wandb
+reward_funcs:
+- accuracy
+- format
+reward_weights:
+- 1.0
+- 0.2
+save_strategy: "steps"
+save_steps: 0.1
+save_total_limit: 1
+seed: 42
+temperature: 0.7
+warmup_ratio: 0.1
+wandb_entity: huggingface 
+wandb_project: open-r1
+num_iterations: 4
+scale_rewards: false
--- a/recipes/Qwen2.5-7B-Instruct/grpo/config_v01.09.yaml
+++ b/recipes/Qwen2.5-7B-Instruct/grpo/config_v01.09.yaml
@ -0,0 +1,62 @@
+
+# Model arguments
+model_name_or_path: Qwen/Qwen2.5-7B-Instruct
+model_revision: main
+torch_dtype: bfloat16
+attn_implementation: flash_attention_2
+
+# Data training arguments
+dataset_name: open-r1/OpenR1-Math-cn_k12-86k
+system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. Respond in the following format: <think>\n...\n</think>\n<answer>\n...\n</answer>"
+
+# GRPO trainer config
+beta: 0.0
+bf16: true
+do_eval: false
+eval_strategy: "no"
+use_vllm: true
+
+
+gradient_accumulation_steps: 16
+gradient_checkpointing: true
+gradient_checkpointing_kwargs:
+  use_reentrant: false
+hub_model_id: Qwen2.5-7B-Instruct-GRPO
+hub_model_revision: v01.09
+hub_strategy: every_save
+learning_rate: 1.0e-06
+log_completions: true
+log_level: info
+logging_first_step: true
+logging_steps: 1
+logging_strategy: steps
+lr_scheduler_type: constant_with_warmup
+max_grad_norm: 0.2
+max_prompt_length: 1024
+max_completion_length: 4096
+max_steps: -1
+num_generations: 16
+num_train_epochs: 1
+output_dir: data/Qwen2.5-7B-Instruct-GRPO_v01.09
+overwrite_output_dir: true
+per_device_train_batch_size: 8
+push_to_hub: true
+use_liger_kernel: true
+report_to:
+- wandb
+reward_funcs:
+- accuracy
+- format
+reward_weights:
+- 1.0
+- 0.2
+save_strategy: "steps"
+save_steps: 0.1
+save_total_limit: 1
+seed: 42
+temperature: 0.7
+warmup_ratio: 0.1
+wandb_entity: huggingface 
+wandb_project: open-r1
+num_iterations: 4
+scale_rewards: true
--- a/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.00.yaml
+++ b/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.00.yaml
@ -0,0 +1,65 @@
+# Model arguments
+model_name_or_path: Qwen/Qwen2.5-7B-Instruct
+model_revision: main
+torch_dtype: bfloat16
+attn_implementation: flash_attention_2
+# Data training arguments
+dataset_name: open-r1/verifiable-coding-problems-python_decontaminated-tested
+
+system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. Respond in the following format: <think>\n...\n</think>\n<answer>\n...\n</answer>"
+
+# GRPO trainer config
+callbacks:
+- push_to_hub_revision
+benchmarks:
+- math_500
+- aime24
+beta: 0.000
+bf16: true
+do_eval: false
+eval_strategy: "no"
+use_vllm: true
+vllm_device: auto
+vllm_gpu_memory_utilization: 0.7
+
+gradient_accumulation_steps: 16
+gradient_checkpointing: true
+gradient_checkpointing_kwargs:
+  use_reentrant: false
+hub_model_id: open-r1/Qwen2.5-7B-Instruct-GRPO
+hub_model_revision: v05.00
+hub_strategy: every_save
+learning_rate: 1.0e-06
+log_completions: true
+log_level: info
+logging_first_step: true
+logging_steps: 1
+logging_strategy: steps
+lr_scheduler_type: constant_with_warmup
+max_grad_norm: 0.2
+max_prompt_length: 1024
+max_completion_length: 4096
+max_steps: -1
+num_generations: 16
+num_iterations: 4
+num_train_epochs: 1.0
+output_dir: data/Qwen2.5-7B-Instruct-GRPO_v05.00
+overwrite_output_dir: true
+per_device_train_batch_size: 8
+push_to_hub: true
+report_to:
+- wandb
+reward_funcs:
+- binary_code
+- code_format
+reward_weights:
+- 1.0
+- 0.1
+save_strategy: "steps"
+save_steps: 0.1
+save_total_limit: 1
+seed: 42
+temperature: 0.7
+wandb_entity: huggingface 
+wandb_project: open-r1
+warmup_ratio: 0.1
--- a/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.02.yaml
+++ b/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.02.yaml
@ -0,0 +1,65 @@
+# Model arguments
+model_name_or_path: Qwen/Qwen2.5-7B-Instruct
+model_revision: main
+torch_dtype: bfloat16
+attn_implementation: flash_attention_2
+# Data training arguments
+dataset_name: open-r1/verifiable-coding-problems-python_decontaminated-tested
+
+system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. Respond in the following format: <think>\n...\n</think>\n<answer>\n...\n</answer>"
+
+# GRPO trainer config
+callbacks:
+- push_to_hub_revision
+benchmarks:
+- lcb_v4
+beta: 0.000
+bf16: true
+do_eval: false
+eval_strategy: "no"
+use_vllm: true
+vllm_device: auto
+vllm_gpu_memory_utilization: 0.7
+
+gradient_accumulation_steps: 16
+gradient_checkpointing: true
+gradient_checkpointing_kwargs:
+  use_reentrant: false
+hub_model_id: open-r1/Qwen2.5-7B-Instruct-GRPO
+hub_model_revision: v05.02
+hub_strategy: every_save
+learning_rate: 1.0e-06
+log_completions: true
+log_level: info
+logging_first_step: true
+logging_steps: 1
+logging_strategy: steps
+lr_scheduler_type: constant_with_warmup
+max_grad_norm: 0.2
+max_prompt_length: 1024
+max_completion_length: 4096
+max_steps: -1
+num_generations: 16
+num_iterations: 4
+num_train_epochs: 1.0
+output_dir: data/Qwen2.5-7B-Instruct-GRPO_v05.02
+overwrite_output_dir: true
+per_device_train_batch_size: 8
+push_to_hub: true
+report_to:
+- wandb
+reward_funcs:
+- code
+- code_format
+reward_weights:
+- 1.0
+- 0.1
+save_strategy: "steps"
+save_steps: 0.05
+save_total_limit: 1
+seed: 42
+temperature: 0.7
+wandb_entity: huggingface 
+wandb_project: open-r1
+warmup_ratio: 0.1
+parallel_code_exec_per_proc: 10
--- a/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.03.yaml
+++ b/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.03.yaml
@ -0,0 +1,65 @@
+# Model arguments
+model_name_or_path: Qwen/Qwen2.5-7B-Instruct
+model_revision: main
+torch_dtype: bfloat16
+attn_implementation: flash_attention_2
+# Data training arguments
+dataset_name: open-r1/verifiable-coding-problems-python_decontaminated-tested
+
+system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. Respond in the following format: <think>\n...\n</think>\n<answer>\n...\n</answer>"
+
+# GRPO trainer config
+callbacks:
+- push_to_hub_revision
+benchmarks:
+- lcb_v4
+beta: 0.000
+bf16: true
+do_eval: false
+eval_strategy: "no"
+use_vllm: true
+vllm_device: auto
+vllm_gpu_memory_utilization: 0.7
+
+gradient_accumulation_steps: 16
+gradient_checkpointing: true
+gradient_checkpointing_kwargs:
+  use_reentrant: false
+hub_model_id: open-r1/Qwen2.5-7B-Instruct-GRPO
+hub_model_revision: v05.03
+hub_strategy: every_save
+learning_rate: 1.0e-06
+log_completions: true
+log_level: info
+logging_first_step: true
+logging_steps: 1
+logging_strategy: steps
+lr_scheduler_type: constant_with_warmup
+max_grad_norm: 0.2
+max_prompt_length: 1024
+max_completion_length: 4096
+max_steps: -1
+num_generations: 16
+num_iterations: 4
+num_train_epochs: 1.0
+output_dir: data/Qwen2.5-7B-Instruct-GRPO_v05.03
+overwrite_output_dir: true
+per_device_train_batch_size: 8
+push_to_hub: true
+report_to:
+- wandb
+reward_funcs:
+- binary_code
+- code_format
+reward_weights:
+- 1.0
+- 0.1
+save_strategy: "steps"
+save_steps: 0.05
+save_total_limit: 1
+seed: 42
+temperature: 0.7
+wandb_entity: huggingface 
+wandb_project: open-r1
+warmup_ratio: 0.1
+parallel_code_exec_per_proc: 10
--- a/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.04.yaml
+++ b/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.04.yaml
@ -0,0 +1,65 @@
+# Model arguments
+model_name_or_path: Qwen/Qwen2.5-7B-Instruct
+model_revision: main
+torch_dtype: bfloat16
+attn_implementation: flash_attention_2
+# Data training arguments
+dataset_name: open-r1/verifiable-coding-problems-python_decontaminated-tested
+
+system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. Respond in the following format: <think>\n...\n</think>\n<answer>\n...\n</answer>"
+
+# GRPO trainer config
+callbacks:
+- push_to_hub_revision
+benchmarks:
+- lcb_v4
+beta: 0.000
+bf16: true
+do_eval: false
+eval_strategy: "no"
+use_vllm: true
+vllm_device: auto
+vllm_gpu_memory_utilization: 0.7
+
+gradient_accumulation_steps: 64
+gradient_checkpointing: true
+gradient_checkpointing_kwargs:
+  use_reentrant: false
+hub_model_id: open-r1/Qwen2.5-7B-Instruct-GRPO
+hub_model_revision: v05.04
+hub_strategy: every_save
+learning_rate: 1.0e-06
+log_completions: true
+log_level: info
+logging_first_step: true
+logging_steps: 1
+logging_strategy: steps
+lr_scheduler_type: constant_with_warmup
+max_grad_norm: 0.2
+max_prompt_length: 1024
+max_completion_length: 4096
+max_steps: -1
+num_generations: 16
+num_iterations: 4
+num_train_epochs: 1.0
+output_dir: data/Qwen2.5-7B-Instruct-GRPO_v05.04
+overwrite_output_dir: true
+per_device_train_batch_size: 8
+push_to_hub: true
+report_to:
+- wandb
+reward_funcs:
+- binary_code
+- code_format
+reward_weights:
+- 1.0
+- 0.1
+save_strategy: "steps"
+save_steps: 0.05
+save_total_limit: 1
+seed: 42
+temperature: 0.7
+wandb_entity: huggingface 
+wandb_project: open-r1
+warmup_ratio: 0.1
+parallel_code_exec_per_proc: 10
--- a/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.05.yaml
+++ b/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.05.yaml
@ -0,0 +1,67 @@
+# Model arguments
+model_name_or_path: Qwen/Qwen2.5-7B-Instruct
+model_revision: main
+torch_dtype: bfloat16
+attn_implementation: flash_attention_2
+# Data training arguments
+dataset_name: open-r1/verifiable-coding-problems-python_decontaminated-tested
+
+system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. Respond in the following format: <think>\n...\n</think>\n<answer>\n...\n</answer>"
+
+# GRPO trainer config
+callbacks:
+- push_to_hub_revision
+benchmarks:
+- lcb_v4
+beta: 0.000
+bf16: true
+do_eval: false
+eval_strategy: "no"
+use_vllm: true
+vllm_device: auto
+vllm_gpu_memory_utilization: 0.7
+
+gradient_accumulation_steps: 16
+gradient_checkpointing: true
+gradient_checkpointing_kwargs:
+  use_reentrant: false
+hub_model_id: open-r1/Qwen2.5-7B-Instruct-GRPO
+hub_model_revision: v05.05
+hub_strategy: every_save
+learning_rate: 1.0e-06
+log_completions: true
+log_level: info
+logging_first_step: true
+logging_steps: 1
+logging_strategy: steps
+lr_scheduler_type: cosine_with_min_lr
+lr_scheduler_kwargs:
+  min_lr_rate: 0.1
+max_grad_norm: 0.2
+max_prompt_length: 1024
+max_completion_length: 4096
+max_steps: -1
+num_generations: 16
+num_iterations: 4
+num_train_epochs: 1.0
+output_dir: data/Qwen2.5-7B-Instruct-GRPO_v05.05
+overwrite_output_dir: true
+per_device_train_batch_size: 8
+push_to_hub: true
+report_to:
+- wandb
+reward_funcs:
+- binary_code
+- code_format
+reward_weights:
+- 1.0
+- 0.1
+save_strategy: "steps"
+save_steps: 0.05
+save_total_limit: 1
+seed: 42
+temperature: 0.7
+wandb_entity: huggingface 
+wandb_project: open-r1
+warmup_ratio: 0.1
+parallel_code_exec_per_proc: 10
--- a/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.06.yaml
+++ b/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.06.yaml
@ -0,0 +1,65 @@
+# Model arguments
+model_name_or_path: Qwen/Qwen2.5-7B-Instruct
+model_revision: main
+torch_dtype: bfloat16
+attn_implementation: flash_attention_2
+# Data training arguments
+dataset_name: open-r1/verifiable-coding-problems-python_decontaminated-tested
+
+system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. Respond in the following format: <think>\n...\n</think>\n<answer>\n...\n</answer>"
+
+# GRPO trainer config
+callbacks:
+- push_to_hub_revision
+benchmarks:
+- lcb_v4
+beta: 0.000
+bf16: true
+do_eval: false
+eval_strategy: "no"
+use_vllm: true
+vllm_device: auto
+vllm_gpu_memory_utilization: 0.7
+
+gradient_accumulation_steps: 64
+gradient_checkpointing: true
+gradient_checkpointing_kwargs:
+  use_reentrant: false
+hub_model_id: open-r1/Qwen2.5-7B-Instruct-GRPO
+hub_model_revision: v05.06
+hub_strategy: every_save
+learning_rate: 1.0e-06
+log_completions: true
+log_level: info
+logging_first_step: true
+logging_steps: 1
+logging_strategy: steps
+lr_scheduler_type: constant_with_warmup
+max_grad_norm: 0.2
+max_prompt_length: 1024
+max_completion_length: 4096
+max_steps: -1
+num_generations: 64
+num_iterations: 4
+num_train_epochs: 1.0
+output_dir: data/Qwen2.5-7B-Instruct-GRPO_v05.06
+overwrite_output_dir: true
+per_device_train_batch_size: 8
+push_to_hub: true
+report_to:
+- wandb
+reward_funcs:
+- binary_code
+- code_format
+reward_weights:
+- 1.0
+- 0.1
+save_strategy: "steps"
+save_steps: 0.05
+save_total_limit: 1
+seed: 42
+temperature: 0.7
+wandb_entity: huggingface 
+wandb_project: open-r1
+warmup_ratio: 0.1
+parallel_code_exec_per_proc: 10
--- a/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.07.yaml
+++ b/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.07.yaml
@ -0,0 +1,67 @@
+# Model arguments
+model_name_or_path: Qwen/Qwen2.5-7B-Instruct
+model_revision: main
+torch_dtype: bfloat16
+attn_implementation: flash_attention_2
+# Data training arguments
+dataset_name: open-r1/verifiable-coding-problems-python_decontaminated-tested
+
+system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. Respond in the following format: <think>\n...\n</think>\n<answer>\n...\n</answer>"
+
+# GRPO trainer config
+callbacks:
+- push_to_hub_revision
+benchmarks:
+- lcb_v4
+beta: 0.000
+bf16: true
+do_eval: false
+eval_strategy: "no"
+use_vllm: true
+vllm_device: auto
+vllm_gpu_memory_utilization: 0.7
+
+gradient_accumulation_steps: 64
+gradient_checkpointing: true
+gradient_checkpointing_kwargs:
+  use_reentrant: false
+hub_model_id: open-r1/Qwen2.5-7B-Instruct-GRPO
+hub_model_revision: v05.07
+hub_strategy: every_save
+learning_rate: 1.0e-06
+log_completions: true
+log_level: info
+logging_first_step: true
+logging_steps: 1
+logging_strategy: steps
+lr_scheduler_type: cosine_with_min_lr
+lr_scheduler_kwargs:
+  min_lr_rate: 0.1
+max_grad_norm: 0.2
+max_prompt_length: 1024
+max_completion_length: 4096
+max_steps: -1
+num_generations: 64
+num_iterations: 4
+num_train_epochs: 1.0
+output_dir: data/Qwen2.5-7B-Instruct-GRPO_v05.07
+overwrite_output_dir: true
+per_device_train_batch_size: 8
+push_to_hub: true
+report_to:
+- wandb
+reward_funcs:
+- binary_code
+- code_format
+reward_weights:
+- 1.0
+- 0.1
+save_strategy: "steps"
+save_steps: 0.05
+save_total_limit: 1
+seed: 42
+temperature: 0.7
+wandb_entity: huggingface 
+wandb_project: open-r1
+warmup_ratio: 0.1
+parallel_code_exec_per_proc: 10
--- a/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.08.yaml
+++ b/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.08.yaml
@ -0,0 +1,67 @@
+# Model arguments
+model_name_or_path: Qwen/Qwen2.5-7B-Instruct
+model_revision: main
+torch_dtype: bfloat16
+attn_implementation: flash_attention_2
+# Data training arguments
+dataset_name: open-r1/verifiable-coding-problems-python_decontaminated-tested-shuffled
+
+system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. Respond in the following format: <think>\n...\n</think>\n<answer>\n...\n</answer>"
+
+# GRPO trainer config
+callbacks:
+- push_to_hub_revision
+benchmarks:
+- lcb_v4
+beta: 0.000
+bf16: true
+do_eval: false
+eval_strategy: "no"
+# vLLM generation settings
+use_vllm: true
+vllm_device: auto
+vllm_gpu_memory_utilization: 0.7
+gradient_accumulation_steps: 16
+gradient_checkpointing: true
+gradient_checkpointing_kwargs:
+  use_reentrant: false
+hub_model_id: open-r1/Qwen2.5-7B-Instruct-GRPO
+hub_model_revision: v05.08
+hub_strategy: every_save
+learning_rate: 1.0e-06
+log_completions: true
+log_level: info
+logging_first_step: true
+logging_steps: 1
+logging_strategy: steps
+lr_scheduler_type: cosine_with_min_lr
+lr_scheduler_kwargs:
+  min_lr_rate: 0.1
+max_grad_norm: 0.2
+max_prompt_length: 1024
+max_completion_length: 4096
+max_steps: -1
+num_generations: 16
+num_iterations: 4
+num_train_epochs: 1.0
+output_dir: data/Qwen2.5-7B-Instruct-GRPO_v05.08
+overwrite_output_dir: true
+per_device_train_batch_size: 8
+push_to_hub: true
+report_to:
+- wandb
+reward_funcs:
+- binary_code
+- code_format
+reward_weights:
+- 1.0
+- 0.1
+save_strategy: "steps"
+save_steps: 0.05
+save_total_limit: 1
+seed: 42
+temperature: 0.7
+wandb_entity: huggingface
+wandb_project: open-r1
+warmup_ratio: 0.1
+parallel_code_exec_per_proc: 4
--- a/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.09.yaml
+++ b/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.09.yaml
@ -0,0 +1,67 @@
+# Model arguments
+model_name_or_path: Qwen/Qwen2.5-7B-Instruct
+model_revision: main
+torch_dtype: bfloat16
+attn_implementation: flash_attention_2
+# Data training arguments
+dataset_name: open-r1/verifiable-coding-problems-python_decontaminated-tested-shuffled
+
+system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. Respond in the following format: <think>\n...\n</think>\n<answer>\n...\n</answer>"
+
+# GRPO trainer config
+callbacks:
+- push_to_hub_revision
+benchmarks:
+- lcb_v4
+beta: 0.000
+bf16: true
+do_eval: false
+eval_strategy: "no"
+# vLLM generation settings
+use_vllm: true
+vllm_device: auto
+vllm_gpu_memory_utilization: 0.7
+gradient_accumulation_steps: 16
+gradient_checkpointing: true
+gradient_checkpointing_kwargs:
+  use_reentrant: false
+hub_model_id: open-r1/Qwen2.5-7B-Instruct-GRPO
+hub_model_revision: v05.09
+hub_strategy: every_save
+learning_rate: 5.0e-06
+log_completions: true
+log_level: info
+logging_first_step: true
+logging_steps: 1
+logging_strategy: steps
+lr_scheduler_type: cosine_with_min_lr
+lr_scheduler_kwargs:
+  min_lr_rate: 0.1
+max_grad_norm: 0.2
+max_prompt_length: 1024
+max_completion_length: 4096
+max_steps: -1
+num_generations: 16
+num_iterations: 4
+num_train_epochs: 1.0
+output_dir: data/Qwen2.5-7B-Instruct-GRPO_v05.09
+overwrite_output_dir: true
+per_device_train_batch_size: 8
+push_to_hub: true
+report_to:
+- wandb
+reward_funcs:
+- binary_code
+- code_format
+reward_weights:
+- 1.0
+- 0.1
+save_strategy: "steps"
+save_steps: 0.05
+save_total_limit: 1
+seed: 42
+temperature: 0.7
+wandb_entity: huggingface
+wandb_project: open-r1
+warmup_ratio: 0.1
+parallel_code_exec_per_proc: 4
--- a/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.10.yaml
+++ b/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.10.yaml
@ -0,0 +1,67 @@
+# Model arguments
+model_name_or_path: Qwen/Qwen2.5-7B-Instruct
+model_revision: main
+torch_dtype: bfloat16
+attn_implementation: flash_attention_2
+# Data training arguments
+dataset_name: open-r1/verifiable-coding-problems-python_decontaminated-tested-shuffled
+
+system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. Respond in the following format: <think>\n...\n</think>\n<answer>\n...\n</answer>"
+
+# GRPO trainer config
+callbacks:
+- push_to_hub_revision
+benchmarks:
+- lcb_v4
+beta: 0.000
+bf16: true
+do_eval: false
+eval_strategy: "no"
+# vLLM generation settings
+use_vllm: true
+vllm_device: auto
+vllm_gpu_memory_utilization: 0.7
+gradient_accumulation_steps: 16
+gradient_checkpointing: true
+gradient_checkpointing_kwargs:
+  use_reentrant: false
+hub_model_id: open-r1/Qwen2.5-7B-Instruct-GRPO
+hub_model_revision: v05.10
+hub_strategy: every_save
+learning_rate: 1.0e-05
+log_completions: true
+log_level: info
+logging_first_step: true
+logging_steps: 1
+logging_strategy: steps
+lr_scheduler_type: cosine_with_min_lr
+lr_scheduler_kwargs:
+  min_lr_rate: 0.1
+max_grad_norm: 0.2
+max_prompt_length: 1024
+max_completion_length: 4096
+max_steps: -1
+num_generations: 16
+num_iterations: 4
+num_train_epochs: 1.0
+output_dir: data/Qwen2.5-7B-Instruct-GRPO_v05.10
+overwrite_output_dir: true
+per_device_train_batch_size: 8
+push_to_hub: true
+report_to:
+- wandb
+reward_funcs:
+- binary_code
+- code_format
+reward_weights:
+- 1.0
+- 0.1
+save_strategy: "steps"
+save_steps: 0.05
+save_total_limit: 1
+seed: 42
+temperature: 0.7
+wandb_entity: huggingface
+wandb_project: open-r1
+warmup_ratio: 0.1
+parallel_code_exec_per_proc: 4
--- a/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.11.yaml
+++ b/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.11.yaml
@ -0,0 +1,67 @@
+# Model arguments
+model_name_or_path: Qwen/Qwen2.5-7B-Instruct
+model_revision: main
+torch_dtype: bfloat16
+attn_implementation: flash_attention_2
+# Data training arguments
+dataset_name: open-r1/verifiable-coding-problems-python_decontaminated-tested-shuffled
+
+system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. Respond in the following format: <think>\n...\n</think>\n<answer>\n...\n</answer>"
+
+# GRPO trainer config
+callbacks:
+- push_to_hub_revision
+benchmarks:
+- lcb_v4
+beta: 0.000
+bf16: true
+do_eval: false
+eval_strategy: "no"
+# vLLM generation settings
+use_vllm: true
+vllm_device: auto
+vllm_gpu_memory_utilization: 0.7
+gradient_accumulation_steps: 16
+gradient_checkpointing: true
+gradient_checkpointing_kwargs:
+  use_reentrant: false
+hub_model_id: open-r1/Qwen2.5-7B-Instruct-GRPO
+hub_model_revision: v05.11
+hub_strategy: every_save
+learning_rate: 4.0e-05
+log_completions: true
+log_level: info
+logging_first_step: true
+logging_steps: 1
+logging_strategy: steps
+lr_scheduler_type: cosine_with_min_lr
+lr_scheduler_kwargs:
+  min_lr_rate: 0.1
+max_grad_norm: 0.2
+max_prompt_length: 1024
+max_completion_length: 4096
+max_steps: -1
+num_generations: 16
+num_iterations: 4
+num_train_epochs: 1.0
+output_dir: data/Qwen2.5-7B-Instruct-GRPO_v05.11
+overwrite_output_dir: true
+per_device_train_batch_size: 8
+push_to_hub: true
+report_to:
+- wandb
+reward_funcs:
+- binary_code
+- code_format
+reward_weights:
+- 1.0
+- 0.1
+save_strategy: "steps"
+save_steps: 0.05
+save_total_limit: 1
+seed: 42
+temperature: 0.7
+wandb_entity: huggingface
+wandb_project: open-r1
+warmup_ratio: 0.1
+parallel_code_exec_per_proc: 4
--- a/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.12.yaml
+++ b/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.12.yaml
@ -0,0 +1,67 @@
+# Model arguments
+model_name_or_path: Qwen/Qwen2.5-7B-Instruct
+model_revision: main
+torch_dtype: bfloat16
+attn_implementation: flash_attention_2
+# Data training arguments
+dataset_name: open-r1/verifiable-coding-problems-python_decontaminated-tested-shuffled
+
+system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. Respond in the following format: <think>\n...\n</think>\n<answer>\n...\n</answer>"
+
+# GRPO trainer config
+callbacks:
+- push_to_hub_revision
+benchmarks:
+- lcb_v4
+beta: 0.000
+bf16: true
+do_eval: false
+eval_strategy: "no"
+# vLLM generation settings
+use_vllm: true
+vllm_device: auto
+vllm_gpu_memory_utilization: 0.7
+gradient_accumulation_steps: 16
+gradient_checkpointing: true
+gradient_checkpointing_kwargs:
+  use_reentrant: false
+hub_model_id: open-r1/Qwen2.5-7B-Instruct-GRPO
+hub_model_revision: v05.12
+hub_strategy: every_save
+learning_rate: 5.0e-07
+log_completions: true
+log_level: info
+logging_first_step: true
+logging_steps: 1
+logging_strategy: steps
+lr_scheduler_type: cosine_with_min_lr
+lr_scheduler_kwargs:
+  min_lr_rate: 0.1
+max_grad_norm: 0.2
+max_prompt_length: 1024
+max_completion_length: 4096
+max_steps: -1
+num_generations: 16
+num_iterations: 4
+num_train_epochs: 1.0
+output_dir: data/Qwen2.5-7B-Instruct-GRPO_v05.12
+overwrite_output_dir: true
+per_device_train_batch_size: 8
+push_to_hub: true
+report_to:
+- wandb
+reward_funcs:
+- binary_code
+- code_format
+reward_weights:
+- 1.0
+- 0.1
+save_strategy: "steps"
+save_steps: 0.05
+save_total_limit: 1
+seed: 42
+temperature: 0.7
+wandb_entity: huggingface
+wandb_project: open-r1
+warmup_ratio: 0.1
+parallel_code_exec_per_proc: 4
--- a/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.13.yaml
+++ b/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.13.yaml
@ -0,0 +1,66 @@
+# Model arguments
+model_name_or_path: Qwen/Qwen2.5-7B-Instruct
+model_revision: main
+torch_dtype: bfloat16
+attn_implementation: flash_attention_2
+# Data training arguments
+dataset_name: open-r1/verifiable-coding-problems-python_decontaminated-tested-shuffled
+dataset_prompt_column: problem
+
+system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. Respond in the following format: <think>\n...\n</think>\n<answer>\n...\n</answer>"
+
+# GRPO trainer config
+callbacks:
+- push_to_hub_revision
+benchmarks:
+- lcb_v4
+beta: 0.000
+bf16: true
+do_eval: false
+eval_strategy: "no"
+# vLLM generation settings
+use_vllm: true
+vllm_device: auto
+vllm_gpu_memory_utilization: 0.7
+gradient_accumulation_steps: 16
+gradient_checkpointing: true
+gradient_checkpointing_kwargs:
+  use_reentrant: false
+hub_model_id: open-r1/Qwen2.5-7B-Instruct-GRPO
+hub_model_revision: v05.13
+hub_strategy: every_save
+learning_rate: 5.0e-07
+log_completions: true
+log_level: info
+logging_first_step: true
+logging_steps: 1
+logging_strategy: steps
+lr_scheduler_type: constant_with_warmup
+max_grad_norm: 0.2
+max_prompt_length: 1024
+max_completion_length: 4096
+max_steps: -1
+num_generations: 16
+num_iterations: 4
+num_train_epochs: 1.0
+output_dir: data/Qwen2.5-7B-Instruct-GRPO_v05.13
+overwrite_output_dir: true
+per_device_train_batch_size: 8
+push_to_hub: true
+report_to:
+- wandb
+reward_funcs:
+- binary_code
+- code_format
+reward_weights:
+- 1.0
+- 0.1
+save_strategy: "steps"
+save_steps: 0.05
+save_total_limit: 1
+seed: 42
+temperature: 0.7
+wandb_entity: huggingface
+wandb_project: open-r1
+warmup_ratio: 0.1
+parallel_code_exec_per_proc: 10
--- a/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.14.yaml
+++ b/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.14.yaml
@ -0,0 +1,66 @@
+# Model arguments
+model_name_or_path: Qwen/Qwen2.5-7B-Instruct
+model_revision: main
+torch_dtype: bfloat16
+attn_implementation: flash_attention_2
+# Data training arguments
+dataset_name: open-r1/verifiable-coding-problems-python_decontaminated-tested-shuffled
+dataset_prompt_column: problem
+
+system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. Respond in the following format: <think>\n...\n</think>\n<answer>\n...\n</answer>"
+
+# GRPO trainer config
+callbacks:
+- push_to_hub_revision
+benchmarks:
+- lcb_v4
+beta: 0.000
+bf16: true
+do_eval: false
+eval_strategy: "no"
+# vLLM generation settings
+use_vllm: true
+vllm_device: auto
+vllm_gpu_memory_utilization: 0.7
+gradient_accumulation_steps: 16
+gradient_checkpointing: true
+gradient_checkpointing_kwargs:
+  use_reentrant: false
+hub_model_id: open-r1/Qwen2.5-7B-Instruct-GRPO
+hub_model_revision: v05.14
+hub_strategy: every_save
+learning_rate: 5.0e-07
+log_completions: true
+log_level: info
+logging_first_step: true
+logging_steps: 1
+logging_strategy: steps
+lr_scheduler_type: constant_with_warmup
+max_grad_norm: 0.2
+max_prompt_length: 1024
+max_completion_length: 4096
+max_steps: -1
+num_generations: 16
+num_iterations: 4
+num_train_epochs: 1.0
+output_dir: data/Qwen2.5-7B-Instruct-GRPO_v05.14
+overwrite_output_dir: true
+per_device_train_batch_size: 8
+push_to_hub: true
+report_to:
+- wandb
+reward_funcs:
+- binary_code
+- code_format
+reward_weights:
+- 1.0
+- 0.1
+save_strategy: "steps"
+save_steps: 0.05
+save_total_limit: 1
+seed: 42
+temperature: 0.7
+wandb_entity: huggingface
+wandb_project: open-r1
+warmup_ratio: 0.01
+parallel_code_exec_per_proc: 10
--- a/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.15.yaml
+++ b/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.15.yaml
@ -0,0 +1,65 @@
+# Model arguments
+model_name_or_path: Qwen/Qwen2.5-7B-Instruct
+model_revision: main
+torch_dtype: bfloat16
+attn_implementation: flash_attention_2
+# Data training arguments
+dataset_name: open-r1/verifiable-coding-problems-python_decontaminated-tested-shuffled
+dataset_prompt_column: problem
+system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. Respond in the following format: <think>\n...\n</think>\n<answer>\n...\n</answer>"
+
+# GRPO trainer config
+callbacks:
+- push_to_hub_revision
+benchmarks:
+- lcb_v4
+beta: 0.000
+bf16: true
+do_eval: false
+eval_strategy: "no"
+# vLLM generation settings
+use_vllm: true
+vllm_device: auto
+vllm_gpu_memory_utilization: 0.7
+gradient_accumulation_steps: 16
+gradient_checkpointing: true
+gradient_checkpointing_kwargs:
+  use_reentrant: false
+hub_model_id: open-r1/Qwen2.5-7B-Instruct-GRPO
+hub_model_revision: v05.15
+hub_strategy: every_save
+learning_rate: 5.0e-07
+log_completions: true
+log_level: info
+logging_first_step: true
+logging_steps: 1
+logging_strategy: steps
+lr_scheduler_type: constant_with_warmup
+max_grad_norm: 1.0
+max_prompt_length: 1024
+max_completion_length: 4096
+max_steps: -1
+num_generations: 16
+num_iterations: 4
+num_train_epochs: 1.0
+output_dir: data/Qwen2.5-7B-Instruct-GRPO_v05.15
+overwrite_output_dir: true
+per_device_train_batch_size: 8
+push_to_hub: true
+report_to:
+- wandb
+reward_funcs:
+- binary_code
+- code_format
+reward_weights:
+- 1.0
+- 0.1
+save_strategy: "steps"
+save_steps: 0.05
+save_total_limit: 1
+seed: 42
+temperature: 0.7
+wandb_entity: huggingface
+wandb_project: open-r1
+warmup_ratio: 0.01
+parallel_code_exec_per_proc: 10
--- a/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.16.yaml
+++ b/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.16.yaml
@ -0,0 +1,67 @@
+# Model arguments
+model_name_or_path: Qwen/Qwen2.5-7B-Instruct
+model_revision: main
+torch_dtype: bfloat16
+attn_implementation: flash_attention_2
+# Data training arguments
+dataset_name: open-r1/verifiable-coding-problems-python_decontaminated-tested
+dataset_prompt_column: problem
+
+system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. Respond in the following format: <think>\n...\n</think>\n<answer>\n...\n</answer>"
+
+# GRPO trainer config
+callbacks:
+- push_to_hub_revision
+benchmarks:
+- lcb_v4
+beta: 0.000
+bf16: true
+do_eval: false
+eval_strategy: "no"
+# vLLM generation settings
+use_vllm: true
+vllm_device: auto
+vllm_gpu_memory_utilization: 0.7
+gradient_accumulation_steps: 16
+gradient_checkpointing: true
+gradient_checkpointing_kwargs:
+  use_reentrant: false
+hub_model_id: open-r1/Qwen2.5-7B-Instruct-GRPO
+hub_model_revision: v05.16
+hub_strategy: every_save
+learning_rate: 1.0e-06
+log_completions: true
+log_level: info
+logging_first_step: true
+logging_steps: 1
+logging_strategy: steps
+lr_scheduler_type: constant_with_warmup
+max_grad_norm: 0.2
+max_prompt_length: 1024
+max_completion_length: 4096
+max_steps: -1
+num_generations: 16
+num_iterations: 4
+num_train_epochs: 1.0
+output_dir: data/Qwen2.5-7B-Instruct-GRPO_v05.16
+overwrite_output_dir: true
+per_device_train_batch_size: 8
+push_to_hub: true
+report_to:
+- wandb
+reward_funcs:
+- binary_code
+- code_format
+e2b_router_url: "ip-10-53-85-124:8000"  # NOTE(review): looks like a cluster-internal hostname; confirm it should be committed rather than supplied per environment
+reward_weights:
+- 1.0
+- 0.1
+save_strategy: "steps"
+save_steps: 0.05
+save_total_limit: 1
+seed: 42
+temperature: 0.7
+wandb_entity: huggingface
+wandb_project: open-r1
+warmup_ratio: 0.1
+parallel_code_exec_per_proc: 10
--- a/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.17.yaml
+++ b/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.17.yaml
@ -0,0 +1,67 @@
+# Model arguments
+model_name_or_path: Qwen/Qwen2.5-7B-Instruct
+model_revision: main
+torch_dtype: bfloat16
+attn_implementation: flash_attention_2
+# Data training arguments
+dataset_name: open-r1/verifiable-coding-problems-python_decontaminated-tested
+dataset_prompt_column: problem
+
+system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. Respond in the following format: <think>\n...\n</think>\n<answer>\n...\n</answer>"
+
+# GRPO trainer config
+callbacks:
+- push_to_hub_revision
+benchmarks:
+- lcb_v4
+beta: 0.000
+bf16: true
+do_eval: false
+eval_strategy: "no"
+# vLLM generation settings
+use_vllm: true
+vllm_device: auto
+vllm_gpu_memory_utilization: 0.7
+gradient_accumulation_steps: 16
+gradient_checkpointing: true
+gradient_checkpointing_kwargs:
+  use_reentrant: false
+hub_model_id: open-r1/Qwen2.5-7B-Instruct-GRPO
+hub_model_revision: v05.17
+hub_strategy: every_save
+learning_rate: 1.0e-06
+log_completions: true
+log_level: info
+logging_first_step: true
+logging_steps: 1
+logging_strategy: steps
+lr_scheduler_type: constant_with_warmup
+max_grad_norm: 0.2
+max_prompt_length: 1024
+max_completion_length: 4096
+max_steps: -1
+num_generations: 16
+num_iterations: 4
+num_train_epochs: 1.0
+output_dir: data/Qwen2.5-7B-Instruct-GRPO_v05.17
+overwrite_output_dir: true
+per_device_train_batch_size: 8
+push_to_hub: true
+report_to:
+- wandb
+reward_funcs:
+- binary_code
+- code_format
+e2b_router_url: "ip-10-53-85-124:8000"  # NOTE(review): looks like a cluster-internal hostname; confirm it should be committed rather than supplied per environment
+reward_weights:
+- 1.0
+- 0.1
+save_strategy: "steps"
+save_steps: 0.05
+save_total_limit: 1
+seed: 42
+temperature: 0.4
+wandb_entity: huggingface
+wandb_project: open-r1
+warmup_ratio: 0.1
+parallel_code_exec_per_proc: 10
--- a/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.18.yaml
+++ b/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.18.yaml
@ -0,0 +1,67 @@
+# Model arguments
+model_name_or_path: Qwen/Qwen2.5-7B-Instruct
+model_revision: main
+torch_dtype: bfloat16
+attn_implementation: flash_attention_2
+# Data training arguments
+dataset_name: open-r1/verifiable-coding-problems-python_decontaminated-tested
+dataset_prompt_column: problem
+
+system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. Respond in the following format: <think>\n...\n</think>\n<answer>\n...\n</answer>"
+
+# GRPO trainer config
+callbacks:
+- push_to_hub_revision
+benchmarks:
+- lcb_v4
+beta: 0.000
+bf16: true
+do_eval: false
+eval_strategy: "no"
+# vLLM generation settings
+use_vllm: true
+vllm_device: auto
+vllm_gpu_memory_utilization: 0.7
+gradient_accumulation_steps: 16
+gradient_checkpointing: true
+gradient_checkpointing_kwargs:
+  use_reentrant: false
+hub_model_id: open-r1/Qwen2.5-7B-Instruct-GRPO
+hub_model_revision: v05.18
+hub_strategy: every_save
+learning_rate: 1.0e-06
+log_completions: true
+log_level: info
+logging_first_step: true
+logging_steps: 1
+logging_strategy: steps
+lr_scheduler_type: constant_with_warmup
+max_grad_norm: 0.2
+max_prompt_length: 1024
+max_completion_length: 4096
+max_steps: -1
+num_generations: 16
+num_iterations: 4
+num_train_epochs: 1.0
+output_dir: data/Qwen2.5-7B-Instruct-GRPO_v05.18
+overwrite_output_dir: true
+per_device_train_batch_size: 8
+push_to_hub: true
+report_to:
+- wandb
+reward_funcs:
+- binary_code
+- code_format
+e2b_router_url: "ip-10-53-85-124:8000"  # NOTE(review): looks like a cluster-internal hostname; confirm it should be committed rather than supplied per environment
+reward_weights:
+- 1.0
+- 0.1
+save_strategy: "steps"
+save_steps: 0.05
+save_total_limit: 1
+seed: 42
+temperature: 1.0
+wandb_entity: huggingface
+wandb_project: open-r1
+warmup_ratio: 0.1
+parallel_code_exec_per_proc: 10
--- a/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.20.yaml
+++ b/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.20.yaml
@ -0,0 +1,69 @@
+# Model arguments
+model_name_or_path: Qwen/Qwen2.5-7B-Instruct
+model_revision: main
+torch_dtype: bfloat16
+attn_implementation: flash_attention_2
+# Data training arguments
+dataset_name: open-r1/verifiable-coding-problems-python_decontaminated-tested
+dataset_prompt_column: problem
+
+system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. Respond in the following format: <think>\n...\n</think>\n<answer>\n...\n</answer>"
+
+# GRPO trainer config
+callbacks:
+- push_to_hub_revision
+benchmarks:
+- lcb_v4
+beta: 0.000
+bf16: true
+do_eval: false
+eval_strategy: "no"
+# vLLM generation settings
+use_vllm: true
+vllm_device: auto
+vllm_gpu_memory_utilization: 0.7
+gradient_accumulation_steps: 16
+gradient_checkpointing: true
+gradient_checkpointing_kwargs:
+  use_reentrant: false
+hub_model_id: open-r1/Qwen2.5-7B-Instruct-GRPO
+hub_model_revision: v05.20
+hub_strategy: every_save
+learning_rate: 1.0e-06
+log_completions: true
+log_level: info
+logging_first_step: true
+logging_steps: 1
+logging_strategy: steps
+lr_scheduler_type: constant_with_warmup
+max_grad_norm: 0.2
+max_prompt_length: 1024
+max_completion_length: 4096
+max_steps: -1
+num_generations: 16
+num_iterations: 4
+num_train_epochs: 1.0
+output_dir: data/Qwen2.5-7B-Instruct-GRPO_v05.20
+overwrite_output_dir: true
+per_device_train_batch_size: 8
+push_to_hub: true
+report_to:
+- wandb
+reward_funcs:
+- binary_code
+- code_format
+e2b_router_url: "ip-10-53-86-47:8000"  # NOTE(review): looks like a cluster-internal hostname; confirm it should be committed rather than supplied per environment
+reward_weights:
+- 1.0
+- 0.1
+save_strategy: "steps"
+save_steps: 0.1
+save_total_limit: 1
+seed: 42
+temperature: 0.7
+wandb_entity: huggingface
+wandb_project: open-r1
+warmup_ratio: 0.1
+parallel_code_exec_per_proc: 10
+
+mask_truncated_completions: true
--- a/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.30.yaml
+++ b/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.30.yaml
@ -0,0 +1,70 @@
+# Model arguments
+model_name_or_path: Qwen/Qwen2.5-7B-Instruct
+model_revision: main
+torch_dtype: bfloat16
+attn_implementation: flash_attention_2
+# Data training arguments
+dataset_name: open-r1/verifiable-coding-problems-python_decontaminated-tested-shuffled
+dataset_prompt_column: problem
+
+system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. Respond in the following format: <think>\n...\n</think>\n<answer>\n...\n</answer>"
+
+# GRPO trainer config
+callbacks:
+- push_to_hub_revision
+benchmarks:
+- lcb_v4
+beta: 0.000
+bf16: true
+do_eval: false
+eval_strategy: "no"
+# vLLM generation settings
+use_vllm: true
+vllm_device: auto
+vllm_gpu_memory_utilization: 0.7
+gradient_accumulation_steps: 16
+gradient_checkpointing: true
+gradient_checkpointing_kwargs:
+  use_reentrant: false
+hub_model_id: open-r1/Qwen2.5-7B-Instruct-GRPO
+hub_model_revision: v05.30
+hub_strategy: every_save
+learning_rate: 1.0e-06
+log_completions: true
+log_level: info
+logging_first_step: true
+logging_steps: 1
+logging_strategy: steps
+lr_scheduler_type: constant_with_warmup
+max_grad_norm: 0.2
+max_prompt_length: 1024
+max_completion_length: 4096
+max_steps: -1
+num_generations: 16
+num_iterations: 4
+num_train_epochs: 1.0
+output_dir: data/Qwen2.5-7B-Instruct-GRPO_v05.30
+overwrite_output_dir: true
+per_device_train_batch_size: 8
+push_to_hub: true
+report_to:
+- wandb
+reward_funcs:
+- binary_code
+- code_format
+e2b_router_url: "ip-10-53-86-47:8000"  # NOTE(review): looks like a cluster-internal hostname; confirm it should be committed rather than supplied per environment
+reward_weights:
+- 1.0
+- 0.1
+save_strategy: "steps"
+save_steps: 0.1
+save_total_limit: 1
+seed: 42
+temperature: 0.7
+wandb_entity: huggingface
+wandb_project: open-r1
+warmup_ratio: 0.1
+parallel_code_exec_per_proc: 10
+
+mask_truncated_completions: true
+loss_type: dr_grpo
--- a/recipes/Qwen2.5-Coder-7B-Instruct/grpo/config_v02.00.yaml
+++ b/recipes/Qwen2.5-Coder-7B-Instruct/grpo/config_v02.00.yaml
@ -0,0 +1,64 @@
+# Model arguments
+model_name_or_path: Qwen/Qwen2.5-Coder-7B-Instruct
+model_revision: main
+torch_dtype: bfloat16
+attn_implementation: flash_attention_2
+# Data training arguments
+dataset_name: open-r1/verifiable-coding-problems-python_decontaminated-tested
+
+system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. Respond in the following format: <think>\n...\n</think>\n<answer>\n...\n</answer>"
+
+# GRPO trainer config
+callbacks:
+- push_to_hub_revision
+benchmarks:
+- math_500
+- aime24
+beta: 0.001
+bf16: true
+do_eval: false
+eval_strategy: "no"
+# vLLM generation settings
+use_vllm: true
+vllm_device: auto
+vllm_gpu_memory_utilization: 0.7
+gradient_accumulation_steps: 14
+gradient_checkpointing: true
+gradient_checkpointing_kwargs:
+  use_reentrant: false
+hub_model_id: open-r1/Qwen2.5-Coder-7B-Instruct-GRPO
+hub_model_revision: v02.00
+hub_strategy: every_save
+learning_rate: 1.0e-06
+log_completions: true
+log_level: info
+logging_first_step: true
+logging_steps: 1
+logging_strategy: steps
+lr_scheduler_type: constant_with_warmup
+max_grad_norm: 0.2
+max_prompt_length: 1024
+max_completion_length: 4096
+max_steps: -1
+num_generations: 14
+num_train_epochs: 0.1
+output_dir: data/Qwen2.5-Coder-7B-Instruct-GRPO_v02.00
+overwrite_output_dir: true
+per_device_train_batch_size: 4
+push_to_hub: true
+report_to:
+- wandb
+reward_funcs:
+- code
+- code_format
+reward_weights:
+- 1.0
+- 0.1
+save_strategy: "steps"
+save_steps: 0.1
+save_total_limit: 1
+seed: 42
+temperature: 0.7
+wandb_entity: huggingface
+wandb_project: open-r1
+warmup_ratio: 0.1
--- a/recipes/Qwen2.5-Coder-7B-Instruct/grpo/config_v03.00.yaml
+++ b/recipes/Qwen2.5-Coder-7B-Instruct/grpo/config_v03.00.yaml
@ -0,0 +1,64 @@
+# Model arguments
+model_name_or_path: Qwen/Qwen2.5-7B-Instruct  # NOTE(review): non-Coder checkpoint in a Qwen2.5-Coder recipe (hub_model_id and output_dir say Coder) - confirm this is intended
+model_revision: main
+torch_dtype: bfloat16
+attn_implementation: flash_attention_2
+# Data training arguments
+dataset_name: open-r1/verifiable-coding-problems-python_decontaminated-tested
+
+system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. Respond in the following format: <think>\n...\n</think>\n<answer>\n...\n</answer>"
+
+# GRPO trainer config
+callbacks:
+- push_to_hub_revision
+benchmarks:
+- math_500
+- aime24
+beta: 0.001
+bf16: true
+do_eval: false
+eval_strategy: "no"
+# vLLM generation settings
+use_vllm: true
+vllm_device: auto
+vllm_gpu_memory_utilization: 0.7
+gradient_accumulation_steps: 14
+gradient_checkpointing: true
+gradient_checkpointing_kwargs:
+  use_reentrant: false
+hub_model_id: open-r1/Qwen2.5-Coder-7B-Instruct-GRPO
+hub_model_revision: v03.00
+hub_strategy: every_save
+learning_rate: 1.0e-06
+log_completions: true
+log_level: info
+logging_first_step: true
+logging_steps: 1
+logging_strategy: steps
+lr_scheduler_type: constant_with_warmup
+max_grad_norm: 0.2
+max_prompt_length: 1024
+max_completion_length: 4096
+max_steps: -1
+num_generations: 14
+num_train_epochs: 0.1
+output_dir: data/Qwen2.5-Coder-7B-Instruct-GRPO_v03.00
+overwrite_output_dir: true
+per_device_train_batch_size: 4
+push_to_hub: true
+report_to:
+- wandb
+reward_funcs:
+- code
+- code_format
+reward_weights:
+- 1.0
+- 0.1
+save_strategy: "steps"
+save_steps: 0.1
+save_total_limit: 1
+seed: 42
+temperature: 0.7
+wandb_entity: huggingface
+wandb_project: open-r1
+warmup_ratio: 0.1
--- a/src/open_r1/rewards.py
+++ b/src/open_r1/rewards.py
@ -387,14 +387,31 @ def extract_code(completion: str, language: str = "python") -> str:
 def binary_code_reward(completions, num_parallel: int = 2, e2b_router_url=None, **kwargs) -> list[float]:
+    """Map each `code_reward` score to 1.0 if it exceeds 0.99, else 0.0; `None` scores stay `None`."""
    rewards = code_reward(completions, num_parallel=num_parallel, e2b_router_url=e2b_router_url, **kwargs)
    BINARY_THRESHOLD = 0.99

    output = []
    for reward in rewards:
        if reward is None:
            output.append(None)
        else:
            output.append(1.0 if reward > BINARY_THRESHOLD else 0.0)
+
+    return output

+def weighted_binary_code_reward(completions, num_parallel: int = 2, e2b_router_url=None, **kwargs) -> list[float]:
+    """Binary pass/fail reward plus a small partial-credit bonus.
+
+    Each score from `code_reward` becomes `(1.0 if score > 0.99 else 0.0) + 0.1 * score`; a score of
+    `None` is passed through unchanged, so the list may contain `None` despite the annotation.
+    """
+    rewards = code_reward(completions, num_parallel=num_parallel, e2b_router_url=e2b_router_url, **kwargs)
+    BINARY_THRESHOLD = 0.99
+    NON_BINARY_WEIGHT = 0.1
+
+    output = [
+        None if reward is None else (1.0 if reward > BINARY_THRESHOLD else 0.0) + NON_BINARY_WEIGHT * reward
+        for reward in rewards
+    ]
    return output


@ -596,6 +612,14 @@ def get_reward_funcs(script_args) -> list[Callable]:
            ),
            binary_code_reward,
        ),
+        "weighted_binary_code_reward": update_wrapper(
+            partial(
+                weighted_binary_code_reward,
+                num_parallel=script_args.parallel_code_exec_per_proc,
+                e2b_router_url=script_args.e2b_router_url,
+            ),
+            weighted_binary_code_reward,
+        ),
        "ioi_code": update_wrapper(
            partial(ioi_code_reward, test_batch_size=script_args.code_eval_test_batch_size), ioi_code_reward
        ),
Author	SHA1	Message	Date
Lewis Tunstall	1bab913b88	Merge branch 'main' into code-grpo-configs	2025-04-26 12:01:21 +02:00
edbeeching	b3c7971ebb	Merge branch 'main' into code-grpo-configs	2025-04-18 11:41:27 +00:00
edbeeching	9b6c9704da	update trl version in setup	2025-04-18 11:38:38 +00:00
edbeeching	0662164248	add new reward, configs	2025-04-18 11:38:17 +00:00
edbeeching	7ddc0282cd	save wip	2025-04-17 12:08:12 +00:00
edbeeching	243db805c2	bin reward	2025-04-11 15:38:22 +00:00
edbeeching	3684ab2575	Merge branch 'main' into code-grpo-configs	2025-04-11 15:12:50 +00:00
edbeeching	98cbed7596	add WIP code GRPO configs	2025-04-11 09:26:05 +00:00