Merge branch 'main' into r1-zero

2026-06-24 01:54:06 +00:00 · 2025-05-25 12:04:07 +02:00 · 2025-05-10 08:01:02 +00:00 · 2025-05-09 20:09:15 +00:00 · 2025-05-08 16:16:29 +02:00 · 2025-05-07 08:00:23 +00:00
15 changed files with 945 additions and 23 deletions
--- a/recipes/OpenR1-Zero-32B-Math/grpo/config_v00.00.yaml
+++ b/recipes/OpenR1-Zero-32B-Math/grpo/config_v00.00.yaml
@ -0,0 +1,68 @@
+# Config for 4 + 1 nodes
+# Model arguments
+model_name_or_path: Qwen/Qwen2.5-32B
+model_revision: main
+torch_dtype: bfloat16
+attn_implementation: flash_attention_2
+
+# Data training arguments
+chat_template: "{%- if messages[0]['role'] == 'system' %}\n{{- messages[0]['content'] }}\n{%- else %}\n{{- 'A conversation between User and Assistant. The user asks a question, and the Assistant solves it. The assistant first thinks about the reasoning process in the mind and then provides the user with the answer. The reasoning process and answer are enclosed within <think>...</think> and <answer>...</answer> tags, respectively, i.e., \\n<think>\\nreasoning process here\\n</think>\\n<answer>\\nanswer here\\n</answer>.' }}\n{%- endif %}\n{%- for message in messages %}\n    {%- if message['role'] == 'user' %}\n        {{- '\\n\\nUser: ' + message['content'].strip() }}\n    {%- elif message['role'] == 'system' %}\n        {{- message['content'] }}\n    {%- elif message['role'] == 'assistant' %}\n        {{- '\\n\\nAssistant: '  + message['content'] }}\n    {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n    {{- '\\n\\nAssistant: ' }}\n{%- endif %}"
+dataset_name: open-r1/DAPO-Math-17k-Processed
+dataset_config: all
+
+# GRPO trainer config
+callbacks:
+- push_to_hub_revision
+benchmarks:
+- math_500
+- aime24
+beta: 0.0
+bf16: true
+# Evaluation is disabled for this run
+do_eval: false
+eval_strategy: "no"
+use_vllm: true
+gradient_accumulation_steps: 16
+gradient_checkpointing: true
+# gradient_checkpointing_kwargs:
+#   use_reentrant: false
+hub_model_id: open-r1/R1-Zero-Qwen-32B-Math
+hub_model_revision: v00.00
+hub_strategy: every_save
+learning_rate: 1.0e-06
+log_completions: true
+log_level: info
+logging_first_step: true
+logging_steps: 1
+logging_strategy: steps
+lr_scheduler_type: constant_with_warmup
+mask_truncated_completions: true
+max_grad_norm: 0.2
+max_prompt_length: 1024
+max_completion_length: 8192
+max_steps: -1
+num_generations: 16
+num_iterations: 1
+num_train_epochs: 1
+output_dir: data/R1-Zero-Qwen-32B-Math-v00.00
+overwrite_output_dir: true
+per_device_train_batch_size: 1
+push_to_hub: true
+report_to:
+- wandb
+reward_funcs:
+- accuracy
+- format
+- soft_format
+reward_weights:
+- 1.0
+- 0.25
+- 0.25
+save_strategy: "steps"
+save_steps: 0.1
+save_total_limit: 1
+sync_ref_model: false
+seed: 42
+temperature: 1.0
+vllm_server_timeout: 1200
+warmup_ratio: 0.1
--- a/recipes/OpenR1-Zero-7B-Code/grpo/config_v00.00.yaml
+++ b/recipes/OpenR1-Zero-7B-Code/grpo/config_v00.00.yaml
@ -0,0 +1,70 @@
+# Config for 1 + 1 nodes
+# Model arguments
+model_name_or_path: Qwen/Qwen2.5-7B
+model_revision: main
+torch_dtype: bfloat16
+attn_implementation: flash_attention_2
+
+# Data training arguments
+chat_template: "{%- if messages[0]['role'] == 'system' %}\n{{- messages[0]['content'] }}\n{%- else %}\n{{- 'A conversation between User and Assistant. The user asks a question, and the Assistant solves it. The assistant first thinks about the reasoning process in the mind and then provides the user with the answer. The reasoning process and answer are enclosed within <think>...</think> and <answer>...</answer> tags, respectively, i.e., \\n<think>\\nreasoning process here\\n</think>\\n<answer>\\nanswer here\\n</answer>.' }}\n{%- endif %}\n{%- for message in messages %}\n    {%- if message['role'] == 'user' %}\n        {{- '\\n\\nUser: ' + message['content'].strip() }}\n    {%- elif message['role'] == 'system' %}\n        {{- message['content'] }}\n    {%- elif message['role'] == 'assistant' %}\n        {{- '\\n\\nAssistant: '  + message['content'] }}\n    {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n    {{- '\\n\\nAssistant: ' }}\n{%- endif %}"
+dataset_name: open-r1/verifiable-coding-problems-python_decontaminated-tested-shuffled
+dataset_prompt_column: problem
+
+# GRPO trainer config
+callbacks:
+- push_to_hub_revision
+benchmarks:
+- math_500
+- aime24
+beta: 0.0
+bf16: true
+# Evaluation is disabled for this run
+do_eval: false
+eval_strategy: "no"
+use_vllm: true
+gradient_accumulation_steps: 16
+generation_batch_size: 512
+gradient_checkpointing: true
+gradient_checkpointing_kwargs:
+  use_reentrant: false
+hub_model_id: open-r1/R1-Zero-Qwen-7B-Code
+hub_model_revision: v00.00
+hub_strategy: every_save
+learning_rate: 1.0e-06
+log_completions: true
+log_level: info
+logging_first_step: true
+logging_steps: 1
+logging_strategy: steps
+lr_scheduler_type: constant_with_warmup
+mask_truncated_completions: true
+max_grad_norm: 0.2
+max_prompt_length: 1024
+max_completion_length: 8192
+max_steps: -1
+num_generations: 16
+num_iterations: 1
+num_train_epochs: 1
+output_dir: data/R1-Zero-Qwen-7B-Code-v00.00
+overwrite_output_dir: true
+per_device_train_batch_size: 4
+push_to_hub: true
+report_to:
+- wandb
+reward_funcs:
+- weighted_binary_code_reward
+- format
+- soft_format
+reward_weights:
+- 1.0
+- 0.25
+- 0.25
+e2b_router_url: "ip-10-53-83-71:8000"  # NOTE(review): host-specific address; point this at your own E2B router
+save_strategy: "steps"
+save_steps: 0.1
+save_total_limit: 1
+sync_ref_model: false
+seed: 42
+temperature: 1.0
+warmup_ratio: 0.0
+epsilon: 0.2
--- a/recipes/OpenR1-Zero-7B-Math/grpo/config_v00.00.yaml
+++ b/recipes/OpenR1-Zero-7B-Math/grpo/config_v00.00.yaml
@ -0,0 +1,64 @@
+# Config for 1 + 1 nodes
+# Model arguments
+model_name_or_path: Qwen/Qwen2.5-7B
+model_revision: main
+torch_dtype: bfloat16
+attn_implementation: flash_attention_2
+
+# Data training arguments
+chat_template: "{%- if messages[0]['role'] == 'system' %}\n{{- messages[0]['content'] }}\n{%- else %}\n{{- 'A conversation between User and Assistant. The user asks a question, and the Assistant solves it. The assistant first thinks about the reasoning process in the mind and then provides the user with the answer. The reasoning process and answer are enclosed within <think>...</think> and <answer>...</answer> tags, respectively, i.e., \\n<think>\\nreasoning process here\\n</think>\\n<answer>\\nanswer here\\n</answer>.' }}\n{%- endif %}\n{%- for message in messages %}\n    {%- if message['role'] == 'user' %}\n        {{- '\\n\\nUser: ' + message['content'].strip() }}\n    {%- elif message['role'] == 'system' %}\n        {{- message['content'] }}\n    {%- elif message['role'] == 'assistant' %}\n        {{- '\\n\\nAssistant: '  + message['content'] }}\n    {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n    {{- '\\n\\nAssistant: ' }}\n{%- endif %}"
+dataset_name: open-r1/Big-Math-RL-Verified-Processed
+dataset_config: all
+
+# GRPO trainer config
+callbacks:
+- push_to_hub_revision
+benchmarks:
+- math_500
+beta: 0.001
+bf16: true
+# Evaluation is disabled for this run
+do_eval: false
+eval_strategy: "no"
+use_vllm: true
+gradient_accumulation_steps: 16
+gradient_checkpointing: true
+gradient_checkpointing_kwargs:
+  use_reentrant: false
+hub_model_id: open-r1/R1-Zero-Qwen-7B-Math
+hub_model_revision: v00.00
+hub_strategy: every_save
+learning_rate: 1.0e-06
+log_completions: true
+log_level: info
+logging_first_step: true
+logging_steps: 1
+logging_strategy: steps
+lr_scheduler_type: constant_with_warmup
+max_grad_norm: 0.2
+max_prompt_length: 1024
+max_completion_length: 8192
+max_steps: -1
+num_generations: 16
+num_train_epochs: 0.1 # 21.6k prompts
+output_dir: data/R1-Zero-Qwen-7B-Math-v00.00
+overwrite_output_dir: true
+per_device_train_batch_size: 4
+push_to_hub: true
+report_to:
+- wandb
+reward_funcs:
+- accuracy
+- format
+reward_weights:
+- 1.0
+- 0.2
+save_strategy: "steps"
+save_steps: 0.1
+save_total_limit: 1
+sync_ref_model: true
+ref_model_sync_steps: 100
+ref_model_mixup_alpha: 1.0
+seed: 42
+temperature: 1.0
+warmup_ratio: 0.1
--- a/recipes/OpenR1-Zero-7B-Math/grpo/config_v01.00.yaml
+++ b/recipes/OpenR1-Zero-7B-Math/grpo/config_v01.00.yaml
@ -0,0 +1,66 @@
+# Config for 1 + 1 nodes
+# Model arguments
+model_name_or_path: Qwen/Qwen2.5-7B
+model_revision: main
+torch_dtype: bfloat16
+attn_implementation: flash_attention_2
+
+# Data training arguments
+chat_template: "{%- if messages[0]['role'] == 'system' %}\n{{- messages[0]['content'] }}\n{%- else %}\n{{- 'A conversation between User and Assistant. The user asks a question, and the Assistant solves it. The assistant first thinks about the reasoning process in the mind and then provides the user with the answer. The reasoning process and answer are enclosed within <think>...</think> and <answer>...</answer> tags, respectively, i.e., \\n<think>\\nreasoning process here\\n</think>\\n<answer>\\nanswer here\\n</answer>.' }}\n{%- endif %}\n{%- for message in messages %}\n    {%- if message['role'] == 'user' %}\n        {{- '\\n\\nUser: ' + message['content'].strip() }}\n    {%- elif message['role'] == 'system' %}\n        {{- message['content'] }}\n    {%- elif message['role'] == 'assistant' %}\n        {{- '\\n\\nAssistant: '  + message['content'] }}\n    {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n    {{- '\\n\\nAssistant: ' }}\n{%- endif %}"
+dataset_name: open-r1/Big-Math-RL-Verified-Processed
+dataset_config: all
+
+# GRPO trainer config
+callbacks:
+- push_to_hub_revision
+benchmarks:
+- math_500
+- aime24
+beta: 0.0
+bf16: true
+# Evaluation is disabled for this run
+do_eval: false
+eval_strategy: "no"
+use_vllm: true
+gradient_accumulation_steps: 16
+gradient_checkpointing: true
+gradient_checkpointing_kwargs:
+  use_reentrant: false
+hub_model_id: open-r1/R1-Zero-Qwen-7B-Math
+hub_model_revision: v01.00
+hub_strategy: every_save
+learning_rate: 1.0e-06
+log_completions: true
+log_level: info
+logging_first_step: true
+logging_steps: 1
+logging_strategy: steps
+lr_scheduler_type: constant_with_warmup
+mask_truncated_completions: true
+max_grad_norm: 0.2
+max_prompt_length: 1024
+max_completion_length: 8192
+max_steps: -1
+num_generations: 16
+num_train_epochs: 0.1 # 21.6k prompts
+output_dir: data/R1-Zero-Qwen-7B-Math-v01.00
+overwrite_output_dir: true
+per_device_train_batch_size: 4
+push_to_hub: true
+report_to:
+- wandb
+reward_funcs:
+- accuracy
+- format
+- soft_format
+reward_weights:
+- 1.0
+- 0.25
+- 0.25
+save_strategy: "steps"
+save_steps: 0.1
+save_total_limit: 1
+sync_ref_model: false
+seed: 42
+temperature: 1.0
+warmup_ratio: 0.1
--- a/recipes/OpenR1-Zero-7B-Math/grpo/config_v02.00.yaml
+++ b/recipes/OpenR1-Zero-7B-Math/grpo/config_v02.00.yaml
@ -0,0 +1,67 @@
+# Config for 1 + 1 nodes
+# Model arguments
+model_name_or_path: Qwen/Qwen2.5-7B
+model_revision: main
+torch_dtype: bfloat16
+attn_implementation: flash_attention_2
+
+# Data training arguments
+chat_template: "{%- if messages[0]['role'] == 'system' %}\n{{- messages[0]['content'] }}\n{%- else %}\n{{- 'A conversation between User and Assistant. The user asks a question, and the Assistant solves it. The assistant first thinks about the reasoning process in the mind and then provides the user with the answer. The reasoning process and answer are enclosed within <think>...</think> and <answer>...</answer> tags, respectively, i.e., \\n<think>\\nreasoning process here\\n</think>\\n<answer>\\nanswer here\\n</answer>.' }}\n{%- endif %}\n{%- for message in messages %}\n    {%- if message['role'] == 'user' %}\n        {{- '\\n\\nUser: ' + message['content'].strip() }}\n    {%- elif message['role'] == 'system' %}\n        {{- message['content'] }}\n    {%- elif message['role'] == 'assistant' %}\n        {{- '\\n\\nAssistant: '  + message['content'] }}\n    {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n    {{- '\\n\\nAssistant: ' }}\n{%- endif %}"
+dataset_name: open-r1/Big-Math-RL-Verified-Processed
+dataset_config: level_2_3_4_5
+
+# GRPO trainer config
+callbacks:
+- push_to_hub_revision
+benchmarks:
+- math_500
+- aime24
+beta: 0.0
+bf16: true
+# Evaluation is disabled for this run
+do_eval: false
+eval_strategy: "no"
+use_vllm: true
+gradient_accumulation_steps: 16
+gradient_checkpointing: true
+gradient_checkpointing_kwargs:
+  use_reentrant: false
+hub_model_id: open-r1/R1-Zero-Qwen-7B-Math
+hub_model_revision: v02.00
+hub_strategy: every_save
+learning_rate: 1.0e-06
+log_completions: true
+log_level: info
+logging_first_step: true
+logging_steps: 1
+logging_strategy: steps
+lr_scheduler_type: constant_with_warmup
+mask_truncated_completions: true
+max_grad_norm: 0.2
+max_prompt_length: 1024
+max_completion_length: 8192
+max_steps: -1
+num_generations: 16
+num_iterations: 1
+num_train_epochs: 0.12 # 19.9k prompts
+output_dir: data/R1-Zero-Qwen-7B-Math-v02.00
+overwrite_output_dir: true
+per_device_train_batch_size: 4
+push_to_hub: true
+report_to:
+- wandb
+reward_funcs:
+- accuracy
+- format
+- soft_format
+reward_weights:
+- 1.0
+- 0.25
+- 0.25
+save_strategy: "steps"
+save_steps: 0.1
+save_total_limit: 1
+sync_ref_model: false
+seed: 42
+temperature: 1.0
+warmup_ratio: 0.1
--- a/recipes/OpenR1-Zero-7B-Math/grpo/config_v03.00.yaml
+++ b/recipes/OpenR1-Zero-7B-Math/grpo/config_v03.00.yaml
@ -0,0 +1,67 @@
+# Config for 1 + 1 nodes
+# Model arguments
+model_name_or_path: Qwen/Qwen2.5-7B
+model_revision: main
+torch_dtype: bfloat16
+attn_implementation: flash_attention_2
+
+# Data training arguments
+chat_template: "{%- if messages[0]['role'] == 'system' %}\n{{- messages[0]['content'] }}\n{%- else %}\n{{- 'A conversation between User and Assistant. The user asks a question, and the Assistant solves it. The assistant first thinks about the reasoning process in the mind and then provides the user with the answer. The reasoning process and answer are enclosed within <think>...</think> and <answer>...</answer> tags, respectively, i.e., \\n<think>\\nreasoning process here\\n</think>\\n<answer>\\nanswer here\\n</answer>.' }}\n{%- endif %}\n{%- for message in messages %}\n    {%- if message['role'] == 'user' %}\n        {{- '\\n\\nUser: ' + message['content'].strip() }}\n    {%- elif message['role'] == 'system' %}\n        {{- message['content'] }}\n    {%- elif message['role'] == 'assistant' %}\n        {{- '\\n\\nAssistant: '  + message['content'] }}\n    {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n    {{- '\\n\\nAssistant: ' }}\n{%- endif %}"
+dataset_name: open-r1/Big-Math-RL-Verified-Processed
+dataset_config: level_3_4_5
+
+# GRPO trainer config
+callbacks:
+- push_to_hub_revision
+benchmarks:
+- math_500
+- aime24
+beta: 0.0
+bf16: true
+# Evaluation is disabled for this run
+do_eval: false
+eval_strategy: "no"
+use_vllm: true
+gradient_accumulation_steps: 16
+gradient_checkpointing: true
+gradient_checkpointing_kwargs:
+  use_reentrant: false
+hub_model_id: open-r1/R1-Zero-Qwen-7B-Math
+hub_model_revision: v03.00
+hub_strategy: every_save
+learning_rate: 1.0e-06
+log_completions: true
+log_level: info
+logging_first_step: true
+logging_steps: 1
+logging_strategy: steps
+lr_scheduler_type: constant_with_warmup
+mask_truncated_completions: true
+max_grad_norm: 0.2
+max_prompt_length: 1024
+max_completion_length: 8192
+max_steps: -1
+num_generations: 16
+num_iterations: 1
+num_train_epochs: 0.16 # 19.5k prompts
+output_dir: data/R1-Zero-Qwen-7B-Math-v03.00
+overwrite_output_dir: true
+per_device_train_batch_size: 4
+push_to_hub: true
+report_to:
+- wandb
+reward_funcs:
+- accuracy
+- format
+- soft_format
+reward_weights:
+- 1.0
+- 0.25
+- 0.25
+save_strategy: "steps"
+save_steps: 0.1
+save_total_limit: 1
+sync_ref_model: false
+seed: 42
+temperature: 1.0
+warmup_ratio: 0.1
--- a/recipes/OpenR1-Zero-7B-Math/grpo/config_v04.00.yaml
+++ b/recipes/OpenR1-Zero-7B-Math/grpo/config_v04.00.yaml
@ -0,0 +1,67 @@
+# Config for 1 + 1 nodes
+# Model arguments
+model_name_or_path: Qwen/Qwen2.5-7B
+model_revision: main
+torch_dtype: bfloat16
+attn_implementation: flash_attention_2
+
+# Data training arguments
+chat_template: "{%- if messages[0]['role'] == 'system' %}\n{{- messages[0]['content'] }}\n{%- else %}\n{{- 'A conversation between User and Assistant. The user asks a question, and the Assistant solves it. The assistant first thinks about the reasoning process in the mind and then provides the user with the answer. The reasoning process and answer are enclosed within <think>...</think> and <answer>...</answer> tags, respectively, i.e., \\n<think>\\nreasoning process here\\n</think>\\n<answer>\\nanswer here\\n</answer>.' }}\n{%- endif %}\n{%- for message in messages %}\n    {%- if message['role'] == 'user' %}\n        {{- '\\n\\nUser: ' + message['content'].strip() }}\n    {%- elif message['role'] == 'system' %}\n        {{- message['content'] }}\n    {%- elif message['role'] == 'assistant' %}\n        {{- '\\n\\nAssistant: '  + message['content'] }}\n    {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n    {{- '\\n\\nAssistant: ' }}\n{%- endif %}"
+dataset_name: open-r1/Big-Math-RL-Verified-Processed
+dataset_config: level_4_5
+
+# GRPO trainer config
+callbacks:
+- push_to_hub_revision
+benchmarks:
+- math_500
+- aime24
+beta: 0.0
+bf16: true
+# Evaluation is disabled for this run
+do_eval: false
+eval_strategy: "no"
+use_vllm: true
+gradient_accumulation_steps: 16
+gradient_checkpointing: true
+gradient_checkpointing_kwargs:
+  use_reentrant: false
+hub_model_id: open-r1/R1-Zero-Qwen-7B-Math
+hub_model_revision: v04.00
+hub_strategy: every_save
+learning_rate: 1.0e-06
+log_completions: true
+log_level: info
+logging_first_step: true
+logging_steps: 1
+logging_strategy: steps
+lr_scheduler_type: constant_with_warmup
+mask_truncated_completions: true
+max_grad_norm: 0.2
+max_prompt_length: 1024
+max_completion_length: 8192
+max_steps: -1
+num_generations: 16
+num_iterations: 1
+num_train_epochs: 0.25 # 19.8k prompts
+output_dir: data/R1-Zero-Qwen-7B-Math-v04.00
+overwrite_output_dir: true
+per_device_train_batch_size: 4
+push_to_hub: true
+report_to:
+- wandb
+reward_funcs:
+- accuracy
+- format
+- soft_format
+reward_weights:
+- 1.0
+- 0.25
+- 0.25
+save_strategy: "steps"
+save_steps: 0.1
+save_total_limit: 1
+sync_ref_model: false
+seed: 42
+temperature: 1.0
+warmup_ratio: 0.1
--- a/recipes/OpenR1-Zero-7B-Math/grpo/config_v04.10.yaml
+++ b/recipes/OpenR1-Zero-7B-Math/grpo/config_v04.10.yaml
@ -0,0 +1,66 @@
+# Config for 1 + 1 nodes
+# Model arguments
+model_name_or_path: open-r1/R1-Zero-Qwen-7B-Math
+model_revision: v04.00-step-000000310
+torch_dtype: bfloat16
+attn_implementation: flash_attention_2
+
+# Data training arguments
+dataset_name: open-r1/Big-Math-RL-Verified-Processed
+dataset_config: level_5
+
+# GRPO trainer config
+callbacks:
+- push_to_hub_revision
+benchmarks:
+- math_500
+- aime24
+beta: 0.0
+bf16: true
+# Evaluation is disabled for this run
+do_eval: false
+eval_strategy: "no"
+use_vllm: true
+gradient_accumulation_steps: 16
+gradient_checkpointing: true
+gradient_checkpointing_kwargs:
+  use_reentrant: false
+hub_model_id: open-r1/R1-Zero-Qwen-7B-Math
+hub_model_revision: v04.10
+hub_strategy: every_save
+learning_rate: 1.0e-06
+log_completions: true
+log_level: info
+logging_first_step: true
+logging_steps: 1
+logging_strategy: steps
+lr_scheduler_type: constant_with_warmup
+mask_truncated_completions: true
+max_grad_norm: 0.2
+max_prompt_length: 1024
+max_completion_length: 8192
+max_steps: -1
+num_generations: 16
+num_iterations: 1
+num_train_epochs: 0.53 # 19.9k prompts
+output_dir: data/R1-Zero-Qwen-7B-Math-v04.10
+overwrite_output_dir: true
+per_device_train_batch_size: 4
+push_to_hub: true
+report_to:
+- wandb
+reward_funcs:
+- accuracy
+- format
+- soft_format
+reward_weights:
+- 1.0
+- 0.25
+- 0.25
+save_strategy: "steps"
+save_steps: 0.1
+save_total_limit: 1
+sync_ref_model: false
+seed: 42
+temperature: 1.0
+warmup_ratio: 0.1
--- a/recipes/OpenR1-Zero-7B-Math/grpo/config_v05.00.yaml
+++ b/recipes/OpenR1-Zero-7B-Math/grpo/config_v05.00.yaml
@ -0,0 +1,67 @@
+# Config for 1 + 1 nodes
+# Model arguments
+model_name_or_path: Qwen/Qwen2.5-7B
+model_revision: main
+torch_dtype: bfloat16
+attn_implementation: flash_attention_2
+
+# Data training arguments
+chat_template: "{%- if messages[0]['role'] == 'system' %}\n{{- messages[0]['content'] }}\n{%- else %}\n{{- 'A conversation between User and Assistant. The user asks a question, and the Assistant solves it. The assistant first thinks about the reasoning process in the mind and then provides the user with the answer. The reasoning process and answer are enclosed within <think>...</think> and <answer>...</answer> tags, respectively, i.e., \\n<think>\\nreasoning process here\\n</think>\\n<answer>\\nanswer here\\n</answer>.' }}\n{%- endif %}\n{%- for message in messages %}\n    {%- if message['role'] == 'user' %}\n        {{- '\\n\\nUser: ' + message['content'].strip() }}\n    {%- elif message['role'] == 'system' %}\n        {{- message['content'] }}\n    {%- elif message['role'] == 'assistant' %}\n        {{- '\\n\\nAssistant: '  + message['content'] }}\n    {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n    {{- '\\n\\nAssistant: ' }}\n{%- endif %}"
+dataset_name: open-r1/DAPO-Math-17k-Processed
+dataset_config: all
+
+# GRPO trainer config
+callbacks:
+- push_to_hub_revision
+benchmarks:
+- math_500
+- aime24
+beta: 0.0
+bf16: true
+# Evaluation is disabled for this run
+do_eval: false
+eval_strategy: "no"
+use_vllm: true
+gradient_accumulation_steps: 16
+gradient_checkpointing: true
+gradient_checkpointing_kwargs:
+  use_reentrant: false
+hub_model_id: open-r1/R1-Zero-Qwen-7B-Math
+hub_model_revision: v05.00
+hub_strategy: every_save
+learning_rate: 1.0e-06
+log_completions: true
+log_level: info
+logging_first_step: true
+logging_steps: 1
+logging_strategy: steps
+lr_scheduler_type: constant_with_warmup
+mask_truncated_completions: true
+max_grad_norm: 0.2
+max_prompt_length: 1024
+max_completion_length: 8192
+max_steps: -1
+num_generations: 16
+num_iterations: 1
+num_train_epochs: 1
+output_dir: data/R1-Zero-Qwen-7B-Math-v05.00
+overwrite_output_dir: true
+per_device_train_batch_size: 4
+push_to_hub: true
+report_to:
+- wandb
+reward_funcs:
+- accuracy
+- format
+- soft_format
+reward_weights:
+- 1.0
+- 0.25
+- 0.25
+save_strategy: "steps"
+save_steps: 0.1
+save_total_limit: 1
+sync_ref_model: false
+seed: 42
+temperature: 1.0
+warmup_ratio: 0.1
--- a/recipes/OpenR1-Zero-7B-Math/grpo/config_v06.00.yaml
+++ b/recipes/OpenR1-Zero-7B-Math/grpo/config_v06.00.yaml
@ -0,0 +1,66 @@
+# Config for 1 + 1 nodes
+# Model arguments
+model_name_or_path: Qwen/Qwen2.5-7B
+model_revision: main
+torch_dtype: bfloat16
+attn_implementation: flash_attention_2
+
+# Data training arguments
+dataset_name: open-r1/Big-Math-RL-Verified-Processed
+dataset_config: quintile_3
+
+# GRPO trainer config
+callbacks:
+- push_to_hub_revision
+benchmarks:
+- math_500
+- aime24
+beta: 0.0
+bf16: true
+# Evaluation is disabled for this run
+do_eval: false
+eval_strategy: "no"
+use_vllm: true
+gradient_accumulation_steps: 16
+gradient_checkpointing: true
+gradient_checkpointing_kwargs:
+  use_reentrant: false
+hub_model_id: open-r1/R1-Zero-Qwen-7B-Math
+hub_model_revision: v06.00
+hub_strategy: every_save
+learning_rate: 1.0e-06
+log_completions: true
+log_level: info
+logging_first_step: true
+logging_steps: 1
+logging_strategy: steps
+lr_scheduler_type: constant_with_warmup
+mask_truncated_completions: true
+max_grad_norm: 0.2
+max_prompt_length: 1024
+max_completion_length: 8192
+max_steps: -1
+num_generations: 16
+num_iterations: 1
+num_train_epochs: 0.897 # 20k prompts
+output_dir: data/R1-Zero-Qwen-7B-Math-v06.00
+overwrite_output_dir: true
+per_device_train_batch_size: 4
+push_to_hub: true
+report_to:
+- wandb
+reward_funcs:
+- accuracy
+- format
+- soft_format
+reward_weights:
+- 1.0
+- 0.5
+- 0.5
+save_strategy: "steps"
+save_steps: 0.1
+save_total_limit: 1
+sync_ref_model: false
+seed: 42
+temperature: 1.0
+warmup_ratio: 0.1
--- a/recipes/OpenR1-Zero-7B-Math/grpo/config_v07.00.yaml
+++ b/recipes/OpenR1-Zero-7B-Math/grpo/config_v07.00.yaml
@ -0,0 +1,69 @@
+# Config for 1 + 1 nodes
+# Model arguments
+model_name_or_path: Qwen/Qwen2.5-7B
+model_revision: main
+torch_dtype: bfloat16
+attn_implementation: flash_attention_2
+
+# Data training arguments
+chat_template: "{%- if messages[0]['role'] == 'system' %}\n{{- messages[0]['content'] }}\n{%- else %}\n{{- 'A conversation between User and Assistant. The user asks a question, and the Assistant solves it. The assistant first thinks about the reasoning process in the mind and then provides the user with the answer. The reasoning process and answer are enclosed within <think>...</think> and <answer>...</answer> tags, respectively, i.e., \\n<think>\\nreasoning process here\\n</think>\\n<answer>\\nanswer here\\n</answer>.' }}\n{%- endif %}\n{%- for message in messages %}\n    {%- if message['role'] == 'user' %}\n        {{- '\\n\\nUser: ' + message['content'].strip() }}\n    {%- elif message['role'] == 'system' %}\n        {{- message['content'] }}\n    {%- elif message['role'] == 'assistant' %}\n        {{- '\\n\\nAssistant: '  + message['content'] }}\n    {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n    {{- '\\n\\nAssistant: ' }}\n{%- endif %}"
+dataset_name: open-r1/DAPO-Math-17k-Processed
+dataset_config: all
+
+# GRPO trainer config
+callbacks:
+- push_to_hub_revision
+benchmarks:
+- math_500
+- aime24
+beta: 0.0
+bf16: true
+# Evaluation is disabled for this run
+do_eval: false
+eval_strategy: "no"
+use_vllm: true
+gradient_accumulation_steps: 16
+generation_batch_size: 8192
+gradient_checkpointing: true
+gradient_checkpointing_kwargs:
+  use_reentrant: false
+hub_model_id: open-r1/R1-Zero-Qwen-7B-Math
+hub_model_revision: v07.00
+hub_strategy: every_save
+learning_rate: 1.0e-06
+log_completions: true
+log_level: info
+logging_first_step: true
+logging_steps: 1
+logging_strategy: steps
+lr_scheduler_type: constant_with_warmup
+mask_truncated_completions: true
+max_grad_norm: 0.2
+max_prompt_length: 1024
+max_completion_length: 8192
+max_steps: -1
+num_generations: 16
+num_iterations: 1
+num_train_epochs: 1
+output_dir: data/R1-Zero-Qwen-7B-Math-v07.00
+overwrite_output_dir: true
+per_device_train_batch_size: 4
+push_to_hub: true
+report_to:
+- wandb
+reward_funcs:
+- accuracy
+- format
+- soft_format
+reward_weights:
+- 1.0
+- 0.25
+- 0.25
+save_strategy: "steps"
+save_steps: 0.1
+save_total_limit: 1
+sync_ref_model: false
+seed: 42
+temperature: 1.0
+warmup_ratio: 0.1
+epsilon: 0.2
--- a/recipes/OpenR1-Zero-7B-Math/grpo/config_v08.00.yaml
+++ b/recipes/OpenR1-Zero-7B-Math/grpo/config_v08.00.yaml
@ -0,0 +1,69 @@
+# Config for 1 + 1 nodes
+# Model arguments
+model_name_or_path: Qwen/Qwen2.5-7B
+model_revision: main
+torch_dtype: bfloat16
+attn_implementation: flash_attention_2
+
+# Data training arguments
+chat_template: "{%- if messages[0]['role'] == 'system' %}\n{{- messages[0]['content'] }}\n{%- else %}\n{{- 'A conversation between User and Assistant. The user asks a question, and the Assistant solves it. The assistant first thinks about the reasoning process in the mind and then provides the user with the answer. The reasoning process and answer are enclosed within <think>...</think> and <answer>...</answer> tags, respectively, i.e., \\n<think>\\nreasoning process here\\n</think>\\n<answer>\\nanswer here\\n</answer>.' }}\n{%- endif %}\n{%- for message in messages %}\n    {%- if message['role'] == 'user' %}\n        {{- '\\n\\nUser: ' + message['content'].strip() }}\n    {%- elif message['role'] == 'system' %}\n        {{- message['content'] }}\n    {%- elif message['role'] == 'assistant' %}\n        {{- '\\n\\nAssistant: '  + message['content'] }}\n    {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n    {{- '\\n\\nAssistant: ' }}\n{%- endif %}"
+dataset_name: open-r1/DAPO-Math-17k-Processed
+dataset_config: all
+
+# GRPO trainer config
+callbacks:
+- push_to_hub_revision
+benchmarks:
+- math_500
+- aime24
+beta: 0.0
+bf16: true
+# Evaluation is disabled for this run
+do_eval: false
+eval_strategy: "no"
+use_vllm: true
+gradient_accumulation_steps: 16
+generation_batch_size: 512
+gradient_checkpointing: true
+gradient_checkpointing_kwargs:
+  use_reentrant: false
+hub_model_id: open-r1/R1-Zero-Qwen-7B-Math
+hub_model_revision: v08.00
+hub_strategy: every_save
+learning_rate: 1.0e-06
+log_completions: true
+log_level: info
+logging_first_step: true
+logging_steps: 1
+logging_strategy: steps
+lr_scheduler_type: constant_with_warmup
+mask_truncated_completions: true
+max_grad_norm: 0.2
+max_prompt_length: 1024
+max_completion_length: 8192
+max_steps: -1
+num_generations: 16
+num_iterations: 1
+num_train_epochs: 1
+output_dir: data/R1-Zero-Qwen-7B-Math-v08.00
+overwrite_output_dir: true
+per_device_train_batch_size: 4
+push_to_hub: true
+report_to:
+- wandb
+reward_funcs:
+- accuracy
+- format
+- soft_format
+reward_weights:
+- 1.0
+- 0.25
+- 0.25
+save_strategy: "steps"
+save_steps: 0.1
+save_total_limit: 1
+sync_ref_model: false
+seed: 42
+temperature: 1.0
+warmup_ratio: 0.1
+epsilon: 0.2
--- a/src/open_r1/configs.py
+++ b/src/open_r1/configs.py
@ -143,6 +143,7 @@ class GRPOConfig(trl.GRPOConfig):
    hub_model_revision: Optional[str] = field(
        default="main", metadata={"help": "The Hub model branch to push the model to."}
    )
+    num_completions_to_print: int = field(default=0, metadata={"help": "Number of completions to print."})
    overwrite_hub_revision: bool = field(default=False, metadata={"help": "Whether to overwrite the Hub revision."})
    push_to_hub_revision: bool = field(default=False, metadata={"help": "Whether to push to a Hub revision/branch."})
    wandb_entity: Optional[str] = field(
@ -157,6 +158,12 @@ class GRPOConfig(trl.GRPOConfig):
        default=None,
        metadata={"help": ("The group to store runs under.")},
    )
+    wandb_log_unique_prompts: bool = field(
+        default=True,
+        metadata={
+            "help": ("Whether to log only unique prompts to wandb. If False, all prompts are logged.")
+        },
+    )


@dataclass
--- a/src/open_r1/rewards.py
+++ b/src/open_r1/rewards.py
@ -82,7 +82,7 @@ def accuracy_reward(completions: list[list[dict[str, str]]], solution: list[str]
    return rewards


-def format_reward(completions, **kwargs):
+def format_reward(completions, **kwargs) -> list[float]:
    """Reward function that checks if the reasoning process is enclosed within <think> and </think> tags, while the final answer is enclosed within <answer> and </answer> tags."""
    pattern = r"^<think>\n.*?\n</think>\n<answer>\n.*?\n</answer>$"
    completion_contents = [completion[0]["content"] for completion in completions]
@ -90,6 +90,34 @@ def format_reward(completions, **kwargs):
    return [1.0 if match else 0.0 for match in matches]


+def soft_format_reward(completions, **kwargs) -> list[float]:
+    """Reward completions with one think block followed by one answer block.
+
+    The reward is 1.0 only if each of the tags `<think>`, `</think>`,
+    `<answer>` and `</answer>` occurs exactly once and they appear in that
+    order, so stray tags and an answer block that precedes or is nested in the
+    think block all score 0.0. Text before, between or after the two blocks is
+    not constrained.
+
+    Args:
+        completions: One list of message dicts per completion; only the
+            `content` of the first message is inspected.
+
+    Returns:
+        One reward (1.0 or 0.0) per completion, in input order.
+    """
+    tags = ("<think>", "</think>", "<answer>", "</answer>")
+    rewards = []
+    for completion in completions:
+        content = completion[0]["content"]
+        # A stray or missing tag shows up as a count other than one.
+        exactly_once = all(content.count(tag) == 1 for tag in tags)
+        # With one occurrence each, sorted positions mean the tags are in order.
+        positions = [content.find(tag) for tag in tags]
+        rewards.append(1.0 if exactly_once and positions == sorted(positions) else 0.0)
+    return rewards
+
+
 def tag_count_reward(completions, **kwargs) -> list[float]:
    """Reward function that checks if we produce the desired number of think and answer tags associated with `format_reward()`.

@ -507,6 +535,21 @@ def binary_code_reward(

    return output

+def weighted_binary_code_reward(
+    completions,
+    num_parallel: int = 2,
+    e2b_router_url=None,
+    binary_threshold: float = 0.99,
+    non_binary_weight: float = 0.1,
+    **kwargs,
+) -> list[float]:
+    """Combine a thresholded (binary) code reward with a down-weighted copy of the raw code reward.
+
+    Args:
+        completions: Forwarded unchanged to `code_reward()`.
+        num_parallel: Forwarded to `code_reward()`.
+        e2b_router_url: Forwarded to `code_reward()`.
+        binary_threshold: A raw reward strictly above this value earns the binary part of 1.0.
+        non_binary_weight: Multiplier applied to the raw reward before it is added to the binary part.
+        **kwargs: Forwarded to `code_reward()`.
+
+    Returns:
+        One entry per raw reward: `None` where `code_reward()` returned `None`, otherwise
+        `(1.0 if raw > binary_threshold else 0.0) + non_binary_weight * raw`.
+    """
+    rewards = code_reward(completions, num_parallel=num_parallel, e2b_router_url=e2b_router_url, **kwargs)
+
+    output = []
+    for reward in rewards:
+        if reward is None:
+            # Propagate None unchanged instead of scoring it as 0.
+            output.append(None)
+        else:
+            binary_reward = 1.0 if reward > binary_threshold else 0.0
+            output.append(binary_reward + non_binary_weight * reward)
+
+    return output

 def code_reward(
    completions,
@ -647,6 +690,7 @@ def get_reward_funcs(script_args) -> list[Callable]:
    REWARD_FUNCS_REGISTRY = {
        "accuracy": accuracy_reward,
        "format": format_reward,
+        "soft_format": soft_format_reward,
        "reasoning_steps": reasoning_steps_reward,
        "cosine": get_cosine_scaled_reward(
            min_value_wrong=script_args.cosine_min_value_wrong,
@ -678,6 +722,14 @@ def get_reward_funcs(script_args) -> list[Callable]:
            ),
            binary_code_reward,
        ),
+        "weighted_binary_code_reward": update_wrapper(
+            partial(
+                weighted_binary_code_reward,
+                num_parallel=script_args.parallel_code_exec_per_proc,
+                e2b_router_url=script_args.e2b_router_url,
+            ),
+            weighted_binary_code_reward,
+        ),
        "ioi_code": update_wrapper(
            partial(
                ioi_code_reward,
--- a/tests/test_rewards.py
+++ b/tests/test_rewards.py
@ -27,6 +27,7 @@ from open_r1.rewards import (
    get_soft_overlong_punishment,
    len_reward,
    reasoning_steps_reward,
+    soft_format_reward,
    tag_count_reward,
 )

@ -75,28 +76,7 @@ class TestGetRewardFuncs(unittest.TestCase):
            self.assertEqual(func_name, func.__name__)


-class TestRewards(unittest.TestCase):
-    def test_accuracy_reward_correct_answer(self):
-        """Test accuracy_reward with a correct answer."""
-        completion = [[{"content": r"\boxed{\frac{63}{400}}"}]]
-        solution = [r"\frac{63}{400}"]
-        rewards = accuracy_reward(completion, solution)
-        self.assertEqual(rewards[0], 1.0)
-
-    def test_accuracy_reward_wrong_answer(self):
-        """Test accuracy_reward with an incorrect answer."""
-        completion = [[{"content": r"\boxed{\frac{64}{400}}"}]]
-        solution = [r"\frac{63}{400}"]
-        rewards = accuracy_reward(completion, solution)
-        self.assertEqual(rewards[0], 0.0)
-
-    def test_accuracy_reward_wrong_answer_no_latex(self):
-        """Test accuracy_reward with an incorrect answer and gold solution with no latex."""
-        completion = [[{"content": r"\boxed{3}"}]]
-        solution = ["6"]
-        rewards = accuracy_reward(completion, solution)
-        self.assertEqual(rewards[0], 0.0)
-
+class TestFormatRewards(unittest.TestCase):
    def test_format_reward_correct(self):
        """Test format_reward with correct format."""
        completion = [[{"content": "<think>\nSome reasoning\n</think>\n<answer>\nThe answer\n</answer>"}]]
@ -118,6 +98,60 @@ class TestRewards(unittest.TestCase):
            rewards = format_reward(completion)
            self.assertEqual(rewards[0], 0.0)

+
+class TestSoftFormatReward(unittest.TestCase):
+    """Checks `soft_format_reward()` on accepted and rejected completion layouts."""
+
+    @staticmethod
+    def _score(text):
+        """Return the reward for a single completion whose content is `text`."""
+        return soft_format_reward([[{"content": text}]])[0]
+
+    def test_correct_with_newlines(self):
+        """Leading text plus newline-separated blocks is accepted."""
+        text = "Here is my reasoning: <think>\nSome reasoning\n</think>\n<answer>\nThe answer\n</answer>"
+        self.assertEqual(self._score(text), 1.0)
+
+    def test_correct_without_newlines(self):
+        """Blocks written back to back without any whitespace are accepted."""
+        text = "Here is my reasoning: <think>Some reasoning</think><answer>The answer</answer>"
+        self.assertEqual(self._score(text), 1.0)
+
+    def test_correct_with_extra_spaces(self):
+        """Spaces inside and between the blocks are accepted."""
+        text = "Here is my reasoning: <think> Some reasoning </think> <answer> The answer </answer>"
+        self.assertEqual(self._score(text), 1.0)
+
+    def test_correct_with_strict_format(self):
+        """The layout used in the `format_reward()` tests is accepted as well."""
+        text = "<think>\nSome reasoning\n</think>\n<answer>\nThe answer\n</answer>"
+        self.assertEqual(self._score(text), 1.0)
+
+    def test_incorrect_with_multiple_reasoning_block(self):
+        """A second think/answer pair makes the completion invalid."""
+        text = (
+            "Here is my reasoning: <think> Some reasoning </think> <answer> The answer </answer> New rambling <think> Some reasoning </think> <answer> The answer </answer>"
+        )
+        self.assertEqual(self._score(text), 0.0)
+
+    def test_incorrect_with_answer_before_think(self):
+        """The answer block must not precede the think block."""
+        text = "<answer>The answer</answer><think>Some reasoning</think>"
+        self.assertEqual(self._score(text), 0.0)
+
+    def test_incorrect_missing_think_block(self):
+        """An answer block alone earns no reward."""
+        text = "Here is my reasoning: <answer>The answer</answer>"
+        self.assertEqual(self._score(text), 0.0)
+
+    def test_incorrect_missing_answer_block(self):
+        """A think block alone earns no reward."""
+        text = "Here is my reasoning: <think>Some reasoning</think>"
+        self.assertEqual(self._score(text), 0.0)
+
+
+class TestReasoningStepsReward(unittest.TestCase):
    def test_reasoning_steps_reward(self):
        """Test reasoning_steps_reward with various formats."""
        test_cases = [
@ -136,6 +170,29 @@ class TestRewards(unittest.TestCase):
            rewards = reasoning_steps_reward(completion)
            self.assertAlmostEqual(rewards[0], expected_reward)

+
+class TestRewards(unittest.TestCase):
+    def test_accuracy_reward_correct_answer(self):
+        """Test accuracy_reward with a correct answer."""
+        completion = [[{"content": r"\boxed{\frac{63}{400}}"}]]
+        solution = [r"\frac{63}{400}"]
+        rewards = accuracy_reward(completion, solution)
+        self.assertEqual(rewards[0], 1.0)
+
+    def test_accuracy_reward_wrong_answer(self):
+        """Test accuracy_reward with an incorrect answer."""
+        completion = [[{"content": r"\boxed{\frac{64}{400}}"}]]
+        solution = [r"\frac{63}{400}"]
+        rewards = accuracy_reward(completion, solution)
+        self.assertEqual(rewards[0], 0.0)
+
+    def test_accuracy_reward_wrong_answer_no_latex(self):
+        """Test accuracy_reward with an incorrect answer and gold solution with no latex."""
+        completion = [[{"content": r"\boxed{3}"}]]
+        solution = ["6"]
+        rewards = accuracy_reward(completion, solution)
+        self.assertEqual(rewards[0], 0.0)
+
    def test_multiple_completions(self):
        """Test handling multiple completions at once."""
        completions = [
Author	SHA1	Message	Date
lewtun	a401d64a70	Merge branch 'main' into r1-zero	2025-05-25 12:04:07 +02:00
edbeeching	0ed9ea370b	Merge branch 'main' into r1-zero	2025-05-10 08:01:02 +00:00
edbeeching	b43069370d	Merge branch 'main' into r1-zero	2025-05-09 20:09:15 +00:00
Lewis Tunstall	464d9511f2	Merge branch 'main' into r1-zero	2025-05-08 16:16:29 +02:00
edbeeching	d9a6c086f1	add latest configs	2025-05-07 08:00:23 +00:00
edbeeching	a82c1fd00a	adds weighted code reward	2025-05-07 07:59:24 +00:00
edbeeching	8d993d54e9	add gen batch exp config	2025-05-05 08:12:23 +00:00
Lewis Tunstall	2c0cac5da2	Merge branch 'main' into r1-zero	2025-04-26 12:03:15 +02:00
Lewis Tunstall	46c16569a1	Pin transformers	2025-04-26 06:39:02 +00:00
Lewis Tunstall	5f0b8f80c5	Remove hf-transfer in favour of hf-xet	2025-04-24 06:56:30 +00:00
Lewis Tunstall	f27c7327e0	Fix	2025-04-23 18:12:25 +00:00
Lewis Tunstall	2f4b0daba9	Revert slurm	2025-04-23 18:06:51 +00:00
Lewis Tunstall	cebaad59f8	Wait	2025-04-23 09:34:13 +00:00
Lewis Tunstall	2715d3174a	Hack	2025-04-23 08:51:49 +00:00
Lewis Tunstall	c24ffd78db	Fix attempt on Slurm	2025-04-23 08:09:05 +00:00
Lewis Tunstall	0df1654ef9	Tune recipe	2025-04-23 07:20:33 +00:00
Lewis Tunstall	be72ce6513	Fix sharding in Slurm	2025-04-23 07:20:27 +00:00
Lewis Tunstall	2f0b983cc9	Add 32B recipe	2025-04-22 09:42:33 +00:00
Lewis Tunstall	06bdd50334	Merge branch 'main' into r1-zero	2025-04-17 19:56:35 +02:00
Lewis Tunstall	f3920f8890	Pin TRL	2025-04-17 11:41:37 +00:00
lewtun	a5f3baafb6	Merge branch 'main' into r1-zero	2025-04-17 11:12:01 +02:00
Lewis Tunstall	b6a73c07f3	Merge branch 'main' into r1-zero	2025-04-16 14:44:25 +02:00
Lewis Tunstall	3c312f883c	Add hack for lighteval	2025-04-14 07:00:35 +00:00
Lewis Tunstall	8500f41c69	Parse GAS	2025-04-12 18:16:14 +00:00
Lewis Tunstall	c1d2352772	Add q3	2025-04-11 18:39:51 +00:00
Lewis Tunstall	2d74588a9a	Merge branch 'main' into r1-zero	2025-04-11 20:18:42 +02:00
Lewis Tunstall	7a8dead7ad	Fix	2025-04-11 14:30:09 +00:00
Lewis Tunstall	b29e672c18	Add level configs and DAPO	2025-04-10 15:21:31 +00:00
Lewis Tunstall	9bed487d67	Add v01	2025-04-09 05:46:57 +00:00
Lewis Tunstall	939c74c446	Fix liger	2025-04-09 05:12:09 +00:00
Lewis Tunstall	f62e42af5c	Pin TRL for overlong masking	2025-04-08 19:03:43 +00:00
lewtun	23b7b69f79	Merge branch 'main' into r1-zero	2025-04-08 20:53:51 +02:00
Lewis Tunstall	0f98a5ac74	Fix soft reward to be really soft	2025-04-07 12:58:25 +00:00
Lewis Tunstall	10a555b4bb	Add soft format reward	2025-04-07 12:11:36 +00:00
Lewis Tunstall	1d7d66a095	Merge branch 'main' into r1-zero	2025-04-04 13:44:29 +02:00
Lewis Tunstall	995beb88fc	Clean up	2025-04-04 07:12:14 +00:00
Lewis Tunstall	f1832c5cb6	Pin TRL	2025-04-03 07:37:13 +00:00
Lewis Tunstall	d51de45bb2	Use proper dataset	2025-04-02 13:11:10 +00:00
Lewis Tunstall	2897519b8e	Revert config	2025-04-01 18:59:35 +00:00
Lewis Tunstall	82a1167186	Log unique only	2025-04-01 18:59:35 +00:00
Lewis Tunstall	f22657b7ed	Set defaults	2025-04-01 18:59:35 +00:00
Lewis Tunstall	5fe41f0839	Pin trl	2025-04-01 18:59:35 +00:00
lewtun	8f26046af4	Merge branch 'main' into r1-zero	2025-04-01 12:06:33 +02:00
Lewis Tunstall	d9c8cd8557	Use None for unferified	2025-04-01 08:37:54 +00:00
Lewis Tunstall	1078b73d51	Fix order of inputs	2025-04-01 07:39:58 +00:00
Lewis Tunstall	5747cfc095	Return None for invalid samples	2025-03-31 17:59:55 +00:00
Lewis Tunstall	1d6c0bbc4b	Fix accuracy rewards	2025-03-31 15:31:04 +00:00
Lewis Tunstall	b35213cd64	Add medium, hard, ultra hard recipes	2025-03-31 15:30:44 +00:00
Lewis Tunstall	9e0e47806c	Add new difficulty levels	2025-03-29 19:18:22 +00:00
Lewis Tunstall	8a4af61829	Fix chat template	2025-03-29 16:33:34 +00:00
Lewis Tunstall	b5e6f9c384	Add R1 Zero 7B	2025-03-29 12:05:54 +00:00