mirror of
https://github.com/huggingface/open-r1.git
synced 2026-06-24 01:54:06 +00:00
Compare commits
24 commits
main
...
zero-math-
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
3bcc4fc86e | ||
|
|
43375fa7b9 | ||
|
|
97b1c22e55 | ||
|
|
cada407cd6 | ||
|
|
b369e428f8 | ||
|
|
f6a07648e2 | ||
|
|
898406d85f | ||
|
|
b6b1643c2d | ||
|
|
82fb385fa5 | ||
|
|
296aa66e1e | ||
|
|
9f6abc8ed1 | ||
|
|
bc06504df5 | ||
|
|
9862bfec41 | ||
|
|
1f56bab96c | ||
|
|
965d451d61 | ||
|
|
31eacc4b9a | ||
|
|
0b933a2aa4 | ||
|
|
cf765df201 | ||
|
|
da0e9ae28d | ||
|
|
7f777c0583 | ||
|
|
b575444fe8 | ||
|
|
6c7c102755 | ||
|
|
5374bc2bef |
||
|
|
3258282733 |
9 changed files with 499 additions and 9 deletions
|
|
@ -477,8 +477,9 @@ lighteval vllm $MODEL_ARGS "extended|lcb:codegeneration|0|0" \
|
|||
To increase throughput across multiple GPUs, use _data parallel_ as follows:
|
||||
|
||||
```shell
|
||||
export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||
NUM_GPUS=8
|
||||
MODEL=deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
|
||||
MODEL=deepseek-ai/DeepSeek-R1-Distill-Llama-8B
|
||||
MODEL_ARGS="model_name=$MODEL,dtype=bfloat16,data_parallel_size=$NUM_GPUS,max_model_length=32768,gpu_memory_utilization=0.8,generation_parameters={max_new_tokens:32768,temperature:0.6,top_p:0.95}"
|
||||
TASK=aime24
|
||||
OUTPUT_DIR=data/evals/$MODEL
|
||||
|
|
|
|||
76
recipes/R1-Zero-Qwen-Math-7B-Code/grpo/config_v00.00.yaml
Normal file
76
recipes/R1-Zero-Qwen-Math-7B-Code/grpo/config_v00.00.yaml
Normal file
|
|
@ -0,0 +1,76 @@
|
|||
# Config for 1 + 1 nodes
|
||||
# Model arguments
|
||||
model_name_or_path: open-r1/Qwen2.5-Math-7B-RoPE-300k
|
||||
model_revision: main
|
||||
torch_dtype: bfloat16
|
||||
attn_implementation: flash_attention_2
|
||||
|
||||
# Data training arguments
|
||||
chat_template: "{%- if messages[0]['role'] == 'system' %}\n{{- messages[0]['content'] }}\n{%- else %}\n{{- 'Below is a conversation between a user and an AI assistant. The user asks a question, and the assistant solves it. The assistant first thinks about the reasoning process in the mind and then provides the user with the answer. The reasoning process and answer are structured into two main sections: Thought and Answer using the specified format: <think> Thought section </think> Answer section.' }}\n{%- endif %}\n{%- for message in messages %}\n {%- if message['role'] == 'user' %}\n {{- '\\n\\nUser: ' + message['content'].strip() }}\n {%- elif message['role'] == 'system' %}\n {{- message['content'] }}\n {%- elif message['role'] == 'assistant' %}\n {{- '\\n\\nAssistant: ' + message['content'] }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '\\n\\nAssistant: ' }}\n{%- endif %}"
|
||||
dataset_name: open-r1/codeforces
|
||||
dataset_prompt_column: prompt
|
||||
dataset_config: verifiable-prompts
|
||||
dataset_test_split: test
|
||||
dataset_train_split: train
|
||||
|
||||
# GRPO trainer config
|
||||
callbacks:
|
||||
- push_to_hub_revision
|
||||
benchmarks:
|
||||
- aime24
|
||||
- gpqa
|
||||
- lcb_v4
|
||||
beta: 0.0
|
||||
loss_type: dr_grpo
|
||||
scale_rewards: false
|
||||
bf16: true
|
||||
do_eval: false
|
||||
eval_strategy: "no"
|
||||
use_vllm: true
|
||||
do_eval: false
|
||||
gradient_accumulation_steps: 32
|
||||
gradient_checkpointing: true
|
||||
gradient_checkpointing_kwargs:
|
||||
use_reentrant: false
|
||||
hub_model_id: open-r1/R1-Zero-Qwen-Math-7B-Code
|
||||
hub_model_revision: v00.00
|
||||
hub_strategy: every_save
|
||||
learning_rate: 1.0e-06
|
||||
log_completions: true
|
||||
log_level: info
|
||||
logging_first_step: true
|
||||
logging_steps: 1
|
||||
logging_strategy: steps
|
||||
lr_scheduler_type: constant_with_warmup
|
||||
mask_truncated_completions: true
|
||||
max_grad_norm: 0.2
|
||||
max_prompt_length: 2048
|
||||
max_completion_length: 8192
|
||||
max_steps: -1
|
||||
num_generations: 16
|
||||
num_iterations: 1
|
||||
num_train_epochs: 4 # About 1k optimization steps
|
||||
output_dir: data/R1-Zero-Qwen-Math-7B-Code-v00.00
|
||||
overwrite_output_dir: true
|
||||
per_device_train_batch_size: 4
|
||||
push_to_hub: true
|
||||
report_to:
|
||||
- wandb
|
||||
reward_funcs:
|
||||
- think_code_format
|
||||
- cf_code
|
||||
reward_weights:
|
||||
- 1.0
|
||||
- 1.0
|
||||
save_strategy: "steps"
|
||||
save_steps: 0.2
|
||||
save_total_limit: 1
|
||||
seed: 42
|
||||
temperature: 1.0
|
||||
top_k: null
|
||||
use_liger_kernel: true
|
||||
warmup_ratio: 0.0
|
||||
# for each generation, evaluate this many test cases in parallel, then check if any of them failed (0 score): if so, stop evaluating;
|
||||
# otherwise continue with the next batch of test cases. Useful to avoid overloading the eval server + save time on wrong solutions
|
||||
code_eval_test_batch_size: 30 # -1
|
||||
code_eval_scoring_mode: weighted_sum
|
||||
67
recipes/R1-Zero-Qwen-Math-7B-Math/grpo/config_v00.00.yaml
Normal file
67
recipes/R1-Zero-Qwen-Math-7B-Math/grpo/config_v00.00.yaml
Normal file
|
|
@ -0,0 +1,67 @@
|
|||
# Config for 1 + 1 nodes
|
||||
# Model arguments
|
||||
model_name_or_path: open-r1/Qwen2.5-Math-7B-RoPE-300k
|
||||
model_revision: main
|
||||
torch_dtype: bfloat16
|
||||
attn_implementation: flash_attention_2
|
||||
|
||||
# Data training arguments
|
||||
chat_template: "{%- if messages[0]['role'] == 'system' %}\n{{- messages[0]['content'] }}\n{%- else %}\n{{- 'Below is a conversation between a user and an AI assistant. The user asks a question, and the assistant solves it. The assistant first thinks about the reasoning process in the mind and then provides the user with the answer. The reasoning process and answer are structured into two main sections: Thought and Answer using the specified format: <think> Thought section </think> Answer section.' }}\n{%- endif %}\n{%- for message in messages %}\n {%- if message['role'] == 'user' %}\n {{- '\\n\\nUser: ' + message['content'].strip() }}\n {%- elif message['role'] == 'system' %}\n {{- message['content'] }}\n {%- elif message['role'] == 'assistant' %}\n {{- '\\n\\nAssistant: ' + message['content'] }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '\\n\\nAssistant: ' }}\n{%- endif %}"
|
||||
dataset_name: open-r1/DAPO-Math-17k-Processed
|
||||
dataset_config: all
|
||||
|
||||
# GRPO trainer config
|
||||
callbacks:
|
||||
- push_to_hub_revision
|
||||
benchmarks:
|
||||
- aime24
|
||||
- gpqa
|
||||
- lcb_v4
|
||||
beta: 0.0
|
||||
bf16: true
|
||||
do_eval: false
|
||||
eval_strategy: "no"
|
||||
use_vllm: true
|
||||
do_eval: false
|
||||
gradient_accumulation_steps: 16
|
||||
gradient_checkpointing: true
|
||||
gradient_checkpointing_kwargs:
|
||||
use_reentrant: false
|
||||
hub_model_id: open-r1/R1-Zero-Qwen-Math-7B-Math
|
||||
hub_model_revision: v00.00
|
||||
hub_strategy: every_save
|
||||
learning_rate: 1.0e-06
|
||||
log_completions: true
|
||||
log_level: info
|
||||
logging_first_step: true
|
||||
logging_steps: 1
|
||||
logging_strategy: steps
|
||||
lr_scheduler_type: constant_with_warmup
|
||||
mask_truncated_completions: true
|
||||
max_grad_norm: 0.2
|
||||
max_prompt_length: 1024
|
||||
max_completion_length: 8192
|
||||
max_steps: -1
|
||||
num_generations: 16
|
||||
num_iterations: 1
|
||||
num_train_epochs: 1
|
||||
output_dir: data/R1-Zero-Qwen-Math-7B-Math-v00.00
|
||||
overwrite_output_dir: true
|
||||
per_device_train_batch_size: 4
|
||||
push_to_hub: true
|
||||
report_to:
|
||||
- wandb
|
||||
reward_funcs:
|
||||
- think_format
|
||||
- think_accuracy
|
||||
reward_weights:
|
||||
- 1.0
|
||||
- 1.0
|
||||
save_strategy: "steps"
|
||||
save_steps: 0.2
|
||||
save_total_limit: 1
|
||||
seed: 42
|
||||
temperature: 1.0
|
||||
top_k: null
|
||||
use_liger_kernel: true
|
||||
warmup_ratio: 0.0
|
||||
67
recipes/R1-Zero-Qwen-Math-7B-Math/grpo/config_v01.00.yaml
Normal file
67
recipes/R1-Zero-Qwen-Math-7B-Math/grpo/config_v01.00.yaml
Normal file
|
|
@ -0,0 +1,67 @@
|
|||
# Config for 1 + 1 nodes
|
||||
# Model arguments
|
||||
model_name_or_path: open-r1/Qwen2.5-Math-7B-RoPE-300k
|
||||
model_revision: main
|
||||
torch_dtype: bfloat16
|
||||
attn_implementation: flash_attention_2
|
||||
|
||||
# Data training arguments
|
||||
chat_template: "{%- if messages[0]['role'] == 'system' %}\n{{- messages[0]['content'] }}\n{%- else %}\n{{- 'Below is a conversation between a user and an AI assistant. The user asks a question, and the assistant thoroughly explores it through a systematic thinking process before providing the final precise and accurate solution. The assistant structures its response into two main sections: Thought and Solution using the specified format: <think> Thought section </think> Solution section. In the Thought section, the assistant details its reasoning process in steps. Each step includes detailed considerations such as analysing questions, summarizing relevant findings, brainstorming new ideas, verifying the accuracy of the current steps, refining any errors, and revisiting previous steps. In the Solution section, based on various attempts, explorations, and reflections from the Thought section, the assistant systematically presents the final solution that it deems correct.' }}\n{%- endif %}\n{%- for message in messages %}\n {%- if message['role'] == 'user' %}\n {{- '\\n\\nUser: ' + message['content'].strip() }}\n {%- elif message['role'] == 'system' %}\n {{- message['content'] }}\n {%- elif message['role'] == 'assistant' %}\n {{- '\\n\\nAssistant: ' + message['content'] }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '\\n\\nAssistant: ' }}\n{%- endif %}"
|
||||
dataset_name: open-r1/DAPO-Math-17k-Processed
|
||||
dataset_config: all
|
||||
|
||||
# GRPO trainer config
|
||||
callbacks:
|
||||
- push_to_hub_revision
|
||||
benchmarks:
|
||||
- aime24
|
||||
- gpqa
|
||||
- lcb_v4
|
||||
beta: 0.0
|
||||
bf16: true
|
||||
do_eval: false
|
||||
eval_strategy: "no"
|
||||
use_vllm: true
|
||||
do_eval: false
|
||||
gradient_accumulation_steps: 16
|
||||
gradient_checkpointing: true
|
||||
gradient_checkpointing_kwargs:
|
||||
use_reentrant: false
|
||||
hub_model_id: open-r1/R1-Zero-Qwen-Math-7B-Math
|
||||
hub_model_revision: v01.00
|
||||
hub_strategy: every_save
|
||||
learning_rate: 1.0e-06
|
||||
log_completions: true
|
||||
log_level: info
|
||||
logging_first_step: true
|
||||
logging_steps: 1
|
||||
logging_strategy: steps
|
||||
lr_scheduler_type: constant_with_warmup
|
||||
mask_truncated_completions: true
|
||||
max_grad_norm: 0.2
|
||||
max_prompt_length: 1024
|
||||
max_completion_length: 8192
|
||||
max_steps: -1
|
||||
num_generations: 16
|
||||
num_iterations: 1
|
||||
num_train_epochs: 1
|
||||
output_dir: data/R1-Zero-Qwen-Math-7B-Math-v01.00
|
||||
overwrite_output_dir: true
|
||||
per_device_train_batch_size: 4
|
||||
push_to_hub: true
|
||||
report_to:
|
||||
- wandb
|
||||
reward_funcs:
|
||||
- think_format
|
||||
- think_accuracy
|
||||
reward_weights:
|
||||
- 1.0
|
||||
- 1.0
|
||||
save_strategy: "steps"
|
||||
save_steps: 0.2
|
||||
save_total_limit: 1
|
||||
seed: 42
|
||||
temperature: 1.0
|
||||
top_k: null
|
||||
use_liger_kernel: true
|
||||
warmup_ratio: 0.0
|
||||
|
|
@ -3,6 +3,7 @@
|
|||
We have built a [piston](https://github.com/engineer-man/piston) package to run IOI problems.
|
||||
|
||||
To launch a fleet of piston workers on a slurm cluster, you can adapt the paths in `launch_piston_workers.sh` and `launch_single_piston.sh` and run:
|
||||
|
||||
```bash
|
||||
slurm/piston/launch_piston_workers.sh (number of workers to launch)
|
||||
```
|
||||
|
|
@ -10,7 +11,9 @@ slurm/piston/launch_piston_workers.sh (number of workers to launch)
|
|||
This command will launch a slurm job for each worker, which will be called `piston-worker-<port>`, where `<port>` is the port where the worker will be listening.
|
||||
|
||||
## First time setup
|
||||
|
||||
You will need to install the [IOI package](https://github.com/guipenedo/piston/tree/master/packages/cms_ioi/1.0.0) in the workers.
|
||||
|
||||
1. Launch a single worker:
|
||||
```bash
|
||||
slurm/piston/launch_piston_workers.sh 1
|
||||
|
|
@ -25,7 +28,7 @@ curl -X POST http://ip-10-53-86-146:1234/api/v2/packages -H "Content-Type: appli
|
|||
|
||||
For CodeForces:
|
||||
```bash
|
||||
curl -X POST http://ip-10-53-86-146:1234/api/v2/packages -H "Content-Type: application/json" -d '{"language": "codeforces", "version": "1.0.0"}'
|
||||
curl -X POST http://ip-10-53-93-192:7836/api/v2/packages -H "Content-Type: application/json" -d '{"language": "codeforces", "version": "1.0.0"}'
|
||||
```
|
||||
|
||||
3. You can now launch more workers and due to the shared mounted packages directory, they should already have the package installed.
|
||||
|
|
|
|||
|
|
@ -1,7 +1,7 @@
|
|||
#!/bin/bash
|
||||
#SBATCH --job-name=piston_worker
|
||||
#SBATCH --output=/fsx/open-r1/logs/piston/worker-logs/%x-%j.out
|
||||
#SBATCH --error=/fsx/open-r1/logs/piston/worker-logs/%x-%j.out # Redirect error logs to .out
|
||||
#SBATCH --output=./logs/%x-%j.out
|
||||
#SBATCH --error=./logs/%x-%j.out # Redirect error logs to .out
|
||||
#SBATCH --cpus-per-task=2
|
||||
#SBATCH --mem-per-cpu=1950M
|
||||
#SBATCH --partition=hopper-cpu
|
||||
|
|
@ -14,7 +14,11 @@ sleep $(( RANDOM % 20 ))
|
|||
# we use 63b5654156a89c5a2ad281aface21416615d62ec056d88efe8fcd307ce73575a as the latest image requires isolate, which does not work on the HF science cluster (cgroups incompatibility)
|
||||
# feel free to try with the latest image
|
||||
# the code you see below increases the very constrained piston default limits, and sets the repo url to the one hosting our IOI package
|
||||
srun --container-mounts=/fsx/guilherme/ioi2024/piston_files/packages:/piston/packages --container-image "ghcr.io#engineer-man/piston:sha256:63b5654156a89c5a2ad281aface21416615d62ec056d88efe8fcd307ce73575a" \
|
||||
|
||||
# Create the packages directory on the host if it doesn't exist
|
||||
mkdir -p ./data/piston/packages
|
||||
|
||||
srun --container-mounts=./data/piston/packages:/piston/packages:rw --container-image "ghcr.io#engineer-man/piston:sha256:63b5654156a89c5a2ad281aface21416615d62ec056d88efe8fcd307ce73575a" \
|
||||
bash -c "
|
||||
export PISTON_COMPILE_TIMEOUT=60000
|
||||
export PISTON_RUN_TIMEOUT=60000
|
||||
|
|
@ -23,6 +27,10 @@ srun --container-mounts=/fsx/guilherme/ioi2024/piston_files/packages:/piston/pac
|
|||
export PISTON_DISABLE_NETWORKING=true
|
||||
export PISTON_REPO_URL=https://github.com/guipenedo/piston/releases/download/pkgs/index
|
||||
|
||||
# Ensure the packages directory exists and has correct permissions inside container
|
||||
mkdir -p /piston/packages
|
||||
chmod 755 /piston/packages
|
||||
|
||||
sed -i '/app.use(body_parser.urlencoded/c\ app.use(body_parser.urlencoded({ extended: true, limit: \"512mb\" }));' src/index.js
|
||||
sed -i '/app.use(body_parser.json/c\ app.use(body_parser.json({ limit: \"512mb\" }));' src/index.js
|
||||
|
||||
|
|
|
|||
|
|
@ -27,6 +27,8 @@ fi
|
|||
module load cuda/12.4
|
||||
set -x -e
|
||||
|
||||
export PISTON_ENDPOINTS=slurm
|
||||
|
||||
source ~/.bashrc
|
||||
source openr1/bin/activate
|
||||
START_TIME=$(date +%s)
|
||||
|
|
@ -137,11 +139,11 @@ fi
|
|||
|
||||
# force crashing on nccl issues like hanging broadcast
|
||||
export NCCL_ASYNC_ERROR_HANDLING=1
|
||||
# export NCCL_DEBUG=INFO
|
||||
# export NCCL_DEBUG_SUBSYS=COLL
|
||||
export NCCL_DEBUG=INFO
|
||||
export NCCL_DEBUG_SUBSYS=COLL
|
||||
export CUDA_LAUNCH_BLOCKING=1
|
||||
# export NCCL_SOCKET_NTHREADS=1
|
||||
# export NCCL_NSOCKS_PERTHREAD=1
|
||||
# export CUDA_LAUNCH_BLOCKING=1
|
||||
|
||||
export CMD=" \
|
||||
src/open_r1/$TASK.py --config $CONFIG_FILE $OPTIONAL_ARGS
|
||||
|
|
|
|||
|
|
@ -90,6 +90,125 @@ def format_reward(completions, **kwargs):
|
|||
return [1.0 if match else 0.0 for match in matches]
|
||||
|
||||
|
||||
# Copied from TRL: https://github.com/huggingface/trl/blob/9ac614fb081e17805f7f62ab3f5f7036bdefe7b0/trl/rewards/format_rewards.py#L18
|
||||
def think_format_reward(completions: list[list[dict[str, str]]], **kwargs) -> list[float]:
    r"""
    Reward function that checks if the reasoning process is enclosed within `"<think>"` and `"</think>"` tags. The
    function returns a reward of 1.0 if the format is correct, otherwise 0.0.

    This version ensures:
    1. The completion starts with optional whitespace followed by a <think> tag
    2. There is exactly one <think> tag and exactly one </think> tag in the completion
    3. No other characters appear before the <think> tag

    Args:
        completions (`list[list[dict[str, str]]]`):
            List of completions to be evaluated. Each completion must be a list of one message, i.e. a dictionary
            containing the key `"content"` with the value being the text of the completion.
        **kwargs:
            Additional keyword arguments. This function does not use them, but they are required in the function
            signature to ensure compatibility with trainers like [`GRPOTrainer`].

    Returns:
        `list[float]`:
            A list of rewards, where each reward is 1.0 if the completion matches the expected format, otherwise 0.0.

    Example:
    ```python
    >>> from open_r1.rewards import think_format_reward
    >>> completions = [
    ...     [{"content": "<think>\nThis is my reasoning.\n</think>\nThis is my answer."}],
    ...     [{"content": "\n<think>\nThis is my reasoning.\n</think>\nThis is my answer."}],
    ...     [{"content": " \n \n<think>\nThis is my reasoning.\n</think>\nThis is my answer."}],
    ...     [{"content": "Some text <think>\nThis is my reasoning.\n</think>\nThis is my answer."}],
    ...     [{"content": "<think>\nThis is my reasoning.\n</think> Extra </think>\nThis is my answer."}],
    ... ]
    >>> think_format_reward(completions)
    [1.0, 1.0, 1.0, 0.0, 0.0]
    ```
    """
    # Compiled once per call. `\s` already covers "\n", and `match` anchors at position 0, so no
    # DOTALL / MULTILINE flags are needed: the pattern contains no "." and must not match after
    # an inner newline.
    start_pattern = re.compile(r"^\s*<think>")

    rewards = []
    for completion in completions:
        content = completion[0]["content"]

        # Reward 1.0 only if the completion opens with (whitespace +) <think> and contains exactly
        # one opening tag and exactly one closing tag.
        well_formed = (
            start_pattern.match(content) is not None
            and content.count("<think>") == 1
            and content.count("</think>") == 1
        )
        rewards.append(1.0 if well_formed else 0.0)

    return rewards
|
||||
|
||||
|
||||
def think_accuracy_reward(
    completions: list[list[dict[str, str]]], solution: list[str], **kwargs
) -> list[Optional[float]]:
    """Score the text that follows the last ``</think>`` tag against the ground-truth solution.

    Args:
        completions: One single-message list per sample; the message's ``"content"`` is the generated text.
        solution: Ground-truth answers, paired positionally with ``completions``.
        **kwargs: Accepted for trainer compatibility; unused here.

    Returns:
        One entry per (completion, solution) pair: the result of ``verify`` cast to ``float``, or ``None``
        when the gold solution cannot be parsed or ``verify`` raises, so that the example is skipped.
    """
    closing_tag = "</think>"

    # Keep only what comes after the final </think>; a completion without the tag contributes "".
    answers = []
    for completion in completions:
        _, found_tag, tail = completion[0]["content"].rpartition(closing_tag)
        answers.append(tail.strip() if found_tag else "")

    rewards = []
    for answer_text, gold_text in zip(answers, solution):
        gold = parse(gold_text, extraction_mode="first_match")

        if len(gold) == 0:
            # The gold solution is not parseable: emit `None` so this example is skipped.
            print("Failed to parse gold solution: ", gold_text)
            rewards.append(None)
            continue

        # The answer must be given in correct latex (no malformed operators).
        latex_config = LatexExtractionConfig(
            normalization_config=NormalizationConfig(
                nits=False,
                malformed_operators=False,
                basic_latex=True,
                equations=True,
                boxed="all",
                units=True,
            ),
            # Ensures that boxed is tried first
            boxed_match_priority=0,
            try_extract_without_anchor=False,
        )
        predicted = parse(
            answer_text,
            extraction_config=[latex_config],
            extraction_mode="first_match",
        )

        # Binary reward when verifiable, `None` otherwise to skip this example.
        try:
            score = float(verify(gold, predicted))
        except Exception as e:
            print(f"verify failed: {e}, answer: {predicted}, gold: {gold}")
            score = None
        rewards.append(score)

    return rewards
|
||||
|
||||
|
||||
def tag_count_reward(completions, **kwargs) -> list[float]:
|
||||
"""Reward function that checks if we produce the desired number of think and answer tags associated with `format_reward()`.
|
||||
|
||||
|
|
@ -616,6 +735,59 @@ def get_code_format_reward(language: str = "python"):
|
|||
|
||||
return code_format_reward
|
||||
|
||||
def get_think_code_format_reward(language: str = "python"):
    """Format reward function that checks for proper think tags AND code format.

    This function combines the logic from think_format_reward (proper <think> tag placement)
    with code language checking after the closing </think> tag (no <answer> tags required).

    Args:
        language: Programming language supported by E2B https://e2b.dev/docs/code-interpreting/supported-languages

    Returns:
        A reward function ``think_code_format_reward(completions, **kwargs)`` returning one float per
        completion: 1.0 when the completion starts with (optional whitespace and) ``<think>``, contains
        exactly one ``<think>`` and one ``</think>``, and has a fenced code block tagged with the
        expected language after ``</think>``; 0.0 otherwise.
    """
    # `\s` already covers newlines and `match` anchors at position 0, so no regex flags are needed.
    start_pattern = re.compile(r"^\s*<think>")
    closing_tag = "</think>"

    def think_code_format_reward(completions, **kwargs):
        # if there is a language field, use it instead of the default language
        languages = kwargs["language"] if "language" in kwargs else [language] * len(completions)

        rewards = []
        for completion, sample_language in zip(completions, languages):
            content = completion[0]["content"]

            proper_think_format = (
                start_pattern.match(content) is not None
                and content.count("<think>") == 1
                and content.count(closing_tag) == 1
            )
            if not proper_think_format:
                rewards.append(0.0)
                continue

            # Exactly one closing tag is guaranteed at this point, so the tail is well defined.
            content_after_think = content.rpartition(closing_tag)[2]

            # Escape the language name: names such as "c++" contain regex metacharacters and would
            # otherwise raise `re.error` or match the wrong fence (e.g. "```cpp").
            code_pattern = rf"```{re.escape(str(sample_language))}.*?```"
            code_match = re.search(code_pattern, content_after_think, re.DOTALL)

            rewards.append(1.0 if code_match else 0.0)

        return rewards

    return think_code_format_reward
|
||||
|
||||
|
||||
def get_soft_overlong_punishment(max_completion_len, soft_punish_cache):
|
||||
"""
|
||||
|
|
@ -647,6 +819,8 @@ def get_reward_funcs(script_args) -> list[Callable]:
|
|||
REWARD_FUNCS_REGISTRY = {
|
||||
"accuracy": accuracy_reward,
|
||||
"format": format_reward,
|
||||
"think_format": think_format_reward,
|
||||
"think_accuracy": think_accuracy_reward,
|
||||
"reasoning_steps": reasoning_steps_reward,
|
||||
"cosine": get_cosine_scaled_reward(
|
||||
min_value_wrong=script_args.cosine_min_value_wrong,
|
||||
|
|
@ -695,6 +869,7 @@ def get_reward_funcs(script_args) -> list[Callable]:
|
|||
cf_code_reward,
|
||||
),
|
||||
"code_format": get_code_format_reward(language=script_args.code_language),
|
||||
"think_code_format": get_think_code_format_reward(language=script_args.code_language),
|
||||
"tag_count": tag_count_reward,
|
||||
"soft_overlong_punishment": get_soft_overlong_punishment(
|
||||
max_completion_len=script_args.max_completion_len,
|
||||
|
|
|
|||
|
|
@ -28,7 +28,10 @@ from open_r1.rewards import (
|
|||
len_reward,
|
||||
reasoning_steps_reward,
|
||||
tag_count_reward,
|
||||
think_accuracy_reward,
|
||||
think_format_reward,
|
||||
)
|
||||
from parameterized import parameterized
|
||||
|
||||
|
||||
load_dotenv()
|
||||
|
|
@ -40,7 +43,8 @@ class TestGetRewardFuncs(unittest.TestCase):
|
|||
reward_names = [
|
||||
"accuracy",
|
||||
"format",
|
||||
"reasoning_steps",
|
||||
"think_format",
|
||||
"think_accruacyreasoning_steps",
|
||||
"cosine",
|
||||
"repetition_penalty",
|
||||
"length",
|
||||
|
|
@ -53,6 +57,8 @@ class TestGetRewardFuncs(unittest.TestCase):
|
|||
reward_func_names = [
|
||||
"accuracy_reward",
|
||||
"format_reward",
|
||||
"think_format_reward",
|
||||
"think_accuracy_reward",
|
||||
"reasoning_steps_reward",
|
||||
"cosine_scaled_reward",
|
||||
"repetition_penalty_reward",
|
||||
|
|
@ -564,5 +570,90 @@ class TestCodeFormat(unittest.TestCase):
|
|||
self.assertEqual(rewards[0], 1.0)
|
||||
|
||||
|
||||
class TestThinkFormatReward(unittest.TestCase):
    """Checks `think_format_reward` on well-formed and malformed `<think>` layouts."""

    @parameterized.expand(
        [
            ("<think>\n\nThought\n\n</think>\n\nSolution",),
            ("<think>\nThought\n</think>\nSolution",),
            ("<think>Thought</think>Solution",),
            ("<think> Thought </think> Solution",),
            ("\n<think> Thought </think> Solution",),
            ("<think> Thought </think> Solution",),
            ("\n\n<think> Thought </think> Solution",),
            (" <think> Thought </think> Solution",),
            ("\n <think> Thought </think> Solution",),
        ]
    )
    def test_correct_think_format(self, format_string):
        """Test think_format_reward with correct think format."""
        completion = [[{"content": format_string}]]
        rewards = think_format_reward(completion)
        self.assertEqual(rewards[0], 1.0)

    @parameterized.expand(
        [
            ("Preamble <think> Thought </think> Solution",),
            ("No tags at all",),
            ("<think> Missing closing thought",),
            ("<think> Thought 1 </think> <think> Thought 2 </think> Solution section",),
            # Trailing commas keep these two cases as 1-tuples, like every other case.
            (" <think> Thought 1 </think> Extra think tag </think> Solution",),
            (" <think> Thought 1 </think> Extra opening think tag <think> Solution",),
        ]
    )
    def test_incorrect_think_format(self, format_string):
        """Test think_format_reward with incorrect think format."""
        completion = [[{"content": format_string}]]
        rewards = think_format_reward(completion)
        self.assertEqual(rewards[0], 0.0)
|
||||
|
||||
|
||||
class TestThinkAccuracyReward(unittest.TestCase):
    """Checks that `think_accuracy_reward` only grades the text after the last `</think>` tag."""

    def _reward_for(self, text):
        # Every case uses the same gold answer and a single completion; return its reward.
        scores = think_accuracy_reward([[{"content": text}]], solution=["\\boxed{42}"])
        return scores[0]

    def test_correct_answer_after_think(self):
        reward = self._reward_for("<think> Thought </think> The answer is \\boxed{42}")
        self.assertEqual(reward, 1.0)

    def test_correct_answer_after_multiple_think(self):
        reward = self._reward_for(
            "<think> Thought 1 </think> <think> Thought 2 </think> The answer is \\boxed{42}"
        )
        self.assertEqual(reward, 1.0)

    def test_incorrect_answer_after_think(self):
        reward = self._reward_for("<think> Thought </think> The answer is \\boxed{43}")
        self.assertEqual(reward, 0.0)

    def test_multiple_answers_gives_zero_reward(self):
        reward = self._reward_for("<think> Thought </think> The answer is \\boxed{6} and \\boxed{42}")
        self.assertEqual(reward, 0.0)

    def test_no_latex_answer_gives_zero_reward(self):
        reward = self._reward_for("<think> Thought </think> The answer is 42")
        self.assertEqual(reward, 0.0)

    def test_truncated_answer_gives_zero_reward(self):
        reward = self._reward_for(
            "<think> Thought the answer is \\boxed{42} </think> The answer is [TRUNCATED]"
        )
        self.assertEqual(reward, 0.0)

    def test_truncated_answer_after_multiple_think_gives_zero_reward(self):
        reward = self._reward_for(
            "<think> Thought 1 the answer is \\boxed{42} </think> <think> Thought 2 the answer is \\boxed{42} </think> The answer is [TRUNCATED]"
        )
        self.assertEqual(reward, 0.0)

    def test_truncated_thought_gives_zero_reward(self):
        reward = self._reward_for("<think> Thought the answer is \\boxed{42} and [TRUNCATED]")
        self.assertEqual(reward, 0.0)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue