Compare commits

...

24 commits

Author SHA1 Message Date
Lewis Tunstall
3bcc4fc86e Add codeforces 2025-05-28 19:21:15 +00:00
Lewis Tunstall
43375fa7b9 Merge branch 'main' into zero-math-code 2025-05-28 13:52:20 +02:00
Lewis Tunstall
97b1c22e55 Merge branch 'bump-deps-0' into zero-math-code 2025-05-28 10:11:06 +02:00
Lewis Tunstall
cada407cd6 Merge branch 'main' into zero-math-code 2025-05-28 09:24:12 +02:00
Lewis Tunstall
b369e428f8 Merge branch 'main' into zero-math-code 2025-05-28 09:22:22 +02:00
Lewis Tunstall
f6a07648e2 Bump vLLM and TRL 2025-05-28 06:48:01 +00:00
Lewis Tunstall
898406d85f Fix DP=2 for evals 2025-05-27 21:20:52 +00:00
Lewis Tunstall
b6b1643c2d Fix benchmarks! 2025-05-27 20:44:35 +00:00
Lewis Tunstall
82fb385fa5 Refine tests 2025-05-27 13:39:00 +00:00
Lewis Tunstall
296aa66e1e Tweak format reward 2025-05-27 08:16:49 +00:00
Lewis Tunstall
9f6abc8ed1 Relax format reward 2025-05-26 11:15:56 +00:00
Lewis Tunstall
bc06504df5 Add better baseline defaults 2025-05-26 09:06:09 +00:00
Lewis Tunstall
9862bfec41 Relax reward 2025-05-26 08:09:03 +00:00
Lewis Tunstall
1f56bab96c Tune baseline 2025-05-25 17:22:06 +00:00
Lewis Tunstall
965d451d61 Restore baseline 2025-05-25 17:00:33 +00:00
Lewis Tunstall
31eacc4b9a Use GAS instead of generation 2025-05-25 16:57:33 +00:00
Lewis Tunstall
0b933a2aa4 Restore gas 2025-05-25 16:54:18 +00:00
Lewis Tunstall
cf765df201 Tune baseline 2025-05-25 13:21:01 +00:00
Lewis Tunstall
da0e9ae28d Add overlong punishment 2025-05-25 12:46:45 +00:00
Lewis Tunstall
7f777c0583 Add new DAPO recipe 2025-05-25 12:40:32 +00:00
Lewis Tunstall
b575444fe8 Add think format and accuracy rewards 2025-05-25 12:24:43 +00:00
Lewis Tunstall
6c7c102755 Merge remote-tracking branch 'origin/bump-deps-0' into zero-math-code 2025-05-25 14:05:42 +02:00
lewtun
5374bc2bef
Merge branch 'main' into bump-deps-0 2025-05-25 12:02:52 +02:00
Lewis Tunstall
3258282733 Bump deps 2025-05-25 09:59:57 +00:00
9 changed files with 499 additions and 9 deletions

View file

@ -477,8 +477,9 @@ lighteval vllm $MODEL_ARGS "extended|lcb:codegeneration|0|0" \
To increase throughput across multiple GPUs, use _data parallel_ as follows:
```shell
export VLLM_WORKER_MULTIPROC_METHOD=spawn
NUM_GPUS=8
MODEL=deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
MODEL=deepseek-ai/DeepSeek-R1-Distill-Llama-8B
MODEL_ARGS="model_name=$MODEL,dtype=bfloat16,data_parallel_size=$NUM_GPUS,max_model_length=32768,gpu_memory_utilization=0.8,generation_parameters={max_new_tokens:32768,temperature:0.6,top_p:0.95}"
TASK=aime24
OUTPUT_DIR=data/evals/$MODEL

View file

@ -0,0 +1,76 @@
# Config for 1 + 1 nodes
# Model arguments
model_name_or_path: open-r1/Qwen2.5-Math-7B-RoPE-300k
model_revision: main
torch_dtype: bfloat16
attn_implementation: flash_attention_2
# Data training arguments
chat_template: "{%- if messages[0]['role'] == 'system' %}\n{{- messages[0]['content'] }}\n{%- else %}\n{{- 'Below is a conversation between a user and an AI assistant. The user asks a question, and the assistant solves it. The assistant first thinks about the reasoning process in the mind and then provides the user with the answer. The reasoning process and answer are structured into two main sections: Thought and Answer using the specified format: <think> Thought section </think> Answer section.' }}\n{%- endif %}\n{%- for message in messages %}\n {%- if message['role'] == 'user' %}\n {{- '\\n\\nUser: ' + message['content'].strip() }}\n {%- elif message['role'] == 'system' %}\n {{- message['content'] }}\n {%- elif message['role'] == 'assistant' %}\n {{- '\\n\\nAssistant: ' + message['content'] }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '\\n\\nAssistant: ' }}\n{%- endif %}"
dataset_name: open-r1/codeforces
dataset_prompt_column: prompt
dataset_config: verifiable-prompts
dataset_test_split: test
dataset_train_split: train
# GRPO trainer config
callbacks:
- push_to_hub_revision
benchmarks:
- aime24
- gpqa
- lcb_v4
beta: 0.0
loss_type: dr_grpo
scale_rewards: false
bf16: true
# Evaluation during training is disabled (declared once; duplicate mapping keys are invalid YAML).
do_eval: false
eval_strategy: "no"
use_vllm: true
gradient_accumulation_steps: 32
gradient_checkpointing: true
# `use_reentrant` must be nested so it is passed as a checkpointing kwarg, not read as a top-level option.
gradient_checkpointing_kwargs:
  use_reentrant: false
hub_model_id: open-r1/R1-Zero-Qwen-Math-7B-Code
hub_model_revision: v00.00
hub_strategy: every_save
learning_rate: 1.0e-06
log_completions: true
log_level: info
logging_first_step: true
logging_steps: 1
logging_strategy: steps
lr_scheduler_type: constant_with_warmup
mask_truncated_completions: true
max_grad_norm: 0.2
max_prompt_length: 2048
max_completion_length: 8192
max_steps: -1
num_generations: 16
num_iterations: 1
num_train_epochs: 4 # About 1k optimization steps
output_dir: data/R1-Zero-Qwen-Math-7B-Code-v00.00
overwrite_output_dir: true
per_device_train_batch_size: 4
push_to_hub: true
report_to:
- wandb
reward_funcs:
- think_code_format
- cf_code
reward_weights:
- 1.0
- 1.0
save_strategy: "steps"
save_steps: 0.2
save_total_limit: 1
seed: 42
temperature: 1.0
top_k: null
use_liger_kernel: true
warmup_ratio: 0.0
# For each generation, evaluate this many test cases in parallel, then check whether any of them failed (score of 0): if so, stop evaluating;
# otherwise continue with the next batch of test cases. Useful to avoid overloading the eval server and to save time on wrong solutions.
code_eval_test_batch_size: 30 # -1
code_eval_scoring_mode: weighted_sum

View file

@ -0,0 +1,67 @@
# Config for 1 + 1 nodes
# Model arguments
model_name_or_path: open-r1/Qwen2.5-Math-7B-RoPE-300k
model_revision: main
torch_dtype: bfloat16
attn_implementation: flash_attention_2
# Data training arguments
chat_template: "{%- if messages[0]['role'] == 'system' %}\n{{- messages[0]['content'] }}\n{%- else %}\n{{- 'Below is a conversation between a user and an AI assistant. The user asks a question, and the assistant solves it. The assistant first thinks about the reasoning process in the mind and then provides the user with the answer. The reasoning process and answer are structured into two main sections: Thought and Answer using the specified format: <think> Thought section </think> Answer section.' }}\n{%- endif %}\n{%- for message in messages %}\n {%- if message['role'] == 'user' %}\n {{- '\\n\\nUser: ' + message['content'].strip() }}\n {%- elif message['role'] == 'system' %}\n {{- message['content'] }}\n {%- elif message['role'] == 'assistant' %}\n {{- '\\n\\nAssistant: ' + message['content'] }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '\\n\\nAssistant: ' }}\n{%- endif %}"
dataset_name: open-r1/DAPO-Math-17k-Processed
dataset_config: all
# GRPO trainer config
callbacks:
- push_to_hub_revision
benchmarks:
- aime24
- gpqa
- lcb_v4
beta: 0.0
bf16: true
# Evaluation during training is disabled (declared once; duplicate mapping keys are invalid YAML).
do_eval: false
eval_strategy: "no"
use_vllm: true
gradient_accumulation_steps: 16
gradient_checkpointing: true
# `use_reentrant` must be nested so it is passed as a checkpointing kwarg, not read as a top-level option.
gradient_checkpointing_kwargs:
  use_reentrant: false
hub_model_id: open-r1/R1-Zero-Qwen-Math-7B-Math
hub_model_revision: v01.00
hub_strategy: every_save
learning_rate: 1.0e-06
log_completions: true
log_level: info
logging_first_step: true
logging_steps: 1
logging_strategy: steps
lr_scheduler_type: constant_with_warmup
mask_truncated_completions: true
max_grad_norm: 0.2
max_prompt_length: 1024
max_completion_length: 8192
max_steps: -1
num_generations: 16
num_iterations: 1
num_train_epochs: 1
# Suffix matches hub_model_revision (v01.00) so this run does not overwrite the v00.00 recipe's output directory.
output_dir: data/R1-Zero-Qwen-Math-7B-Math-v01.00
overwrite_output_dir: true
per_device_train_batch_size: 4
push_to_hub: true
report_to:
- wandb
reward_funcs:
- think_format
- think_accuracy
reward_weights:
- 1.0
- 1.0
save_strategy: "steps"
save_steps: 0.2
save_total_limit: 1
seed: 42
temperature: 1.0
top_k: null
use_liger_kernel: true
warmup_ratio: 0.0

View file

@ -0,0 +1,67 @@
# Config for 1 + 1 nodes
# Model arguments
model_name_or_path: open-r1/Qwen2.5-Math-7B-RoPE-300k
model_revision: main
torch_dtype: bfloat16
attn_implementation: flash_attention_2
# Data training arguments
chat_template: "{%- if messages[0]['role'] == 'system' %}\n{{- messages[0]['content'] }}\n{%- else %}\n{{- 'Below is a conversation between a user and an AI assistant. The user asks a question, and the assistant thoroughly explores it through a systematic thinking process before providing the final precise and accurate solution. The assistant structures its response into two main sections: Thought and Solution using the specified format: <think> Thought section </think> Solution section. In the Thought section, the assistant details its reasoning process in steps. Each step includes detailed considerations such as analysing questions, summarizing relevant findings, brainstorming new ideas, verifying the accuracy of the current steps, refining any errors, and revisiting previous steps. In the Solution section, based on various attempts, explorations, and reflections from the Thought section, the assistant systematically presents the final solution that it deems correct.' }}\n{%- endif %}\n{%- for message in messages %}\n {%- if message['role'] == 'user' %}\n {{- '\\n\\nUser: ' + message['content'].strip() }}\n {%- elif message['role'] == 'system' %}\n {{- message['content'] }}\n {%- elif message['role'] == 'assistant' %}\n {{- '\\n\\nAssistant: ' + message['content'] }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '\\n\\nAssistant: ' }}\n{%- endif %}"
dataset_name: open-r1/DAPO-Math-17k-Processed
dataset_config: all
# GRPO trainer config
callbacks:
- push_to_hub_revision
benchmarks:
- aime24
- gpqa
- lcb_v4
beta: 0.0
bf16: true
# Evaluation during training is disabled (declared once; duplicate mapping keys are invalid YAML).
do_eval: false
eval_strategy: "no"
use_vllm: true
gradient_accumulation_steps: 16
gradient_checkpointing: true
# `use_reentrant` must be nested so it is passed as a checkpointing kwarg, not read as a top-level option.
gradient_checkpointing_kwargs:
  use_reentrant: false
hub_model_id: open-r1/R1-Zero-Qwen-Math-7B-Math
hub_model_revision: v00.00
hub_strategy: every_save
learning_rate: 1.0e-06
log_completions: true
log_level: info
logging_first_step: true
logging_steps: 1
logging_strategy: steps
lr_scheduler_type: constant_with_warmup
mask_truncated_completions: true
max_grad_norm: 0.2
max_prompt_length: 1024
max_completion_length: 8192
max_steps: -1
num_generations: 16
num_iterations: 1
num_train_epochs: 1
output_dir: data/R1-Zero-Qwen-Math-7B-Math-v00.00
overwrite_output_dir: true
per_device_train_batch_size: 4
push_to_hub: true
report_to:
- wandb
reward_funcs:
- think_format
- think_accuracy
reward_weights:
- 1.0
- 1.0
save_strategy: "steps"
save_steps: 0.2
save_total_limit: 1
seed: 42
temperature: 1.0
top_k: null
use_liger_kernel: true
warmup_ratio: 0.0

View file

@ -3,6 +3,7 @@
We have built a [piston](https://github.com/engineer-man/piston) package to run IOI problems.
To launch a fleet of piston workers on a slurm cluster, you can adapt the paths in `launch_piston_workers.sh` and `launch_single_piston.sh` and run:
```bash
slurm/piston/launch_piston_workers.sh (number of workers to launch)
```
@ -10,7 +11,9 @@ slurm/piston/launch_piston_workers.sh (number of workers to launch)
This command will launch a slurm job for each worker, which will be called `piston-worker-<port>`, where `<port>` is the port where the worker will be listening.
## First time setup
You will need to install the [IOI package](https://github.com/guipenedo/piston/tree/master/packages/cms_ioi/1.0.0) in the workers.
1. Launch a single worker:
```bash
slurm/piston/launch_piston_workers.sh 1
@ -25,7 +28,7 @@ curl -X POST http://ip-10-53-86-146:1234/api/v2/packages -H "Content-Type: appli
For CodeForces:
```bash
curl -X POST http://ip-10-53-86-146:1234/api/v2/packages -H "Content-Type: application/json" -d '{"language": "codeforces", "version": "1.0.0"}'
curl -X POST http://ip-10-53-93-192:7836/api/v2/packages -H "Content-Type: application/json" -d '{"language": "codeforces", "version": "1.0.0"}'
```
3. You can now launch more workers; because the packages directory is a shared mount, they should already have the package installed.

View file

@ -1,7 +1,7 @@
#!/bin/bash
#SBATCH --job-name=piston_worker
#SBATCH --output=/fsx/open-r1/logs/piston/worker-logs/%x-%j.out
#SBATCH --error=/fsx/open-r1/logs/piston/worker-logs/%x-%j.out # Redirect error logs to .out
#SBATCH --output=./logs/%x-%j.out
#SBATCH --error=./logs/%x-%j.out # Redirect error logs to .out
#SBATCH --cpus-per-task=2
#SBATCH --mem-per-cpu=1950M
#SBATCH --partition=hopper-cpu
@ -14,7 +14,11 @@ sleep $(( RANDOM % 20 ))
# we use 63b5654156a89c5a2ad281aface21416615d62ec056d88efe8fcd307ce73575a as the latest image requires isolate, which does not work on the HF science cluster (cgroups incompatibility)
# feel free to try the latest image
# the code you see below increases the very constrained piston default limits, and sets the repo url to the one hosting our IOI package
srun --container-mounts=/fsx/guilherme/ioi2024/piston_files/packages:/piston/packages --container-image "ghcr.io#engineer-man/piston:sha256:63b5654156a89c5a2ad281aface21416615d62ec056d88efe8fcd307ce73575a" \
# Create the packages directory on the host if it doesn't exist
mkdir -p ./data/piston/packages
srun --container-mounts=./data/piston/packages:/piston/packages:rw --container-image "ghcr.io#engineer-man/piston:sha256:63b5654156a89c5a2ad281aface21416615d62ec056d88efe8fcd307ce73575a" \
bash -c "
export PISTON_COMPILE_TIMEOUT=60000
export PISTON_RUN_TIMEOUT=60000
@ -23,6 +27,10 @@ srun --container-mounts=/fsx/guilherme/ioi2024/piston_files/packages:/piston/pac
export PISTON_DISABLE_NETWORKING=true
export PISTON_REPO_URL=https://github.com/guipenedo/piston/releases/download/pkgs/index
# Ensure the packages directory exists and has correct permissions inside container
mkdir -p /piston/packages
chmod 755 /piston/packages
sed -i '/app.use(body_parser.urlencoded/c\ app.use(body_parser.urlencoded({ extended: true, limit: \"512mb\" }));' src/index.js
sed -i '/app.use(body_parser.json/c\ app.use(body_parser.json({ limit: \"512mb\" }));' src/index.js

View file

@ -27,6 +27,8 @@ fi
module load cuda/12.4
set -x -e
export PISTON_ENDPOINTS=slurm
source ~/.bashrc
source openr1/bin/activate
START_TIME=$(date +%s)
@ -137,11 +139,11 @@ fi
# force crashing on nccl issues like hanging broadcast
export NCCL_ASYNC_ERROR_HANDLING=1
# export NCCL_DEBUG=INFO
# export NCCL_DEBUG_SUBSYS=COLL
export NCCL_DEBUG=INFO
export NCCL_DEBUG_SUBSYS=COLL
export CUDA_LAUNCH_BLOCKING=1
# export NCCL_SOCKET_NTHREADS=1
# export NCCL_NSOCKS_PERTHREAD=1
# export CUDA_LAUNCH_BLOCKING=1
export CMD=" \
src/open_r1/$TASK.py --config $CONFIG_FILE $OPTIONAL_ARGS

View file

@ -90,6 +90,125 @@ def format_reward(completions, **kwargs):
return [1.0 if match else 0.0 for match in matches]
# Copied from TRL: https://github.com/huggingface/trl/blob/9ac614fb081e17805f7f62ab3f5f7036bdefe7b0/trl/rewards/format_rewards.py#L18
def think_format_reward(completions: list[list[dict[str, str]]], **kwargs) -> list[float]:
    r"""
    Reward function that checks if the reasoning process is enclosed within `"<think>"` and `"</think>"` tags. The
    function returns a reward of 1.0 if the format is correct, otherwise 0.0.

    A completion is considered well-formed when:
    1. It starts with optional whitespace followed by a `<think>` tag (no other characters may precede the tag).
    2. It contains exactly one `<think>` tag and exactly one `</think>` tag.

    Args:
        completions (`list[list[dict[str, str]]]`):
            List of completions to be evaluated. Each completion must be a list of one message, i.e. a dictionary
            containing the key `"content"` with the value being the text of the completion.
        **kwargs:
            Additional keyword arguments. This function does not use them, but they are required in the function
            signature to ensure compatibility with trainers like [`GRPOTrainer`].

    Returns:
        `list[float]`:
            A list of rewards, where each reward is 1.0 if the completion matches the expected format, otherwise 0.0.

    Example:
    ```python
    >>> from open_r1.rewards import think_format_reward
    >>> completions = [
    ...     [{"content": "<think>\nThis is my reasoning.\n</think>\nThis is my answer."}],
    ...     [{"content": "\n<think>\nThis is my reasoning.\n</think>\nThis is my answer."}],
    ...     [{"content": " \n \n<think>\nThis is my reasoning.\n</think>\nThis is my answer."}],
    ...     [{"content": "Some text <think>\nThis is my reasoning.\n</think>\nThis is my answer."}],
    ...     [{"content": "<think>\nThis is my reasoning.\n</think> Extra </think>\nThis is my answer."}],
    ... ]
    >>> think_format_reward(completions)
    [1.0, 1.0, 1.0, 0.0, 0.0]
    ```
    """
    # `match` is anchored at the start of the string, so no `^` or regex flags are needed:
    # only whitespace (which already includes newlines) may precede the opening tag.
    starts_with_think = re.compile(r"\s*<think>").match

    rewards = []
    for completion in completions:
        content = completion[0]["content"]
        well_formed = (
            starts_with_think(content) is not None
            # "</think>" does not contain "<think>", so the two counts are independent.
            and content.count("<think>") == 1
            and content.count("</think>") == 1
        )
        rewards.append(1.0 if well_formed else 0.0)
    return rewards
def think_accuracy_reward(
    completions: list[list[dict[str, str]]], solution: list[str], **kwargs
) -> list[Optional[float]]:
    """Score the text following the last `</think>` tag against the ground-truth solution.

    Args:
        completions: One single-message list per sample; the message's `"content"` holds the completion text.
        solution: Ground-truth answers, paired with `completions` by position.
        **kwargs: Ignored; accepted for compatibility with the trainer's reward-function call signature.

    Returns:
        One entry per (completion, solution) pair: the float value of `verify(...)` on the parsed gold and
        parsed answer, or `None` when the gold solution cannot be parsed or verification raises, so that the
        example is skipped.
    """
    # Keep only what comes after the final closing tag; without a closing tag there is no answer section.
    answers = []
    for completion in completions:
        _, closing_tag, tail = completion[0]["content"].rpartition("</think>")
        answers.append(tail.strip() if closing_tag else "")

    rewards = []
    for answer, sol in zip(answers, solution):
        gold_parsed = parse(sol, extraction_mode="first_match")
        if len(gold_parsed) == 0:
            # An unparseable gold solution cannot be scored, so the example is skipped.
            print("Failed to parse gold solution: ", sol)
            rewards.append(None)
            continue

        # The answer must be given in correct LaTeX (no malformed operators); boxed expressions are tried first.
        latex_config = LatexExtractionConfig(
            normalization_config=NormalizationConfig(
                nits=False,
                malformed_operators=False,
                basic_latex=True,
                equations=True,
                boxed="all",
                units=True,
            ),
            boxed_match_priority=0,
            try_extract_without_anchor=False,
        )
        answer_parsed = parse(
            answer,
            extraction_config=[latex_config],
            extraction_mode="first_match",
        )

        # Binary reward when verifiable, `None` otherwise to skip this example.
        try:
            reward = float(verify(gold_parsed, answer_parsed))
        except Exception as e:
            print(f"verify failed: {e}, answer: {answer_parsed}, gold: {gold_parsed}")
            reward = None
        rewards.append(reward)

    return rewards
def tag_count_reward(completions, **kwargs) -> list[float]:
"""Reward function that checks if we produce the desired number of think and answer tags associated with `format_reward()`.
@ -616,6 +735,59 @@ def get_code_format_reward(language: str = "python"):
return code_format_reward
def get_think_code_format_reward(language: str = "python"):
    """Format reward function that checks for proper think tags AND code format.

    This function combines the logic from think_format_reward (proper <think> tag placement)
    with code language checking after the closing </think> tag (no <answer> tags required).

    Args:
        language: Programming language supported by E2B https://e2b.dev/docs/code-interpreting/supported-languages.
            Used as the default fence language when the call does not receive a `language` keyword argument.

    Returns:
        A reward function `think_code_format_reward(completions, **kwargs) -> list[float]` that gives 1.0 to a
        completion which (a) starts with optional whitespace followed by `<think>`, (b) contains exactly one
        `<think>` and one `</think>` tag, and (c) has a fenced code block tagged with the sample's language
        after the closing tag; every other completion gets 0.0.
    """
    # `match` is anchored at the start of the string, so only whitespace may precede the opening tag.
    starts_with_think = re.compile(r"\s*<think>").match

    def think_code_format_reward(completions, **kwargs):
        # if there is a language field, use it instead of the default language
        languages = kwargs.get("language", [language] * len(completions))

        rewards = []
        for completion, sample_language in zip(completions, languages):
            content = completion[0]["content"]

            proper_think_format = (
                starts_with_think(content) is not None
                and content.count("<think>") == 1
                and content.count("</think>") == 1
            )
            if not proper_think_format:
                rewards.append(0.0)
                continue

            # Exactly one closing tag exists at this point, so the tail is everything after it.
            _, _, content_after_think = content.rpartition("</think>")

            # Escape the language name: names such as "c++" contain regex metacharacters and would otherwise
            # be interpreted as quantifiers (raising an error or matching the wrong fence).
            code_pattern = rf"```{re.escape(str(sample_language))}.*?```"
            code_match = re.search(code_pattern, content_after_think, re.DOTALL)
            rewards.append(1.0 if code_match else 0.0)
        return rewards

    return think_code_format_reward
def get_soft_overlong_punishment(max_completion_len, soft_punish_cache):
"""
@ -647,6 +819,8 @@ def get_reward_funcs(script_args) -> list[Callable]:
REWARD_FUNCS_REGISTRY = {
"accuracy": accuracy_reward,
"format": format_reward,
"think_format": think_format_reward,
"think_accuracy": think_accuracy_reward,
"reasoning_steps": reasoning_steps_reward,
"cosine": get_cosine_scaled_reward(
min_value_wrong=script_args.cosine_min_value_wrong,
@ -695,6 +869,7 @@ def get_reward_funcs(script_args) -> list[Callable]:
cf_code_reward,
),
"code_format": get_code_format_reward(language=script_args.code_language),
"think_code_format": get_think_code_format_reward(language=script_args.code_language),
"tag_count": tag_count_reward,
"soft_overlong_punishment": get_soft_overlong_punishment(
max_completion_len=script_args.max_completion_len,

View file

@ -28,7 +28,10 @@ from open_r1.rewards import (
len_reward,
reasoning_steps_reward,
tag_count_reward,
think_accuracy_reward,
think_format_reward,
)
from parameterized import parameterized
load_dotenv()
@ -40,7 +43,8 @@ class TestGetRewardFuncs(unittest.TestCase):
reward_names = [
"accuracy",
"format",
"reasoning_steps",
"think_format",
"think_accruacyreasoning_steps",
"cosine",
"repetition_penalty",
"length",
@ -53,6 +57,8 @@ class TestGetRewardFuncs(unittest.TestCase):
reward_func_names = [
"accuracy_reward",
"format_reward",
"think_format_reward",
"think_accuracy_reward",
"reasoning_steps_reward",
"cosine_scaled_reward",
"repetition_penalty_reward",
@ -564,5 +570,90 @@ class TestCodeFormat(unittest.TestCase):
self.assertEqual(rewards[0], 1.0)
class TestThinkFormatReward(unittest.TestCase):
    """Checks `think_format_reward` on well-formed and malformed `<think>` layouts."""

    @parameterized.expand(
        [
            ("<think>\n\nThought\n\n</think>\n\nSolution",),
            ("<think>\nThought\n</think>\nSolution",),
            ("<think>Thought</think>Solution",),
            ("<think> Thought </think> Solution",),
            ("\n<think> Thought </think> Solution",),
            ("<think> Thought </think> Solution",),
            ("\n\n<think> Thought </think> Solution",),
            (" <think> Thought </think> Solution",),
            ("\n <think> Thought </think> Solution",),
        ]
    )
    def test_correct_think_format(self, format_string):
        """Test think_format_reward with correct think format."""
        completion = [[{"content": format_string}]]
        rewards = think_format_reward(completion)
        self.assertEqual(rewards[0], 1.0)

    # Every row is a one-element tuple (note the trailing comma) so all cases are declared the same way.
    @parameterized.expand(
        [
            ("Preamble <think> Thought </think> Solution",),
            ("No tags at all",),
            ("<think> Missing closing thought",),
            ("<think> Thought 1 </think> <think> Thought 2 </think> Solution section",),
            (" <think> Thought 1 </think> Extra think tag </think> Solution",),
            (" <think> Thought 1 </think> Extra opening think tag <think> Solution",),
        ]
    )
    def test_incorrect_think_format(self, format_string):
        """Test think_format_reward with incorrect think format."""
        completion = [[{"content": format_string}]]
        rewards = think_format_reward(completion)
        self.assertEqual(rewards[0], 0.0)
class TestThinkAccuracyReward(unittest.TestCase):
    """Checks that `think_accuracy_reward` only scores the text after the last `</think>` tag."""

    def _first_reward(self, content):
        """Score a single completion against the gold answer `\\boxed{42}` and return its reward."""
        return think_accuracy_reward([[{"content": content}]], solution=["\\boxed{42}"])[0]

    def test_correct_answer_after_think(self):
        reward = self._first_reward("<think> Thought </think> The answer is \\boxed{42}")
        self.assertEqual(reward, 1.0)

    def test_correct_answer_after_multiple_think(self):
        reward = self._first_reward(
            "<think> Thought 1 </think> <think> Thought 2 </think> The answer is \\boxed{42}"
        )
        self.assertEqual(reward, 1.0)

    def test_incorrect_answer_after_think(self):
        reward = self._first_reward("<think> Thought </think> The answer is \\boxed{43}")
        self.assertEqual(reward, 0.0)

    def test_multiple_answers_gives_zero_reward(self):
        reward = self._first_reward("<think> Thought </think> The answer is \\boxed{6} and \\boxed{42}")
        self.assertEqual(reward, 0.0)

    def test_no_latex_answer_gives_zero_reward(self):
        reward = self._first_reward("<think> Thought </think> The answer is 42")
        self.assertEqual(reward, 0.0)

    def test_truncated_answer_gives_zero_reward(self):
        # The boxed answer sits inside the thought section, so it must not be credited.
        reward = self._first_reward(
            "<think> Thought the answer is \\boxed{42} </think> The answer is [TRUNCATED]"
        )
        self.assertEqual(reward, 0.0)

    def test_truncated_answer_after_multiple_think_gives_zero_reward(self):
        reward = self._first_reward(
            "<think> Thought 1 the answer is \\boxed{42} </think> <think> Thought 2 the answer is \\boxed{42} </think> The answer is [TRUNCATED]"
        )
        self.assertEqual(reward, 0.0)

    def test_truncated_thought_gives_zero_reward(self):
        # No closing tag at all: there is no answer section to score.
        reward = self._first_reward("<think> Thought the answer is \\boxed{42} and [TRUNCATED]")
        self.assertEqual(reward, 0.0)
if __name__ == "__main__":
unittest.main()