Add codeforces

Merge branch 'main' into zero-math-code
Merge branch 'bump-deps-0' into zero-math-code
2026-06-24 01:54:06 +00:00 · 2025-05-28 19:21:15 +00:00 · 2025-05-28 13:52:20 +02:00 · 2025-05-28 10:11:06 +02:00 · 2025-05-28 09:24:12 +02:00 · 2025-05-28 09:22:22 +02:00
9 changed files with 499 additions and 9 deletions
--- a/README.md
+++ b/README.md
@ -477,8 +477,9 @@ lighteval vllm $MODEL_ARGS "extended|lcb:codegeneration|0|0" \
 To increase throughput across multiple GPUs, use _data parallel_ as follows:

 ```shell
+export VLLM_WORKER_MULTIPROC_METHOD=spawn
 NUM_GPUS=8
-MODEL=deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
+MODEL=deepseek-ai/DeepSeek-R1-Distill-Llama-8B
 MODEL_ARGS="model_name=$MODEL,dtype=bfloat16,data_parallel_size=$NUM_GPUS,max_model_length=32768,gpu_memory_utilization=0.8,generation_parameters={max_new_tokens:32768,temperature:0.6,top_p:0.95}"
 TASK=aime24
 OUTPUT_DIR=data/evals/$MODEL
--- a/recipes/R1-Zero-Qwen-Math-7B-Code/grpo/config_v00.00.yaml
+++ b/recipes/R1-Zero-Qwen-Math-7B-Code/grpo/config_v00.00.yaml
@ -0,0 +1,76 @@
+# Config for 1 + 1 nodes
+# Model arguments
+model_name_or_path: open-r1/Qwen2.5-Math-7B-RoPE-300k
+model_revision: main
+torch_dtype: bfloat16
+attn_implementation: flash_attention_2
+
+# Data training arguments
+chat_template: "{%- if messages[0]['role'] == 'system' %}\n{{- messages[0]['content'] }}\n{%- else %}\n{{- 'Below is a conversation between a user and an AI assistant. The user asks a question, and the assistant solves it. The assistant first thinks about the reasoning process in the mind and then provides the user with the answer. The reasoning process and answer are structured into two main sections: Thought and Answer using the specified format: <think> Thought section </think> Answer section.' }}\n{%- endif %}\n{%- for message in messages %}\n    {%- if message['role'] == 'user' %}\n        {{- '\\n\\nUser: ' + message['content'].strip() }}\n    {%- elif message['role'] == 'system' %}\n        {{- message['content'] }}\n    {%- elif message['role'] == 'assistant' %}\n        {{- '\\n\\nAssistant: '  + message['content'] }}\n    {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n    {{- '\\n\\nAssistant: ' }}\n{%- endif %}"
+dataset_name: open-r1/codeforces
+dataset_prompt_column: prompt
+dataset_config: verifiable-prompts
+dataset_test_split: test
+dataset_train_split: train
+
+# GRPO trainer config
+callbacks:
+- push_to_hub_revision
+benchmarks:
+- aime24
+- gpqa
+- lcb_v4
+beta: 0.0
+loss_type: dr_grpo
+scale_rewards: false
+bf16: true
+do_eval: false
+eval_strategy: "no"
+use_vllm: true
+gradient_accumulation_steps: 32
+gradient_checkpointing: true
+gradient_checkpointing_kwargs:
+  use_reentrant: false
+hub_model_id: open-r1/R1-Zero-Qwen-Math-7B-Code
+hub_model_revision: v00.00
+hub_strategy: every_save
+learning_rate: 1.0e-06
+log_completions: true
+log_level: info
+logging_first_step: true
+logging_steps: 1
+logging_strategy: steps
+lr_scheduler_type: constant_with_warmup
+mask_truncated_completions: true
+max_grad_norm: 0.2
+max_prompt_length: 2048
+max_completion_length: 8192
+max_steps: -1
+num_generations: 16
+num_iterations: 1
+num_train_epochs: 4 # About 1k optimization steps
+output_dir: data/R1-Zero-Qwen-Math-7B-Code-v00.00
+overwrite_output_dir: true
+per_device_train_batch_size: 4
+push_to_hub: true
+report_to:
+- wandb
+reward_funcs:
+- think_code_format
+- cf_code
+reward_weights:
+- 1.0
+- 1.0
+save_strategy: "steps"
+save_steps: 0.2
+save_total_limit: 1
+seed: 42
+temperature: 1.0
+top_k: null
+use_liger_kernel: true
+warmup_ratio: 0.0
+# For each generation, evaluate this many test cases in parallel, then check whether any of them failed (score of 0).
+# If so, stop evaluating; otherwise continue with the next batch of test cases.
+# Useful to avoid overloading the eval server and to save time on wrong solutions.
+code_eval_test_batch_size: 30 # -1
+code_eval_scoring_mode: weighted_sum
--- a/recipes/R1-Zero-Qwen-Math-7B-Math/grpo/config_v00.00.yaml
+++ b/recipes/R1-Zero-Qwen-Math-7B-Math/grpo/config_v00.00.yaml
@ -0,0 +1,67 @@
+# Config for 1 + 1 nodes
+# Model arguments
+model_name_or_path: open-r1/Qwen2.5-Math-7B-RoPE-300k
+model_revision: main
+torch_dtype: bfloat16
+attn_implementation: flash_attention_2
+
+# Data training arguments
+chat_template: "{%- if messages[0]['role'] == 'system' %}\n{{- messages[0]['content'] }}\n{%- else %}\n{{- 'Below is a conversation between a user and an AI assistant. The user asks a question, and the assistant solves it. The assistant first thinks about the reasoning process in the mind and then provides the user with the answer. The reasoning process and answer are structured into two main sections: Thought and Answer using the specified format: <think> Thought section </think> Answer section.' }}\n{%- endif %}\n{%- for message in messages %}\n    {%- if message['role'] == 'user' %}\n        {{- '\\n\\nUser: ' + message['content'].strip() }}\n    {%- elif message['role'] == 'system' %}\n        {{- message['content'] }}\n    {%- elif message['role'] == 'assistant' %}\n        {{- '\\n\\nAssistant: '  + message['content'] }}\n    {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n    {{- '\\n\\nAssistant: ' }}\n{%- endif %}"
+dataset_name: open-r1/DAPO-Math-17k-Processed
+dataset_config: all
+
+# GRPO trainer config
+callbacks:
+- push_to_hub_revision
+benchmarks:
+- aime24
+- gpqa
+- lcb_v4
+beta: 0.0
+bf16: true
+do_eval: false
+eval_strategy: "no"
+use_vllm: true
+gradient_accumulation_steps: 16
+gradient_checkpointing: true
+gradient_checkpointing_kwargs:
+  use_reentrant: false
+hub_model_id: open-r1/R1-Zero-Qwen-Math-7B-Math
+hub_model_revision: v00.00
+hub_strategy: every_save
+learning_rate: 1.0e-06
+log_completions: true
+log_level: info
+logging_first_step: true
+logging_steps: 1
+logging_strategy: steps
+lr_scheduler_type: constant_with_warmup
+mask_truncated_completions: true
+max_grad_norm: 0.2
+max_prompt_length: 1024
+max_completion_length: 8192
+max_steps: -1
+num_generations: 16
+num_iterations: 1
+num_train_epochs: 1
+output_dir: data/R1-Zero-Qwen-Math-7B-Math-v00.00
+overwrite_output_dir: true
+per_device_train_batch_size: 4
+push_to_hub: true
+report_to:
+- wandb
+reward_funcs:
+- think_format
+- think_accuracy
+reward_weights:
+- 1.0
+- 1.0
+save_strategy: "steps"
+save_steps: 0.2
+save_total_limit: 1
+seed: 42
+temperature: 1.0
+top_k: null
+use_liger_kernel: true
+warmup_ratio: 0.0
--- a/recipes/R1-Zero-Qwen-Math-7B-Math/grpo/config_v01.00.yaml
+++ b/recipes/R1-Zero-Qwen-Math-7B-Math/grpo/config_v01.00.yaml
@ -0,0 +1,67 @@
+# Config for 1 + 1 nodes
+# Model arguments
+model_name_or_path: open-r1/Qwen2.5-Math-7B-RoPE-300k
+model_revision: main
+torch_dtype: bfloat16
+attn_implementation: flash_attention_2
+
+# Data training arguments
+chat_template: "{%- if messages[0]['role'] == 'system' %}\n{{- messages[0]['content'] }}\n{%- else %}\n{{- 'Below is a conversation between a user and an AI assistant. The user asks a question, and the assistant thoroughly explores it through a systematic thinking process before providing the final precise and accurate solution. The assistant structures its response into two main sections: Thought and Solution using the specified format: <think> Thought section </think> Solution section. In the Thought section, the assistant details its reasoning process in steps. Each step includes detailed considerations such as analysing questions, summarizing relevant findings, brainstorming new ideas, verifying the accuracy of the current steps, refining any errors, and revisiting previous steps. In the Solution section, based on various attempts, explorations, and reflections from the Thought section, the assistant systematically presents the final solution that it deems correct.' }}\n{%- endif %}\n{%- for message in messages %}\n    {%- if message['role'] == 'user' %}\n        {{- '\\n\\nUser: ' + message['content'].strip() }}\n    {%- elif message['role'] == 'system' %}\n        {{- message['content'] }}\n    {%- elif message['role'] == 'assistant' %}\n        {{- '\\n\\nAssistant: '  + message['content'] }}\n    {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n    {{- '\\n\\nAssistant: ' }}\n{%- endif %}"
+dataset_name: open-r1/DAPO-Math-17k-Processed
+dataset_config: all
+
+# GRPO trainer config
+callbacks:
+- push_to_hub_revision
+benchmarks:
+- aime24
+- gpqa
+- lcb_v4
+beta: 0.0
+bf16: true
+do_eval: false
+eval_strategy: "no"
+use_vllm: true
+gradient_accumulation_steps: 16
+gradient_checkpointing: true
+gradient_checkpointing_kwargs:
+  use_reentrant: false
+hub_model_id: open-r1/R1-Zero-Qwen-Math-7B-Math
+hub_model_revision: v01.00
+hub_strategy: every_save
+learning_rate: 1.0e-06
+log_completions: true
+log_level: info
+logging_first_step: true
+logging_steps: 1
+logging_strategy: steps
+lr_scheduler_type: constant_with_warmup
+mask_truncated_completions: true
+max_grad_norm: 0.2
+max_prompt_length: 1024
+max_completion_length: 8192
+max_steps: -1
+num_generations: 16
+num_iterations: 1
+num_train_epochs: 1
+output_dir: data/R1-Zero-Qwen-Math-7B-Math-v01.00
+overwrite_output_dir: true
+per_device_train_batch_size: 4
+push_to_hub: true
+report_to:
+- wandb
+reward_funcs:
+- think_format
+- think_accuracy
+reward_weights:
+- 1.0
+- 1.0
+save_strategy: "steps"
+save_steps: 0.2
+save_total_limit: 1
+seed: 42
+temperature: 1.0
+top_k: null
+use_liger_kernel: true
+warmup_ratio: 0.0
--- a/slurm/piston/README.md
+++ b/slurm/piston/README.md
@ -3,6 +3,7 @@
 We have built a [piston](https://github.com/engineer-man/piston) package to run IOI problems.

 To launch a fleet of piston workers on a slurm cluster, you can adapt the paths in `launch_piston_workers.sh` and `launch_single_piston.sh` and run:
+
 ```bash
 slurm/piston/launch_piston_workers.sh (number of workers to launch)
 ```
@ -10,7 +11,9 @@ slurm/piston/launch_piston_workers.sh (number of workers to launch)
 This command will launch a slurm job for each worker, which will be called `piston-worker-<port>`, where `<port>` is the port where the worker will be listening.

 ## First time setup
+
 You will need to install the [IOI package](https://github.com/guipenedo/piston/tree/master/packages/cms_ioi/1.0.0) in the workers.
+
 1. Launch a single worker:
 ```bash
 slurm/piston/launch_piston_workers.sh 1
@ -25,7 +28,7 @@ curl -X POST http://ip-10-53-86-146:1234/api/v2/packages -H "Content-Type: appli

 For CodeForces:
 ```bash
-curl -X POST http://ip-10-53-86-146:1234/api/v2/packages -H "Content-Type: application/json" -d '{"language": "codeforces", "version": "1.0.0"}'
+curl -X POST http://ip-10-53-93-192:7836/api/v2/packages -H "Content-Type: application/json" -d '{"language": "codeforces", "version": "1.0.0"}'
 ```

 3. You can now launch more workers and due to the shared mounted packages directory, they should already have the package installed.
--- a/slurm/piston/launch_single_piston.sh
+++ b/slurm/piston/launch_single_piston.sh
@ -1,7 +1,7 @@
 #!/bin/bash
 #SBATCH --job-name=piston_worker
-#SBATCH --output=/fsx/open-r1/logs/piston/worker-logs/%x-%j.out
-#SBATCH --error=/fsx/open-r1/logs/piston/worker-logs/%x-%j.out  # Redirect error logs to .out
+#SBATCH --output=./logs/%x-%j.out
+#SBATCH --error=./logs/%x-%j.out  # Redirect error logs to .out
 #SBATCH --cpus-per-task=2
 #SBATCH --mem-per-cpu=1950M
 #SBATCH --partition=hopper-cpu
@ -14,7 +14,11 @@ sleep $(( RANDOM % 20 ))
 # we use 63b5654156a89c5a2ad281aface21416615d62ec056d88efe8fcd307ce73575a as the latest image requires isolate, which does not work on the HF science cluster (cgroups incompatibility)
 # feel free try with the latest image
 # the code you see below increases the very constrained piston default limits, and sets the repo url to the one hosting our IOI package
-srun --container-mounts=/fsx/guilherme/ioi2024/piston_files/packages:/piston/packages --container-image "ghcr.io#engineer-man/piston:sha256:63b5654156a89c5a2ad281aface21416615d62ec056d88efe8fcd307ce73575a" \
+
+# Create the packages directory on the host if it doesn't exist
+mkdir -p ./data/piston/packages
+
+srun --container-mounts=./data/piston/packages:/piston/packages:rw --container-image "ghcr.io#engineer-man/piston:sha256:63b5654156a89c5a2ad281aface21416615d62ec056d88efe8fcd307ce73575a" \
    bash -c "
    export PISTON_COMPILE_TIMEOUT=60000
    export PISTON_RUN_TIMEOUT=60000
@ -23,6 +27,10 @@ srun --container-mounts=/fsx/guilherme/ioi2024/piston_files/packages:/piston/pac
    export PISTON_DISABLE_NETWORKING=true
    export PISTON_REPO_URL=https://github.com/guipenedo/piston/releases/download/pkgs/index

+    # Ensure the packages directory exists and has correct permissions inside container
+    mkdir -p /piston/packages
+    chmod 755 /piston/packages
+
    sed -i '/app.use(body_parser.urlencoded/c\    app.use(body_parser.urlencoded({ extended: true, limit: \"512mb\" }));' src/index.js
    sed -i '/app.use(body_parser.json/c\    app.use(body_parser.json({ limit: \"512mb\" }));' src/index.js

--- a/slurm/train.slurm
+++ b/slurm/train.slurm
@ -27,6 +27,8 @@ fi
 module load cuda/12.4
 set -x -e

+export PISTON_ENDPOINTS=slurm
+
 source ~/.bashrc
 source openr1/bin/activate
 START_TIME=$(date +%s)
@ -137,11 +139,11 @@ fi

 # force crashing on nccl issues like hanging broadcast
 export NCCL_ASYNC_ERROR_HANDLING=1
-# export NCCL_DEBUG=INFO
-# export NCCL_DEBUG_SUBSYS=COLL
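+# NOTE(review): the three settings below are debugging aids. NCCL_DEBUG=INFO makes NCCL log verbosely and
+# CUDA_LAUNCH_BLOCKING=1 makes CUDA kernel launches synchronous, which slows training; disable them for regular runs.
+export NCCL_DEBUG=INFO
+export NCCL_DEBUG_SUBSYS=COLL
+export CUDA_LAUNCH_BLOCKING=1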
 # export NCCL_SOCKET_NTHREADS=1
 # export NCCL_NSOCKS_PERTHREAD=1
-# export CUDA_LAUNCH_BLOCKING=1

 export CMD=" \
    src/open_r1/$TASK.py --config $CONFIG_FILE $OPTIONAL_ARGS
--- a/src/open_r1/rewards.py
+++ b/src/open_r1/rewards.py
@ -90,6 +90,125 @@ def format_reward(completions, **kwargs):
    return [1.0 if match else 0.0 for match in matches]


+# Copied from TRL: https://github.com/huggingface/trl/blob/9ac614fb081e17805f7f62ab3f5f7036bdefe7b0/trl/rewards/format_rewards.py#L18
+def think_format_reward(completions: list[list[dict[str, str]]], **kwargs) -> list[float]:
+    r"""
+    Reward function that checks if the reasoning process is enclosed within `"<think>"` and `"</think>"` tags. The
+    function returns a reward of 1.0 if the format is correct, otherwise 0.0.
+
+    A completion is rewarded only when both conditions hold:
+    1. It starts with optional whitespace immediately followed by a <think> tag (nothing else may precede the tag)
+    2. It contains exactly one <think> tag and exactly one </think> tag
+
+    Args:
+        completions (`list[list[dict[str, str]]]`):
+            List of completions to be evaluated. Each completion must be a list of one message, i.e. a dictionary
+            containing the key `"content"` with the value being the text of the completion.
+        **kwargs:
+            Additional keyword arguments. This function does not use them, but they are required in the function
+            signature to ensure compatibility with trainers like [`GRPOTrainer`].
+
+    Returns:
+        `list[float]`:
+            A list of rewards, where each reward is 1.0 if the completion matches the expected format, otherwise 0.0.
+
+    Example:
+    ```python
+    >>> from open_r1.rewards import think_format_reward
+    >>> completions = [
+    ...     [{"content": "<think>\nThis is my reasoning.\n</think>\nThis is my answer."}],
+    ...     [{"content": "\n<think>\nThis is my reasoning.\n</think>\nThis is my answer."}],
+    ...     [{"content": "  \n \n<think>\nThis is my reasoning.\n</think>\nThis is my answer."}],
+    ...     [{"content": "Some text <think>\nThis is my reasoning.\n</think>\nThis is my answer."}],
+    ...     [{"content": "<think>\nThis is my reasoning.\n</think> Extra </think>\nThis is my answer."}],
+    ... ]
+    >>> think_format_reward(completions)
+    [1.0, 1.0, 1.0, 0.0, 0.0]
+    ```
+    """
+    # `\s` already covers newlines and `re.match` anchors at the start of the string, so no regex flags are needed.
+    start_pattern = re.compile(r"^\s*<think>")
+
+    completion_contents = [completion[0]["content"] for completion in completions]
+    rewards = []
+
+    for content in completion_contents:
+        # Only leading whitespace may precede the opening tag
+        starts_with_think = start_pattern.match(content) is not None
+        # Exactly one opening and one closing tag are allowed
+        has_single_tag_pair = content.count("<think>") == 1 and content.count("</think>") == 1
+
+        rewards.append(1.0 if starts_with_think and has_single_tag_pair else 0.0)
+
+    return rewards
+
+
+def think_accuracy_reward(
+    completions: list[list[dict[str, str]]], solution: list[str], **kwargs
+) -> list[Optional[float]]:
+    """Score the text that follows the final </think> tag against the ground-truth solution.
+
+    Each completion is reduced to whatever comes after its last `</think>` tag (an empty string when the tag is
+    absent) and paired with the matching entry of `solution`. The reward is `float(verify(gold, answer))`, or `None`
+    when the gold solution cannot be parsed or `verify` raises, so that the example can be skipped.
+    """
+    closing_tag = "</think>"
+
+    # Keep only the text after the last closing tag; a missing tag yields an empty answer.
+    answers = []
+    for completion in completions:
+        _, found_tag, tail = completion[0]["content"].rpartition(closing_tag)
+        answers.append(tail.strip() if found_tag else "")
+
+    rewards = []
+    for answer, gold in zip(answers, solution):
+        gold_parsed = parse(gold, extraction_mode="first_match")
+        if len(gold_parsed) == 0:
+            # If the gold solution is not parseable, we assign `None` to skip this example
+            print("Failed to parse gold solution: ", gold)
+            rewards.append(None)
+            continue
+
+        # We require the answer to be provided in correct latex (no malformed operators)
+        normalization = NormalizationConfig(
+            nits=False,
+            malformed_operators=False,
+            basic_latex=True,
+            equations=True,
+            boxed="all",
+            units=True,
+        )
+        latex_config = LatexExtractionConfig(
+            normalization_config=normalization,
+            # Ensures that boxed is tried first
+            boxed_match_priority=0,
+            try_extract_without_anchor=False,
+        )
+        answer_parsed = parse(answer, extraction_config=[latex_config], extraction_mode="first_match")
+
+        # Compute binary rewards if verifiable, `None` otherwise to skip this example
+        try:
+            score = float(verify(gold_parsed, answer_parsed))
+        except Exception as e:
+            print(f"verify failed: {e}, answer: {answer_parsed}, gold: {gold_parsed}")
+            score = None
+        rewards.append(score)
+
+    return rewards
+
+
 def tag_count_reward(completions, **kwargs) -> list[float]:
    """Reward function that checks if we produce the desired number of think and answer tags associated with `format_reward()`.

@ -616,6 +735,59 @@ def get_code_format_reward(language: str = "python"):

    return code_format_reward

+def get_think_code_format_reward(language: str = "python"):
+    """Format reward function that checks for proper think tags AND code format.
+
+    This function combines the logic from think_format_reward (proper <think> tag placement)
+    with code language checking after the closing </think> tag (no <answer> tags required).
+
+    The returned function gives 1.0 to a completion only if it starts with optional whitespace followed by
+    `<think>`, contains exactly one `<think>` and one `</think>` tag, and the text after `</think>` contains a
+    fenced code block whose opening fence is immediately followed by the expected language name. Otherwise 0.0.
+
+    Args:
+        language: Programming language supported by E2B https://e2b.dev/docs/code-interpreting/supported-languages
+    """
+
+    def think_code_format_reward(completions, **kwargs):
+        # if there is a language field, use it instead of the default language
+        languages = kwargs["language"] if "language" in kwargs else [language] * len(completions)
+
+        completion_contents = [completion[0]["content"] for completion in completions]
+        rewards = []
+
+        # `\s` already covers newlines and `re.match` anchors at the start of the string, so no flags are needed
+        start_pattern = re.compile(r"^\s*<think>")
+
+        for content, sample_language in zip(completion_contents, languages):
+            # Check for proper think tag format: leading <think> and exactly one tag pair
+            proper_think_format = (
+                start_pattern.match(content) is not None
+                and content.count("<think>") == 1
+                and content.count("</think>") == 1
+            )
+
+            if not proper_think_format:
+                rewards.append(0.0)
+                continue
+
+            # Extract content after the closing </think> tag (guaranteed to exist exactly once at this point)
+            _, _, content_after_think = content.partition("</think>")
+
+            # Escape the language so names containing regex metacharacters (e.g. "c++") are matched literally
+            # instead of being interpreted as regex syntax
+            code_pattern = rf"```{re.escape(sample_language)}.*?```"
+            code_match = re.search(code_pattern, content_after_think, re.DOTALL)
+
+            rewards.append(1.0 if code_match else 0.0)
+
+        return rewards
+
+    return think_code_format_reward
+

 def get_soft_overlong_punishment(max_completion_len, soft_punish_cache):
    """
@ -647,6 +819,8 @@ def get_reward_funcs(script_args) -> list[Callable]:
    REWARD_FUNCS_REGISTRY = {
        "accuracy": accuracy_reward,
        "format": format_reward,
+        "think_format": think_format_reward,
+        "think_accuracy": think_accuracy_reward,
        "reasoning_steps": reasoning_steps_reward,
        "cosine": get_cosine_scaled_reward(
            min_value_wrong=script_args.cosine_min_value_wrong,
@ -695,6 +869,7 @@ def get_reward_funcs(script_args) -> list[Callable]:
            cf_code_reward,
        ),
        "code_format": get_code_format_reward(language=script_args.code_language),
+        "think_code_format": get_think_code_format_reward(language=script_args.code_language),
        "tag_count": tag_count_reward,
        "soft_overlong_punishment": get_soft_overlong_punishment(
            max_completion_len=script_args.max_completion_len,
--- a/tests/test_rewards.py
+++ b/tests/test_rewards.py
@ -28,7 +28,10 @@ from open_r1.rewards import (
    len_reward,
    reasoning_steps_reward,
    tag_count_reward,
+    think_accuracy_reward,
+    think_format_reward,
 )
+from parameterized import parameterized


 load_dotenv()
@ -40,7 +43,8 @@ class TestGetRewardFuncs(unittest.TestCase):
        reward_names = [
            "accuracy",
            "format",
+            "think_format",
+            "think_accuracy",
             "reasoning_steps",
            "cosine",
            "repetition_penalty",
            "length",
@ -53,6 +57,8 @@ class TestGetRewardFuncs(unittest.TestCase):
        reward_func_names = [
            "accuracy_reward",
            "format_reward",
+            "think_format_reward",
+            "think_accuracy_reward",
            "reasoning_steps_reward",
            "cosine_scaled_reward",
            "repetition_penalty_reward",
@ -564,5 +570,90 @@ class TestCodeFormat(unittest.TestCase):
        self.assertEqual(rewards[0], 1.0)


+class TestThinkFormatReward(unittest.TestCase):
+    """Parameterized checks of think_format_reward on well-formed and malformed completions."""
+
+    @parameterized.expand(
+        [
+            ("<think>\n\nThought\n\n</think>\n\nSolution",),
+            ("<think>\nThought\n</think>\nSolution",),
+            ("<think>Thought</think>Solution",),
+            ("<think> Thought </think> Solution",),
+            ("\n<think> Thought </think> Solution",),
+            ("\t<think> Thought </think> Solution",),
+            ("\n\n<think> Thought </think> Solution",),
+            (" <think> Thought </think> Solution",),
+            ("\n <think> Thought </think> Solution",),
+        ]
+    )
+    def test_correct_think_format(self, format_string):
+        """Test think_format_reward with correct think format."""
+        completion = [[{"content": format_string}]]
+        rewards = think_format_reward(completion)
+        self.assertEqual(rewards[0], 1.0)
+
+    @parameterized.expand(
+        [
+            ("Preamble <think> Thought </think> Solution",),
+            ("No tags at all",),
+            ("<think> Missing closing thought",),
+            ("<think> Thought 1 </think> <think> Thought 2 </think> Solution section",),
+            # Trailing commas make these 1-tuples like every other entry
+            (" <think> Thought 1 </think> Extra think tag </think> Solution",),
+            (" <think> Thought 1 </think> Extra opening think tag <think> Solution",),
+        ]
+    )
+    def test_incorrect_think_format(self, format_string):
+        """Test think_format_reward with incorrect think format."""
+        completion = [[{"content": format_string}]]
+        rewards = think_format_reward(completion)
+        self.assertEqual(rewards[0], 0.0)
+
+
+class TestThinkAccuracyReward(unittest.TestCase):
+    """Checks think_accuracy_reward on answers placed after, inside, or without a closing think tag."""
+
+    GOLD = ["\\boxed{42}"]
+
+    def _reward_for(self, text):
+        """Return the reward of a single completion `text` scored against GOLD."""
+        return think_accuracy_reward([[{"content": text}]], solution=self.GOLD)[0]
+
+    def test_correct_answer_after_think(self):
+        self.assertEqual(self._reward_for("<think> Thought </think> The answer is \\boxed{42}"), 1.0)
+
+    def test_correct_answer_after_multiple_think(self):
+        text = "<think> Thought 1 </think> <think> Thought 2 </think> The answer is \\boxed{42}"
+        self.assertEqual(self._reward_for(text), 1.0)
+
+    def test_incorrect_answer_after_think(self):
+        self.assertEqual(self._reward_for("<think> Thought </think> The answer is \\boxed{43}"), 0.0)
+
+    def test_multiple_answers_gives_zero_reward(self):
+        text = "<think> Thought </think> The answer is \\boxed{6} and \\boxed{42}"
+        self.assertEqual(self._reward_for(text), 0.0)
+
+    def test_no_latex_answer_gives_zero_reward(self):
+        self.assertEqual(self._reward_for("<think> Thought </think> The answer is 42"), 0.0)
+
+    def test_truncated_answer_gives_zero_reward(self):
+        text = "<think> Thought the answer is \\boxed{42} </think> The answer is [TRUNCATED]"
+        self.assertEqual(self._reward_for(text), 0.0)
+
+    def test_truncated_answer_after_multiple_think_gives_zero_reward(self):
+        text = "<think> Thought 1 the answer is \\boxed{42} </think> <think> Thought 2 the answer is \\boxed{42} </think> The answer is [TRUNCATED]"
+        self.assertEqual(self._reward_for(text), 0.0)
+
+    def test_truncated_thought_gives_zero_reward(self):
+        text = "<think> Thought the answer is \\boxed{42} and [TRUNCATED]"
+        self.assertEqual(self._reward_for(text), 0.0)
+
+
 if __name__ == "__main__":
    unittest.main()
Author	SHA1	Message	Date
Lewis Tunstall	3bcc4fc86e	Add codeforces	2025-05-28 19:21:15 +00:00
Lewis Tunstall	43375fa7b9	Merge branch 'main' into zero-math-code	2025-05-28 13:52:20 +02:00
Lewis Tunstall	97b1c22e55	Merge branch 'bump-deps-0' into zero-math-code	2025-05-28 10:11:06 +02:00
Lewis Tunstall	cada407cd6	Merge branch 'main' into zero-math-code	2025-05-28 09:24:12 +02:00
Lewis Tunstall	b369e428f8	Merge branch 'main' into zero-math-code	2025-05-28 09:22:22 +02:00
Lewis Tunstall	f6a07648e2	Bump vLLM and TRL	2025-05-28 06:48:01 +00:00
Lewis Tunstall	898406d85f	Fix DP=2 for evals	2025-05-27 21:20:52 +00:00
Lewis Tunstall	b6b1643c2d	Fix benchmarks!	2025-05-27 20:44:35 +00:00
Lewis Tunstall	82fb385fa5	Refine tests	2025-05-27 13:39:00 +00:00
Lewis Tunstall	296aa66e1e	Tweak format reward	2025-05-27 08:16:49 +00:00
Lewis Tunstall	9f6abc8ed1	Relax format reward	2025-05-26 11:15:56 +00:00
Lewis Tunstall	bc06504df5	Add better baseline defaults	2025-05-26 09:06:09 +00:00
Lewis Tunstall	9862bfec41	Relax reward	2025-05-26 08:09:03 +00:00
Lewis Tunstall	1f56bab96c	Tune baseline	2025-05-25 17:22:06 +00:00
Lewis Tunstall	965d451d61	Restore baseline	2025-05-25 17:00:33 +00:00
Lewis Tunstall	31eacc4b9a	Use GAS instead of generation	2025-05-25 16:57:33 +00:00
Lewis Tunstall	0b933a2aa4	Restore gas	2025-05-25 16:54:18 +00:00
Lewis Tunstall	cf765df201	Tune baseline	2025-05-25 13:21:01 +00:00
Lewis Tunstall	da0e9ae28d	Add overlong punishment	2025-05-25 12:46:45 +00:00
Lewis Tunstall	7f777c0583	Add new DAPO recipe	2025-05-25 12:40:32 +00:00
Lewis Tunstall	b575444fe8	Add think format and accuracy rewards	2025-05-25 12:24:43 +00:00
Lewis Tunstall	6c7c102755	Merge remote-tracking branch 'origin/bump-deps-0' into zero-math-code	2025-05-25 14:05:42 +02:00
lewtun	5374bc2bef	Merge branch 'main' into bump-deps-0	2025-05-25 12:02:52 +02:00
Lewis Tunstall	3258282733	Bump deps	2025-05-25 09:59:57 +00:00