open-r1/scripts/benchmark_e2b.py

# coding=utf-8
# Copyright 2025 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Benchmark script for the code_reward function with E2B.

This script measures the performance of the code_reward function with varying numbers
of samples and parallelization levels.

Each sample is a CodeForces problem with a gold standard solution that is executed against a set of public test cases.
"""

from datasets import load_dataset
import time
from tqdm.auto import tqdm

from dotenv import load_dotenv
load_dotenv()

from open_r1.rewards import code_reward

def benchmark_code_reward(example):
    start_time = time.time()
    test_completions = [[{"content": example["gold_standard_solution"]}]]
    reward_kwargs = {"verification_info": [example["verification_info"]]}
    rewards = code_reward(test_completions, **reward_kwargs)
    end_time = time.time()
    example["test_reward"] = rewards[0]
    example["reward_time"] = end_time - start_time
    return example

if __name__ == "__main__":
    parallel_dict = {
        16:[1,4,16],
        64:[4,16, 64],
        256:[16, 64, 96], # cap at 96 as PRO account is limited to 100
    }
    # Store results for table formatting
    results = []

    for num_samples in tqdm([16, 64,256], desc="Benchmarking samples"):
        for num_parallel in parallel_dict[num_samples]:
            code_dataset = load_dataset("open-r1/verifiable-coding-problems-python_decontaminated")
            code_dataset = code_dataset["train"].shuffle(seed=42).select(range(num_samples))

            test_completions = [[{"content": example["gold_standard_solution"]}] for example in code_dataset]
            reward_kwargs = {"verification_info": [example["verification_info"] for example in code_dataset]}

            start_time = time.time()
            rewards = code_reward(test_completions, num_parallel=num_parallel, **reward_kwargs)
            execution_time = time.time() - start_time

            # Calculate some statistics about rewards
            mean_reward = sum(rewards) / len(rewards)
            min_reward = min(rewards)
            max_reward = max(rewards)

            # Store results
            results.append({
                "num_samples": num_samples,
                "num_parallel": num_parallel,
                "execution_time": execution_time,
                "mean_reward": mean_reward,
                "min_reward": min_reward,
                "max_reward": max_reward
            })

    print("\n## Benchmark Results\n")
    print("| Sample Size | Parallelization | Execution Time (s) | Mean Reward | Min Reward | Max Reward |")
    print("|:-----------:|:---------------:|------------------:|:-----------:|:-----------:|:-----------:|")

    for result in results:
        print(f"| {result['num_samples']:^11} | {result['num_parallel']:^15} | {result['execution_time']:17.2f} | {result['mean_reward']:^11.4f} | {result['min_reward']:^11.4f} | {result['max_reward']:^11.4f} |")