open-r1/tests/slow/test_code_reward.py

# Copyright 2025 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


import unittest

from datasets import load_dataset

from e2b_code_interpreter.models import Execution, ExecutionError
from open_r1.rewards import code_reward, ioi_code_reward
from open_r1.utils.routed_morph import RoutedMorphSandbox
from open_r1.utils.routed_sandbox import RoutedSandbox


class TestCodeRewards(unittest.TestCase):
    def test_python_code_reward(self):
        # requires E2B, see the README.md file
        code_dataset = load_dataset("open-r1/verifiable-coding-problems-python_decontaminated-tested-shuffled")
        NUM_SAMPLES = 20
        samples = code_dataset["train"].select(range(NUM_SAMPLES))
        test_completions = [[{"content": sample["gold_standard_solution"]}] for sample in samples]
        reward_kwargs = {"verification_info": [sample["verification_info"] for sample in samples]}
        rewards = code_reward(test_completions, **reward_kwargs)
        print(rewards)
        assert rewards == [1.0] * NUM_SAMPLES

    def test_e2b_router(self):
        # run router locally: python scripts/e2b_router.py
        code_dataset = load_dataset("open-r1/verifiable-coding-problems-python_decontaminated-tested-shuffled")
        NUM_SAMPLES = 128
        samples = code_dataset["train"].select(range(NUM_SAMPLES))
        test_completions = [[{"content": sample["gold_standard_solution"]}] for sample in samples]
        reward_kwargs = {"verification_info": [sample["verification_info"] for sample in samples]}
        rewards = code_reward(test_completions, e2b_router_url="0.0.0.0:8000", **reward_kwargs)
        print(rewards)
        assert rewards == [1.0] * NUM_SAMPLES

    def test_e2b_router_parallel(self):
        # run router locally: python scripts/e2b_router.py
        code_dataset = load_dataset("open-r1/verifiable-coding-problems-python_decontaminated-tested-shuffled")

        BATCH_SIZE = 32
        NUM_SAMPLES = 256

        def batch_code_reward(examples):
            test_completions = [[{"content": solution}] for solution in examples["gold_standard_solution"]]
            reward_kwargs = {
                "verification_info": [verification_info for verification_info in examples["verification_info"]]
            }
            rewards = code_reward(test_completions, e2b_router_url="0.0.0.0:8000", **reward_kwargs)
            assert rewards == [1.0] * BATCH_SIZE
            return examples

        code_dataset = code_dataset["train"].select(range(NUM_SAMPLES))
        code_dataset = code_dataset.map(
            batch_code_reward,
            batched=True,
            batch_size=BATCH_SIZE,
            num_proc=4,
            load_from_cache_file=False,
        )

    def test_ioi_code_reward(self):
        # This slow test case requires spinning up a bunch (I tested with ~64) of piston workers, see docs here
        # slurm/piston/README.md
        code_dataset = load_dataset("open-r1/ioi-reward-test-dataset")
        NUM_SAMPLES = 16
        samples = code_dataset["train"].select(range(NUM_SAMPLES))
        test_completions = [[{"content": f"```cpp\n{sample['sample_solution']}```"}] for sample in samples]
        keys = [key for key in samples[0] if key not in ["prompt", "completion"]]
        reward_kwargs = {key: [example[key] for example in samples] for key in keys}
        rewards = ioi_code_reward(test_completions, **reward_kwargs)
        print(rewards)
        assert rewards == [1.0] * NUM_SAMPLES

    def test_e2b_router_run_code_success(self):
        # run router locally: python scripts/e2b_router.py
        routed_sandbox = RoutedSandbox(router_url="localhost:8000")
        scripts = [
            "print('hello from integration test')",
            "result = 2 + 2\nprint(result)",
        ]

        results = routed_sandbox.run_code(scripts)

        assert len(results) == 2

        for result in results:
            assert isinstance(result, Execution)
            # assert result.exit_code == 0
            assert result.error is None
            assert "hello" in result.logs["stdout"][0] or "4" in result.logs["stdout"][0]

    def test_e2b_router_run_code_with_error(self):
        # run router locally: python scripts/e2b_router.py

        routed_sandbox = RoutedSandbox(router_url="localhost:8000")
        scripts = ["print('this is fine')", "print('unterminated string"]

        results = routed_sandbox.run_code(scripts)

        assert len(results) == 2

        # First one should be okay
        # assert results[0].exit_code == 0 # Execution object has no attribute 'exit_code'
        assert results[0].error is None
        assert "this is fine" in results[0].logs["stdout"][0]

        # Second one should have a syntax error

        # assert results[1].exit_code != 0 # Execution object has no attribute 'exit_code'
        assert results[1].error is not None
        assert isinstance(results[1].error, ExecutionError)
        assert "SyntaxError" in results[1].error.name

    def test_python_code_reward_morph(self):
        # requires MorphCloud, see the README.md file
        code_dataset = load_dataset("open-r1/verifiable-coding-problems-python_decontaminated-tested-shuffled")
        NUM_SAMPLES = 20
        samples = code_dataset["train"].select(range(NUM_SAMPLES))
        test_completions = [[{"content": sample["gold_standard_solution"]}] for sample in samples]
        reward_kwargs = {
            "verification_info": [sample["verification_info"] for sample in samples],
            "provider_type": "morph",
        }
        rewards = code_reward(test_completions, **reward_kwargs)
        print(rewards)
        assert rewards == [1.0] * NUM_SAMPLES

    def test_morph_router(self):
        # run router locally: python scripts/morph_router.py --port 8001 --max_num_sandboxes 20
        code_dataset = load_dataset("open-r1/verifiable-coding-problems-python_decontaminated-tested-shuffled")
        NUM_SAMPLES = 32
        samples = code_dataset["train"].select(range(NUM_SAMPLES))
        test_completions = [[{"content": sample["gold_standard_solution"]}] for sample in samples]
        reward_kwargs = {
            "verification_info": [sample["verification_info"] for sample in samples],
            "provider_type": "morph",
            "morph_router_url": "0.0.0.0:8001",
        }
        rewards = code_reward(test_completions, **reward_kwargs)
        print(rewards)
        assert rewards == [1.0] * NUM_SAMPLES

    def test_morph_router_parallel(self):
        # run router locally: python scripts/morph_router.py --port 8001 --max_num_sandboxes 20
        code_dataset = load_dataset("open-r1/verifiable-coding-problems-python_decontaminated-tested-shuffled")

        BATCH_SIZE = 32
        NUM_SAMPLES = 256

        def batch_code_reward(examples):
            test_completions = [[{"content": solution}] for solution in examples["gold_standard_solution"]]
            reward_kwargs = {
                "verification_info": [verification_info for verification_info in examples["verification_info"]],
                "provider_type": "morph",
                "morph_router_url": "0.0.0.0:8001",
            }
            rewards = code_reward(test_completions, **reward_kwargs)
            assert rewards == [1.0] * BATCH_SIZE
            return examples

        code_dataset = code_dataset["train"].select(range(NUM_SAMPLES))
        code_dataset = code_dataset.map(
            batch_code_reward,
            batched=True,
            batch_size=BATCH_SIZE,
            num_proc=4,
            load_from_cache_file=False,
        )

    def test_morph_router_run_code_success(self):
        # run router locally: python scripts/morph_router.py --port 8001 --max_num_sandboxes 20

        routed_sandbox = RoutedMorphSandbox(router_url="localhost:8001")
        scripts = [
            "print('hello from morph integration test')",
            "result = 2 + 2\nprint(result)",
        ]

        results = routed_sandbox.run_code(scripts)

        assert len(results) == 2

        for result in results:
            assert result.exception_str is None
            assert "hello" in result.text or "4" in result.text

    def test_morph_router_run_code_with_error(self):
        # run router locally: python scripts/morph_router.py --port 8001 --max_num_sandboxes 20

        routed_sandbox = RoutedMorphSandbox(router_url="localhost:8001")
        scripts = ["print('this is fine with morph')", "print('unterminated string"]

        results = routed_sandbox.run_code(scripts)

        assert len(results) == 2

        # First one should be okay
        assert results[0].exception_str is None
        assert "this is fine with morph" in results[0].text

        # Second one should have a syntax error
        assert "SyntaxError" in results[1].text


if __name__ == "__main__":
    unittest.main()