# Copyright 2025 The HuggingFace Team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import unittest from dotenv import load_dotenv from open_r1.configs import GRPOScriptArguments from open_r1.rewards import ( accuracy_reward, format_reward, get_code_format_reward, get_cosine_scaled_reward, get_repetition_penalty_reward, get_reward_funcs, get_soft_overlong_punishment, len_reward, reasoning_steps_reward, tag_count_reward, think_accuracy_reward, think_format_reward, ) from parameterized import parameterized load_dotenv() class TestGetRewardFuncs(unittest.TestCase): def test_get_reward_funcs(self): """Test get_reward_funcs with various reward functions.""" reward_names = [ "accuracy", "format", "think_format", "think_accruacyreasoning_steps", "cosine", "repetition_penalty", "length", "tag_count", "code", "ioi_code", "code_format", "binary_code", ] reward_func_names = [ "accuracy_reward", "format_reward", "think_format_reward", "think_accuracy_reward", "reasoning_steps_reward", "cosine_scaled_reward", "repetition_penalty_reward", "len_reward", "tag_count_reward", "code_reward", "ioi_code_reward", "code_format_reward", "binary_code_reward", ] args = GRPOScriptArguments( dataset_name="dummy", reward_funcs=reward_names, ) reward_funcs = get_reward_funcs(args) self.assertEqual(len(reward_funcs), 11) for func_name, func in zip(reward_func_names, reward_funcs): self.assertEqual(func_name, func.__name__) class TestRewards(unittest.TestCase): def test_accuracy_reward_correct_answer(self): """Test accuracy_reward with a correct answer.""" completion = [[{"content": r"\boxed{\frac{63}{400}}"}]] solution = [r"\frac{63}{400}"] rewards = accuracy_reward(completion, solution) self.assertEqual(rewards[0], 1.0) def test_accuracy_reward_wrong_answer(self): """Test accuracy_reward with an incorrect answer.""" completion = [[{"content": r"\boxed{\frac{64}{400}}"}]] solution = [r"\frac{63}{400}"] rewards = accuracy_reward(completion, solution) self.assertEqual(rewards[0], 0.0) def test_accuracy_reward_wrong_answer_no_latex(self): """Test accuracy_reward with an incorrect answer and gold solution with no latex.""" completion = [[{"content": r"\boxed{3}"}]] solution = ["6"] rewards = accuracy_reward(completion, solution) self.assertEqual(rewards[0], 0.0) def test_format_reward_correct(self): """Test format_reward with correct format.""" completion = [[{"content": "\nSome reasoning\n\n\nThe answer\n"}]] rewards = format_reward(completion) self.assertEqual(rewards[0], 1.0) def test_format_reward_incorrect(self): """Test format_reward with incorrect format.""" incorrect_formats = [ "Only thinking", "Only answer", "No tags at all", "Missing closingMissing closing", "Wrong orderWrong order", ] for fmt in incorrect_formats: completion = [[{"content": fmt}]] rewards = format_reward(completion) self.assertEqual(rewards[0], 0.0) def test_reasoning_steps_reward(self): """Test reasoning_steps_reward with various formats.""" test_cases = [ # Full credit cases (3 or more steps) ("Step 1: First step\nStep 2: Second step\nStep 3: Third step", 1.0), ("First, we do this.\nSecond, we do that.\nFinally, we conclude.", 1.0), # Partial credit cases (less than 3 steps) ("Step 1: Only step", 1 / 3), ("First, we do this.\nFinally, we conclude.", 2 / 3), # No credit case ("Just plain text without any clear steps", 0.0), ] for content, expected_reward in test_cases: completion = [[{"content": content}]] rewards = reasoning_steps_reward(completion) self.assertAlmostEqual(rewards[0], expected_reward) def test_multiple_completions(self): """Test handling multiple completions at once.""" completions = [ [{"content": r"\boxed{\frac{63}{400}}"}], [{"content": r"\boxed{\frac{64}{400}}"}], ] solutions = [r"\frac{63}{400}", r"\frac{63}{400}"] rewards = accuracy_reward(completions, solutions) self.assertEqual(len(rewards), 2) self.assertEqual(rewards[0], 1.0) self.assertEqual(rewards[1], 0.0) def test_cosine_scaled_reward(self): """Test cosine_scaled_reward with various cases.""" # Test parameters test_params = { "min_value_wrong": -1.0, "max_value_wrong": -0.5, "min_value_correct": 0.5, "max_value_correct": 1.0, "max_len": 100, } test_cases = [ # Correct answers with different lengths ( r"\boxed{\frac{63}{400}}", r"\frac{63}{400}", 20, 0.943, ), # Short correct answer ( r"\boxed{\frac{63}{400}}", r"\frac{63}{400}", 80, 0.547, ), # Long correct answer # Wrong answers with different lengths ( r"\boxed{\frac{64}{400}}", r"\frac{63}{400}", 20, -0.942, ), # Short wrong answer ( r"\boxed{\frac{64}{400}}", r"\frac{63}{400}", 80, -0.547, ), # Long wrong answer ] for content, solution, content_len, expected_reward in test_cases: # Pad content to desired length padded_content = content + " " * (content_len - len(content)) completion = [[{"content": padded_content}]] rewards = get_cosine_scaled_reward(**test_params)(completion, [solution]) self.assertAlmostEqual(rewards[0], expected_reward, places=2) def test_format_reward_specific_multiline(self): """Test format_reward with a specific multiline input.""" inputs = "\nI will count each distinct object in the image:\n1. Purple scooter\n2. Red bicycle\n3. Green motorcycle\n4. Gray sedan\n5. Yellow school bus\n6. Small green double-decker bus\n7. Small red car\n8. Small purple car\n9. Small gray dirt bike\n\nThere are 9 distinct objects in total.\n\n\n9\n" completion = [[{"content": inputs}]] rewards = format_reward(completion) self.assertEqual(rewards[0], 1.0) def test_same_length_responses(self): """Test len_reward when all responses have the same length.""" completions = [ [{"content": r"\boxed{\frac{63}{400}}"}], [{"content": r"\boxed{\frac{64}{400}}"}], ] solutions = [r"\frac{63}{400}", r"\frac{63}{400}"] rewards = len_reward(completions, solutions) self.assertEqual(rewards, [0.0, 0.0]) def test_different_lengths_correct_answers(self): """Test len_reward with different length correct answers.""" completions = [ [{"content": r"\boxed{\frac{63}{400}}"}], # shorter [{"content": r"\boxed{\frac{63}{400}} " + "x" * 10}], # longer ] solutions = [r"\frac{63}{400}", r"\frac{63}{400}"] rewards = len_reward(completions, solutions) self.assertGreater(rewards[0], rewards[1]) # shorter answer should get higher reward self.assertAlmostEqual(rewards[0], 0.5) # shortest correct answer gets maximum reward def test_different_lengths_incorrect_answers(self): """Test len_reward with different length incorrect answers.""" completions = [ [{"content": r"\boxed{\frac{64}{400}}"}], # shorter [{"content": r"\boxed{\frac{64}{400}} " + "x" * 10}], # longer ] solutions = [r"\frac{63}{400}", r"\frac{63}{400}"] rewards = len_reward(completions, solutions) self.assertLessEqual(rewards[0], 0.0) # incorrect answers should get non-positive rewards self.assertLessEqual(rewards[1], 0.0) self.assertGreater(rewards[0], rewards[1]) # shorter answer should still be penalized less def test_mixed_correctness(self): """Test len_reward with mix of correct and incorrect answers of different lengths.""" completions = [ [{"content": r"\boxed{\frac{63}{400}}"}], # correct, shorter [{"content": r"\boxed{\frac{63}{400}} " + "x" * 10}], # correct, longer [{"content": r"\boxed{\frac{64}{400}}"}], # incorrect, shorter [{"content": r"\boxed{\frac{64}{400}} " + "x" * 10}], # incorrect, longer ] solutions = [r"\frac{63}{400}"] * 4 rewards = len_reward(completions, solutions) # Shortest correct answer should get positive reward self.assertGreater(rewards[0], 0.0) # Longer correct answer might get negative reward: self.assertGreater(rewards[2], rewards[1]) self.assertGreaterEqual(rewards[1], rewards[3]) # Incorrect answers should get non-positive rewards self.assertLessEqual(rewards[2], 0.0) self.assertLessEqual(rewards[3], 0.0) # Shorter answers should get better rewards within their correctness category self.assertGreater(rewards[0], rewards[1]) # correct answers self.assertGreater(rewards[2], rewards[3]) # incorrect answers def test_unparseable_solution(self): """Test len_reward with unparseable solution.""" completions = [ [{"content": r"\boxed{answer}"}], [{"content": r"\boxed{answer} " + "x" * 10}], ] solutions = ["unparseable_latex", "unparseable_latex"] rewards = len_reward(completions, solutions) self.assertGreater(rewards[0], rewards[1]) # shorter answer should still get better reward self.assertAlmostEqual(rewards[0], 0.5) # treated as correct, shortest gets maximum reward class TestRepetitionPenaltyReward(unittest.TestCase): def test_positive_max_penalty_raises_value_error(self): with self.assertRaises(ValueError): get_repetition_penalty_reward(ngram_size=2, max_penalty=1.0) with self.assertRaisesRegex(ValueError, "max_penalty 1.5 should not be positive"): get_repetition_penalty_reward(ngram_size=2, max_penalty=1.5) def test_no_repetition(self): reward_fn = get_repetition_penalty_reward(ngram_size=2, max_penalty=-1.0) completions = [[{"content": "this is a test sentence"}]] rewards = reward_fn(completions) self.assertEqual(rewards, [0.0]) def test_full_repetition(self): reward_fn = get_repetition_penalty_reward(ngram_size=2, max_penalty=-1.0) completions = [[{"content": "this this this this this"}]] rewards = reward_fn(completions) # (1 - 1/4) * -1 = -0.75 self.assertEqual(rewards, [-0.75]) def test_partial_repetition(self): reward_fn = get_repetition_penalty_reward(ngram_size=2, max_penalty=-1.0) completions = [[{"content": "this is a this is a test"}]] rewards = reward_fn(completions) # Unique 2-grams: (this, is), (is, a), (a, this), (a, test). 4 unique out of 6 total # (1 - 4/6) * -1 = -1/3 = -0.3333... self.assertAlmostEqual(rewards[0], -1 / 3) def test_multiple_completions(self): reward_fn = get_repetition_penalty_reward(ngram_size=3, max_penalty=-0.5) completions = [ [{"content": "this is a test"}], [{"content": "test test test test"}], ] rewards = reward_fn(completions) # Completion 1: (this, is, a), (is, a, test) -> 2 unique / 2 total -> (1 - 2/2) * -0.5 = 0 # Completion 2: (test, test, test) -> 1 unique / 2 total -> (1 - 1/2) * -0.5 = -0.25 self.assertAlmostEqual(rewards[0], 0.0) self.assertAlmostEqual(rewards[1], -0.25) def test_empty_completion(self): reward_fn = get_repetition_penalty_reward(ngram_size=2, max_penalty=-1.0) completions = [[{"content": ""}]] rewards = reward_fn(completions) self.assertEqual(rewards, [0.0]) def test_different_ngram_size(self): reward_fn = get_repetition_penalty_reward(ngram_size=3, max_penalty=-2.0) completions = [[{"content": "this is a this is a test"}]] rewards = reward_fn(completions) self.assertAlmostEqual(rewards[0], -0.4) def test_mixed_case(self): reward_fn = get_repetition_penalty_reward(ngram_size=2, max_penalty=-1.0) completions = [ [{"content": "This is A Test"}], [{"content": "this IS a test"}], ] rewards = reward_fn(completions) # both completions should produce the same reward, because the text gets lowercased self.assertAlmostEqual(rewards[0], rewards[1]) def test_one_word_completion(self): reward_fn = get_repetition_penalty_reward(ngram_size=3, max_penalty=-1.0) completions = [[{"content": "word"}]] rewards = reward_fn(completions) self.assertEqual(rewards, [0.0]) def test_two_word_completion(self): reward_fn = get_repetition_penalty_reward(ngram_size=3, max_penalty=-1.0) completions = [[{"content": "two words"}]] rewards = reward_fn(completions) self.assertEqual(rewards, [0.0]) def test_three_word_completion(self): reward_fn = get_repetition_penalty_reward(ngram_size=3, max_penalty=-1.0) completions = [[{"content": "three different words"}]] rewards = reward_fn(completions) self.assertEqual(rewards, [0.0]) def test_three_word_repetition_completion(self): reward_fn = get_repetition_penalty_reward(ngram_size=3, max_penalty=-1.0) completions = [[{"content": "word word word word"}]] rewards = reward_fn(completions) self.assertEqual(rewards, [-0.5]) def test_four_word_completion_with_repetition(self): reward_fn = get_repetition_penalty_reward(ngram_size=3, max_penalty=-1.0) completions = [[{"content": "one two one two"}]] rewards = reward_fn(completions) # ngrams are (one two one) (two one two). unique is 2 and count is 2, therefore (1-1) * -1. self.assertEqual(rewards, [0.0]) def test_five_word_completion_with_repetition(self): reward_fn = get_repetition_penalty_reward(ngram_size=3, max_penalty=-0.5) completions = [[{"content": "A B C A B"}]] rewards = reward_fn(completions) # (A B C) (B C A) (C A B). unique is 3. count is 3 (1-1) * -.5 = 0 self.assertEqual(rewards, [0.0]) def test_six_word_completion_with_repetition(self): reward_fn = get_repetition_penalty_reward(ngram_size=3, max_penalty=-1.0) completions = [[{"content": "A B C A B C"}]] rewards = reward_fn(completions) self.assertEqual(rewards, [-0.25]) def test_long_completion_with_repetition(self): reward_fn = get_repetition_penalty_reward(ngram_size=3, max_penalty=-1.0) completions = [[{"content": "A B C A B C E F G A B C A B C"}]] rewards = reward_fn(completions) self.assertAlmostEqual(rewards[0], -0.3846, places=4) def test_long_completion_without_repetition(self): reward_fn = get_repetition_penalty_reward(ngram_size=3, max_penalty=-1.0) completions = [[{"content": "A B C D E F G H I J K L"}]] rewards = reward_fn(completions) self.assertEqual(rewards, [0.0]) def test_tag_count_rewards_all_correct(self): """Test tag_count_reward with correct tags.""" completion = [[{"content": "\nSome reasoning\n\n\nThe answer\n"}]] rewards = tag_count_reward(completion) self.assertEqual(rewards[0], 1.0) def test_tag_count_rewards_missing_think_begin(self): """Test tag_count_reward with missing tag.""" completion = [[{"content": "Some reasoning\n\n\nThe answer\n"}]] rewards = tag_count_reward(completion) self.assertEqual(rewards[0], 0.75) def test_tag_count_rewards_missing_think_end(self): """Test tag_count_reward with missing tag.""" completion = [[{"content": "\nSome reasoning\n\nThe answer\n"}]] rewards = tag_count_reward(completion) self.assertEqual(rewards[0], 0.75) def test_tag_count_rewards_missing_answer_begin(self): """Test tag_count_reward with missing tag.""" completion = [[{"content": "\nSome reasoning\n\nThe answer\n"}]] rewards = tag_count_reward(completion) self.assertEqual(rewards[0], 0.75) def test_tag_count_rewards_missing_answer_end(self): """Test tag_count_reward with missing tag.""" completion = [[{"content": "\nSome reasoning\n\n\nThe answer"}]] rewards = tag_count_reward(completion) self.assertEqual(rewards[0], 0.75) def test_tag_count_rewards_missing_all_tags(self): """Test tag_count_reward with missing all tags.""" completion = [[{"content": "Some reasoning\nThe answer"}]] rewards = tag_count_reward(completion) self.assertEqual(rewards[0], 0.0) def test_full_repetition_with_language(self): reward_fn = get_repetition_penalty_reward(ngram_size=2, max_penalty=-1.0, language="en") completions = [[{"content": "that that that that that"}]] rewards = reward_fn(completions) self.assertEqual(rewards, [-0.75]) # begin test for zh language reward_fn = get_repetition_penalty_reward(ngram_size=2, max_penalty=-1.0, language="zh") completions = [[{"content": "这个这个这个这个这个"}]] rewards = reward_fn(completions) self.assertEqual(rewards, [-0.75]) def test_soft_overlong_punishment_short_completion(self): """Test soft overlong punishment reward function with a short completion.""" # length 50, with max=100 and soft cache=20, reward should be 0. reward_fn = get_soft_overlong_punishment(max_completion_len=100, soft_punish_cache=20) completion_ids = [[1] * 50] # 50 <= 80 rewards = reward_fn(completion_ids=completion_ids) self.assertEqual(rewards, [0]) def test_soft_overlong_punishment_long_completion(self): """Test soft overlong punishment reward function with a longer than max completion.""" # 110 > 100, reward should be -1. reward_fn = get_soft_overlong_punishment(max_completion_len=100, soft_punish_cache=20) completion_ids = [[1] * 110] rewards = reward_fn(completion_ids) self.assertEqual(rewards, [-1]) def test_soft_overlong_punishment_intermediate_completion(self): """Test soft overlong punishment reward function for intermediate length completion.""" reward_fn = get_soft_overlong_punishment(max_completion_len=100, soft_punish_cache=20) completion_ids = [[1] * 90] # 90 is between 80 and 100 rewards = reward_fn(completion_ids) self.assertAlmostEqual(rewards[0], -0.5, places=4) class TestCodeFormat(unittest.TestCase): def test_correct_python_format(self): """Test code format reward with correct Python format.""" completion = [ [ { "content": "\nLet's solve this\nStep 1: First step\n\n\n```python\ndef hello():\n print('world')\n```\n" } ] ] reward_fn = get_code_format_reward(language="python") rewards = reward_fn(completion) self.assertEqual(rewards[0], 1.0) def test_incorrect_formats(self): """Test code format reward with various incorrect formats.""" incorrect_formats = [ # Missing think/answer tags "```python\ndef hello():\n print('world')\n```", # Missing code block "Some thinkingJust plain text", # Wrong language "Analysis```javascript\nconsole.log('hello');\n```", # Missing language identifier "Analysis```\ndef hello(): pass\n```", # Wrong order of tags "```python\ndef hello(): pass\n```Analysis", ] reward_fn = get_code_format_reward(language="python") for fmt in incorrect_formats: completion = [[{"content": fmt}]] rewards = reward_fn(completion) self.assertEqual(rewards[0], 0.0) def test_multiple_code_blocks(self): """Test format reward with multiple code blocks in think and answer sections.""" completion = [ [ { "content": "\nHere's an example:\n```python\nx = 1\n```\nNow the solution:\n\n\n```python\ndef solution():\n return 42\n```\n" } ] ] reward_fn = get_code_format_reward(language="python") rewards = reward_fn(completion) self.assertEqual(rewards[0], 1.0) def test_different_languages(self): """Test code format reward with different programming languages.""" completion = [ [ { "content": "\nAnalysis\n\n\n```javascript\nconsole.log('hello');\n```\n" } ] ] # Test with JavaScript js_reward_fn = get_code_format_reward(language="javascript") rewards = js_reward_fn(completion) self.assertEqual(rewards[0], 1.0) # Same completion should fail for Python py_reward_fn = get_code_format_reward(language="python") rewards = py_reward_fn(completion) self.assertEqual(rewards[0], 0.0) def test_multiline_code(self): """Test format reward with complex multiline code blocks.""" completion = [ [ { "content": "\nHere's the analysis\n\n\n```python\nclass Solution:\n def __init__(self):\n self.value = 42\n \n def get_value(self):\n return self.value\n```\n" } ] ] reward_fn = get_code_format_reward(language="python") rewards = reward_fn(completion) self.assertEqual(rewards[0], 1.0) class TestThinkFormatReward(unittest.TestCase): @parameterized.expand( [ ("\n\nThought\n\n\n\nSolution",), ("\nThought\n\nSolution",), ("ThoughtSolution",), (" Thought Solution",), ("\n Thought Solution",), (" Thought Solution",), ("\n\n Thought Solution",), (" Thought Solution",), ("\n Thought Solution",), ] ) def test_correct_think_format(self, format_string): """Test think_format_reward with correct think format.""" completion = [[{"content": format_string}]] rewards = think_format_reward(completion) self.assertEqual(rewards[0], 1.0) @parameterized.expand( [ ("Preamble Thought Solution",), ("No tags at all",), (" Missing closing thought",), (" Thought 1 Thought 2 Solution section",), (" Thought 1 Extra think tag Solution"), (" Thought 1 Extra opening think tag Solution"), ] ) def test_incorrect_think_format(self, format_string): """Test think_format_reward with incorrect think format.""" completion = [[{"content": format_string}]] rewards = think_format_reward(completion) self.assertEqual(rewards[0], 0.0) class TestThinkAccuracyReward(unittest.TestCase): def test_correct_answer_after_think(self): completion = [[{"content": " Thought The answer is \\boxed{42}"}]] rewards = think_accuracy_reward(completion, solution=["\\boxed{42}"]) self.assertEqual(rewards[0], 1.0) def test_correct_answer_after_multiple_think(self): completion = [[{"content": " Thought 1 Thought 2 The answer is \\boxed{42}"}]] rewards = think_accuracy_reward(completion, solution=["\\boxed{42}"]) self.assertEqual(rewards[0], 1.0) def test_incorrect_answer_after_think(self): completion = [[{"content": " Thought The answer is \\boxed{43}"}]] rewards = think_accuracy_reward(completion, solution=["\\boxed{42}"]) self.assertEqual(rewards[0], 0.0) def test_multiple_answers_gives_zero_reward(self): completion = [[{"content": " Thought The answer is \\boxed{6} and \\boxed{42}"}]] rewards = think_accuracy_reward(completion, solution=["\\boxed{42}"]) self.assertEqual(rewards[0], 0.0) def test_no_latex_answer_gives_zero_reward(self): completion = [[{"content": " Thought The answer is 42"}]] rewards = think_accuracy_reward(completion, solution=["\\boxed{42}"]) self.assertEqual(rewards[0], 0.0) def test_truncated_answer_gives_zero_reward(self): completion = [[{"content": " Thought the answer is \\boxed{42} The answer is [TRUNCATED]"}]] rewards = think_accuracy_reward(completion, solution=["\\boxed{42}"]) self.assertEqual(rewards[0], 0.0) def test_truncated_answer_after_multiple_think_gives_zero_reward(self): completion = [ [ { "content": " Thought 1 the answer is \\boxed{42} Thought 2 the answer is \\boxed{42} The answer is [TRUNCATED]" } ] ] rewards = think_accuracy_reward(completion, solution=["\\boxed{42}"]) self.assertEqual(rewards[0], 0.0) def test_truncated_thought_gives_zero_reward(self): completion = [[{"content": " Thought the answer is \\boxed{42} and [TRUNCATED]"}]] rewards = think_accuracy_reward(completion, solution=["\\boxed{42}"]) self.assertEqual(rewards[0], 0.0) if __name__ == "__main__": unittest.main()