Preface
This post reproduces DeepSeek-R1-Zero with GRPO reinforcement learning, using Qwen2.5-0.5B as the base model and a rule-based reward model.
Dataset: https://huggingface.co/datasets/swulling/gsm8k_chinese?row=4
1. Data Preparation
The data comes from the swulling/gsm8k_chinese dataset; only the question_zh-cn and answer_only fields are used (a Chinese math word problem and its final numeric answer). A quick way to inspect a sample is shown below.
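A minimal inspection sketch, assuming the dataset can be pulled directly from the Hugging Face Hub under the id swulling/gsm8k_chinese (the training script further down loads a local copy instead):

from datasets import load_dataset

# Load the Chinese GSM8K variant and look at one row.
ds = load_dataset("swulling/gsm8k_chinese", split="train")
sample = ds[0]
print(sample["question_zh-cn"])  # Chinese math word problem
print(sample["answer_only"])     # final numeric answer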
2. Training Code
Training uses GRPOConfig and GRPOTrainer from trl. The tricky part is the reward design: several rule-based reward functions are defined, including an answer-correctness reward, a digit reward, and format rewards, which together score the generated text.
import re

from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer
from trl import GRPOConfig, GRPOTrainer
# System prompt (kept in Chinese to match the Chinese dataset). It tells the model:
# "Generate in the following format:", i.e. reasoning inside <think>...</think>
# and the final answer inside <answer>...</answer>.
SYSTEM_PROMPT = """
按照如下格式生成:
<think>
...
</think>
<answer>
...
</answer>
"""
# Wrap each question into a chat-style prompt and keep the gold numeric answer.
def process_data(data):
    data = data.map(lambda x: {
        'prompt': [
            {'role': 'system', 'content': SYSTEM_PROMPT},
            {'role': 'user', 'content': x['question_zh-cn']}
        ],
        'answer': x['answer_only']
    })
    return data
# Extract the text between <answer> and </answer> from the model output.
def extract_answer(text):
    answer = text.split("<answer>")[-1]
    answer = answer.split("</answer>")[0]
    return answer.strip()
# Tag-mark score: 0.125 for each of the four tags that appears exactly once
# (each followed by a newline), up to 0.5 in total.
def mark_num(text):
    reward = 0
    if text.count("<think>\n") == 1:
        reward += 0.125
    if text.count("</think>\n") == 1:
        reward += 0.125
    if text.count("<answer>\n") == 1:
        reward += 0.125
    if text.count("</answer>\n") == 1:
        reward += 0.125
    return reward
# Correctness reward: 2.0 if the extracted answer exactly matches the gold answer (compared as strings).
def correctness_reward(prompts, completions, answer, **kwargs):
    responses = [completion[0]['content'] for completion in completions]
    extracted_responses = [extract_answer(r) for r in responses]
    print(f"Question:\n{prompts[0][-1]['content']}",
          f"\nGold answer:\n{answer[0]}",
          f"\nModel output:\n{responses[0]}",
          f"\nExtracted answer:\n{extracted_responses[0]}")
    return [2.0 if response == str(ans) else 0.0 for response, ans in zip(extracted_responses, answer)]
# Digit reward. Relying only on exact correctness makes the reward signal very sparse and
# hard to learn from, so a completion also earns 0.5 if the extracted answer is at least a
# number (reasonable for math problems), even when it is wrong.
def digit_reward(completions, **kwargs):
    responses = [completion[0]['content'] for completion in completions]
    extracted_responses = [extract_answer(r) for r in responses]
    return [0.5 if response.isdigit() else 0.0 for response in extracted_responses]
# Strict format reward: the whole completion must be exactly
# <think>\n...\n</think>\n<answer>\n...\n</answer>\n
# re.DOTALL lets .*? span multi-line reasoning; without it the check rarely fires.
def hard_format_reward(completions, **kwargs):
    pattern = r"^<think>\n.*?\n</think>\n<answer>\n.*?\n</answer>\n$"
    responses = [completion[0]["content"] for completion in completions]
    matches = [re.match(pattern, response, flags=re.DOTALL) for response in responses]
    return [0.5 if match else 0.0 for match in matches]
# Lenient format reward: <think>...</think> followed by <answer>...</answer>,
# without requiring the exact newline layout.
def soft_format_reward(completions, **kwargs):
    pattern = r"<think>.*?</think>\s*<answer>.*?</answer>"
    responses = [completion[0]["content"] for completion in completions]
    matches = [re.match(pattern, response, flags=re.DOTALL) for response in responses]
    return [0.5 if match else 0.0 for match in matches]
# Tag-mark reward (mitigates the sparsity of the format rewards above).
def mark_reward(completions, **kwargs):
    responses = [completion[0]["content"] for completion in completions]
    return [mark_num(response) for response in responses]
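# Illustrative sanity check (hypothetical completion; GRPOTrainer passes completions as a
# list of message lists, hence completion[0]['content'] above). For
#   [[{"role": "assistant", "content": "<think>\n3 + 5 = 8\n</think>\n<answer>\n8\n</answer>\n"}]]
# mark_reward, soft_format_reward, hard_format_reward and digit_reward each return [0.5],
# so together with correctness_reward (2.0) the maximum total reward per completion is 4.0.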
if __name__ == '__main__':
    model_name = "/home/user/Downloads/Qwen2.5-0.5B-Instruct"
    model = AutoModelForCausalLM.from_pretrained(model_name)
    model.cuda()
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    # Local copy of the swulling/gsm8k_chinese dataset.
    ds = load_dataset('/home/user/wyf/deepseek_learn/gsm8k_chinese')
    data = process_data(ds['train'])
    output_dir = "output"
    training_args = GRPOConfig(
        output_dir=output_dir,
        learning_rate=5e-6,
        adam_beta1=0.9,
        adam_beta2=0.99,
        weight_decay=0.1,
        warmup_ratio=0.1,
        lr_scheduler_type='cosine',
        logging_steps=1,
        bf16=True,
        per_device_train_batch_size=1,
        gradient_accumulation_steps=4,
        # GRPO group size: completions sampled per prompt. Depending on the trl version,
        # the global train batch size may need to be divisible by num_generations.
        num_generations=16,
        max_prompt_length=256,
        max_completion_length=200,
        num_train_epochs=1,
        save_steps=100,
        max_grad_norm=0.1,
        log_on_each_node=False,
        use_vllm=False,
        report_to="tensorboard"
    )
    trainer = GRPOTrainer(
        model=model,
        processing_class=tokenizer,
        reward_funcs=[
            mark_reward,
            soft_format_reward,
            hard_format_reward,
            digit_reward,
            correctness_reward
        ],
        args=training_args,
        train_dataset=data,
    )
    trainer.train()
    trainer.save_model(output_dir)
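Once training finishes, the saved checkpoint can be spot-checked with a plain generation pass. This is only a minimal sketch under a few assumptions: it reuses SYSTEM_PROMPT and extract_answer from the script above, assumes the tokenizer was saved alongside the model in output/ (otherwise load it from the base model path), and the question string is a made-up toy example.

from transformers import AutoModelForCausalLM, AutoTokenizer

# Load the GRPO-trained checkpoint saved by trainer.save_model(output_dir).
model = AutoModelForCausalLM.from_pretrained("output").cuda()
tokenizer = AutoTokenizer.from_pretrained("output")

# Hypothetical test question (in Chinese, like the training data): "Xiao Ming has 3 apples
# and buys 5 more; how many apples does he have in total?"
messages = [
    {'role': 'system', 'content': SYSTEM_PROMPT},
    {'role': 'user', 'content': "小明有3个苹果,又买了5个,一共有几个苹果?"},
]
prompt_text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = tokenizer(prompt_text, return_tensors="pt").to(model.device)
output_ids = model.generate(**inputs, max_new_tokens=200)
completion = tokenizer.decode(output_ids[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True)
print(completion)
print("Extracted answer:", extract_answer(completion))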