概述:
Meta开发并发布了Meta-Llama 3大语言模型家族(LLM)。Llama 3指令调优模型针对对话用例进行了优化,在常见的行业基准上优于许多可用的开源聊天模型。本文尝试在中文语料上对 Llama 3 进行 fine-tune,以便后续与通义千问模型进行比较。
代码实现:
加载依赖
# ---- Dependencies ----
# Fix: the original imported AutoTokenizer twice inside the transformers
# import group; the duplicate has been removed.
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    Trainer,
    GenerationConfig
)
from tqdm import tqdm
from trl import SFTTrainer
import torch
import time
import pandas as pd
import numpy as np
from huggingface_hub import interpreter_login
from datasets import Dataset, DatasetDict
from functools import partial
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
import os

# Disable Weights & Biases logging so training does not stop at the
# interactive wandb login prompt.
os.environ['WANDB_DISABLED']="true"
数据加载
# Download the LCSTS summarization dataset from ModelScope (run this in a shell, not in Python).
git clone https://www.modelscope.cn/datasets/DAMO_NLP/lcsts_test_set.git
# ---- Data loading ----
# The original built each path with '{}'.format('train.csv') even though the
# file name is a constant; plain path strings are clearer and equivalent.
data_train_pth = '../Fine-tune/data/lcsts_test_set/train.csv'
data_train = pd.read_csv(data_train_pth)
data_test_pth = '../Fine-tune/data/lcsts_test_set/test.csv'
data_test = pd.read_csv(data_test_pth)

# Shapes of the raw splits — expected (100000, 2) for train, (725, 2) for test.
print(data_train.shape)
print(data_test.shape)

# The full training set is large; keep only the first 2000 rows so results
# can be inspected quickly.
data_train = data_train.head(2000)

# Wrap the DataFrames as Hugging Face `Dataset` objects for the Trainer/TRL
# pipeline (columns become features 'text1' and 'text2').
data_train = Dataset.from_pandas(data_train)
data_test = Dataset.from_pandas(data_test)
print(data_train)
(100000, 2)
(725, 2)
Dataset({
features: ['text1', 'text2'],
num_rows: 2000
})
模型加载
# ---- Model loading ----
# Compute dtype for the 4-bit quantized matmuls. `torch.float16` directly,
# instead of the original indirect getattr(torch, "float16").
compute_dtype = torch.float16

# QLoRA-style 4-bit NF4 quantization so the 8B model fits in GPU memory.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=False,
)

# Local path to the Meta-Llama-3-8B-Instruct weights.
model_name = r'D:\临时模型\Meta-Llama-3-8B-Instruct'

# Map the entire model onto GPU 0.
device_map = {"": 0}

original_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map=device_map,
    quantization_config=bnb_config,
    trust_remote_code=True,
    # NOTE(review): `use_auth_token` is deprecated in recent transformers
    # releases in favor of `token=`; kept here for compatibility with the
    # transformers version this article targets — confirm before upgrading.
    use_auth_token=True,
)

# Left padding (required for batched decoder-only generation), explicit
# BOS/EOS tokens, and the slow tokenizer (use_fast=False).
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    trust_remote_code=True,
    padding_side="left",
    add_eos_token=True,
    add_bos_token=True,
    use_fast=False,
)
# Llama 3 ships no dedicated pad token; reuse EOS for padding.
tokenizer.pad_token = tokenizer.eos_token
数据预处理
#处理的是中文,所以添加中文的提示工程
def create_prompt_formats(sample):
"""
格式化示例的各个字段('instruction','output')
然后使用两个换行符将它们连接起来
:参数sample:样本字典
"""
ROLE_PROMPT = "你是一个文本记录员,擅长归纳文章的内容。"#校色说明
INTRO_BLURB = " 需要将了解到的内容进行总结概括并输出。尽可能用最少得字来完事内容的概述"#任务简介
INSTRUCTION_KEY = "### 要求:总结以下对话。"
RESPONSE_KEY = "### 总结:"
END_KEY = "### 结束"
blurb = f"\n{
INTRO_BLURB}"
instruction = f"{
INSTRUCTION_KEY}&