Running the code

The script below loads Baichuan2-13B-Chat with transformers and asks it a single chat question, first in plain half precision and then with online 4-bit quantization.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers.generation.utils import GenerationConfig

def run_baichuan2(model_id):
    # Load the tokenizer
    tokenizer = AutoTokenizer.from_pretrained(
        # Model path or Hub id
        model_id,
        # Use the slow tokenizer (use_fast defaults to True)
        use_fast=False,
        # Fixes "ValueError: Tokenizer class BaichuanTokenizer does not exist"
        trust_remote_code=True
    )
    # Load the model
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        # Load the weights in half precision
        torch_dtype=torch.float16,
        # Trust the model's custom code
        trust_remote_code=True,
        # Spread the model across available devices automatically.
        # Caution: for online 4-bit quantization this argument must be
        # commented out, otherwise loading fails.
        device_map="auto"
    )
    # Load the generation config
    model.generation_config = GenerationConfig.from_pretrained(model_id)
    # Run inference
    messages = [{"role": "user", "content": "Why was the cavalry of Alexander the Great so powerful?"}]
    response = model.chat(tokenizer, messages)
    print(response)
    return response
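
# Optional: streaming output. A minimal sketch, assuming the custom
# chat() method shipped in the Baichuan2 remote code accepts stream=True
# and yields the response decoded so far (as in the official cli_demo);
# verify against the modeling code of your model revision.
def run_baichuan2_stream(model, tokenizer, prompt):
    messages = [{"role": "user", "content": prompt}]
    position = 0
    for response in model.chat(tokenizer, messages, stream=True):
        # Each yielded value is the full text so far, so print only the delta.
        print(response[position:], end="", flush=True)
        position = len(response)
    print()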
def run_baichuan2_quantize(model_id):
    # Load the model on CPU first; note there is no device_map="auto" here.
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        torch_dtype=torch.float16,
        trust_remote_code=True
    )
    tokenizer = AutoTokenizer.from_pretrained(
        model_id,
        use_fast=False,
        trust_remote_code=True
    )
    model.generation_config = GenerationConfig.from_pretrained(model_id)
    '''
    Official online 4-bit quantization.
    Note: from_pretrained is normally called with device_map="auto";
    for online quantization that argument must be removed, otherwise
    an error is raised.
    '''
    model = model.quantize(4).cuda()
    messages = [{"role": "user", "content": "Why was the cavalry of Alexander the Great so powerful?"}]
    response = model.chat(tokenizer, messages)
    print(response)
    return response
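
# Online 8-bit quantization works the same way: per the official docs,
# replace the quantize(4) call above with
#     model = model.quantize(8).cuda()
# which trades a larger memory footprint for a smaller accuracy loss.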
if __name__ == "__main__":
    model_id = "Baichuan2-13B-Chat"
    # Run without quantization
    run_baichuan2(model_id)
    # Run with online quantize(4)
    run_baichuan2_quantize(model_id)
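
Besides online quantization, Baichuan also publishes pre-quantized checkpoints. The sketch below loads one; the model id "baichuan-inc/Baichuan2-13B-Chat-4bits" and its loading arguments follow the official README, but treat them as assumptions and verify against the model card (the pre-quantized weights also need the bitsandbytes package installed).

def run_baichuan2_prequantized():
    # Assumption: the officially released 4-bit checkpoint id.
    model_id = "baichuan-inc/Baichuan2-13B-Chat-4bits"
    tokenizer = AutoTokenizer.from_pretrained(
        model_id,
        use_fast=False,
        trust_remote_code=True
    )
    # Pre-quantized weights load directly with device_map="auto";
    # no quantize() call is needed afterwards.
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        torch_dtype=torch.float16,
        device_map="auto",
        trust_remote_code=True
    )
    model.generation_config = GenerationConfig.from_pretrained(model_id)
    messages = [{"role": "user", "content": "Why was the cavalry of Alexander the Great so powerful?"}]
    response = model.chat(tokenizer, messages)
    print(response)
    return response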