from sentence_transformers import CrossEncoder
from sentence_transformers import SentenceTransformer
from sentence_transformers.util import cos_sim
from transformers import AutoModelForSequenceClassification, AutoTokenizer, BatchEncoding, PreTrainedTokenizerFast
import torch
import json
import numpy as np
def is_within_range(element):
    # Return False when the leading index in `element` exceeds 5 (i.e. the hit
    # falls outside the top-5 range); used when tallying top-1/top-5 ratios.
    col = element[0][0]
    # print("col", col)
    if col > 5:
        return False
    return True
def read_data(file_path):
    # Read a query-pos-neg JSONL file: each line holds
    # {'query': str, 'pos': [str, ...], 'neg': [str, ...]}.
    # For every line, build [query, pos[0]] followed by one [query, neg_i]
    # pair per negative, and collect the pairs per sample.
    data_list = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            data = json.loads(line)
            pairs = [[data['query'], data['pos'][0]]]
            for neg in data['neg']:
                pairs.append([data['query'], neg])
            data_list.append(pairs)
    return data_list
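
# Usage sketch (hedged): read_data assumes the FlagEmbedding fine-tuning schema
# consumed above; the file name here is hypothetical. A line such as
#   {"query": "what is panda?", "pos": ["The giant panda is a bear..."], "neg": ["hi"]}
# becomes [["what is panda?", "The giant panda is a bear..."], ["what is panda?", "hi"]].
# data_list = read_data("toy_finetune_data.jsonl")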
def base_large_v2m3(model_path, sentences):
    # Compute relevance scores with a bge reranker (higher score = more relevant).
    from FlagEmbedding import FlagReranker
    from datetime import datetime

    now = datetime.now()
    reranker = FlagReranker(model_path, use_fp16=True)  # use_fp16=True speeds up computation with a slight performance degradation
    end = datetime.now()
    print("Model load time (s):", (end - now).total_seconds())

    num = len(sentences)
    # Example: reranker.compute_score([['what is panda?', 'hi'], ['what is panda?', 'The giant panda (Ailuropoda melanoleuca), sometimes called a panda bear or simply panda, is a bear species endemic to China.']])
    # -> [-8.1875, 5.26171875]
    # Setting normalize=True maps the scores into [0, 1] by applying a sigmoid.
    now2 = datetime.now()
    scores = reranker.compute_score(sentences, normalize=True)
    end2 = datetime.now()
    avg_time = (end2 - now2).total_seconds() / num
    # print("Average scoring time per pair (s):", avg_time)
    # print("normalized bge reranker scores:", scores)  # e.g. [0.00027803096387751553, 0.9948403768236574]
    return scores, avg_time
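
# Usage sketch (the checkpoint name is an assumption; any local bge-reranker
# base/large/v2-m3 path works with FlagReranker):
# pairs = [['what is panda?', 'hi'],
#          ['what is panda?', 'The giant panda is a bear species endemic to China.']]
# scores, avg_time = base_large_v2m3('BAAI/bge-reranker-v2-m3', pairs)
# With normalize=True the scores land in [0, 1]; the second pair should score near 1.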
def llm_reranker(model_path, sentences):
    from FlagEmbedding import FlagLLMReranker
    from datetime import datetime

    now = datetime.now()
    reranker = FlagLLMReranker(model_path, use_fp16=True)  # use_fp16=True speeds up computation with a slight performance degradation
    # reranker = FlagLLMReranker('BAAI/bge-reranker-v2-gemma', use_bf16=True)  # use_bf16=True also speeds up computation with a slight performance degradation
    end = datetime.now()
    print("Model load time (s):", (end - now).total_seconds())

    num = len(sentences)
    now2 = datetime.now()
    # scores = reranker.compute_score(sentences)
    scores = reranker.compute_score(sentences, batch_size=8)
    end2 = datetime.now()
    avg_time = (end2 - now2).total_seconds() / num
    # print("llm reranker scores:", scores)
    return scores, avg_time
def layer_reranker(model_path, sentences):
    from FlagEmbedding import LayerWiseFlagLLMReranker
    from datetime import datetime

    now = datetime.now()
    reranker = LayerWiseFlagLLMReranker(model_path, use_fp16=True)  # use_fp16=True speeds up computation with a slight performance degradation
    # reranker = LayerWiseFlagLLMReranker('BAAI/bge-reranker-v2-minicpm-layerwise', use_bf16=True)  # use_bf16=True also speeds up computation with a slight performance degradation
    end = datetime.now()
    print("Model load time (s):", (end - now).total_seconds())

    num = len(sentences)
    now2 = datetime.now()
    scores = reranker.compute_score(sentences, cutoff_layers=[28])  # cutoff_layers picks which layers are used to compute the score
    end2 = datetime.now()
    avg_time = (end2 - now2).total_seconds() / num
    # print("layerwise reranker scores:", scores)
    return scores, avg_time
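
# llm_reranker and layer_reranker are drop-in alternatives to base_large_v2m3 with
# the same (model_path, sentences) signature, e.g. (checkpoints as referenced above):
# scores, avg_time = llm_reranker('BAAI/bge-reranker-v2-gemma', pairs)
# scores, avg_time = layer_reranker('BAAI/bge-reranker-v2-minicpm-layerwise', pairs)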
def hf_reranker(model_path, sentences):
    # Score (query, passage) pairs with a sequence-classification reranker loaded
    # directly through transformers; returns raw (unnormalized) logits.
    import torch
    from transformers import AutoModelForSequenceClassification, AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = AutoModelForSequenceClassification.from_pretrained(model_path)
    model.eval()
    with torch.no_grad():
        inputs = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt', max_length=512)
        scores = model(**inputs, return_dict=True).logits.view(-1, ).float()
    print("hf reranker scores:", scores)
    return scores
def llm_reranker2(model_path, sentences):
    import torch
    from transformers import AutoModelForCausalLM, AutoTokenizer

    def get_inputs(sentences, tokenizer, prompt=None, max_length=1024):
        # Build one "A: {query}\nB: {passage}\n{prompt}" sequence per pair and
        # pad them into a single batch.
        if prompt is None:
            prompt = "Given a query A and a passage B, determine whether the passage contains an answer to the query by providing a prediction of either 'Yes' or 'No'."
        sep = "\n"
        prompt_inputs = tokenizer(prompt, return_tensors=None, add_special_tokens=False)['input_ids']
        sep_inputs = tokenizer(sep, return_tensors=None, add_special_tokens=False)['input_ids']
        inputs = []
        for query, passage in sentences:
            query_inputs = tokenizer(f'A: {query}', return_tensors=None, add_special_tokens=False,
                                     max_length=max_length * 3 // 4, truncation=True)
            passage_inputs = tokenizer(f'B: {passage}', return_tensors=None, add_special_tokens=False,
                                       max_length=max_length, truncation=True)
            item = tokenizer.prepare_for_model(
                [tokenizer.bos_token_id] + query_inputs['input_ids'],
                sep_inputs + passage_inputs['input_ids'],
                truncation='only_second', max_length=max_length, padding=False,
                return_attention_mask=False, return_token_type_ids=False,
                add_special_tokens=False)
            item['input_ids'] = item['input_ids'] + sep_inputs + prompt_inputs
            item['attention_mask'] = [1] * len(item['input_ids'])
            inputs.append(item)
        return tokenizer.pad(inputs, padding=True,
                             max_length=max_length + len(sep_inputs) + len(prompt_inputs),
                             pad_to_multiple_of=8, return_tensors='pt')

    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = AutoModelForCausalLM.from_pretrained(model_path)
    yes_loc = tokenizer('Yes', add_special_tokens=False)['input_ids'][0]
    model.eval()
    with torch.no_grad():
        inputs = get_inputs(sentences, tokenizer)
        scores = model(**inputs, return_dict=True).logits[:, -1, yes_loc].view(-1, ).float()
    print("llm reranker scores2 is:", scores)
    return scores
def layer_reranker2(model_path, sentences):
    import torch
    from transformers import AutoModelForCausalLM, AutoTokenizer

    def get_inputs(sentences, tokenizer, prompt=None, max_length=1024):
        if prompt is None:
            prompt = "Given a query A and a passage B, determine whether the passage contains an answer to the query by providing a prediction of either 'Yes' or 'No'."
        sep = "\n"
        prompt_inputs = tokenizer(prompt, return_tensors=None, add_special_tokens=False)['input_ids']
        sep_inputs = tokenizer(sep, return_tensors=None, add_special_tokens=False)['input_ids']
        inputs = []
        for query, passage in sentences:
            query_inputs = tokenizer(f'A: {query}', return_tensors=None, add_special_tokens=False,
                                     max_length=max_length * 3 // 4, truncation=True)
            passage_inputs = tokenizer(f'B: {passage}', return_tensors=None, add_special_tokens=False,
                                       max_length=max_length, truncation=True)
            item = tokenizer.prepare_for_model(
                [tokenizer.bos_token_id] + query_inputs['input_ids'],
                sep_inputs + passage_inputs['input_ids'],
                truncation='only_second', max_length=max_length, padding=False,
                return_attention_mask=False, return_token_type_ids=False,
                add_special_tokens=False)
            item['input_ids'] = item['input_ids'] + sep_inputs + prompt_inputs
            item['attention_mask'] = [1] * len(item['input_ids'])
            inputs.append(item)
        return tokenizer.pad(inputs, padding=True,
                             max_length=max_length + len(sep_inputs) + len(prompt_inputs),
                             pad_to_multiple_of=8, return_tensors='pt')

    # Assumption: the remainder follows the llm_reranker2 pattern above, with the
    # layerwise checkpoint loaded via trust_remote_code=True; its forward pass is
    # assumed to accept cutoff_layers and return one logits tensor per requested layer.
    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
    model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True)
    yes_loc = tokenizer('Yes', add_special_tokens=False)['input_ids'][0]
    model.eval()
    with torch.no_grad():
        inputs = get_inputs(sentences, tokenizer)
        outputs = model(**inputs, return_dict=True, cutoff_layers=[28])
        scores = [logits[:, -1, yes_loc].view(-1, ).float() for logits in outputs.logits]
    print("layer reranker scores2 is:", scores)
    return scores
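

if __name__ == '__main__':
    # Minimal end-to-end sketch; the data path and checkpoint name are assumptions.
    # Flatten each sample's [query, pos/neg] pairs and score them with the bge reranker.
    data_list = read_data('toy_finetune_data.jsonl')
    pairs = [pair for sample in data_list for pair in sample]
    scores, avg_time = base_large_v2m3('BAAI/bge-reranker-v2-m3', pairs)
    print('scores:', scores)
    print('average seconds per pair:', avg_time)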