1.服务器
CUDA 11.7
2.依赖
conda create -n rebel_env python=3.10 -y
conda activate rebel_env
# 安装 PyTorch(建议与 CUDA 11.7 配合)
pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu117
# 安装 Transformers 和基础工具包
pip install transformers==4.33.2
pip install pandas tqdm
pip install "numpy<2"
3.加载模型
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch
# Load the pretrained REBEL tokenizer and seq2seq model, then move the model
# onto the GPU.
REBEL_CHECKPOINT = "Babelscape/rebel-large"
tokenizer = AutoTokenizer.from_pretrained(REBEL_CHECKPOINT)
model = AutoModelForSeq2SeqLM.from_pretrained(REBEL_CHECKPOINT)
model = model.cuda()
def extract_triplets_from_texts(texts):
    """Run the REBEL model on each text and collect its parsed triplets.

    Each text is tokenized (truncated to 512 tokens), passed through
    ``model.generate`` (at most 256 output tokens), decoded with special
    tokens skipped, and parsed by ``parse_rebel_output``.

    Args:
        texts: Iterable of input strings.

    Returns:
        A list with one dict per input text, holding the keys ``"text"``
        (the input string), ``"decoded_output"`` (the decoded generation)
        and ``"triplets"`` (the value returned by ``parse_rebel_output``).
    """
    # Follow whatever device the model lives on instead of assuming CUDA.
    device = model.device
    results = []
    for text in texts:
        # The REBEL model card feeds the raw text to the model; an
        # instruction-style prefix would become part of the sentence being
        # analysed, so none is added here.
        inputs = tokenizer(text, return_tensors="pt", max_length=512, truncation=True)
        inputs = {name: tensor.to(device) for name, tensor in inputs.items()}
        generated = model.generate(**inputs, max_length=256)
        # Only the first (and only) sequence is decoded: one text per call.
        decoded = tokenizer.decode(generated[0], skip_special_tokens=True)
        results.append({
            "text": text,
            "decoded_output": decoded,
            "triplets": parse_rebel_output(decoded),
        })
    return results
def parse_rebel_output(text):
    """Parse REBEL output decoded with special tokens skipped into triplets.

    Once the special markers are stripped, phrases are separated by two (or
    more) spaces while the words inside a phrase are separated by a single
    space. REBEL linearizes every triplet as subject, object, relation (see
    the Babelscape/rebel-large model card), so each group of three phrases
    is reordered into the conventional ``(subject, relation, object)`` form.

    Args:
        text: Decoded model output.

    Returns:
        A list of ``(subject, relation, object)`` tuples. A trailing group
        of fewer than three phrases is dropped.
    """
    import re

    # Split only on runs of 2+ whitespace characters so that multi-word
    # phrases such as "Barack Obama" stay in one piece.
    phrases = [p.strip() for p in re.split(r"\s{2,}", text.strip()) if p.strip()]
    triplets = []
    # Consume the phrases three at a time; incomplete tails are ignored.
    for i in range(0, len(phrases) - 2, 3):
        subj, obj, rel = phrases[i:i + 3]
        triplets.append((subj, rel, obj))
    return triplets
# Example input (several sentences can be processed in one call)
sentences = [
    "Barack Obama was born in Hawaii and was elected president of the United States in 2008.",
    "Elon Musk founded SpaceX and Tesla.",
    "Paris is the capital of France.",
    "Apple released the iPhone in 2007.",
]

# Run the extraction over the whole list
outputs = extract_triplets_from_texts(sentences)

# Report the results: source text, raw decoded output, then one triplet per
# line, with a separator after every input sentence
separator = "-" * 60
for record in outputs:
    print("🔎 原文:", record["text"])
    print("🧾 解码:", record["decoded_output"])
    print("🧩 三元组:")
    for triplet in record["triplets"]:
        print(" ", triplet)
    print(separator)
4.运行效果