The RAG workflow with LangChain

The local document is a passage about the Summer Palace (颐和园) taken from an encyclopedia entry.

"""
rag
中文分隔符优先级 ["\n\n", "\n", "。", "!", "?", ",",",""]
ying文分隔符优先级 ["\n\n", "\n", ".", "!", "?", ",", " ",""]
混合分隔符优先级 ["\n\n", "\n", "。", "!", "?", ",",",".", "!", "?", ",", " ",""]
"""
import os
import warnings
import pickle
from langchain_openai import ChatOpenAI
from langchain.chains.conversational_retrieval.base import ConversationalRetrievalChain
from dotenv import load_dotenv
from langchain_community.vectorstores import FAISS

from langchain.memory import ConversationBufferMemory
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.document_loaders import TextLoader,PyPDFLoader

warnings.filterwarnings("ignore")
os.environ['TOKENIZERS_PARALLELISM'] = 'false'
# Load API keys from .env before any model clients are created
load_dotenv(override=True)

# 1. Load the external document
separators = ["\n\n", "\n", "。", "!", "?", ",", ".", "!", "?", ",", " ", ""]
loader = TextLoader('palace.txt', encoding='utf8')
docs = loader.load()
# 2. Split the document into chunks
splitter = RecursiveCharacterTextSplitter(
    separators=separators,
    chunk_size=1000,
    chunk_overlap=100
)
texts = splitter.split_documents(docs)
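# Optional sanity check (not required for the pipeline): see how many chunks the
# splitter produced and preview the first one.
print(f'split into {len(texts)} chunks')
print(texts[0].page_content[:80])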
# 3. Embeddings
# Note: this OpenAI-style embedding model is shown for reference only; it is
# replaced by the Hugging Face model right below.
em_model = OpenAIEmbeddings(
    model='text-embedding-3-small',
    openai_api_base='https://api.deepseek.com'
)

# print(em_model.embed_documents(['cat','dog','cannon']))

# Use a Hugging Face model instead (downloading the weights may require a proxy in some regions)
em_model = HuggingFaceEmbeddings(
    model_name='bert-base-chinese'
)
# temp = em_model.embed_documents(['cat','dog','cannon'])
# len(temp[0])
# Alternatively, load a locally pickled embedding model
# with open('em.pkl','rb') as file_obj:
#     em_model = pickle.load(file_obj)
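# For reference, such a pickle could have been written earlier with something like:
# with open('em.pkl', 'wb') as file_obj:
#     pickle.dump(em_model, file_obj)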
# 4. Build the vector store
db = FAISS.from_documents(texts, em_model)
db.save_local('faiss_index')

# Reload from disk (newer langchain versions require allow_dangerous_deserialization=True)
db = FAISS.load_local('faiss_index', em_model, allow_dangerous_deserialization=True)
retriever = db.as_retriever()
# resp= retriever.invoke('颐和园修建于什么时间')
# print(resp)
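# Optionally, FAISS also exposes similarity_search_with_score, which returns
# (document, distance) pairs and is handy for checking retrieval quality:
# for doc, score in db.similarity_search_with_score('颐和园修建于什么时间', k=3):
#     print(score, doc.page_content[:50])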

llm = ChatOpenAI(
    base_url="https://api.deepseek.com",
    model='deepseek-chat',
    temperature=0.3,
    frequency_penalty=0.5
)

memory = ConversationBufferMemory(
    return_messages=True,
    memory_key='chat_history',
    input_key="question",
    output_key="answer"
)

# 5. Conversational retrieval chain
chain = ConversationalRetrievalChain.from_llm(
    llm=llm,
    retriever=retriever,
    memory=memory,
    # return_source_documents = True
)

result = chain.invoke({"question": '颐和园是什么时候建成的?'})
print(result['answer'])
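# Because the conversation is stored in memory under 'chat_history', a follow-up
# question can refer back to the previous turn ("它" resolves to 颐和园). For example:
result = chain.invoke({"question": '它位于哪里?'})
print(result['answer'])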

