# 本地文档使用百科中的颐和园的一段 (local document: an encyclopedia passage about the Summer Palace)
"""
rag
中文分隔符优先级 ["\n\n", "\n", "。", "!", "?", ",",",""]
英文分隔符优先级 ["\n\n", "\n", ".", "!", "?", ",", " ",""]
混合分隔符优先级 ["\n\n", "\n", "。", "!", "?", ",",",".", "!", "?", ",", " ",""]
"""
import os
import warnings
import pickle
from langchain_openai import ChatOpenAI
from langchain.chains.conversational_retrieval.base import ConversationalRetrievalChain
from dotenv import load_dotenv
from langchain_community.vectorstores import FAISS
from langchain.memory import ConversationBufferMemory
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.embeddings import OpenAIEmbeddings
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.document_loaders import TextLoader,PyPDFLoader
# Silence noisy deprecation warnings from the langchain stack.
warnings.filterwarnings("ignore")
# Prevent HuggingFace tokenizers fork-parallelism warnings/deadlocks.
os.environ['TOKENIZERS_PARALLELISM'] = 'false'

# 1. Load the external document (local text file about the Summer Palace).
separators = ["\n\n", "\n", "。", "!", "?", ",", ".", "!", "?", ",", " ", ""]
docs = TextLoader('palace.txt', encoding='utf8').load()

# 2. Split into overlapping chunks, trying the separators above in priority
#    order (mixed Chinese/English punctuation).
splitter = RecursiveCharacterTextSplitter(
    separators=separators,
    chunk_size=1000,
    chunk_overlap=100,
)
texts = splitter.split_documents(docs)
# 3. Word/sentence embeddings.
# NOTE(review): the original code first built a live OpenAIEmbeddings client
# pointed at https://api.deepseek.com, then immediately overwrote it with the
# HuggingFace model below. DeepSeek does not serve 'text-embedding-3-small',
# and load_dotenv() only runs later in this script, so that client was both
# dead code and misconfigured — kept here only as a commented reference.
# em_model = OpenAIEmbeddings(
#     model='text-embedding-3-small',
# )
# print(em_model.embed_documents(['cat','dog','cannon']))

# Use a local HuggingFace model (first run downloads it from huggingface.co,
# which may require network access).
em_model = HuggingFaceEmbeddings(
    model_name='bert-base-chinese'
)
# temp = em_model.embed_documents(['cat','dog','cannon'])
# len(temp[0])

# Alternatively, restore a pre-pickled embedding model from disk:
# with open('em.pkl','rb') as file_obj:
#     em_model = pickle.load(file_obj)
# 4. Build the FAISS vector store from the chunked documents.
# Bug fixes vs. original: the classmethod is FAISS.from_documents (the original
# called the nonexistent FAISS.from_document), and load_local is a classmethod
# on FAISS — `db.FAISS.load_local(...)` was invalid attribute access and its
# return value was discarded.
db = FAISS.from_documents(texts, em_model)
db.save_local('faiss_index')
# Reload from disk (round-trip check; this is how a later run would restore
# the index). allow_dangerous_deserialization is required because the index
# metadata is pickled — only enable it for files you created yourself.
db = FAISS.load_local('faiss_index', em_model,
                      allow_dangerous_deserialization=True)
retriever = db.as_retriever()
# resp = retriever.invoke('颐和园修建于什么时间')
# print(resp)
# Load API credentials (e.g. the DeepSeek key) from the local .env file,
# overriding any values already present in the process environment.
load_dotenv(override=True)

# Chat model served through DeepSeek's OpenAI-compatible endpoint.
llm = ChatOpenAI(
    model='deepseek-chat',
    base_url="https://api.deepseek.com",
    temperature=0.3,        # keep answers fairly deterministic
    frequency_penalty=0.5,  # discourage repetitive phrasing
)
# Conversation memory shared with the retrieval chain below.
# Bug fixes vs. original: the keyword is return_messages (plural — the original
# passed return_message), and the trailing comma after the call bound `memory`
# to a 1-tuple instead of the ConversationBufferMemory instance, which would
# break ConversationalRetrievalChain.from_llm(memory=...).
memory = ConversationBufferMemory(
    return_messages=True,
    memory_key='chat_history',
    input_key="question",
    output_key="answer",
)
# 5. Retrieval-augmented conversational chain: retrieve relevant chunks via
# the FAISS retriever, answer with the LLM, and track history in `memory`.
chain = ConversationalRetrievalChain.from_llm(
    llm=llm,
    retriever=retriever,
    memory=memory,
    # return_source_documents=True,
)
# Bug fix vs. original: chain.invole -> chain.invoke.
result = chain.invoke({"question": '颐和园是什么时候建成的?'})
print(result['answer'])