Labsheet9
#############################################
# document loading and QA retrieval chains
# pip install chromadb
# pip install tiktoken
import chromadb
from langchain.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
# (in newer langchain releases these classes live in langchain_community
#  and langchain_openai rather than the top-level langchain package)
# Write the text back to a new file, ensuring it's in UTF-8 encoding
with open("input_text_utf8.txt", "w", encoding="utf-8") as f:
    f.write(text)
loader = TextLoader("/content/input_text.txt")
document = loader.load()
print(document)
# Split the document into smaller chunks that are semantically related.
# The splitter works recursively: first it tries to split at the paragraph
# level, then at the sentence/line level, and finally at the word level,
# i.e. on "\n\n", "\n" and space.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
texts = text_splitter.split_documents(document)
#print(texts[0])
#print(texts[1])
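# To see the recursive behaviour in isolation, the splitter can be run on a
# toy string with a small chunk size (the string and parameters below are
# illustrative only):
demo_splitter = RecursiveCharacterTextSplitter(chunk_size=30, chunk_overlap=0)
print(demo_splitter.split_text(
    "First paragraph.\n\nSecond paragraph, which is longer and gets split again at spaces."))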
# Create the embeddings for the text chunks. Each chunk is converted to a
# vector of floating point numbers in embedding space; by comparing the
# embedding vectors of two texts we can measure how semantically related
# they are. OpenAI provides sentence- and document-level embedding models.
embeddings = OpenAIEmbeddings()
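# To illustrate how embeddings capture relatedness, two short texts can be
# embedded and compared with cosine similarity (numpy is used here only for
# this illustration; the example texts are arbitrary):
import numpy as np

v1 = np.array(embeddings.embed_query("earthquakes and floods"))
v2 = np.array(embeddings.embed_query("natural disasters"))
cosine = np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2))
print(cosine)  # values closer to 1.0 indicate more closely related texts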
#Start querying
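# The query below calls a retrieval QA chain that is not built elsewhere in
# this snippet. A minimal sketch, assuming the classic langchain Chroma +
# RetrievalQA API and the default OpenAI chat model:
from langchain.vectorstores import Chroma
from langchain.chat_models import ChatOpenAI
from langchain.chains import RetrievalQA

# Store the chunk embeddings in a Chroma vector database
db = Chroma.from_documents(texts, embeddings)

# The retriever fetches the chunks most similar to the question and the LLM
# answers using those chunks as context ("stuff" places them in one prompt)
chain = RetrievalQA.from_chain_type(
    llm=ChatOpenAI(),
    chain_type="stuff",
    retriever=db.as_retriever(),
)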
question1 = "What is a natural disaster"
result = chain.invoke({"query": question1})
print(result)
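# result is a dict: the original question is under "query" and the generated
# answer under "result" (assuming the RetrievalQA chain sketched above)
print(result["result"])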