multimodel_text
import sys

print(sys.executable)
from unstructured.partition.pdf import partition_pdf

raw_pdf_elements = partition_pdf(
    filename="RAG_NLP.pdf",
    strategy="hi_res",
    extract_images_in_pdf=True,
    extract_image_block_types=["Image", "Table"],
    extract_image_block_to_payload=False,
    extract_image_block_output_dir="extracted_data",
)
raw_pdf_elements
Header=[]
Footer=[]
Title=[]
NarrativeText=[]
Text=[]
ListItem=[]
NarrativeText
ListItem
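# The category lists above are initialized but not populated in this section.
# A minimal sketch of how they could be filled from raw_pdf_elements, mirroring
# the type checks used for images and tables below; the class names follow
# unstructured.documents.elements.
for element in raw_pdf_elements:
    name = type(element).__name__
    if name == "Header":
        Header.append(str(element))
    elif name == "Footer":
        Footer.append(str(element))
    elif name == "Title":
        Title.append(str(element))
    elif name == "NarrativeText":
        NarrativeText.append(str(element))
    elif name == "Text":
        Text.append(str(element))
    elif name == "ListItem":
        ListItem.append(str(element))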
img = []
for element in raw_pdf_elements:
    if "unstructured.documents.elements.Image" in str(type(element)):
        img.append(str(element))
img
len(img)
tab = []
for element in raw_pdf_elements:
    if "unstructured.documents.elements.Table" in str(type(element)):
        tab.append(str(element))
tab
len(tab)
prompt_text = """You are an assistant tasked with summarizing tables for retrieval.
\
These summaries will be embedded and used to retrieve the raw table elements. \
Give a concise summary of the table that is well optimized for retrieval. Table
{element} """
prompt = ChatPromptTemplate.from_template(prompt_text)
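# summarize_chain is used below but never defined in this section; a minimal
# sketch, assuming a Groq-hosted chat model via langchain_groq (the model name
# and temperature are illustrative choices, not from the original).
from langchain_core.output_parsers import StrOutputParser
from langchain_groq import ChatGroq

summarize_model = ChatGroq(model="llama-3.1-8b-instant", temperature=0)
summarize_chain = {"element": lambda x: x} | prompt | summarize_model | StrOutputParser()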
table_summaries = []
table_summaries = summarize_chain.batch(tab, {"max_concurrency": 5})
tab[0]
table_summaries[0]
import base64
import os

from groq import Groq

# Use your own key here; avoid committing real API keys to source.
os.environ["GROQ_API_KEY"] = "<YOUR_GROQ_API_KEY>"
def encode_image(image_path):
    """Return the base64 string for an image file."""
    with open(image_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode("utf-8")
def image_summarize(image_path, prompt):
    """Summarize the image using Groq's llama-3.2-11b-vision-preview model."""
    # Encode image
    base64_image = encode_image(image_path)
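    # A minimal sketch of the vision request, assuming Groq's OpenAI-compatible
    # chat.completions API; the model name follows the docstring above (swap in a
    # currently available vision model if that preview model has been retired).
    client = Groq()
    response = client.chat.completions.create(
        model="llama-3.2-11b-vision-preview",
        messages=[
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": prompt},
                    {
                        "type": "image_url",
                        "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
                    },
                ],
            }
        ],
    )
    return response.choices[0].message.content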
def generate_img_summaries(path):
    """
    Generate summaries and base64 encoded strings for images.
    path: directory containing the .jpg files extracted by Unstructured
    """
    img_base64_list = []
    image_summaries = []
    # Image-summary prompt (wording is illustrative, mirroring the table prompt above)
    prompt = ("You are an assistant tasked with summarizing images for retrieval. "
              "Give a concise summary of the image that is well optimized for retrieval.")
    for img_file in sorted(os.listdir(path)):
        if img_file.endswith(".jpg"):
            img_path = os.path.join(path, img_file)
            img_base64_list.append(encode_image(img_path))
            image_summaries.append(image_summarize(img_path, prompt))
    return img_base64_list, image_summaries
print(os.listdir("extracted_data"))
fpath="extracted_data/figure-2-1.jpg"
img_base64_list,image_summaries=generate_img_summaries(fpath)
image_summaries[0]
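# create_multi_vector_retriever is called below, but its opening lines do not
# appear in this section; a minimal reconstruction, assuming LangChain's
# MultiVectorRetriever with an in-memory docstore and a "doc_id" metadata key.
import uuid

from langchain.retrievers.multi_vector import MultiVectorRetriever
from langchain.storage import InMemoryStore
from langchain_core.documents import Document

def create_multi_vector_retriever(vectorstore, text_summaries, texts,
                                  table_summaries, tables, image_summaries, images):
    """Index summaries in the vectorstore and keep the raw elements in the docstore."""
    id_key = "doc_id"
    retriever = MultiVectorRetriever(
        vectorstore=vectorstore, docstore=InMemoryStore(), id_key=id_key
    )
    # Add each modality (texts, tables, images) whose summaries are available
    for doc_summaries, doc_contents in (
        (text_summaries, texts),
        (table_summaries, tables),
        (image_summaries, images),
    ):
        if not doc_summaries:
            continue
        doc_ids = [str(uuid.uuid4()) for _ in doc_contents]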
        summary_docs = [
            Document(page_content=s, metadata={id_key: doc_ids[i]})
            for i, s in enumerate(doc_summaries)
        ]
        retriever.vectorstore.add_documents(summary_docs)
        retriever.docstore.mset(list(zip(doc_ids, doc_contents)))
    return retriever
from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings

vectorstore = Chroma(
    collection_name="mm_rag", embedding_function=OpenAIEmbeddings()
)
# Create retriever
retriever_multi_vector_img = create_multi_vector_retriever(
    vectorstore,
    text_summaries,
    Text,
    table_summaries,
    tab,
    image_summaries,
    img_base64_list,
)
retriever_multi_vector_img
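# A quick retrieval check (the query text is illustrative, not from the original).
# The multi-vector retriever searches the summary embeddings, then returns the raw
# elements from the docstore: plain text, table strings, or base64-encoded images.
docs = retriever_multi_vector_img.invoke("What is retrieval augmented generation?")
len(docs)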
import io
import re

from IPython.display import HTML, display

def plt_img_base64(img_base64):
    """Display a base64-encoded string as an image."""
    # Create an HTML img tag with the base64 string as the source
    image_html = f'<img src="data:image/jpeg;base64,{img_base64}" />'
    # Display the image by rendering the HTML
    display(HTML(image_html))
plt_img_base64(img_base64_list[1])
image_summaries[1]
def looks_like_base64(sb):
    """Check if the string looks like base64"""
    return re.match("^[A-Za-z0-9+/]+[=]{0,2}$", sb) is not None
def is_image_data(b64data):
    """
    Check if the base64 data is an image by looking at the start of the data
    """
    image_signatures = {
        b"\xFF\xD8\xFF": "jpg",
        b"\x89\x50\x4E\x47\x0D\x0A\x1A\x0A": "png",
        b"\x47\x49\x46\x38": "gif",
        b"\x52\x49\x46\x46": "webp",
    }
    try:
        header = base64.b64decode(b64data)[:8]  # Decode and get the first 8 bytes
        for sig, format in image_signatures.items():
            if header.startswith(sig):
                return True
        return False
    except Exception:
        return False
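# Sanity check (illustrative): a retrieved base64 image should pass both filters
# that split_image_text_types relies on below.
looks_like_base64(img_base64_list[0]) and is_image_data(img_base64_list[0])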
def split_image_text_types(docs):
    """
    Split base64-encoded images and texts
    """
    b64_images = []
    texts = []
    for doc in docs:
        content = doc.page_content if isinstance(doc, Document) else doc
        if looks_like_base64(content) and is_image_data(content):
            b64_images.append(content)
        else:
            texts.append(content)
    return {"images": b64_images, "texts": texts}
def img_prompt_func(data_dict):
    """
    Join the context into a single string
    """
    # print(data_dict)
    formatted_texts = "\n".join(data_dict["context"]["texts"])
    messages = []