
import sys

print(sys.executable)

from unstructured.partition.pdf import partition_pdf

raw_pdf_elements = partition_pdf(
    filename="RAG_NLP.pdf",
    strategy="hi_res",
    extract_images_in_pdf=True,
    extract_image_block_types=["Image", "Table"],
    extract_image_block_to_payload=False,
    extract_image_block_output_dir="extracted_data",
)

raw_pdf_elements

Header = []
Footer = []
Title = []
NarrativeText = []
Text = []
ListItem = []

# Bucket each extracted element into a list according to its Unstructured element type
for element in raw_pdf_elements:
    if "unstructured.documents.elements.Header" in str(type(element)):
        Header.append(str(element))
    elif "unstructured.documents.elements.Footer" in str(type(element)):
        Footer.append(str(element))
    elif "unstructured.documents.elements.Title" in str(type(element)):
        Title.append(str(element))
    elif "unstructured.documents.elements.NarrativeText" in str(type(element)):
        NarrativeText.append(str(element))
    elif "unstructured.documents.elements.Text" in str(type(element)):
        Text.append(str(element))
    elif "unstructured.documents.elements.ListItem" in str(type(element)):
        ListItem.append(str(element))

NarrativeText

ListItem

img = []
for element in raw_pdf_elements:
    if "unstructured.documents.elements.Image" in str(type(element)):
        img.append(str(element))

img

len(img)

tab = []
for element in raw_pdf_elements:
    if "unstructured.documents.elements.Table" in str(type(element)):
        tab.append(str(element))

tab

len(tab)

from langchain.llms import Ollama


ollama = Ollama(model="llama3.1")
print(ollama("who are you ?"))

from langchain_core.output_parsers import StrOutputParser


from langchain_core.prompts import ChatPromptTemplate

prompt_text = """You are an assistant tasked with summarizing tables for retrieval.
\
These summaries will be embedded and used to retrieve the raw table elements. \
Give a concise summary of the table that is well optimized for retrieval. Table
{element} """

prompt = ChatPromptTemplate.from_template(prompt_text)

summarize_chain = {"element": lambda x: x} | prompt | ollama | StrOutputParser()

table_summaries = []

table_summaries=summarize_chain.batch(tab,{"max_concurrency": 5})

tab[0]

table_summaries[0]

import base64
import os
from groq import Groq

# NOTE: supply your own key here; a real API key should never be hard-coded in a shared document
os.environ["GROQ_API_KEY"] = "<YOUR_GROQ_API_KEY>"

def encode_image(image_path):
    """Getting the base64 string"""
    with open(image_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode("utf-8")


def image_summarize(image_path, prompt):
    """Summarize the image using Groq's llama-3.2-11b-vision-preview model."""
    # Initialize Groq client
    client = Groq()

    # Encode image
    base64_image = encode_image(image_path)

    # Create a chat completion request with Groq
    response = client.chat.completions.create(
        model="llama-3.2-11b-vision-preview",
        messages=[
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": prompt},
                    {
                        "type": "image_url",
                        "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
                    },
                ],
            }
        ],
        temperature=0.5,
        max_tokens=1024,
        top_p=1,
        stream=False,
        stop=None,
    )

    # Return the assistant's response, handling potential issues
    return (
        response.choices[0].message.content
        if response.choices
        else "No response received from Groq API."
    )

def generate_img_summaries(path):
    """
    Generate a summary and a base64-encoded string for an image.
    path: Path to a .jpg file extracted by Unstructured
    """
    img_base64_list = []
    image_summaries = []

    # Prompt for summarizing images
    prompt = """Analyze the image and provide a factual description of its content.
    Identify any visible text, objects, or figures in the image, and summarize the
    main theme or topic. Do not add commentary about image summarization techniques.
    If the image includes multiple components, describe each component concisely.
    """

    # Generate base64 encoded image and summary
    base64_image = encode_image(path)
    img_base64_list.append(base64_image)
    image_summaries.append(image_summarize(path, prompt))

    return img_base64_list, image_summaries


import os

print(os.listdir("extracted_data"))

fpath="extracted_data/figure-2-1.jpg"

img_base64_list,image_summaries=generate_img_summaries(fpath)

image_summaries[0]


import uuid

from langchain.retrievers.multi_vector import MultiVectorRetriever


from langchain.storage import InMemoryStore
from langchain_community.vectorstores import Chroma
from langchain_core.documents import Document
from langchain_openai import OpenAIEmbeddings

def create_multi_vector_retriever(
    vectorstore, text_summaries, texts, table_summaries, tables, image_summaries, images
):
    """
    Create a retriever that indexes summaries but returns the raw texts, tables, or images
    """
    # Initialize the storage layer
    store = InMemoryStore()
    id_key = "doc_id"

    # Create the multi-vector retriever
    retriever = MultiVectorRetriever(
        vectorstore=vectorstore,
        docstore=store,
        id_key=id_key,
    )

    # Helper function to add documents to the vectorstore and docstore
    def add_documents(retriever, doc_summaries, doc_contents):
        doc_ids = [str(uuid.uuid4()) for _ in doc_contents]
        summary_docs = [
            Document(page_content=s, metadata={id_key: doc_ids[i]})
            for i, s in enumerate(doc_summaries)
        ]
        retriever.vectorstore.add_documents(summary_docs)
        retriever.docstore.mset(list(zip(doc_ids, doc_contents)))

    # Add texts, tables, and images, skipping any empty summary list
    if text_summaries:
        add_documents(retriever, text_summaries, texts)
    if table_summaries:
        add_documents(retriever, table_summaries, tables)
    if image_summaries:
        add_documents(retriever, image_summaries, images)

    return retriever

# OpenAIEmbeddings reads the OPENAI_API_KEY environment variable, which must be set
vectorstore = Chroma(
    collection_name="mm_rag", embedding_function=OpenAIEmbeddings()
)
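# The original never defines `text_summaries` before the retriever is built, which would
# raise a NameError. A minimal sketch, assuming we reuse the same Ollama summarization
# pattern used for tables above; the prompt wording here is an assumption, not part of
# the source document.
text_prompt = ChatPromptTemplate.from_template(
    "You are an assistant tasked with summarizing text for retrieval. "
    "Give a concise summary of the following text, well optimized for retrieval: {element}"
)
text_summarize_chain = {"element": lambda x: x} | text_prompt | ollama | StrOutputParser()
text_summaries = text_summarize_chain.batch(Text, {"max_concurrency": 5}) if Text else []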

# Create retriever
retriever_multi_vector_img = create_multi_vector_retriever(
    vectorstore,
    text_summaries,
    Text,
    table_summaries,
    tab,
    image_summaries,
    img_base64_list,
)

retriever_multi_vector_img

import io
import re

from IPython.display import HTML, display


from PIL import Image

def plt_img_base64(img_base64):
    """Display a base64-encoded string as an image"""
    # Create an HTML img tag with the base64 string as the source
    image_html = f'<img src="data:image/jpeg;base64,{img_base64}" />'
    # Display the image by rendering the HTML
    display(HTML(image_html))

plt_img_base64(img_base64_list[0])
image_summaries[0]

def looks_like_base64(sb):
    """Check if the string looks like base64"""
    return re.match("^[A-Za-z0-9+/]+[=]{0,2}$", sb) is not None

def is_image_data(b64data):
    """
    Check if the base64 data is an image by looking at the start of the data
    """
    image_signatures = {
        b"\xFF\xD8\xFF": "jpg",
        b"\x89\x50\x4E\x47\x0D\x0A\x1A\x0A": "png",
        b"\x47\x49\x46\x38": "gif",
        b"\x52\x49\x46\x46": "webp",
    }
    try:
        header = base64.b64decode(b64data)[:8]  # Decode and get the first 8 bytes
        for sig, fmt in image_signatures.items():
            if header.startswith(sig):
                return True
        return False
    except Exception:
        return False

def resize_base64_image(base64_string, size=(128, 128)):
    """
    Resize an image encoded as a Base64 string
    """
    # Decode the Base64 string
    img_data = base64.b64decode(base64_string)
    img = Image.open(io.BytesIO(img_data))

    # Resize the image
    resized_img = img.resize(size, Image.LANCZOS)

    # Save the resized image to a bytes buffer
    buffered = io.BytesIO()
    resized_img.save(buffered, format=img.format)

    # Encode the resized image to Base64
    return base64.b64encode(buffered.getvalue()).decode("utf-8")

def split_image_text_types(docs):
    """
    Split base64-encoded images and texts
    """
    b64_images = []
    texts = []

    for doc in docs:
        # Check if the document is of type Document and extract page_content if so
        if isinstance(doc, Document):
            doc = doc.page_content
        if looks_like_base64(doc) and is_image_data(doc):
            doc = resize_base64_image(doc, size=(1300, 600))
            b64_images.append(doc)
        else:
            texts.append(doc)

    return {"images": b64_images, "texts": texts}

from langchain_core.messages import HumanMessage


def img_prompt_func(data_dict):
    """
    Join the retrieved context and the user question into a single multimodal message
    """
    # print(data_dict)
    formatted_texts = "\n".join(data_dict["context"]["texts"])
    messages = []

    # Adding image(s) to the messages if present
    if data_dict["context"]["images"]:
        for image in data_dict["context"]["images"]:
            image_message = {
                "type": "image_url",
                "image_url": {"url": f"data:image/jpeg;base64,{image}"},
            }
            messages.append(image_message)

    # Adding the text for analysis
    text_message = {
        "type": "text",
        "text": (
            "You are a helpful assistant.\n"
            "You will be given mixed information (text, tables, and/or images).\n"
            "Use this information to answer the user question.\n"
            f"User-provided question: {data_dict['question']}\n\n"
            "Text and / or tables:\n"
            f"{formatted_texts}"
        ),
    }
    messages.append(text_message)
    return [HumanMessage(content=messages)]
