what function for OCR or RAG in langchain #24760
Unanswered
LiuChao888 asked this question in Q&A
Replies: 1 comment 1 reply
-
Here is example code for OCR and RAG using LangChain. It covers PDFs and the images extracted from them; a sketch for the other file types you mention (PPT, DOC, CSV, video) follows the main listing.

```python
import base64
import os
import re
import uuid

from langchain.retrievers.multi_vector import MultiVectorRetriever
from langchain.storage import InMemoryStore
from langchain_community.vectorstores import Chroma
from langchain_core.documents import Document
from langchain_core.messages import HumanMessage
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnableLambda, RunnablePassthrough
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_text_splitters import CharacterTextSplitter
from unstructured.partition.pdf import partition_pdf
# Extract text, tables, and images from a PDF with unstructured
def extract_pdf_elements(path, fname):
    return partition_pdf(
        filename=os.path.join(path, fname),
        extract_images_in_pdf=True,
        infer_table_structure=True,
        chunking_strategy="by_title",
        max_characters=4000,
        new_after_n_chars=3800,
        combine_text_under_n_chars=2000,
        image_output_dir_path=path,
    )
# Categorize extracted elements by type
def categorize_elements(raw_pdf_elements):
    tables = []
    texts = []
    for element in raw_pdf_elements:
        if "unstructured.documents.elements.Table" in str(type(element)):
            tables.append(str(element))
        elif "unstructured.documents.elements.CompositeElement" in str(type(element)):
            texts.append(str(element))
    return texts, tables
# Generate summaries of text chunks and tables for retrieval
def generate_text_summaries(texts, tables, summarize_texts=False):
    prompt_text = """You are an assistant tasked with summarizing tables and text for retrieval. \
These summaries will be embedded and used to retrieve the raw text or table elements. \
Give a concise summary of the table or text that is well optimized for retrieval. Table or text: {element}"""
    prompt = ChatPromptTemplate.from_template(prompt_text)
    model = ChatOpenAI(temperature=0, model="gpt-4")
    summarize_chain = {"element": lambda x: x} | prompt | model | StrOutputParser()
    text_summaries = []
    table_summaries = []
    if texts and summarize_texts:
        text_summaries = summarize_chain.batch(texts, {"max_concurrency": 5})
    elif texts:
        text_summaries = texts
    if tables:
        table_summaries = summarize_chain.batch(tables, {"max_concurrency": 5})
    return text_summaries, table_summaries
# Encode an image file as a base64 string
def encode_image(image_path):
    with open(image_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode("utf-8")
# Summarize a single base64-encoded image with a vision model
def image_summarize(img_base64, prompt):
    chat = ChatOpenAI(model="gpt-4-vision-preview", max_tokens=1024)
    msg = chat.invoke(
        [
            HumanMessage(
                content=[
                    {"type": "text", "text": prompt},
                    {
                        "type": "image_url",
                        "image_url": {"url": f"data:image/jpeg;base64,{img_base64}"},
                    },
                ]
            )
        ]
    )
    return msg.content
# Generate summaries for all .jpg images in a directory
def generate_img_summaries(path):
    img_base64_list = []
    image_summaries = []
    prompt = """You are an assistant tasked with summarizing images for retrieval. \
These summaries will be embedded and used to retrieve the raw image. \
Give a concise summary of the image that is well optimized for retrieval."""
    for img_file in sorted(os.listdir(path)):
        if img_file.endswith(".jpg"):
            img_path = os.path.join(path, img_file)
            base64_image = encode_image(img_path)
            img_base64_list.append(base64_image)
            image_summaries.append(image_summarize(base64_image, prompt))
    return img_base64_list, image_summaries
# Build a multi-vector retriever: summaries are embedded in the vector
# store, while the raw content lives in the docstore under the same doc_id
def create_multi_vector_retriever(
    vectorstore, text_summaries, texts, table_summaries, tables, image_summaries, images
):
    store = InMemoryStore()
    id_key = "doc_id"
    retriever = MultiVectorRetriever(
        vectorstore=vectorstore,
        docstore=store,
        id_key=id_key,
    )

    def add_documents(retriever, doc_summaries, doc_contents):
        doc_ids = [str(uuid.uuid4()) for _ in doc_contents]
        summary_docs = [
            Document(page_content=s, metadata={id_key: doc_ids[i]})
            for i, s in enumerate(doc_summaries)
        ]
        retriever.vectorstore.add_documents(summary_docs)
        retriever.docstore.mset(list(zip(doc_ids, doc_contents)))

    if text_summaries:
        add_documents(retriever, text_summaries, texts)
    if table_summaries:
        add_documents(retriever, table_summaries, tables)
    if image_summaries:
        add_documents(retriever, image_summaries, images)
    return retriever
# Helpers referenced below but missing from the original snippet
def looks_like_base64(sb):
    """Check whether the string looks like base64 data."""
    return re.match("^[A-Za-z0-9+/]+[=]{0,2}$", sb) is not None

def is_image_data(b64data):
    """Check the decoded header bytes for common image signatures."""
    image_signatures = (
        b"\xff\xd8\xff",       # jpg
        b"\x89PNG\r\n\x1a\n",  # png
        b"GIF8",               # gif
        b"RIFF",               # webp
    )
    try:
        header = base64.b64decode(b64data)[:8]
        return any(header.startswith(sig) for sig in image_signatures)
    except Exception:
        return False

# Split retrieved documents into base64 images and plain texts
def split_image_text_types(docs):
    b64_images = []
    texts = []
    for doc in docs:
        if isinstance(doc, Document):
            doc = doc.page_content
        if looks_like_base64(doc) and is_image_data(doc):
            b64_images.append(doc)
        else:
            texts.append(doc)
    return {"images": b64_images, "texts": texts}
# Assemble the multimodal prompt from retrieved context and the question
def img_prompt_func(data_dict):
    formatted_texts = "\n".join(data_dict["context"]["texts"])
    messages = []
    for image in data_dict["context"]["images"]:
        messages.append(
            {
                "type": "image_url",
                "image_url": {"url": f"data:image/jpeg;base64,{image}"},
            }
        )
    text_message = {
        "type": "text",
        "text": (
            "You are a financial analyst tasked with providing investment advice.\n"
            "You will be given a mix of text, tables, and image(s), usually of charts or graphs.\n"
            "Use this information to provide investment advice related to the user's question.\n"
            f"User-provided question: {data_dict['question']}\n\n"
            "Text and / or tables:\n"
            f"{formatted_texts}"
        ),
    }
    messages.append(text_message)
    return [HumanMessage(content=messages)]
# Build the multimodal RAG chain: retrieve, split context, prompt, generate
def multi_modal_rag_chain(retriever):
    model = ChatOpenAI(temperature=0, model="gpt-4-vision-preview", max_tokens=1024)
    chain = (
        {
            "context": retriever | RunnableLambda(split_image_text_types),
            "question": RunnablePassthrough(),
        }
        | RunnableLambda(img_prompt_func)
        | model
        | StrOutputParser()
    )
    return chain
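# Note: a chain built by multi_modal_rag_chain takes a plain question
# string, because "question" is wired to RunnablePassthrough(); e.g.
# (hypothetical query):
#   rag_chain.invoke("What trend do the retrieved charts show?")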
# Example usage
fpath = "/path/to/your/files/"
fname = "example.pdf"
raw_pdf_elements = extract_pdf_elements(fpath, fname)
texts, tables = categorize_elements(raw_pdf_elements)
text_splitter = CharacterTextSplitter.from_tiktoken_encoder(chunk_size=4000, chunk_overlap=0)
joined_texts = " ".join(texts)
texts_4k_token = text_splitter.split_text(joined_texts)
text_summaries, table_summaries = generate_text_summaries(texts_4k_token, tables, summarize_texts=True)
img_base64_list, image_summaries = generate_img_summaries(fpath)
vectorstore = Chroma(collection_name="mm_rag_example", embedding_function=OpenAIEmbeddings())
retriever_multi_vector_img = create_multi_vector_retriever(
    vectorstore, text_summaries, texts, table_summaries, tables, image_summaries, img_base64_list
)
rag_chain = multi_modal_rag_chain(retriever_multi_vector_img)
```

This code covers the extraction, summarization, and retrieval of PDF content, including images and tables, using LangChain. It also demonstrates how to handle semi-structured and multi-modal data, leveraging the capabilities of multimodal LLMs like GPT-4V [1][2][3].
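The listing above covers only PDFs and the images extracted from them. For the other formats in the question (PPT, DOC, CSV, images), here is a minimal sketch using document loaders from `langchain_community`. The loader classes are real, but the `load_any` dispatch helper is a hypothetical convenience, and `UnstructuredImageLoader` performs OCR only if a local OCR backend (e.g. Tesseract) is installed for `unstructured`:

```python
import os

from langchain_community.document_loaders import (
    CSVLoader,
    UnstructuredImageLoader,
    UnstructuredPowerPointLoader,
    UnstructuredWordDocumentLoader,
)

# Hypothetical helper: pick a loader by file extension and return Documents.
def load_any(file_path: str):
    loaders = {
        ".ppt": UnstructuredPowerPointLoader,
        ".pptx": UnstructuredPowerPointLoader,
        ".doc": UnstructuredWordDocumentLoader,
        ".docx": UnstructuredWordDocumentLoader,
        ".csv": CSVLoader,
        ".jpg": UnstructuredImageLoader,  # OCR via unstructured + Tesseract
        ".png": UnstructuredImageLoader,
    }
    ext = os.path.splitext(file_path)[1].lower()
    if ext not in loaders:
        raise ValueError(f"No loader configured for {ext!r}")
    return loaders[ext](file_path).load()

# Video has no generic built-in loader: a common approach is to extract
# frames (and summarize them as images, as above) or transcribe the audio
# track before indexing the transcript.
```

The returned Documents can then be split, summarized, and added to the same multi-vector retriever as the PDF texts above.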
-
Description
I want to write some functions using LangChain, mainly for OCR and RAG, covering images, PPT, PDF, DOC, CSV, and video. Can you give me some example code?
Thanks
System Info
langchain 0.2.9
python 3.11