Commit 50f3f11
updates to response and llm funcs to simplify the npc system message handling and to add attachment handling, which was already more or less present elsewhere in spool, yap, etc.
1 parent 5b4253f commit 50f3f11

File tree

4 files changed: +449 −86 lines
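
The attachment handling this commit adds is what the new example file below exercises: get_llm_response can now take raw file paths and handle the loading itself. A minimal sketch of the call shape used in the example (the file names here are hypothetical placeholders):

    from npcpy.llm_funcs import get_llm_response

    # 'report.pdf' and 'chart.png' are placeholder paths
    resp = get_llm_response(
        'Summarize these documents.',
        model='gemma3:4b',
        provider='ollama',
        attachments=['report.pdf', 'chart.png'],
    )
    print(resp['response'])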

examples/ocr_pipeline.py

Lines changed: 309 additions & 0 deletions
from npcpy.llm_funcs import get_llm_response
from npcpy.data.load import load_pdf, load_image
import os
import sys  # needed for the no-args fallback in the __main__ block
import pandas as pd
import json
from PIL import Image
import io
import numpy as np
import argparse
from typing import List, Dict, Any, Optional, Union
import time

def process_pdf(pdf_path: str, extract_images: bool = True, extract_tables: bool = False) -> Dict[str, Any]:
    """
    Process a PDF file to extract text, images, and optionally tables

    Args:
        pdf_path: Path to the PDF file
        extract_images: Whether to extract images from the PDF
        extract_tables: Whether to extract tables from the PDF

    Returns:
        Dictionary containing the extracted content
    """
    result = {"text": [], "images": [], "tables": []}

    if not os.path.exists(pdf_path):
        print(f"Error: PDF file not found at {pdf_path}")
        return result

    try:
        pdf_df = load_pdf(pdf_path)

        # Extract text blocks with their page numbers and bounding boxes
        if 'texts' in pdf_df.columns:
            texts = json.loads(pdf_df['texts'].iloc[0])
            for item in texts:
                result["text"].append({
                    "page": item.get('page', 0),
                    "content": item.get('content', ''),
                    "bbox": item.get('bbox', None)
                })

        # Extract embedded images: rebuild each numpy array from its raw
        # buffer, then save it as a temporary PNG for the vision model
        if extract_images and 'images' in pdf_df.columns:
            images_data = json.loads(pdf_df['images'].iloc[0])
            temp_paths = []

            for idx, img_data in enumerate(images_data):
                if 'array' in img_data and 'shape' in img_data and 'dtype' in img_data:
                    shape = img_data['shape']
                    dtype = img_data['dtype']
                    img_array = np.frombuffer(img_data['array'], dtype=np.dtype(dtype))
                    img_array = img_array.reshape(shape)

                    img = Image.fromarray(img_array)
                    temp_img_path = f"temp_pdf_image_{os.path.basename(pdf_path)}_{idx}.png"
                    img.save(temp_img_path)

                    result["images"].append({
                        "path": temp_img_path,
                        "page": img_data.get('page', 0),
                        "bbox": img_data.get('bbox', None)
                    })
                    temp_paths.append(temp_img_path)

            result["temp_paths"] = temp_paths

        # Extract tables (if requested and available)
        if extract_tables and 'tables' in pdf_df.columns:
            tables_data = json.loads(pdf_df['tables'].iloc[0])
            for table in tables_data:
                if isinstance(table, dict) and 'data' in table:
                    result["tables"].append({
                        "page": table.get('page', 0),
                        "data": table.get('data'),
                        "caption": table.get('caption', '')
                    })

    except Exception as e:
        print(f"Error processing PDF {pdf_path}: {e}")

    return result

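# Usage sketch for process_pdf (the path mirrors the example at the bottom
# of this file and is assumed to exist locally):
#     content = process_pdf('test_data/yuan2004.pdf')
#     print(len(content["text"]), "text blocks,",
#           len(content["images"]), "extracted images")
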
def process_image(image_path: str) -> Optional[str]:
    """Process an image file and return its path if valid"""
    if not os.path.exists(image_path):
        print(f"Error: Image file not found at {image_path}")
        return None

    try:
        # Verify it's a valid image without keeping the file handle open
        with Image.open(image_path) as img:
            img.verify()
        return image_path
    except Exception as e:
        print(f"Error processing image {image_path}: {e}")
        return None

def process_csv(csv_path: str, max_rows: int = 10) -> Optional[str]:
    """Process a CSV file and return a sample of its content"""
    if not os.path.exists(csv_path):
        print(f"Error: CSV file not found at {csv_path}")
        return None

    try:
        data = pd.read_csv(csv_path)
        return data.head(max_rows).to_string()
    except Exception as e:
        print(f"Error processing CSV {csv_path}: {e}")
        return None

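# Usage sketch for process_csv (hypothetical file): preview the first rows
# of a CSV as plain text
#     sample = process_csv('test_data/sample_data.csv', max_rows=5)
#     if sample:
#         print(sample)
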
def extract_and_analyze(
    file_paths: List[str],
    model: str = 'gemma3:4b',
    provider: str = 'ollama',
    preprocess: bool = False,
    extract_tables: bool = False,
    output_json: bool = False,
    output_file: str = None,
) -> Dict[str, Any]:
    """
    Extract content from files and analyze it using an LLM

    Args:
        file_paths: List of paths to files (PDFs, images, CSVs)
        model: LLM model to use
        provider: LLM provider
        preprocess: Whether to do detailed preprocessing (True) or use the attachment-based approach (False)
        extract_tables: Whether to extract tables from PDFs
        output_json: Whether to ask for structured JSON output
        output_file: Optional path to save results

    Returns:
        Dictionary containing analysis results
    """
    start_time = time.time()

    if not preprocess:
        # Simple attachment-based approach: hand the raw files to
        # get_llm_response and let it do the loading
        print(f"Using simple attachment-based approach with {len(file_paths)} files")
        format_param = "json" if output_json else None

        response = get_llm_response(
            'Extract and analyze content from these files. Identify key concepts, data points, and provide a comprehensive analysis.',
            model=model,
            provider=provider,
            attachments=file_paths,
            format=format_param
        )

        result = {
            "analysis": response['response'],
            "processing_time": time.time() - start_time,
            "file_count": len(file_paths),
            "approach": "attachment-based"
        }

    else:
        # Detailed preprocessing approach: extract content ourselves and
        # build the prompt by hand
        print(f"Using detailed preprocessing approach with {len(file_paths)} files")
        pdf_results = []
        image_paths = []
        csv_contents = []
        temp_files = []

        # Process each file based on its extension
        for file_path in file_paths:
            _, ext = os.path.splitext(file_path)
            ext = ext.lower()

            if ext == '.pdf':
                print(f"Processing PDF: {file_path}")
                pdf_result = process_pdf(file_path, extract_tables=extract_tables)
                pdf_results.append({"path": file_path, "content": pdf_result})

                # Add extracted images to the list
                if "temp_paths" in pdf_result:
                    image_paths.extend(pdf_result["temp_paths"])
                    temp_files.extend(pdf_result["temp_paths"])

            elif ext in ['.jpg', '.jpeg', '.png', '.gif', '.bmp']:
                print(f"Processing image: {file_path}")
                img_path = process_image(file_path)
                if img_path:
                    image_paths.append(img_path)

            elif ext == '.csv':
                print(f"Processing CSV: {file_path}")
                csv_content = process_csv(file_path)
                if csv_content:
                    csv_contents.append({"path": file_path, "content": csv_content})

        # Build the prompt from the extracted content
        prompt = "Analyze the following content extracted from multiple documents:\n\n"

        # Add PDF text content
        for pdf_result in pdf_results:
            pdf_path = pdf_result["path"]
            pdf_content = pdf_result["content"]

            if pdf_content["text"]:
                prompt += f"PDF TEXT CONTENT ({os.path.basename(pdf_path)}):\n"
                # Limit to the first 5 text blocks to avoid exceeding the context window
                for i, text_item in enumerate(pdf_content["text"][:5]):
                    prompt += f"- Page {text_item['page']}: {text_item['content'][:500]}...\n"
                prompt += "\n"

            # Add table content if available
            if pdf_content["tables"]:
                prompt += f"PDF TABLES ({os.path.basename(pdf_path)}):\n"
                for i, table in enumerate(pdf_content["tables"][:3]):
                    prompt += f"- Table {i+1} (Page {table['page']}): {table['caption']}\n"
                    prompt += f"{str(table['data'])[:500]}...\n"
                prompt += "\n"

        # Add CSV content
        for csv_item in csv_contents:
            prompt += f"CSV DATA ({os.path.basename(csv_item['path'])}):\n"
            prompt += f"{csv_item['content']}\n\n"

        # Add analysis instructions
        prompt += "\nPlease provide a comprehensive analysis of the content above, identifying key concepts, patterns, and insights."

        if output_json:
            prompt += "\nFormat your response as a JSON object with the following structure: " + \
                '{"key_concepts": [], "data_points": [], "analysis": "", "insights": []}'

        # Call the LLM with the preprocessed content and images
        format_param = "json" if output_json else None
        response = get_llm_response(
            prompt=prompt,
            model=model,
            provider=provider,
            images=image_paths,
            format=format_param
        )

        result = {
            "analysis": response['response'],
            "processing_time": time.time() - start_time,
            "file_count": len(file_paths),
            "pdf_count": len(pdf_results),
            "image_count": len(image_paths),
            "csv_count": len(csv_contents),
            "approach": "detailed-preprocessing"
        }

        # Clean up the temporary image files extracted from PDFs
        for temp_file in temp_files:
            if os.path.exists(temp_file):
                try:
                    os.remove(temp_file)
                except Exception as e:
                    print(f"Error removing temp file {temp_file}: {e}")

    # Save results if an output file was specified
    if output_file:
        try:
            with open(output_file, 'w') as f:
                json.dump(result, f, indent=2)
            print(f"Results saved to {output_file}")
        except Exception as e:
            print(f"Error saving results to {output_file}: {e}")

    return result

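# Usage sketch contrasting the two modes (hypothetical path); the fast path
# sends the file as an attachment, the slow path extracts content manually:
#     fast = extract_and_analyze(['report.pdf'], preprocess=False)
#     slow = extract_and_analyze(['report.pdf'], preprocess=True)
#     print(fast["approach"], "vs", slow["approach"])
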
if __name__ == "__main__":
    # With no CLI arguments, run a bundled example instead of letting
    # argparse exit on the missing required positional argument
    if not sys.argv[1:]:
        print("No files given; running example with default paths:")
        result = extract_and_analyze(
            file_paths=[
                'test_data/yuan2004.pdf',
                'test_data/markov_chain.png',
                'test_data/sample_data.csv',
            ],
            model='gemma3:4b',
            provider='ollama',
            preprocess=False
        )

        print("\nExample Analysis Results:")
        print(result["analysis"])
        print(f"\nExample processing completed in {result['processing_time']:.2f} seconds")
    else:
        parser = argparse.ArgumentParser(description="OCR pipeline for extracting and analyzing document content")
        parser.add_argument('files', nargs='+', help='Paths to files (PDFs, images, CSVs)')
        parser.add_argument('--model', default='gemma3:4b', help='LLM model to use')
        parser.add_argument('--provider', default='ollama', help='LLM provider')
        parser.add_argument('--preprocess', action='store_true', help='Use detailed preprocessing (default: attachment-based)')
        parser.add_argument('--tables', action='store_true', help='Extract tables from PDFs')
        parser.add_argument('--json', action='store_true', help='Request JSON-formatted output')
        parser.add_argument('--output', help='Save results to file')

        args = parser.parse_args()

        result = extract_and_analyze(
            file_paths=args.files,
            model=args.model,
            provider=args.provider,
            preprocess=args.preprocess,
            extract_tables=args.tables,
            output_json=args.json,
            output_file=args.output
        )

        print("\nAnalysis Results:")
        print(result["analysis"])
        print(f"\nProcessing completed in {result['processing_time']:.2f} seconds")
