
Commit f7935db

Merge pull request #163 from NPC-Worldwide/chris/alicanto_knowledge_graph_upgrades
Chris/alicanto knowledge graph upgrades
2 parents cf58ba6 + 135291a commit f7935db

File tree

4 files changed: +450, -86 lines changed


examples/ocr_pipeline.py

Lines changed: 310 additions & 0 deletions
@@ -0,0 +1,310 @@
from npcpy.llm_funcs import get_llm_response
from npcpy.data.load import load_pdf
import os
import pandas as pd
import json
from PIL import Image
import numpy as np
import argparse
from typing import List, Dict, Any, Optional
import time
import sys

def process_pdf(pdf_path: str, extract_images: bool = True, extract_tables: bool = False) -> Dict[str, Any]:
    """
    Process a PDF file to extract text, images, and optionally tables.

    Args:
        pdf_path: Path to the PDF file
        extract_images: Whether to extract images from the PDF
        extract_tables: Whether to extract tables from the PDF

    Returns:
        Dictionary containing the extracted content
    """
    result = {"text": [], "images": [], "tables": []}

    if not os.path.exists(pdf_path):
        print(f"Error: PDF file not found at {pdf_path}")
        return result

    try:
        pdf_df = load_pdf(pdf_path)

        # Extract text
        if 'texts' in pdf_df.columns:
            texts = json.loads(pdf_df['texts'].iloc[0])
            for item in texts:
                result["text"].append({
                    "page": item.get('page', 0),
                    "content": item.get('content', ''),
                    "bbox": item.get('bbox', None)
                })

        # Extract images
        if extract_images and 'images' in pdf_df.columns:
            images_data = json.loads(pdf_df['images'].iloc[0])
            temp_paths = []

            for idx, img_data in enumerate(images_data):
                if 'array' in img_data and 'shape' in img_data and 'dtype' in img_data:
                    # Rebuild the image array from its raw bytes, then restore its shape
                    shape = img_data['shape']
                    dtype = img_data['dtype']
                    img_array = np.frombuffer(img_data['array'], dtype=np.dtype(dtype))
                    img_array = img_array.reshape(shape)

                    img = Image.fromarray(img_array)
                    temp_img_path = f"temp_pdf_image_{os.path.basename(pdf_path)}_{idx}.png"
                    img.save(temp_img_path)

                    result["images"].append({
                        "path": temp_img_path,
                        "page": img_data.get('page', 0),
                        "bbox": img_data.get('bbox', None)
                    })
                    temp_paths.append(temp_img_path)

            result["temp_paths"] = temp_paths

        # Extract tables (if requested and available)
        if extract_tables and 'tables' in pdf_df.columns:
            tables_data = json.loads(pdf_df['tables'].iloc[0])
            for table in tables_data:
                if isinstance(table, dict) and 'data' in table:
                    result["tables"].append({
                        "page": table.get('page', 0),
                        "data": table.get('data'),
                        "caption": table.get('caption', '')
                    })

    except Exception as e:
        print(f"Error processing PDF {pdf_path}: {e}")

    return result

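# Illustrative usage (a sketch; the path below comes from the example at the
# bottom of this file, and process_pdf returns empty lists if the file is missing):
#   content = process_pdf('test_data/yuan2004.pdf', extract_tables=True)
#   print(f"{len(content['text'])} text blocks, {len(content['images'])} images")
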
def process_image(image_path: str) -> Optional[str]:
    """Process an image file and return its path if it is a valid image."""
    if not os.path.exists(image_path):
        print(f"Error: Image file not found at {image_path}")
        return None

    try:
        # Verify that the file is a valid image without keeping it open
        with Image.open(image_path) as img:
            img.verify()
        return image_path
    except Exception as e:
        print(f"Error processing image {image_path}: {e}")
        return None

def process_csv(csv_path: str, max_rows: int = 10) -> Optional[str]:
    """Process a CSV file and return a text sample of its first rows."""
    if not os.path.exists(csv_path):
        print(f"Error: CSV file not found at {csv_path}")
        return None

    try:
        data = pd.read_csv(csv_path)
        return data.head(max_rows).to_string()
    except Exception as e:
        print(f"Error processing CSV {csv_path}: {e}")
        return None

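# Illustrative usage (sketch): process_csv('test_data/sample_data.csv') returns
# the first 10 rows rendered as text, or None if the file is missing or unreadable.
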
def extract_and_analyze(
    file_paths: List[str],
    model: str = 'gemma3:4b',
    provider: str = 'ollama',
    preprocess: bool = False,
    extract_tables: bool = False,
    output_json: bool = False,
    output_file: Optional[str] = None,
) -> Dict[str, Any]:
    """
    Extract content from files and analyze it using an LLM.

    Args:
        file_paths: List of paths to files (PDFs, images, CSVs)
        model: LLM model to use
        provider: LLM provider
        preprocess: Whether to do detailed preprocessing (True) or use the attachment-based approach (False)
        extract_tables: Whether to extract tables from PDFs
        output_json: Whether to ask for structured JSON output
        output_file: Optional path to save results

    Returns:
        Dictionary containing analysis results
    """
    start_time = time.time()

    if not preprocess:
        # Simple attachment-based approach: hand the raw files to the LLM
        print(f"Using simple attachment-based approach with {len(file_paths)} files")
        format_param = "json" if output_json else None

        response = get_llm_response(
            'Extract and analyze content from these files. Identify key concepts, data points, and provide a comprehensive analysis.',
            model=model,
            provider=provider,
            attachments=file_paths,
            format=format_param
        )

        result = {
            "analysis": response['response'],
            "processing_time": time.time() - start_time,
            "file_count": len(file_paths),
            "approach": "attachment-based"
        }

    else:
        # Detailed preprocessing approach: extract content per file type first
        print(f"Using detailed preprocessing approach with {len(file_paths)} files")
        pdf_results = []
        image_paths = []
        csv_contents = []
        temp_files = []

        # Process each file based on its extension
        for file_path in file_paths:
            _, ext = os.path.splitext(file_path)
            ext = ext.lower()

            if ext == '.pdf':
                print(f"Processing PDF: {file_path}")
                pdf_result = process_pdf(file_path, extract_tables=extract_tables)
                pdf_results.append({"path": file_path, "content": pdf_result})

                # Add extracted images to the list
                if "temp_paths" in pdf_result:
                    image_paths.extend(pdf_result["temp_paths"])
                    temp_files.extend(pdf_result["temp_paths"])

            elif ext in ['.jpg', '.jpeg', '.png', '.gif', '.bmp']:
                print(f"Processing image: {file_path}")
                img_path = process_image(file_path)
                if img_path:
                    image_paths.append(img_path)

            elif ext == '.csv':
                print(f"Processing CSV: {file_path}")
                csv_content = process_csv(file_path)
                if csv_content:
                    csv_contents.append({"path": file_path, "content": csv_content})

        # Build the prompt from the extracted content
        prompt = "Analyze the following content extracted from multiple documents:\n\n"

        # Add PDF text content
        for pdf_result in pdf_results:
            pdf_path = pdf_result["path"]
            pdf_content = pdf_result["content"]

            if pdf_content["text"]:
                prompt += f"PDF TEXT CONTENT ({os.path.basename(pdf_path)}):\n"
                # Limit to the first 5 text blocks to avoid exceeding the context window
                for i, text_item in enumerate(pdf_content["text"][:5]):
                    prompt += f"- Page {text_item['page']}: {text_item['content'][:500]}...\n"
                prompt += "\n"

            # Add table content if available
            if pdf_content["tables"]:
                prompt += f"PDF TABLES ({os.path.basename(pdf_path)}):\n"
                for i, table in enumerate(pdf_content["tables"][:3]):
                    prompt += f"- Table {i+1} (Page {table['page']}): {table['caption']}\n"
                    prompt += f"{str(table['data'])[:500]}...\n"
                prompt += "\n"

        # Add CSV content
        for csv_item in csv_contents:
            prompt += f"CSV DATA ({os.path.basename(csv_item['path'])}):\n"
            prompt += f"{csv_item['content']}\n\n"

        # Add analysis instructions
        prompt += "\nPlease provide a comprehensive analysis of the content above, identifying key concepts, patterns, and insights."

        if output_json:
            prompt += "\nFormat your response as a JSON object with the following structure: " + \
                      '{"key_concepts": [], "data_points": [], "analysis": "", "insights": []}'

        # Call the LLM with the preprocessed content and extracted images
        format_param = "json" if output_json else None
        response = get_llm_response(
            prompt=prompt,
            model=model,
            provider=provider,
            images=image_paths,
            format=format_param
        )

        result = {
            "analysis": response['response'],
            "processing_time": time.time() - start_time,
            "file_count": len(file_paths),
            "pdf_count": len(pdf_results),
            "image_count": len(image_paths),
            "csv_count": len(csv_contents),
            "approach": "detailed-preprocessing"
        }

        # Clean up temporary image files extracted from PDFs
        for temp_file in temp_files:
            if os.path.exists(temp_file):
                try:
                    os.remove(temp_file)
                except Exception as e:
                    print(f"Error removing temp file {temp_file}: {e}")

    # Save results if an output file was specified
    if output_file:
        try:
            with open(output_file, 'w') as f:
                json.dump(result, f, indent=2)
            print(f"Results saved to {output_file}")
        except Exception as e:
            print(f"Error saving results to {output_file}: {e}")

    return result

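# Illustrative programmatic usage (a sketch; assumes a local Ollama server with
# the default gemma3:4b model available, and uses the test_data paths from the
# example below):
#   summary = extract_and_analyze(
#       ['test_data/yuan2004.pdf', 'test_data/sample_data.csv'],
#       preprocess=True,
#       extract_tables=True,
#   )
#   print(summary['approach'], summary['pdf_count'], summary['csv_count'])
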
if __name__ == "__main__":
    # Fall back to example paths when the script is run without arguments
    # (this check must come before parse_args, which exits if 'files' is missing)
    if not sys.argv[1:]:
        print("\nRunning example with default paths:")
        pdf_path = 'test_data/yuan2004.pdf'
        image_path = 'test_data/markov_chain.png'
        csv_path = 'test_data/sample_data.csv'

        result = extract_and_analyze(
            file_paths=[pdf_path, image_path, csv_path],
            model='gemma3:4b',
            provider='ollama',
            preprocess=False
        )

        print("\nExample Analysis Results:")
        print(result["analysis"])
        print(f"\nExample processing completed in {result['processing_time']:.2f} seconds")
        sys.exit(0)

    parser = argparse.ArgumentParser(description="OCR Pipeline for extracting and analyzing document content")
    parser.add_argument('files', nargs='+', help='Paths to files (PDFs, images, CSVs)')
    parser.add_argument('--model', default='gemma3:4b', help='LLM model to use')
    parser.add_argument('--provider', default='ollama', help='LLM provider')
    parser.add_argument('--preprocess', action='store_true', help='Use detailed preprocessing (default: attachment-based)')
    parser.add_argument('--tables', action='store_true', help='Extract tables from PDFs')
    parser.add_argument('--json', action='store_true', help='Request JSON-formatted output')
    parser.add_argument('--output', help='Save results to file')

    args = parser.parse_args()

    result = extract_and_analyze(
        file_paths=args.files,
        model=args.model,
        provider=args.provider,
        preprocess=args.preprocess,
        extract_tables=args.tables,
        output_json=args.json,
        output_file=args.output
    )

    print("\nAnalysis Results:")
    print(result["analysis"])
    print(f"\nProcessing completed in {result['processing_time']:.2f} seconds")
