Skip to content

Commit ef962f0

Browse files
committed
updated simple rag docling code - Adithya S K
1 parent d5af89d commit ef962f0

File tree

1 file changed

+36
-14
lines changed

1 file changed

+36
-14
lines changed

varag/rag/_simpleRAG.py

Lines changed: 36 additions & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -125,25 +125,47 @@ def _process_file(
125125
verbose: bool,
126126
ocr: bool,
127127
):
128+
"""
129+
Process a single file for text extraction and embedding.
130+
131+
Args:
132+
file_path: Path to the file to process
133+
chunker: Chunking strategy to use
134+
metadata: Additional metadata to store with chunks
135+
verbose: Whether to show progress bars
136+
ocr: Whether to use OCR for text extraction
137+
"""
128138
if ocr:
129139
logger.info(f"Using OCR for file: {file_path}")
130140
try:
131-
from docling.datamodel.document import DocumentConversionInput
132-
from docling.document_converter import DocumentConverter
141+
from docling.datamodel.pipeline_options import PdfPipelineOptions
142+
from docling.document_converter import DocumentConverter, PdfFormatOption
143+
from docling.datamodel.base_models import InputFormat
144+
145+
# Configure pipeline options
146+
pipeline_options = PdfPipelineOptions()
147+
pipeline_options.do_ocr = True
148+
pipeline_options.do_table_structure = True
149+
pipeline_options.table_structure_options.do_cell_matching = True
150+
151+
# Initialize document converter with configured options
152+
doc_converter = DocumentConverter(
153+
format_options={
154+
InputFormat.PDF: PdfFormatOption(
155+
pipeline_options=pipeline_options
156+
)
157+
}
158+
)
133159

134-
doc_converter = DocumentConverter()
135-
input_data = DocumentConversionInput.from_paths([Path(file_path)])
136160
logger.info(f"Starting OCR conversion for file: {file_path}")
137-
conv_results = doc_converter.convert(input_data)
138-
139-
conv_result = next(conv_results, None)
140-
if conv_result is None:
161+
conv_result = doc_converter.convert(Path(file_path))
162+
163+
if not conv_result:
141164
raise ValueError("No conversion results")
142-
143-
text = conv_result.render_as_markdown()
144-
logger.info(
145-
f"OCR conversion completed for file: {file_path}. Status: {conv_result.status}"
146-
)
165+
166+
# Export as markdown for consistent formatting
167+
text = conv_result.document.export_to_markdown()
168+
logger.info(f"OCR conversion completed for file: {file_path}")
147169

148170
except ImportError:
149171
logger.error(
@@ -174,7 +196,7 @@ def _process_file(
174196
{
175197
"document_name": os.path.basename(file_path),
176198
"chunk_index": i,
177-
"chunk_id": str(uuid.uuid4()), # Generate unique ID for each chunk
199+
"chunk_id": str(uuid.uuid4()),
178200
"text": chunk,
179201
"vector": embedding.tolist(),
180202
"metadata": json.dumps(metadata or {}),

0 commit comments

Comments (0)