@@ -125,25 +125,47 @@ def _process_file(
125125        verbose : bool ,
126126        ocr : bool ,
127127    ):
128+         """ 
129+         Process a single file for text extraction and embedding. 
130+          
131+         Args: 
132+             file_path: Path to the file to process 
133+             chunker: Chunking strategy to use 
134+             metadata: Additional metadata to store with chunks 
135+             verbose: Whether to show progress bars 
136+             ocr: Whether to use OCR for text extraction 
137+         """ 
128138        if  ocr :
129139            logger .info (f"Using OCR for file: { file_path }  " )
130140            try :
131-                 from  docling .datamodel .document  import  DocumentConversionInput 
132-                 from  docling .document_converter  import  DocumentConverter 
141+                 from  docling .datamodel .pipeline_options  import  PdfPipelineOptions 
142+                 from  docling .document_converter  import  DocumentConverter , PdfFormatOption 
143+                 from  docling .datamodel .base_models  import  InputFormat 
144+ 
145+                 # Configure pipeline options 
146+                 pipeline_options  =  PdfPipelineOptions ()
147+                 pipeline_options .do_ocr  =  True 
148+                 pipeline_options .do_table_structure  =  True 
149+                 pipeline_options .table_structure_options .do_cell_matching  =  True 
150+                 
151+                 # Initialize document converter with configured options 
152+                 doc_converter  =  DocumentConverter (
153+                     format_options = {
154+                         InputFormat .PDF : PdfFormatOption (
155+                             pipeline_options = pipeline_options 
156+                         )
157+                     }
158+                 )
133159
134-                 doc_converter  =  DocumentConverter ()
135-                 input_data  =  DocumentConversionInput .from_paths ([Path (file_path )])
136160                logger .info (f"Starting OCR conversion for file: { file_path }  " )
137-                 conv_results  =  doc_converter .convert (input_data )
138- 
139-                 conv_result  =  next (conv_results , None )
140-                 if  conv_result  is  None :
161+                 conv_result  =  doc_converter .convert (Path (file_path ))
162+                 
163+                 if  not  conv_result :
141164                    raise  ValueError ("No conversion results" )
142- 
143-                 text  =  conv_result .render_as_markdown ()
144-                 logger .info (
145-                     f"OCR conversion completed for file: { file_path }  . Status: { conv_result .status }  " 
146-                 )
165+                 
166+                 # Export as markdown for consistent formatting 
167+                 text  =  conv_result .document .export_to_markdown ()
168+                 logger .info (f"OCR conversion completed for file: { file_path }  " )
147169
148170            except  ImportError :
149171                logger .error (
@@ -174,7 +196,7 @@ def _process_file(
174196            {
175197                "document_name" : os .path .basename (file_path ),
176198                "chunk_index" : i ,
177-                 "chunk_id" : str (uuid .uuid4 ()),   # Generate unique ID for each chunk 
199+                 "chunk_id" : str (uuid .uuid4 ()),
178200                "text" : chunk ,
179201                "vector" : embedding .tolist (),
180202                "metadata" : json .dumps (metadata  or  {}),
0 commit comments