@@ -251,65 +251,114 @@ def process_images_parallel(image_files, output_path, max_workers=None):
251251 return successful_files , failed_files , results
252252
253253
254- def main (input_path , output_path , max_workers = None ):
def validate_and_setup(input_path, output_path):
    """
    Run the pre-flight checks and prepare the output location
    :param input_path: Path handed to check_path to confirm something exists there
    :param output_path: Output directory; created only when a truthy value is given
    :return: True when every check passed, False as soon as one fails
    """
    # Without a usable tesseract there is no point in looking at the paths
    if not check_pre_requisites_tesseract():
        return False

    if check_path(input_path):
        # The output directory is optional; skip creation when none was requested
        if output_path:
            create_directory(output_path)
            logging.debug("Creating Output Path {}".format(output_path))
        return True

    logging.error("Nothing found at `{}`".format(input_path))
    return False
272276
273- # Get valid image files efficiently
274- image_files , other_files = get_valid_image_files (input_path )
275277
276- if len (image_files ) == 0 :
277- logging .error ("No valid image files found at your input location" )
278- logging .error (
279- "Supported formats: [{}]" .format (", " .join (VALID_IMAGE_EXTENSIONS ))
280- )
281- return
def process_directory(input_path, output_path, max_workers):
    """
    Run OCR over every valid image found for a directory input
    :param input_path: Directory handed to get_valid_image_files
    :param output_path: Output directory; when falsy the extracted text is printed instead
    :param max_workers: Worker count forwarded to process_images_parallel
    """
    logging.debug("The Input Path is a directory.")

    # other_files is used as a count below (added to the image count and logged)
    image_files, other_files = get_valid_image_files(input_path)
    image_count = len(image_files)

    # Nothing to do: report the supported formats and stop
    if image_count == 0:
        logging.error("No valid image files found at your input location")
        supported_formats = ", ".join(VALID_IMAGE_EXTENSIONS)
        logging.error("Supported formats: [{}]".format(supported_formats))
        return

    logging.info(
        "Found total {} file(s) ({} valid images, {} other files)\n ".format(
            image_count + other_files, image_count, other_files
        )
    )

    outcome = process_images_parallel(image_files, output_path, max_workers)
    successful_files, failed_files, results = outcome

    # With no output directory, the console is the only place the text goes
    if not output_path:
        for name, extracted in results:
            print(f"\n === {name} ===")
            print(extracted)

    log_processing_results(successful_files, failed_files, other_files)
315+
316+
def process_single_file(input_path, output_path):
    """
    Run OCR on one image file and print any text that comes back
    :param input_path: Path to the image file
    :param output_path: Output directory forwarded to run_tesseract_optimized
    """
    logging.debug(
        "The Input Path is a file {}".format(os.path.basename(input_path))
    )

    # The third element of the result is not used here
    succeeded, extracted_text, _ = run_tesseract_optimized(
        Path(input_path), output_path
    )

    # NOTE(review): unlike process_directory, printing here does not depend on
    # output_path being empty - confirm this is intended.
    if not (succeeded and extracted_text):
        return
    print(extracted_text)
298329
299- logging .info ("Parsing Completed!\n " )
300- logging .info ("Successfully parsed images: {}" .format (successful_files ))
301- if failed_files > 0 :
302- logging .warning ("Failed to parse images: {}" .format (failed_files ))
303- if other_files > 0 :
304- logging .info ("Files with unsupported file extensions: {}" .format (other_files ))
305330
def log_processing_results(successful_files, failed_files, other_files):
    """
    Emit the end-of-run summary through the logging module
    :param successful_files: Count of images parsed successfully (always logged)
    :param failed_files: Count of images that failed; logged as a warning only when > 0
    :param other_files: Count of files with unsupported extensions; logged only when > 0
    """
    completion_banner = "Parsing Completed!\n "
    logging.info(completion_banner)

    success_line = "Successfully parsed images: {}".format(successful_files)
    logging.info(success_line)

    # Failures and skipped files are only worth a line when there are any
    if failed_files > 0:
        failure_line = "Failed to parse images: {}".format(failed_files)
        logging.warning(failure_line)

    if other_files > 0:
        skipped_line = "Files with unsupported file extensions: {}".format(other_files)
        logging.info(skipped_line)
344+
345+
def main(input_path, output_path, max_workers=None):
    """
    Entry point: validate the inputs, then run OCR on a file or a directory
    :param input_path: Path to an input file or directory
    :param output_path: Path to the output directory (may be falsy)
    :param max_workers: Number of parallel workers, used only for directory input
    """
    setup_ok = validate_and_setup(input_path, output_path)
    if not setup_ok:
        return

    # Anything that is not a directory is treated as a single image file
    if not os.path.isdir(input_path):
        process_single_file(input_path, output_path)
        return

    process_directory(input_path, output_path, max_workers)
313362
314363
315364if __name__ == "__main__" :
0 commit comments