19
19
public class OcrService {
20
20
private static final Logger LOGGER = LoggerFactory .getLogger (OcrService .class );
21
21
private static final String JNA_LIBRARY_PATH = "jna.library.path" ;
22
+ private static final String TESSDATA_PREFIX = "TESSDATA_PREFIX" ;
23
+
22
24
// The OCR engine instance
23
25
private final Tesseract tesseract ;
24
26
25
27
/**
 * Constructs a new OcrService with default settings.
 * Currently uses Tesseract with English language support.
 *
 * <p>The JNA library path is adjusted first, then the engine is created and
 * pointed at a tessdata location.
 *
 * @throws OcrException if no tessdata directory can be found, or if the OCR
 *                      engine fails to initialize for any other reason
 */
public OcrService() throws OcrException {
    configureLibraryPath();

    try {
        this.tesseract = new Tesseract();
        tesseract.setLanguage("eng");
        configureTessdata();
        LOGGER.debug("Initialized OcrService with Tesseract");
    } catch (OcrException e) {
        // Already carries a specific, actionable message (e.g. missing tessdata);
        // re-wrapping it would hide that message behind a generic one.
        throw e;
    } catch (RuntimeException e) {
        throw new OcrException("Failed to initialize OCR engine", e);
    }
}
43
+
44
+ private void configureLibraryPath () {
30
45
if (Platform .isMac ()) {
46
+ String originalPath = System .getProperty (JNA_LIBRARY_PATH , "" );
31
47
if (Platform .isARM ()) {
32
- System .setProperty (JNA_LIBRARY_PATH , JNA_LIBRARY_PATH + File .pathSeparator + "/opt/homebrew/lib/" );
48
+ System .setProperty (JNA_LIBRARY_PATH ,
49
+ originalPath + File .pathSeparator + "/opt/homebrew/lib/" );
50
+ } else {
51
+ System .setProperty (JNA_LIBRARY_PATH ,
52
+ originalPath + File .pathSeparator + "/usr/local/cellar/" );
53
+ }
54
+ }
55
+ }
56
+
57
+ private void configureTessdata () throws OcrException {
58
+ // First, check environment variable
59
+ String tessdataPath = System .getenv (TESSDATA_PREFIX );
60
+
61
+ if (tessdataPath != null && !tessdataPath .isEmpty ()) {
62
+ File tessdataDir = new File (tessdataPath );
63
+ if (tessdataDir .exists () && tessdataDir .isDirectory ()) {
64
+ // Tesseract expects the parent directory of tessdata
65
+ if (tessdataDir .getName ().equals ("tessdata" )) {
66
+ tesseract .setDatapath (tessdataDir .getParent ());
67
+ } else {
68
+ tesseract .setDatapath (tessdataPath );
69
+ }
70
+ LOGGER .info ("Using tessdata from environment variable: {}" , tessdataPath );
71
+ return ;
33
72
} else {
34
- System . setProperty ( JNA_LIBRARY_PATH , JNA_LIBRARY_PATH + File . pathSeparator + "/usr/local/cellar/" );
73
+ LOGGER . warn ( "TESSDATA_PREFIX points to non-existent directory: {}" , tessdataPath );
35
74
}
36
75
}
37
- this .tesseract = new Tesseract ();
38
76
39
- // Configure Tesseract
40
- tesseract .setLanguage ("eng" );
77
+ // Fall back to system locations
78
+ String systemPath = findSystemTessdata ();
79
+ if (systemPath != null ) {
80
+ tesseract .setDatapath (systemPath );
81
+ LOGGER .info ("Using system tessdata at: {}" , systemPath );
82
+ } else {
83
+ throw new OcrException ("Could not find tessdata directory. Please set TESSDATA_PREFIX environment variable." );
84
+ }
85
+ }
86
+
87
+ private String findSystemTessdata () {
88
+ String [] possiblePaths = {
89
+ "/usr/local/share" , // Homebrew Intel
90
+ "/opt/homebrew/share" , // Homebrew ARM
91
+ "/usr/share" // System
92
+ };
41
93
42
- // TODO: This path needs to be configurable and bundled properly
43
- // For now, we'll use a relative path that works during development
44
- tesseract .setDatapath ("tessdata" );
94
+ for (String path : possiblePaths ) {
95
+ File tessdata = new File (path , "tessdata" );
96
+ File engData = new File (tessdata , "eng.traineddata" );
97
+ if (tessdata .exists () && engData .exists ()) {
98
+ return path ; // Return parent of tessdata
99
+ }
100
+ }
45
101
46
- LOGGER . debug ( "Initialized OcrService with Tesseract" ) ;
102
+ return null ;
47
103
}
48
104
49
105
/**
@@ -53,35 +109,35 @@ public OcrService() {
53
109
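* @return an {@code OcrResult} holding the extracted text (an empty string if no text was found),
*         or a failure result with a message when the input is unusable or OCR processing fails;
*         failures are reported through the result and no OcrException is thrown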
55
111
*/
56
- public String performOcr (Path pdfPath ) throws OcrException {
57
- // Validate input
112
+ public OcrResult performOcr (Path pdfPath ) {
113
+ // User error - not an exception
58
114
if (pdfPath == null ) {
59
- throw new OcrException ("PDF path cannot be null" );
115
+ LOGGER .warn ("PDF path is null" );
116
+ return OcrResult .failure ("No file path provided" );
60
117
}
61
118
62
119
File pdfFile = pdfPath .toFile ();
120
+
121
+ // User error - not an exception
63
122
if (!pdfFile .exists ()) {
64
- throw new OcrException ("PDF file does not exist: " + pdfPath );
123
+ LOGGER .warn ("PDF file does not exist: {}" , pdfPath );
124
+ return OcrResult .failure ("File does not exist: " + pdfPath .getFileName ());
65
125
}
66
126
67
127
try {
68
128
LOGGER .info ("Starting OCR for file: {}" , pdfFile .getName ());
69
129
70
- // Perform OCR
71
130
String result = tesseract .doOCR (pdfFile );
72
-
73
- // Clean up the result (remove extra whitespace, etc.)
74
131
result = StringUtil .isBlank (result ) ? "" : result .trim ();
75
132
76
133
LOGGER .info ("OCR completed successfully. Extracted {} characters" , result .length ());
77
- return result ;
78
- } catch (
79
- TesseractException e ) {
80
- LOGGER .error ("OCR failed for file: {}" , pdfFile .getName (), e );
81
- throw new OcrException (
82
- "Failed to perform OCR on file: " + pdfFile .getName () +
83
- ". Error: " + e .getMessage (), e
84
- );
134
+ return OcrResult .success (result );
135
+
136
+ } catch (TesseractException e ) {
137
+ // This could be either a user error (corrupt PDF) or our bug
138
+ // Log it as error but return as failure, not exception
139
+ LOGGER .error ("OCR processing failed for file: {}" , pdfFile .getName (), e );
140
+ return OcrResult .failure ("Failed to extract text from PDF: " + e .getMessage ());
85
141
}
86
142
}
87
143
}
0 commit comments