Skip to content

Commit 5a256ae

Browse files
committed
Adapted Exception handling and configured tessdata variable
1 parent 48ffb06 commit 5a256ae

File tree

2 files changed

+116
-25
lines changed

2 files changed

+116
-25
lines changed
Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
package org.jabref.logic.ocr;
2+
3+
import java.util.Optional;
4+
5+
/**
 * Outcome of a single OCR run: either a success carrying the extracted text,
 * or a failure carrying a human-readable error message.
 * <p>
 * Instances are immutable and can only be created through
 * {@link #success(String)} and {@link #failure(String)}.
 */
public final class OcrResult {
    private final boolean success;
    private final String text;
    private final String errorMessage;

    private OcrResult(boolean success, String text, String errorMessage) {
        this.success = success;
        this.text = text;
        this.errorMessage = errorMessage;
    }

    /**
     * Creates a successful result.
     *
     * @param text the extracted text; {@code null} is treated as the empty string
     *             so that a successful result always exposes a present text value
     * @return a result for which {@link #isSuccess()} is {@code true}
     */
    public static OcrResult success(String text) {
        // Without this normalization a "successful" result could report no text at all,
        // which callers cannot distinguish from a failure when unwrapping getText().
        return new OcrResult(true, text == null ? "" : text, null);
    }

    /**
     * Creates a failed result.
     *
     * @param errorMessage description of what went wrong; may be {@code null},
     *                     in which case {@link #getErrorMessage()} is empty
     * @return a result for which {@link #isSuccess()} is {@code false}
     */
    public static OcrResult failure(String errorMessage) {
        return new OcrResult(false, null, errorMessage);
    }

    /**
     * @return {@code true} if this result was created via {@link #success(String)}
     */
    public boolean isSuccess() {
        return success;
    }

    /**
     * @return the extracted text; always present for a successful result, empty for a failure
     */
    public Optional<String> getText() {
        return Optional.ofNullable(text);
    }

    /**
     * @return the error message of a failed result; empty for a successful result
     */
    public Optional<String> getErrorMessage() {
        return Optional.ofNullable(errorMessage);
    }
}

jablib/src/main/java/org/jabref/logic/ocr/OcrService.java

Lines changed: 81 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -19,31 +19,87 @@
1919
public class OcrService {
2020
private static final Logger LOGGER = LoggerFactory.getLogger(OcrService.class);
2121
private static final String JNA_LIBRARY_PATH = "jna.library.path";
22+
private static final String TESSDATA_PREFIX = "TESSDATA_PREFIX";
23+
2224
// The OCR engine instance
2325
private final Tesseract tesseract;
2426

2527
/**
 * Constructs a new OcrService with default settings.
 * Currently uses Tesseract with English language support.
 * <p>
 * The JNA library path is adjusted first (macOS only), then the Tesseract
 * engine is created and pointed at a tessdata location.
 *
 * @throws OcrException if the engine cannot be created or no tessdata directory can be located
 */
public OcrService() throws OcrException {
    configureLibraryPath();

    try {
        this.tesseract = new Tesseract();
        tesseract.setLanguage("eng");
        configureTessdata();
        LOGGER.debug("Initialized OcrService with Tesseract");
    } catch (OcrException e) {
        // configureTessdata() already reports a specific, actionable message;
        // re-wrapping it below would hide that message behind a generic one.
        throw e;
    } catch (Exception e) {
        throw new OcrException("Failed to initialize OCR engine", e);
    }
}
43+
44+
private void configureLibraryPath() {
3045
if (Platform.isMac()) {
46+
String originalPath = System.getProperty(JNA_LIBRARY_PATH, "");
3147
if (Platform.isARM()) {
32-
System.setProperty(JNA_LIBRARY_PATH, JNA_LIBRARY_PATH + File.pathSeparator + "/opt/homebrew/lib/");
48+
System.setProperty(JNA_LIBRARY_PATH,
49+
originalPath + File.pathSeparator + "/opt/homebrew/lib/");
50+
} else {
51+
System.setProperty(JNA_LIBRARY_PATH,
52+
originalPath + File.pathSeparator + "/usr/local/cellar/");
53+
}
54+
}
55+
}
56+
57+
private void configureTessdata() throws OcrException {
58+
// First, check environment variable
59+
String tessdataPath = System.getenv(TESSDATA_PREFIX);
60+
61+
if (tessdataPath != null && !tessdataPath.isEmpty()) {
62+
File tessdataDir = new File(tessdataPath);
63+
if (tessdataDir.exists() && tessdataDir.isDirectory()) {
64+
// Tesseract expects the parent directory of tessdata
65+
if (tessdataDir.getName().equals("tessdata")) {
66+
tesseract.setDatapath(tessdataDir.getParent());
67+
} else {
68+
tesseract.setDatapath(tessdataPath);
69+
}
70+
LOGGER.info("Using tessdata from environment variable: {}", tessdataPath);
71+
return;
3372
} else {
34-
System.setProperty(JNA_LIBRARY_PATH, JNA_LIBRARY_PATH + File.pathSeparator + "/usr/local/cellar/");
73+
LOGGER.warn("TESSDATA_PREFIX points to non-existent directory: {}", tessdataPath);
3574
}
3675
}
37-
this.tesseract = new Tesseract();
3876

39-
// Configure Tesseract
40-
tesseract.setLanguage("eng");
77+
// Fall back to system locations
78+
String systemPath = findSystemTessdata();
79+
if (systemPath != null) {
80+
tesseract.setDatapath(systemPath);
81+
LOGGER.info("Using system tessdata at: {}", systemPath);
82+
} else {
83+
throw new OcrException("Could not find tessdata directory. Please set TESSDATA_PREFIX environment variable.");
84+
}
85+
}
86+
87+
private String findSystemTessdata() {
88+
String[] possiblePaths = {
89+
"/usr/local/share", // Homebrew Intel
90+
"/opt/homebrew/share", // Homebrew ARM
91+
"/usr/share" // System
92+
};
4193

42-
// TODO: This path needs to be configurable and bundled properly
43-
// For now, we'll use a relative path that works during development
44-
tesseract.setDatapath("tessdata");
94+
for (String path : possiblePaths) {
95+
File tessdata = new File(path, "tessdata");
96+
File engData = new File(tessdata, "eng.traineddata");
97+
if (tessdata.exists() && engData.exists()) {
98+
return path; // Return parent of tessdata
99+
}
100+
}
45101

46-
LOGGER.debug("Initialized OcrService with Tesseract");
102+
return null;
47103
}
48104

49105
/**
@@ -53,35 +109,35 @@ public OcrService() {
53109
* @return an OcrResult holding the extracted text (empty string if no text found) on success, or an error message on failure
54110
*         Failures are reported through the returned result rather than by throwing OcrException.
55111
*/
56-
public String performOcr(Path pdfPath) throws OcrException {
57-
// Validate input
112+
public OcrResult performOcr(Path pdfPath) {
113+
// User error - not an exception
58114
if (pdfPath == null) {
59-
throw new OcrException("PDF path cannot be null");
115+
LOGGER.warn("PDF path is null");
116+
return OcrResult.failure("No file path provided");
60117
}
61118

62119
File pdfFile = pdfPath.toFile();
120+
121+
// User error - not an exception
63122
if (!pdfFile.exists()) {
64-
throw new OcrException("PDF file does not exist: " + pdfPath);
123+
LOGGER.warn("PDF file does not exist: {}", pdfPath);
124+
return OcrResult.failure("File does not exist: " + pdfPath.getFileName());
65125
}
66126

67127
try {
68128
LOGGER.info("Starting OCR for file: {}", pdfFile.getName());
69129

70-
// Perform OCR
71130
String result = tesseract.doOCR(pdfFile);
72-
73-
// Clean up the result (remove extra whitespace, etc.)
74131
result = StringUtil.isBlank(result) ? "" : result.trim();
75132

76133
LOGGER.info("OCR completed successfully. Extracted {} characters", result.length());
77-
return result;
78-
} catch (
79-
TesseractException e) {
80-
LOGGER.error("OCR failed for file: {}", pdfFile.getName(), e);
81-
throw new OcrException(
82-
"Failed to perform OCR on file: " + pdfFile.getName() +
83-
". Error: " + e.getMessage(), e
84-
);
134+
return OcrResult.success(result);
135+
136+
} catch (TesseractException e) {
137+
// This could be either a user error (corrupt PDF) or our bug
138+
// Log it as error but return as failure, not exception
139+
LOGGER.error("OCR processing failed for file: {}", pdfFile.getName(), e);
140+
return OcrResult.failure("Failed to extract text from PDF: " + e.getMessage());
85141
}
86142
}
87143
}

0 commit comments

Comments
 (0)