@@ -28,6 +28,7 @@ This file is part of the iText (R) project.
2828import com .itextpdf .styledxmlparser .jsoup .Jsoup ;
2929import com .itextpdf .styledxmlparser .jsoup .nodes .Document ;
3030import com .itextpdf .styledxmlparser .jsoup .nodes .Element ;
31+ import com .itextpdf .styledxmlparser .jsoup .nodes .Node ;
3132import com .itextpdf .styledxmlparser .jsoup .select .Elements ;
3233
3334import java .io .File ;
@@ -60,6 +61,27 @@ public class TesseractHelper {
6061 private static final Logger LOGGER = LoggerFactory
6162 .getLogger (TesseractHelper .class );
6263
64+ /**
65+ * Patterns for matching hOCR element bboxes.
66+ */
67+ private static final Pattern BBOX_PATTERN = Pattern .compile (".*bbox(\\ s+\\ d+){4}.*" );
68+ private static final Pattern BBOX_COORDINATE_PATTERN = Pattern
69+ .compile (
70+ ".*\\ s+(\\ d+)\\ s+(\\ d+)\\ s+(\\ d+)\\ s+(\\ d+).*" );
71+
72+ /**
73+ * Indices in array representing bbox.
74+ */
75+ private static final int LEFT_IDX = 0 ;
76+ private static final int BOTTOM_IDX = 1 ;
77+ private static final int RIGHT_IDX = 2 ;
78+ private static final int TOP_IDX = 3 ;
79+
80+ /**
81+ * Size of the array containing bbox.
82+ */
83+ private static final int BBOX_ARRAY_SIZE = 4 ;
84+
6385 /**
6486 * Creates a new {@link TesseractHelper} instance.
6587 */
@@ -86,23 +108,20 @@ public static Map<Integer, List<TextInfo>> parseHocrFile(
86108 throws IOException {
87109 Map <Integer , List <TextInfo >> imageData =
88110 new LinkedHashMap <Integer , List <TextInfo >>();
111+ Map <String , Node > unparsedBBoxes = new LinkedHashMap <>();
89112
90113 for (File inputFile : inputFiles ) {
91114 if (inputFile != null
92115 && Files .exists (
93- java .nio .file .Paths
94- .get (inputFile .getAbsolutePath ()))) {
116+ java .nio .file .Paths
117+ .get (inputFile .getAbsolutePath ()))) {
95118 FileInputStream fileInputStream =
96119 new FileInputStream (inputFile .getAbsolutePath ());
97120 Document doc = Jsoup .parse (fileInputStream ,
98121 java .nio .charset .StandardCharsets .UTF_8 .name (),
99122 inputFile .getAbsolutePath ());
100123 Elements pages = doc .getElementsByClass ("ocr_page" );
101124
102- Pattern bboxPattern = Pattern .compile (".*bbox(\\ s+\\ d+){4}.*" );
103- Pattern bboxCoordinatePattern = Pattern
104- .compile (
105- ".*\\ s+(\\ d+)\\ s+(\\ d+)\\ s+(\\ d+)\\ s+(\\ d+).*" );
106125 List <String > searchedClasses = TextPositioning .BY_LINES
107126 .equals (textPositioning )
108127 ? Arrays .<String >asList ("ocr_line" , "ocr_caption" )
@@ -124,26 +143,11 @@ public static Map<Integer, List<TextInfo>> parseHocrFile(
124143 }
125144 }
126145 for (Element obj : objects ) {
127- String value = obj .attr ("title" );
128- Matcher bboxMatcher = bboxPattern .matcher (value );
129- if (bboxMatcher .matches ()) {
130- Matcher bboxCoordinateMatcher =
131- bboxCoordinatePattern
132- .matcher (bboxMatcher .group ());
133- if (bboxCoordinateMatcher .matches ()) {
134- List <Float > coordinates =
135- new ArrayList <Float >();
136- for (int i = 0 ; i < 4 ; i ++) {
137- String coord = bboxCoordinateMatcher
138- .group (i + 1 );
139- coordinates
140- .add (Float .parseFloat (coord ));
141- }
142-
143- textData .add (new TextInfo (obj .text (),
144- coordinates ));
145- }
146- }
146+ List <Float > coordinates = getAlignedBBox (obj ,
147+ textPositioning ,
148+ unparsedBBoxes );
149+ textData .add (new TextInfo (obj .text (),
150+ coordinates ));
147151 }
148152 }
149153 if (textData .size () > 0 ) {
@@ -157,9 +161,97 @@ public static Map<Integer, List<TextInfo>> parseHocrFile(
157161 fileInputStream .close ();
158162 }
159163 }
164+ for (Node node : unparsedBBoxes .values ()) {
165+ LOGGER .warn (MessageFormatUtil .format (
166+ Tesseract4LogMessageConstant .CANNOT_PARSE_NODE_BBOX ,
167+ node .toString ()
168+ ));
169+ }
160170 return imageData ;
161171 }
162172
173+ /**
174+ * Get and align (if needed) bbox of the element.
175+ */
176+ static List <Float > getAlignedBBox (Element object ,
177+ TextPositioning textPositioning ,
178+ Map <String , Node > unparsedBBoxes ) {
179+ final List <Float > coordinates = parseBBox (object , unparsedBBoxes );
180+ if (TextPositioning .BY_WORDS_AND_LINES == textPositioning
181+ || TextPositioning .BY_WORDS == textPositioning ) {
182+ Node line = object .parent ();
183+ final List <Float > lineCoordinates = parseBBox (line , unparsedBBoxes );
184+ if (TextPositioning .BY_WORDS_AND_LINES == textPositioning ) {
185+ coordinates .set (BOTTOM_IDX , lineCoordinates .get (BOTTOM_IDX ));
186+ coordinates .set (TOP_IDX , lineCoordinates .get (TOP_IDX ));
187+ }
188+ detectAndFixBrokenBBoxes (object , coordinates ,
189+ lineCoordinates , unparsedBBoxes );
190+ }
191+ return coordinates ;
192+ }
193+
194+ /**
195+ * Parses element bbox.
196+ *
197+ * @param node element containing bbox
198+ * @param unparsedBBoxes list of element ids with bboxes which could not be parsed
199+ * @return parsed bbox
200+ */
201+ static List <Float > parseBBox (Node node , Map <String , Node > unparsedBBoxes ) {
202+ List <Float > bbox = new ArrayList <>();
203+ Matcher bboxMatcher = BBOX_PATTERN .matcher (node .attr ("title" ));
204+ if (bboxMatcher .matches ()) {
205+ Matcher bboxCoordinateMatcher =
206+ BBOX_COORDINATE_PATTERN
207+ .matcher (bboxMatcher .group ());
208+ if (bboxCoordinateMatcher .matches ()) {
209+ for (int i = 0 ; i < BBOX_ARRAY_SIZE ; i ++) {
210+ String coord = bboxCoordinateMatcher
211+ .group (i + 1 );
212+ bbox .add (Float .parseFloat (coord ));
213+ }
214+ }
215+ }
216+ if (bbox .size () == 0 ) {
217+ bbox = Arrays .asList (0f , 0f , 0f , 0f );
218+ String id = node .attr ("id" );
219+ if (id != null && !unparsedBBoxes .containsKey (id )) {
220+ unparsedBBoxes .put (id , node );
221+ }
222+ }
223+ return bbox ;
224+ }
225+
226+ /**
227+ * Sometimes hOCR file contains broke character bboxes which are equal to page bbox.
228+ * This method attempts to detect and fix them.
229+ */
230+ static void detectAndFixBrokenBBoxes (Element object , List <Float > coordinates ,
231+ List <Float > lineCoordinates ,
232+ Map <String , Node > unparsedBBoxes ) {
233+ if (coordinates .get (LEFT_IDX ) < lineCoordinates .get (LEFT_IDX )
234+ || coordinates .get (LEFT_IDX ) > lineCoordinates .get (RIGHT_IDX )) {
235+ if (object .previousElementSibling () == null ) {
236+ coordinates .set (LEFT_IDX , lineCoordinates .get (LEFT_IDX ));
237+ } else {
238+ Element sibling = object .previousElementSibling ();
239+ List <Float > siblingBBox = parseBBox (sibling , unparsedBBoxes );
240+ coordinates .set (LEFT_IDX , siblingBBox .get (RIGHT_IDX ));
241+ }
242+ }
243+ if (coordinates .get (RIGHT_IDX ) > lineCoordinates .get (RIGHT_IDX )
244+ || coordinates .get (RIGHT_IDX ) < lineCoordinates .get (LEFT_IDX )) {
245+ if (object .nextElementSibling () == null ) {
246+ coordinates .set (RIGHT_IDX , lineCoordinates .get (RIGHT_IDX ));
247+ } else {
248+ Element sibling = object .nextElementSibling ();
249+ List <Float > siblingBBox = parseBBox (sibling , unparsedBBoxes );
250+ coordinates .set (RIGHT_IDX , siblingBBox .get (LEFT_IDX ));
251+ }
252+ }
253+ }
254+
163255 /**
164256 * Deletes file using provided path.
165257 *
@@ -208,7 +300,7 @@ static String readTxtFile(final File txtFile) {
208300 * @param data text data in required format as {@link java.lang.String}
209301 */
210302 static void writeToTextFile (final String path ,
211- final String data ) {
303+ final String data ) {
212304 try (Writer writer = new OutputStreamWriter (new FileOutputStream (path ),
213305 StandardCharsets .UTF_8 )) {
214306 writer .write (data );
@@ -228,7 +320,7 @@ static void writeToTextFile(final String path,
228320 * @throws Tesseract4OcrException if provided command failed
229321 */
230322 static void runCommand (final String execPath ,
231- final List <String > paramsList ) throws Tesseract4OcrException {
323+ final List <String > paramsList ) throws Tesseract4OcrException {
232324 try {
233325 String params = String .join (" " , paramsList );
234326 boolean cmdSucceeded = SystemUtil
@@ -251,4 +343,4 @@ static void runCommand(final String execPath,
251343 .TESSERACT_FAILED );
252344 }
253345 }
254- }
346+ }
0 commit comments