Skip to content

Commit 8784a59

Browse files
committed
Add logging of page processing progress in PagePdfDocumentReader
1 parent 7f60e03 commit 8784a59

File tree

2 files changed

+21
-2
lines changed

2 files changed

+21
-2
lines changed

document-readers/pdf-reader/src/main/java/org/springframework/ai/reader/pdf/PagePdfDocumentReader.java

Lines changed: 15 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -25,6 +25,8 @@
2525
import org.apache.pdfbox.pdmodel.PDDocument;
2626
import org.apache.pdfbox.pdmodel.PDPage;
2727

28+
import org.slf4j.Logger;
29+
import org.slf4j.LoggerFactory;
2830
import org.springframework.ai.document.Document;
2931
import org.springframework.ai.document.DocumentReader;
3032
import org.springframework.ai.reader.pdf.config.PdfDocumentReaderConfig;
@@ -44,6 +46,8 @@
4446
*/
4547
public class PagePdfDocumentReader implements DocumentReader {
4648

49+
private final Logger logger = LoggerFactory.getLogger(getClass());
50+
4751
private static final String PDF_PAGE_REGION = "pdfPageRegion";
4852

4953
public static final String METADATA_START_PAGE_NUMBER = "page_number";
@@ -99,7 +103,17 @@ public List<Document> get() {
99103

100104
List<String> pageTextGroupList = new ArrayList<>();
101105

106+
int totalPages = this.document.getDocumentCatalog().getPages().getCount();
107+
int logFrequency = totalPages > 10 ? totalPages / 10 : 1; // if less than 10
108+
// pages, print
109+
// each iteration
110+
int counter = 0;
111+
102112
for (PDPage page : this.document.getDocumentCatalog().getPages()) {
113+
if (counter % logFrequency == 0 && counter / logFrequency < 10) {
114+
logger.info("Processing PDF page: {}", (counter + 1));
115+
}
116+
counter++;
103117

104118
pagesPerDocument++;
105119

@@ -139,7 +153,7 @@ public List<Document> get() {
139153
readDocuments.add(toDocument(pageTextGroupList.stream().collect(Collectors.joining()), startPageNumber,
140154
pageNumber));
141155
}
142-
156+
logger.info("Processing {} pages", totalPages);
143157
return readDocuments;
144158

145159
}

document-readers/pdf-reader/src/main/java/org/springframework/ai/reader/pdf/ParagraphPdfDocumentReader.java

Lines changed: 6 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -23,6 +23,8 @@
2323
import org.apache.pdfbox.pdfparser.PDFParser;
2424
import org.apache.pdfbox.pdmodel.PDDocument;
2525

26+
import org.slf4j.Logger;
27+
import org.slf4j.LoggerFactory;
2628
import org.springframework.ai.document.Document;
2729
import org.springframework.ai.document.DocumentReader;
2830
import org.springframework.ai.reader.pdf.config.ParagraphManager;
@@ -46,6 +48,8 @@
4648
*/
4749
public class ParagraphPdfDocumentReader implements DocumentReader {
4850

51+
private final Logger logger = LoggerFactory.getLogger(getClass());
52+
4953
// Constants for metadata keys
5054
private static final String METADATA_START_PAGE = "page_number";
5155

@@ -128,6 +132,7 @@ public List<Document> get() {
128132
List<Document> documents = new ArrayList<>(paragraphs.size());
129133

130134
if (!CollectionUtils.isEmpty(paragraphs)) {
135+
logger.info("Start processing paragraphs from PDF");
131136
Iterator<Paragraph> itr = paragraphs.iterator();
132137

133138
var current = itr.next();
@@ -146,7 +151,7 @@ public List<Document> get() {
146151
}
147152
}
148153
}
149-
154+
logger.info("End processing paragraphs from PDF");
150155
return documents;
151156
}
152157

0 commit comments

Comments
 (0)