Skip to content

Commit eebb575

Browse files
authored
✨ allow rendering a single page from a PDF (#164)
1 parent 224a08c commit eebb575

File tree

3 files changed

+94
-37
lines changed

3 files changed

+94
-37
lines changed

src/main/java/com/mindee/extraction/ImageExtractor.java

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -62,8 +62,7 @@ public int getPageCount() {
6262
}
6363

6464
/**
65-
* Extract images from a list of fields having position data.
66-
* Use this when the input file is a PDF with multiple pages.
65+
* Extract multiple images on a given page from a list of fields having position data.
6766
* @param fields List of Fields to extract.
6867
* @param pageIndex The page index to extract, begins at 0.
6968
* @return A list of {@link ExtractedImage}.
@@ -76,8 +75,7 @@ public <FieldT extends PositionData> List<ExtractedImage> extractImagesFromPage(
7675
}
7776

7877
/**
79-
* Extract images from a list of fields having position data.
80-
* Use this when the input file is a PDF with multiple pages.
78+
* Extract multiple images on a given page from a list of fields having position data.
8179
* @param fields List of Fields to extract.
8280
* @param pageIndex The page index to extract, begins at 0.
8381
* @param outputName The base output filename, must have an image extension.
@@ -118,7 +116,7 @@ private <FieldT extends PositionData> List<ExtractedImage> extractFromPage(
118116
}
119117

120118
/**
121-
* Extract an image from a field having position data.
119+
* Extract a single image from a field having position data.
122120
* @param field The field to extract.
123121
* @param index The index to use for naming the extracted image.
124122
* @param pageIndex The page index to extract, begins at 0.
@@ -145,7 +143,7 @@ public <FieldT extends PositionData> ExtractedImage extractImage(
145143
}
146144

147145
/**
148-
* Extract an image from a field having position data.
146+
* Extract a single image from a field having position data.
149147
* @param field The field to extract.
150148
* @param index The index to use for naming the extracted image.
151149
* @param pageIndex The page index to extract, begins at 0.

src/main/java/com/mindee/pdf/PDFUtils.java

Lines changed: 74 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,6 @@
55
import java.io.ByteArrayOutputStream;
66
import java.io.File;
77
import java.io.IOException;
8-
import java.io.InputStream;
98
import java.util.ArrayList;
109
import java.util.List;
1110
import org.apache.pdfbox.cos.COSName;
@@ -21,26 +20,19 @@
2120
*/
2221
public final class PDFUtils {
2322

24-
private PDFUtils() {
25-
}
23+
private PDFUtils() {}
2624

27-
private static int countPDDocumentPages(PDDocument document) throws IOException {
25+
/**
26+
* Get the number of pages in the PDF.
27+
* @param inputSource The PDF file.
28+
*/
29+
public static int getNumberOfPages(LocalInputSource inputSource) throws IOException {
30+
PDDocument document = PDDocument.load(inputSource.getFile());
2831
int pageCount = document.getNumberOfPages();
2932
document.close();
3033
return pageCount;
3134
}
3235

33-
public static int countPdfPages(InputStream inputStream) throws IOException {
34-
try {
35-
PDDocument document = PDDocument.load(inputStream);
36-
int pageCount = countPDDocumentPages(document);
37-
document.close();
38-
return pageCount;
39-
} finally {
40-
inputStream.close();
41-
}
42-
}
43-
4436
private static byte[] createPdfFromExistingPdf(
4537
PDDocument document,
4638
List<Integer> pageNumbers
@@ -61,6 +53,11 @@ private static byte[] createPdfFromExistingPdf(
6153
return output;
6254
}
6355

56+
/**
57+
* Merge specified PDF pages together.
58+
* @param file The PDF file.
59+
* @param pageNumbers List of page numbers to merge together.
60+
*/
6461
public static byte[] mergePdfPages(
6562
File file,
6663
List<Integer> pageNumbers
@@ -74,7 +71,6 @@ public static boolean isPdfEmpty(File file) throws IOException {
7471
}
7572

7673
private static boolean checkIfPdfIsEmpty(PDDocument document) throws IOException {
77-
7874
boolean isEmpty = true;
7975
for (PDPage page : document.getPages()) {
8076
PDResources resources = page.getResources();
@@ -97,29 +93,80 @@ private static boolean checkIfPdfIsEmpty(PDDocument document) throws IOException
9793
return isEmpty;
9894
}
9995

96+
/**
97+
* Render all pages of a PDF as images.
98+
* Converting PDFs with hundreds of pages may result in a heap space error.
99+
* @param filePath The path to the PDF file.
100+
* @return List of all pages as images.
101+
*/
100102
public static List<PdfPageImage> pdfToImages(String filePath) throws IOException {
101103
return pdfToImages(new LocalInputSource(filePath));
102104
}
103105

106+
/**
107+
* Render all pages of a PDF as images.
108+
* Converting PDFs with hundreds of pages may result in a heap space error.
109+
* @param source The PDF file.
110+
* @return List of all pages as images.
111+
*/
104112
public static List<PdfPageImage> pdfToImages(LocalInputSource source) throws IOException {
105113
PDDocument document = PDDocument.load(source.getFile());
106114
PDFRenderer pdfRenderer = new PDFRenderer(document);
107115
List<PdfPageImage> pdfPageImages = new ArrayList<>();
108116
for (int i = 0; i < document.getNumberOfPages(); i++) {
109-
PDRectangle bbox = document.getPage(i).getBBox();
110-
float dimension = bbox.getWidth() * bbox.getHeight();
111-
int dpi;
112-
if (dimension < 200000) {
113-
dpi = 300;
114-
} else if (dimension < 300000) {
115-
dpi = 250;
116-
} else {
117-
dpi = 200;
118-
}
119-
BufferedImage imageBuffer = pdfRenderer.renderImageWithDPI(i, dpi, ImageType.RGB);
117+
BufferedImage imageBuffer = pdfPageToImageBuffer(i, document, pdfRenderer);
120118
pdfPageImages.add(new PdfPageImage(imageBuffer, i, source.getFilename(), "jpg"));
121119
}
122120
document.close();
123121
return pdfPageImages;
124122
}
123+
124+
/**
125+
* Render a single page of a PDF as an image.
126+
* Main use case is for processing PDFs with hundreds of pages.
127+
* If you only need to render some pages of the PDF, use <code>mergePdfPages</code> and then <code>pdfToImages</code>.
128+
* @param filePath The path to the PDF file.
129+
* @param pageNumber The page number to render, first page is 1.
130+
* @return The page as an image.
131+
*/
132+
public static PdfPageImage pdfPageToImage(String filePath, int pageNumber) throws IOException {
133+
return pdfPageToImage(new LocalInputSource(filePath), pageNumber);
134+
}
135+
136+
/**
137+
* Render a single page of a PDF as an image.
138+
* Main use case is for processing PDFs with hundreds of pages.
139+
* If you only need to render some pages of the PDF, use <code>mergePdfPages</code> and then <code>pdfToImages</code>.
140+
* @param source The PDF file.
141+
* @param pageNumber The page number to render, first page is 1.
142+
* @return The page as an image.
143+
*/
144+
public static PdfPageImage pdfPageToImage(
145+
LocalInputSource source,
146+
int pageNumber
147+
) throws IOException {
148+
int index = pageNumber - 1;
149+
PDDocument document = PDDocument.load(source.getFile());
150+
PDFRenderer pdfRenderer = new PDFRenderer(document);
151+
BufferedImage imageBuffer = pdfPageToImageBuffer(index, document, pdfRenderer);
152+
return new PdfPageImage(imageBuffer, index, source.getFilename(), "jpg");
153+
}
154+
155+
private static BufferedImage pdfPageToImageBuffer(
156+
int index,
157+
PDDocument document,
158+
PDFRenderer pdfRenderer
159+
) throws IOException {
160+
PDRectangle bbox = document.getPage(index).getBBox();
161+
float dimension = bbox.getWidth() * bbox.getHeight();
162+
int dpi;
163+
if (dimension < 200000) {
164+
dpi = 300;
165+
} else if (dimension < 300000) {
166+
dpi = 250;
167+
} else {
168+
dpi = 200;
169+
}
170+
return pdfRenderer.renderImageWithDPI(index, dpi, ImageType.RGB);
171+
}
125172
}

src/test/java/com/mindee/pdf/PDFUtilsTest.java

Lines changed: 16 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
package com.mindee.pdf;
22

3+
import com.mindee.input.LocalInputSource;
34
import java.io.File;
4-
import java.io.FileInputStream;
55
import java.io.IOException;
66
import java.nio.file.Files;
77
import java.nio.file.Path;
@@ -29,7 +29,8 @@ public void givenADocument_whenPageCounted_thenReturnsCorrectPageCount() throws
2929
document.save("src/test/resources/output/test.pdf");
3030
document.close();
3131
File file = new File("src/test/resources/output/test.pdf");
32-
Assertions.assertEquals(random, PDFUtils.countPdfPages(new FileInputStream(file)));
32+
LocalInputSource source = new LocalInputSource(file);
33+
Assertions.assertEquals(random, PDFUtils.getNumberOfPages(source));
3334
file.delete();
3435
}
3536

@@ -71,14 +72,25 @@ public void givenAnEmptyDocument_whenEmptyChecked_shouldReturnTrue() throws IOEx
7172
}
7273

7374
@Test
74-
public void shouldConvertToJpg() throws IOException {
75+
public void shouldConvertAllPagesToJpg() throws IOException {
7576
List<PdfPageImage> pdfPageImages = PDFUtils.pdfToImages(
76-
"src/test/resources/file_types/pdf/not_blank_image_only.pdf"
77+
"src/test/resources/file_types/pdf/multipage_cut-2.pdf"
7778
);
7879
for (PdfPageImage pdfPageImage : pdfPageImages) {
7980
Assertions.assertNotNull(pdfPageImage.getImage());
8081
Assertions.assertEquals(pdfPageImage.asInputSource().getFilename(), pdfPageImage.getFilename());
8182
pdfPageImage.writeToFile("src/test/resources/output/");
83+
Assertions.assertTrue(Files.exists(Paths.get("src/test/resources/output/" + pdfPageImage.getFilename())));
8284
}
8385
}
86+
87+
@Test
88+
public void shouldConvertSinglePageToJpg() throws IOException {
89+
LocalInputSource source = new LocalInputSource("src/test/resources/file_types/pdf/multipage.pdf");
90+
PdfPageImage pdfPageImage = PDFUtils.pdfPageToImage(source, 3);
91+
Assertions.assertNotNull(pdfPageImage.getImage());
92+
Assertions.assertEquals(pdfPageImage.asInputSource().getFilename(), pdfPageImage.getFilename());
93+
pdfPageImage.writeToFile("src/test/resources/output/");
94+
Assertions.assertTrue(Files.exists(Paths.get("src/test/resources/output/" + pdfPageImage.getFilename())));
95+
}
8496
}

0 commit comments

Comments
 (0)