✨ allow rendering a single page from a PDF (#164)

ianardee · web-flow · commit eebb575bd075 · 2024-03-26T10:12:08.000+01:00
diff --git a/src/main/java/com/mindee/extraction/ImageExtractor.java b/src/main/java/com/mindee/extraction/ImageExtractor.java
@@ -62,8 +62,7 @@ public int getPageCount() {
   }
 
   /**
-   * Extract images from a list of fields having position data.
-   * Use this when the input file is a PDF with multiple pages.
+   * Extract multiple images on a given page from a list of fields having position data.
    * @param fields List of Fields to extract.
    * @param pageIndex The page index to extract, begins at 0.
    * @return A list of {@link ExtractedImage}.
@@ -76,8 +75,7 @@ public <FieldT extends PositionData> List<ExtractedImage> extractImagesFromPage(
   }
 
   /**
-   * Extract images from a list of fields having position data.
-   * Use this when the input file is a PDF with multiple pages.
+   * Extract multiple images on a given page from a list of fields having position data.
    * @param fields List of Fields to extract.
    * @param pageIndex The page index to extract, begins at 0.
    * @param outputName The base output filename, must have an image extension.
@@ -118,7 +116,7 @@ private <FieldT extends PositionData> List<ExtractedImage> extractFromPage(
   }
 
   /**
-   * Extract an image from a field having position data.
+   * Extract a single image from a field having position data.
    * @param field The field to extract.
    * @param index The index to use for naming the extracted image.
    * @param pageIndex The page index to extract, begins at 0.
@@ -145,7 +143,7 @@ public <FieldT extends PositionData> ExtractedImage extractImage(
   }
 
   /**
-   * Extract an image from a field having position data.
+   * Extract a single image from a field having position data.
    * @param field The field to extract.
    * @param index The index to use for naming the extracted image.
    * @param pageIndex The page index to extract, begins at 0.
diff --git a/src/main/java/com/mindee/pdf/PDFUtils.java b/src/main/java/com/mindee/pdf/PDFUtils.java
@@ -5,7 +5,6 @@
 import java.io.ByteArrayOutputStream;
 import java.io.File;
 import java.io.IOException;
-import java.io.InputStream;
 import java.util.ArrayList;
 import java.util.List;
 import org.apache.pdfbox.cos.COSName;
@@ -21,26 +20,19 @@
  */
 public final class PDFUtils {
 
-  private PDFUtils() {
-  }
+  private PDFUtils() {}
 
-  private static int countPDDocumentPages(PDDocument document) throws IOException {
+  /**
+   * Get the number of pages in the PDF.
+   * @param inputSource The PDF file.
+   */
+  public static int getNumberOfPages(LocalInputSource inputSource) throws IOException {
+    PDDocument document = PDDocument.load(inputSource.getFile());
     int pageCount = document.getNumberOfPages();
     document.close();
     return pageCount;
   }
 
-  public static int countPdfPages(InputStream inputStream) throws IOException {
-    try {
-      PDDocument document = PDDocument.load(inputStream);
-      int pageCount = countPDDocumentPages(document);
-      document.close();
-      return pageCount;
-    } finally {
-      inputStream.close();
-    }
-  }
-
   private static byte[] createPdfFromExistingPdf(
       PDDocument document,
       List<Integer> pageNumbers
@@ -61,6 +53,11 @@ private static byte[] createPdfFromExistingPdf(
     return output;
   }
 
+  /**
+   * Merge specified PDF pages together.
+   * @param file The PDF file.
+   * @param pageNumbers List of page numbers to merge together.
+   */
   public static byte[] mergePdfPages(
       File file,
       List<Integer> pageNumbers
@@ -74,7 +71,6 @@ public static boolean isPdfEmpty(File file) throws IOException {
   }
 
   private static boolean checkIfPdfIsEmpty(PDDocument document) throws IOException {
-
     boolean isEmpty = true;
     for (PDPage page : document.getPages()) {
       PDResources resources = page.getResources();
@@ -97,29 +93,80 @@ private static boolean checkIfPdfIsEmpty(PDDocument document) throws IOException
     return isEmpty;
   }
 
+  /**
+   * Render all pages of a PDF as images.
+   * Converting PDFs with hundreds of pages may result in a heap space error.
+   * @param filePath The path to the PDF file.
+   * @return List of all pages as images.
+   */
   public static List<PdfPageImage> pdfToImages(String filePath) throws IOException {
     return pdfToImages(new LocalInputSource(filePath));
   }
 
+  /**
+   * Render all pages of a PDF as images.
+   * Converting PDFs with hundreds of pages may result in a heap space error.
+   * @param source The PDF file.
+   * @return List of all pages as images.
+   */
   public static List<PdfPageImage> pdfToImages(LocalInputSource source) throws IOException {
     PDDocument document = PDDocument.load(source.getFile());
     PDFRenderer pdfRenderer = new PDFRenderer(document);
     List<PdfPageImage> pdfPageImages = new ArrayList<>();
     for (int i = 0; i < document.getNumberOfPages(); i++) {
-      PDRectangle bbox = document.getPage(i).getBBox();
-      float dimension = bbox.getWidth() * bbox.getHeight();
-      int dpi;
-      if (dimension < 200000) {
-        dpi = 300;
-      } else if (dimension < 300000) {
-        dpi = 250;
-      } else {
-        dpi = 200;
-      }
-      BufferedImage imageBuffer = pdfRenderer.renderImageWithDPI(i, dpi, ImageType.RGB);
+      BufferedImage imageBuffer = pdfPageToImageBuffer(i, document, pdfRenderer);
       pdfPageImages.add(new PdfPageImage(imageBuffer, i, source.getFilename(), "jpg"));
     }
     document.close();
     return pdfPageImages;
   }
+
+  /**
+   * Render a single page of a PDF as an image.
+   * Main use case is for processing PDFs with hundreds of pages.
+   * If you need to only render some pages from the PDF, use <code>mergePdfPages</code> and then <code>pdfToImages</code>.
+   * @param filePath The path to the PDF file.
+   * @param pageNumber The page number to render, first page is 1.
+   * @return The page as an image.
+   */
+  public static PdfPageImage pdfPageToImage(String filePath, int pageNumber) throws IOException {
+    return pdfPageToImage(new LocalInputSource(filePath), pageNumber);
+  }
+
+  /**
+   * Render a single page of a PDF as an image, mainly for processing PDFs with hundreds of pages.
+   * If you need to only render some pages from the PDF, use <code>mergePdfPages</code> and then <code>pdfToImages</code>.
+   * @param source The PDF file.
+   * @param pageNumber The page number to render, first page is 1.
+   * @return The page as an image.
+   * @throws IOException If the PDF cannot be loaded or the page cannot be rendered.
+   */
+  public static PdfPageImage pdfPageToImage(
+      LocalInputSource source,
+      int pageNumber
+  ) throws IOException {
+    int index = pageNumber - 1; // page numbers start at 1, page indexes start at 0
+    try (PDDocument document = PDDocument.load(source.getFile())) {
+      BufferedImage imageBuffer = pdfPageToImageBuffer(index, document, new PDFRenderer(document));
+      return new PdfPageImage(imageBuffer, index, source.getFilename(), "jpg");
+    }
+  }
+
+  private static BufferedImage pdfPageToImageBuffer(
+      int index, // index of the page to render, as passed to PDDocument.getPage
+      PDDocument document,
+      PDFRenderer pdfRenderer
+  ) throws IOException {
+    PDRectangle bbox = document.getPage(index).getBBox();
+    float dimension = bbox.getWidth() * bbox.getHeight(); // area of the page bounding box
+    int dpi; // the larger the page area, the lower the rendering DPI
+    if (dimension < 200000) {
+      dpi = 300;
+    } else if (dimension < 300000) {
+      dpi = 250;
+    } else {
+      dpi = 200;
+    }
+    return pdfRenderer.renderImageWithDPI(index, dpi, ImageType.RGB);
+  }
 }
diff --git a/src/test/java/com/mindee/pdf/PDFUtilsTest.java b/src/test/java/com/mindee/pdf/PDFUtilsTest.java
@@ -1,7 +1,7 @@
 package com.mindee.pdf;
 
+import com.mindee.input.LocalInputSource;
 import java.io.File;
-import java.io.FileInputStream;
 import java.io.IOException;
 import java.nio.file.Files;
 import java.nio.file.Path;
@@ -29,7 +29,8 @@ public void givenADocument_whenPageCounted_thenReturnsCorrectPageCount() throws
     document.save("src/test/resources/output/test.pdf");
     document.close();
     File file = new File("src/test/resources/output/test.pdf");
-    Assertions.assertEquals(random, PDFUtils.countPdfPages(new FileInputStream(file)));
+    LocalInputSource source = new LocalInputSource(file);
+    Assertions.assertEquals(random, PDFUtils.getNumberOfPages(source));
     file.delete();
   }
 
@@ -71,14 +72,25 @@ public void givenAnEmptyDocument_whenEmptyChecked_shouldReturnTrue() throws IOEx
   }
 
   @Test
-  public void shouldConvertToJpg() throws IOException {
+  public void shouldConvertAllPagesToJpg() throws IOException {
     List<PdfPageImage> pdfPageImages = PDFUtils.pdfToImages(
-      "src/test/resources/file_types/pdf/not_blank_image_only.pdf"
+      "src/test/resources/file_types/pdf/multipage_cut-2.pdf"
     );
     for (PdfPageImage pdfPageImage : pdfPageImages) {
       Assertions.assertNotNull(pdfPageImage.getImage());
       Assertions.assertEquals(pdfPageImage.asInputSource().getFilename(), pdfPageImage.getFilename());
       pdfPageImage.writeToFile("src/test/resources/output/");
+      Assertions.assertTrue(Files.exists(Paths.get("src/test/resources/output/" + pdfPageImage.getFilename())));
     }
   }
+
+  @Test
+  public void shouldConvertSinglePageToJpg() throws IOException {
+    LocalInputSource source = new LocalInputSource("src/test/resources/file_types/pdf/multipage.pdf");
+    PdfPageImage pdfPageImage = PDFUtils.pdfPageToImage(source, 3); // page number, first page is 1
+    Assertions.assertNotNull(pdfPageImage.getImage());
+    Assertions.assertEquals(pdfPageImage.asInputSource().getFilename(), pdfPageImage.getFilename());
+    pdfPageImage.writeToFile("src/test/resources/output/");
+    Assertions.assertTrue(Files.exists(Paths.get("src/test/resources/output/" + pdfPageImage.getFilename())));
+  }
 }