Skip to content

Commit c3e53cc

Browse files
authored
♻️ use an interface to define PDF rasterization (#324)
1 parent 660460e commit c3e53cc

8 files changed

Lines changed: 93 additions & 52 deletions

File tree

src/main/java/com/mindee/image/ImageExtractor.java

Lines changed: 9 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -10,11 +10,6 @@
1010
import java.util.ArrayList;
1111
import java.util.List;
1212
import javax.imageio.ImageIO;
13-
import org.apache.pdfbox.Loader;
14-
import org.apache.pdfbox.pdmodel.PDDocument;
15-
import org.apache.pdfbox.pdmodel.common.PDRectangle;
16-
import org.apache.pdfbox.rendering.ImageType;
17-
import org.apache.pdfbox.rendering.PDFRenderer;
1813

1914
/**
2015
* Extract sub-images from an image.
@@ -30,7 +25,7 @@ public ImageExtractor(LocalInputSource source) throws IOException {
3025

3126
if (source.isPDF()) {
3227
this.saveFormat = "jpg";
33-
var pdfPageImages = pdfToImages(source.getFile(), source.getFilename());
28+
var pdfPageImages = getPDFRasterizer().PDFToImages(source.getFile(), source.getFilename());
3429
for (PDFPageImage pdfPageImage : pdfPageImages) {
3530
this.pageImages.add(pdfPageImage.getImage());
3631
}
@@ -45,34 +40,14 @@ public ImageExtractor(LocalInputSource source) throws IOException {
4540
}
4641
}
4742

48-
private List<PDFPageImage> pdfToImages(byte[] fileBytes, String filename) throws IOException {
49-
PDDocument document = Loader.loadPDF(fileBytes);
50-
var pdfRenderer = new PDFRenderer(document);
51-
List<PDFPageImage> pdfPageImages = new ArrayList<>();
52-
for (int i = 0; i < document.getNumberOfPages(); i++) {
53-
var imageBuffer = pdfPageToImageBuffer(i, document, pdfRenderer);
54-
pdfPageImages.add(new PDFPageImage(imageBuffer, i, filename, "jpg"));
55-
}
56-
document.close();
57-
return pdfPageImages;
58-
}
59-
60-
private BufferedImage pdfPageToImageBuffer(
61-
int index,
62-
PDDocument document,
63-
PDFRenderer pdfRenderer
64-
) throws IOException {
65-
PDRectangle bbox = document.getPage(index).getBBox();
66-
float dimension = bbox.getWidth() * bbox.getHeight();
67-
int dpi;
68-
if (dimension < 200000) {
69-
dpi = 300;
70-
} else if (dimension < 300000) {
71-
dpi = 250;
72-
} else {
73-
dpi = 200;
74-
}
75-
return pdfRenderer.renderImageWithDPI(index, dpi, ImageType.RGB);
43+
/**
44+
* Get the PDF rasterization implementation.
45+
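* Override this method to provide custom PDF rasterization handling. It is called from the constructor, so an override must not rely on subclass fields that are initialized later.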
46+
*
47+
* @return The PDF rasterization implementation.
48+
*/
49+
protected PDFRasterization getPDFRasterizer() {
50+
return new PDFRasterizer();
7651
}
7752

7853
/**
Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
package com.mindee.image;
2+
3+
import java.io.IOException;
4+
import java.util.List;
5+
6+
/**
7+
* Rasterize a PDF into images.
8+
*/
9+
public interface PDFRasterization {
10+
/**
11+
* Rasterize a PDF into a list of images, one image per page.
12+
*/
13+
List<PDFPageImage> PDFToImages(byte[] fileBytes, String filename) throws IOException;
14+
}
Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
package com.mindee.image;
2+
3+
import java.awt.image.BufferedImage;
4+
import java.io.IOException;
5+
import java.util.ArrayList;
6+
import java.util.List;
7+
import org.apache.pdfbox.Loader;
8+
import org.apache.pdfbox.pdmodel.PDDocument;
9+
import org.apache.pdfbox.pdmodel.common.PDRectangle;
10+
import org.apache.pdfbox.rendering.ImageType;
11+
import org.apache.pdfbox.rendering.PDFRenderer;
12+
13+
/**
14+
* Default PDF rasterization implementation.
15+
*/
16+
public class PDFRasterizer implements PDFRasterization {
17+
public List<PDFPageImage> PDFToImages(byte[] fileBytes, String filename) throws IOException {
18+
PDDocument document = Loader.loadPDF(fileBytes);
19+
var pdfRenderer = new PDFRenderer(document);
20+
List<PDFPageImage> pdfPageImages = new ArrayList<>();
21+
for (int i = 0; i < document.getNumberOfPages(); i++) {
22+
var imageBuffer = pdfPageToImageBuffer(i, document, pdfRenderer);
23+
pdfPageImages.add(new PDFPageImage(imageBuffer, i, filename, "jpg"));
24+
}
25+
document.close();
26+
return pdfPageImages;
27+
}
28+
29+
private BufferedImage pdfPageToImageBuffer(
30+
int index,
31+
PDDocument document,
32+
PDFRenderer pdfRenderer
33+
) throws IOException {
34+
PDRectangle bbox = document.getPage(index).getBBox();
35+
float dimension = bbox.getWidth() * bbox.getHeight();
36+
int dpi;
37+
if (dimension < 200000) {
38+
dpi = 300;
39+
} else if (dimension < 300000) {
40+
dpi = 250;
41+
} else {
42+
dpi = 200;
43+
}
44+
return pdfRenderer.renderImageWithDPI(index, dpi, ImageType.RGB);
45+
}
46+
}

src/main/java/com/mindee/input/LocalInputSource.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,7 @@ public LocalInputSource(String fileAsBase64, String filename) {
6161

6262
/**
6363
* Get the PDFInputOperation instance.
64-
* Override this method to provide custom PDF input operation handling.
64+
* Override this method to provide custom PDF input handling.
6565
*
6666
* @return PDFInputOperation instance
6767
*/

src/main/java/com/mindee/pdf/BasePDFExtractor.java

Lines changed: 13 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,19 @@ public ExtractedPDFs extractSubDocuments(List<List<Integer>> pageIndexes) throws
8282
return extractedPDFs;
8383
}
8484

85+
/**
86+
* Make a nice filename for the split.
87+
*/
88+
protected String makeFilename(List<Integer> pageNumbers) {
89+
String[] splitName = InputSourceUtils.splitNameStrict(filename);
90+
return splitName[0]
91+
+ String.format("_%3s", pageNumbers.get(0)).replace(" ", "0")
92+
+ "-"
93+
+ String.format("%3s", pageNumbers.get(pageNumbers.size() - 1)).replace(" ", "0")
94+
+ "."
95+
+ splitName[1];
96+
}
97+
8598
/**
8699
* Converts an array to a buffered image.
87100
*
@@ -95,19 +108,6 @@ private static BufferedImage byteArrayToBufferedImage(byte[] byteArray) throws I
95108
}
96109
}
97110

98-
/**
99-
* Make a nice filename for the split.
100-
*/
101-
private String makeFilename(List<Integer> pageNumbers) {
102-
String[] splitName = InputSourceUtils.splitNameStrict(filename);
103-
return splitName[0]
104-
+ String.format("_%3s", pageNumbers.get(0)).replace(" ", "0")
105-
+ "-"
106-
+ String.format("%3s", pageNumbers.get(pageNumbers.size() - 1)).replace(" ", "0")
107-
+ "."
108-
+ splitName[1];
109-
}
110-
111111
private static PDPage clonePage(PDPage page) {
112112

113113
COSDictionary pageDict = page.getCOSObject();

src/main/java/com/mindee/pdf/PDFCompression.java

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,23 +2,26 @@
22

33
import java.io.IOException;
44

5+
/**
6+
* Compress a PDF.
7+
*/
58
public interface PDFCompression {
69
byte[] compressPDF(
710
byte[] fileBytes,
8-
Integer imageQuality,
11+
int imageQuality,
912
Boolean forceSourceTextCompression,
1013
Boolean disableSourceText
1114
) throws IOException;
1215

1316
default byte[] compressPDF(
1417
byte[] fileBytes,
15-
Integer imageQuality,
18+
int imageQuality,
1619
Boolean forceSourceTextCompression
1720
) throws IOException {
1821
return compressPDF(fileBytes, imageQuality, forceSourceTextCompression, true);
1922
}
2023

21-
default byte[] compressPDF(byte[] fileBytes, Integer imageQuality) throws IOException {
24+
default byte[] compressPDF(byte[] fileBytes, int imageQuality) throws IOException {
2225
return compressPDF(fileBytes, imageQuality, false, true);
2326
}
2427

src/main/java/com/mindee/pdf/PDFCompressor.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ public PDFCompressor() {
3636
@Override
3737
public byte[] compressPDF(
3838
byte[] fileBytes,
39-
Integer imageQuality,
39+
int imageQuality,
4040
Boolean forceSourceTextCompression,
4141
Boolean disableSourceText
4242
) throws IOException {

src/main/java/com/mindee/pdf/PDFInputOperation.java

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,9 @@
33
import com.mindee.input.PageOptions;
44
import java.io.IOException;
55

6+
/**
7+
* Various operations required for PDF input files.
8+
*/
69
public interface PDFInputOperation {
710

811
/**

0 commit comments

Comments
 (0)