1010import java .util .ArrayList ;
1111import java .util .List ;
1212import javax .imageio .ImageIO ;
13- import org .apache .pdfbox .Loader ;
14- import org .apache .pdfbox .pdmodel .PDDocument ;
15- import org .apache .pdfbox .pdmodel .common .PDRectangle ;
16- import org .apache .pdfbox .rendering .ImageType ;
17- import org .apache .pdfbox .rendering .PDFRenderer ;
1813
1914/**
2015 * Extract sub-images from an image.
@@ -30,7 +25,7 @@ public ImageExtractor(LocalInputSource source) throws IOException {
3025
3126 if (source .isPDF ()) {
3227 this .saveFormat = "jpg" ;
33- var pdfPageImages = pdfToImages (source .getFile (), source .getFilename ());
28+ var pdfPageImages = getPDFRasterizer (). PDFToImages (source .getFile (), source .getFilename ());
3429 for (PDFPageImage pdfPageImage : pdfPageImages ) {
3530 this .pageImages .add (pdfPageImage .getImage ());
3631 }
@@ -45,34 +40,14 @@ public ImageExtractor(LocalInputSource source) throws IOException {
4540 }
4641 }
4742
48- private List <PDFPageImage > pdfToImages (byte [] fileBytes , String filename ) throws IOException {
49- PDDocument document = Loader .loadPDF (fileBytes );
50- var pdfRenderer = new PDFRenderer (document );
51- List <PDFPageImage > pdfPageImages = new ArrayList <>();
52- for (int i = 0 ; i < document .getNumberOfPages (); i ++) {
53- var imageBuffer = pdfPageToImageBuffer (i , document , pdfRenderer );
54- pdfPageImages .add (new PDFPageImage (imageBuffer , i , filename , "jpg" ));
55- }
56- document .close ();
57- return pdfPageImages ;
58- }
59-
60- private BufferedImage pdfPageToImageBuffer (
61- int index ,
62- PDDocument document ,
63- PDFRenderer pdfRenderer
64- ) throws IOException {
65- PDRectangle bbox = document .getPage (index ).getBBox ();
66- float dimension = bbox .getWidth () * bbox .getHeight ();
67- int dpi ;
68- if (dimension < 200000 ) {
69- dpi = 300 ;
70- } else if (dimension < 300000 ) {
71- dpi = 250 ;
72- } else {
73- dpi = 200 ;
74- }
75- return pdfRenderer .renderImageWithDPI (index , dpi , ImageType .RGB );
43+ /**
44+ * Get the PDF rasterization implementation.
45+ * Override this method to provide custom PDF rasterization handling.
46+ *
47+ * @return The PDF rasterization implementation.
48+ */
49+ protected PDFRasterization getPDFRasterizer () {
50+ return new PDFRasterizer ();
7651 }
7752
7853 /**
0 commit comments