-
Notifications
You must be signed in to change notification settings - Fork 4
Expand file tree
/
Copy pathImageExtractor.php
More file actions
328 lines (295 loc) · 10.7 KB
/
Copy pathImageExtractor.php
File metadata and controls
328 lines (295 loc) · 10.7 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
<?php
declare(strict_types=1);
namespace Mindee\Extraction;
use Exception;
use Mindee\Dependency\DependencyChecker;
use Mindee\Error\ErrorCode;
use Mindee\Error\MindeeGeometryException;
use Mindee\Error\MindeeImageException;
use Mindee\Error\MindeePDFException;
use Mindee\Geometry\BBox;
use Mindee\Geometry\BBoxUtils;
use Mindee\Geometry\Point;
use Mindee\Geometry\Polygon;
use Mindee\Input\LocalInputSource;
use Mindee\V1\Parsing\Standard\BaseField;
use Imagick;
use ImagickException;
use function count;
use function sprintf;
/**
 * Extract sub-images from an image.
 *
 * Every page of the input (the single image, or each page of a PDF) is stored as its own
 * Imagick object. Crops are taken from a copy, so the stored pages are not modified by
 * an extraction.
 */
class ImageExtractor
{
    /**
     * @var Imagick[] Array of extracted page images.
     */
    protected array $pageImages = [];

    /**
     * @var string Name of the file.
     */
    protected string $filename;

    /**
     * @var string Format to save the image as.
     */
    protected string $saveFormat;

    /**
     * @var LocalInputSource Local input object used by the ImageExtractor.
     */
    protected LocalInputSource $inputSource;

    /**
     * Loads the pages of the input source.
     *
     * When no save format is given, the extension of the input file name is used, except for
     * PDF inputs and extension-less names, which fall back to 'jpg'.
     *
     * @param LocalInputSource $localInput Local input, accepts all compatible formats.
     * @param null|string      $saveFormat Save format, will be coerced to jpg by default.
     *
     * @throws MindeePDFException   Throws if a non-PDF input can't be read as an image.
     * @throws MindeeImageException Throws if a PDF input can't be converted to images.
     */
    public function __construct(LocalInputSource $localInput, ?string $saveFormat = null)
    {
        DependencyChecker::isImageMagickAvailable();
        DependencyChecker::isGhostscriptAvailable();

        $this->filename = $localInput->fileName;
        $this->inputSource = $localInput;

        if (null !== $saveFormat) {
            $this->saveFormat = $saveFormat;
        } else {
            $extension = pathinfo($localInput->fileName, PATHINFO_EXTENSION);
            $this->saveFormat = ($extension && 'pdf' !== strtolower($extension)) ? $extension : 'jpg';
        }

        if ($this->inputSource->isPDF()) {
            $this->pageImages = static::pdfToImages($this->inputSource->readContents()[1]);

            return;
        }

        try {
            $image = new Imagick();
            $image->readImageBlob($this->inputSource->readContents()[1]);
        } catch (ImagickException $e) {
            throw new MindeePDFException(
                "Image couldn't be processed.",
                ErrorCode::IMAGE_CANT_PROCESS,
                $e
            );
        }
        $this->pageImages[] = $image;
    }

    /**
     * Renders the input PDF's pages as individual images.
     *
     * @param string $fileBytes Input pdf.
     *
     * @return Imagick[] A list of pages, one independent single-image Imagick object per page.
     *
     * @throws MindeeImageException Throws if the image can't be handled.
     */
    public static function pdfToImages(string $fileBytes): array
    {
        try {
            $document = new Imagick();
            $document->readImageBlob($fileBytes);

            $images = [];
            foreach ($document as $page) {
                // Imagick's iterator yields the iterated wand itself with its image pointer moved,
                // not an independent per-page object. Storing $page directly would make every
                // entry alias the same object, so each page is detached with getImage() first.
                $pageImage = $page->getImage();
                $pageImage->setImageFormat('jpg');
                $images[] = $pageImage;
            }

            return $images;
        } catch (ImagickException $e) {
            throw new MindeeImageException(
                "Couldn't convert PDF to images.",
                ErrorCode::FILE_OPERATION_ABORTED,
                $e
            );
        }
    }

    /**
     * Gets the number of pages in the file.
     *
     * @return integer Page count.
     */
    public function getPageCount(): int
    {
        return count($this->pageImages);
    }

    /**
     * Extract multiple images on a given page from a list of fields having position data.
     *
     * The images are produced with the default format of extractFromPage() ('jpg'); the save
     * format chosen at construction time is not applied here.
     *
     * @param array<BaseField<string|float|integer|boolean|Polygon>> $fields     List of Fields to extract.
     * @param integer                                                $pageIndex  The page index to extract, begins at 0.
     * @param null|string                                            $outputName The base output filename, defaults to
     *                                                                           the name of the input file.
     *
     * @return array<ExtractedImage> a list of extracted images
     */
    public function extractImagesFromPage(array $fields, int $pageIndex, ?string $outputName = null): array
    {
        return $this->extractFromPage($fields, $pageIndex, $outputName ?? $this->filename);
    }

    /**
     * Extracts images from a page.
     *
     * Output files are named "<prefix>-<key>.<format>", where <key> is the key of the polygon
     * in the provided array.
     *
     * @param array<Polygon> $polygons       List of polygons to extract. Every entry must be a Polygon instance, as
     *                                       entries are forwarded to extractPolygonFromPage().
     * @param integer        $pageIndex      The page index to extract, begins at 0.
     * @param null|string    $filenamePrefix Output filename prefix. Defaults to the name of the input file.
     * @param null|string    $format         Save format for extracted images. Defaults to the extractor's save format.
     *
     * @return array<ExtractedImage> An array of created images
     * @throws MindeeImageException Throws if the image can't be processed.
     */
    public function extractPolygonsFromPage(
        array $polygons,
        int $pageIndex,
        ?string $filenamePrefix = null,
        ?string $format = null
    ): array {
        $saveFormat = $format ?? $this->saveFormat;
        // Resolved once: the prefix does not depend on the polygon being processed.
        $filenamePrefix ??= $this->filename;

        $extractedImages = [];
        try {
            foreach ($polygons as $i => $polygon) {
                $extractedImages[] = $this->extractPolygonFromPage(
                    $polygon,
                    $pageIndex,
                    $i,
                    sprintf('%s-%d.%s', $filenamePrefix, $i, $saveFormat),
                    $saveFormat
                );
            }
        } catch (Exception $e) {
            throw new MindeeImageException($e->getMessage(), $e->getCode(), $e);
        }

        return $extractedImages;
    }

    /**
     * Extracts a cropped portion from an image.
     *
     * @param Polygon     $polygon   Polygon to extract.
     * @param integer     $pageIndex Page index to extract from.
     * @param integer     $index     Index to use for naming the extracted image.
     * @param null|string $filename  Output filename. Defaults to "<input name>_page<page>-<index>.<format>".
     * @param null|string $format    Output format. Defaults to the extractor's save format.
     *
     * @return ExtractedImage Extracted image data.
     * @throws MindeeImageException Throws if the image can't be processed, or if the page index doesn't exist.
     */
    public function extractPolygonFromPage(
        Polygon $polygon,
        int $pageIndex,
        int $index,
        ?string $filename = null,
        ?string $format = null
    ): ExtractedImage {
        $bbox = BBoxUtils::generateBBoxFromPolygon($polygon);
        try {
            $extractedImageData = $this->extractImageFromBbox($bbox, $pageIndex);
        } catch (ImagickException $e) {
            throw new MindeeImageException($e->getMessage(), $e->getCode(), $e);
        }

        $format ??= $this->saveFormat;
        $filename ??= sprintf('%s_page%d-%d.%s', $this->filename, $pageIndex, $index, $format);

        return new ExtractedImage($extractedImageData, $filename, $format, $pageIndex, $index);
    }

    /**
     * Extracts a single image from a Position field.
     *
     * The position is taken from the first non-empty property among, in order: polygon,
     * boundingBox, quadrangle, rectangle.
     *
     * @param BaseField<string|float|integer|boolean|Polygon> $field     The field to extract.
     * @param integer                                         $pageIndex The page index to extract, begins at 0.
     * @param integer                                         $index     The index to use for naming the extracted image.
     * @param string                                          $filename  The output filename.
     * @param string                                          $format    The output format.
     *
     * @return null|ExtractedImage The extracted image. This implementation never returns null (missing position data
     *                             raises an exception instead); the nullable type is kept for compatibility.
     *
     * @throws MindeeGeometryException Throws if a field does not contain positional data.
     */
    public function extractImage(
        BaseField $field,
        int $pageIndex,
        int $index,
        string $filename,
        string $format
    ): ?ExtractedImage {
        $polygon = null;
        foreach (['polygon', 'boundingBox', 'quadrangle', 'rectangle'] as $property) {
            if (!empty($field->{$property})) {
                $polygon = $field->{$property};
                break;
            }
        }

        if (null === $polygon) {
            throw new MindeeGeometryException(
                'Provided field has no valid position data.',
                ErrorCode::GEOMETRIC_OPERATION_FAILED
            );
        }

        return $this->extractPolygonFromPage($polygon, $pageIndex, $index, $filename, $format);
    }

    /**
     * Getter for the local input source.
     *
     * @return LocalInputSource The input source given to the constructor.
     */
    public function getInputSource(): LocalInputSource
    {
        return $this->inputSource;
    }

    /**
     * Extracts images from a page.
     *
     * Files are named "<outputName>_page<page>-<n>.<format>", where <n> is the zero-based
     * position of the field in the provided list.
     *
     * @param array<BaseField<string|float|integer|boolean|Polygon>> $fields     List of Fields to extract.
     * @param integer                                                $pageIndex  The page index to extract, begins at 0.
     * @param string                                                 $outputName Name of the created file.
     * @param string                                                 $format     The output format.
     *
     * @return array<ExtractedImage> An array of created images
     */
    protected function extractFromPage(array $fields, int $pageIndex, string $outputName, string $format = 'jpg'): array
    {
        $extractedImages = [];
        // array_values() gives the positional counter regardless of the keys of $fields.
        foreach (array_values($fields) as $i => $field) {
            $filename = sprintf('%s_page%d-%d.%s', $outputName, $pageIndex, $i, $format);
            $extractedImage = $this->extractImage($field, $pageIndex, $i, $filename, $format);
            if (null !== $extractedImage) {
                $extractedImages[] = $extractedImage;
            }
        }

        return $extractedImages;
    }

    /**
     * Extracts an image from a set of coordinates.
     *
     * The bounding box values are multiplied by the page width and height to obtain the
     * crop rectangle in pixels.
     *
     * @param BBox          $bbox      BBox coordinates.
     * @param integer|float $pageIndex The page index to extract, begins at 0.
     *
     * @return Imagick A cropped copy of the requested page.
     * @throws ImagickException Throws if the image can't be processed, or if the page index doesn't exist.
     */
    protected function extractImageFromBbox(BBox $bbox, int|float $pageIndex): Imagick
    {
        // Array offsets are integers: cast once rather than relying on implicit float conversion.
        $index = (int) $pageIndex;
        if (!isset($this->pageImages[$index])) {
            // Raised as ImagickException so that the caller reports it like any other image failure
            // instead of crashing on a method call on null.
            throw new ImagickException(
                sprintf('Page index %d is out of range, the input has %d page(s).', $index, count($this->pageImages))
            );
        }

        // The clone keyword replaces the deprecated Imagick::clone() method.
        $image = clone $this->pageImages[$index];
        $width = $image->getImageWidth();
        $height = $image->getImageHeight();

        $left = (int) round($bbox->getMinX() * $width);
        $right = (int) round($bbox->getMaxX() * $width);
        $top = (int) round($bbox->getMinY() * $height);
        $bottom = (int) round($bbox->getMaxY() * $height);

        $image->cropImage($right - $left, $bottom - $top, $left, $top);

        return $image;
    }

    /**
     * Splits the filename into name and extension.
     *
     * @param string $filename Name of the file.
     *
     * @return array{0: string, 1: string} An array containing the name and extension of the file.
     */
    protected static function splitNameStrict(string $filename): array
    {
        return [
            pathinfo($filename, PATHINFO_FILENAME),
            pathinfo($filename, PATHINFO_EXTENSION),
        ];
    }
}