-
Notifications
You must be signed in to change notification settings - Fork 4
Expand file tree
/
Copy pathPDFExtractor.php
More file actions
205 lines (179 loc) · 6.74 KB
/
Copy pathPDFExtractor.php
File metadata and controls
205 lines (179 loc) · 6.74 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
<?php
declare(strict_types=1);
namespace Mindee\Extraction;
use Mindee\Dependency\DependencyChecker;
use Mindee\Error\MindeePDFException;
use Mindee\Input\LocalInputSource;
use Mindee\V1\Product\InvoiceSplitter\InvoiceSplitterV1InvoicePageGroups;
use setasign\Fpdi\Fpdi;
use setasign\Fpdi\PdfParser\CrossReference\CrossReferenceException;
use setasign\Fpdi\PdfParser\Filter\FilterException;
use setasign\Fpdi\PdfParser\PdfParserException;
use setasign\Fpdi\PdfReader\PdfReaderException;
use Imagick;
use ImagickException;
use InvalidArgumentException;
use function count;
use function is_array;
use function sprintf;
/**
 * PDF extraction class.
 *
 * Holds the bytes of a PDF (images are first converted to PDF through Imagick) and uses FPDI to
 * count its pages or to split it into smaller PDF documents.
 */
class PDFExtractor
{
    /**
     * @var string bytes representation of a file
     */
    private string $pdfBytes;

    /**
     * @var string name of the file
     */
    private string $fileName;

    /**
     * @param LocalInputSource $localInput Local Input, accepts all compatible formats.
     *
     * @throws MindeePDFException Throws if PDF operations aren't supported, or if the file
     * can't be read, respectively.
     */
    public function __construct(LocalInputSource $localInput)
    {
        DependencyChecker::isImageMagickAvailable();
        DependencyChecker::isGhostscriptAvailable();
        $this->fileName = $localInput->fileName;
        if ($localInput->isPDF()) {
            $this->pdfBytes = $localInput->readContents()[1];
            return;
        }
        // Not a PDF: let Imagick convert the image. Every Imagick call stays inside the try block
        // because reading or converting the blob is what fails on unsupported input.
        try {
            $image = new Imagick();
            $image->readImageBlob($localInput->readContents()[1]);
            $image->setImageFormat('pdf');
            $this->pdfBytes = $image->getImageBlob();
        } catch (ImagickException $e) {
            throw new MindeePDFException("Imagick could not process this file.\n", 0, $e);
        }
    }

    /**
     * Wrapper for pdf GetPageCount().
     *
     * @return integer The number of pages in the file.
     *
     * @throws MindeePDFException Throws if FPDI is unable to process the file, or if the temporary
     * working file can't be written.
     */
    public function getPageCount(): int
    {
        $tempFilename = $this->writeTempPdf();
        $pdfHandle = new Fpdi();
        try {
            return $pdfHandle->setSourceFile($tempFilename);
        } catch (PdfParserException | PdfReaderException $e) {
            throw new MindeePDFException(
                "Couldn't open PDF file. FPDI sent the following: " . $e->getMessage(),
                0,
                $e
            );
        } finally {
            // Release FPDI's reader before removing the file it was reading from.
            $pdfHandle->cleanUp();
            $this->deleteTempFile($tempFilename);
        }
    }

    /**
     * Extracts sub-documents from the source document using list of page indexes.
     *
     * Each sub-list produces one PDF named "<name>_<first>-<last>.<extension>", where first/last are
     * the 1-based numbers of the first and last entries of the sub-list.
     *
     * @param array<array<integer>>|InvoiceSplitterV1InvoicePageGroups $pageIndexes List of sub-lists of pages to keep.
     *
     * @return ExtractedPDF[] list of extracted documents
     *
     * @throws MindeePDFException Throws if FDPF/FPDI wasn't able to handle the pdf during the extraction.
     * @throws InvalidArgumentException Throws if invalid indexes are provided.
     */
    public function extractSubDocuments(array|InvoiceSplitterV1InvoicePageGroups $pageIndexes): array
    {
        // Validate everything first so that no temporary file is created for a request that is
        // going to be rejected anyway.
        $pageGroups = [];
        foreach ($pageIndexes as $pageIndexElem) {
            if (empty($pageIndexElem)) {
                throw new InvalidArgumentException('Empty indexes not allowed for extraction.');
            }
            $pageGroups[] = $pageIndexElem;
        }
        if ($pageGroups === []) {
            return [];
        }

        $extension = pathinfo($this->fileName, PATHINFO_EXTENSION);
        $prefix = pathinfo($this->fileName, PATHINFO_FILENAME);
        $extractedPdfs = [];

        // The source bytes never change, so a single temporary file serves every sub-document.
        $tempFilename = $this->writeTempPdf();
        try {
            foreach ($pageGroups as $pageGroup) {
                $fieldFilename = sprintf(
                    '%s_%03d-%03d.%s',
                    $prefix,
                    $pageGroup[0] + 1,
                    $pageGroup[count($pageGroup) - 1] + 1,
                    $extension
                );
                $pdf = new Fpdi();
                try {
                    $pdf->setSourceFile($tempFilename);
                    foreach ($pageGroup as $pageIndex) {
                        $pdf->AddPage();
                        // Page indexes are 0-based, FPDI page numbers are 1-based.
                        $pdf->useTemplate($pdf->importPage($pageIndex + 1));
                    }
                    $mergedPdfBytes = $pdf->Output('S');
                } catch (
                    CrossReferenceException |
                    FilterException |
                    PdfParserException |
                    PdfReaderException $e
                ) {
                    throw new MindeePDFException("PDF file couldn't be processed during extraction.", 0, $e);
                } finally {
                    // Make sure the reader is released even when processing was interrupted.
                    $pdf->cleanUp();
                }
                $extractedPdfs[] = new ExtractedPDF($mergedPdfBytes, $fieldFilename);
            }
        } finally {
            $this->deleteTempFile($tempFilename);
        }

        return $extractedPdfs;
    }

    /**
     * Extracts invoices as complete PDFs from the document.
     *
     * Plain lists of page indexes are extracted as given; they carry no confidence, so $strict has
     * no effect on them. For page group objects, non-strict mode extracts every group as is, while
     * strict mode first regroups them according to their confidence.
     *
     * @param array<array<integer>>|InvoiceSplitterV1InvoicePageGroups $pageIndexes List of sub-lists of pages to keep.
     * @param boolean $strict Whether to trust confidence scores or not.
     *
     * @return ExtractedPDF[] a list of extracted invoices
     *
     * @throws MindeePDFException Throws if the pdf can't be handled during the extraction.
     * @throws InvalidArgumentException Throws if an empty page list is provided.
     */
    public function extractInvoices(array|InvoiceSplitterV1InvoicePageGroups $pageIndexes, bool $strict = false): array
    {
        // Flatten into a plain list: empty() is never true for an object, and a list gives a
        // reliable first element and count for both accepted input types.
        $groups = [];
        foreach ($pageIndexes as $group) {
            $groups[] = $group;
        }
        if ($groups === []) {
            return [];
        }
        // Must be checked before the non-strict branch, which reads object properties.
        if (is_array($groups[0])) {
            return $this->extractSubDocuments($groups);
        }
        if (!$strict) {
            $indexes = array_map(static fn ($invoicePageIndexes) => $invoicePageIndexes->pageIndexes, $groups);
            return $this->extractSubDocuments($indexes);
        }

        return $this->extractSubDocuments(self::groupPagesByConfidence($groups));
    }

    /**
     * @return string name of the file
     */
    public function getFileName(): string
    {
        return $this->fileName;
    }

    /**
     * Regroups page lists according to the confidence of each group (threshold: 0.5).
     *
     * - A confident group that is not the last one opens a new pending list, after emitting the
     *   previously pending one.
     * - A low-confidence last group is appended to the pending list, which is then emitted.
     * - Any other group (confident last group, low-confidence non-last group) is emitted on its
     *   own, after the pending list.
     *
     * The pending list is cleared every time it is emitted, so no group is emitted twice and a
     * lone confident group is not lost.
     *
     * @param array<object> $groups List of groups exposing `confidence` and `pageIndexes`.
     *
     * @return array<array<integer>> Page lists ready for extraction.
     */
    private static function groupPagesByConfidence(array $groups): array
    {
        $correctPageIndexes = [];
        $pending = [];
        $lastPosition = count($groups) - 1;

        foreach (array_values($groups) as $position => $group) {
            $confidence = $group->confidence;
            $pageList = $group->pageIndexes;
            $isLast = $position === $lastPosition;

            if ($confidence >= 0.5 && !$isLast) {
                if (!empty($pending)) {
                    $correctPageIndexes[] = $pending;
                }
                $pending = $pageList;
                continue;
            }

            if ($confidence < 0.5 && $isLast) {
                $pending = array_merge($pending, $pageList);
                if (!empty($pending)) {
                    $correctPageIndexes[] = $pending;
                }
                $pending = [];
                continue;
            }

            if (!empty($pending)) {
                $correctPageIndexes[] = $pending;
                $pending = [];
            }
            $correctPageIndexes[] = $pageList;
        }

        return $correctPageIndexes;
    }

    /**
     * Writes the PDF bytes to a new temporary file so that FPDI can open them by path.
     *
     * @return string Path of the temporary file. The caller is responsible for deleting it.
     *
     * @throws MindeePDFException Throws if the temporary file can't be created or written.
     */
    private function writeTempPdf(): string
    {
        $tempFilename = tempnam(sys_get_temp_dir(), 'extracted_pdf_');
        if ($tempFilename === false) {
            throw new MindeePDFException("Couldn't create a temporary file for PDF processing.", 0);
        }
        if (file_put_contents($tempFilename, $this->pdfBytes) === false) {
            $this->deleteTempFile($tempFilename);
            throw new MindeePDFException("Couldn't write the temporary file for PDF processing.", 0);
        }

        return $tempFilename;
    }

    /**
     * Deletes a temporary file if it still exists.
     *
     * @param string $path Path of the file to delete.
     *
     * @return void
     */
    private function deleteTempFile(string $path): void
    {
        if (is_file($path)) {
            unlink($path);
        }
    }
}