Skip to content

Commit 01d60a1

Browse files
authored
[ExcelExtractor] Allow detecting Excel files by reading first bytes (#1860)
* [ExcelExtractor] Allow detecting Excel files by reading first bytes
* [ExcelExtractor] Allow detecting Excel files by reading first bytes
1 parent 3e2d945 commit 01d60a1

5 files changed

Lines changed: 49 additions & 7 deletions

File tree

src/adapter/etl-adapter-excel/src/Flow/ETL/Adapter/Excel/ExcelExtractor.php

Lines changed: 37 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -145,12 +145,13 @@ private function createRowsFromCells(Row $row, int $previousRowDataCount = 0) :
145145
*/
146146
private function extractRows(SourceStream $stream, array $headers, int $offset) : \Generator
147147
{
148+
$reader = $this->reader($stream);
149+
148150
try {
149-
$this->reader()->open($stream->path()->path());
151+
$reader->open($stream->path()->path());
150152

151-
$manager = new SheetsManager($this->reader()->getSheetIterator());
153+
$manager = new SheetsManager($reader->getSheetIterator());
152154

153-
$rows = [];
154155
$previousRowDataCount = 0;
155156

156157
$sheet = $this->sheetName ? $manager->get($this->sheetName) : $manager->first();
@@ -183,20 +184,49 @@ private function extractRows(SourceStream $stream, array $headers, int $offset)
183184
}
184185
}
185186

186-
$this->reader()->close();
187+
$reader->close();
187188
} catch (\Throwable $e) {
188189
throw new InvalidArgumentException('Failed to open file: ' . $e->getMessage(), previous: $e);
189190
}
190191
}
191192

192-
private function reader() : XlsxReader|OdsReader
193+
private function reader(SourceStream $stream) : XlsxReader|OdsReader
193194
{
194195
if (null === $this->reader) {
195-
$this->reader = match ($this->path->extension()) {
196+
$this->reader = match ($stream->path()->extension()) {
196197
'xlsx' => new XlsxReader(),
197198
'ods' => new OdsReader(),
198-
default => throw new InvalidArgumentException('Unsupported file extension: ' . ($this->path->extension() ?: 'n/a')),
199+
default => null,
199200
};
201+
202+
if (null === $this->reader) {
203+
$line = $stream->read(8, 0);
204+
205+
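// Legacy binary XLS (OLE2 compound file) signature: D0 CF 11 E0 A1 B1 1A E1 — NOTE(review): confirm XlsxReader can actually open this format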
206+
if (\str_starts_with($line, "\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1")) {
207+
return $this->reader = new XlsxReader();
208+
}
209+
210+
// ZIP signature: 50 4B 03 04
211+
if (\str_starts_with($line, "\x50\x4B\x03\x04")) {
212+
$zip = new \ZipArchive();
213+
214+
if ($zip->open($stream->path()->path())) {
215+
$mimetype = $zip->getFromName('mimetype');
216+
$zip->close();
217+
218+
$this->reader = match ($mimetype) {
219+
'application/vnd.oasis.opendocument.spreadsheet' => new OdsReader(),
220+
// Other zip-based file formats
221+
default => new XlsxReader(),
222+
};
223+
}
224+
}
225+
}
226+
227+
if (!$this->reader) {
228+
throw new InvalidArgumentException('Unsupported file format: ' . ($stream->path()->extension() ?: 'n/a'));
229+
}
200230
}
201231

202232
return $this->reader;

src/adapter/etl-adapter-excel/tests/Flow/ETL/Adapter/Excel/Tests/Fixtures/empty_file

Whitespace-only changes.
Binary file not shown.
Binary file not shown.

src/adapter/etl-adapter-excel/tests/Flow/ETL/Adapter/Excel/Tests/Integration/ExcelExtractorTest.php

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,8 @@ public static function provide_fixtures() : iterable
2222
{
2323
yield 'ods' => [__DIR__ . '/../Fixtures/fixture.ods'];
2424
yield 'xlsx' => [__DIR__ . '/../Fixtures/fixture.xlsx'];
25+
yield 'alike ods' => [__DIR__ . '/../Fixtures/fixture_as_ods'];
26+
yield 'alike xlsx' => [__DIR__ . '/../Fixtures/fixture_as_xlsx'];
2527
}
2628

2729
/**
@@ -196,6 +198,16 @@ public function test_extract_excel_nullable_file(string $fixtureName) : void
196198
self::assertSame(5, $total);
197199
}
198200

201+
public function test_extract_with_unknown_file() : void
202+
{
203+
$extractor = from_excel(__DIR__ . '/../Fixtures/empty_file');
204+
205+
$this->expectException(InvalidArgumentException::class);
206+
$this->expectExceptionMessage('Unsupported file format: n/a');
207+
208+
iterator_to_array($extractor->extract(flow_context(config())));
209+
}
210+
199211
public function test_extract_with_wrongly_selected_reader() : void
200212
{
201213
$extractor = from_excel(__DIR__ . '/../Fixtures/fixture.xlsx');

0 commit comments

Comments
 (0)