Skip to content

Commit 298c656

Browse files
broncha and norberttech authored
feat: add bom removal in CSV extractor (#1685)
* add BOM removal in CSV extractor
* ensure BOM exists in tests and add test for disabling BOM removal, fix CS
---------
Co-authored-by: Norbert Orzechowicz <1921950+norberttech@users.noreply.github.com>
1 parent fa329d5 commit 298c656

7 files changed

Lines changed: 186 additions & 2 deletions

File tree

src/adapter/etl-adapter-csv/src/Flow/ETL/Adapter/CSV/CSVExtractor.php

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,8 @@ final class CSVExtractor implements Extractor, FileExtractor, LimitableExtractor
2626

2727
private ?string $escape = null;
2828

29+
private bool $removeBOM = true;
30+
2931
private ?Schema $schema = null;
3032

3133
private ?string $separator = null;
@@ -51,9 +53,13 @@ public function extract(FlowContext $context) : \Generator
5153
$headers = [];
5254
$headersCount = 0;
5355

54-
foreach ($stream->readLines(length: $this->charactersReadInLine) as $csvLine) {
56+
foreach ($stream->readLines(length: $this->charactersReadInLine) as $line => $csvLine) {
57+
if ($line === 0 && $this->removeBOM) {
58+
$csvLine = preg_replace('/^(\xEF\xBB\xBF|\xFF\xFE|\xFE\xFF|\xFF\xFE\x00\x00|\x00\x00\xFE\xFF)/', '', $csvLine);
59+
}
60+
5561
/** @var non-empty-list<null|string> $rowData */
56-
$rowData = \str_getcsv($csvLine, $separator, $enclosure, $escape);
62+
$rowData = \str_getcsv((string) $csvLine, $separator, $enclosure, $escape);
5763
$rowDataCount = \count($rowData);
5864

5965
if ([] === $headers) {
@@ -113,6 +119,13 @@ public function source() : Path
113119
return $this->path;
114120
}
115121

122+
/**
 * Enables or disables byte order mark removal and returns this extractor for chaining.
 *
 * @param bool $removeBOM new value of the removeBOM flag
 */
public function withBOMRemoval(bool $removeBOM) : self
{
    $extractor = $this;
    $extractor->removeBOM = $removeBOM;

    return $extractor;
}
128+
116129
/**
117130
* @param int<1, max> $charactersReadInLine
118131
*/
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
��id,name,reference
2+
2,asd,144
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
��id,name,reference
2+
2,asd,144
Binary file not shown.
Binary file not shown.
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
id,name,reference
2+
2,asd,144

src/adapter/etl-adapter-csv/tests/Flow/ETL/Adapter/CSV/Tests/Integration/CSVExtractorTest.php

Lines changed: 165 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,134 @@
1414

1515
final class CSVExtractorTest extends FlowTestCase
1616
{
17+
/**
 * Extracting a fixture that starts with a UTF-16 BE byte order mark must yield a plain `id` header.
 */
public function test_bom_removal_utf16_be() : void
{
    $fixture = __DIR__ . '/../Fixtures/with_utf16be_bom.csv';
    $extractor = from_csv(
        $path = Path::realpath($fixture),
    );

    // The fixture itself has to carry the BOM, otherwise the test proves nothing.
    self::assertTrue($this->ensureBOMExists($fixture, "\xFE\xFF"));

    $expected = [
        [
            [
                'id' => '2',
                'name' => 'asd',
                'reference' => '144',
                '_input_file_uri' => $path->uri(),
            ],
        ],
    ];

    $actual = \array_map(
        static fn (Rows $r) => $r->toArray(),
        \iterator_to_array($extractor->extract(flow_context(Config::builder()->putInputIntoRows()->build())))
    );

    self::assertSame($expected, $actual);
}
41+
42+
/**
 * Extracting a fixture that starts with a UTF-16 LE byte order mark must yield a plain `id` header.
 */
public function test_bom_removal_utf16_le() : void
{
    $fixture = __DIR__ . '/../Fixtures/with_utf16le_bom.csv';
    $extractor = from_csv(
        $path = Path::realpath($fixture),
    );

    // The fixture itself has to carry the BOM, otherwise the test proves nothing.
    self::assertTrue($this->ensureBOMExists($fixture, "\xFF\xFE"));

    $expected = [
        [
            [
                'id' => '2',
                'name' => 'asd',
                'reference' => '144',
                '_input_file_uri' => $path->uri(),
            ],
        ],
    ];

    $actual = \array_map(
        static fn (Rows $r) => $r->toArray(),
        \iterator_to_array($extractor->extract(flow_context(Config::builder()->putInputIntoRows()->build())))
    );

    self::assertSame($expected, $actual);
}
66+
67+
/**
 * Extracting a fixture that starts with a UTF-32 BE byte order mark must yield a plain `id` header.
 */
public function test_bom_removal_utf32_be() : void
{
    $fixture = __DIR__ . '/../Fixtures/with_utf32be_bom.csv';
    $extractor = from_csv(
        $path = Path::realpath($fixture),
    );

    // The fixture itself has to carry the BOM, otherwise the test proves nothing.
    self::assertTrue($this->ensureBOMExists($fixture, "\x00\x00\xFE\xFF"));

    $expected = [
        [
            [
                'id' => '2',
                'name' => 'asd',
                'reference' => '144',
                '_input_file_uri' => $path->uri(),
            ],
        ],
    ];

    $actual = \array_map(
        static fn (Rows $r) => $r->toArray(),
        \iterator_to_array($extractor->extract(flow_context(Config::builder()->putInputIntoRows()->build())))
    );

    self::assertSame($expected, $actual);
}
92+
93+
/**
 * Extracting a fixture that starts with a UTF-32 LE byte order mark must yield a plain `id` header.
 */
public function test_bom_removal_utf32_le() : void
{
    $fixture = __DIR__ . '/../Fixtures/with_utf32le_bom.csv';
    $extractor = from_csv(
        $path = Path::realpath($fixture),
    );

    // The fixture itself has to carry the BOM, otherwise the test proves nothing.
    self::assertTrue($this->ensureBOMExists($fixture, "\xFF\xFE\x00\x00"));

    $expected = [
        [
            [
                'id' => '2',
                'name' => 'asd',
                'reference' => '144',
                '_input_file_uri' => $path->uri(),
            ],
        ],
    ];

    $actual = \array_map(
        static fn (Rows $r) => $r->toArray(),
        \iterator_to_array($extractor->extract(flow_context(Config::builder()->putInputIntoRows()->build())))
    );

    self::assertSame($expected, $actual);
}
118+
119+
/**
 * Extracting a fixture that starts with a UTF-8 byte order mark must yield a plain `id` header.
 */
public function test_bom_removal_utf8() : void
{
    $fixture = __DIR__ . '/../Fixtures/with_utf8_bom.csv';
    $extractor = from_csv(
        $path = Path::realpath($fixture),
    );

    // The fixture itself has to carry the BOM, otherwise the test proves nothing.
    self::assertTrue($this->ensureBOMExists($fixture, "\xEF\xBB\xBF"));

    $expected = [
        [
            [
                'id' => '2',
                'name' => 'asd',
                'reference' => '144',
                '_input_file_uri' => $path->uri(),
            ],
        ],
    ];

    $actual = \array_map(
        static fn (Rows $r) => $r->toArray(),
        \iterator_to_array($extractor->extract(flow_context(Config::builder()->putInputIntoRows()->build())))
    );

    self::assertSame($expected, $actual);
}
144+
17145
public function test_extracting_csv_empty_columns_as_empty_strings() : void
18146
{
19147
$extractor = from_csv(
@@ -332,4 +460,41 @@ public function test_signal_stop() : void
332460
$generator->send(Signal::STOP);
333461
self::assertFalse($generator->valid());
334462
}
463+
464+
/**
 * With BOM removal switched off, the UTF-8 byte order mark must stay attached to the first header name.
 */
public function test_without_bom_removal_utf8() : void
{
    $fixture = __DIR__ . '/../Fixtures/with_utf8_bom.csv';
    $extractor = from_csv(
        $path = Path::realpath($fixture),
    )->withBOMRemoval(false);

    // The fixture itself has to carry the BOM, otherwise the test proves nothing.
    self::assertTrue($this->ensureBOMExists($fixture, "\xEF\xBB\xBF"));

    $expected = [
        [
            [
                // The BOM bytes are expected to survive as a prefix of the first column name.
                "\xEF\xBB\xBFid" => '2',
                'name' => 'asd',
                'reference' => '144',
                '_input_file_uri' => $path->uri(),
            ],
        ],
    ];

    $actual = \array_map(
        static fn (Rows $r) => $r->toArray(),
        \iterator_to_array($extractor->extract(flow_context(Config::builder()->putInputIntoRows()->build())))
    );

    self::assertSame($expected, $actual);
}
491+
492+
/**
 * Checks whether the file at $path begins with exactly the bytes given in $BOM.
 *
 * Returns false when the file cannot be read, so the calling assertion fails
 * instead of an unchecked fopen() failure being passed on to fread()/fclose().
 *
 * @param string $path filesystem path of the fixture to inspect
 * @param string $BOM raw byte sequence expected at the very start of the file
 */
private function ensureBOMExists(string $path, string $BOM) : bool
{
    // Read only as many leading bytes as the BOM is long; a failed read yields
    // false, which can never be identical to the $BOM string.
    $prefix = \file_get_contents($path, false, null, 0, \strlen($BOM));

    return $prefix === $BOM;
}
335500
}

0 commit comments

Comments
 (0)