Skip to content

Commit a05c096

Browse files
authored
Merge pull request #2401 from flow-php/2394-bug-csv-extractor---too-small-default-for-characters_read_in_line
fix: performance when processing remote CSV files
2 parents 6a0c325 + b973a74 commit a05c096

4 files changed

Lines changed: 93 additions & 2 deletions

File tree

src/adapter/etl-adapter-csv/src/Flow/ETL/Adapter/CSV/functions.php

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@ function from_csv(
3737
?string $separator = null,
3838
?string $enclosure = null,
3939
?string $escape = null,
40-
int $characters_read_in_line = 1000,
40+
int $characters_read_in_line = 10 * 1024 * 1024,
4141
?Schema $schema = null,
4242
): CSVExtractor {
4343
$loader = (new CSVExtractor(is_string($path) ? path_real($path) : $path))
Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,69 @@
1+
<?php
2+
3+
declare(strict_types=1);
4+
5+
namespace Flow\ETL\Adapter\CSV\Tests\Double;
6+
7+
use Flow\Filesystem\Path;
8+
use Flow\Filesystem\SourceStream;
9+
use Generator;
10+
11+
use function explode;
12+
use function strlen;
13+
use function substr;
14+
15+
final class LengthCapturingSourceStream implements SourceStream
16+
{
17+
/**
18+
* @var array<null|int> the length argument captured on each readLines() call
19+
*/
20+
public array $capturedLengths = [];
21+
22+
public function __construct(
23+
private readonly string $contents,
24+
private readonly Path $path,
25+
) {}
26+
27+
public function close(): void {}
28+
29+
public function content(): string
30+
{
31+
return $this->contents;
32+
}
33+
34+
public function isOpen(): bool
35+
{
36+
return true;
37+
}
38+
39+
public function iterate(int $length = 1): Generator
40+
{
41+
for ($i = 0; $i < strlen($this->contents); $i += $length) {
42+
yield substr($this->contents, $i, $length);
43+
}
44+
}
45+
46+
public function path(): Path
47+
{
48+
return $this->path;
49+
}
50+
51+
public function read(int $length, int $offset): string
52+
{
53+
return substr($this->contents, $offset, $length);
54+
}
55+
56+
public function readLines(string $separator = "\n", ?int $length = null): Generator
57+
{
58+
$this->capturedLengths[] = $length;
59+
60+
foreach (explode($separator, $this->contents) as $line) {
61+
yield $line;
62+
}
63+
}
64+
65+
public function size(): int
66+
{
67+
return strlen($this->contents);
68+
}
69+
}

src/adapter/etl-adapter-csv/tests/Flow/ETL/Adapter/CSV/Tests/Unit/CSVLineReaderTest.php

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,11 +5,33 @@
55
namespace Flow\ETL\Adapter\CSV\Tests\Unit;
66

77
use Flow\ETL\Adapter\CSV\CSVLineReader;
8+
use Flow\ETL\Adapter\CSV\Tests\Double\LengthCapturingSourceStream;
89
use Flow\Filesystem\Stream\MemorySourceStream;
10+
use PHPUnit\Framework\Attributes\TestWith;
911
use PHPUnit\Framework\TestCase;
1012

13+
use function Flow\Filesystem\DSL\path;
14+
1115
final class CSVLineReaderTest extends TestCase
1216
{
17+
public function test_characters_read_in_line_is_passed_through_to_the_stream(): void
18+
{
19+
$stream = new LengthCapturingSourceStream("id,name\n1,foo", path('s3://bucket/users.csv'));
20+
21+
iterator_to_array((new CSVLineReader('"', 4096))->readLines($stream));
22+
23+
static::assertSame([4096], $stream->capturedLengths);
24+
}
25+
26+
public function test_null_characters_read_in_line_lets_the_stream_choose_its_default(): void
27+
{
28+
$stream = new LengthCapturingSourceStream("id,name\n1,foo", path('s3://bucket/users.csv'));
29+
30+
iterator_to_array((new CSVLineReader('"'))->readLines($stream));
31+
32+
static::assertSame([null], $stream->capturedLengths);
33+
}
34+
1335
public function test_detection_of_multiline_quotes(): void
1436
{
1537
$simpleContent = "field1,field2\nvalue1,value2";

web/landing/resources/dsl.json

Lines changed: 1 addition & 1 deletion
Large diffs are not rendered by default.

0 commit comments

Comments
 (0)