Skip to content

Commit 00a4c58

Browse files
committed
feat: auto-detect delimiters.
1 parent 973a4da commit 00a4c58

9 files changed

Lines changed: 234 additions & 4 deletions

File tree

src/Migration/Sources/CSV.php

Lines changed: 118 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -180,7 +180,7 @@ private function exportRows(int $batchSize): void
180180
}
181181
}
182182

183-
$this->withCSVStream(function ($stream) use ($columnTypes, $manyToManyKeys, $arrayKeys, $table, $batchSize) {
183+
$this->withCSVStream(function ($stream, $delimiter) use ($columnTypes, $manyToManyKeys, $arrayKeys, $table, $batchSize) {
184184
$headers = fgetcsv($stream);
185185
if (! is_array($headers) || count($headers) === 0) {
186186
return;
@@ -190,7 +190,7 @@ private function exportRows(int $batchSize): void
190190

191191
$buffer = [];
192192

193-
while (($csvRowItem = fgetcsv($stream)) !== false) {
193+
while (($csvRowItem = fgetcsv(stream: $stream, separator: $delimiter)) !== false) {
194194
if (count($csvRowItem) !== count($headers)) {
195195
throw new \Exception('CSV row does not match the number of header columns.');
196196
}
@@ -314,7 +314,7 @@ protected function exportGroupFunctions(int $batchSize, array $resources): void
314314
}
315315

316316
/**
317-
* @param callable(resource $stream): void $callback
317+
* @param callable(resource $stream, string $delimiter): void $callback
318318
* @return void
319319
* @throws \Exception
320320
*/
@@ -336,8 +336,10 @@ private function withCsvStream(callable $callback): void
336336
return;
337337
}
338338

339+
$delimiter = $this->delimiter($stream);
340+
339341
try {
340-
$callback($stream);
342+
$callback($stream, $delimiter);
341343
} finally {
342344
\fclose($stream);
343345
}
@@ -402,4 +404,116 @@ private function downloadToLocal(
402404

403405
$this->downloaded = true;
404406
}
407+
408+
/**
 * Detects the field delimiter of a CSV stream by sampling its first lines.
 *
 * Up to five non-blank lines are read, after which the stream is rewound so
 * the caller can parse the file from the top. Every candidate delimiter
 * (comma, semicolon, tab, pipe) is scored as `consistency * quality`:
 *  - consistency drops as the column count varies between sampled lines;
 *  - quality is the share of fields that look like real values, i.e. are
 *    longer than one character or are a single alphanumeric character.
 *
 * The candidate with the highest positive score wins; on a tie the earlier
 * candidate in the list is kept. A comma is returned when there is nothing
 * to sample or when no candidate splits the sampled lines.
 *
 * @param resource $stream Readable, seekable stream; it is rewound before returning.
 * @return string The detected delimiter: ",", ";", "\t" or "|".
 */
private function delimiter($stream): string
{
    /**
     * widely used options, from here -
     *
     * https://stackoverflow.com/a/15946087/6819340
     */
    $delimiters = [',', ';', "\t", '|'];
    $maxSamples = 5;

    $sampleLines = [];

    while (count($sampleLines) < $maxSamples && ($line = fgets($stream)) !== false) {
        // Strip only the line ending: a full trim() would also eat leading or
        // trailing tab delimiters and skew the column count of TSV rows.
        $line = rtrim($line, "\r\n");

        // Blank lines carry no signal and do not count towards the sample.
        // Compare against '' explicitly: empty() would also drop a line
        // that holds just "0".
        if (trim($line) === '') {
            continue;
        }

        $sampleLines[] = $line;
    }

    /**
     * reset to top again because the caller needs to process
     * the same stream from its first byte afterwards.
     */
    rewind($stream);

    if ($sampleLines === []) {
        return ',';
    }

    $bestDelimiter = ',';
    $bestScore = 0.0;

    foreach ($delimiters as $delimiter) {
        $columnCounts = [];
        $usableFields = 0;

        foreach ($sampleLines as $line) {
            // Without the delimiter the whole line is a single field.
            $fields = str_contains($line, $delimiter)
                ? str_getcsv($line, $delimiter)
                : [$line];

            $columnCounts[] = count($fields);

            // Count fields that make some sense, i.e. longer than
            // one character or a single alphanumeric character.
            foreach ($fields as $field) {
                $trimmed = trim((string) $field);
                if (strlen($trimmed) > 1 || ctype_alnum($trimmed)) {
                    $usableFields++;
                }
            }
        }

        $sampleCount = count($columnCounts);
        $totalFields = array_sum($columnCounts);
        $avgColumns = $totalFields / $sampleCount;

        // The delimiter does not split anything: it cannot win.
        if ($avgColumns <= 1) {
            continue;
        }

        // A single sample has nothing to be inconsistent with.
        $consistencyScore = 1.0;
        if ($sampleCount > 1) {
            $squaredDeviations = 0.0;
            foreach ($columnCounts as $count) {
                $squaredDeviations += ($count - $avgColumns) ** 2;
            }

            $stddev = sqrt($squaredDeviations / $sampleCount);
            $coefficientOfVariation = $stddev / $avgColumns;

            // lower variance = higher score
            $consistencyScore = 1.0 / (1.0 + $coefficientOfVariation * 2);
        }

        $qualityScore = $usableFields / $totalFields;
        $score = $consistencyScore * $qualityScore;

        // Strict comparison keeps the earlier candidate on a tie.
        if ($score > $bestScore) {
            $bestScore = $score;
            $bestDelimiter = $delimiter;
        }
    }

    return $bestDelimiter;
}
405519
}
Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
<?php
2+
3+
namespace Migration\Unit\General;
4+
5+
use PHPUnit\Framework\TestCase;
6+
use Utopia\Migration\Sources\CSV;
7+
8+
/**
 * Unit tests for the delimiter auto-detection of the CSV migration source.
 */
class CSVTest extends TestCase
{
    private const RESOURCES_DIR = __DIR__ . '/../../resources/csv/';

    /**
     * Calls the private CSV::delimiter() method through reflection on an
     * instance created without running the constructor.
     *
     * @param resource $stream Open stream of the fixture under test.
     * @return string The delimiter reported by CSV::delimiter().
     * @throws \ReflectionException
     */
    private function detectDelimiter($stream): string
    {
        $reflection = new \ReflectionClass(CSV::class);
        $instance = $reflection->newInstanceWithoutConstructor();

        $refMethod = $reflection->getMethod('delimiter');

        /** @noinspection PhpExpressionResultUnusedInspection */
        $refMethod->setAccessible(true);

        return $refMethod->invoke($instance, $stream);
    }

    /**
     * Checks the detected delimiter for every fixture file, including the
     * fixtures that are expected to fall back to a comma.
     *
     * @throws \ReflectionException
     */
    public function testDetectDelimiter(): void
    {
        $cases = [
            ['file' => 'comma.csv', 'expected' => ','],
            ['file' => 'single_column.csv', 'expected' => ','], // fallback
            ['file' => 'empty.csv', 'expected' => ','], // fallback
            ['file' => 'quoted_fields.csv', 'expected' => ','],
            ['file' => 'semicolon.csv', 'expected' => ';'],
            ['file' => 'tab.csv', 'expected' => "\t"],
            ['file' => 'pipe.csv', 'expected' => '|'],
        ];

        foreach ($cases as $case) {
            $filepath = self::RESOURCES_DIR . $case['file'];
            $stream = fopen($filepath, 'r');

            // Fail with a readable message instead of a TypeError raised
            // from inside the reflective call when a fixture is missing.
            $this->assertNotFalse($stream, "Could not open fixture {$filepath}");

            try {
                $delimiter = $this->detectDelimiter($stream);
            } finally {
                // Release the handle even when detection throws.
                fclose($stream);
            }

            // assertSame: the delimiter must match in both type and value.
            $this->assertSame($case['expected'], $delimiter, "Failed for {$case['file']}");
        }
    }
}
Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
id,name,age,city,country,email,phone,department,role,joined
2+
1,Alice,23,New York,USA,alice@example.com,555-0101,Engineering,Developer,2020-01-15
3+
2,Bob,30,London,UK,bob@example.com,555-0102,Data,Analyst,2019-05-20
4+
3,Charlie,25,Tokyo,Japan,charlie@example.com,555-0103,Product,Manager,2021-08-10
5+
4,Diana,28,Sydney,Australia,diana@example.com,555-0104,Engineering,Designer,2020-04-12
6+
5,Ethan,31,Toronto,Canada,ethan@example.com,555-0105,Marketing,Lead,2018-11-23
7+
6,Faith,27,Berlin,Germany,faith@example.com,555-0106,Finance,Accountant,2017-07-14
8+
7,George,35,Paris,France,george@example.com,555-0107,Legal,Advisor,2021-02-05
9+
8,Hannah,26,Singapore,Singapore,hannah@example.com,555-0108,Operations,Coordinator,2019-09-27
10+
9,Ian,29,Dubai,UAE,ian@example.com,555-0109,Engineering,QA,2022-03-19
11+
10,Julia,24,Zurich,Switzerland,julia@example.com,555-0110,Research,Scientist,2020-12-01

tests/Migration/resources/csv/empty.csv

Whitespace-only changes.
Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
id|name|age|city|country|email|phone|department|role|joined
2+
1|Alice|23|New York|USA|alice@example.com|555-0101|Engineering|Developer|2020-01-15
3+
2|Bob|30|London|UK|bob@example.com|555-0102|Data|Analyst|2019-05-20
4+
3|Charlie|25|Tokyo|Japan|charlie@example.com|555-0103|Product|Manager|2021-08-10
5+
4|Diana|28|Sydney|Australia|diana@example.com|555-0104|Engineering|Designer|2020-04-12
6+
5|Ethan|31|Toronto|Canada|ethan@example.com|555-0105|Marketing|Lead|2018-11-23
7+
6|Faith|27|Berlin|Germany|faith@example.com|555-0106|Finance|Accountant|2017-07-14
8+
7|George|35|Paris|France|george@example.com|555-0107|Legal|Advisor|2021-02-05
9+
8|Hannah|26|Singapore|Singapore|hannah@example.com|555-0108|Operations|Coordinator|2019-09-27
10+
9|Ian|29|Dubai|UAE|ian@example.com|555-0109|Engineering|QA|2022-03-19
11+
10|Julia|24|Zurich|Switzerland|julia@example.com|555-0110|Research|Scientist|2020-12-01
Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
name,description,tags
2+
"Alice","Software Engineer, Team Lead","php,js"
3+
"Bob","Data Analyst; Expert","sql,excel"
4+
"Carol","CTO, Strategic Leader","go,devops,cloud"
5+
"David","Back-end Developer, API specialist","python,flask,django"
6+
"Emma","Fullstack Developer, React/Vue","js,react,vue"
7+
"Frank","DevOps; Site Reliability","docker,k8s,terraform"
8+
"Grace","Product Manager, B2B; B2C","planning,roadmap"
9+
"Hannah","Support Lead, ""Customer Success""","support,crm"
10+
"Ian","QA Engineer, Manual & Automated testing","selenium,pytest"
11+
"Jane","UI/UX Designer, accessibility","figma,sketch"
Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
id;name;age;city;country;email;phone;department;role;joined
2+
1;Alice;23;New York;USA;alice@example.com;555-0101;Engineering;Developer;2020-01-15
3+
2;Bob;30;London;UK;bob@example.com;555-0102;Data;Analyst;2019-05-20
4+
3;Charlie;25;Tokyo;Japan;charlie@example.com;555-0103;Product;Manager;2021-08-10
5+
4;Diana;28;Sydney;Australia;diana@example.com;555-0104;Engineering;Designer;2020-04-12
6+
5;Ethan;31;Toronto;Canada;ethan@example.com;555-0105;Marketing;Lead;2018-11-23
7+
6;Faith;27;Berlin;Germany;faith@example.com;555-0106;Finance;Accountant;2017-07-14
8+
7;George;35;Paris;France;george@example.com;555-0107;Legal;Advisor;2021-02-05
9+
8;Hannah;26;Singapore;Singapore;hannah@example.com;555-0108;Operations;Coordinator;2019-09-27
10+
9;Ian;29;Dubai;UAE;ian@example.com;555-0109;Engineering;QA;2022-03-19
11+
10;Julia;24;Zurich;Switzerland;julia@example.com;555-0110;Research;Scientist;2020-12-01
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
singlecolumn
2+
alpha
3+
beta
4+
gamma
5+
delta
6+
epsilon
7+
zeta
8+
eta
9+
theta
10+
iota
11+
kappa
12+
lambda
Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
id name age city country email phone department role joined
2+
1 Alice 23 New York USA alice@example.com 555-0101 Engineering Developer 2020-01-15
3+
2 Bob 30 London UK bob@example.com 555-0102 Data Analyst 2019-05-20
4+
3 Charlie 25 Tokyo Japan charlie@example.com 555-0103 Product Manager 2021-08-10
5+
4 Diana 28 Sydney Australia diana@example.com 555-0104 Engineering Designer 2020-04-12
6+
5 Ethan 31 Toronto Canada ethan@example.com 555-0105 Marketing Lead 2018-11-23
7+
6 Faith 27 Berlin Germany faith@example.com 555-0106 Finance Accountant 2017-07-14
8+
7 George 35 Paris France george@example.com 555-0107 Legal Advisor 2021-02-05
9+
8 Hannah 26 Singapore Singapore hannah@example.com 555-0108 Operations Coordinator 2019-09-27
10+
9 Ian 29 Dubai UAE ian@example.com 555-0109 Engineering QA 2022-03-19
11+
10 Julia 24 Zurich Switzerland julia@example.com 555-0110 Research Scientist 2020-12-01

0 commit comments

Comments
 (0)