@@ -180,7 +180,7 @@ private function exportRows(int $batchSize): void
180180 }
181181 }
182182
183- $ this ->withCSVStream (function ($ stream ) use ($ columnTypes , $ manyToManyKeys , $ arrayKeys , $ table , $ batchSize ) {
183+ $ this ->withCSVStream (function ($ stream, $ delimiter ) use ($ columnTypes , $ manyToManyKeys , $ arrayKeys , $ table , $ batchSize ) {
184184 $ headers = fgetcsv ($ stream );
185185 if (! is_array ($ headers ) || count ($ headers ) === 0 ) {
186186 return ;
@@ -190,7 +190,7 @@ private function exportRows(int $batchSize): void
190190
191191 $ buffer = [];
192192
193- while (($ csvRowItem = fgetcsv ($ stream )) !== false ) {
193+ while (($ csvRowItem = fgetcsv (stream: $ stream, separator: $ delimiter )) !== false ) {
194194 if (count ($ csvRowItem ) !== count ($ headers )) {
195195 throw new \Exception ('CSV row does not match the number of header columns. ' );
196196 }
@@ -314,7 +314,7 @@ protected function exportGroupFunctions(int $batchSize, array $resources): void
314314 }
315315
316316 /**
317- * @param callable(resource $stream): void $callback
317+ * @param callable(resource $stream, string $delimiter ): void $callback
318318 * @return void
319319 * @throws \Exception
320320 */
@@ -336,8 +336,10 @@ private function withCsvStream(callable $callback): void
336336 return ;
337337 }
338338
339+ $ delimiter = $ this ->delimiter ($ stream );
340+
339341 try {
340- $ callback ($ stream );
342+ $ callback ($ stream, $ delimiter );
341343 } finally {
342344 \fclose ($ stream );
343345 }
@@ -402,4 +404,116 @@ private function downloadToLocal(
402404
403405 $ this ->downloaded = true ;
404406 }
407+
/**
 * Guess the field delimiter of a CSV stream by sampling its first lines.
 *
 * Up to five non-blank lines are read and each candidate delimiter
 * (comma, semicolon, tab, pipe) is scored as
 * `consistency * quality`, where:
 *  - consistency rewards candidates that split every sampled line into
 *    the same number of columns (1 / (1 + 2 * coefficient of variation));
 *  - quality is the share of resulting fields that look like real values
 *    (longer than one byte, or a single alphanumeric character).
 *
 * The stream is rewound to its beginning before returning so the caller
 * can parse the file from the top with the detected delimiter.
 *
 * @param resource $stream Readable stream; rewind() is called on it.
 * @return string The best-scoring delimiter, or ',' when the stream has no
 *                usable lines or no candidate splits the sample.
 */
private function delimiter($stream): string
{
    $fallback = ',';

    /**
     * widely used options, from here -
     *
     * https://stackoverflow.com/a/15946087/6819340
     */
    $candidates = [',', ';', "\t", '|'];
    $maxSamples = 5;

    $sampleLines = [];

    while (count($sampleLines) < $maxSamples && ($line = fgets($stream)) !== false) {
        // Strip only the line terminator: a full trim() would also eat
        // leading/trailing tabs, which are empty edge fields in TSV data.
        $line = rtrim($line, "\r\n");

        // Blank lines carry no signal; compare against '' explicitly
        // because empty('0') is true and would drop a valid line.
        if (trim($line) === '') {
            continue;
        }

        $sampleLines[] = $line;
    }

    /**
     * reset to top again because we need to process
     * the same file later again if everything goes OK here!
     */
    rewind($stream);

    if ($sampleLines === []) {
        return $fallback;
    }

    $sampleCount = count($sampleLines);
    $scores = [];

    foreach ($candidates as $candidate) {
        $columnCounts = [];
        $totalFields = 0;
        $usableFields = 0;

        foreach ($sampleLines as $line) {
            // A line without the candidate is a single field; skip the parser.
            $fields = str_contains($line, $candidate)
                ? str_getcsv($line, $candidate)
                : [$line];

            $fieldCount = count($fields);
            $columnCounts[] = $fieldCount;
            $totalFields += $fieldCount;

            // Count fields that make some sense, i.e. longer than
            // one byte or a single alphanumeric character.
            foreach ($fields as $field) {
                $trimmed = trim((string) $field);
                if (strlen($trimmed) > 1 || ctype_alnum($trimmed)) {
                    $usableFields++;
                }
            }
        }

        $avgColumns = $totalFields / $sampleCount;

        // The candidate never split anything: it cannot be the delimiter.
        if ($avgColumns <= 1) {
            $scores[$candidate] = 0.0;
            continue;
        }

        if ($sampleCount <= 1) {
            // A single line cannot be inconsistent with itself.
            $consistencyScore = 1.0;
        } else {
            $squaredDeviation = 0.0;
            foreach ($columnCounts as $columnCount) {
                $squaredDeviation += ($columnCount - $avgColumns) ** 2;
            }

            $stddev = sqrt($squaredDeviation / $sampleCount);
            $coefficientOfVariation = $stddev / $avgColumns;

            // lower variance = higher score
            $consistencyScore = 1.0 / (1.0 + $coefficientOfVariation * 2);
        }

        $qualityScore = $totalFields > 0 ? $usableFields / $totalFields : 0.0;

        $scores[$candidate] = $consistencyScore * $qualityScore;
    }

    // Highest score first; arsort() is stable, so ties keep candidate order.
    arsort($scores);

    $best = array_key_first($scores);

    return ($best !== null && $scores[$best] > 0) ? $best : $fallback;
}
405519}
0 commit comments