Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
41 changes: 37 additions & 4 deletions src/processors/src/data-processor.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
// Copyright contributors to the kepler.gl project

import * as arrow from 'apache-arrow';
import {csvParseRows} from 'd3-dsv';
import {csvParseRows, tsvParseRows, dsvFormat} from 'd3-dsv';
import {DATA_TYPES as AnalyzerDATA_TYPES} from 'type-analyzer';
import normalize from '@mapbox/geojson-normalize';
import {parseSync} from '@loaders.gl/core';
Expand Down Expand Up @@ -40,6 +40,39 @@
// matches empty string
export const CSV_NULLS = /^(null|NULL|Null|NaN|\/N||)$/;

// Candidate delimiters tried by detectDelimiter, in priority order: the detection
// loop only replaces the current best on a strictly higher column count, so when
// two candidates tie the earlier entry (comma first) is kept.
const SUPPORTED_DELIMITERS = [',', '\t', ';', '|'] as const;

/**
 * Pick the d3-dsv row parser for the given delimiter.
 *
 * Comma and tab map to the prebuilt `csvParseRows` / `tsvParseRows`; any other
 * delimiter gets a parser created on demand through `dsvFormat`.
 *
 * @param delimiter - field separator to parse with
 * @returns a function turning raw delimited text into an array of string rows
 */
function getRowParser(delimiter: string): (raw: string) => string[][] {
  switch (delimiter) {
    case ',':
      return csvParseRows;
    case '\t':
      return tsvParseRows;
    default:
      return dsvFormat(delimiter).parseRows;
  }
}

/**
 * Return the first record (header row) of a delimited-text string.
 *
 * The record ends at the first `\n` or `\r` that is outside a double-quoted
 * field, so a quoted header cell containing a line break is kept whole instead
 * of being cut mid-quote.
 */
function sliceFirstRecord(rawData: string): string {
  const QUOTE = 34;
  const NEWLINE = 10;
  const RETURN = 13;

  let inQuotes = false;
  for (let i = 0; i < rawData.length; i++) {
    const code = rawData.charCodeAt(i);
    if (code === QUOTE) {
      // An escaped quote ("") toggles twice and leaves the state unchanged.
      inQuotes = !inQuotes;
    } else if (!inQuotes && (code === NEWLINE || code === RETURN)) {
      return rawData.slice(0, i);
    }
  }

  // Balanced quotes and no line break: the whole input is a single record.
  if (!inQuotes) return rawData;

  // Unbalanced quote (e.g. a stray `"` in an unquoted cell): fall back to the
  // first raw newline so the sample handed to the parsers stays small.
  const newlineIdx = rawData.indexOf('\n');
  return newlineIdx === -1 ? rawData : rawData.slice(0, newlineIdx);
}

/**
 * Detect the delimiter used in a DSV string by checking its first record.
 *
 * Each supported delimiter parses the first record; the one producing the most
 * columns (minimum 2) wins. Ties keep the earlier entry of SUPPORTED_DELIMITERS.
 * Falls back to comma if the input is empty or no delimiter produces multiple
 * columns.
 *
 * @param rawData - raw delimited text
 * @returns the detected delimiter character
 */
export function detectDelimiter(rawData: string): string {
  const firstRecord = sliceFirstRecord(rawData);
  if (!firstRecord) return ',';

  let bestDelimiter = ',';
  let bestCount = 1;

  for (const delimiter of SUPPORTED_DELIMITERS) {
    // Only the first parsed row matters; a missing row counts as zero columns.
    const count = getRowParser(delimiter)(firstRecord)[0]?.length ?? 0;
    if (count > bestCount) {
      bestCount = count;
      bestDelimiter = delimiter;
    }
  }

  return bestDelimiter;
}

function tryParseJsonString(str) {
try {
return JSON.parse(str);
Expand Down Expand Up @@ -119,11 +152,11 @@
let headerRow: string[] | undefined;

if (typeof rawData === 'string') {
const parsedRows: string[][] = csvParseRows(rawData);
const delimiter = detectDelimiter(rawData);
const parsedRows: string[][] = getRowParser(delimiter)(rawData);

if (!Array.isArray(parsedRows) || parsedRows.length < 2) {
// looks like an empty file, throw error to be catch
throw new Error('process Csv Data Failed: CSV is empty');
throw new Error('processCsvData Failed: delimited text is empty or has no data rows');
}
headerRow = parsedRows[0];
rows = parsedRows.slice(1);
Expand Down Expand Up @@ -550,7 +583,7 @@
const newColumns: Record<string, arrow.Vector> = {};
for (let i = 0; i < arrowTable.numCols; i++) {
const field = arrowTable.schema.fields[i];
const col = arrowTable.getChildAt(i)!;

Check warning on line 586 in src/processors/src/data-processor.ts

View workflow job for this annotation

GitHub Actions / build

Forbidden non-null assertion
if (arrow.DataType.isInt(field.type) && field.type.bitWidth === 64) {
const float64Array = new Float64Array(col.length);
for (let j = 0; j < col.length; j++) {
Expand Down
4 changes: 2 additions & 2 deletions src/reducers/src/vis-state-selectors.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,8 @@
import {createSelector} from 'reselect';

// NOTE: default formats must match file-handler-test.js
const DEFAULT_FILE_EXTENSIONS = ['csv', 'json', 'geojson', 'arrow', 'parquet'];
const DEFAULT_FILE_FORMATS = ['CSV', 'Json', 'GeoJSON', 'Arrow', 'Parquet'];
const DEFAULT_FILE_EXTENSIONS = ['csv', 'tsv', 'dsv', 'json', 'geojson', 'arrow', 'parquet'];
const DEFAULT_FILE_FORMATS = ['CSV', 'TSV', 'DSV', 'Json', 'GeoJSON', 'Arrow', 'Parquet'];

interface LoaderInfo {
name: string;
Expand Down
257 changes: 256 additions & 1 deletion test/node/utils/data-processor-test.js
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,8 @@ import {
parseCsvRowsByFieldType,
processCsvData,
processGeojson,
processRowObject
processRowObject,
detectDelimiter
} from '@kepler.gl/processors';

import {validateInputData, createDataContainer} from '@kepler.gl/utils';
Expand Down Expand Up @@ -152,6 +153,260 @@ test('Processor -> processCsvData', t => {
t.end();
});

test('Processor -> detectDelimiter', t => {
  // [input, expected delimiter, assertion message], in assertion order
  const cases = [
    ['a,b,c\n1,2,3', ',', 'should detect comma delimiter'],
    ['a\tb\tc\n1\t2\t3', '\t', 'should detect tab delimiter'],
    ['a;b;c\n1;2;3', ';', 'should detect semicolon delimiter'],
    ['a|b|c\n1|2|3', '|', 'should detect pipe delimiter'],
    [
      'single_column\nvalue',
      ',',
      'should fall back to comma when no delimiter produces multiple columns'
    ],
    ['', ',', 'should return comma for empty string'],
    ['a,b,c', ',', 'should handle input without newline (single line)'],
    [
      '"a\tb"\tc\td\n1\t2\t3',
      '\t',
      'should handle quoted fields containing other delimiters'
    ],
    [
      '"city, state"\tpopulation\tarea\n"New York, NY"\t8000000\t302',
      '\t',
      'should detect tab even when commas appear inside quoted fields'
    ],
    [
      'name;age;city\nAlice;30;"Berlin, Germany"',
      ';',
      'should detect semicolon with quoted fields containing commas'
    ],
    [
      'a\tb\tc\td\te\n1\t2\t3\t4\t5',
      '\t',
      'should prefer delimiter that produces more columns'
    ]
  ];

  for (const [input, expected, message] of cases) {
    t.equal(detectDelimiter(input), expected, message);
  }

  t.end();
});

test('Processor -> processCsvData -> tab-separated', t => {
  // Plain TSV input: header plus two data rows.
  const {fields, rows} = processCsvData('name\tage\tcity\nAlice\t30\tBerlin\nBob\t25\tParis');

  t.equal(fields.length, 3, 'should parse 3 fields from TSV');

  const expectedNames = [
    ['name', 'first field should be name'],
    ['age', 'second field should be age'],
    ['city', 'third field should be city']
  ];
  expectedNames.forEach(([name, message], idx) => {
    t.equal(fields[idx].name, name, message);
  });

  t.equal(rows.length, 2, 'should have 2 data rows');
  t.deepEqual(rows[0], ['Alice', 30, 'Berlin'], 'should parse first row correctly');
  t.deepEqual(rows[1], ['Bob', 25, 'Paris'], 'should parse second row correctly');

  t.end();
});

test('Processor -> processCsvData -> semicolon-separated', t => {
  // Semicolon-delimited input with a string, an integer and a boolean column.
  const {fields, rows} = processCsvData('name;value;active\nfoo;100;true\nbar;200;false');

  t.equal(fields.length, 3, 'should parse 3 fields from semicolon-separated data');

  const expectedNames = [
    ['name', 'first field should be name'],
    ['value', 'second field should be value'],
    ['active', 'third field should be active']
  ];
  for (let idx = 0; idx < expectedNames.length; idx++) {
    const [name, message] = expectedNames[idx];
    t.equal(fields[idx].name, name, message);
  }

  t.equal(rows.length, 2, 'should have 2 data rows');
  t.deepEqual(rows[0], ['foo', 100, true], 'should parse first row correctly');
  t.deepEqual(rows[1], ['bar', 200, false], 'should parse second row correctly');

  t.end();
});

test('Processor -> processCsvData -> pipe-separated', t => {
  // Pipe-delimited input: only the header names and the row count are checked.
  const {fields, rows} = processCsvData('id|name|score\n1|Alice|95.5\n2|Bob|87.3');

  t.equal(fields.length, 3, 'should parse 3 fields from pipe-separated data');

  const expectedNames = [
    ['id', 'first field should be id'],
    ['name', 'second field should be name'],
    ['score', 'third field should be score']
  ];
  expectedNames.forEach(([name, message], idx) => {
    t.equal(fields[idx].name, name, message);
  });

  t.equal(rows.length, 2, 'should have 2 data rows');

  t.end();
});

test('Processor -> processCsvData -> semicolon with quoted commas', t => {
  // Commas appear only inside quoted cells; semicolon is the real separator.
  const lines = [
    '"City, Country";Population;Area',
    '"Berlin, Germany";3600000;891',
    '"Paris, France";2100000;105'
  ];
  const {fields, rows} = processCsvData(lines.join('\n'));

  t.equal(fields.length, 3, 'should parse 3 fields');
  t.equal(fields[0].name, 'City, Country', 'should preserve comma inside quotes');
  t.equal(rows.length, 2, 'should have 2 data rows');
  t.equal(rows[0][0], 'Berlin, Germany', 'should preserve quoted value with comma');

  t.end();
});

test('Processor -> detectDelimiter -> Windows line endings (CRLF)', t => {
  // [input, expected delimiter, assertion message]
  const cases = [
    ['a\tb\tc\r\n1\t2\t3\r\n', '\t', 'should detect tab delimiter with CRLF line endings'],
    ['a;b;c\r\n1;2;3\r\n', ';', 'should detect semicolon delimiter with CRLF line endings']
  ];

  cases.forEach(([input, expected, message]) => {
    t.equal(detectDelimiter(input), expected, message);
  });

  t.end();
});

test('Processor -> detectDelimiter -> ambiguous cases', t => {
  // Header lines mixing two candidate delimiters; the one splitting into more
  // columns is expected to win.
  const cases = [
    {
      input: 'a,b\tc,d\n1,2\t3,4',
      expected: ',',
      message: 'should prefer comma when comma produces more columns than tab'
    },
    {
      input: 'a\tb\tc\td,e\n1\t2\t3\t4,5',
      expected: '\t',
      message: 'should prefer tab when tab produces more columns than comma'
    },
    {
      input: 'a;b;c;d|e\n1;2;3;4|5',
      expected: ';',
      message: 'should prefer semicolon when it produces more columns than pipe'
    }
  ];

  for (const {input, expected, message} of cases) {
    t.equal(detectDelimiter(input), expected, message);
  }

  t.end();
});

test('Processor -> detectDelimiter -> trailing and leading whitespace', t => {
  // Spaces around values must not influence which delimiter is chosen.
  const cases = [
    [' a\tb\tc \n 1\t2\t3 ', '\t', 'should detect tab even with surrounding whitespace'],
    [' a ; b ; c \n 1 ; 2 ; 3 ', ';', 'should detect semicolon even with spaces around values']
  ];

  for (const [input, expected, message] of cases) {
    t.equal(detectDelimiter(input), expected, message);
  }

  t.end();
});

test('Processor -> processCsvData -> tab-separated with empty fields', t => {
  // Empty cells in the middle, at the start and at the end of a row.
  const {fields, rows} = processCsvData('name\tage\tcity\nAlice\t\tBerlin\n\t25\t');

  t.equal(fields.length, 3, 'should parse 3 fields');
  t.equal(rows.length, 2, 'should have 2 data rows');

  // [row index, column index, expected value, assertion message]
  const cellChecks = [
    [0, 0, 'Alice', 'first row first value should be Alice'],
    [0, 1, null, 'first row second value should be null (empty)'],
    [0, 2, 'Berlin', 'first row third value should be Berlin'],
    [1, 0, null, 'second row first value should be null (empty)']
  ];
  for (const [rowIdx, colIdx, expected, message] of cellChecks) {
    t.equal(rows[rowIdx][colIdx], expected, message);
  }

  t.end();
});

test('Processor -> processCsvData -> tab-separated with many columns', t => {
  // Build one 20-cell tab-joined line whose cells are `${prefix}${index}`.
  const makeLine = prefix => Array.from({length: 20}, (_, i) => `${prefix}${i}`).join('\t');
  const wideTsv = [makeLine('col'), makeLine('val'), makeLine('row2_')].join('\n');

  const {fields, rows} = processCsvData(wideTsv);

  t.equal(fields.length, 20, 'should parse 20 fields from wide TSV');
  t.equal(fields[0].name, 'col0', 'first field should be col0');
  t.equal(fields[19].name, 'col19', 'last field should be col19');
  t.equal(rows.length, 2, 'should have 2 data rows');

  const [firstRow] = rows;
  t.equal(firstRow[0], 'val0', 'first cell should be val0');
  t.equal(firstRow[19], 'val19', 'last cell should be val19');

  t.end();
});

test('Processor -> processCsvData -> semicolon-separated with numeric data', t => {
  // Decimal points inside values must not be confused with the separator.
  const lines = [
    'lat;lng;value',
    '52.52;13.405;1000.5',
    '48.8566;2.3522;2000.7',
    '40.4168;-3.7038;1500.3'
  ];
  const {fields, rows} = processCsvData(lines.join('\n'));

  t.equal(fields.length, 3, 'should parse 3 fields');
  t.equal(rows.length, 3, 'should have 3 data rows');

  const firstRowChecks = [
    [52.52, 'should parse lat as number'],
    [13.405, 'should parse lng as number'],
    [1000.5, 'should parse value as number']
  ];
  firstRowChecks.forEach(([expected, message], colIdx) => {
    t.equal(rows[0][colIdx], expected, message);
  });

  t.end();
});

test('Processor -> processCsvData -> pipe-separated with special characters in values', t => {
  // Quoted cells hold other candidate delimiters (comma, semicolon).
  const lines = [
    'id|description|url',
    '1|"hello, world"|http://example.com',
    '2|"foo; bar"|http://test.org'
  ];
  const {fields, rows} = processCsvData(lines.join('\n'));

  t.equal(fields.length, 3, 'should parse 3 fields');

  const [firstRow, secondRow] = rows;
  t.equal(firstRow[1], 'hello, world', 'should handle commas inside quoted pipe-separated fields');
  t.equal(secondRow[1], 'foo; bar', 'should handle semicolons inside quoted pipe-separated fields');

  t.end();
});

test('Processor -> processCsvData -> tab-separated preserves original comma data', t => {
  // Regression guard: ordinary comma-separated input must parse as before.
  const {fields, rows} = processCsvData('a,b,c\n1,2,3\n4,5,6');

  t.equal(fields.length, 3, 'regular CSV should still parse correctly');
  t.equal(fields[0].name, 'a', 'field name should be a');

  const expectedRows = [
    [[1, 2, 3], 'first row should be [1,2,3]'],
    [[4, 5, 6], 'second row should be [4,5,6]']
  ];
  expectedRows.forEach(([expected, message], rowIdx) => {
    t.deepEqual(rows[rowIdx], expected, message);
  });

  t.end();
});

test('Processor -> processCsvData -> trailing newline does not break parsing', t => {
  // Each input ends with a newline that must not be counted as an extra row.
  const cases = [
    {
      data: 'name\tage\nAlice\t30\nBob\t25\n',
      fieldCount: 2,
      fieldMessage: 'should parse 2 fields',
      rowMessage: 'should have 2 data rows (trailing newline ignored)'
    },
    {
      data: 'x;y;z\n1;2;3\n4;5;6\n',
      fieldCount: 3,
      fieldMessage: 'semicolon: should parse 3 fields',
      rowMessage: 'semicolon: trailing newline should not add empty row'
    }
  ];

  for (const {data, fieldCount, fieldMessage, rowMessage} of cases) {
    const parsed = processCsvData(data);
    t.equal(parsed.fields.length, fieldCount, fieldMessage);
    t.equal(parsed.rows.length, 2, rowMessage);
  }

  t.end();
});

test('Processor -> processCsvData -> single data row (header + 1 row)', t => {
  // Smallest accepted input: one header line and exactly one data line.
  const {fields, rows} = processCsvData(['x\ty\tz', '10\t20\t30'].join('\n'));

  t.equal(fields.length, 3, 'should parse 3 fields');
  t.equal(rows.length, 1, 'should have 1 data row');

  const [onlyRow] = rows;
  t.deepEqual(onlyRow, [10, 20, 30], 'should parse the single row correctly');

  t.end();
});

test('Processor -> processCsvData -> timestamps in TSV', t => {
  // Timestamp cells contain spaces and colons but no tab characters.
  const lines = ['timestamp\tvalue', '2023-01-15 10:30:00\t100', '2023-02-20 14:45:30\t200'];
  const {fields, rows} = processCsvData(lines.join('\n'));

  t.equal(fields.length, 2, 'should parse 2 fields');

  const [timestampField] = fields;
  t.equal(timestampField.type, 'timestamp', 'should detect timestamp type');
  t.equal(rows.length, 2, 'should have 2 rows');

  t.end();
});

test('Processor -> processCsvData -> duplicated field name', t => {
const testData1 = `column1,column1,column1,column2\na,b,c,d\nc,d,e,f`;

Expand Down
Loading