From c81445a04f513c2079055609305f70d14e8fd463 Mon Sep 17 00:00:00 2001 From: Juarez Rudsatz Date: Tue, 9 Sep 2025 19:21:51 -0300 Subject: [PATCH 1/3] Parse tables from CSV, TSV, and fixed space delimted formats --- src/parse-table.js | 78 ++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 75 insertions(+), 3 deletions(-) diff --git a/src/parse-table.js b/src/parse-table.js index beaef53..aa5f592 100644 --- a/src/parse-table.js +++ b/src/parse-table.js @@ -205,11 +205,83 @@ function parseTable(inputString) { function parseTextTable(textString) { // Split the input by line breaks for rows const rows = textString.trim().split(/\r?\n/); + const rlen = rows.length; + const len2 = rlen - 2; - // Split each row by tab delimiters or by space and tab - const matrix = rows.map((row) => row.split(/\t|\s\t/)); + // Delimiters: TAB (spreadsheets, IDEs), comma (CSV), semicolon (CSV), pipe (TSV) + // or by spaces (fixed width) + const patterns = [ '\\t|\\s\\t', ',', ';', '\\|', '\\s+' ]; + let results = []; + let columns = []; + // Finds the best pattern to split the table + for (let i = 0; i < patterns.length; i++) { + let pattern = patterns[i]; + let regex = new RegExp(pattern, 'gm'); + let matrix = rows.map((row) => row.split(regex)); + let cols = getNumSplitRows(matrix, i); + // Check if the pattern perfectly split all rows with same number of columns + if ((cols[0] > 1) && (cols[0] >= len2) && cols[1] >= rlen) { + return matrix; + } + results[i] = matrix; + columns[i] = cols; + } + // Choose the pattern that best splits the table + const sorted = columns.sort(sortByBestRowSplit); + const best = sorted[0][2]; + const res = results[best]; + // Append empty cells to make the table rectangular and avoid errors while converting + const maxCols = getMaxCols(res);; + const normalized = res.map((row) => { + if (row.length < maxCols) { + const diff = maxCols - row.length; + return row.concat(new Array(diff).fill("")); + } + return row; + }); + return normalized; +} - return matrix; +function sortByBestRowSplit(a, b) { + let res = b[0] - a[0]; // More rows with same number of columns (DESC) + if (res == 0) { + res = b[1] - a[1]; // More rows with columns split by the pattern (DESC) + if (res == 0) { + res = a[2] - b[2]; // Pattern order (ASC) gives TAB + } + } + return res +} + +function getMaxCols(matrix) { + let maxCols = 0; + let numRows = matrix.length; + for (let i = 0; i < numRows; i++) { + let cols = matrix[i].length; + if (cols > maxCols) { + maxCols = cols; + } + } + return maxCols; +} + +function getNumSplitRows(matrix, index) { + let numRowSplit = 0; + let numColsEqual = -1; + let numCols = -1; + let numRows = matrix.length; + for (let i = 0; i < numRows; i++) { + let cols = matrix[i].length; + if (cols > 1) { + numRowSplit += 1; + if (numCols <= 0) { + numCols = cols; + } else if (cols == numCols) { + numColsEqual += 1; + } + } + } + return [numColsEqual, numRowSplit, index]; } module.exports = { From e059081569f608c42aef0b1555fac0b721a166ff Mon Sep 17 00:00:00 2001 From: Juarez Rudsatz Date: Tue, 9 Sep 2025 19:22:30 -0300 Subject: [PATCH 2/3] Fix conversion when table have any row with missing columns --- src/parse-table.js | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/parse-table.js b/src/parse-table.js index aa5f592..a761b75 100644 --- a/src/parse-table.js +++ b/src/parse-table.js @@ -141,7 +141,7 @@ function parseTable(inputString) { // each column data.forEach((row) => { row.forEach((value, colIndex) => { - if (value === "") { + if (value === null || value === undefined || value === "") { // Ignore empty values return; } @@ -164,7 +164,7 @@ function parseTable(inputString) { columnTypes.forEach((type, colIndex) => { if (type === "numeric") { const values = data.map((row) => row[colIndex]).filter(value => value !== ""); - const allIntegers = values.every((value) => utils.isInt(value)); + const allIntegers = values.every((value) => value && utils.isInt(value)); if (allIntegers) { columnTypes[colIndex] = "integer"; } @@ -175,7 +175,7 @@ function parseTable(inputString) { columnTypes.forEach((type, colIndex) => { if (type === "string") { const values = data.map((row) => row[colIndex]).filter(value => value !== ""); - const allBool = values.every((value) => utils.isBool(value)); + const allBool = values.every((value) => value && utils.isBool(value)); if (allBool) { columnTypes[colIndex] = "boolean"; } From c32dc6c6235ccb84b3468fc83ca53fb37fe8e3d9 Mon Sep 17 00:00:00 2001 From: Anatolii Tsyplenkov <34775595+atsyplenkov@users.noreply.github.com> Date: Thu, 11 Sep 2025 22:13:02 +0000 Subject: [PATCH 3/3] refactor: simplify value checks in parseTable and parseTextTable functions --- src/parse-table.js | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/src/parse-table.js b/src/parse-table.js index a761b75..6e2fc84 100644 --- a/src/parse-table.js +++ b/src/parse-table.js @@ -164,18 +164,18 @@ function parseTable(inputString) { columnTypes.forEach((type, colIndex) => { if (type === "numeric") { const values = data.map((row) => row[colIndex]).filter(value => value !== ""); - const allIntegers = values.every((value) => value && utils.isInt(value)); + const allIntegers = values.every((value) => utils.isInt(value)); if (allIntegers) { columnTypes[colIndex] = "integer"; } } }); - + // Check if all values in a string column are boolean columnTypes.forEach((type, colIndex) => { if (type === "string") { const values = data.map((row) => row[colIndex]).filter(value => value !== ""); - const allBool = values.every((value) => value && utils.isBool(value)); + const allBool = values.every((value) => utils.isBool(value)); if (allBool) { columnTypes[colIndex] = "boolean"; } @@ -186,7 +186,7 @@ function parseTable(inputString) { const convertedData = data.map((row) => row.map((value, colIndex) => columnTypes[colIndex] !== "string" && - columnTypes[colIndex] !== "boolean" + columnTypes[colIndex] !== "boolean" ? utils.convertValue(value) : value ) @@ -210,7 +210,7 @@ function parseTextTable(textString) { // Delimiters: TAB (spreadsheets, IDEs), comma (CSV), semicolon (CSV), pipe (TSV) // or by spaces (fixed width) - const patterns = [ '\\t|\\s\\t', ',', ';', '\\|', '\\s+' ]; + const patterns = ['\\t|\\s\\t', ',', ';', '\\|', '\\s+']; let results = []; let columns = []; // Finds the best pattern to split the table @@ -220,7 +220,7 @@ function parseTextTable(textString) { let matrix = rows.map((row) => row.split(regex)); let cols = getNumSplitRows(matrix, i); // Check if the pattern perfectly split all rows with same number of columns - if ((cols[0] > 1) && (cols[0] >= len2) && cols[1] >= rlen) { + if (cols[0] >= rlen && cols[1] >= rlen) { return matrix; } results[i] = matrix; @@ -267,7 +267,7 @@ function getMaxCols(matrix) { function getNumSplitRows(matrix, index) { let numRowSplit = 0; - let numColsEqual = -1; + let numColsEqual = 0; let numCols = -1; let numRows = matrix.length; for (let i = 0; i < numRows; i++) { @@ -276,6 +276,7 @@ function getNumSplitRows(matrix, index) { numRowSplit += 1; if (numCols <= 0) { numCols = cols; + numColsEqual = 1; // First row establishes the column count } else if (cols == numCols) { numColsEqual += 1; }