From 37879943b5666bb797b8ee0408d3725f7e8a848f Mon Sep 17 00:00:00 2001 From: Sylvain Lesage Date: Fri, 22 Aug 2025 14:28:02 -0400 Subject: [PATCH 1/6] update dependencies --- package.json | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/package.json b/package.json index 7083b411..2fbb0148 100644 --- a/package.json +++ b/package.json @@ -55,8 +55,8 @@ "watch:url": "NODE_ENV=development nodemon bin/cli.js https://hyperparam.blob.core.windows.net/hyperparam/starcoderdata-js-00000-of-00065.parquet" }, "dependencies": { - "hightable": "0.18.5", - "hyparquet": "1.17.7", + "hightable": "0.19.3", + "hyparquet": "1.17.8", "hyparquet-compressors": "1.1.1", "icebird": "0.3.0", "react": "18.3.1", @@ -76,14 +76,14 @@ "eslint-plugin-react-hooks": "5.2.0", "eslint-plugin-react-refresh": "0.4.20", "eslint-plugin-storybook": "9.1.5", - "globals": "16.3.0", + "globals": "16.4.0", "jsdom": "26.1.0", "nodemon": "3.1.10", "npm-run-all": "4.1.5", "storybook": "9.1.5", "typescript": "5.8.3", - "typescript-eslint": "8.42.0", - "vite": "7.1.4", + "typescript-eslint": "8.43.0", + "vite": "7.1.5", "vitest": "3.2.4" }, "eslintConfig": { From 1208edfd6c6fb6916c5e8a27cdab695230eab351 Mon Sep 17 00:00:00 2001 From: Sylvain Lesage Date: Fri, 22 Aug 2025 14:28:20 -0400 Subject: [PATCH 2/6] update code to hightable 0.19.0 --- src/components/Cell/Cell.tsx | 4 ++-- src/components/CellPanel/CellPanel.tsx | 6 +++--- src/components/ParquetView/ParquetView.tsx | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/components/Cell/Cell.tsx b/src/components/Cell/Cell.tsx index 2b7f7438..dcbda3c5 100644 --- a/src/components/Cell/Cell.tsx +++ b/src/components/Cell/Cell.tsx @@ -42,11 +42,11 @@ export default function CellView({ source, row, col }: CellProps) { setProgress(0.75) const df = parquetDataFrame(from, metadata) - const columnName = df.header[col] + const columnName = df.columnDescriptors[col]?.name if (columnName === undefined) { throw new Error(`Column name missing at index col=${col}`) } - await df.fetch({ rowStart: row, rowEnd: row + 1, columns: [columnName] }) + await df.fetch?.({ rowStart: row, rowEnd: row + 1, columns: [columnName] }) const cell = df.getCell({ row, column: columnName }) const text = cell === undefined ? UNLOADED_CELL_PLACEHOLDER : stringify(cell.value) setText(text) diff --git a/src/components/CellPanel/CellPanel.tsx b/src/components/CellPanel/CellPanel.tsx index 6c568e50..35b55b22 100644 --- a/src/components/CellPanel/CellPanel.tsx +++ b/src/components/CellPanel/CellPanel.tsx @@ -58,7 +58,7 @@ export default function CellPanel({ df, row, col, setProgress, setError, onClose try { setProgress(0.5) - const columnName = df.header[col] + const columnName = df.columnDescriptors[col]?.name if (columnName === undefined) { throw new Error(`Column name missing at index col=${col}`) } @@ -67,7 +67,7 @@ export default function CellPanel({ df, row, col, setProgress, setError, onClose fillContent(undefined) return } - await df.fetch({ rowStart: row, rowEnd: row + 1, columns: [columnName] }) + await df.fetch?.({ rowStart: row, rowEnd: row + 1, columns: [columnName] }) cell = df.getCell({ row, column: columnName }) if (cell === undefined) { throw new Error(`Cell at row=${row}, column=${columnName} is undefined`) @@ -85,7 +85,7 @@ export default function CellPanel({ df, row, col, setProgress, setError, onClose const headers = <> - column: {df.header[col]} + column: {df.columnDescriptors[col]?.name} row: {row + 1} diff --git a/src/components/ParquetView/ParquetView.tsx b/src/components/ParquetView/ParquetView.tsx index 2dc8b0cd..f87462f3 100644 --- a/src/components/ParquetView/ParquetView.tsx +++ b/src/components/ParquetView/ParquetView.tsx @@ -82,7 +82,7 @@ export default function ParquetView({ source, setProgress, setError }: ViewerPro if (cell?.col === col && cell.row === row) { return undefined } - const columnName = content?.dataframe.header[col] + const columnName = content?.dataframe.columnDescriptors[col]?.name if (columnName === undefined || !content?.dataframe.getCell({ row, column: columnName })) { // don't open the cell panel until it has loaded return undefined From 96f533ed3f8a66a00a44c2b0d21d5ae9857ac361 Mon Sep 17 00:00:00 2001 From: Sylvain Lesage Date: Fri, 22 Aug 2025 15:38:31 -0400 Subject: [PATCH 3/6] update code related to hightable 0.19.1 --- src/lib/tableProvider.ts | 53 ++++++++++++---------------------------- 1 file changed, 15 insertions(+), 38 deletions(-) diff --git a/src/lib/tableProvider.ts b/src/lib/tableProvider.ts index e46ddac5..1ebc9c38 100644 --- a/src/lib/tableProvider.ts +++ b/src/lib/tableProvider.ts @@ -1,4 +1,4 @@ -import { DataFrame, DataFrameEvents, ResolvedValue, UnsortableDataFrame, createEventTarget, sortableDataFrame } from 'hightable' +import { DataFrame, DataFrameEvents, ResolvedValue, checkSignal, createEventTarget, sortableDataFrame, validateFetchParams, validateGetCellParams, validateGetRowNumberParams } from 'hightable' import type { ColumnData } from 'hyparquet' import { FileMetaData, ParquetReadOptions, parquetSchema } from 'hyparquet' import { parquetReadWorker } from './workers/parquetWorkerClient.js' @@ -20,13 +20,15 @@ interface VirtualRowGroup { /** * Convert a parquet file into a dataframe. + * + * It's sortable on all the columns, and fetches data on demand in chunks of 1000 rows. */ -export function parquetDataFrame(from: AsyncBufferFrom, metadata: FileMetaData, options?: Pick): DataFrame { +export function parquetDataFrame(from: AsyncBufferFrom, metadata: FileMetaData, options?: Pick): DataFrame<{parquet: FileMetaData}> { const { children } = parquetSchema(metadata) - const header = children.map(child => child.element.name) + const columnDescriptors = children.map(child => ({ name: child.element.name })) const eventTarget = createEventTarget() - const cellCache = new Map[]>(header.map(name => [name, []])) + const cellCache = new Map[]>(columnDescriptors.map(({ name }) => [name, []])) // virtual row groups are up to 1000 rows within row group boundaries const groups: VirtualRowGroup[] = [] @@ -39,7 +41,7 @@ export function parquetDataFrame(from: AsyncBufferFrom, metadata: FileMetaData, groups.push({ groupStart, groupEnd, - state: new Map(header.map(name => [name, { kind: 'unfetched' }])), + state: new Map(columnDescriptors.map(({ name }) => [name, { kind: 'unfetched' }])), }) groupStart = groupEnd } @@ -84,22 +86,21 @@ export function parquetDataFrame(from: AsyncBufferFrom, metadata: FileMetaData, const numRows = Number(metadata.num_rows) - const unsortableDataFrame: UnsortableDataFrame = { - header, + const unsortableDataFrame: DataFrame<{parquet: FileMetaData}> = { + columnDescriptors, numRows, - metadata, + metadata: { parquet: metadata }, eventTarget, - getRowNumber({ row }) { - validateRow({ row, data: { numRows } }) + getRowNumber({ row, orderBy }) { + validateGetRowNumberParams({ row, orderBy, data: { numRows, columnDescriptors } }) return { value: row } }, - getCell({ row, column }) { - validateRow({ row, data: { numRows } }) - validateColumn({ column, data: { header } }) + getCell({ row, column, orderBy }) { + validateGetCellParams({ row, column, orderBy, data: { numRows, columnDescriptors } }) return cellCache.get(column)?.[row] }, fetch: async ({ rowStart, rowEnd, columns, signal }) => { - validateFetchParams({ rowStart, rowEnd, columns, data: { numRows, header } }) + validateFetchParams({ rowStart, rowEnd, columns, data: { numRows, columnDescriptors } }) checkSignal(signal) if (!columns || columns.length === 0) { @@ -128,27 +129,3 @@ export function parquetDataFrame(from: AsyncBufferFrom, metadata: FileMetaData, return sortableDataFrame(unsortableDataFrame) } - -function validateFetchParams({ rowStart, rowEnd, columns, data: { numRows, header } }: {rowStart: number, rowEnd: number, columns?: string[], data: Pick}): void { - if (rowStart < 0 || rowEnd > numRows || !Number.isInteger(rowStart) || !Number.isInteger(rowEnd) || rowStart > rowEnd) { - throw new Error(`Invalid row range: ${rowStart} - ${rowEnd}, numRows: ${numRows}`) - } - if (columns?.some(column => !header.includes(column))) { - throw new Error(`Invalid columns: ${columns.join(', ')}. Available columns: ${header.join(', ')}`) - } -} -function validateRow({ row, data: { numRows } }: {row: number, data: Pick}): void { - if (row < 0 || row >= numRows || !Number.isInteger(row)) { - throw new Error(`Invalid row index: ${row}, numRows: ${numRows}`) - } -} -function validateColumn({ column, data: { header } }: {column: string, data: Pick}): void { - if (!header.includes(column)) { - throw new Error(`Invalid column: ${column}. Available columns: ${header.join(', ')}`) - } -} -function checkSignal(signal?: AbortSignal): void { - if (signal?.aborted) { - throw new DOMException('The operation was aborted.', 'AbortError') - } -} From 098e7a72ccdd1ac6b4a420104c374664ff8b294a Mon Sep 17 00:00:00 2001 From: Sylvain Lesage Date: Fri, 22 Aug 2025 16:02:07 -0400 Subject: [PATCH 4/6] fix types in worker --- src/lib/workers/parquetWorker.ts | 10 +++++----- src/lib/workers/parquetWorkerClient.ts | 8 +++++++- src/lib/workers/types.ts | 9 ++++++--- 3 files changed, 18 insertions(+), 9 deletions(-) diff --git a/src/lib/workers/parquetWorker.ts b/src/lib/workers/parquetWorker.ts index 06fb2667..80e6f3fa 100644 --- a/src/lib/workers/parquetWorker.ts +++ b/src/lib/workers/parquetWorker.ts @@ -1,7 +1,7 @@ import type { ColumnData } from 'hyparquet' import { AsyncBuffer, parquetQuery, parquetRead, parquetReadObjects } from 'hyparquet' import { compressors } from 'hyparquet-compressors' -import type { ChunkMessage, ClientMessage, CompleteMessage, PageMessage, ParquetQueryResolveMessage, ParquetReadObjectsResolveMessage, ParquetReadResolveMessage, RejectMessage } from './types.js' +import type { ChunkMessage, ClientMessage, CompleteMessage, PageMessage, ParquetQueryResolveMessage, ParquetReadObjectsResolveMessage, ParquetReadResolveMessage, RejectMessage, Rows } from './types.js' import { fromToAsyncBuffer } from './utils.js' const cache = new Map>() @@ -33,20 +33,20 @@ self.onmessage = async ({ data }: { data: ClientMessage }) => { const file = await fromToAsyncBuffer(from, cache) try { if (kind === 'parquetReadObjects') { - const rows = await parquetReadObjects({ ...options, file, compressors, onChunk, onPage }) + const rows = (await parquetReadObjects({ ...options, rowFormat: 'object', file, compressors, onChunk, onPage })) as Rows postParquetReadObjectsResultMessage({ queryId, rows }) } else if (kind === 'parquetQuery') { - const rows = await parquetQuery({ ...options, file, compressors, onComplete, onChunk, onPage }) + const rows = (await parquetQuery({ ...options, rowFormat: 'object', file, compressors, onComplete, onChunk, onPage })) as Rows postParquetQueryResultMessage({ queryId, rows }) } else { - await parquetRead({ ...options, file, compressors, onComplete, onChunk, onPage }) + await parquetRead({ ...options, rowFormat: 'object', file, compressors, onComplete, onChunk, onPage }) postParquetReadResultMessage({ queryId }) } } catch (error) { postErrorMessage({ error: error as Error, queryId }) } - function onComplete(rows: unknown[][]) { + function onComplete(rows: Rows) { postCompleteMessage({ queryId, rows }) } function onChunk(chunk: ColumnData) { diff --git a/src/lib/workers/parquetWorkerClient.ts b/src/lib/workers/parquetWorkerClient.ts index 6d095e32..7234f2eb 100644 --- a/src/lib/workers/parquetWorkerClient.ts +++ b/src/lib/workers/parquetWorkerClient.ts @@ -7,7 +7,7 @@ import type { ClientMessage, ParquetQueryWorkerOptions, ParquetReadObjectsWorker let worker: Worker | undefined let nextQueryId = 0 interface Agent { - onComplete?: (rows: Rows) => void + onComplete?: ((rows: Rows) => void) onChunk?: (chunk: ColumnData) => void onPage?: (page: ColumnData) => void reject: (error: Error) => void @@ -73,6 +73,8 @@ function getWorker() { * Instead of taking an AsyncBuffer, it takes a AsyncBufferFrom, because it needs * to be serialized to the worker. Also: the worker uses hyparquet-compressors and * the default parsers. + * + * Note that it only supports 'rowFormat: object' (the default). */ export function parquetReadWorker(options: ParquetReadWorkerOptions): Promise { const { onComplete, onChunk, onPage, from, ...serializableOptions } = options @@ -91,6 +93,8 @@ export function parquetReadWorker(options: ParquetReadWorkerOptions): Promise { const { onChunk, onPage, from, ...serializableOptions } = options @@ -109,6 +113,8 @@ export function parquetReadObjectsWorker(options: ParquetReadObjectsWorkerOption * Instead of taking an AsyncBuffer, it takes a AsyncBufferFrom, because it needs * to be serialized to the worker. Also: the worker uses hyparquet-compressors and * the default parsers. + * + * Note that it only supports 'rowFormat: object' (the default). */ export function parquetQueryWorker(options: ParquetQueryWorkerOptions): Promise { const { onComplete, onChunk, onPage, from, ...serializableOptions } = options diff --git a/src/lib/workers/types.ts b/src/lib/workers/types.ts index d2ce699f..caa868c3 100644 --- a/src/lib/workers/types.ts +++ b/src/lib/workers/types.ts @@ -16,7 +16,8 @@ interface AsyncBufferFromUrl { } export type AsyncBufferFrom = AsyncBufferFromFile | AsyncBufferFromUrl -export type Rows = unknown[][] | Record[] +// Only rowFormat 'object' is supported in the worker +export type Rows = Record[] /** * Options for the worker version of parquetRead @@ -25,9 +26,11 @@ export type Rows = unknown[][] | Record[] * - 'compressors' are not configurable, the worker uses hyparquet-compressors * - 'parsers' are not configurable, the worker uses the default parsers */ -export interface ParquetReadWorkerOptions extends Omit { - onComplete?: (rows: Rows) => void // fix for https://github.com/hyparam/hyparquet/issues/28 +export interface ParquetReadWorkerOptions extends Omit { from: AsyncBufferFrom + // rowFormat 'array' is not supported in the worker. + rowFormat?: 'object' + onComplete?: (rows: Rows) => void } /** * Options for the worker version of parquetReadObjects From 985441dd357e444d7690ffedd89579ab21adbce5 Mon Sep 17 00:00:00 2001 From: Sylvain Lesage Date: Fri, 22 Aug 2025 16:04:56 -0400 Subject: [PATCH 5/6] parquetDataFrame returns an unsortable dataframe --- src/lib/tableProvider.ts | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/lib/tableProvider.ts b/src/lib/tableProvider.ts index 1ebc9c38..f65e6916 100644 --- a/src/lib/tableProvider.ts +++ b/src/lib/tableProvider.ts @@ -1,4 +1,4 @@ -import { DataFrame, DataFrameEvents, ResolvedValue, checkSignal, createEventTarget, sortableDataFrame, validateFetchParams, validateGetCellParams, validateGetRowNumberParams } from 'hightable' +import { DataFrame, DataFrameEvents, ResolvedValue, checkSignal, createEventTarget, validateFetchParams, validateGetCellParams, validateGetRowNumberParams } from 'hightable' import type { ColumnData } from 'hyparquet' import { FileMetaData, ParquetReadOptions, parquetSchema } from 'hyparquet' import { parquetReadWorker } from './workers/parquetWorkerClient.js' @@ -21,7 +21,8 @@ interface VirtualRowGroup { /** * Convert a parquet file into a dataframe. * - * It's sortable on all the columns, and fetches data on demand in chunks of 1000 rows. + * It fetches data on demand in chunks of 1000 rows within each row group. + * It's not sortable. You can use sortableDataFrame from hightable to make it sortable. */ export function parquetDataFrame(from: AsyncBufferFrom, metadata: FileMetaData, options?: Pick): DataFrame<{parquet: FileMetaData}> { const { children } = parquetSchema(metadata) @@ -127,5 +128,5 @@ export function parquetDataFrame(from: AsyncBufferFrom, metadata: FileMetaData, }, } - return sortableDataFrame(unsortableDataFrame) + return unsortableDataFrame } From 7730ca864629046e64b8a7ef833790e7bef44154 Mon Sep 17 00:00:00 2001 From: Sylvain Lesage Date: Sat, 13 Sep 2025 00:28:19 +0200 Subject: [PATCH 6/6] upgrade deps --- package.json | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/package.json b/package.json index 2fbb0148..37b260a2 100644 --- a/package.json +++ b/package.json @@ -55,7 +55,7 @@ "watch:url": "NODE_ENV=development nodemon bin/cli.js https://hyperparam.blob.core.windows.net/hyperparam/starcoderdata-js-00000-of-00065.parquet" }, "dependencies": { - "hightable": "0.19.3", + "hightable": "0.19.4", "hyparquet": "1.17.8", "hyparquet-compressors": "1.1.1", "icebird": "0.3.0", @@ -66,8 +66,8 @@ "@eslint/js": "9.35.0", "@storybook/react-vite": "9.1.5", "@testing-library/react": "16.3.0", - "@types/node": "24.3.1", - "@types/react": "19.1.12", + "@types/node": "24.3.2", + "@types/react": "19.1.13", "@types/react-dom": "19.1.9", "@vitejs/plugin-react": "5.0.2", "@vitest/coverage-v8": "3.2.4",