Skip to content

Commit 3e688b3

Browse files
authored
feat: support HTTP DuckDB queries in WASM notebooks (#9480)
Motivated by marimo-team/quarto-marimo#74, marimo-team/jupyter-book-marimo#1, and #9413.

DuckDB remote file queries fail in Pyodide because DuckDB-WASM [can't use httpfs](https://duckdb.org/2024/10/02/pyodide#limitations). Therefore, URL-based SQL like `FROM 'https://...'` and `read_csv/read_parquet/read_json('https://...')` are unusable in WASM notebooks today.

This PR adds a DuckDB WASM fallback layer for `mo.sql`, SQL cells, raw `duckdb.sql/query/execute/query_df`, connection SQL methods, and direct `duckdb.read_csv/read_parquet/read_json` calls. It translates queries such as

```sql
SELECT * FROM read_csv('https://example.com/cars.csv')
SELECT * FROM 'https://example.com/cars.csv'
-- or duckdb.read_csv('https://example.com/cars.csv') via Python API
```

into

```sql
SELECT * FROM __marimo_wasm_duckdb_remote_0
```

where `__marimo_wasm_duckdb_remote_0` is bound to a fetched pandas DataFrame, which DuckDB can query through Python replacement scans.

Underneath, the fallback layer:

- uses sqlglot to analyze SQL and extract supported static remote file references from the AST
- fetches remote files through Python/urllib via marimo's shared WASM fetch util
- decodes fetched DuckDB file bytes into pandas DataFrames
- hands those DataFrames back to DuckDB under generated table names

Unsupported or dynamic cases are left to DuckDB's normal path. The patch is a no-op outside Pyodide, and in Pyodide the DuckDB SQL fallback requires `sqlglot` for AST analysis.

Tested with unit coverage for the rewrite/fetch/read paths and manually in the WASM playground against hosted CSV, parquet, JSON, and GeoJSON datasets.
## WASM Playground Demo

Demonstrates that the Pyodide-build of marimo supports querying remote files with DuckDB across cases like:

- CSV via `mo.sql`: direct URL scan
- Parquet via `duckdb.sql`: `read_parquet(...)`
- Direct `duckdb.read_csv` Python API with patched options (custom delimiter)
- JSON / GeoJSON path
- Connection API via `duckdb.connect`

https://github.com/user-attachments/assets/a2b310ed-2677-4f67-81da-3d8ea2145254
1 parent 47fc5a1 commit 3e688b3

17 files changed

Lines changed: 3526 additions & 52 deletions

File tree

frontend/src/core/islands/worker/worker.tsx

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ import {
99
} from "rpc-anywhere";
1010
import type { NotificationPayload } from "@/core/kernel/messages";
1111
import type { ParentSchema } from "@/core/wasm/rpc";
12+
import { shouldLoadDuckDBPackages } from "@/core/wasm/utils";
1213
import { TRANSPORT_ID } from "@/core/wasm/worker/constants";
1314
import { getPyodideVersion } from "@/core/wasm/worker/getPyodideVersion";
1415
import { MessageBuffer } from "@/core/wasm/worker/message-buffer";
@@ -85,8 +86,8 @@ const requestHandler = createRPCRequestHandler({
8586
loadPackages: async (code: string) => {
8687
await pyodideReadyPromise; // Make sure loading is done
8788

88-
if (code.includes("mo.sql")) {
89-
// Add pandas and duckdb to the code
89+
if (shouldLoadDuckDBPackages(code)) {
90+
// Add pandas and duckdb to the code for mo.sql and for remote duckdb sources
9091
code = `import pandas\n${code}`;
9192
code = `import duckdb\n${code}`;
9293
code = `import sqlglot\n${code}`;
Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
/* Copyright 2026 Marimo. All rights reserved. */
2+
3+
import { describe, expect, it } from "vitest";
4+
import { shouldLoadDuckDBPackages } from "../utils";
5+
6+
// Unit tests for shouldLoadDuckDBPackages: each case pins one of the three
// triggers (the "mo.sql" substring, the DuckDB usage regex, or a discovered
// "duckdb" package) or one of the inputs that must NOT trigger loading.
describe("shouldLoadDuckDBPackages", () => {
7+
// Trigger 1: the literal substring "mo.sql" appears in the code.
it("loads for mo.sql", () => {
8+
expect(shouldLoadDuckDBPackages('df = mo.sql("SELECT 1")')).toBe(true);
9+
});
10+
11+
// Trigger 2: the regex recognises `import ... duckdb`, `from duckdb ...`,
// and attribute access of the form `duckdb.<name>`.
it("loads for duckdb imports and usage", () => {
12+
expect(shouldLoadDuckDBPackages("import duckdb")).toBe(true);
13+
expect(shouldLoadDuckDBPackages("from duckdb import sql")).toBe(true);
14+
expect(shouldLoadDuckDBPackages("import pandas, duckdb")).toBe(true);
15+
expect(shouldLoadDuckDBPackages("rows = duckdb.sql('SELECT 1')")).toBe(
16+
true,
17+
);
18+
});
19+
20+
// Trigger 3: the optional second argument already contains "duckdb", even
// though the code text itself never mentions it.
it("loads when package discovery found duckdb", () => {
21+
expect(
22+
shouldLoadDuckDBPackages("print('hello')", new Set(["duckdb"])),
23+
).toBe(true);
24+
});
25+
26+
// Negative cases: the bare word inside a string literal (no trailing dot)
// and an import that sits behind a `#` comment marker must not match.
it("does not load for incidental duckdb text", () => {
27+
expect(shouldLoadDuckDBPackages("name = 'duckdb'")).toBe(false);
28+
expect(shouldLoadDuckDBPackages("# import duckdb")).toBe(false);
29+
});
30+
31+
// Baseline: none of the three triggers is present.
it("does not load without mo.sql, duckdb usage, or discovery", () => {
32+
expect(shouldLoadDuckDBPackages("print('hello')")).toBe(false);
33+
});
34+
});

frontend/src/core/wasm/utils.ts

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,3 +10,17 @@ export function isWasm(): boolean {
1010
document.querySelector("marimo-wasm") !== null
1111
);
1212
}
13+
14+
// Heuristic, line-oriented detector for DuckDB usage in notebook source.
//
// A match must begin at the start of the input or right after a newline,
// followed by optional whitespace, and then one of three alternatives:
//   1. `import` + whitespace + any run of characters that contains neither a
//      newline nor `#`, ending in the whole word `duckdb`
//      (covers `import duckdb` and `import pandas, duckdb`);
//   2. `from` + whitespace + the whole word `duckdb`
//      (covers `from duckdb import sql`);
//   3. any run of characters that contains neither a newline nor `#`,
//      followed by the whole word `duckdb`, optional whitespace, and a `.`
//      (covers attribute access such as `rows = duckdb.sql(...)`).
//
// Excluding `#` from the per-line runs is what keeps text that follows a
// comment marker (e.g. `# import duckdb`) from matching.
//
// NOTE(review): this is not a Python tokenizer. Alternative 3 also matches
// `duckdb.` inside a string literal (e.g. a URL containing "duckdb.org"),
// and a `#` that appears earlier on the line inside a string literal blocks
// a match for real usage later on that line.
const DUCKDB_USAGE_PATTERN =
15+
/(^|\n)\s*(?:import\s+[^\n#]*\bduckdb\b|from\s+duckdb\b|[^\n#]*\bduckdb\s*\.)/;
16+
17+
/**
 * Decide whether the DuckDB-related packages should be loaded for a piece of
 * notebook code.
 *
 * Returns `true` when any one of the following holds:
 * - `code` contains the substring `"mo.sql"` (a plain substring check, so it
 *   is not aware of comments or string literals);
 * - `code` matches `DUCKDB_USAGE_PATTERN`;
 * - `foundPackages` is provided and contains `"duckdb"`.
 *
 * @param code - Notebook source text to inspect.
 * @param foundPackages - Optional set of package names; only membership of
 *   `"duckdb"` is consulted. When omitted, this trigger is simply skipped.
 * @returns `true` if any trigger fires, otherwise `false`.
 */
export function shouldLoadDuckDBPackages(
18+
code: string,
19+
foundPackages?: ReadonlySet<string>,
20+
): boolean {
21+
return (
22+
code.includes("mo.sql") ||
23+
DUCKDB_USAGE_PATTERN.test(code) ||
24+
// `=== true` collapses the `undefined` produced by optional chaining
// (no set passed) into `false`, so the function always returns a boolean.
foundPackages?.has("duckdb") === true
25+
);
26+
}

frontend/src/core/wasm/worker/bootstrap.ts

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ import { WasmFileSystem } from "./fs";
99
import { getMarimoWheel } from "./getMarimoWheel";
1010
import { t } from "./tracer";
1111
import type { SerializedBridge, WasmController } from "./types";
12+
import { shouldLoadDuckDBPackages } from "../utils";
1213

1314
const MAKE_SNAPSHOT = false;
1415

@@ -163,8 +164,8 @@ export class DefaultWasmController implements WasmController {
163164
private async loadNotebookDeps(code: string, foundPackages: Set<string>) {
164165
const pyodide = this.requirePyodide;
165166

166-
if (code.includes("mo.sql")) {
167-
// We need pandas and duckdb for mo.sql
167+
if (shouldLoadDuckDBPackages(code, foundPackages)) {
168+
// We need pandas and duckdb for mo.sql and for remote duckdb sources
168169
code = `import pandas\n${code}`;
169170
code = `import duckdb\n${code}`;
170171
code = `import sqlglot\n${code}`;

frontend/src/core/wasm/worker/worker.ts

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@ import type {
3434
SerializedBridge,
3535
WasmController,
3636
} from "./types";
37+
import { shouldLoadDuckDBPackages } from "../utils";
3738

3839
/**
3940
* Web worker responsible for running the notebook.
@@ -141,8 +142,8 @@ const requestHandler = createRPCRequestHandler({
141142
const span = t.startSpan("loadPackages");
142143
await pyodideReadyPromise; // Make sure loading is done
143144

144-
if (code.includes("mo.sql")) {
145-
// Add pandas and duckdb to the code
145+
if (shouldLoadDuckDBPackages(code)) {
146+
// Add pandas and duckdb to the code for mo.sql and for remote duckdb sources
146147
code = `import pandas\n${code}`;
147148
code = `import duckdb\n${code}`;
148149
code = `import sqlglot\n${code}`;

marimo/_output/formatters/df_formatters.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
from marimo._plugins.stateless.plain_text import plain_text
2020
from marimo._plugins.ui._impl import tabs
2121
from marimo._plugins.ui._impl.table import get_default_table_page_size, table
22+
from marimo._runtime._wasm._duckdb import patch_duckdb_for_wasm
2223
from marimo._runtime._wasm._polars import patch_polars_for_wasm
2324

2425
LOGGER = _loggers.marimo_logger()
@@ -160,6 +161,17 @@ def _show_marimo_dataframe(
160161
return table(df, selection=None, pagination=None)._mime_()
161162

162163

164+
class DuckDBFormatter(FormatterFactory):
165+
"""Use DuckDB's lazy import hook to install WASM runtime patches."""
166+
167+
# Package name this factory is associated with.
# NOTE(review): presumably the key that triggers register() when the
# "duckdb" package is imported — confirm against FormatterFactory.
@staticmethod
168+
def package_name() -> str:
169+
return "duckdb"
170+
171+
# Delegates entirely to patch_duckdb_for_wasm() and returns its result as
# the Unregister handle; no formatter is registered by this class itself.
def register(self) -> Unregister:
172+
return patch_duckdb_for_wasm()
173+
174+
163175
class DataFusionFormatter(FormatterFactory):
164176
@staticmethod
165177
def package_name() -> str:

marimo/_output/formatters/formatters.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
from marimo._output.formatters.cell import CellFormatter
1919
from marimo._output.formatters.df_formatters import (
2020
DataFusionFormatter,
21+
DuckDBFormatter,
2122
IbisFormatter,
2223
PolarsFormatter,
2324
PyArrowFormatter,
@@ -54,6 +55,7 @@
5455
AltairFormatter.package_name(): AltairFormatter(),
5556
MatplotlibFormatter.package_name(): MatplotlibFormatter(),
5657
DataFusionFormatter.package_name(): DataFusionFormatter(),
58+
DuckDBFormatter.package_name(): DuckDBFormatter(),
5759
IbisFormatter.package_name(): IbisFormatter(),
5860
PandasFormatter.package_name(): PandasFormatter(),
5961
PolarsFormatter.package_name(): PolarsFormatter(),

0 commit comments

Comments
 (0)