Skip to content

Commit fc63200

Browse files
authored
Merge pull request #213 from Maxteabag/duckdb-data-files
Open CSV/Parquet/JSON files directly with DuckDB
2 parents 596518a + 45d9127 commit fc63200

3 files changed

Lines changed: 406 additions & 0 deletions

File tree

sqlit/domains/connections/providers/duckdb/adapter.py

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22

33
from __future__ import annotations
44

5+
from pathlib import Path
56
from typing import TYPE_CHECKING, Any
67

78
from sqlit.domains.connections.providers.adapters.base import (
@@ -13,6 +14,11 @@
1314
TriggerInfo,
1415
resolve_file_path,
1516
)
17+
from sqlit.domains.connections.providers.duckdb.data_files import (
18+
get_read_function,
19+
sidecar_path_for,
20+
table_name_for,
21+
)
1622

1723
if TYPE_CHECKING:
1824
from sqlit.domains.connections.domain.config import ConnectionConfig
@@ -85,8 +91,45 @@ def connect(self, config: ConnectionConfig) -> Any:
8591
duckdb_any: Any = duckdb
8692
connect_args: dict[str, Any] = {}
8793
connect_args.update(config.extra_options)
94+
95+
read_fn = get_read_function(file_path)
96+
if read_fn is not None:
97+
return self._connect_data_file(
98+
duckdb_any, file_path, read_fn, connect_args
99+
)
100+
88101
return duckdb_any.connect(str(file_path), **connect_args)
89102

103+
def _connect_data_file(
    self,
    duckdb_any: Any,
    file_path: Path,
    read_fn: str,
    connect_args: dict[str, Any],
) -> Any:
    """Connect to a per-process sidecar `.duckdb` backed by a data file.

    On first connect within a sqlit process the source file is loaded
    into a real table so the user can run UPDATE/INSERT/DELETE against
    it. Subsequent connects in the same process reuse the sidecar so
    in-session edits persist across query Runs. Writing back to the
    source is explicit: `COPY <table> TO '<path>'`.

    Args:
        duckdb_any: The `duckdb` module (typed as Any by the caller).
        file_path: Path of the CSV/Parquet/JSON source file.
        read_fn: Name of the DuckDB table function used to read the
            source (e.g. `read_csv_auto`).
        connect_args: Keyword arguments forwarded to `duckdb.connect`.

    Returns:
        An open DuckDB connection to the sidecar database.

    Raises:
        Whatever `duckdb.connect` or the initial load raises. When the
        initial load fails, the connection is closed and the half-built
        sidecar is removed before the error propagates.
    """
    sidecar = sidecar_path_for(file_path)
    sidecar.parent.mkdir(parents=True, exist_ok=True)
    # The sidecar's existence doubles as the "already loaded" marker, so it
    # must be sampled before connect() creates the file.
    needs_load = not sidecar.exists()

    conn = duckdb_any.connect(str(sidecar), **connect_args)
    if not needs_load:
        return conn

    table = table_name_for(file_path)
    # Escape single quotes so the path is a valid SQL string literal.
    path_literal = str(file_path).replace("'", "''")
    try:
        conn.execute(
            f'CREATE TABLE "{table}" AS '
            f"SELECT * FROM {read_fn}('{path_literal}')"
        )
    except BaseException:
        # A failed or interrupted load would otherwise leave an empty sidecar
        # on disk; the next connect would see it, skip the load, and present
        # a database with no table. Drop it so the next attempt reloads.
        conn.close()
        # DuckDB keeps its write-ahead log beside the database as "<name>.wal".
        for leftover in (sidecar, sidecar.with_name(sidecar.name + ".wal")):
            try:
                leftover.unlink()
            except OSError:
                # Best-effort cleanup (file already gone, or still locked);
                # the original load error is what the caller needs to see.
                pass
        raise
    return conn
132+
90133
def get_databases(self, conn: Any) -> list[str]:
    """DuckDB doesn't support multiple databases - return empty list.

    Args:
        conn: Open DuckDB connection (unused).

    Returns:
        Always a new empty list.
    """
    return []
Lines changed: 106 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,106 @@
1+
"""Detection of data files queryable directly by DuckDB.
2+
3+
DuckDB's `read_csv_auto`, `read_parquet`, `read_json_auto` etc. let you
4+
query a raw data file as if it were a table. When a sqlit DuckDB connection
5+
points at one of these files instead of a `.duckdb` database, the adapter:
6+
7+
1. Picks a per-process sidecar `.duckdb` file in the OS temp dir.
8+
2. Loads the source file into a real TABLE in the sidecar on first connect.
9+
3. Lets the user CRUD the table freely; edits persist for the lifetime of
10+
the sqlit process and are wiped on restart.
11+
4. Writing back to the source file is explicit (`COPY <table> TO '<path>'`).
12+
"""
13+
14+
from __future__ import annotations
15+
16+
import hashlib
17+
import os
18+
import re
19+
import tempfile
20+
from pathlib import Path
21+
22+
23+
# File extension -> DuckDB table function that can read it.
# Keys are lower-case and include the leading dot.
_READ_FUNCTIONS: dict[str, str] = {
    ".csv": "read_csv_auto",
    ".tsv": "read_csv_auto",
    ".parquet": "read_parquet",
    ".pq": "read_parquet",
    ".json": "read_json_auto",
    ".jsonl": "read_json_auto",
    ".ndjson": "read_json_auto",
}

# Allowed compression suffixes that wrap the data extensions above. DuckDB's
# auto-readers transparently decompress these.
_COMPRESSION_SUFFIXES: frozenset[str] = frozenset({".gz", ".zst", ".bz2"})
37+
38+
39+
def get_read_function(path: Path) -> str | None:
    """Return the DuckDB table function for this file, or None if not a known
    data file extension.

    Handles compressed forms like `.csv.gz` by looking past the compression
    suffix. Matching is case-insensitive.

    Args:
        path: Path whose suffixes are inspected; the file is not opened.

    Returns:
        The table function name (e.g. `read_csv_auto`), or None.
    """
    suffixes = [s.lower() for s in path.suffixes]
    if not suffixes:
        return None

    last = suffixes[-1]
    if last not in _COMPRESSION_SUFFIXES:
        return _READ_FUNCTIONS.get(last)

    # Compressed file: the data extension is the suffix before the
    # compression one. A bare `.gz` with nothing before it is not a data file.
    if len(suffixes) < 2:
        return None
    return _READ_FUNCTIONS.get(suffixes[-2])
56+
57+
58+
def is_data_file(path: Path) -> bool:
    """True if the file extension is one DuckDB can query directly.

    Only the file name is examined (via `get_read_function`).
    """
    return get_read_function(path) is not None
61+
62+
63+
def table_name_for(path: Path) -> str:
    """Build a SQL-safe table name from a file path basename.

    Strips the data and (optional) compression extension, then sanitizes
    non-identifier characters to underscores. The result is lower-cased,
    falls back to `data` when nothing usable remains, and gets a leading
    underscore when it would otherwise start with a digit.

    Examples:
        sales.csv -> sales
        sales-2024.csv -> sales_2024
        events.json.gz -> events
        123-data.parquet -> _123_data
    """
    stem = path.name

    # Strip compression suffix if present (at most one can match).
    lower = stem.lower()
    matched = next(
        (comp for comp in _COMPRESSION_SUFFIXES if lower.endswith(comp)),
        None,
    )
    if matched is not None:
        stem = stem[: -len(matched)]

    # Strip data extension. `dot > 0` leaves a name that is only a leading
    # dot plus extension (e.g. ".csv") untouched.
    dot = stem.rfind(".")
    if dot > 0:
        stem = stem[:dot]

    sanitized = re.sub(r"[^A-Za-z0-9_]+", "_", stem).strip("_")
    if not sanitized:
        sanitized = "data"
    if sanitized[0].isdigit():
        sanitized = "_" + sanitized
    return sanitized.lower()
93+
94+
95+
def sidecar_path_for(source_path: Path) -> Path:
    """Per-process scratch `.duckdb` path for a data-file source.

    Each sqlit process gets its own directory under the OS temp dir. The
    sidecar persists for the lifetime of the process so edits within a
    sqlit session survive across query Runs. A fresh process gets a fresh
    sidecar, so source-file changes are picked up on restart and unsaved
    edits are wiped (the user opted into "re-load from source each time").

    Args:
        source_path: Path of the source data file; it is resolved to an
            absolute path before hashing.

    Returns:
        `<tmp>/sqlit-<pid>/data-files/<digest>.duckdb`, where `<digest>` is
        the first 16 hex characters of the SHA-1 of the resolved path. This
        function only computes the path; it creates nothing on disk.
    """
    # SHA-1 is used only to derive a short, stable file name from the path.
    digest = hashlib.sha1(str(source_path.resolve()).encode()).hexdigest()[:16]
    base = Path(tempfile.gettempdir()) / f"sqlit-{os.getpid()}" / "data-files"
    return base / f"{digest}.duckdb"

0 commit comments

Comments
 (0)