|
| 1 | +"""Detection of data files queryable directly by DuckDB. |
| 2 | +
|
| 3 | +DuckDB's `read_csv_auto`, `read_parquet`, `read_json_auto` etc. let you |
| 4 | +query a raw data file as if it were a table. When a sqlit DuckDB connection |
| 5 | +points at one of these files instead of a `.duckdb` database, the adapter: |
| 6 | +
|
| 7 | +1. Picks a per-process sidecar `.duckdb` file in the OS temp dir. |
| 8 | +2. Loads the source file into a real TABLE in the sidecar on first connect. |
| 9 | +3. Lets the user CRUD the table freely; edits persist for the lifetime of |
| 10 | + the sqlit process and are wiped on restart. |
| 11 | +4. Writing back to the source file is explicit (`COPY <table> TO '<path>'`). |
| 12 | +""" |
| 13 | + |
| 14 | +from __future__ import annotations |
| 15 | + |
| 16 | +import hashlib |
| 17 | +import os |
| 18 | +import re |
| 19 | +import tempfile |
| 20 | +from pathlib import Path |
| 21 | + |
| 22 | + |
# Maps a lower-case file extension to the DuckDB table function used to read
# it. Extensions are grouped by reader so that adding a new alias is a
# one-word change.
_READ_FUNCTIONS: dict[str, str] = {
    extension: reader
    for reader, extensions in (
        ("read_csv_auto", (".csv", ".tsv")),
        ("read_parquet", (".parquet", ".pq")),
        ("read_json_auto", (".json", ".jsonl", ".ndjson")),
    )
    for extension in extensions
}

# Compression suffixes that may wrap one of the data extensions above
# (e.g. `.csv.gz`). DuckDB's auto-readers transparently decompress these.
_COMPRESSION_SUFFIXES: frozenset[str] = frozenset((".gz", ".zst", ".bz2"))
| 37 | + |
| 38 | + |
def get_read_function(path: Path) -> str | None:
    """Look up the DuckDB table function that reads `path`.

    Matching is case-insensitive and uses only the suffixes of the file
    name. When the final suffix is a compression suffix (e.g. `.csv.gz`),
    the suffix immediately before it selects the reader instead.

    Returns:
        The table function name, or None when the deciding suffix is
        missing or is not a known data file extension.
    """
    extensions = [suffix.lower() for suffix in path.suffixes]
    if not extensions:
        return None

    *leading, final = extensions
    if final not in _COMPRESSION_SUFFIXES:
        return _READ_FUNCTIONS.get(final)

    # A compression suffix on its own (e.g. `archive.gz`) does not identify
    # the format of the data it wraps.
    if not leading:
        return None
    return _READ_FUNCTIONS.get(leading[-1])
| 56 | + |
| 57 | + |
def is_data_file(path: Path) -> bool:
    """Report whether `path` has an extension DuckDB can query directly.

    This is the case exactly when `get_read_function` finds a reader for it.
    """
    reader = get_read_function(path)
    return reader is not None
| 61 | + |
| 62 | + |
def table_name_for(path: Path) -> str:
    """Derive a SQL-safe, lower-case table name from a file's basename.

    One trailing compression suffix (if any) and then the last remaining
    extension are dropped. Every run of characters outside `[A-Za-z0-9_]`
    becomes a single underscore, and leading/trailing underscores are
    trimmed. An empty result falls back to `data`; a result that starts
    with a digit gets a leading underscore.

    Examples:
        sales.csv        -> sales
        sales-2024.csv   -> sales_2024
        events.json.gz   -> events
        123-data.parquet -> _123_data
    """
    base = path.name
    folded = base.lower()

    # Drop the compression wrapper first so the data extension is exposed.
    wrapper = next(
        (suffix for suffix in _COMPRESSION_SUFFIXES if folded.endswith(suffix)),
        None,
    )
    if wrapper is not None:
        base = base[: -len(wrapper)]

    # Drop the last extension. `head` is empty when there is no dot or when
    # the only dot is the leading one of a dotfile; keep the name then.
    head, _, _ = base.rpartition(".")
    if head:
        base = head

    identifier = re.sub(r"[^A-Za-z0-9_]+", "_", base).strip("_") or "data"
    prefix = "_" if identifier[0].isdigit() else ""
    return (prefix + identifier).lower()
| 93 | + |
| 94 | + |
def sidecar_path_for(source_path: Path) -> Path:
    """Return the per-process scratch `.duckdb` path for a data-file source.

    The result has the form
    `<tempdir>/sqlit-<pid>/data-files/<digest>.duckdb`, where `<digest>` is
    the first 16 hex characters of the SHA-1 of the resolved source path.
    This function only computes the path; it creates no file or directory.

    Each sqlit process gets its own directory under the OS temp dir. The
    sidecar persists for the lifetime of the process so edits within a
    sqlit session survive across query Runs. A fresh process gets a fresh
    sidecar, so source-file changes are picked up on restart and unsaved
    edits are wiped (the user opted into "re-load from source each time").

    Args:
        source_path: Path of the data file the sidecar belongs to. It is
            resolved first, so different spellings of the same file map to
            the same sidecar.

    Returns:
        The sidecar database path for `source_path` in the current process.
    """
    resolved = str(source_path.resolve())
    # File names containing undecodable bytes show up in `str` as lone
    # surrogates, which plain UTF-8 encoding rejects. "surrogatepass" keeps
    # such paths hashable and yields identical bytes for ordinary text.
    # SHA-1 is only a naming scheme here, not a security measure.
    digest = hashlib.sha1(resolved.encode("utf-8", "surrogatepass")).hexdigest()[:16]
    scratch_dir = Path(tempfile.gettempdir()) / f"sqlit-{os.getpid()}" / "data-files"
    return scratch_dir / f"{digest}.duckdb"
0 commit comments