Skip to content

Commit e1abecb

Browse files
committed
chore: make ducklake client be a duckdataset manager
1 parent df0e21e commit e1abecb

15 files changed

Lines changed: 144 additions & 50 deletions

File tree

pysus/api/README.md

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,2 +1 @@
11
## Roadmap
2-

pysus/api/_impl/databases.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,8 +24,8 @@
2424
from typing import Literal
2525

2626
import pandas as pd
27-
from pysus.api.client import PySUS
2827
from pysus.api import types
28+
from pysus.api.client import PySUS
2929
from tqdm import tqdm
3030

3131

pysus/api/client.py

Lines changed: 11 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -11,12 +11,11 @@
1111
from typing import TYPE_CHECKING, Literal
1212

1313
import anyio
14-
15-
from pysus.api.types import Origin
1614
import duckdb
17-
from duckdb import func
1815
import pandas as pd
16+
from duckdb import func
1917
from pysus import CACHEPATH
18+
from pysus.api.types import Origin
2019
from sqlalchemy import DateTime, Enum, Integer, String, create_engine
2120
from sqlalchemy.orm import DeclarativeBase, Mapped, mapped_column, sessionmaker
2221
from sqlalchemy.pool import NullPool
@@ -518,7 +517,9 @@ async def query(
518517
all_datasets = await self._ducklake.datasets()
519518

520519
if dataset:
521-
matching = [d for d in all_datasets if d.name.lower() == dataset.lower()]
520+
matching = [
521+
d for d in all_datasets if d.name.lower() == dataset.lower()
522+
]
522523
if not matching:
523524
return []
524525
target = matching[0]
@@ -617,7 +618,9 @@ def get_columns(path: Path) -> set[tuple[str, str]]:
617618

618619
else:
619620
paths_str = ", ".join(f"'{p}'" for p in paths)
620-
query = f"SELECT * FROM read_parquet([{paths_str}], union_by_name=True)"
621+
query = (
622+
f"SELECT * FROM read_parquet([{paths_str}], union_by_name=True)"
623+
)
621624

622625
if sql:
623626
if sql.upper().startswith("SELECT"):
@@ -630,7 +633,9 @@ def get_columns(path: Path) -> set[tuple[str, str]]:
630633
if not add_dv:
631634
return base
632635

633-
geocode_cols = [col[0] for col in base.description if is_geocode_column(col[0])]
636+
geocode_cols = [
637+
col[0] for col in base.description if is_geocode_column(col[0])
638+
]
634639
if not geocode_cols:
635640
return base
636641

pysus/api/dadosgov/client.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -277,7 +277,11 @@ async def _download_file(
277277
"Client not connected. Call login(token=...) first.",
278278
)
279279

280-
url = str(file.path).replace("https:/", "https://").replace("http:/", "http://")
280+
url = (
281+
str(file.path)
282+
.replace("https:/", "https://")
283+
.replace("http:/", "http://")
284+
)
281285

282286
async with self._client.stream("GET", url) as response:
283287
response.raise_for_status()

pysus/api/dadosgov/models.py

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -30,9 +30,13 @@ def _dedup_entries(
3030
if m:
3131
stem = filename[: m.start()]
3232
fmt = m.group(1).lower()
33-
grouped.setdefault(stem, []).append((fmt, filename, recurso, metadata))
33+
grouped.setdefault(stem, []).append(
34+
(fmt, filename, recurso, metadata)
35+
)
3436
else:
35-
grouped.setdefault(filename, []).append(("", filename, recurso, metadata))
37+
grouped.setdefault(filename, []).append(
38+
("", filename, recurso, metadata)
39+
)
3640

3741
result: list[tuple[str, Any, dict]] = []
3842
for _, items in grouped.items():
@@ -245,7 +249,9 @@ class Group(BaseRemoteGroup):
245249
"""A group of files within a dataset."""
246250

247251
record: ConjuntoDados
248-
_formatter: Callable[[str], dict[str, Any]] | None = PrivateAttr(default=None)
252+
_formatter: Callable[[str], dict[str, Any]] | None = PrivateAttr(
253+
default=None
254+
)
249255

250256
def __init__(
251257
self,
@@ -313,7 +319,9 @@ async def _fetch_files(self) -> list[BaseRemoteFile]:
313319
"""Build File objects from the underlying resources."""
314320
entries: list[tuple[str, Any, dict]] = []
315321
for recurso in self.record.resources:
316-
filename = recurso.file_name or recurso.url.split("/")[-1].split("?")[0]
322+
filename = (
323+
recurso.file_name or recurso.url.split("/")[-1].split("?")[0]
324+
)
317325
if filename.lower().endswith(".pdf") or filename.startswith("get_"):
318326
continue
319327
metadata = {}

pysus/api/ducklake/catalog/columns.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6388,7 +6388,7 @@
63886388

63896389
TIPPRE = {"sia": ""}
63906390

6391-
TIPPRE = {"sia": ""}
6391+
TIPPRE = {"sia": ""}
63926392

63936393
TIP_DIARRE = {"sinan": ""}
63946394

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +0,0 @@
1-

pysus/api/ducklake/client.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -319,7 +319,9 @@ async def _download(
319319
else:
320320
raise e
321321

322-
async def _download_catalog(self, local_path: Path, remote_path: str) -> None:
322+
async def _download_catalog(
323+
self, local_path: Path, remote_path: str
324+
) -> None:
323325
"""Download a catalog database from remote storage with retries.
324326
325327
Parameters
@@ -373,7 +375,9 @@ def _get_s3_client(self):
373375
"s3",
374376
endpoint_url=f"https://{self.endpoint}",
375377
aws_access_key_id=self.credentials.access_key.get_secret_value(),
376-
aws_secret_access_key=(self.credentials.secret_key.get_secret_value()),
378+
aws_secret_access_key=(
379+
self.credentials.secret_key.get_secret_value()
380+
),
377381
region_name=self.region,
378382
config=Config(signature_version="s3v4"),
379383
)

pysus/api/ducklake/models.py

Lines changed: 5 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -8,16 +8,12 @@
88
from collections.abc import Callable
99
from datetime import datetime
1010
from pathlib import Path
11-
from typing import TYPE_CHECKING, Any, Union, Optional
11+
from typing import TYPE_CHECKING, Any, Optional, Union
1212

1313
from anyio import to_thread
1414
from pydantic import Field, PrivateAttr
1515
from pysus import CACHEPATH
16-
from pysus.api.models import (
17-
BaseRemoteDataset,
18-
BaseRemoteFile,
19-
BaseRemoteGroup,
20-
)
16+
from pysus.api.models import BaseRemoteDataset, BaseRemoteFile, BaseRemoteGroup
2117
from sqlalchemy.orm import contains_eager, joinedload, sessionmaker
2218

2319
from .catalog import CatalogDataset, CatalogFile, DatasetGroup
@@ -266,7 +262,9 @@ async def connect(
266262
return
267263

268264
await self.client._download(
269-
f"public/{self._catalog_name}", self._catalog_local, callback=callback
265+
f"public/{self._catalog_name}",
266+
self._catalog_local,
267+
callback=callback,
270268
)
271269
self._engine = await to_thread.run_sync(
272270
lambda: self.client._setup_engine(self._catalog_local)

pysus/api/extensions.py

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -252,7 +252,9 @@ def columns(self) -> list["Column"]:
252252

253253
schema = pq.read_schema(self.path)
254254
return [
255-
Column.from_schema(name=field.name, dtype=_map_dtype(str(field.type)))
255+
Column.from_schema(
256+
name=field.name, dtype=_map_dtype(str(field.type))
257+
)
256258
for field in schema
257259
]
258260

@@ -356,7 +358,9 @@ def columns(self) -> list["Column"]:
356358
"M": "VARCHAR",
357359
}
358360
return [
359-
Column.from_schema(name=f.name, dtype=_DBF_DTYPE.get(f.type, "VARCHAR"))
361+
Column.from_schema(
362+
name=f.name, dtype=_DBF_DTYPE.get(f.type, "VARCHAR")
363+
)
360364
for f in reader.fields
361365
]
362366

@@ -840,7 +844,9 @@ class DBCNotImported(BaseTabularFile):
840844

841845
path: Path = Field(default_factory=lambda: Path("..."))
842846
type: str | FileType = Field(default="remote")
843-
import_err: ClassVar[str] = """
847+
import_err: ClassVar[
848+
str
849+
] = """
844850
run "pip install pysus[dbc]" to handle DBC files.
845851
Make sure you also have libffi installed on the system. It may not work
846852
on Windows

0 commit comments

Comments
 (0)