Skip to content

Commit 714edad

Browse files
authored
feat(ducklake): split the catalog.db into individual files for each dataset (#279)
* feat(ducklake): split the catalog.db into individual files for each dataset * feat(api): include global variables feat(api): start creating the detailed report classes for the files feat(api): start splitting the catalog.db into catalogs for each dataset * chore: move s3 requests to DuckDataset * chore: make ducklake client be a duckdataset manager * chore: make ducklake client be a duckdataset manager * chore: dintinguish between default catalog.db and specific dataset catalog in the ORM * tests: include more tests
1 parent 41d5010 commit 714edad

45 files changed

Lines changed: 13790 additions & 876 deletions

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

.idea/PySUS.iml

Lines changed: 0 additions & 17 deletions
This file was deleted.

.idea/misc.xml

Lines changed: 0 additions & 10 deletions
This file was deleted.

poetry.lock

Lines changed: 16 additions & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pyproject.toml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,8 @@ pydantic = "^2.12.5"
3131
duckdb = "^1.4.4"
3232
duckdb-engine = "^0.17.0"
3333
sqlalchemy = "^2.0.48"
34-
python-magic = "^0.4.27"
34+
python-magic = { version = "*", platform = "!=win32" }
35+
python-magic-bin = { version = "*", platform = "win32" }
3536
chardet = "^7.4.0.post2"
3637
anyio = "^4.13.0"
3738
httpx = ">=0.28.0"

pysus/api/README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
## Roadmap

pysus/api/_impl/databases.py

Lines changed: 14 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -24,8 +24,8 @@
2424
from typing import Literal
2525

2626
import pandas as pd
27+
from pysus.api import types
2728
from pysus.api.client import PySUS
28-
from pysus.api.types import State
2929
from tqdm import tqdm
3030

3131

@@ -220,7 +220,7 @@ def sinan(
220220

221221

222222
def sinasc(
223-
state: State,
223+
state: types.State,
224224
year: int | list[int],
225225
group: str | None = None,
226226
**kwargs,
@@ -232,7 +232,7 @@ def sinasc(
232232
233233
Parameters
234234
----------
235-
state : State
235+
state : types.State
236236
Two-letter state abbreviation (e.g. ``"RJ"``).
237237
year : int | list[int]
238238
Year or list of years to fetch.
@@ -255,7 +255,7 @@ def sinasc(
255255

256256

257257
def sim(
258-
state: State,
258+
state: types.State,
259259
year: int | list[int],
260260
group: str | None = None,
261261
**kwargs,
@@ -290,7 +290,7 @@ def sim(
290290

291291

292292
def sih(
293-
state: State,
293+
state: types.State,
294294
year: int | list[int],
295295
month: int | list[int],
296296
group: str | None = None,
@@ -303,7 +303,7 @@ def sih(
303303
304304
Parameters
305305
----------
306-
state : State
306+
state : types.State
307307
Two-letter state abbreviation (e.g. ``"RJ"``).
308308
year : int | list[int]
309309
Year or list of years to fetch.
@@ -329,7 +329,7 @@ def sih(
329329

330330

331331
def sia(
332-
state: State,
332+
state: types.State,
333333
year: int | list[int],
334334
month: int | list[int],
335335
group: str | None = None,
@@ -342,7 +342,7 @@ def sia(
342342
343343
Parameters
344344
----------
345-
state : State
345+
state : types.State
346346
Two-letter state abbreviation (e.g. ``"RJ"``).
347347
year : int | list[int]
348348
Year or list of years to fetch.
@@ -368,7 +368,7 @@ def sia(
368368

369369

370370
def pni(
371-
state: State,
371+
state: types.State,
372372
year: int | list[int],
373373
group: str | None = None,
374374
**kwargs,
@@ -430,7 +430,7 @@ def ibge(
430430

431431

432432
def cnes(
433-
state: State,
433+
state: types.State,
434434
year: int | list[int],
435435
month: int | list[int],
436436
group: str | None = None,
@@ -469,7 +469,7 @@ def cnes(
469469

470470

471471
def ciha(
472-
state: State,
472+
state: types.State,
473473
year: int | list[int],
474474
month: int | list[int],
475475
group: str | None = "CIHA",
@@ -508,18 +508,8 @@ def ciha(
508508

509509

510510
def list_files(
511-
dataset: Literal[
512-
"SINAN",
513-
"SINASC",
514-
"SIM",
515-
"SIH",
516-
"SIA",
517-
"PNI",
518-
"IBGE",
519-
"CNES",
520-
"CIHA",
521-
],
522-
client: Literal["FTP", "DadosGov"] | None = None,
511+
dataset: types.DatasetName,
512+
client: types.Origin | None = None,
523513
group: str | None = None,
524514
state: str | None = None,
525515
year: int | list[int] | None = None,
@@ -536,7 +526,7 @@ def list_files(
536526
----------
537527
dataset : Literal
538528
Dataset name (e.g. ``"SINAN"``, ``"SINASC"``, etc.).
539-
client : Literal["FTP", "DadosGov"], optional
529+
client : Origin, optional
540530
Data source client to query.
541531
group : str, optional
542532
Group or disease code to filter by.

pysus/api/client.py

Lines changed: 59 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,9 @@
1313
import anyio
1414
import duckdb
1515
import pandas as pd
16+
from duckdb import func
1617
from pysus import CACHEPATH
18+
from pysus.api.types import Origin
1719
from sqlalchemy import DateTime, Enum, Integer, String, create_engine
1820
from sqlalchemy.orm import DeclarativeBase, Mapped, mapped_column, sessionmaker
1921
from sqlalchemy.pool import NullPool
@@ -24,7 +26,7 @@
2426
from .ftp import FTPClient
2527
from .models import BaseLocalFile, BaseRemoteFile
2628

27-
if TYPE_CHECKING:
29+
if TYPE_CHECKING: # pragma: no cover
2830
from duckdb import DuckDBPyConnection
2931

3032

@@ -101,7 +103,7 @@ async def __aenter__(self):
101103
"""Set up DuckLake catalog and return self as async context manager."""
102104

103105
self._ducklake = DuckLake()
104-
await self._ducklake._load_catalog()
106+
await self._ducklake.connect()
105107
self._attach_client_catalog(
106108
"ducklake",
107109
str(self._ducklake.catalog_path),
@@ -124,7 +126,7 @@ async def get_ducklake(self) -> DuckLake:
124126

125127
if self._ducklake is None:
126128
self._ducklake = DuckLake()
127-
await self._ducklake._load_catalog()
129+
await self._ducklake.connect()
128130
self._attach_client_catalog(
129131
"ducklake",
130132
str(self._ducklake.catalog_path),
@@ -477,26 +479,72 @@ def get_completed_remote_paths(self) -> set[str]:
477479

478480
async def query(
479481
self,
480-
client: Literal["DadosGov", "FTP"] | None = None,
482+
client: Origin | None = None,
481483
dataset: str | None = None,
482484
group: str | None = None,
483485
state: str | None = None,
484486
year: int | None = None,
485487
month: int | None = None,
486488
):
487-
"""Query available datasets through the DuckLake catalog."""
489+
"""Query available datasets through the DuckLake catalog.
488490
491+
Parameters
492+
----------
493+
client : Origin, optional
494+
Source client to filter by.
495+
dataset : str, optional
496+
Dataset name to filter by.
497+
group : str, optional
498+
Group name pattern to filter by (case-insensitive ILIKE).
499+
state : str, optional
500+
Two-letter state code to filter by.
501+
year : int, optional
502+
Year to filter by.
503+
month : int, optional
504+
Month to filter by.
505+
506+
Returns
507+
-------
508+
list
509+
List of matching File objects.
510+
"""
489511
if self._ducklake is None:
490512
await self.get_ducklake()
491-
if self._ducklake is not None:
492-
return await self._ducklake.query(
493-
client=client,
494-
dataset=dataset,
513+
514+
if self._ducklake is None:
515+
raise ConnectionError("Could not connect to PySUS s3 bucket")
516+
517+
all_datasets = await self._ducklake.datasets()
518+
519+
if dataset:
520+
matching = [
521+
d for d in all_datasets if d.name.lower() == dataset.lower()
522+
]
523+
if not matching:
524+
return []
525+
target = matching[0]
526+
files = await target.query(
495527
group=group,
496528
state=state,
497529
year=year,
498530
month=month,
499531
)
532+
else:
533+
files = []
534+
for ds in all_datasets:
535+
ds_files = await ds.query(
536+
group=group,
537+
state=state,
538+
year=year,
539+
month=month,
540+
)
541+
files.extend(ds_files)
542+
543+
if not client:
544+
return files
545+
546+
prefix = f"public/data/{client.lower()}/"
547+
return [f for f in files if f.record.path.startswith(prefix)]
500548

501549
def read_parquet(
502550
self,
@@ -595,8 +643,8 @@ def get_columns(path: Path) -> set[tuple[str, str]]:
595643
duckdb.create_function(
596644
"__pysus_add_dv",
597645
_add_dv_fn,
598-
null_handling="special",
599-
)
646+
null_handling=func.SPECIAL,
647+
) # type: ignore
600648
except duckdb.NotImplementedException:
601649
pass
602650
selects = [
File renamed without changes.

pysus/api/dadosgov/client.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,8 +11,9 @@
1111
from pydantic import BaseModel, BeforeValidator, ConfigDict, Field, PrivateAttr
1212
from pysus import __version__
1313
from pysus.api.models import BaseRemoteClient, BaseRemoteFile
14+
from pysus.api.types import DADOSGOV
1415

15-
if TYPE_CHECKING:
16+
if TYPE_CHECKING: # pragma: no cover
1617
from .models import Dataset
1718

1819

@@ -89,7 +90,7 @@ def name(self) -> str:
8990
str
9091
The abbreviated client name ``"DadosGov"``.
9192
"""
92-
return "DadosGov"
93+
return DADOSGOV
9394

9495
@property
9596
def long_name(self) -> str:

pysus/api/dadosgov/databases.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -272,7 +272,7 @@ def formatter(self, filename: str) -> dict[str, Any]:
272272
}
273273

274274
m = re.search(r"_(\w{3})-out_(\d{4})_\.csv$", name)
275-
if m:
275+
if m: # pragma: no cover
276276
return {
277277
"state": None,
278278
"year": _parse_year(m.group(2)),

0 commit comments

Comments
 (0)