Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 0 additions & 17 deletions .idea/PySUS.iml

This file was deleted.

10 changes: 0 additions & 10 deletions .idea/misc.xml

This file was deleted.

17 changes: 16 additions & 1 deletion poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 2 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,8 @@ pydantic = "^2.12.5"
duckdb = "^1.4.4"
duckdb-engine = "^0.17.0"
sqlalchemy = "^2.0.48"
python-magic = "^0.4.27"
python-magic = { version = "*", platform = "!=win32" }
python-magic-bin = { version = "*", platform = "win32" }
chardet = "^7.4.0.post2"
anyio = "^4.13.0"
httpx = ">=0.28.0"
Expand Down
1 change: 1 addition & 0 deletions pysus/api/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
## Roadmap
38 changes: 14 additions & 24 deletions pysus/api/_impl/databases.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,8 @@
from typing import Literal

import pandas as pd
from pysus.api import types
from pysus.api.client import PySUS
from pysus.api.types import State
from tqdm import tqdm


Expand Down Expand Up @@ -220,7 +220,7 @@ def sinan(


def sinasc(
state: State,
state: types.State,
year: int | list[int],
group: str | None = None,
**kwargs,
Expand All @@ -232,7 +232,7 @@ def sinasc(

Parameters
----------
state : State
state : types.State
Two-letter state abbreviation (e.g. ``"RJ"``).
year : int | list[int]
Year or list of years to fetch.
Expand All @@ -255,7 +255,7 @@ def sinasc(


def sim(
state: State,
state: types.State,
year: int | list[int],
group: str | None = None,
**kwargs,
Expand Down Expand Up @@ -290,7 +290,7 @@ def sim(


def sih(
state: State,
state: types.State,
year: int | list[int],
month: int | list[int],
group: str | None = None,
Expand All @@ -303,7 +303,7 @@ def sih(

Parameters
----------
state : State
state : types.State
Two-letter state abbreviation (e.g. ``"RJ"``).
year : int | list[int]
Year or list of years to fetch.
Expand All @@ -329,7 +329,7 @@ def sih(


def sia(
state: State,
state: types.State,
year: int | list[int],
month: int | list[int],
group: str | None = None,
Expand All @@ -342,7 +342,7 @@ def sia(

Parameters
----------
state : State
state : types.State
Two-letter state abbreviation (e.g. ``"RJ"``).
year : int | list[int]
Year or list of years to fetch.
Expand All @@ -368,7 +368,7 @@ def sia(


def pni(
state: State,
state: types.State,
year: int | list[int],
group: str | None = None,
**kwargs,
Expand Down Expand Up @@ -430,7 +430,7 @@ def ibge(


def cnes(
state: State,
state: types.State,
year: int | list[int],
month: int | list[int],
group: str | None = None,
Expand Down Expand Up @@ -469,7 +469,7 @@ def cnes(


def ciha(
state: State,
state: types.State,
year: int | list[int],
month: int | list[int],
group: str | None = "CIHA",
Expand Down Expand Up @@ -508,18 +508,8 @@ def ciha(


def list_files(
dataset: Literal[
"SINAN",
"SINASC",
"SIM",
"SIH",
"SIA",
"PNI",
"IBGE",
"CNES",
"CIHA",
],
client: Literal["FTP", "DadosGov"] | None = None,
dataset: types.DatasetName,
client: types.Origin | None = None,
group: str | None = None,
state: str | None = None,
year: int | list[int] | None = None,
Expand All @@ -536,7 +526,7 @@ def list_files(
----------
dataset : Literal
Dataset name (e.g. ``"SINAN"``, ``"SINASC"``, etc.).
client : Literal["FTP", "DadosGov"], optional
client : Origin, optional
Data source client to query.
group : str, optional
Group or disease code to filter by.
Expand Down
70 changes: 59 additions & 11 deletions pysus/api/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,9 @@
import anyio
import duckdb
import pandas as pd
from duckdb import func
from pysus import CACHEPATH
from pysus.api.types import Origin
from sqlalchemy import DateTime, Enum, Integer, String, create_engine
from sqlalchemy.orm import DeclarativeBase, Mapped, mapped_column, sessionmaker
from sqlalchemy.pool import NullPool
Expand All @@ -24,7 +26,7 @@
from .ftp import FTPClient
from .models import BaseLocalFile, BaseRemoteFile

if TYPE_CHECKING:
if TYPE_CHECKING: # pragma: no cover
from duckdb import DuckDBPyConnection


Expand Down Expand Up @@ -101,7 +103,7 @@ async def __aenter__(self):
"""Set up DuckLake catalog and return self as async context manager."""

self._ducklake = DuckLake()
await self._ducklake._load_catalog()
await self._ducklake.connect()
self._attach_client_catalog(
"ducklake",
str(self._ducklake.catalog_path),
Expand All @@ -124,7 +126,7 @@ async def get_ducklake(self) -> DuckLake:

if self._ducklake is None:
self._ducklake = DuckLake()
await self._ducklake._load_catalog()
await self._ducklake.connect()
self._attach_client_catalog(
"ducklake",
str(self._ducklake.catalog_path),
Expand Down Expand Up @@ -477,26 +479,72 @@ def get_completed_remote_paths(self) -> set[str]:

async def query(
self,
client: Literal["DadosGov", "FTP"] | None = None,
client: Origin | None = None,
dataset: str | None = None,
group: str | None = None,
state: str | None = None,
year: int | None = None,
month: int | None = None,
):
"""Query available datasets through the DuckLake catalog."""
"""Query available datasets through the DuckLake catalog.

Parameters
----------
client : Origin, optional
Source client to filter by.
dataset : str, optional
Dataset name to filter by.
group : str, optional
Group name pattern to filter by (case-insensitive ILIKE).
state : str, optional
Two-letter state code to filter by.
year : int, optional
Year to filter by.
month : int, optional
Month to filter by.

Returns
-------
list
List of matching File objects.
"""
if self._ducklake is None:
await self.get_ducklake()
if self._ducklake is not None:
return await self._ducklake.query(
client=client,
dataset=dataset,

if self._ducklake is None:
raise ConnectionError("Could not connect to PySUS s3 bucket")

all_datasets = await self._ducklake.datasets()

if dataset:
matching = [
d for d in all_datasets if d.name.lower() == dataset.lower()
]
if not matching:
return []
target = matching[0]
files = await target.query(
group=group,
state=state,
year=year,
month=month,
)
else:
files = []
for ds in all_datasets:
ds_files = await ds.query(
group=group,
state=state,
year=year,
month=month,
)
files.extend(ds_files)

if not client:
return files

prefix = f"public/data/{client.lower()}/"
return [f for f in files if f.record.path.startswith(prefix)]

def read_parquet(
self,
Expand Down Expand Up @@ -595,8 +643,8 @@ def get_columns(path: Path) -> set[tuple[str, str]]:
duckdb.create_function(
"__pysus_add_dv",
_add_dv_fn,
null_handling="special",
)
null_handling=func.SPECIAL,
) # type: ignore
except duckdb.NotImplementedException:
pass
selects = [
Expand Down
File renamed without changes.
5 changes: 3 additions & 2 deletions pysus/api/dadosgov/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,9 @@
from pydantic import BaseModel, BeforeValidator, ConfigDict, Field, PrivateAttr
from pysus import __version__
from pysus.api.models import BaseRemoteClient, BaseRemoteFile
from pysus.api.types import DADOSGOV

if TYPE_CHECKING:
if TYPE_CHECKING: # pragma: no cover
from .models import Dataset


Expand Down Expand Up @@ -89,7 +90,7 @@ def name(self) -> str:
str
The abbreviated client name ``"DadosGov"``.
"""
return "DadosGov"
return DADOSGOV

@property
def long_name(self) -> str:
Expand Down
2 changes: 1 addition & 1 deletion pysus/api/dadosgov/databases.py
Original file line number Diff line number Diff line change
Expand Up @@ -272,7 +272,7 @@ def formatter(self, filename: str) -> dict[str, Any]:
}

m = re.search(r"_(\w{3})-out_(\d{4})_\.csv$", name)
if m:
if m: # pragma: no cover
return {
"state": None,
"year": _parse_year(m.group(2)),
Expand Down
7 changes: 4 additions & 3 deletions pysus/api/dadosgov/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -271,7 +271,8 @@ def __init__(
A callable that extracts metadata from filenames.
"""
super().__init__(
record=record, dataset=dataset # type: ignore[call-arg]
record=record,
dataset=dataset, # type: ignore[call-arg]
)
self._formatter = formatter

Expand Down Expand Up @@ -354,7 +355,7 @@ class Dataset(BaseRemoteDataset):
"""

ids: list[str] = []
client: "DadosGov"
client: DadosGov
group_aliases: dict[str, str] = {}

def __repr__(self):
Expand All @@ -369,7 +370,7 @@ def formatter(self, filename: str) -> dict[str, Any]:
async def _fetch_content(self) -> list[Group]:
"""Fetch all groups belonging to this dataset."""
items: list[Group] = []
client: "DadosGov" = self.client
client: DadosGov = self.client
if self.ids:
for group_id in self.ids:
record = await client.get_dataset(group_id)
Expand Down
File renamed without changes.
File renamed without changes.
Loading
Loading