Skip to content

Commit 790c387

Browse files
authored
fix(parquet): include parsings to parquet reading (#265)
* fix(parquet): include parsings to parquet reading
* chore: check if remote size is same as local file when downloading
* fix(docs): fix CI docs build
* fix(CI): remove dbc extra installation from CI on windows
* fix: make python-magic import graceful on Windows (avoids segfault when libmagic missing)
  - Wrap `import magic` in try/except so the module can be imported on Windows without libmagic installed.
  - Guard `_identify` MIME-type lookup — returns None (falling back to extension matching) when magic is unavailable.
  - CI: skip `--extras dbc` and pre-commit on Windows.
  - CI: per-OS timeout (30 m Windows, 15 m Linux).
  - CI: guard Pages deployment steps behind refs/heads/main to avoid failure on PRs from forks.
  - Coverage: omit pysus/management/client.py and pysus/tui/*.
* deps: make tui an extra installation
1 parent 0920e27 commit 790c387

15 files changed

Lines changed: 542 additions & 194 deletions

File tree

.github/workflows/python-package.yml

Lines changed: 22 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -4,19 +4,20 @@ on: [push, pull_request]
44

55
jobs:
66
tests:
7-
runs-on: ubuntu-latest
8-
timeout-minutes: 15
7+
runs-on: ${{ matrix.os }}
8+
timeout-minutes: ${{ (matrix.os == 'windows-latest' && 30) || 15 }}
99

1010
defaults:
1111
run:
1212
shell: bash -l {0}
1313

1414
strategy:
1515
matrix:
16-
python-version: ["3.10", "3.11", "3.12", "3.13"]
16+
os: [ubuntu-latest, windows-latest]
17+
python-version: ["3.10", "3.11", "3.12"]
1718

1819
concurrency:
19-
group: ci-tests-${{ matrix.python-version }}-${{ github.ref }}
20+
group: ci-tests-${{ matrix.os }}-${{ matrix.python-version }}-${{ github.ref }}
2021
cancel-in-progress: true
2122

2223
steps:
@@ -31,22 +32,28 @@ jobs:
3132
auto-update-conda: true
3233
conda-solver: libmamba
3334

34-
- name: Linting & Tests
35+
- name: Install dependencies
3536
run: |
3637
pip install poetry poetry-plugin-export
37-
3838
poetry config virtualenvs.create false
39-
40-
poetry export --with dev --extras dbc --format requirements.txt --output reqs.txt --without-hashes
41-
42-
pip install -r reqs.txt
43-
pip install -e ".[dbc]"
44-
45-
pre-commit run --files pysus/**/*
46-
47-
make test-pysus-with-coverage
39+
if [ "${{ runner.os }}" = "Linux" ]; then
40+
poetry install --without dev --extras dbc
41+
pip install pre-commit
42+
else
43+
poetry install --without dev
44+
fi
45+
pip install pytest pytest-timeout pytest-retry pytest-asyncio pytest-cov
46+
47+
- name: Linting
48+
if: matrix.os == 'ubuntu-latest'
49+
run: pre-commit run --files pysus/**/*
50+
51+
- name: Tests
52+
run: |
53+
poetry run pytest -vv pysus/tests/ --retries 3 --retry-delay 15 --cov=pysus --cov-report=xml:coverage.xml --cov-report=term-missing
4854
4955
- name: Upload coverage to Codecov
56+
if: matrix.os == 'ubuntu-latest'
5057
uses: codecov/codecov-action@v5
5158
with:
5259
files: ./coverage.xml

.github/workflows/release.yaml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -84,9 +84,11 @@ jobs:
8484
make html
8585
8686
- name: Configure GitHub Pages
87+
if: github.ref == 'refs/heads/main'
8788
uses: actions/configure-pages@v5
8889

8990
- name: Upload artifact
91+
if: github.ref == 'refs/heads/main'
9092
uses: actions/upload-pages-artifact@v3
9193
with:
9294
path: docs/build/html

README.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,11 @@ sudo apt install libffi-dev
2727
pip install pysus[dbc]
2828
```
2929

30+
For the terminal user interface (TUI):
31+
```bash
32+
pip install pysus[tui]
33+
```
34+
3035
## Quick Start
3136

3237
### Simplified Database Functions (New in 2.0)

conda/dev.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ channels:
44
- defaults
55
dependencies:
66
- docker-compose
7-
- python>=3.10,<3.14
7+
- python>=3.10,<3.13
88
- jupyter
99
- make
1010
- pip

poetry.lock

Lines changed: 102 additions & 162 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pyproject.toml

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ python = ">=3.10,<3.14"
1919
python-dateutil = "2.8.2"
2020
fastparquet = ">=2023.10.1,<=2024.11.0"
2121
pyarrow = ">=11.0.0"
22-
numpy = ">1,<3"
22+
numpy = ">=1.22,<2"
2323
tqdm = ">=4.67.0"
2424
wget = "^3.2"
2525
loguru = "^0.6.0"
@@ -31,23 +31,25 @@ pydantic = "^2.12.5"
3131
duckdb = "^1.4.4"
3232
duckdb-engine = "^0.17.0"
3333
sqlalchemy = "^2.0.48"
34-
textual = {extras = ["syntax"], version = "^8.2.1"}
3534
python-magic = "^0.4.27"
3635
chardet = "^7.4.0.post2"
3736
anyio = "^4.13.0"
38-
humanize = "^4.8.0"
37+
httpx = ">=0.28.0"
3938
aioftp = "^0.21.4"
4039
dbfread = "2.0.7"
4140
bigtree = "^0.12.2"
4241

4342
pyreaddbc = { version = ">=1.1.0", optional = true }
4443
pycparser = { version = "2.21", optional = true }
44+
textual = { extras = ["syntax"], version = "^8.2.1", optional = true }
45+
humanize = { version = "^4.8.0", optional = true }
4546
dotenv = "^0.9.9"
4647
boto3 = "^1.42.89"
4748
typer = "^0.24.1"
4849

4950
[tool.poetry.extras]
5051
dbc = ["pyreaddbc", "pycparser"]
52+
tui = ["textual", "humanize"]
5153

5254
[tool.poetry.group.dev.dependencies]
5355
pytest = ">=6.1.0"
@@ -64,6 +66,7 @@ pytest-cov = "^7.1.0"
6466

6567
[tool.poetry.group.docs.dependencies]
6668
sphinx = "^5.1.1"
69+
standard-imghdr = "*"
6770
nbmake = "^1.4.1"
6871
matplotlib = "^3.7.1"
6972
jupyterlab = "^4.0.5"
@@ -101,6 +104,12 @@ testpaths = [
101104

102105
exclude = ["*.git", "docs/"]
103106

107+
[tool.coverage.run]
108+
omit = [
109+
"pysus/management/client.py",
110+
"pysus/tui/*",
111+
]
112+
104113
[[tool.mypy.overrides]]
105114
module = "tests.*"
106115
disallow_untyped_defs = false

pysus/api/client.py

Lines changed: 68 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,9 @@
1010
from pathlib import Path
1111
from typing import TYPE_CHECKING, Literal
1212

13+
import anyio
1314
import duckdb
15+
import pandas as pd
1416
from pysus import CACHEPATH
1517
from sqlalchemy import DateTime, Enum, Integer, String, create_engine
1618
from sqlalchemy.orm import DeclarativeBase, Mapped, mapped_column, sessionmaker
@@ -235,14 +237,26 @@ async def download(
235237
file: BaseRemoteFile,
236238
token: str | None = None,
237239
callback: Callable | None = None,
240+
timeout: float | None = None,
238241
) -> BaseLocalFile:
239-
"""Download a remote file and return a local file handle."""
242+
"""Download a remote file and return a local file handle.
243+
244+
Parameters
245+
----------
246+
timeout : float | None
247+
Maximum seconds to wait for the download. ``None`` (default) means
248+
no timeout – use this when the socket-level timeout on the
249+
underlying client is sufficient.
250+
"""
240251

241252
from pysus.api.extensions import ExtensionFactory
242253

243254
existing_local = await self.get_local_file(file)
244255
if existing_local and existing_local.path.exists():
245-
return existing_local
256+
if existing_local.size == file.size:
257+
return existing_local
258+
await self._delete_record(str(existing_local.path))
259+
existing_local.path.unlink(missing_ok=True)
246260

247261
client_name = file.client.name.lower()
248262
remote_path = file.path
@@ -271,7 +285,11 @@ async def download(
271285
f"No download logic for client: {client_name}",
272286
)
273287

274-
await client._download_file(file, local_path, callback)
288+
if timeout is not None:
289+
with anyio.fail_after(timeout):
290+
await client._download_file(file, local_path, callback)
291+
else:
292+
await client._download_file(file, local_path, callback)
275293

276294
await self._update_state(
277295
local_path=local_path,
@@ -311,18 +329,22 @@ async def download_to_parquet(
311329
file: BaseRemoteFile,
312330
token: str | None = None,
313331
callback: Callable[[int, int], None] | None = None,
332+
timeout: float | None = None,
333+
add_dv: bool = True,
314334
) -> Parquet:
315335
"""Download a file and convert it to Parquet format."""
316336

317337
local_file = await self.download(
318338
file=file,
319339
token=token,
320340
callback=callback,
341+
timeout=timeout,
321342
)
322343

323344
if hasattr(local_file, "to_parquet"):
324345
original_path = local_file.path
325346
parquet_file = await local_file.to_parquet(callback=callback)
347+
parquet_file.add_dv = add_dv
326348

327349
await self._update_state(
328350
local_path=parquet_file.path,
@@ -346,7 +368,9 @@ async def download_to_parquet(
346368
)
347369

348370
def get_local_hierarchy(self):
349-
"""Build a nested dict of cached files grouped by client and dataset."""
371+
"""
372+
Build a nested dict of cached files grouped by client and dataset.
373+
"""
350374

351375
with self.Session() as session:
352376
records = session.query(LocalFileState).all()
@@ -414,8 +438,20 @@ def read_parquet(
414438
paths: list[Path],
415439
sql: str | None = None,
416440
mode: Literal["union", "intersection", "strict"] = "union",
417-
) -> "DuckDBPyConnection":
418-
"""Read Parquet files with optional schema handling and SQL filter."""
441+
add_dv: bool = True,
442+
) -> "DuckDBPyConnection | pd.DataFrame":
443+
"""Read Parquet files with optional schema handling and SQL filter.
444+
445+
Parameters
446+
----------
447+
add_dv : bool
448+
When True, automatically applies the IBGE verification digit to
449+
municipality code columns. If there are matching columns, a
450+
DataFrame is returned instead of a DuckDBPyConnection.
451+
"""
452+
453+
from pysus.api.utils import add_dv as _add_dv_fn
454+
from pysus.api.utils import is_geocode_column
419455

420456
if not paths:
421457
raise ValueError("No paths provided")
@@ -452,8 +488,7 @@ def get_columns(path: Path) -> set[tuple[str, str]]:
452488
else:
453489
paths_str = ", ".join(f"'{p}'" for p in paths)
454490
query = (
455-
f"SELECT * FROM read_parquet([{paths_str}], "
456-
"union_by_name=True)"
491+
f"SELECT * FROM read_parquet([{paths_str}], union_by_name=True)"
457492
)
458493

459494
if sql:
@@ -462,4 +497,29 @@ def get_columns(path: Path) -> set[tuple[str, str]]:
462497
else:
463498
query = f"SELECT {sql} FROM ({query}) AS t"
464499

500+
base = duckdb.execute(query)
501+
502+
if not add_dv:
503+
return base
504+
505+
geocode_cols = [
506+
col[0] for col in base.description if is_geocode_column(col[0])
507+
]
508+
if not geocode_cols:
509+
return base
510+
511+
duckdb.create_function(
512+
"__pysus_add_dv",
513+
_add_dv_fn,
514+
null_handling="special",
515+
)
516+
selects = [
517+
(
518+
f'__pysus_add_dv("{c[0]}") AS "{c[0]}"'
519+
if c[0] in geocode_cols
520+
else f'"{c[0]}"'
521+
)
522+
for c in base.description
523+
]
524+
query = f"SELECT {', '.join(selects)} FROM ({query}) AS _t"
465525
return duckdb.execute(query)

pysus/api/extensions.py

Lines changed: 26 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,12 @@
1414
from typing import ClassVar
1515

1616
import chardet
17-
import magic
17+
18+
try:
19+
import magic
20+
except (ImportError, OSError):
21+
magic = None # type: ignore[assignment]
22+
1823
import pandas as pd
1924
import pyarrow as pa
2025
import pyarrow.parquet as pq
@@ -188,6 +193,7 @@ class Parquet(BaseTabularFile):
188193
"""Represents a Parquet file with optional date and integer type parsing."""
189194

190195
type: FileType = Field("PARQUET")
196+
add_dv: bool = True
191197

192198
@property
193199
def schema(self) -> pa.Schema:
@@ -204,12 +210,26 @@ def rows(self) -> int:
204210
"""Return the number of rows from the Parquet metadata."""
205211
return pq.read_metadata(self.path).num_rows
206212

213+
@staticmethod
214+
def _apply_add_dv(df: pd.DataFrame) -> pd.DataFrame:
215+
"""Apply the IBGE verification digit to geocode columns in-place."""
216+
from pysus.api.utils import add_dv, is_geocode_column
217+
218+
geocode_cols = [c for c in df.columns if is_geocode_column(c)]
219+
for col in geocode_cols:
220+
df[col] = df[col].astype(str).apply(add_dv)
221+
return df
222+
207223
async def load(self, parse: bool = True) -> pd.DataFrame:
208224
"""Read the entire Parquet file into a DataFrame."""
209225

210226
def _load():
211227
df = pd.read_parquet(self.path, engine="pyarrow")
212-
return self.parse_dftypes(df) if parse else df
228+
if parse:
229+
df = self.parse_dftypes(df)
230+
if self.add_dv:
231+
df = self._apply_add_dv(df)
232+
return df
213233

214234
return await to_thread.run_sync(_load)
215235

@@ -226,6 +246,8 @@ async def stream(
226246
df = batch.to_pandas()
227247
if parse:
228248
df = self.parse_dftypes(df)
249+
if self.add_dv:
250+
df = self._apply_add_dv(df)
229251
yield df
230252
await asyncio.sleep(0)
231253

@@ -815,6 +837,8 @@ class ExtensionFactory:
815837
@classmethod
816838
async def _identify(cls, path: Path) -> type[BaseLocalFile] | None:
817839
"""Identify the file class by its MIME type."""
840+
if magic is None:
841+
return None
818842
try:
819843
mime = await to_thread.run_sync(
820844
magic.from_file,

pysus/api/ftp/client.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@ class FTP(BaseRemoteClient):
4343
"""Async FTP client for navigating and downloading DATASUS data."""
4444

4545
host: str = "ftp.datasus.gov.br"
46+
timeout: int = 60
4647

4748
_ftp: FTPLib | None = PrivateAttr(default=None)
4849

@@ -77,7 +78,7 @@ async def connect(self) -> None:
7778

7879
def _connect():
7980
if self.ftp is None:
80-
self._ftp = FTPLib(self.host)
81+
self._ftp = FTPLib(self.host, timeout=self.timeout)
8182
self.ftp.login()
8283

8384
await to_thread.run_sync(_connect)

0 commit comments

Comments
 (0)