1010from pathlib import Path
1111from typing import TYPE_CHECKING , Literal
1212
13+ import anyio
1314import duckdb
15+ import pandas as pd
1416from pysus import CACHEPATH
1517from sqlalchemy import DateTime , Enum , Integer , String , create_engine
1618from sqlalchemy .orm import DeclarativeBase , Mapped , mapped_column , sessionmaker
@@ -235,8 +237,17 @@ async def download(
235237 file : BaseRemoteFile ,
236238 token : str | None = None ,
237239 callback : Callable | None = None ,
240+ timeout : float | None = None ,
238241 ) -> BaseLocalFile :
239- """Download a remote file and return a local file handle."""
242+ """Download a remote file and return a local file handle.
243+
244+ Parameters
245+ ----------
246+ timeout : float | None
247+ Maximum seconds to wait for the download. ``None`` (default) means
248+ no timeout – use this when the socket-level timeout on the
249+ underlying client is sufficient.
250+ """
240251
241252 from pysus .api .extensions import ExtensionFactory
242253
@@ -271,7 +282,11 @@ async def download(
271282 f"No download logic for client: { client_name } " ,
272283 )
273284
274- await client ._download_file (file , local_path , callback )
285+ if timeout is not None :
286+ with anyio .fail_after (timeout ):
287+ await client ._download_file (file , local_path , callback )
288+ else :
289+ await client ._download_file (file , local_path , callback )
275290
276291 await self ._update_state (
277292 local_path = local_path ,
@@ -311,18 +326,22 @@ async def download_to_parquet(
311326 file : BaseRemoteFile ,
312327 token : str | None = None ,
313328 callback : Callable [[int , int ], None ] | None = None ,
329+ timeout : float | None = None ,
330+ add_dv : bool = True ,
314331 ) -> Parquet :
315332 """Download a file and convert it to Parquet format."""
316333
317334 local_file = await self .download (
318335 file = file ,
319336 token = token ,
320337 callback = callback ,
338+ timeout = timeout ,
321339 )
322340
323341 if hasattr (local_file , "to_parquet" ):
324342 original_path = local_file .path
325343 parquet_file = await local_file .to_parquet (callback = callback )
344+ parquet_file .add_dv = add_dv
326345
327346 await self ._update_state (
328347 local_path = parquet_file .path ,
@@ -346,7 +365,9 @@ async def download_to_parquet(
346365 )
347366
348367 def get_local_hierarchy (self ):
349- """Build a nested dict of cached files grouped by client and dataset."""
368+ """
369+ Build a nested dict of cached files grouped by client and dataset.
370+ """
350371
351372 with self .Session () as session :
352373 records = session .query (LocalFileState ).all ()
@@ -414,8 +435,20 @@ def read_parquet(
414435 paths : list [Path ],
415436 sql : str | None = None ,
416437 mode : Literal ["union" , "intersection" , "strict" ] = "union" ,
417- ) -> "DuckDBPyConnection" :
418- """Read Parquet files with optional schema handling and SQL filter."""
438+ add_dv : bool = True ,
439+ ) -> "DuckDBPyConnection | pd.DataFrame" :
440+ """Read Parquet files with optional schema handling and SQL filter.
441+
442+ Parameters
443+ ----------
444+ add_dv : bool
445+ When True, automatically applies the IBGE verification digit to
446+ municipality code columns. The result of ``duckdb.execute`` is
447+ returned whether or not any columns match.
448+ """
449+
450+ from pysus .api .utils import add_dv as _add_dv_fn
451+ from pysus .api .utils import is_geocode_column
419452
420453 if not paths :
421454 raise ValueError ("No paths provided" )
@@ -452,8 +485,7 @@ def get_columns(path: Path) -> set[tuple[str, str]]:
452485 else :
453486 paths_str = ", " .join (f"'{ p } '" for p in paths )
454487 query = (
455- f"SELECT * FROM read_parquet([{ paths_str } ], "
456- "union_by_name=True)"
488+ f"SELECT * FROM read_parquet([{ paths_str } ], union_by_name=True)"
457489 )
458490
459491 if sql :
@@ -462,4 +494,29 @@ def get_columns(path: Path) -> set[tuple[str, str]]:
462494 else :
463495 query = f"SELECT { sql } FROM ({ query } ) AS t"
464496
497+ base = duckdb .execute (query )
498+
499+ if not add_dv :
500+ return base
501+
502+ geocode_cols = [
503+ col [0 ] for col in base .description if is_geocode_column (col [0 ])
504+ ]
505+ if not geocode_cols :
506+ return base
507+
508+ duckdb .create_function (
509+ "__pysus_add_dv" ,
510+ _add_dv_fn ,
511+ null_handling = "special" ,
512+ )
513+ selects = [
514+ (
515+ f'__pysus_add_dv("{ c [0 ]} ") AS "{ c [0 ]} "'
516+ if c [0 ] in geocode_cols
517+ else f'"{ c [0 ]} "'
518+ )
519+ for c in base .description
520+ ]
521+ query = f"SELECT { ', ' .join (selects )} FROM ({ query } ) AS _t"
465522 return duckdb .execute (query )
0 commit comments