Skip to content

Commit 6aca1bd

Browse files
authored
fix(dadosgov): include group aliases on DadosGov to link files with FTP (#272)
1 parent 19024f2 commit 6aca1bd

12 files changed

Lines changed: 354 additions & 50 deletions

File tree

.pre-commit-config.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ repos:
2929
hooks:
3030
- id: flake8
3131
args: [--max-line-length=80, --extend-ignore=E203]
32+
exclude: ^docs/
3233
additional_dependencies: [
3334
'flake8-blind-except',
3435
'flake8-bugbear',

pysus/api/dadosgov/client.py

Lines changed: 13 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -154,21 +154,30 @@ async def _download_file(
154154
self,
155155
file: BaseRemoteFile,
156156
output: pathlib.Path,
157-
callback: Callable[[int], None] | None = None,
157+
callback: Callable[[int, int], None] | None = None,
158158
) -> pathlib.Path:
159159
"""Download a remote file to a local path."""
160160
if self._client is None:
161161
raise ConnectionError(
162162
"Client not connected. Call login(token=...) first.",
163163
)
164164

165-
async with self._client.stream("GET", str(file.path)) as response:
165+
url = (
166+
str(file.path)
167+
.replace("https:/", "https://")
168+
.replace("http:/", "http://")
169+
)
170+
171+
async with self._client.stream("GET", url) as response:
166172
response.raise_for_status()
173+
total = int(response.headers.get("Content-Length", 0))
174+
downloaded = 0
167175
with open(output, "wb") as f:
168176
async for chunk in response.aiter_bytes():
169177
f.write(chunk)
178+
downloaded += len(chunk)
170179
if callback:
171-
callback(len(chunk))
180+
callback(downloaded, total)
172181
return output
173182

174183

@@ -181,9 +190,7 @@ class Recurso(BaseModel):
181190
title: str = Field(alias="titulo")
182191
url: str = Field(alias="link")
183192
api_size: int = Field(alias="tamanho")
184-
last_modified: datetime | None = Field(
185-
None, alias="dataUltimaAtualizacaoArquivo"
186-
)
193+
last_modified: DateTime = Field(None, alias="dataUltimaAtualizacaoArquivo")
187194
file_name: str | None = Field(None, alias="nomeArquivo")
188195

189196
async def get_size(self) -> int:

pysus/api/dadosgov/databases.py

Lines changed: 232 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,39 @@
11
"""Pre-configured health database definitions accessible via dados.gov.br."""
22

3+
import re
34
from typing import Any
45

6+
from pysus.utils import zfill_year
7+
58
from .models import Dataset
69

10+
MONTHS: dict[str, int] = {
11+
"jan": 1,
12+
"fev": 2,
13+
"mar": 3,
14+
"abr": 4,
15+
"mai": 5,
16+
"jun": 6,
17+
"jul": 7,
18+
"ago": 8,
19+
"set": 9,
20+
"out": 10,
21+
"nov": 11,
22+
"dez": 12,
23+
}
24+
25+
26+
def _parse_year(val: str) -> int | None:
27+
try:
28+
y = int(val)
29+
return y if 1970 <= y <= 2100 else None
30+
except ValueError:
31+
return None
32+
33+
34+
def _skip(name: str) -> bool:
35+
return name.startswith("get_") or name.lower().endswith(".pdf")
36+
737

838
class CNES(Dataset):
939
"""Cadastro Nacional de Estabelecimentos de Saúde (CNES)."""
@@ -32,8 +62,23 @@ def description(self) -> str:
3262
)
3363

3464
def formatter(self, filename: str) -> dict[str, Any]:
    """Parse a CNES filename and extract metadata.

    Names ending in ``_MM-YYYY.csv`` yield a month and a year; every
    other name (including those rejected by ``_skip``) yields no
    metadata.

    Args:
        filename: File name to parse; surrounding whitespace is ignored.

    Returns:
        A dict with the keys ``"state"``, ``"year"`` and ``"month"``.
        ``"state"`` is always ``None``. ``"year"`` is whatever
        ``_parse_year`` returns for the four-digit group. ``"month"`` is
        the two-digit group as an int, or ``None`` when it is not in
        1-12.
    """
    unknown: dict[str, Any] = {"state": None, "year": None, "month": None}

    name = filename.strip()
    if _skip(name):
        return unknown

    m = re.search(r"_(\d{2})-(\d{4})\.csv$", name)
    if m is None:
        return unknown

    month = int(m.group(1))
    return {
        "state": None,
        "year": _parse_year(m.group(2)),
        # Two digits can also spell 00 or 13-99; report those as unknown
        # rather than handing callers an impossible month.
        "month": month if 1 <= month <= 12 else None,
    }
3782

3883

3984
class PNI(Dataset):
@@ -49,6 +94,18 @@ class PNI(Dataset):
4994
"9a25b796-80e3-444a-a4e7-405f5596d8ab",
5095
]
5196

97+
_PNI_PREFIX = "doses-aplicadas-pelo-programa-de-nacional-de-imunizacoes-pni"
98+
99+
group_aliases: dict[str, str] = {
100+
_PNI_PREFIX: "DPNI",
101+
f"{_PNI_PREFIX}-2020": "DPNI",
102+
f"{_PNI_PREFIX}-2021": "DPNI",
103+
f"dataset-{_PNI_PREFIX}_2022": "DPNI",
104+
f"{_PNI_PREFIX}-2023": "DPNI",
105+
f"{_PNI_PREFIX}-2025": "DPNI",
106+
f"{_PNI_PREFIX}-2026": "DPNI",
107+
}
108+
52109
@property
53110
def name(self) -> str:
54111
"""Return the short name."""
@@ -64,8 +121,21 @@ def description(self) -> str:
64121
return "O PNI monitora a cobertura vacinal e doses aplicadas no Brasil."
65122

66123
def formatter(self, filename: str) -> dict[str, Any]:
    """Parse a PNI vaccination filename into month and year.

    The name is stripped and lower-cased, then matched from its start
    against ``vacinacao_<abbr>_<YYYY>_csv.zip``.

    Args:
        filename: File name to parse.

    Returns:
        A dict with the keys ``"state"``, ``"year"`` and ``"month"``.
        ``"state"`` is always ``None``. On a match, ``"month"`` is the
        ``MONTHS`` entry for the three-character abbreviation (``None``
        if absent) and ``"year"`` is the result of ``_parse_year``.
        Otherwise all three values are ``None``.
    """
    blank: dict[str, Any] = {"state": None, "year": None, "month": None}
    try:
        lowered = filename.strip().lower()
        if _skip(lowered):
            return blank

        hit = re.match(r"vacinacao_(\w{3})_(\d{4})_csv\.zip", lowered)
        if hit is None:
            return blank

        abbrev, year_text = hit.groups()
        return {
            "state": None,
            "year": _parse_year(year_text),
            "month": MONTHS.get(abbrev),
        }
    except (IndexError, ValueError):
        return blank
69139

70140

71141
class SIA(Dataset):
@@ -92,8 +162,31 @@ def description(self) -> str:
92162
"""
93163

94164
def formatter(self, filename: str) -> dict[str, Any]:
    """Parse an SIA filename into year.

    The name is stripped and lower-cased; a trailing ``_YYYY_.csv``
    supplies the year.

    Args:
        filename: File name to parse.

    Returns:
        A dict with the keys ``"state"``, ``"year"`` and ``"month"``.
        ``"state"`` and ``"month"`` are always ``None``. ``"year"`` is
        the result of ``_parse_year`` on the four-digit group, or
        ``None`` when the name is skipped or does not match.
    """
    unknown: dict[str, Any] = {"state": None, "year": None, "month": None}

    name = filename.strip().lower()
    if _skip(name):
        return unknown

    # A single pattern is enough: names of the form
    # "..._xxx-out_YYYY_.csv" also end in "_YYYY_.csv", so a dedicated
    # pattern for them could never be reached and would yield the same
    # result anyway.
    m = re.search(r"_(\d{4})_\.csv$", name)
    if m is None:
        return unknown

    return {
        "state": None,
        "year": _parse_year(m.group(1)),
        "month": None,
    }
97190

98191

99192
class SINAN(Dataset):
@@ -104,8 +197,21 @@ class SINAN(Dataset):
104197
"5699abe0-0510-4da8-b47d-209b3bb32b34",
105198
"4557ba96-7d52-4a56-bd6f-f99a5af09f77",
106199
"740ce8f4-7a5d-4351-aad4-7623f2490ada",
200+
"cf044c1b-b966-4d0e-bab0-f3aa65897b7d",
201+
"2d4997fb-cd11-4ce2-b217-09cd50e3151f",
202+
"8a585222-4c2e-43b7-807d-59355ee79c48",
203+
"527e8665-de64-4f81-b7c3-40b59c7d1d3c",
107204
]
108205

206+
group_aliases: dict[str, str] = {
207+
"arboviroses-dengue": "DENG",
208+
"arboviroses-febre-de-chikungunya": "CHIK",
209+
"arboviroses-zika-virus": "ZIKA",
210+
"hanseniase": "HANS",
211+
"dados-tuberculose": "TUBE",
212+
"sifilis": "SIFA",
213+
}
214+
109215
@property
110216
def name(self) -> str:
111217
"""Return the short name."""
@@ -124,8 +230,31 @@ def description(self) -> str:
124230
"""
125231

126232
def formatter(self, filename: str) -> dict[str, Any]:
    """Parse a SINAN filename into state and year.

    Two layouts are recognised, both matched case-insensitively from the
    start of the stripped name:

    * ``XXXXBRYY.CSV.ZIP``: four word characters, the literal ``BR`` and
      a two-digit year.
    * ``MPX_YYYY_OPENDATASUS.CSV.ZIP``: a four-digit year, no state.

    Args:
        filename: File name to parse.

    Returns:
        A dict with the keys ``"state"``, ``"year"`` and ``"month"``.
        ``"month"`` is always ``None``. All values are ``None`` when the
        name is skipped, matches neither layout, or parsing raises
        ``IndexError``/``ValueError``.
    """
    unknown: dict[str, Any] = {"state": None, "year": None, "month": None}
    try:
        stripped = filename.strip()
        # Run the skip check on the lower-cased name so that a "get_"
        # prefix is recognised; on the upper-cased name used for matching
        # below it would read "GET_" and could slip through.
        if _skip(stripped.lower()):
            return unknown

        name = stripped.upper()

        m = re.match(r"(\w{4})(BR)(\d{2})\.CSV\.ZIP", name)
        if m:
            return {
                "state": m.group(2),
                # NOTE(review): zfill_year presumably expands the
                # two-digit year to four digits; confirm in pysus.utils.
                "year": zfill_year(m.group(3)),
                "month": None,
            }

        m = re.match(r"MPX_(\d{4})_OPENDATASUS\.CSV\.ZIP", name)
        if m:
            return {
                "state": None,
                "year": _parse_year(m.group(1)),
                "month": None,
            }

        return unknown
    except (IndexError, ValueError):
        return unknown
129258

130259

131260
class SIM(Dataset):
@@ -135,6 +264,10 @@ class SIM(Dataset):
135264
"5f121f4d-47c6-428e-8ec6-e8ec56417172",
136265
]
137266

267+
group_aliases: dict[str, str] = {
268+
"sim-1979-2019": "DO",
269+
}
270+
138271
@property
139272
def name(self) -> str:
140273
"""Return the short name."""
@@ -152,8 +285,31 @@ def description(self) -> str:
152285
"""
153286

154287
def formatter(self, filename: str) -> dict[str, Any]:
    """Parse a SIM filename into year.

    The stripped name is checked, case-sensitively, for
    ``Mortalidade_Geral_<YYYY>_csv.zip`` anywhere in it, and otherwise
    for a leading ``DO<YY>OPEN``.

    Args:
        filename: File name to parse.

    Returns:
        A dict with the keys ``"state"``, ``"year"`` and ``"month"``.
        ``"state"`` and ``"month"`` are always ``None``. ``"year"`` comes
        from ``_parse_year`` (four-digit form) or ``zfill_year``
        (two-digit form), and is ``None`` when the name is skipped, does
        not match, or parsing raises ``IndexError``/``ValueError``.
    """
    blank: dict[str, Any] = {"state": None, "year": None, "month": None}
    try:
        name = filename.strip()
        if _skip(name):
            return blank

        if hit := re.search(r"Mortalidade_Geral_(\d{4})_csv\.zip", name):
            year = _parse_year(hit.group(1))
        elif hit := re.match(r"DO(\d{2})OPEN", name):
            # NOTE(review): zfill_year presumably expands the two-digit
            # year to four digits; confirm in pysus.utils.
            year = zfill_year(hit.group(1))
        else:
            return blank

        return {"state": None, "year": year, "month": None}
    except (IndexError, ValueError):
        return blank
157313

158314

159315
class SINASC(Dataset):
@@ -163,6 +319,10 @@ class SINASC(Dataset):
163319
"441cc6bd-684a-4afd-a88b-ba4734c9e83e",
164320
]
165321

322+
group_aliases: dict[str, str] = {
323+
"sistema-de-informacao-sobre-nascidos-vivos-sinasc-1996-a-20201": "DN",
324+
}
325+
166326
@property
167327
def name(self) -> str:
168328
"""Return the short name."""
@@ -181,8 +341,67 @@ def description(self) -> str:
181341
"""
182342

183343
def formatter(self, filename: str) -> dict[str, Any]:
    """Parse a SINASC filename into year.

    The stripped name is searched, case-sensitively and in this order,
    for ``SINASC_<YYYY>_csv.zip`` (no state) and ``DNBR<YYYY>_csv.zip``
    (state ``"BR"``).

    Args:
        filename: File name to parse.

    Returns:
        A dict with the keys ``"state"``, ``"year"`` and ``"month"``.
        ``"month"`` is always ``None``. ``"year"`` is the result of
        ``_parse_year`` on the four-digit group. All values are ``None``
        when the name is skipped or matches neither layout.
    """
    blank: dict[str, Any] = {"state": None, "year": None, "month": None}
    # (pattern, state reported on a match); the first match wins.
    layouts: tuple[tuple[str, str | None], ...] = (
        (r"SINASC_(\d{4})_csv\.zip", None),
        (r"DNBR(\d{4})_csv\.zip", "BR"),
    )
    try:
        name = filename.strip()
        if _skip(name):
            return blank

        for pattern, state in layouts:
            found = re.search(pattern, name)
            if found:
                return {
                    "state": state,
                    "year": _parse_year(found.group(1)),
                    "month": None,
                }

        return blank
    except (IndexError, ValueError):
        return blank
369+
370+
371+
class COVID19(Dataset):
    """Confirmed COVID-19 cases (Casos Confirmados de COVID-19)."""

    ids: list[str] = [
        "1ba1801e-aec0-4dba-ae2a-7732f0a0c9f7",
    ]

    @property
    def name(self) -> str:
        """Return the short name."""
        return "COVID19"

    @property
    def long_name(self) -> str:
        """Return the human-readable name."""
        return "Casos Confirmados de COVID-19"

    @property
    def description(self) -> str:
        """Return a one-sentence description of the dataset."""
        return "Dados anonimizados de casos confirmados de COVID-19."

    def formatter(self, filename: str) -> dict[str, Any]:
        """Parse a COVID-19 filename.

        This implementation derives no metadata from the name: every
        input, whatever its extension, yields the same result.

        Args:
            filename: File name to parse (currently not inspected).

        Returns:
            A dict with the keys ``"state"``, ``"year"`` and ``"month"``,
            all set to ``None``.
        """
        return {"state": None, "year": None, "month": None}
186405

187406

188407
AVAILABLE_DATABASES: list[type[Dataset]] = [
@@ -192,4 +411,5 @@ def formatter(self, filename: str) -> dict[str, Any]:
192411
SIM,
193412
SINAN,
194413
SINASC,
414+
COVID19,
195415
]

0 commit comments

Comments
 (0)