Skip to content

Commit 267ed88

Browse files
EliEli
authored and committed
Simplification of config and unification of nomenclature in registries. Update dependencies.
1 parent ea6b522 commit 267ed88

6 files changed

Lines changed: 80 additions & 27 deletions

File tree

dms_datastore/config_data/dstore_config.yaml

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,10 +15,15 @@ spot_check_spec: spot_check_spec.yaml
1515
default_repo: screened
1616

1717

18-
# aliases for registry files available to repos in their "registry"
18+
# registry files available to repos and identified in their `registry` field.
1919
# these will be looked for in the dbase_config directory and are expected to be csv files.
2020
# The registry files provide supplementary metadata that can be used for provider resolution,
21-
# geographical and other purposes.
21+
# geographical and other purposes. The `column_map` field allows you to map from the column names
22+
# in the registry file to canonical metadata keys that are used in the data file headers and
23+
# elsewhere in the config. The `crs` field specifies coordinate reference systems for geospatial
24+
# data in the registry. If registries are joined as they are in the processed directory,
25+
# the column_map and crs fields for each registry will be applied to the relevant columns before merging.
26+
# The crs declared by each joined registry must be compatible with the others.
2227
registries:
2328
continuous:
2429
file: station_dbase.csv
@@ -30,6 +35,9 @@ registries:
3035
lon: longitude
3136
x: projection_x_coordinate
3237
y: projection_y_coordinate
38+
crs:
39+
geographic: "epsg:4269" # applies to latitude/longitude
40+
projected: "epsg:26910" # applies to projection_x_coordinate/projection_y_coordinate
3341
processed_synthetic: processed_registry.csv
3442
structures: structures_registry.csv
3543
daily: station_dbase.csv

dms_datastore/dropbox_data.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -180,8 +180,8 @@ def _get_float(col):
180180
)
181181
return float(val)
182182

183-
out["latitude"] = _get_float("agency_lat")
184-
out["longitude"] = _get_float("agency_lon")
183+
out["latitude"] = _get_float("lat")
184+
out["longitude"] = _get_float("lon")
185185
out["projection_x_coordinate"] = _get_float("x")
186186
out["projection_y_coordinate"] = _get_float("y")
187187
out["projection_authority_id"] = "epsg:26910"

dms_datastore/dstore_config.py

Lines changed: 57 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,9 @@
5454
"repo_root",
5555
"resolve_repo_data_dir",
5656
"repo_names",
57+
"registry_spec",
58+
"registry_column_map",
59+
"registry_crs",
5760
"registry_df",
5861
"repo_registry",
5962
"source_priority_group",
@@ -412,6 +415,23 @@ def repo_registry(repo=None, repo_cfg=None):
412415
if isinstance(registry_names, str):
413416
registry_names = [registry_names]
414417

418+
# Validate CRS consistency across sub-registries
419+
if len(registry_names) > 1:
420+
crs_values = []
421+
for rname in registry_names:
422+
crs = registry_crs(rname)
423+
if crs is not None:
424+
crs_values.append((rname, crs))
425+
if len(crs_values) > 1:
426+
first_name, first_crs = crs_values[0]
427+
for other_name, other_crs in crs_values[1:]:
428+
if other_crs != first_crs:
429+
raise ValueError(
430+
f"CRS mismatch across registries: "
431+
f"{first_name} declares {first_crs} but "
432+
f"{other_name} declares {other_crs}"
433+
)
434+
415435
dfs = []
416436
for rname in registry_names:
417437
sub = registry_df(rname).copy()
@@ -424,19 +444,10 @@ def repo_registry(repo=None, repo_cfg=None):
424444
)
425445
dfs.append(sub)
426446

427-
# Merge on common columns
447+
# Merge registries (union of columns, no key overlap allowed)
428448
if len(dfs) == 1:
429449
db = dfs[0]
430450
else:
431-
common_cols = set(dfs[0].columns)
432-
for sub in dfs[1:]:
433-
common_cols &= set(sub.columns)
434-
common_cols = sorted(common_cols)
435-
if site_key not in common_cols:
436-
raise ValueError(
437-
f"Column {site_key!r} not in common columns across registries"
438-
)
439-
440451
# Check for key overlap across sub-registries
441452
all_keys = []
442453
for i, sub in enumerate(dfs):
@@ -448,7 +459,6 @@ def repo_registry(repo=None, repo_cfg=None):
448459
)
449460
all_keys.extend(sub[site_key].values)
450461

451-
dfs = [sub[common_cols] for sub in dfs]
452462
db = pd.concat(dfs, ignore_index=True)
453463

454464
dup = db[site_key].duplicated()
@@ -547,6 +557,39 @@ def repo_config(repo_name):
547557
_repo_cache[repo_name] = spec
548558
return spec
549559

560+
def registry_spec(registry_name):
    """
    Look up the configuration entry for one named registry.

    The entry is read from the ``registries`` section of the module-level
    ``config``. A legacy entry written as a bare string is normalized to
    ``{"file": <string>}``; any other entry is returned as a shallow
    ``dict`` copy, so callers can modify the result without touching the
    loaded configuration.

    Raises
    ------
    ValueError
        If ``registry_name`` is not a key of the ``registries`` section.
    """
    known = config.get("registries", {})
    if registry_name in known:
        entry = known[registry_name]
        # Bare-string form carries only the file name.
        return {"file": entry} if isinstance(entry, str) else dict(entry)
    raise ValueError(f"Registry not found: {registry_name}")
574+
575+
576+
def registry_column_map(registry_name):
    """
    Fetch the ``column_map`` declared for a named registry.

    Returns a new ``dict`` copied from the registry's ``column_map`` entry.
    When the entry is absent or empty/None, an empty dict is returned.
    Errors raised by ``registry_spec`` for an unknown registry propagate
    unchanged.
    """
    mapping = registry_spec(registry_name).get("column_map", {})
    if not mapping:
        # Covers a missing key as well as an explicit null/empty value.
        return {}
    return dict(mapping)
582+
583+
584+
def registry_crs(registry_name):
    """
    Fetch the ``crs`` declaration for a named registry.

    Returns a new ``dict`` copied from the registry's ``crs`` entry, or
    ``None`` when the entry is absent or empty. Errors raised by
    ``registry_spec`` for an unknown registry propagate unchanged.
    """
    declared = registry_spec(registry_name).get("crs")
    if not declared:
        # Missing key, explicit null and empty mapping all mean "unset".
        return None
    return dict(declared)
591+
592+
550593
def registry_df(registry_name):
551594
"""
552595
Load a raw registry table by name.
@@ -604,7 +647,9 @@ def registry_df(registry_name):
604647
if registry_name not in registries:
605648
raise ValueError(f"Registry not found: {registry_name}")
606649

607-
reg_path = _resolve_config_path(registries[registry_name])
650+
spec = registries[registry_name]
651+
fname = spec["file"] if isinstance(spec, dict) else spec
652+
reg_path = _resolve_config_path(fname)
608653
if not os.path.exists(reg_path):
609654
raise ValueError(f"Registry file not found: {reg_path}")
610655

environment.yml

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,17 @@
11
name: dms_datastore
22
channels:
3-
- cadwr-dms
43
- conda-forge
4+
- cadwr-dms
55
- nodefaults
66
dependencies:
77
- python=3.12
88
- pip
99
- beautifulsoup4
10-
- dask
11-
- numba
10+
- dask>=0.6.24
11+
- numpy>=2.0
12+
- pandas>=2
13+
- numba>=0.61
14+
- vtools3>=3.9.21
1215
- scipy
1316
- tqdm
1417
- openpyxl
@@ -17,11 +20,8 @@ dependencies:
1720
- paramiko # for sftp cimis requests
1821
- tabula-py>=2.9.0 # for pdf parsing
1922
- jpype1
20-
- numpy
21-
- pandas>=2
2223
- cfgrib
2324
- xarray
24-
- vtools3
2525
- pyyaml
2626
- gdal
2727
- pytz # a new hrrr dependency ?

jenkins.environment.yml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,14 @@
11
name: dms_datastore
22
channels:
3-
- cadwr-dms
43
- conda-forge
4+
- cadwr-dms
55
- nodefaults
66
dependencies:
77
- python=3.12
88
- pip
99
- beautifulsoup4
10-
- dask
11-
- numba
10+
- dask>=2024
11+
- numba>=0.61
1212
- scipy
1313
- tqdm
1414
- openpyxl

pyproject.toml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -40,9 +40,9 @@ dependencies = [
4040
"omegaconf>=2.3",
4141
"paramiko",
4242
"pandas>=2",
43-
"numpy",
43+
"numpy>2",
4444
"xarray",
45-
"dask",
45+
"dask>2024",
4646
"scikit-learn",
4747
"matplotlib",
4848
"cfgrib",

0 commit comments

Comments
 (0)