From b8b719fb1c25ced010d69436dcb6206a56715bbf Mon Sep 17 00:00:00 2001 From: Mayk Thewessen Date: Wed, 17 Jun 2026 15:14:34 +0200 Subject: [PATCH 1/3] MASTR: support refreshing from the open-mastr bulk export The MASTR source assumes the Zenodo CSV dump, whose last published version is frozen at 2025-02-09. This adds support for building the source from a current open-mastr bulk export instead: - get_raw_file: a glob fn selects the newest matching local file (ISO-dated names sort chronologically). The newest local dump wins even on update=True, so a forced refresh never regresses to the frozen URL. - MASTR(): match the storage_units CSV by suffix instead of a hardcoded bnetza_open_mastr_2025-02-09/ folder, so any dated dump loads. - MASTR(): guard ThermischeNutzleistung, which is present in the Zenodo CSV but absent from the bulk export (CHP detection falls back to KwkMastrNummer alone). - config: MASTR fn is now a glob (bnetza_open_mastr_*.zip), Zenodo URL kept as the fallback seed. - scripts/build_mastr_zip_from_open_mastr.py: build the ppm-format zip from an open-mastr SQLite DB. --- powerplantmatching/data.py | 18 +++- powerplantmatching/package_data/config.yaml | 5 +- powerplantmatching/utils.py | 19 +++- scripts/build_mastr_zip_from_open_mastr.py | 111 ++++++++++++++++++++ 4 files changed, 148 insertions(+), 5 deletions(-) create mode 100644 scripts/build_mastr_zip_from_open_mastr.py diff --git a/powerplantmatching/data.py b/powerplantmatching/data.py index 524c15b..af59c31 100644 --- a/powerplantmatching/data.py +++ b/powerplantmatching/data.py @@ -2408,8 +2408,12 @@ def MASTR( cols = ["NutzbareSpeicherkapazitaet", "VerknuepfteEinheit"] with ZipFile(fn, "r") as file: - fn_storage_units = ( - "bnetza_open_mastr_2025-02-09/bnetza_mastr_storage_units_raw.csv" + # Match by suffix rather than a hardcoded dated folder, so newer + # open-mastr dumps (different bnetza_open_mastr_/ prefix) load. 
+ fn_storage_units = next( + name + for name in file.namelist() + if name.endswith("bnetza_mastr_storage_units_raw.csv") ) storage_units = pd.read_csv(file.open(fn_storage_units), usecols=cols) @@ -2473,8 +2477,16 @@ def MASTR( parse_columns=PARSE_COLUMNS, ) .assign( + # ThermischeNutzleistung is present in the Zenodo CSV dump but absent + # from the open-mastr bulk export; fall back to KwkMastrNummer alone. Set=lambda df: df["Set"].where( - df["KwkMastrNummer"].isna() & df["ThermischeNutzleistung"].isna(), "CHP" + df["KwkMastrNummer"].isna() + & ( + df["ThermischeNutzleistung"].isna() + if "ThermischeNutzleistung" in df.columns + else True + ), + "CHP", ), ) ) diff --git a/powerplantmatching/package_data/config.yaml b/powerplantmatching/package_data/config.yaml index aced4f3..c84e94c 100644 --- a/powerplantmatching/package_data/config.yaml +++ b/powerplantmatching/package_data/config.yaml @@ -226,7 +226,10 @@ MASTR: reliability_score: 7 status: ["In Betrieb", "In Planung", "Endgültig stillgelegt", "Vorübergehend stillgelegt"] capacity_threshold: 0.1 # all values below will be filtered out, given in MW - fn: bnetza_open_mastr_2025-02-09.zip + # Glob selects the newest local dump (see get_raw_file). Build a current one + # from the open-mastr bulk export with scripts/build_mastr_zip_from_open_mastr.py; + # otherwise the frozen Zenodo dump below (last published 2025-02-09) is fetched. 
+ fn: bnetza_open_mastr_*.zip url: https://zenodo.org/records/14783581/files/bnetza_open_mastr_2025-02-09.zip EESI: net_capacity: true diff --git a/powerplantmatching/utils.py b/powerplantmatching/utils.py index 4b9396b..5fe5dbf 100644 --- a/powerplantmatching/utils.py +++ b/powerplantmatching/utils.py @@ -10,7 +10,9 @@ import os import re from ast import literal_eval as liteval +from glob import glob from importlib.metadata import version +from urllib.parse import urlparse import country_converter as coco import numpy as np @@ -76,7 +78,22 @@ def get_raw_file(name, update=False, config=None, skip_retrieve=False): if config is None: config = get_config() df_config = config[name] - path = _data_in(df_config["fn"]) + fn = df_config["fn"] + + # A glob pattern in `fn` selects the most recent matching local file + # (e.g. a locally built, dated dump such as a fresh open-mastr export). + # ISO-dated filenames sort chronologically, so the last match is the + # newest. The newest local match wins even when update=True: the URL is + # only a seed and can never be newer than a dated local build, so a forced + # refresh must not silently regress to stale data. Falls back to + # downloading the URL's basename when nothing local matches. + if any(c in fn for c in "*?["): + matches = sorted(glob(_data_in(fn))) + if matches: + return matches[-1] + path = _data_in(os.path.basename(urlparse(df_config["url"]).path)) + else: + path = _data_in(fn) if (not os.path.exists(path) or update) and not skip_retrieve: url = df_config["url"] diff --git a/scripts/build_mastr_zip_from_open_mastr.py b/scripts/build_mastr_zip_from_open_mastr.py new file mode 100644 index 0000000..f38c4b1 --- /dev/null +++ b/scripts/build_mastr_zip_from_open_mastr.py @@ -0,0 +1,111 @@ +""" +Build a powerplantmatching-compatible MaStR zip from an open-mastr SQLite DB. + +open-mastr's bulk download (`Mastr().download()`) writes the Marktstammdatenregister +into ~/.open-MaStR/data/sqlite/open-mastr.db. 
powerplantmatching's MASTR() loader, +however, reads a zip of `*_raw.csv` files (the layout of the Zenodo open-mastr dump). + +This script bridges the two: it exports the technology tables that ppm's loader +consumes into exactly those CSVs and zips them, so a fresh bulk download can refresh +ppm's MaStR source without waiting for the (infrequent) Zenodo re-release. + +Usage: + python scripts/build_mastr_zip_from_open_mastr.py [--db PATH] [--out PATH] [--date YYYY-MM-DD] + +The MASTR.fn glob in config.yaml then picks up the newest zip in ppm's data dir. +""" + +from __future__ import annotations + +import argparse +import os +import sqlite3 +import tempfile +import zipfile +from pathlib import Path + +import pandas as pd + +# DB table -> ppm CSV filename suffix (ppm matches by str.endswith). +TABLE_TO_CSV = { + "biomass_extended": "bnetza_mastr_biomass_raw.csv", + "combustion_extended": "bnetza_mastr_combustion_raw.csv", + "nuclear_extended": "bnetza_mastr_nuclear_raw.csv", + "hydro_extended": "bnetza_mastr_hydro_raw.csv", + "wind_extended": "bnetza_mastr_wind_raw.csv", + "solar_extended": "bnetza_mastr_solar_raw.csv", + "storage_extended": "bnetza_mastr_storage_raw.csv", + "storage_units": "bnetza_mastr_storage_units_raw.csv", +} + +# Columns ppm's MASTR() loader reads from the *_extended tables (see data.py). +# Exporting only these keeps the 6M-row solar CSV to a sane size; ppm intersects +# with what is present, so a missing column in one table is harmless. 
+EXTENDED_COLUMNS = [ + # target_columns + "GeplantesInbetriebnahmedatum", "ThermischeNutzleistung", "KwkMastrNummer", + "Batterietechnologie", "DatumBeginnVoruebergehendeStilllegung", + "DatumWiederaufnahmeBetrieb", "Postleitzahl", "Ort", "Gemeinde", "Landkreis", "Lage", + # PARSE_COLUMNS (Filesuffix is added by ppm, not sourced) + "ArtDerWasserkraftanlage", "Biomasseart", "Energietraeger", "Hauptbrennstoff", + "NameStromerzeugungseinheit", "NameKraftwerksblock", "NameWindpark", "Technologie", + # RENAME_COLUMNS keys + "EinheitMastrNummer", "NameKraftwerk", "Land", "Nettonennleistung", + "Inbetriebnahmedatum", "DatumEndgueltigeStilllegung", "EinheitBetriebsstatus", + "Laengengrad", "Breitengrad", "WEIC", +] +STORAGE_UNITS_COLUMNS = ["NutzbareSpeicherkapazitaet", "VerknuepfteEinheit"] + +DEFAULT_DB = Path.home() / ".open-MaStR" / "data" / "sqlite" / "open-mastr.db" +DEFAULT_OUT_DIR = Path.home() / ".local" / "share" / "powerplantmatching" / "data" / "in" + + +def _existing_columns(con: sqlite3.Connection, table: str) -> list[str]: + return [r[1] for r in con.execute(f"PRAGMA table_info({table})")] + + +def build(db_path: Path, out_path: Path, date_tag: str) -> None: + con = sqlite3.connect(db_path) + folder = f"bnetza_open_mastr_{date_tag}" + out_path.parent.mkdir(parents=True, exist_ok=True) + + with tempfile.TemporaryDirectory() as tmp, zipfile.ZipFile( + out_path, "w", compression=zipfile.ZIP_DEFLATED, compresslevel=6 + ) as zf: + for table, csv_name in TABLE_TO_CSV.items(): + avail = _existing_columns(con, table) + if not avail: + print(f" skip {table}: table missing/empty") + continue + wanted = STORAGE_UNITS_COLUMNS if table == "storage_units" else EXTENDED_COLUMNS + cols = [c for c in wanted if c in avail] + if not cols: + print(f" skip {table}: none of the wanted columns present") + continue + df = pd.read_sql(f"SELECT {', '.join(cols)} FROM {table}", con) + csv_path = Path(tmp) / csv_name + df.to_csv(csv_path, index=False) + zf.write(csv_path, 
arcname=f"{folder}/{csv_name}") + print(f" {table:22} -> {csv_name:38} rows={len(df):>9} cols={len(cols)}") + con.close() + print(f"\nWrote {out_path} ({out_path.stat().st_size / 1e6:.0f} MB)") + + +def main() -> None: + ap = argparse.ArgumentParser() + ap.add_argument("--db", type=Path, default=DEFAULT_DB) + ap.add_argument("--date", default=None, help="date tag for folder/filename, e.g. 2026-06-14") + ap.add_argument("--out", type=Path, default=None) + args = ap.parse_args() + + date_tag = args.date or os.environ.get("MASTR_DATE_TAG") + if not date_tag: + raise SystemExit("pass --date YYYY-MM-DD (the bulk export date)") + + out_path = args.out or (DEFAULT_OUT_DIR / f"bnetza_open_mastr_{date_tag}.zip") + print(f"DB: {args.db}\nOut: {out_path}\n") + build(args.db, out_path, date_tag) + + +if __name__ == "__main__": + main() From b5672915687accdfca1fef396b7521811fa21724 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 17 Jun 2026 13:15:10 +0000 Subject: [PATCH 2/3] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- scripts/build_mastr_zip_from_open_mastr.py | 58 ++++++++++++++++------ 1 file changed, 44 insertions(+), 14 deletions(-) diff --git a/scripts/build_mastr_zip_from_open_mastr.py b/scripts/build_mastr_zip_from_open_mastr.py index f38c4b1..4e636c0 100644 --- a/scripts/build_mastr_zip_from_open_mastr.py +++ b/scripts/build_mastr_zip_from_open_mastr.py @@ -43,21 +43,44 @@ # with what is present, so a missing column in one table is harmless. 
EXTENDED_COLUMNS = [ # target_columns - "GeplantesInbetriebnahmedatum", "ThermischeNutzleistung", "KwkMastrNummer", - "Batterietechnologie", "DatumBeginnVoruebergehendeStilllegung", - "DatumWiederaufnahmeBetrieb", "Postleitzahl", "Ort", "Gemeinde", "Landkreis", "Lage", + "GeplantesInbetriebnahmedatum", + "ThermischeNutzleistung", + "KwkMastrNummer", + "Batterietechnologie", + "DatumBeginnVoruebergehendeStilllegung", + "DatumWiederaufnahmeBetrieb", + "Postleitzahl", + "Ort", + "Gemeinde", + "Landkreis", + "Lage", # PARSE_COLUMNS (Filesuffix is added by ppm, not sourced) - "ArtDerWasserkraftanlage", "Biomasseart", "Energietraeger", "Hauptbrennstoff", - "NameStromerzeugungseinheit", "NameKraftwerksblock", "NameWindpark", "Technologie", + "ArtDerWasserkraftanlage", + "Biomasseart", + "Energietraeger", + "Hauptbrennstoff", + "NameStromerzeugungseinheit", + "NameKraftwerksblock", + "NameWindpark", + "Technologie", # RENAME_COLUMNS keys - "EinheitMastrNummer", "NameKraftwerk", "Land", "Nettonennleistung", - "Inbetriebnahmedatum", "DatumEndgueltigeStilllegung", "EinheitBetriebsstatus", - "Laengengrad", "Breitengrad", "WEIC", + "EinheitMastrNummer", + "NameKraftwerk", + "Land", + "Nettonennleistung", + "Inbetriebnahmedatum", + "DatumEndgueltigeStilllegung", + "EinheitBetriebsstatus", + "Laengengrad", + "Breitengrad", + "WEIC", ] STORAGE_UNITS_COLUMNS = ["NutzbareSpeicherkapazitaet", "VerknuepfteEinheit"] DEFAULT_DB = Path.home() / ".open-MaStR" / "data" / "sqlite" / "open-mastr.db" -DEFAULT_OUT_DIR = Path.home() / ".local" / "share" / "powerplantmatching" / "data" / "in" +DEFAULT_OUT_DIR = ( + Path.home() / ".local" / "share" / "powerplantmatching" / "data" / "in" +) def _existing_columns(con: sqlite3.Connection, table: str) -> list[str]: @@ -69,15 +92,20 @@ def build(db_path: Path, out_path: Path, date_tag: str) -> None: folder = f"bnetza_open_mastr_{date_tag}" out_path.parent.mkdir(parents=True, exist_ok=True) - with tempfile.TemporaryDirectory() as tmp, zipfile.ZipFile( 
- out_path, "w", compression=zipfile.ZIP_DEFLATED, compresslevel=6 - ) as zf: + with ( + tempfile.TemporaryDirectory() as tmp, + zipfile.ZipFile( + out_path, "w", compression=zipfile.ZIP_DEFLATED, compresslevel=6 + ) as zf, + ): for table, csv_name in TABLE_TO_CSV.items(): avail = _existing_columns(con, table) if not avail: print(f" skip {table}: table missing/empty") continue - wanted = STORAGE_UNITS_COLUMNS if table == "storage_units" else EXTENDED_COLUMNS + wanted = ( + STORAGE_UNITS_COLUMNS if table == "storage_units" else EXTENDED_COLUMNS + ) cols = [c for c in wanted if c in avail] if not cols: print(f" skip {table}: none of the wanted columns present") @@ -94,7 +122,9 @@ def build(db_path: Path, out_path: Path, date_tag: str) -> None: def main() -> None: ap = argparse.ArgumentParser() ap.add_argument("--db", type=Path, default=DEFAULT_DB) - ap.add_argument("--date", default=None, help="date tag for folder/filename, e.g. 2026-06-14") + ap.add_argument( + "--date", default=None, help="date tag for folder/filename, e.g. 2026-06-14" + ) ap.add_argument("--out", type=Path, default=None) args = ap.parse_args() From cd02d745446684c8b2a5be59703023e48e5965c7 Mon Sep 17 00:00:00 2001 From: Mayk Thewessen Date: Fri, 19 Jun 2026 15:04:19 +0200 Subject: [PATCH 3/3] Add SPDX license header to build_mastr_zip script The new helper script lacked the REUSE-required SPDX header, failing the reuse pre-commit.ci check on #298. Add the MIT header used across the package. --- scripts/build_mastr_zip_from_open_mastr.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/scripts/build_mastr_zip_from_open_mastr.py b/scripts/build_mastr_zip_from_open_mastr.py index 4e636c0..ec92593 100644 --- a/scripts/build_mastr_zip_from_open_mastr.py +++ b/scripts/build_mastr_zip_from_open_mastr.py @@ -1,3 +1,7 @@ +# SPDX-FileCopyrightText: Contributors to powerplantmatching +# +# SPDX-License-Identifier: MIT + """ Build a powerplantmatching-compatible MaStR zip from an open-mastr SQLite DB.