From b8b719fb1c25ced010d69436dcb6206a56715bbf Mon Sep 17 00:00:00 2001
From: Mayk Thewessen <mayk_12@hotmail.com>
Date: Wed, 17 Jun 2026 15:14:34 +0200
Subject: [PATCH 1/3] MASTR: support refreshing from the open-mastr bulk export

The MASTR source assumes the Zenodo CSV dump, whose last published version
is frozen at 2025-02-09. This adds support for building the source from a
current open-mastr bulk export instead:

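- get_raw_file: when the configured `fn` is a glob pattern, the newest
  matching local file is selected (ISO-dated names sort chronologically).
  The newest local dump wins even with update=True, so a forced refresh
  never regresses to the frozen Zenodo URL; the URL is only downloaded
  when no local file matches the pattern.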
- MASTR(): match the storage_units CSV by suffix instead of a hardcoded
  bnetza_open_mastr_2025-02-09/ folder, so any dated dump loads.
- MASTR(): guard ThermischeNutzleistung, which is present in the Zenodo CSV
  but absent from the bulk export (CHP detection falls back to
  KwkMastrNummer alone).
- config: MASTR fn is now a glob (bnetza_open_mastr_*.zip), Zenodo URL kept
  as the fallback seed.
- scripts/build_mastr_zip_from_open_mastr.py: build the ppm-format zip from
  an open-mastr SQLite DB.
---
 powerplantmatching/data.py                  |  18 +++-
 powerplantmatching/package_data/config.yaml |   5 +-
 powerplantmatching/utils.py                 |  19 +++-
 scripts/build_mastr_zip_from_open_mastr.py  | 111 ++++++++++++++++++++
 4 files changed, 148 insertions(+), 5 deletions(-)
 create mode 100644 scripts/build_mastr_zip_from_open_mastr.py
diff --git a/powerplantmatching/data.py b/powerplantmatching/data.py
index 524c15b..af59c31 100644
--- a/powerplantmatching/data.py
+++ b/powerplantmatching/data.py
@@ -2408,8 +2408,12 @@ def MASTR(
 
     cols = ["NutzbareSpeicherkapazitaet", "VerknuepfteEinheit"]
     with ZipFile(fn, "r") as file:
-        fn_storage_units = (
-            "bnetza_open_mastr_2025-02-09/bnetza_mastr_storage_units_raw.csv"
+        # Match by suffix rather than a hardcoded dated folder, so newer
+        # open-mastr dumps (different bnetza_open_mastr_<date>/ prefix) load.
+        fn_storage_units = next(
+            name
+            for name in file.namelist()
+            if name.endswith("bnetza_mastr_storage_units_raw.csv")
         )
         storage_units = pd.read_csv(file.open(fn_storage_units), usecols=cols)
 
@@ -2473,8 +2477,16 @@ def MASTR(
             parse_columns=PARSE_COLUMNS,
         )
         .assign(
+            # ThermischeNutzleistung is present in the Zenodo CSV dump but absent
+            # from the open-mastr bulk export; fall back to KwkMastrNummer alone.
             Set=lambda df: df["Set"].where(
-                df["KwkMastrNummer"].isna() & df["ThermischeNutzleistung"].isna(), "CHP"
+                df["KwkMastrNummer"].isna()
+                & (
+                    df["ThermischeNutzleistung"].isna()
+                    if "ThermischeNutzleistung" in df.columns
+                    else True
+                ),
+                "CHP",
             ),
         )
     )
diff --git a/powerplantmatching/package_data/config.yaml b/powerplantmatching/package_data/config.yaml
index aced4f3..c84e94c 100644
--- a/powerplantmatching/package_data/config.yaml
+++ b/powerplantmatching/package_data/config.yaml
@@ -226,7 +226,10 @@ MASTR:
   reliability_score: 7
   status: ["In Betrieb", "In Planung", "Endgültig stillgelegt", "Vorübergehend stillgelegt"]
   capacity_threshold: 0.1  # all values below will be filtered out, given in MW
-  fn: bnetza_open_mastr_2025-02-09.zip
+  # Glob selects the newest local dump (see get_raw_file). Build a current one
+  # from the open-mastr bulk export with scripts/build_mastr_zip_from_open_mastr.py;
+  # otherwise the frozen Zenodo dump below (last published 2025-02-09) is fetched.
+  fn: bnetza_open_mastr_*.zip
   url: https://zenodo.org/records/14783581/files/bnetza_open_mastr_2025-02-09.zip
 EESI:
   net_capacity: true
diff --git a/powerplantmatching/utils.py b/powerplantmatching/utils.py
index 4b9396b..5fe5dbf 100644
--- a/powerplantmatching/utils.py
+++ b/powerplantmatching/utils.py
@@ -10,7 +10,9 @@
 import os
 import re
 from ast import literal_eval as liteval
+from glob import glob
 from importlib.metadata import version
+from urllib.parse import urlparse
 
 import country_converter as coco
 import numpy as np
@@ -76,7 +78,22 @@ def get_raw_file(name, update=False, config=None, skip_retrieve=False):
     if config is None:
         config = get_config()
     df_config = config[name]
-    path = _data_in(df_config["fn"])
+    fn = df_config["fn"]
+
+    # A glob pattern in `fn` selects the most recent matching local file
+    # (e.g. a locally built, dated dump such as a fresh open-mastr export).
+    # ISO-dated filenames sort chronologically, so the last match is the
+    # newest. The newest local match wins even when update=True: the URL is
+    # only a seed and can never be newer than a dated local build, so a forced
+    # refresh must not silently regress to stale data. Falls back to
+    # downloading the URL's basename when nothing local matches.
+    if any(c in fn for c in "*?["):
+        matches = sorted(glob(_data_in(fn)))
+        if matches:
+            return matches[-1]
+        path = _data_in(os.path.basename(urlparse(df_config["url"]).path))
+    else:
+        path = _data_in(fn)
 
     if (not os.path.exists(path) or update) and not skip_retrieve:
         url = df_config["url"]
diff --git a/scripts/build_mastr_zip_from_open_mastr.py b/scripts/build_mastr_zip_from_open_mastr.py
new file mode 100644
index 0000000..f38c4b1
--- /dev/null
+++ b/scripts/build_mastr_zip_from_open_mastr.py
@@ -0,0 +1,111 @@
+"""
+Build a powerplantmatching-compatible MaStR zip from an open-mastr SQLite DB.
+
+open-mastr's bulk download (`Mastr().download()`) writes the Marktstammdatenregister
+into ~/.open-MaStR/data/sqlite/open-mastr.db. powerplantmatching's MASTR() loader,
+however, reads a zip of `*_raw.csv` files (the layout of the Zenodo open-mastr dump).
+
+This script bridges the two: it exports the technology tables that ppm's loader
+consumes into exactly those CSVs and zips them, so a fresh bulk download can refresh
+ppm's MaStR source without waiting for the (infrequent) Zenodo re-release.
+
+Usage:
+    python scripts/build_mastr_zip_from_open_mastr.py [--db PATH] [--out PATH] [--date YYYY-MM-DD]
+
+The default --out lands in ppm's data dir, where the config.yaml MASTR.fn glob finds it.
+"""
+
+from __future__ import annotations
+
+import argparse
+import os
+import sqlite3
+import tempfile
+import zipfile
+from pathlib import Path
+
+import pandas as pd
+
+# DB table -> ppm CSV filename suffix (ppm matches by str.endswith).
+TABLE_TO_CSV = {
+    "biomass_extended": "bnetza_mastr_biomass_raw.csv",
+    "combustion_extended": "bnetza_mastr_combustion_raw.csv",
+    "nuclear_extended": "bnetza_mastr_nuclear_raw.csv",
+    "hydro_extended": "bnetza_mastr_hydro_raw.csv",
+    "wind_extended": "bnetza_mastr_wind_raw.csv",
+    "solar_extended": "bnetza_mastr_solar_raw.csv",
+    "storage_extended": "bnetza_mastr_storage_raw.csv",
+    "storage_units": "bnetza_mastr_storage_units_raw.csv",
+}
+
+# Columns ppm's MASTR() loader reads from the *_extended tables (see data.py).
+# Exporting only these keeps the 6M-row solar CSV to a sane size; ppm intersects
+# with what is present, so a missing column in one table is harmless.
+EXTENDED_COLUMNS = [
+    # target_columns
+    "GeplantesInbetriebnahmedatum", "ThermischeNutzleistung", "KwkMastrNummer",
+    "Batterietechnologie", "DatumBeginnVoruebergehendeStilllegung",
+    "DatumWiederaufnahmeBetrieb", "Postleitzahl", "Ort", "Gemeinde", "Landkreis", "Lage",
+    # PARSE_COLUMNS (Filesuffix is added by ppm, not sourced)
+    "ArtDerWasserkraftanlage", "Biomasseart", "Energietraeger", "Hauptbrennstoff",
+    "NameStromerzeugungseinheit", "NameKraftwerksblock", "NameWindpark", "Technologie",
+    # RENAME_COLUMNS keys
+    "EinheitMastrNummer", "NameKraftwerk", "Land", "Nettonennleistung",
+    "Inbetriebnahmedatum", "DatumEndgueltigeStilllegung", "EinheitBetriebsstatus",
+    "Laengengrad", "Breitengrad", "WEIC",
+]
+STORAGE_UNITS_COLUMNS = ["NutzbareSpeicherkapazitaet", "VerknuepfteEinheit"]
+
+DEFAULT_DB = Path.home() / ".open-MaStR" / "data" / "sqlite" / "open-mastr.db"
+DEFAULT_OUT_DIR = Path.home() / ".local" / "share" / "powerplantmatching" / "data" / "in"
+
+
+def _existing_columns(con: sqlite3.Connection, table: str) -> list[str]:
+    return [r[1] for r in con.execute(f"PRAGMA table_info({table})")]
+
+
+def build(db_path: Path, out_path: Path, date_tag: str) -> None:
+    con = sqlite3.connect(db_path)
+    folder = f"bnetza_open_mastr_{date_tag}"
+    out_path.parent.mkdir(parents=True, exist_ok=True)
+
+    with tempfile.TemporaryDirectory() as tmp, zipfile.ZipFile(
+        out_path, "w", compression=zipfile.ZIP_DEFLATED, compresslevel=6
+    ) as zf:
+        for table, csv_name in TABLE_TO_CSV.items():
+            avail = _existing_columns(con, table)
+            if not avail:
+                print(f"  skip {table}: table missing/empty")
+                continue
+            wanted = STORAGE_UNITS_COLUMNS if table == "storage_units" else EXTENDED_COLUMNS
+            cols = [c for c in wanted if c in avail]
+            if not cols:
+                print(f"  skip {table}: none of the wanted columns present")
+                continue
+            df = pd.read_sql(f"SELECT {', '.join(cols)} FROM {table}", con)
+            csv_path = Path(tmp) / csv_name
+            df.to_csv(csv_path, index=False)
+            zf.write(csv_path, arcname=f"{folder}/{csv_name}")
+            print(f"  {table:22} -> {csv_name:38} rows={len(df):>9} cols={len(cols)}")
+    con.close()
+    print(f"\nWrote {out_path}  ({out_path.stat().st_size / 1e6:.0f} MB)")
+
+
+def main() -> None:
+    ap = argparse.ArgumentParser()
+    ap.add_argument("--db", type=Path, default=DEFAULT_DB)
+    ap.add_argument("--date", default=None, help="date tag for folder/filename, e.g. 2026-06-14")
+    ap.add_argument("--out", type=Path, default=None)
+    args = ap.parse_args()
+
+    date_tag = args.date or os.environ.get("MASTR_DATE_TAG")
+    if not date_tag:
+        raise SystemExit("pass --date YYYY-MM-DD (the bulk export date)")
+
+    out_path = args.out or (DEFAULT_OUT_DIR / f"bnetza_open_mastr_{date_tag}.zip")
+    print(f"DB:  {args.db}\nOut: {out_path}\n")
+    build(args.db, out_path, date_tag)
+
+
+if __name__ == "__main__":
+    main()

From b5672915687accdfca1fef396b7521811fa21724 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Wed, 17 Jun 2026 13:15:10 +0000
Subject: [PATCH 2/3] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 scripts/build_mastr_zip_from_open_mastr.py | 58 ++++++++++++++++------
 1 file changed, 44 insertions(+), 14 deletions(-)

diff --git a/scripts/build_mastr_zip_from_open_mastr.py b/scripts/build_mastr_zip_from_open_mastr.py
index f38c4b1..4e636c0 100644
--- a/scripts/build_mastr_zip_from_open_mastr.py
+++ b/scripts/build_mastr_zip_from_open_mastr.py
@@ -43,21 +43,44 @@
 # with what is present, so a missing column in one table is harmless.
 EXTENDED_COLUMNS = [
     # target_columns
-    "GeplantesInbetriebnahmedatum", "ThermischeNutzleistung", "KwkMastrNummer",
-    "Batterietechnologie", "DatumBeginnVoruebergehendeStilllegung",
-    "DatumWiederaufnahmeBetrieb", "Postleitzahl", "Ort", "Gemeinde", "Landkreis", "Lage",
+    "GeplantesInbetriebnahmedatum",
+    "ThermischeNutzleistung",
+    "KwkMastrNummer",
+    "Batterietechnologie",
+    "DatumBeginnVoruebergehendeStilllegung",
+    "DatumWiederaufnahmeBetrieb",
+    "Postleitzahl",
+    "Ort",
+    "Gemeinde",
+    "Landkreis",
+    "Lage",
     # PARSE_COLUMNS (Filesuffix is added by ppm, not sourced)
-    "ArtDerWasserkraftanlage", "Biomasseart", "Energietraeger", "Hauptbrennstoff",
-    "NameStromerzeugungseinheit", "NameKraftwerksblock", "NameWindpark", "Technologie",
+    "ArtDerWasserkraftanlage",
+    "Biomasseart",
+    "Energietraeger",
+    "Hauptbrennstoff",
+    "NameStromerzeugungseinheit",
+    "NameKraftwerksblock",
+    "NameWindpark",
+    "Technologie",
     # RENAME_COLUMNS keys
-    "EinheitMastrNummer", "NameKraftwerk", "Land", "Nettonennleistung",
-    "Inbetriebnahmedatum", "DatumEndgueltigeStilllegung", "EinheitBetriebsstatus",
-    "Laengengrad", "Breitengrad", "WEIC",
+    "EinheitMastrNummer",
+    "NameKraftwerk",
+    "Land",
+    "Nettonennleistung",
+    "Inbetriebnahmedatum",
+    "DatumEndgueltigeStilllegung",
+    "EinheitBetriebsstatus",
+    "Laengengrad",
+    "Breitengrad",
+    "WEIC",
 ]
 STORAGE_UNITS_COLUMNS = ["NutzbareSpeicherkapazitaet", "VerknuepfteEinheit"]
 
 DEFAULT_DB = Path.home() / ".open-MaStR" / "data" / "sqlite" / "open-mastr.db"
-DEFAULT_OUT_DIR = Path.home() / ".local" / "share" / "powerplantmatching" / "data" / "in"
+DEFAULT_OUT_DIR = (
+    Path.home() / ".local" / "share" / "powerplantmatching" / "data" / "in"
+)
 
 
 def _existing_columns(con: sqlite3.Connection, table: str) -> list[str]:
@@ -69,15 +92,20 @@ def build(db_path: Path, out_path: Path, date_tag: str) -> None:
     folder = f"bnetza_open_mastr_{date_tag}"
     out_path.parent.mkdir(parents=True, exist_ok=True)
 
-    with tempfile.TemporaryDirectory() as tmp, zipfile.ZipFile(
-        out_path, "w", compression=zipfile.ZIP_DEFLATED, compresslevel=6
-    ) as zf:
+    with (
+        tempfile.TemporaryDirectory() as tmp,
+        zipfile.ZipFile(
+            out_path, "w", compression=zipfile.ZIP_DEFLATED, compresslevel=6
+        ) as zf,
+    ):
         for table, csv_name in TABLE_TO_CSV.items():
             avail = _existing_columns(con, table)
             if not avail:
                 print(f"  skip {table}: table missing/empty")
                 continue
-            wanted = STORAGE_UNITS_COLUMNS if table == "storage_units" else EXTENDED_COLUMNS
+            wanted = (
+                STORAGE_UNITS_COLUMNS if table == "storage_units" else EXTENDED_COLUMNS
+            )
             cols = [c for c in wanted if c in avail]
             if not cols:
                 print(f"  skip {table}: none of the wanted columns present")
@@ -94,7 +122,9 @@ def build(db_path: Path, out_path: Path, date_tag: str) -> None:
 def main() -> None:
     ap = argparse.ArgumentParser()
     ap.add_argument("--db", type=Path, default=DEFAULT_DB)
-    ap.add_argument("--date", default=None, help="date tag for folder/filename, e.g. 2026-06-14")
+    ap.add_argument(
+        "--date", default=None, help="date tag for folder/filename, e.g. 2026-06-14"
+    )
     ap.add_argument("--out", type=Path, default=None)
     args = ap.parse_args()
 

From cd02d745446684c8b2a5be59703023e48e5965c7 Mon Sep 17 00:00:00 2001
From: Mayk Thewessen <mayk_12@hotmail.com>
Date: Fri, 19 Jun 2026 15:04:19 +0200
Subject: [PATCH 3/3] Add SPDX license header to build_mastr_zip script

The new helper script lacked the REUSE-required SPDX header, failing the
reuse pre-commit.ci check on #298. Add the MIT header used across the
package.
---
 scripts/build_mastr_zip_from_open_mastr.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/scripts/build_mastr_zip_from_open_mastr.py b/scripts/build_mastr_zip_from_open_mastr.py
index 4e636c0..ec92593 100644
--- a/scripts/build_mastr_zip_from_open_mastr.py
+++ b/scripts/build_mastr_zip_from_open_mastr.py
@@ -1,3 +1,7 @@
+# SPDX-FileCopyrightText: Contributors to powerplantmatching <https://github.com/pypsa/powerplantmatching>
+#
+# SPDX-License-Identifier: MIT
+
 """
 Build a powerplantmatching-compatible MaStR zip from an open-mastr SQLite DB.