PyPSA · MaykThewessen · Mar 25, 2026 · Mar 25, 2026 · Jun 22, 2026 · Jun 22, 2026
diff --git a/powerplantmatching/matching.py b/powerplantmatching/matching.py
@@ -20,6 +20,124 @@
 logger = logging.getLogger(__name__)
 
 
+def _match_by_eic(df0, df1, label0, label1):
+    """
+    Pair rows of two datasets that share an EIC (Energy Identification Code).
+
+    This deterministic step runs ahead of the Duke fuzzy matcher so that plants
+    carrying a common unique identifier are linked with certainty. Co-located
+    plants with similar names but different fuels (e.g. Eemshavencentrale coal
+    vs Eemscentrale gas in the Netherlands) are thereby kept apart instead of
+    being merged on name similarity.
+
+    A pair is accepted only if it is an *unambiguous* 1-to-1 link: through all
+    their shared codes, neither row is connected to any other row of the
+    opposite dataset. Everything else is intentionally left for Duke. Ambiguity
+    typically appears when one source reports an aggregated scheme under one
+    scheme-level EIC while the other lists individual stations that all carry
+    that same code (common for Alpine hydro: e.g. ENTSOE "Oberhasli Ag Kwo"
+    1307 MW vs eight OPSD stations all tagged ``12W-0000000031-O``); a single
+    shared code cannot single out the right partner in such a cluster.
+
+    Parameters
+    ----------
+    df0, df1 : pd.DataFrame
+        Rows to be matched. Both need an 'EIC' column whose cells hold
+        sets/lists of EIC codes (as produced by ``aggregate_units``).
+    label0, label1 : str
+        Names of the two datasets; they become the output column names.
+
+    Returns
+    -------
+    pd.DataFrame
+        Index pairs of the matched rows in columns ``[label0, label1]``. The
+        frame is empty if either EIC column is missing or nothing qualifies.
+    """
+    pair_cols = [label0, label1]
+    if not ("EIC" in df0.columns and "EIC" in df1.columns):
+        return pd.DataFrame(columns=pair_cols)
+
+    def _long_codes(frame, label):
+        # one output row per (row index, EIC code); entries that are not
+        # non-empty strings (e.g. NaN left by empty cells) are discarded
+        exploded = frame["EIC"].explode()
+        valid = exploded[exploded.map(lambda c: isinstance(c, str) and c != "")]
+        return valid.rename_axis(label).reset_index(name="EIC")
+
+    # join on the code, then collapse to one row per distinct index pair
+    shared = _long_codes(df0, label0).merge(_long_codes(df1, label1), on="EIC")
+    links = shared[pair_cols].drop_duplicates()
+    if links.empty:
+        return pd.DataFrame(columns=pair_cols)
+
+    # a pair is kept only when each of its rows has exactly one partner, i.e.
+    # the link is an isolated edge of the shared-code bipartite graph
+    partners_of_0 = links.groupby(label0)[label1].transform("size")
+    partners_of_1 = links.groupby(label1)[label0].transform("size")
+    ambiguous = partners_of_0.ne(1) | partners_of_1.ne(1)
+    matches = links[~ambiguous].reset_index(drop=True)
+
+    logger.info(
+        "EIC matching: %d deterministic 1-to-1 matches between `%s` and `%s` "
+        "(%d ambiguous link(s) left to fuzzy matching)",
+        len(matches),
+        label0,
+        label1,
+        int(ambiguous.sum()),
+    )
+    return matches
+
+
+class DirectMatcher:
+    """
+    Exact-identifier matching stage that precedes the Duke fuzzy matcher.
+
+    The instance stores an ordered list of direct matchers. Each one is a
+    callable ``matcher(df0, df1, label0, label1) -> pd.DataFrame`` returning
+    matched index pairs in columns ``[label0, label1]``. :meth:`run` calls them
+    one after another, dropping the rows a matcher has paired before the next
+    matcher is invoked, and hands back all pairs plus the leftover dataframes.
+    The fuzzy matcher can then be fed the unmatched remainder only, and further
+    deterministic matchers (e.g. name+country or project-id) can be appended to
+    the list without touching Duke.
+
+    Parameters
+    ----------
+    matchers : list of callable, optional
+        Direct matchers to apply. Defaults to ``[_match_by_eic]``.
+    """
+
+    def __init__(self, matchers=None):
+        chosen = [_match_by_eic] if matchers is None else matchers
+        self.matchers = list(chosen)
+
+    def run(self, df0, df1, label0, label1):
+        """
+        Run every direct matcher on a pair of datasets.
+
+        Returns
+        -------
+        matches : pd.DataFrame
+            All matched index pairs stacked, columns ``[label0, label1]``.
+        remaining : list of pd.DataFrame
+            ``[df0, df1]`` without the rows that were matched.
+        """
+        residual = [df0, df1]
+        found = []
+        for matcher in self.matchers:
+            pairs = matcher(residual[0], residual[1], label0, label1)
+            if not pairs.empty:
+                found.append(pairs)
+                # shrink both sides so later matchers cannot reuse these rows
+                residual = [
+                    side.drop(index=pairs[label], errors="ignore")
+                    for side, label in zip(residual, (label0, label1))
+                ]
+        if not found:
+            return pd.DataFrame(columns=[label0, label1]), residual
+        return pd.concat(found, ignore_index=True), residual
+
+
 def best_matches(links):
     """
     Subsequent to duke() with singlematch=True. Returns reduced list of
@@ -77,6 +195,14 @@ def compare_two_datasets(dfs, labels, country_wise=True, config=None, **dukeargs
     if "singlematch" not in dukeargs:
         dukeargs["singlematch"] = True
 
+    # ── Deterministic matching (before fuzzy) ────────────────────────
+    # Pair plants sharing exact identifiers (EIC) and drop them from the Duke
+    # input, so the fuzzy matcher only handles the unmatched remainder.
+    direct_matches, remaining = DirectMatcher().run(
+        dfs[0], dfs[1], labels[0], labels[1]
+    )
+
+    # ── Duke fuzzy matching on residual ──────────────────────────────
     def country_link(dfs, country):
         # country_selector for both dataframes
         sel_country_b = [df["Country"] == country for df in dfs]
@@ -90,20 +216,22 @@ def country_link(dfs, country):
 
     if country_wise:
         countries = config["target_countries"]
-        links = [country_link(dfs, c) for c in countries]
+        links = [country_link(remaining, c) for c in countries]
         links = [link for link in links if not link.empty]
         if links:
             links = pd.concat(links, ignore_index=True)
         else:
             links = pd.DataFrame(columns=[*labels, "scores"])
     else:
-        links = duke(dfs, labels=labels, **dukeargs)
+        links = duke(remaining, labels=labels, **dukeargs)
 
     if links.empty:
-        matches = pd.DataFrame(columns=labels)
+        duke_matches = pd.DataFrame(columns=labels)
     else:
-        matches = best_matches(links)
+        duke_matches = best_matches(links)
 
+    # ── Combine direct + Duke matches ────────────────────────────────
+    matches = pd.concat([direct_matches, duke_matches], ignore_index=True)
     return matches
 
 

diff --git a/test/test_matching.py b/test/test_matching.py
@@ -0,0 +1,216 @@
+# SPDX-FileCopyrightText: Contributors to powerplantmatching <https://github.com/pypsa/powerplantmatching>
+#
+# SPDX-License-Identifier: MIT
+
+import numpy as np
+import pandas as pd
+import pytest
+
+from powerplantmatching.matching import DirectMatcher, _match_by_eic
+
+
+@pytest.fixture
+def df_entsoe():
+    """ENTSOE-like frame of three Dutch plants, EIC codes held in sets."""
+    eic_codes = [
+        {"49W000000000EMSA"},
+        {"49W00000000008xG", "49W00000000008xK"},
+        {"49W000000000MVSQ"},
+    ]
+    columns = {
+        "Name": ["Eemshavencentrale", "Eemscentrale", "Maasvlakte"],
+        "Fueltype": ["Hard Coal", "Natural Gas", "Hard Coal"],
+        "Country": ["Netherlands"] * 3,
+        "Capacity": [1560.0, 2200.0, 1040.0],
+        "EIC": eic_codes,
+        "lat": [53.44, 53.44, 51.95],
+        "lon": [6.83, 6.84, 4.03],
+    }
+    return pd.DataFrame(columns)
+
+
+@pytest.fixture
+def df_opsd():
+    """OPSD-like frame of three Dutch plants, EIC codes held in sets."""
+    eic_codes = [
+        {"49W000000000EMSA"},
+        {"49W00000000008xG"},
+        set(),  # Rijnmond carries no EIC at all
+    ]
+    columns = {
+        "Name": ["Eemshaven coal", "Eems gas", "Rijnmond"],
+        "Fueltype": ["Hard Coal", "Natural Gas", "Natural Gas"],
+        "Country": ["Netherlands"] * 3,
+        "Capacity": [1560.0, 2200.0, 800.0],
+        "EIC": eic_codes,
+        "lat": [53.44, 53.44, 51.88],
+        "lon": [6.83, 6.84, 4.50],
+    }
+    return pd.DataFrame(columns)
+
+
+def test_eic_matching_basic(df_entsoe, df_opsd):
+    """EIC matching pairs each plant with its true counterpart, not a neighbour."""
+    matches = _match_by_eic(df_entsoe, df_opsd, "ENTSOE", "OPSD")
+    assert list(matches.columns) == ["ENTSOE", "OPSD"]
+
+    # Compare the (ENTSOE, OPSD) index pairs themselves: checking each column as
+    # a set would also accept the crossed coal<->gas pairing this step prevents.
+    #   Eemshavencentrale (0) <-> Eemshaven coal (0) via EMSA
+    #   Eemscentrale (1) <-> Eems gas (1) via 008xG (008xK is ENTSOE-only)
+    # Maasvlakte (2) and Rijnmond (2) share no EIC, so neither may appear.
+    pairs = sorted(zip(matches["ENTSOE"], matches["OPSD"]))
+    assert pairs == [(0, 0), (1, 1)]
+    assert len(matches) == 2
+
+
+def test_eic_matching_no_eic_column():
+    """A missing EIC column yields an empty frame with the right labels."""
+    left = pd.DataFrame({"Name": ["Plant A"], "Capacity": [100]})
+    right = pd.DataFrame({"Name": ["Plant B"], "Capacity": [200], "EIC": [{"CODE1"}]})
+
+    result = _match_by_eic(left, right, "A", "B")
+    assert result.columns.tolist() == ["A", "B"]
+    assert result.empty
+
+
+def test_eic_matching_empty_sets():
+    """Rows whose EIC sets are all empty cannot be linked."""
+    empty_a = pd.DataFrame({"Name": ["A"], "EIC": [set()]})
+    empty_b = pd.DataFrame({"Name": ["B"], "EIC": [set()]})
+
+    result = _match_by_eic(empty_a, empty_b, "X", "Y")
+    assert result.empty
+
+
+def test_eic_matching_nan_values():
+    """NaN members of EIC sets are ignored rather than matched to each other."""
+    nan_then_code = [{np.nan}, {"CODE1"}]
+    left = pd.DataFrame({"Name": ["A", "B"], "EIC": nan_then_code})
+    right = pd.DataFrame({"Name": ["X", "Y"], "EIC": nan_then_code})
+
+    result = _match_by_eic(left, right, "L", "R")
+    # the sole link comes from CODE1 (row 1 on both sides); nan links nothing
+    assert len(result) == 1
+    assert (set(result["L"]), set(result["R"])) == ({1}, {1})
+
+
+def test_eic_matching_nan_only():
+    """EIC columns holding only None (no collections at all) link nothing."""
+    left = pd.DataFrame({"Name": ["A"], "EIC": [None]})
+    right = pd.DataFrame({"Name": ["B"], "EIC": [None]})
+
+    result = _match_by_eic(left, right, "X", "Y")
+    assert result.empty
+
+
+def test_eic_matching_one_to_many_is_ambiguous():
+    """One row linked to several rows of the other dataset is left to Duke.
+
+    Plant A in the first frame holds {C1, C2}, while the second frame spreads
+    those codes over Plant X {C1} and Plant Y {C2}. A shared code alone does
+    not establish identity there, so the deterministic step returns nothing.
+    """
+    combined = pd.DataFrame({"Name": ["Plant A"], "EIC": [{"C1", "C2"}]})
+    split = pd.DataFrame({"Name": ["Plant X", "Plant Y"], "EIC": [{"C1"}, {"C2"}]})
+
+    result = _match_by_eic(combined, split, "src0", "src1")
+    assert result.empty
+
+
+def test_eic_matching_hydro_scheme_ambiguous():
+    """Regression for the Alpine-hydro aggregation mismatch (PR #289 review).
+
+    One ENTSOE row stands for the whole scheme (1307 MW) and carries a single
+    scheme-level EIC, whereas OPSD lists three stations that all reuse that
+    code. Any pick among them would be arbitrary, so no pair is returned.
+    """
+    scheme_eic = "12W-0000000031-O"
+    stations = ["Innertkirchen", "Grimsel", "Handeck"]
+    entsoe = pd.DataFrame(
+        {"Name": ["Oberhasli scheme"], "Capacity": [1307.0], "EIC": [{scheme_eic}]}
+    )
+    opsd = pd.DataFrame(
+        {
+            "Name": stations,
+            "Capacity": [10.0, 389.0, 316.0],
+            # every station is tagged with the one scheme-level code
+            "EIC": [{scheme_eic}] * 3,
+        }
+    )
+
+    result = _match_by_eic(entsoe, opsd, "ENTSOE", "OPSD")
+    assert result.empty
+
+
+def test_eic_matching_multi_code_one_to_one():
+    """Several codes shared by the same two rows still count as one match."""
+    codes = [f"C{i}" for i in range(1, 7)]
+    left = pd.DataFrame({"Name": ["Eems gas"], "EIC": [set(codes)]})
+    right = pd.DataFrame({"Name": ["Eemscentrale"], "EIC": [set(codes)]})
+
+    result = _match_by_eic(left, right, "L", "R")
+    # six shared codes produce six raw links, which must collapse into the
+    # single (0, 0) index pair
+    assert len(result) == 1
+    only = result.iloc[0]
+    assert only["L"] == 0
+    assert only["R"] == 0
+
+
+def test_eic_matching_subset_superset_one_to_one():
+    """A partial code overlap still matches when the link is unambiguous 1-to-1.
+
+    Here one source knows only a subset of the other's codes (e.g. OPSD lists
+    one unit code, ENTSOE two). No competing row exists, so the pair is safe.
+    """
+    fewer_codes = pd.DataFrame({"Name": ["Ballylumford"], "EIC": [{"C1"}]})
+    more_codes = pd.DataFrame({"Name": ["Ballylumford"], "EIC": [{"C1", "C2"}]})
+
+    result = _match_by_eic(fewer_codes, more_codes, "OPSD", "ENTSOE")
+    assert len(result) == 1
+
+
+def test_eic_matching_raw_string_treated_as_single_code():
+    """A raw string EIC (not wrapped in a set) is treated as one code."""
+    df0 = pd.DataFrame({"Name": ["A", "B"], "EIC": ["CODE1", {"CODE2"}]})
+    df1 = pd.DataFrame({"Name": ["X", "Y"], "EIC": [{"CODE1"}, {"CODE2"}]})
+
+    matches = _match_by_eic(df0, df1, "L", "R")
+    # CODE1 (raw string) and CODE2 (set) each yield a clean 1-to-1 match; the
+    # pairs are compared so a swapped or one-sided result cannot pass
+    assert sorted(zip(matches["L"], matches["R"])) == [(0, 0), (1, 1)]
+
+
+def test_direct_matcher_run_returns_matches_and_residual(df_entsoe, df_opsd):
+    """run() returns the EIC pairs together with the rows nobody claimed."""
+    matcher = DirectMatcher()
+    pairs, (left_rest, right_rest) = matcher.run(df_entsoe, df_opsd, "ENTSOE", "OPSD")
+
+    assert pairs.columns.tolist() == ["ENTSOE", "OPSD"]
+    assert len(pairs) == 2
+
+    # rows 0 and 1 were paired on each side, leaving only index 2 behind
+    assert set(left_rest.index) == {2}  # Maasvlakte
+    assert set(right_rest.index) == {2}  # Rijnmond
+
+
+def test_direct_matcher_no_matches_passes_everything_through():
+    """With no shared identifiers, residual equals the inputs and matches is empty."""
+    df0 = pd.DataFrame({"Name": ["A"], "EIC": [{"C1"}]})
+    df1 = pd.DataFrame({"Name": ["B"], "EIC": [{"C2"}]})
+
+    matches, (rem0, rem1) = DirectMatcher().run(df0, df1, "L", "R")
+    assert matches.empty
+    assert list(matches.columns) == ["L", "R"]
+    assert rem0.equals(df0) and rem1.equals(df1)
+
+
+def test_direct_matcher_custom_matcher_list():
+    """The matcher list is configurable; with none registered nothing is paired."""
+    left = pd.DataFrame({"Name": ["A"], "EIC": [{"C1"}]})
+    right = pd.DataFrame({"Name": ["B"], "EIC": [{"C1"}]})
+
+    pairs, residual = DirectMatcher(matchers=[]).run(left, right, "L", "R")
+    assert pairs.empty
+    assert [len(frame) for frame in residual] == [1, 1]