Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
136 changes: 132 additions & 4 deletions powerplantmatching/matching.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,124 @@
logger = logging.getLogger(__name__)


def _match_by_eic(df0, df1, label0, label1):
    """
    Pair rows of two datasets that share an EIC (Energy Identification Code).

    This is an exact-identifier match meant to run ahead of Duke fuzzy
    matching: plants carrying the same code are paired with certainty, so
    co-located plants with similar names but different fuels (e.g.
    Eemshavencentrale coal vs Eemscentrale gas in the Netherlands) are not
    merged by the fuzzy matcher.

    A pair is accepted only if it is *unambiguous*: the row from ``df0`` shares
    codes with exactly one row of ``df1`` and vice versa. Everything else is
    left for Duke. Ambiguity typically shows up when one source reports an
    aggregated scheme under a single scheme-level EIC while the other lists
    the individual stations under that same code (common for Alpine hydro,
    e.g. ENTSOE "Oberhasli Ag Kwo" 1307 MW vs eight OPSD stations all tagged
    ``12W-0000000031-O``); one shared code cannot decide which station is the
    right partner.

    Parameters
    ----------
    df0, df1 : pd.DataFrame
        Datasets whose 'EIC' column holds sets/lists of EIC codes (as produced
        by ``aggregate_units``). Entries that are not non-empty strings are
        ignored.
    label0, label1 : str
        Dataset names; they become the column names of the result.

    Returns
    -------
    pd.DataFrame
        Matched index pairs in columns ``[label0, label1]``. Empty if either
        frame lacks an 'EIC' column or no unambiguous pair exists.
    """
    pair_columns = [label0, label1]
    if not ("EIC" in df0.columns and "EIC" in df1.columns):
        return pd.DataFrame(columns=pair_columns)

    def _is_code(value):
        # only non-empty strings count as codes (drops NaN/None and "")
        return isinstance(value, str) and value != ""

    def _long_codes(frame, label):
        # one output row per (row index, code); the index lands in column `label`
        exploded = frame["EIC"].explode()
        valid = exploded[exploded.map(_is_code)]
        return valid.rename_axis(label).reset_index(name="EIC")

    # every (df0 row, df1 row) combination with at least one code in common;
    # rows sharing several codes collapse to a single pair
    shared = pd.merge(_long_codes(df0, label0), _long_codes(df1, label1), on="EIC")
    pairs = shared[pair_columns].drop_duplicates()
    if pairs.empty:
        return pd.DataFrame(columns=pair_columns)

    # number of distinct partners each row has on the opposite side; a pair is
    # isolated (unambiguous) only when both of its rows have exactly one
    partners_of_0 = pairs.groupby(label0)[label1].transform("size")
    partners_of_1 = pairs.groupby(label1)[label0].transform("size")
    isolated = partners_of_0.eq(1) & partners_of_1.eq(1)
    matches = pairs[isolated].reset_index(drop=True)

    ambiguous_links = int((~isolated).sum())
    logger.info(
        "EIC matching: %d deterministic 1-to-1 matches between `%s` and `%s` "
        "(%d ambiguous link(s) left to fuzzy matching)",
        len(matches),
        label0,
        label1,
        ambiguous_links,
    )
    return matches


class DirectMatcher:
    """
    Deterministic (non-fuzzy) matching stage that runs ahead of Duke.

    The instance owns an ordered list of exact-identifier matchers. Each one is
    a callable ``matcher(df0, df1, label0, label1) -> pd.DataFrame`` returning
    matched index pairs in columns ``[label0, label1]``. :meth:`run` calls them
    one after another, dropping the rows a matcher has paired before the next
    matcher is called, and hands back both the accumulated pairs and the
    leftover rows. The fuzzy matcher can then be fed the leftovers only, and
    further exact matchers (name+country, project id, ...) can be added to the
    list without touching Duke.

    Parameters
    ----------
    matchers : list of callable, optional
        Direct matchers to apply, in order. Defaults to ``[_match_by_eic]``.
    """

    def __init__(self, matchers=None):
        if matchers is None:
            matchers = [_match_by_eic]
        # copy so later changes to the caller's list do not affect this instance
        self.matchers = list(matchers)

    def run(self, df0, df1, label0, label1):
        """
        Run every configured matcher on a pair of datasets.

        Returns
        -------
        matches : pd.DataFrame
            All matched index pairs, columns ``[label0, label1]``.
        remaining : list of pd.DataFrame
            ``[df0, df1]`` without the rows that were matched.
        """
        residual0, residual1 = df0, df1
        found = []
        for matcher in self.matchers:
            pairs = matcher(residual0, residual1, label0, label1)
            if not pairs.empty:
                found.append(pairs)
                # matched rows must not be offered to later matchers;
                # labels missing from the residual are silently skipped
                residual0 = residual0.drop(index=pairs[label0], errors="ignore")
                residual1 = residual1.drop(index=pairs[label1], errors="ignore")

        if not found:
            matches = pd.DataFrame(columns=[label0, label1])
        else:
            matches = pd.concat(found, ignore_index=True)
        return matches, [residual0, residual1]


def best_matches(links):
"""
Subsequent to duke() with singlematch=True. Returns reduced list of
Expand Down Expand Up @@ -77,6 +195,14 @@ def compare_two_datasets(dfs, labels, country_wise=True, config=None, **dukeargs
if "singlematch" not in dukeargs:
dukeargs["singlematch"] = True

# ── Deterministic matching (before fuzzy) ────────────────────────
# Pair plants sharing exact identifiers (EIC) and drop them from the Duke
# input, so the fuzzy matcher only handles the unmatched remainder.
direct_matches, remaining = DirectMatcher().run(
dfs[0], dfs[1], labels[0], labels[1]
)

# ── Duke fuzzy matching on residual ──────────────────────────────
def country_link(dfs, country):
# country_selector for both dataframes
sel_country_b = [df["Country"] == country for df in dfs]
Expand All @@ -90,20 +216,22 @@ def country_link(dfs, country):

if country_wise:
countries = config["target_countries"]
links = [country_link(dfs, c) for c in countries]
links = [country_link(remaining, c) for c in countries]
links = [link for link in links if not link.empty]
if links:
links = pd.concat(links, ignore_index=True)
else:
links = pd.DataFrame(columns=[*labels, "scores"])
else:
links = duke(dfs, labels=labels, **dukeargs)
links = duke(remaining, labels=labels, **dukeargs)

if links.empty:
matches = pd.DataFrame(columns=labels)
duke_matches = pd.DataFrame(columns=labels)
else:
matches = best_matches(links)
duke_matches = best_matches(links)

# ── Combine direct + Duke matches ────────────────────────────────
matches = pd.concat([direct_matches, duke_matches], ignore_index=True)
return matches


Expand Down
216 changes: 216 additions & 0 deletions test/test_matching.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,216 @@
# SPDX-FileCopyrightText: Contributors to powerplantmatching <https://github.com/pypsa/powerplantmatching>
#
# SPDX-License-Identifier: MIT

import numpy as np
import pandas as pd
import pytest

from powerplantmatching.matching import DirectMatcher, _match_by_eic


@pytest.fixture
def df_entsoe():
    """Three Dutch ENTSOE-style plants; each EIC cell is a set of codes."""
    fields = ["Name", "Fueltype", "Country", "Capacity", "EIC", "lat", "lon"]
    # one tuple per plant, values in the order given by `fields`
    plants = [
        (
            "Eemshavencentrale",
            "Hard Coal",
            "Netherlands",
            1560.0,
            {"49W000000000EMSA"},
            53.44,
            6.83,
        ),
        (
            "Eemscentrale",
            "Natural Gas",
            "Netherlands",
            2200.0,
            {"49W00000000008xG", "49W00000000008xK"},
            53.44,
            6.84,
        ),
        (
            "Maasvlakte",
            "Hard Coal",
            "Netherlands",
            1040.0,
            {"49W000000000MVSQ"},
            51.95,
            4.03,
        ),
    ]
    return pd.DataFrame(plants, columns=fields)


@pytest.fixture
def df_opsd():
    """Three Dutch OPSD-style plants; each EIC cell is a set of codes."""
    fields = ["Name", "Fueltype", "Country", "Capacity", "EIC", "lat", "lon"]
    # one tuple per plant, values in the order given by `fields`
    plants = [
        (
            "Eemshaven coal",
            "Hard Coal",
            "Netherlands",
            1560.0,
            {"49W000000000EMSA"},
            53.44,
            6.83,
        ),
        (
            "Eems gas",
            "Natural Gas",
            "Netherlands",
            2200.0,
            {"49W00000000008xG"},
            53.44,
            6.84,
        ),
        # Rijnmond carries no EIC at all
        ("Rijnmond", "Natural Gas", "Netherlands", 800.0, set(), 51.88, 4.50),
    ]
    return pd.DataFrame(plants, columns=fields)


def test_eic_matching_basic(df_entsoe, df_opsd):
    """Rows sharing an EIC code are paired; rows without a shared code are not."""
    pairs = _match_by_eic(df_entsoe, df_opsd, "ENTSOE", "OPSD")
    entsoe_rows = set(pairs["ENTSOE"])
    opsd_rows = set(pairs["OPSD"])

    # Eemshavencentrale (0) <-> Eemshaven coal (0) through the EMSA code;
    # Eemscentrale (1) <-> Eems gas (1) through 008xG (008xK exists in ENTSOE only)
    assert len(pairs) == 2
    assert entsoe_rows == {0, 1}
    assert opsd_rows == {0, 1}

    # Maasvlakte (2) and Rijnmond (2) have no code in common and stay unmatched
    assert 2 not in entsoe_rows
    assert 2 not in opsd_rows


def test_eic_matching_no_eic_column():
    """A dataset lacking the EIC column yields an empty, correctly labelled result."""
    without_eic = pd.DataFrame({"Name": ["Plant A"], "Capacity": [100]})
    with_eic = pd.DataFrame(
        {"Name": ["Plant B"], "Capacity": [200], "EIC": [{"CODE1"}]}
    )

    result = _match_by_eic(without_eic, with_eic, "A", "B")

    assert result.empty
    assert list(result.columns) == ["A", "B"]


def test_eic_matching_empty_sets():
    """Rows whose EIC sets are all empty can never be matched."""
    left = pd.DataFrame({"Name": ["A"], "EIC": [set()]})
    right = pd.DataFrame({"Name": ["B"], "EIC": [set()]})

    result = _match_by_eic(left, right, "X", "Y")

    assert result.empty


def test_eic_matching_nan_values():
    """A float nan stored inside an EIC set must not act as a shared code."""
    left = pd.DataFrame({"Name": ["A", "B"], "EIC": [{np.nan}, {"CODE1"}]})
    right = pd.DataFrame({"Name": ["X", "Y"], "EIC": [{np.nan}, {"CODE1"}]})

    pairs = _match_by_eic(left, right, "L", "R")

    # the nan rows (index 0) are ignored; only the CODE1 rows (index 1) pair up
    assert len(pairs) == 1
    assert set(pairs["L"]) == {1}
    assert set(pairs["R"]) == {1}


def test_eic_matching_nan_only():
    """An EIC column holding only None gives no matches."""
    left = pd.DataFrame({"Name": ["A"], "EIC": [None]})
    right = pd.DataFrame({"Name": ["B"], "EIC": [None]})

    result = _match_by_eic(left, right, "X", "Y")

    assert result.empty


def test_eic_matching_one_to_many_is_ambiguous():
    """One row linked to several rows on the other side is deferred to Duke.

    Plant A in the first frame holds {C1, C2}; the second frame spreads those
    codes over Plant X {C1} and Plant Y {C2}. A single shared code is no proof
    of identity in that situation, so the deterministic step returns nothing
    and leaves the decision to fuzzy matching.
    """
    aggregated = pd.DataFrame({"Name": ["Plant A"], "EIC": [{"C1", "C2"}]})
    split = pd.DataFrame(
        {"Name": ["Plant X", "Plant Y"], "EIC": [{"C1"}, {"C2"}]}
    )

    result = _match_by_eic(aggregated, split, "src0", "src1")

    assert result.empty


def test_eic_matching_hydro_scheme_ambiguous():
    """Regression for the Alpine-hydro aggregation mismatch (PR #289 review).

    ENTSOE lists a single aggregated scheme (1307 MW) under one scheme-level
    EIC, whereas OPSD breaks it into three stations that all carry that very
    code. Matching on 'one shared code' would select an arbitrary station, so
    no pair may be returned.
    """
    scheme_code = "12W-0000000031-O"
    station_names = ["Innertkirchen", "Grimsel", "Handeck"]

    entsoe = pd.DataFrame(
        {"Name": ["Oberhasli scheme"], "Capacity": [1307.0], "EIC": [{scheme_code}]}
    )
    opsd = pd.DataFrame(
        {
            "Name": station_names,
            "Capacity": [10.0, 389.0, 316.0],
            "EIC": [{scheme_code}] * len(station_names),
        }
    )

    result = _match_by_eic(entsoe, opsd, "ENTSOE", "OPSD")

    assert result.empty


def test_eic_matching_multi_code_one_to_one():
    """Two rows with many codes in common (and no other links) form one match."""
    left = pd.DataFrame(
        {"Name": ["Eems gas"], "EIC": [{"C1", "C2", "C3", "C4", "C5", "C6"}]}
    )
    right = pd.DataFrame(
        {"Name": ["Eemscentrale"], "EIC": [{"C1", "C2", "C3", "C4", "C5", "C6"}]}
    )

    pairs = _match_by_eic(left, right, "L", "R")

    # six shared codes still collapse into a single (0, 0) pair
    assert len(pairs) == 1
    only_pair = pairs.iloc[0]
    assert only_pair["L"] == 0
    assert only_pair["R"] == 0


def test_eic_matching_subset_superset_one_to_one():
    """Partial code overlap is enough when the link is an unambiguous 1-to-1.

    One source may list only a subset of the other's codes (e.g. OPSD knows a
    single unit code while ENTSOE has two). Without competing rows this is
    still a confident match.
    """
    fewer_codes = pd.DataFrame({"Name": ["Ballylumford"], "EIC": [{"C1"}]})
    more_codes = pd.DataFrame({"Name": ["Ballylumford"], "EIC": [{"C1", "C2"}]})

    pairs = _match_by_eic(fewer_codes, more_codes, "OPSD", "ENTSOE")

    assert len(pairs) == 1


def test_eic_matching_raw_string_treated_as_single_code():
    """An EIC given as a bare string (not inside a set) counts as one code."""
    left = pd.DataFrame({"Name": ["A", "B"], "EIC": ["CODE1", {"CODE2"}]})
    right = pd.DataFrame({"Name": ["X", "Y"], "EIC": [{"CODE1"}, {"CODE2"}]})

    pairs = _match_by_eic(left, right, "L", "R")

    # the bare-string CODE1 and the set-wrapped CODE2 each give a clean pair
    assert len(pairs) == 2
    assert set(pairs["L"]) == {0, 1}


def test_direct_matcher_run_returns_matches_and_residual(df_entsoe, df_opsd):
    """DirectMatcher.run returns the EIC pairs plus the rows left unmatched."""
    matcher = DirectMatcher()
    pairs, leftovers = matcher.run(df_entsoe, df_opsd, "ENTSOE", "OPSD")

    assert len(pairs) == 2
    assert list(pairs.columns) == ["ENTSOE", "OPSD"]

    # rows 0 and 1 were paired on both sides, so only row 2 survives in each
    left_rest, right_rest = leftovers
    assert set(left_rest.index) == {2}  # Maasvlakte
    assert set(right_rest.index) == {2}  # Rijnmond


def test_direct_matcher_no_matches_passes_everything_through():
    """Without shared identifiers nothing is matched and no row is dropped."""
    left = pd.DataFrame({"Name": ["A"], "EIC": [{"C1"}]})
    right = pd.DataFrame({"Name": ["B"], "EIC": [{"C2"}]})

    pairs, (left_rest, right_rest) = DirectMatcher().run(left, right, "L", "R")

    assert pairs.empty
    assert list(pairs.columns) == ["L", "R"]
    assert len(left_rest) == 1 and len(right_rest) == 1


def test_direct_matcher_custom_matcher_list():
    """The matcher list is configurable; with an empty list nothing is matched."""
    left = pd.DataFrame({"Name": ["A"], "EIC": [{"C1"}]})
    right = pd.DataFrame({"Name": ["B"], "EIC": [{"C1"}]})

    # the rows share C1, but with no matcher configured they must pass through
    no_op = DirectMatcher(matchers=[])
    pairs, (left_rest, right_rest) = no_op.run(left, right, "L", "R")

    assert pairs.empty
    assert len(left_rest) == 1 and len(right_rest) == 1
Loading