|
| 1 | +"""Adapter that wraps `microcalibrate.Calibration` in the microplex-us interface. |
| 2 | +
|
| 3 | +Mainline production calibrator per `docs/calibrator-decision.md`. |
| 4 | +
|
| 5 | +`MicrocalibrateAdapter.fit_transform` has the same call signature as the |
| 6 | +legacy `microplex.calibration.Calibrator.fit_transform` used by the current |
| 7 | +`pe_us_data_rebuild` pipeline: take a DataFrame of records, a tuple of |
| 8 | +`LinearConstraint` objects, and a `weight_col`; return a DataFrame with the |
| 9 | +same rows and adjusted weights. Every input record survives to the output |
| 10 | +with a non-negative weight — identity preservation is the contract. |
| 11 | +
|
| 12 | +This is a drop-in replacement for the calibration step that killed v6 with |
| 13 | +`backend="entropy"`. Instead of materializing a dense Jacobian over |
| 14 | +(n_records × n_constraints), `microcalibrate` does gradient descent over the |
| 15 | +weight vector with an optional L0 regularizer that defaults off. |
| 16 | +""" |
| 17 | + |
| 18 | +from __future__ import annotations |
| 19 | + |
| 20 | +from dataclasses import dataclass, field |
| 21 | +from typing import Any, Sequence |
| 22 | + |
| 23 | +import numpy as np |
| 24 | +import pandas as pd |
| 25 | +from microcalibrate import Calibration |
| 26 | +from microplex.calibration import LinearConstraint |
| 27 | + |
| 28 | + |
@dataclass(frozen=True)
class MicrocalibrateAdapterConfig:
    """Hyperparameters that `MicrocalibrateAdapter` forwards to microcalibrate.

    The defaults are meant to mirror `microcalibrate.Calibration`'s own
    defaults (epochs=32, learning_rate=1e-3, noise_level=10.0).

    `device` defaults to `None`, which leaves the choice to microcalibrate
    (reportedly CUDA > MPS > CPU). Pass an explicit device string such as
    `"cpu"` to pin it when run-to-run reproducibility matters.

    The L0-related fields (`l0_lambda`, `init_mean`, `temperature`,
    `sparse_learning_rate`) are forwarded unconditionally; L0 regularization
    itself is switched by `regularize_with_l0`, which defaults to off.

    Raises:
        ValueError: if `epochs`, `learning_rate` or `sparse_learning_rate`
            is negative (or NaN), or if `dropout_rate` is outside [0, 1].
    """

    epochs: int = 32
    learning_rate: float = 1e-3
    noise_level: float = 10.0
    dropout_rate: float = 0.0
    device: str | None = None  # None = let microcalibrate auto-select
    seed: int = 42
    regularize_with_l0: bool = False
    l0_lambda: float = 5e-6
    init_mean: float = 0.999
    temperature: float = 0.5
    sparse_learning_rate: float = 0.2

    def __post_init__(self) -> None:
        # Fail fast at construction time rather than deep inside a
        # calibration run. `not value >= 0` also rejects NaN.
        for name in ("epochs", "learning_rate", "sparse_learning_rate"):
            value = getattr(self, name)
            if not value >= 0:
                raise ValueError(
                    f"MicrocalibrateAdapterConfig: {name} must be "
                    f"non-negative, got {value!r}"
                )
        if not 0.0 <= self.dropout_rate <= 1.0:
            raise ValueError(
                "MicrocalibrateAdapterConfig: dropout_rate must be within "
                f"[0, 1], got {self.dropout_rate!r}"
            )
| 50 | + |
| 51 | + |
| 52 | +class MicrocalibrateAdapter: |
| 53 | + """Drop-in replacement for the `fit_transform` / `validate` surface. |
| 54 | +
|
| 55 | + Usage: |
| 56 | +
|
| 57 | + >>> adapter = MicrocalibrateAdapter() |
| 58 | + >>> result = adapter.fit_transform( |
| 59 | + ... data=households_df, |
| 60 | + ... marginal_targets={}, # unused; kept for signature parity |
| 61 | + ... weight_col="household_weight", |
| 62 | + ... linear_constraints=tuple_of_LinearConstraints, |
| 63 | + ... ) |
| 64 | + >>> validation = adapter.validate(result) |
| 65 | +
|
| 66 | + The returned DataFrame is a copy of `data` with `weight_col` updated. |
| 67 | + """ |
| 68 | + |
| 69 | + def __init__( |
| 70 | + self, |
| 71 | + config: MicrocalibrateAdapterConfig | None = None, |
| 72 | + ) -> None: |
| 73 | + self.config = config or MicrocalibrateAdapterConfig() |
| 74 | + self._last_calibration: Calibration | None = None |
| 75 | + self._last_constraint_names: list[str] | None = None |
| 76 | + self._last_targets: np.ndarray | None = None |
| 77 | + self._last_performance: pd.DataFrame | None = None |
| 78 | + |
| 79 | + def fit_transform( |
| 80 | + self, |
| 81 | + data: pd.DataFrame, |
| 82 | + marginal_targets: dict[str, dict[str, float]] | None = None, |
| 83 | + continuous_targets: dict[str, float] | None = None, |
| 84 | + *, |
| 85 | + weight_col: str = "weight", |
| 86 | + linear_constraints: Sequence[LinearConstraint] = (), |
| 87 | + ) -> pd.DataFrame: |
| 88 | + """Calibrate weights via gradient-descent chi-squared. |
| 89 | +
|
| 90 | + `marginal_targets` and `continuous_targets` are accepted for |
| 91 | + signature parity with the legacy `Calibrator`, but this adapter |
| 92 | + expects constraints to be expressed as `LinearConstraint` rows. |
| 93 | + Callers should compile their marginal / continuous targets into |
| 94 | + linear constraints before calling. |
| 95 | + """ |
| 96 | + if weight_col not in data.columns: |
| 97 | + raise ValueError( |
| 98 | + f"MicrocalibrateAdapter: weight column {weight_col!r} " |
| 99 | + f"not found in data (columns: {list(data.columns)[:10]}...)" |
| 100 | + ) |
| 101 | + |
| 102 | + n_records = len(data) |
| 103 | + initial_weights = data[weight_col].to_numpy(dtype=float) |
| 104 | + |
| 105 | + if not linear_constraints: |
| 106 | + # Nothing to calibrate — preserve caller expectations. |
| 107 | + self._last_calibration = None |
| 108 | + self._last_constraint_names = [] |
| 109 | + self._last_targets = np.empty(0, dtype=float) |
| 110 | + self._last_performance = None |
| 111 | + return data.copy() |
| 112 | + |
| 113 | + target_names = [c.name for c in linear_constraints] |
| 114 | + targets = np.array([c.target for c in linear_constraints], dtype=float) |
| 115 | + |
| 116 | + for constraint in linear_constraints: |
| 117 | + if constraint.coefficients.shape != (n_records,): |
| 118 | + raise ValueError( |
| 119 | + f"MicrocalibrateAdapter: constraint {constraint.name!r} has " |
| 120 | + f"coefficients shape {constraint.coefficients.shape}, expected " |
| 121 | + f"({n_records},) matching the data length." |
| 122 | + ) |
| 123 | + |
| 124 | + estimate_matrix = pd.DataFrame( |
| 125 | + {c.name: np.asarray(c.coefficients, dtype=float) for c in linear_constraints} |
| 126 | + ) |
| 127 | + |
| 128 | + calibrator = Calibration( |
| 129 | + weights=initial_weights, |
| 130 | + targets=targets, |
| 131 | + target_names=np.array(target_names), |
| 132 | + estimate_matrix=estimate_matrix, |
| 133 | + epochs=self.config.epochs, |
| 134 | + learning_rate=self.config.learning_rate, |
| 135 | + noise_level=self.config.noise_level, |
| 136 | + dropout_rate=self.config.dropout_rate, |
| 137 | + device=self.config.device, |
| 138 | + seed=self.config.seed, |
| 139 | + regularize_with_l0=self.config.regularize_with_l0, |
| 140 | + l0_lambda=self.config.l0_lambda, |
| 141 | + init_mean=self.config.init_mean, |
| 142 | + temperature=self.config.temperature, |
| 143 | + sparse_learning_rate=self.config.sparse_learning_rate, |
| 144 | + ) |
| 145 | + |
| 146 | + performance_df = calibrator.calibrate() |
| 147 | + self._last_calibration = calibrator |
| 148 | + self._last_constraint_names = target_names |
| 149 | + self._last_targets = targets |
| 150 | + self._last_performance = performance_df |
| 151 | + |
| 152 | + result = data.copy() |
| 153 | + result[weight_col] = calibrator.weights |
| 154 | + return result |
| 155 | + |
| 156 | + def validate(self, calibrated: pd.DataFrame | None = None) -> dict[str, Any]: |
| 157 | + """Return validation metrics in the shape the legacy pipeline expects. |
| 158 | +
|
| 159 | + The legacy `Calibrator.validate` returns `{"converged", "max_error", |
| 160 | + "sparsity", "linear_errors"}`. We populate the same keys. |
| 161 | +
|
| 162 | + `calibrated` is accepted for interface parity but not read; the |
| 163 | + authoritative values come from the last `calibrate()` call. |
| 164 | + """ |
| 165 | + if self._last_calibration is None: |
| 166 | + return { |
| 167 | + "converged": True, |
| 168 | + "max_error": 0.0, |
| 169 | + "sparsity": 0.0, |
| 170 | + "linear_errors": {}, |
| 171 | + } |
| 172 | + |
| 173 | + estimates = self._last_calibration.estimate().to_numpy(dtype=float) |
| 174 | + targets = self._last_targets |
| 175 | + assert targets is not None |
| 176 | + names = self._last_constraint_names |
| 177 | + assert names is not None |
| 178 | + |
| 179 | + rel_errors = np.where( |
| 180 | + np.abs(targets) > 1e-12, |
| 181 | + np.abs(estimates - targets) / np.abs(targets), |
| 182 | + np.abs(estimates - targets), |
| 183 | + ) |
| 184 | + linear_errors = { |
| 185 | + name: { |
| 186 | + "target": float(target_value), |
| 187 | + "estimate": float(estimate_value), |
| 188 | + "relative_error": float(rel_error), |
| 189 | + "absolute_error": float(abs(estimate_value - target_value)), |
| 190 | + } |
| 191 | + for name, target_value, estimate_value, rel_error in zip( |
| 192 | + names, targets, estimates, rel_errors, strict=True |
| 193 | + ) |
| 194 | + } |
| 195 | + |
| 196 | + max_error = float(rel_errors.max()) if rel_errors.size else 0.0 |
| 197 | + weights = self._last_calibration.weights |
| 198 | + sparsity = float((weights == 0).sum()) / max(len(weights), 1) |
| 199 | + |
| 200 | + return { |
| 201 | + "converged": bool(max_error < 0.05), # 5 % relative error bar |
| 202 | + "max_error": max_error, |
| 203 | + "sparsity": sparsity, |
| 204 | + "linear_errors": linear_errors, |
| 205 | + } |
| 206 | + |
| 207 | + def performance_history(self) -> pd.DataFrame | None: |
| 208 | + """The per-epoch performance log from microcalibrate, if available.""" |
| 209 | + return self._last_performance |
| 210 | + |
| 211 | + |
# Names exported by `from <this module> import *`.
__all__ = ["MicrocalibrateAdapter", "MicrocalibrateAdapterConfig"]
0 commit comments