Skip to content

Commit 7d7ca66

Browse files
MaxGhenis and claude committed
Add MicrocalibrateAdapter as mainline calibration backend
First real code on spec-based-ecps-rewire. Wraps microcalibrate (gradient-descent chi-squared) behind the same fit_transform / validate interface as the legacy microplex.calibration.Calibrator — drop-in replacement for the entropy calibration step that killed v6. Interface contract (tested): - Same fit_transform signature: data, marginal_targets, weight_col, linear_constraints - Same validate() output keys: converged, max_error, sparsity, linear_errors - Identity preservation: every input record survives with a non-negative weight (v4/v6 entropy path does not guarantee this) - Empty constraints return a copy of the input unchanged - Constraint shape and weight-column existence validated up front Smoke tests (tests/calibration/test_microcalibrate_adapter.py, 8 tests, 5.2 s): - Interface contract coverage - Single age-band count constraint converges within 5 % relative error on 200 records - Two orthogonal constraints (count + income-sum) both reach within 10 % relative error on 300 records - Validation output shape matches legacy contract Packaging: - microcalibrate >= 0.21 added to required dependencies - requires-python bumped to >= 3.13 to match microcalibrate's lower bound Not in this commit (deliberate): - No changes to pe_us_data_rebuild / us.py pipeline yet — adapter is standalone so it can be wired incrementally - No scale-up validation — that goes through the protocol in docs/synthesizer-benchmark-scale-up.md Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent 7186926 commit 7d7ca66

5 files changed

Lines changed: 469 additions & 1 deletion

File tree

pyproject.toml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,10 +11,11 @@ license = "MIT"
1111
authors = [
1212
{ name = "Cosilico", email = "hello@cosilico.ai" }
1313
]
14-
requires-python = ">=3.10"
14+
requires-python = ">=3.13"
1515
dependencies = [
1616
"microplex",
1717
"duckdb>=1.2",
18+
"microcalibrate>=0.21",
1819
]
1920

2021
[project.optional-dependencies]
Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
"""Calibration backends for microplex-us.
2+
3+
The mainline production calibrator is `MicrocalibrateAdapter`, which wraps
4+
the `microcalibrate` gradient-descent chi-squared solver in the same
5+
interface the rest of microplex-us expects from the legacy
6+
`microplex.calibration.Calibrator`.
7+
8+
See `docs/calibrator-decision.md` for the rationale.
9+
"""
10+
11+
from microplex_us.calibration.microcalibrate_adapter import (
12+
MicrocalibrateAdapter,
13+
MicrocalibrateAdapterConfig,
14+
)
15+
16+
# Public API of the calibration package: the adapter and its config class,
# both imported above from `microplex_us.calibration.microcalibrate_adapter`.
__all__ = [
    "MicrocalibrateAdapter",
    "MicrocalibrateAdapterConfig",
]
Lines changed: 215 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,215 @@
1+
"""Adapter that wraps `microcalibrate.Calibration` in the microplex-us interface.
2+
3+
Mainline production calibrator per `docs/calibrator-decision.md`.
4+
5+
`MicrocalibrateAdapter.fit_transform` has the same call signature as the
6+
legacy `microplex.calibration.Calibrator.fit_transform` used by the current
7+
`pe_us_data_rebuild` pipeline: take a DataFrame of records, a tuple of
8+
`LinearConstraint` objects, and a `weight_col`; return a DataFrame with the
9+
same rows and adjusted weights. Every input record survives to the output
10+
with a non-negative weight — identity preservation is the contract.
11+
12+
This is a drop-in replacement for the calibration step that killed v6 with
13+
`backend="entropy"`. Instead of materializing a dense Jacobian over
14+
(n_records × n_constraints), `microcalibrate` does gradient descent over the
15+
weight vector with an optional L0 regularizer that defaults off.
16+
"""
17+
18+
from __future__ import annotations
19+
20+
from dataclasses import dataclass, field
21+
from typing import Any, Sequence
22+
23+
import numpy as np
24+
import pandas as pd
25+
from microcalibrate import Calibration
26+
from microplex.calibration import LinearConstraint
27+
28+
29+
@dataclass(frozen=True)
class MicrocalibrateAdapterConfig:
    """Immutable hyperparameters for `MicrocalibrateAdapter`.

    `MicrocalibrateAdapter.fit_transform` forwards every field, unchanged, to
    `microcalibrate.Calibration` as the keyword argument of the same name.

    The defaults are intended to mirror `microcalibrate.Calibration`'s own
    defaults (epochs=32, learning_rate=1e-3, noise_level=10.0).

    `device` defaults to `None`, which leaves the choice to microcalibrate
    (it picks automatically from CUDA > MPS > CPU). Set `device` explicitly
    to pin a single choice for reproducibility.
    """

    # --- Optimizer settings -------------------------------------------------
    epochs: int = 32
    learning_rate: float = 1e-3
    noise_level: float = 10.0
    dropout_rate: float = 0.0
    device: str | None = None  # None = let microcalibrate auto-select
    seed: int = 42

    # --- L0 regularization --------------------------------------------------
    # Off by default. NOTE(review): the remaining fields presumably only take
    # effect when `regularize_with_l0` is True — confirm against microcalibrate.
    regularize_with_l0: bool = False
    l0_lambda: float = 5e-6
    init_mean: float = 0.999
    temperature: float = 0.5
    sparse_learning_rate: float = 0.2
50+
51+
52+
class MicrocalibrateAdapter:
53+
"""Drop-in replacement for the `fit_transform` / `validate` surface.
54+
55+
Usage:
56+
57+
>>> adapter = MicrocalibrateAdapter()
58+
>>> result = adapter.fit_transform(
59+
... data=households_df,
60+
... marginal_targets={}, # unused; kept for signature parity
61+
... weight_col="household_weight",
62+
... linear_constraints=tuple_of_LinearConstraints,
63+
... )
64+
>>> validation = adapter.validate(result)
65+
66+
The returned DataFrame is a copy of `data` with `weight_col` updated.
67+
"""
68+
69+
def __init__(
70+
self,
71+
config: MicrocalibrateAdapterConfig | None = None,
72+
) -> None:
73+
self.config = config or MicrocalibrateAdapterConfig()
74+
self._last_calibration: Calibration | None = None
75+
self._last_constraint_names: list[str] | None = None
76+
self._last_targets: np.ndarray | None = None
77+
self._last_performance: pd.DataFrame | None = None
78+
79+
def fit_transform(
80+
self,
81+
data: pd.DataFrame,
82+
marginal_targets: dict[str, dict[str, float]] | None = None,
83+
continuous_targets: dict[str, float] | None = None,
84+
*,
85+
weight_col: str = "weight",
86+
linear_constraints: Sequence[LinearConstraint] = (),
87+
) -> pd.DataFrame:
88+
"""Calibrate weights via gradient-descent chi-squared.
89+
90+
`marginal_targets` and `continuous_targets` are accepted for
91+
signature parity with the legacy `Calibrator`, but this adapter
92+
expects constraints to be expressed as `LinearConstraint` rows.
93+
Callers should compile their marginal / continuous targets into
94+
linear constraints before calling.
95+
"""
96+
if weight_col not in data.columns:
97+
raise ValueError(
98+
f"MicrocalibrateAdapter: weight column {weight_col!r} "
99+
f"not found in data (columns: {list(data.columns)[:10]}...)"
100+
)
101+
102+
n_records = len(data)
103+
initial_weights = data[weight_col].to_numpy(dtype=float)
104+
105+
if not linear_constraints:
106+
# Nothing to calibrate — preserve caller expectations.
107+
self._last_calibration = None
108+
self._last_constraint_names = []
109+
self._last_targets = np.empty(0, dtype=float)
110+
self._last_performance = None
111+
return data.copy()
112+
113+
target_names = [c.name for c in linear_constraints]
114+
targets = np.array([c.target for c in linear_constraints], dtype=float)
115+
116+
for constraint in linear_constraints:
117+
if constraint.coefficients.shape != (n_records,):
118+
raise ValueError(
119+
f"MicrocalibrateAdapter: constraint {constraint.name!r} has "
120+
f"coefficients shape {constraint.coefficients.shape}, expected "
121+
f"({n_records},) matching the data length."
122+
)
123+
124+
estimate_matrix = pd.DataFrame(
125+
{c.name: np.asarray(c.coefficients, dtype=float) for c in linear_constraints}
126+
)
127+
128+
calibrator = Calibration(
129+
weights=initial_weights,
130+
targets=targets,
131+
target_names=np.array(target_names),
132+
estimate_matrix=estimate_matrix,
133+
epochs=self.config.epochs,
134+
learning_rate=self.config.learning_rate,
135+
noise_level=self.config.noise_level,
136+
dropout_rate=self.config.dropout_rate,
137+
device=self.config.device,
138+
seed=self.config.seed,
139+
regularize_with_l0=self.config.regularize_with_l0,
140+
l0_lambda=self.config.l0_lambda,
141+
init_mean=self.config.init_mean,
142+
temperature=self.config.temperature,
143+
sparse_learning_rate=self.config.sparse_learning_rate,
144+
)
145+
146+
performance_df = calibrator.calibrate()
147+
self._last_calibration = calibrator
148+
self._last_constraint_names = target_names
149+
self._last_targets = targets
150+
self._last_performance = performance_df
151+
152+
result = data.copy()
153+
result[weight_col] = calibrator.weights
154+
return result
155+
156+
def validate(self, calibrated: pd.DataFrame | None = None) -> dict[str, Any]:
157+
"""Return validation metrics in the shape the legacy pipeline expects.
158+
159+
The legacy `Calibrator.validate` returns `{"converged", "max_error",
160+
"sparsity", "linear_errors"}`. We populate the same keys.
161+
162+
`calibrated` is accepted for interface parity but not read; the
163+
authoritative values come from the last `calibrate()` call.
164+
"""
165+
if self._last_calibration is None:
166+
return {
167+
"converged": True,
168+
"max_error": 0.0,
169+
"sparsity": 0.0,
170+
"linear_errors": {},
171+
}
172+
173+
estimates = self._last_calibration.estimate().to_numpy(dtype=float)
174+
targets = self._last_targets
175+
assert targets is not None
176+
names = self._last_constraint_names
177+
assert names is not None
178+
179+
rel_errors = np.where(
180+
np.abs(targets) > 1e-12,
181+
np.abs(estimates - targets) / np.abs(targets),
182+
np.abs(estimates - targets),
183+
)
184+
linear_errors = {
185+
name: {
186+
"target": float(target_value),
187+
"estimate": float(estimate_value),
188+
"relative_error": float(rel_error),
189+
"absolute_error": float(abs(estimate_value - target_value)),
190+
}
191+
for name, target_value, estimate_value, rel_error in zip(
192+
names, targets, estimates, rel_errors, strict=True
193+
)
194+
}
195+
196+
max_error = float(rel_errors.max()) if rel_errors.size else 0.0
197+
weights = self._last_calibration.weights
198+
sparsity = float((weights == 0).sum()) / max(len(weights), 1)
199+
200+
return {
201+
"converged": bool(max_error < 0.05), # 5 % relative error bar
202+
"max_error": max_error,
203+
"sparsity": sparsity,
204+
"linear_errors": linear_errors,
205+
}
206+
207+
def performance_history(self) -> pd.DataFrame | None:
208+
"""The per-epoch performance log from microcalibrate, if available."""
209+
return self._last_performance
210+
211+
212+
# Public API of this module: the names exported by a star-import.
__all__ = [
    "MicrocalibrateAdapter",
    "MicrocalibrateAdapterConfig",
]

tests/calibration/__init__.py

Whitespace-only changes.

0 commit comments

Comments
 (0)