solve_brute: defer NaN/stats diagnostics to post-solve (async) (#334)

hmgaudecker · claude · web-flow · commit 88ba7c240b5e · 2026-04-28T05:57:40.000+02:00
Co-authored-by: Claude Opus 4.7 (1M context) &lt;noreply@anthropic.com&gt;
diff --git a/src/lcm/solution/solve_brute.py b/src/lcm/solution/solve_brute.py
@@ -4,21 +4,20 @@
 import time
 from collections.abc import Callable, Hashable
 from concurrent.futures import ThreadPoolExecutor, as_completed
+from dataclasses import dataclass
 from types import MappingProxyType
 
 import jax
 import jax.numpy as jnp
 
 from lcm.ages import AgeGrid
 from lcm.interfaces import InternalRegime
-from lcm.typing import FloatND, InternalParams, RegimeName
+from lcm.typing import FlatRegimeParams, FloatND, InternalParams, RegimeName
 from lcm.utils.error_handling import validate_V
 from lcm.utils.logging import (
     format_duration,
-    log_nan_in_V,
     log_period_header,
     log_period_timing,
-    log_V_stats,
 )
 
 
@@ -71,6 +70,28 @@ def solve(
 
     solution: dict[int, MappingProxyType[RegimeName, FloatND]] = {}
 
+    # Async diagnostics accumulators: every `jnp.any(isnan)`,
+    # `jnp.any(isinf)` (and the debug min/max/mean trio) lives here as
+    # a device-side scalar during the hot loop. No host sync happens
+    # until the single flush in `_emit_deferred_diagnostics` post-loop.
+    # This replaces the pre-existing synchronous `log_nan_in_V` +
+    # `log_V_stats` + `validate_V` triple, which forced one host
+    # transfer per (regime, period) — ~n_regimes * n_periods stalls
+    # per solve, a meaningful throughput tax in MSM-style loops.
+    # Both gates fall out of the public log level: `"off"` ⇒ nothing,
+    # `"warning"` / `"progress"` ⇒ NaN/Inf only, `"debug"` ⇒ adds the
+    # min/max/mean trio. `"off"` skips even the NaN fail-fast — that
+    # is the documented contract of `"off"` (suppress all output) and
+    # is what makes the level useful for tight estimation loops.
+    diagnostics_enabled = logger.isEnabledFor(logging.WARNING)
+    stats_enabled = logger.isEnabledFor(logging.DEBUG)
+    diagnostic_rows: list[_DiagnosticRow] = []
+    diagnostic_min: list[FloatND] = []
+    diagnostic_max: list[FloatND] = []
+    diagnostic_mean: list[FloatND] = []
+    diagnostic_any_nan: list[FloatND] = []
+    diagnostic_any_inf: list[FloatND] = []
+
     logger.info("Starting solution")
     total_start = time.monotonic()
 
@@ -110,36 +131,36 @@ def solve(
                 age=ages.values[period],
             )
 
-            log_nan_in_V(
-                logger=logger,
-                regime_name=name,
-                age=ages.values[period],
-                V_arr=V_arr,
-            )
-            log_V_stats(logger=logger, regime_name=name, V_arr=V_arr)
-
-            # Include sibling regimes already solved this period (and the
-            # current regime's V_arr, even though it is NaN-bearing — users
-            # debugging the snapshot want to see all of it).
-            partial = MappingProxyType(
-                {
-                    **solution,
-                    period: MappingProxyType({**period_solution, name: V_arr}),
-                }
-            )
-            validate_V(
-                V_arr=V_arr,
-                age=float(ages.values[period]),
-                regime_name=name,
-                partial_solution=partial,
-                compute_intermediates=internal_regime.solve_functions.compute_intermediates.get(
-                    period
-                ),
-                state_action_space=state_action_space,
-                next_regime_to_V_arr=next_regime_to_V_arr,
-                internal_params=internal_params[name],
-                period=period,
-            )
+            # Async reductions: gated on log level. `"off"` skips
+            # everything — no kernel launches, no host syncs, no
+            # NaN fail-fast. `"warning"` / `"progress"` launches the
+            # two cheap isnan/isinf reductions; `"debug"` adds the
+            # min/max/mean trio. Each extra full-V read is a
+            # memory-bandwidth tax on the larger models, so the
+            # default keeps it to two reductions per (regime, period).
+            if diagnostics_enabled:
+                if stats_enabled:
+                    diagnostic_min.append(jnp.min(V_arr))
+                    diagnostic_max.append(jnp.max(V_arr))
+                    diagnostic_mean.append(jnp.mean(V_arr))
+                diagnostic_any_nan.append(jnp.any(jnp.isnan(V_arr)))
+                diagnostic_any_inf.append(jnp.any(jnp.isinf(V_arr)))
+                diagnostic_rows.append(
+                    _DiagnosticRow(
+                        regime_name=name,
+                        period=period,
+                        age=float(ages.values[period]),
+                        state_action_space=state_action_space,
+                        next_regime_to_V_arr=next_regime_to_V_arr,
+                        regime_params=internal_params[name],
+                        compute_intermediates=(
+                            internal_regime.solve_functions.compute_intermediates.get(
+                                period
+                            )
+                        ),
+                    )
+                )
+
             period_solution[name] = V_arr
 
         # Maintain consistent pytree structure: keep all regime keys,
@@ -155,6 +176,24 @@ def solve(
         elapsed = time.monotonic() - period_start
         log_period_timing(logger=logger, elapsed=elapsed)
 
+    # One flush of the GPU kernel queue: ship the stacked reductions
+    # to host in two transfers (isnan / isinf) by default, plus three
+    # more (min / max / mean) when debug stats were enabled. Skipped
+    # entirely at `log_level="off"` — nothing was accumulated.
+    if diagnostics_enabled:
+        _emit_deferred_diagnostics(
+            logger=logger,
+            diagnostic_rows=diagnostic_rows,
+            reductions=_StackedReductions(
+                mins=jnp.stack(diagnostic_min) if diagnostic_min else None,
+                maxs=jnp.stack(diagnostic_max) if diagnostic_max else None,
+                means=jnp.stack(diagnostic_mean) if diagnostic_mean else None,
+                any_nan=jnp.stack(diagnostic_any_nan),
+                any_inf=jnp.stack(diagnostic_any_inf),
+            ),
+            solution=MappingProxyType(solution),
+        )
+
     total_elapsed = time.monotonic() - total_start
     logger.info("Solution complete  (%s)", format_duration(seconds=total_elapsed))
 
@@ -338,3 +377,158 @@ def _get_regime_V_shapes(
         )
         shapes[name] = tuple(len(v) for v in state_action_space.states.values())
     return shapes
+
+
+@dataclass(frozen=True)
+class _DiagnosticRow:
+    """Metadata captured during the backward-induction loop.
+
+    Stored refs only — no device work — so appending these rows inside
+    the hot loop costs essentially nothing. The expensive part (NaN
+    diagnostic enrichment via `compute_intermediates`) runs at most
+    once per solve, on the first offending row found after the single
+    post-loop host flush.
+    """
+
+    regime_name: RegimeName
+    """Name of the regime whose V-array this row summarises."""
+    period: int
+    """Period index in the backward-induction loop."""
+    age: float
+    """Age corresponding to `period` (pulled off `AgeGrid.values`)."""
+    state_action_space: object
+    """Typed as `object` to avoid a heavy import cycle; consumers know
+    the actual runtime type from the `max_Q_over_a` signature."""
+    next_regime_to_V_arr: MappingProxyType[RegimeName, FloatND]
+    """Incoming next-period V-arrays, passed through unchanged to
+    `compute_intermediates` when a NaN is detected."""
+    regime_params: FlatRegimeParams
+    """Flat regime parameters used at this (regime, period)."""
+    compute_intermediates: Callable | None
+    """Optional closure that recomputes U / F / E[V] / Q for NaN
+    diagnostic enrichment. `None` when the regime has no
+    compute-intermediates closure (e.g. terminal periods)."""
+
+
+@dataclass(frozen=True)
+class _StackedReductions:
+    """Per-stat JAX arrays stacked across all diagnostic rows; still on device.
+
+    `mins` / `maxs` / `means` are `None` when the solve ran with a log
+    level below `debug` — the GPU wasn't asked to compute those
+    statistics so there's nothing to stack.
+    """
+
+    mins: FloatND | None
+    """Per-row min of V, or `None` below debug log level."""
+    maxs: FloatND | None
+    """Per-row max of V, or `None` below debug log level."""
+    means: FloatND | None
+    """Per-row mean of V, or `None` below debug log level."""
+    any_nan: FloatND
+    """Per-row boolean flag: any NaN in V at this (regime, period)."""
+    any_inf: FloatND
+    """Per-row boolean flag: any Inf in V at this (regime, period)."""
+
+
+def _emit_deferred_diagnostics(
+    *,
+    logger: logging.Logger,
+    diagnostic_rows: list[_DiagnosticRow],
+    reductions: _StackedReductions,
+    solution: MappingProxyType[int, MappingProxyType[RegimeName, FloatND]],
+) -> None:
+    """Flush async diagnostics to host, emit logs, raise on NaN.
+
+    Exactly two host transfers by default (one per stat stack), plus
+    three more (min / max / mean) when debug stats were enabled.
+    Ordering: NaN check first so we raise before emitting any stats
+    lines the user wouldn't see anyway; inf check next (warning only);
+    per-period stats last at debug log level. The `.tolist()` calls
+    are what actually block on the GPU queue — everything above this
+    function ran async.
+    """
+    any_nan = reductions.any_nan.tolist()
+    any_inf = reductions.any_inf.tolist()
+
+    _raise_if_nan(
+        diagnostic_rows=diagnostic_rows,
+        any_nan_per_row=any_nan,
+        solution=solution,
+    )
+    _warn_if_inf(
+        logger=logger,
+        diagnostic_rows=diagnostic_rows,
+        any_inf_per_row=any_inf,
+    )
+
+    if (
+        not logger.isEnabledFor(logging.DEBUG)
+        or reductions.mins is None
+        or reductions.maxs is None
+        or reductions.means is None
+    ):
+        return
+
+    mins = reductions.mins.tolist()
+    maxs = reductions.maxs.tolist()
+    means = reductions.means.tolist()
+    for row, v_min, v_max, v_mean in zip(
+        diagnostic_rows, mins, maxs, means, strict=True
+    ):
+        logger.debug(
+            "  %s  age %s   V min=%.3g  max=%.3g  mean=%.3g",
+            row.regime_name,
+            row.age,
+            v_min,
+            v_max,
+            v_mean,
+        )
+
+
+def _raise_if_nan(
+    *,
+    diagnostic_rows: list[_DiagnosticRow],
+    any_nan_per_row: list,  # list[bool]
+    solution: MappingProxyType[int, MappingProxyType[RegimeName, FloatND]],
+) -> None:
+    """Find the first NaN-bearing (regime, period) and raise."""
+    for row, flag in zip(diagnostic_rows, any_nan_per_row, strict=True):
+        if flag:
+            _raise_at(row=row, solution=solution)
+
+
+def _raise_at(
+    *,
+    row: _DiagnosticRow,
+    solution: MappingProxyType[int, MappingProxyType[RegimeName, FloatND]],
+) -> None:
+    """Run the enriched NaN diagnostic on a single offending row and raise."""
+    V_arr = solution[row.period][row.regime_name]
+    validate_V(
+        V_arr=V_arr,
+        age=row.age,
+        regime_name=row.regime_name,
+        partial_solution=solution,
+        compute_intermediates=row.compute_intermediates,
+        state_action_space=row.state_action_space,  # ty: ignore[invalid-argument-type]
+        next_regime_to_V_arr=row.next_regime_to_V_arr,
+        internal_params=row.regime_params,
+        period=row.period,
+    )
+
+
+def _warn_if_inf(
+    *,
+    logger: logging.Logger,
+    diagnostic_rows: list[_DiagnosticRow],
+    any_inf_per_row: list,  # list[bool]
+) -> None:
+    """Emit a warning per (regime, period) with Inf values."""
+    for row, flag in zip(diagnostic_rows, any_inf_per_row, strict=True):
+        if flag:
+            logger.warning(
+                "Inf in V_arr for regime '%s' at age %s",
+                row.regime_name,
+                row.age,
+            )
diff --git a/src/lcm/utils/logging.py b/src/lcm/utils/logging.py
@@ -76,32 +76,6 @@ def log_nan_in_V(
         logger.warning("NaN/Inf in V_arr for regime '%s' at age %s", regime_name, age)
 
 
-def log_V_stats(
-    *,
-    logger: logging.Logger,
-    regime_name: str,
-    V_arr: FloatND,
-) -> None:
-    """Log min/max/mean statistics of a value function array at debug level.
-
-    Args:
-        logger: Logger instance.
-        regime_name: Name of the regime.
-        V_arr: Value function array.
-
-    """
-    if not logger.isEnabledFor(logging.DEBUG):
-        return
-
-    logger.debug(
-        "  - %s: V min=%.3g max=%.3g mean=%.3g",
-        regime_name,
-        float(jnp.min(V_arr)),
-        float(jnp.max(V_arr)),
-        float(jnp.mean(V_arr)),
-    )
-
-
 def log_period_header(
     *,
     logger: logging.Logger,