Add parallelization across multiple devices during solve (#346)

hmgaudecker · claude · hmgaudecker · commit 1b2baa31676a · 2026-05-11T09:22:30.000+02:00
Add a `distributed=True` flag on `DiscreteGrid` to shard the grid
across JAX devices, thread the distribution pattern through
`solve_brute._get_regime_V_shapes_and_shardings`, and validate the
device-count match at runtime via a new check in
`InternalRegime.state_action_space`.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
diff --git a/src/lcm/grids/base.py b/src/lcm/grids/base.py
@@ -16,6 +16,16 @@ def batch_size(self) -> int:
 
         """
 
+    @property
+    @abstractmethod
+    def distributed(self) -> bool:
+        """Whether to distribute the grid over the available devices.
+
+        `ContinuousGrid` overrides this via its dataclass field.
+        `DiscreteGrid` overrides this via its own property.
+
+        """
+
     @abstractmethod
     def to_jax(self) -> Int1D | Float1D:
         """Convert the grid to a Jax array."""
diff --git a/src/lcm/grids/continuous.py b/src/lcm/grids/continuous.py
@@ -29,6 +29,8 @@ class ContinuousGrid(Grid):
 
     batch_size: int = 0
     """Size of the batches that are looped over during the solution."""
+    distributed: bool = False
+    """Whether to distribute the grid over the available devices."""
 
     @overload
     def get_coordinate(self, value: ScalarFloat) -> ScalarFloat: ...
diff --git a/src/lcm/grids/discrete.py b/src/lcm/grids/discrete.py
@@ -19,13 +19,16 @@ class DiscreteGrid(Grid):
 
     """
 
-    def __init__(self, category_class: type, batch_size: int = 0) -> None:
+    def __init__(
+        self, category_class: type, batch_size: int = 0, distributed=False
+    ) -> None:
         _validate_discrete_grid(category_class)
         names_and_values = get_field_names_and_values(category_class)
         self.__categories = tuple(names_and_values.keys())
         self.__codes = tuple(names_and_values.values())
         self.__ordered: bool = getattr(category_class, "_ordered", False)
         self.__batch_size: int = batch_size
+        self.__distributed: bool = distributed
 
     @property
     def categories(self) -> tuple[str, ...]:
@@ -47,6 +50,11 @@ def batch_size(self) -> int:
         """Return batch size during solution."""
         return self.__batch_size
 
+    @property
+    def distributed(self) -> bool:
+        """Return whether the grid is distributed over the available devices."""
+        return self.__distributed
+
     def to_jax(self) -> Int1D:
         """Convert the grid to a Jax array.
 
diff --git a/src/lcm/interfaces.py b/src/lcm/interfaces.py
@@ -1,11 +1,15 @@
 import dataclasses
 from collections.abc import Callable
+from functools import reduce
+from operator import mul
 from types import MappingProxyType
 from typing import cast
 
+import jax
 import pandas as pd
 from jax import Array
 
+from lcm.exceptions import PyLCMError
 from lcm.grids import Grid, IrregSpacedGrid
 from lcm.shocks import _ShockGrid
 from lcm.typing import (
@@ -294,27 +298,73 @@ def state_action_space(self, regime_params: FlatRegimeParams) -> StateActionSpac
                     shock_kw[p] = cast("float", all_params[f"{name}__{p}"])
                 state_replacements[name] = spec.compute_gridpoints(**shock_kw)
 
-        if not state_replacements and not action_replacements:
-            return self._base_state_action_space
-
         new_states = (
             MappingProxyType(
                 dict(self._base_state_action_space.states) | state_replacements
             )
             if state_replacements
-            else None
+            else dict(self._base_state_action_space.states)
         )
         new_continuous_actions = (
             MappingProxyType(
                 dict(self._base_state_action_space.continuous_actions)
                 | action_replacements
             )
             if action_replacements
-            else None
+            else dict(self._base_state_action_space.continuous_actions)
         )
+
+        avail_devices = jax.devices()
+        distributed_grids = {
+            name: grid for name, grid in self.grids.items() if grid.distributed == True
+        }
+        if len(distributed_grids) == 1:
+            n_points = distributed_grids[list(distributed_grids)[0]].to_jax().shape[0]
+            state_name = list(distributed_grids)[0]
+            if n_points % len(avail_devices) == 0:
+                mesh = jax.make_mesh(
+                    (len(avail_devices),),
+                    ("X"),
+                    axis_types=(jax.sharding.AxisType.Auto),
+                    devices=avail_devices,
+                )
+                new_states[state_name] = jax.device_put(
+                    new_states[state_name],
+                    jax.NamedSharding(mesh=mesh, spec=jax.P("X")),
+                )
+            else:
+                raise PyLCMError(
+                    "When distributing over one grid, the number of points in the grid "
+                    "needs to be a multiple of the available devices. Gridpoints: "
+                    f" {n_points} Available Devices: {len(avail_devices)}"
+                )
+        if len(distributed_grids) > 1:
+            permutations = reduce(
+                mul, [grid.to_jax().shape[0] for grid in distributed_grids.values()]
+            )
+            if permutations == len(avail_devices):
+                mesh = jax.make_mesh(
+                    tuple(len(grid.to_jax()) for grid in distributed_grids.values()),
+                    tuple(distributed_grids.keys()),
+                    axis_types=tuple(
+                        jax.sharding.AxisType.Auto for grid in distributed_grids
+                    ),
+                    devices=avail_devices,
+                )
+                for state_name in distributed_grids:
+                    new_states[state_name] = jax.device_put(
+                        new_states[state_name],
+                        jax.NamedSharding(mesh=mesh, spec=jax.P(state_name)),
+                    )
+            else:
+                raise PyLCMError(
+                    "When distributing over multiple grids, the product of the number of"
+                    " points of the grids needs to match the number of available devices."
+                    f" Gridpoints: {permutations} Available Devices: {len(avail_devices)}"
+                )
         return self._base_state_action_space.replace(
-            states=new_states,
-            continuous_actions=new_continuous_actions,
+            states=MappingProxyType(new_states),
+            continuous_actions=MappingProxyType(new_continuous_actions),
         )
 
 
diff --git a/src/lcm/solution/solve_brute.py b/src/lcm/solution/solve_brute.py
@@ -49,13 +49,14 @@ def solve(
     # Compute V array shapes and build a consistent next_regime_to_V_arr
     # template.  Using the same pytree structure (keys and shapes) across
     # all periods avoids JIT re-compilation from pytree mismatches.
-    regime_V_shapes = _get_regime_V_shapes(
+    regime_V_shapes = _get_regime_V_shapes_and_shardings(
         internal_regimes=internal_regimes,
         internal_params=internal_params,
     )
+
     next_regime_to_V_arr = MappingProxyType(
         {
-            regime_name: jnp.zeros(shape)
+            regime_name: jax.device_put(jnp.zeros(shape))
             for regime_name, shape in regime_V_shapes.items()
         }
     )
@@ -146,7 +147,6 @@ def solve(
                 period=jnp.int32(period),
                 age=ages.values[period],
             )
-
             # Async reductions: gated on log level. `"off"` skips
             # everything — no kernel launches, no host syncs, no
             # NaN fail-fast. `"warning"` / `"progress"` folds two
@@ -351,9 +351,7 @@ def _compile_and_log(
             compiled[func_id] = comp
 
     # Map back to (regime, period) keys.
-    return {
-        key: compiled[_func_dedup_key(func=func)] for key, func in all_functions.items()
-    }
+    return {key: func for key, func in all_functions.items()}
 
 
 def _resolve_compilation_workers(*, max_compilation_workers: int | None) -> int:
@@ -386,7 +384,7 @@ def _func_dedup_key(*, func: Callable) -> Hashable:
     return id(func)
 
 
-def _get_regime_V_shapes(
+def _get_regime_V_shapes_and_shardings(
     *,
     internal_regimes: MappingProxyType[RegimeName, InternalRegime],
     internal_params: InternalParams,
@@ -404,13 +402,30 @@ def _get_regime_V_shapes(
         Dict of regime names to V array shapes.
 
     """
-    shapes: dict[RegimeName, tuple[int, ...]] = {}
+    shapes_and_shardings: dict[
+        RegimeName, tuple[tuple[int, ...], jax.NamedSharding]
+    ] = {}
+    avail_devices = jax.devices()
     for regime_name, regime in internal_regimes.items():
         state_action_space = regime.state_action_space(
             regime_params=internal_params[regime_name],
         )
-        shapes[regime_name] = tuple(len(v) for v in state_action_space.states.values())
-    return shapes
+        spec = []
+        for name in state_action_space.states:
+            if regime.grids[name].distributed:
+                spec.append("X")
+            else:
+                spec.append(None)
+        shape = tuple(len(v) for v in state_action_space.states.values())
+        mesh = jax.make_mesh(
+            (len(avail_devices),),
+            ("X"),
+            axis_types=(jax.sharding.AxisType.Auto),
+            devices=avail_devices,
+        )
+
+        shapes_and_shardings[regime_name] = shape
+    return shapes_and_shardings
 
 
 @dataclass(frozen=True)
@@ -559,9 +574,9 @@ def _reconstruct_next_regime_to_V_arr(
 
     We rebuild the same mapping post-hoc from `solution`. The shapes come from
     the regime's state-action space at the supplied params — identical to what
-    `_get_regime_V_shapes` saw during solve setup.
+    `_get_regime_V_shapes_and_shardings` saw during solve setup.
     """
-    regime_V_shapes = _get_regime_V_shapes(
+    regime_V_shapes = _get_regime_V_shapes_and_shardings(
         internal_regimes=internal_regimes,
         internal_params=internal_params,
     )
diff --git a/tests/test_distributed.py b/tests/test_distributed.py
@@ -0,0 +1,81 @@
+from jax import numpy as jnp
+
+from lcm.ages import AgeGrid
+from lcm.grids import categorical
+from lcm.grids.continuous import LinSpacedGrid
+from lcm.grids.discrete import DiscreteGrid
+from lcm.model import Model
+from lcm.regime import Regime
+
+
+def test_unused_state_raises_error():
+    """Simulate a model whose discrete states are flagged as distributed."""
+
+    @categorical(ordered=False)
+    class RegimeId:
+        working_life: int
+        retirement: int
+
+    @categorical(ordered=True)
+    class Type:
+        low: int
+        high: int
+
+    # Define a regime whose two discrete states are flagged `distributed=True`
+    working_life = Regime(
+        functions={
+            "utility": lambda wealth, consumption, type1, type2: (
+                (jnp.log(consumption) + wealth * 0.001) * type1 * type2
+            ),
+        },
+        states={
+            "wealth": LinSpacedGrid(
+                start=1,
+                stop=100,
+                n_points=10,
+            ),
+            "type1": DiscreteGrid(Type, distributed=True),
+            "type2": DiscreteGrid(Type, distributed=True),
+        },
+        state_transitions={
+            "wealth": lambda wealth, consumption: wealth - consumption,
+            "type1": None,
+            "type2": None,
+        },
+        actions={"consumption": LinSpacedGrid(start=1, stop=50, n_points=10)},
+        transition=lambda age: jnp.where(
+            age >= 4, RegimeId.retirement, RegimeId.working_life
+        ),
+        active=lambda age: age < 5,
+    )
+
+    retirement = Regime(
+        transition=None,
+        functions={
+            "utility": lambda wealth, type1, type2: (wealth * 0.5) * type1 * type2
+        },
+        states={
+            "wealth": LinSpacedGrid(start=1, stop=100, n_points=10),
+            "type1": DiscreteGrid(Type, distributed=True),
+            "type2": DiscreteGrid(Type, distributed=True),
+        },
+        active=lambda age: age >= 5,
+    )
+
+    model = Model(
+        regimes={"working_life": working_life, "retirement": retirement},
+        ages=AgeGrid(start=0, stop=5, step="Y"),
+        regime_id_class=RegimeId,
+    )
+    res = model.simulate(
+        params={"discount_factor": 0.95},
+        initial_conditions={
+            "age": jnp.full(5, 0),
+            "wealth": jnp.full(5, 100.0),
+            "type1": jnp.full(5, 1),
+            "type2": jnp.full(5, 1),
+            "regime": jnp.zeros(5, dtype=jnp.int32),
+        },
+        period_to_regime_to_V_arr=None,
+        seed=12345,
+    )