[network] Add *untested* distributed model

gilbertmike · gilbertmike · commit 4b3fcabcd670 · 2026-05-22T13:44:09.000-04:00
diff --git a/accelforge/frontend/arch/spatialable.py b/accelforge/frontend/arch/spatialable.py
@@ -142,11 +142,11 @@ def _get_physical_fanout_along(self, dim_name: str, default: int = 1) -> int:
                 return s.fanout
         return default
 
-    def _get_physical_stride_along(self, dim_name: str, default: int = 1) -> int:
+    def _get_physical_stride_along(self, dim_name: str) -> int:
         for s in self._physical_spatial:
             if s.name == dim_name:
                 return s.stride
-        return default
+        raise ValueError(f"dimension {dim_name} not found")
 
     def _spatial_str(self, include_newline=True) -> str:
         if not self.spatial:
diff --git a/accelforge/model/_looptree/reuse/symbolic/_network.py b/accelforge/model/_looptree/reuse/symbolic/_network.py
@@ -12,10 +12,10 @@
     PartiallyRelevant,
 )
 
-from accelforge.util._sympy.broadcast_max import Min, Max, MaxGeqZero
+from accelforge.util._sympy.broadcast_max import MaxGeqZero, MinGeqZero
 
 from ._common import AnalysisInfo
-from ._stats import NetworkStats
+from ._stats import NetworkStats, SymbolicAnalysisOutput
 
 
 class NetworkAnalyzer:
@@ -25,7 +25,7 @@ def __init__(self, network_stats):
 
     def accumulate_child_result(
         self,
-        child_result,
+        child_result: SymbolicAnalysisOutput,
         info: AnalysisInfo,
         shape_repeats,
         einsum_name,
@@ -35,6 +35,7 @@ def accumulate_child_result(
         flattened_arch = info.job.flattened_arch
 
         for network, child_network_stats in child_result.network_stats.items():
+            src_component = flattened_arch[network.source.level]
             if network not in self.network_stats:
                 self.network_stats[network] = NetworkStats()
             accumulated_network_stats = self.network_stats[network]
@@ -64,93 +65,72 @@ def accumulate_child_result(
                 * actions_per_value
             )
 
-            if is_component_a_above_b(node.component, network.component, flattened_arch):
+            if flattened_arch.is_above(node.component, network.component):
                 continue
 
             relevancy = info.tensor_to_relevancy[network.tensor][node.rank_variable]
 
+            # The fanout in this dimension in mapping nodes below, i.e., the stride
             last_fanout = child_result.fanout.get((node.component, einsum_name), {})
             last_fanout = last_fanout.get(node.name, 1)
             if isinstance(relevancy, Irrelevant):
-                # Cost of multicasting is the cost of delivering along the dimension
-                multicast_hops = shape_repeats * last_fanout
-                multicast_cost = multicast_hops * volume
-                self.overall_max_hops += multicast_hops
-
-                accumulated_network_stats.total_hops += multicast_cost
-                accumulated_network_stats.max_hops = MaxGeqZero(
-                    accumulated_network_stats.max_hops,
-                    self.overall_max_hops + child_network_stats.max_hops,
-                )
+                # Distributed or not, the amount of total cost is the same.
+                # However, the accesses now come from different physical memories
+                total_cost = multicast_cost(shape_repeats, last_fanout)*volume
+                max_hops = shape_repeats*last_fanout
             elif isinstance(relevancy, Relevant):
-                # Cost of unicast is the cost of delivering to each point in
-                # the dimension with shape as stride
-                # TODO: we should use the actual stride
-                total_unicast_cost = (
-                    0.5 * (shape_repeats + 1) * shape_repeats * last_fanout * volume
-                )
-                max_unicast_hops = shape_repeats * last_fanout
-                self.overall_max_hops += max_unicast_hops
-
-                accumulated_network_stats.total_hops += total_unicast_cost
-                accumulated_network_stats.max_hops = MaxGeqZero(
-                    accumulated_network_stats.max_hops,
-                    self.overall_max_hops + child_network_stats.max_hops,
-                )
+                # If distributed, then we bind data as locally as possible in the
+                # physical buffers
+                if src_component._get_physical_fanout_along(node.name) > 1:
+                    physical_stride = src_component._get_physical_stride_along(node.name)
+
+                    n_dsts_per_physical = MinGeqZero(
+                        # if last_fanout > physical_stride, set n_dst to 1, which results in 0 hops
+                        # later (which is correct because the set of destinations always overlap
+                        # the set of sources).
+                        MaxGeqZero(physical_stride / last_fanout, 1),
+                        shape_repeats
+                    )
+                    n_activated_physical = MaxGeqZero(shape_repeats*last_fanout/physical_stride, 1)
+                    total_cost = (
+                        n_activated_physical
+                        *
+                        unicast_cost(n_dsts_per_physical, last_fanout)
+                        *
+                        volume
+                    )
+                    max_hops = MinGeqZero(shape_repeats*last_fanout, physical_stride)
+                else:
+                    total_cost = unicast_cost(shape_repeats, last_fanout)*volume
+                    max_hops = shape_repeats * last_fanout
             elif isinstance(relevancy, PartiallyRelevant):
                 raise NotImplementedError()
             else:
                 raise RuntimeError(f"unhandled relevancy type {relevancy}")
 
-        return self.overall_max_hops
-
-
-def reduce_dicts(dict1: dict, dict2: dict, reduce_op):
-    for key in dict1:
-        if key not in dict2:
-            dict2[key] = dict1[key]
-        else:
-            dict2[key] = reduce_op(dict1[key], dict2[key])
-
-
-def get_total_to_per_unit(total, max_per_unit):
-    if total == 0 and max_per_unit != 0:
-        raise ValueError(f"total is 0 but max_per_unit is {max_per_unit}")
-    if total == 0:
-        return 1
-    return max_per_unit / total
+            # TODO: this is sketchy
+            self.overall_max_hops += max_hops
 
+            accumulated_network_stats.total_hops += total_cost
+            accumulated_network_stats.max_hops = MaxGeqZero(
+                accumulated_network_stats.max_hops,
+                self.overall_max_hops + child_network_stats.max_hops,
+            )
 
-def has_parent_tensor_holder(
-    tensor: TensorName, node_idx: int, info
-) -> bool:
-    for node in info.mapping[:node_idx]:
-        if isinstance(node, TensorHolder) and tensor in node.tensors:
-            return True
-    return False
+        return self.overall_max_hops
 
 
-def find_component_object(
-    component: str, flattened_arch: list[arch.Leaf]
-) -> arch.TensorHolder:
-    for node in flattened_arch:
-        if node.name == component:
-            return node
-    raise ValueError(f"Component {component} not found in flattened arch")
+def multicast_cost(n_dsts, stride):
+    """Returns total hops of multicast along a dimension."""
+    return (n_dsts-1)*stride
 
 
-def is_component_a_above_b(component_a: str, component_b: str, flattened_arch):
-    a_found = False
-    b_found = False
-    for node in flattened_arch:
-        if node.name == component_a:
-            a_found = True
-        if node.name == component_b:
-            b_found = True
+def unicast_cost(n_dsts, stride):
+    """Returns total hops of unicast along a dimension."""
+    # Cost of unicast is the cost of delivering to each point in
+    # the dimension with shape as stride
+    return arithmetic_sum(n_dsts-1)*stride
 
-        if a_found and not b_found:
-            return True
-        elif b_found and not a_found:
-            return False
-    raise ValueError(f"Neither {component_a} nor {component_b} found in flattened arch")
 
+def arithmetic_sum(n):
+    return 0.5 * (n+1) * n
diff --git a/tests/test_network.py b/tests/test_network.py
@@ -78,7 +78,7 @@ def test_hierarchical_1d(self):
             * (KN / MAC_TILE)  # number of used Scratchpad
             * M_TILE
             * KN  # temporal for n1 in mapping
-            * sum(i+1 for i in range(MAC_TILE))  # unicast along X-axis of MacArray
+            * sum(i for i in range(MAC_TILE))  # unicast along X-axis of MacArray
             * BITS_PER_VALUE,
         )
         # NOTE: assuming XY routing (as defined in mapping)
@@ -88,7 +88,7 @@ def test_hierarchical_1d(self):
             * (KN / MAC_TILE)
             * M_TILE
             * KN  # temporal for n1 in mapping
-            * MAC_TILE   # multicast along X-axis of MacArray
+            * (MAC_TILE - 1)   # multicast along X-axis of MacArray
             * BITS_PER_VALUE,
         )
         self.assertEqual(
@@ -97,14 +97,14 @@ def test_hierarchical_1d(self):
             * (KN / MAC_TILE)
             * M_TILE
             * KN
-            * sum(i+1 for i in range(MAC_TILE))
+            * sum(i for i in range(MAC_TILE))
             * BITS_PER_VALUE,
         )
 
         self.assertEqual(
             result.data["Matmul0<SEP>action<SEP>PeArray<SEP>T0<SEP>hops"].iloc[0],
             (M / M_TILE)
-            * sum(i+1 for i in range(KN // MAC_TILE))  # unicast along X-axis of PeArray
+            * sum(i for i in range(KN // MAC_TILE))  # unicast along X-axis of PeArray
             * M_TILE
             * MAC_TILE
             * BITS_PER_VALUE,
@@ -113,15 +113,15 @@ def test_hierarchical_1d(self):
         self.assertEqual(
             result.data["Matmul0<SEP>action<SEP>PeArray<SEP>T1<SEP>hops"].iloc[0],
             (M / M_TILE)
-            * KN // MAC_TILE  # multicast along X-axis of PeArray
+            * (KN // MAC_TILE - 1)  # multicast along X-axis of PeArray
             * M_TILE
             * KN
             * BITS_PER_VALUE,
         )
         self.assertEqual(
             result.data["Matmul0<SEP>action<SEP>PeArray<SEP>W0<SEP>hops"].iloc[0],
             (M / M_TILE)
-            * sum(i+1 for i in range(KN // MAC_TILE))  # unicast along PeArray
+            * sum(i for i in range(KN // MAC_TILE))  # unicast along PeArray
             * MAC_TILE
             * KN
             * BITS_PER_VALUE,
@@ -156,9 +156,9 @@ def test_hierarchical(self):
             * (KN / MAC_TILE) ** 2
             * M_TILE
             * (
-                sum(i+1 for i in range(MAC_TILE))  # unicasting along X
+                sum(i for i in range(MAC_TILE))  # unicasting along X
                 +
-                MAC_TILE * MAC_TILE  # multicast along Y for each column
+                MAC_TILE * (MAC_TILE-1)  # multicast along Y for each column
             )
             * BITS_PER_VALUE,
         )
@@ -169,9 +169,9 @@ def test_hierarchical(self):
             * (KN / MAC_TILE) ** 2
             * M_TILE
             * (
-                MAC_TILE * MAC_TILE  # multicast along X (the tile is shape N1, which is MAC_TILE here)
+                MAC_TILE * (MAC_TILE - 1)  # multicast along X (the tile is shape N1, which is MAC_TILE here)
                 +
-                MAC_TILE * sum(i+1 for i in range(MAC_TILE))  # unicasting along Y for each row
+                MAC_TILE * sum(i for i in range(MAC_TILE))  # unicasting along Y for each row
             )
             * BITS_PER_VALUE,
         )
@@ -181,9 +181,9 @@ def test_hierarchical(self):
             * (KN / MAC_TILE) ** 2
             * M_TILE
             * (
-                MAC_TILE * sum(i+1 for i in range(MAC_TILE))  # unicast along X (the tile is shape N1, which is MAC_TILE here)
+                MAC_TILE * sum(i for i in range(MAC_TILE))  # unicast along X (the tile is shape N1, which is MAC_TILE here)
                 +
-                MAC_TILE * sum(i+1 for i in range(MAC_TILE))  # unicasting along Y for each row
+                MAC_TILE * sum(i for i in range(MAC_TILE))  # unicasting along Y for each row
             )
             * BITS_PER_VALUE,
         )
@@ -192,9 +192,9 @@ def test_hierarchical(self):
             result.data["Matmul0<SEP>action<SEP>PeArray<SEP>T0<SEP>hops"].iloc[0],
             (M / M_TILE)
             * (
-                sum(i+1 for i in range(PE_TILE))
+                sum(i for i in range(PE_TILE))
                 +
-                PE_TILE * PE_TILE
+                PE_TILE * (PE_TILE - 1)
             )
             # tile shape
             * M_TILE
@@ -206,9 +206,9 @@ def test_hierarchical(self):
             result.data["Matmul0<SEP>action<SEP>PeArray<SEP>T1<SEP>hops"].iloc[0],
             (M / M_TILE)
             * (
-                PE_TILE * PE_TILE
+                PE_TILE * (PE_TILE - 1)
                 +
-                PE_TILE * sum(i+1 for i in range(PE_TILE))
+                PE_TILE * sum(i for i in range(PE_TILE))
             )
             * M_TILE
             * MAC_TILE
@@ -218,9 +218,9 @@ def test_hierarchical(self):
             result.data["Matmul0<SEP>action<SEP>PeArray<SEP>W0<SEP>hops"].iloc[0],
             (M / M_TILE)
             * (
-                PE_TILE * sum(i+1 for i in range(PE_TILE))
+                PE_TILE * sum(i for i in range(PE_TILE))
                 +
-                PE_TILE * sum(i+1 for i in range(PE_TILE))
+                PE_TILE * sum(i for i in range(PE_TILE))
             )
             * MAC_TILE**2
             * BITS_PER_VALUE,