all_to_all model implementation first approximation in ISL

rengzhengcodes · rengzhengcodes · commit 7099e75e9ffb · 2026-06-11T14:33:24.000-04:00
diff --git a/accelforge/model/_looptree/reuse/isl/distributed/bind.py b/accelforge/model/_looptree/reuse/isl/distributed/bind.py
@@ -1,6 +1,6 @@
 """Applies the binding layer into one that can be used for later analysis,"""
 
-from accelforge.frontend.binding import Binding
+from accelforge.frontend._binding import Binding
 from accelforge.frontend.mapping import Mapping
 from accelforge.frontend.workload import Workload
 
diff --git a/notebooks/astrasim2_correlation/correlation.ipynb b/notebooks/astrasim2_correlation/correlation.ipynb
@@ -26,60 +26,7 @@
    "id": "95cf92d9",
    "metadata": {},
    "source": [
-    "## Astrasim-2.0\n",
-    "\n",
-    "The [Astrasim-2.0 paper](https://arxiv.org/abs/2303.14006) has correlation to a torus of 4 and 16 V100s on page 6 to the network latency of data transfers of an all-reduce.\n",
-    "\n",
-    "On top of this, on page 9 they have data on their simulation framework correlated to the above and the latencies of certain operation of their model.\n",
-    "\n",
-    "We aim to show that we can match these numbers with our analytical model."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "f70059c4",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from numbers import Number\n",
-    "from typing import Dict, Sequence\n",
-    "\n",
-    "def ring_4xV100():\n",
-    "\t\"\"\"\n",
-    "\tGenerates a graph with the latencies of 4xV100s.\n",
-    "\t\"\"\"\n",
-    "\t# Collective size in MB to latency TODO: stop eyeballing latency.\n",
-    "\tGROUND: Dict[int, int] = {\n",
-    "\t\t64: 500,\n",
-    "\t\t96: 750,\n",
-    "\t\t128: 1000,\n",
-    "\t\t192: 2000,\n",
-    "\t\t750: 10_000,\n",
-    "\t\t1500: 20_000\n",
-    "\t}\n",
-    "\n",
-    "\tsize: Sequence[Number] = tuple(GROUND.keys())\n",
-    "\ttruth: Sequence[Number] = tuple(GROUND.values())\n",
-    "\testimate: Sequence[Number] = []\n",
-    "\n",
-    "\t# Generates estimates from model\n",
-    "\tfor mb in size:\n",
-    "\t\t# TODO: Read and Jinja2 these items.\n",
-    "\t\t\n",
-    "\n",
-    "\tx = np.arange(len(size))  # positions\n",
-    "\twidth = 0.35  # bar width\n",
-    "\n",
-    "\tfig, ax = plt.subplots()\n",
-    "\t# Position bars side-by-side using offset\n",
-    "\tax.bar(x - width/2, men, width, label='Ground', color=\"blue\")\n",
-    "\tax.bar(x + width/2, women, width, label='Model', color=\"red\")\n",
-    "\n",
-    "\tax.set_xticks(x) # Center labels\n",
-    "\tax.set_xticklabels(labels)\n",
-    "\tax.legend()\n",
-    "\tplt.show()"
+    "## Testing an 8-GPU all-to-all over a simulated NVLink switch (to be correlated later)\n"
    ]
   }
  ],
diff --git a/tests/not_working/distribuffers/multicast/test_cases.yaml b/tests/not_working/distribuffers/multicast/test_cases.yaml
@@ -479,4 +479,121 @@
 #     dist_fn: *ring_dist_size_8
 #     expected:
 #         latency: 1
-#         total_hops: 4
+#         total_hops: 4
+###################################################
+# 8-GPU fully-connected (NVLink/NVSwitch-style)   #
+# all-to-all, one-hot GPU encoding.               #
+#                                                 #
+# GPU i sits at one-hot coordinate e_i so every   #
+# src!=dst cast has extent 1 along exactly the    #
+# src and dst dims: cost = (1+1)(1+1)-1 = 3,      #
+# uniform across all pairs (fully-connected).     #
+# Self-chunks never cross the fabric (cost 0).    #
+# dist_fn is unit-cost (matching only).           #
+###################################################
+
+# All-to-all over 8 GPUs: each GPU holds chunks data[self, d] and requests data[s, self].
+-   occ: |
+        {
+            noc[gs0, gs1, gs2, gs3, gs4, gs5, gs6, gs7] -> data[s, d] :
+                0 <= gs0 <= 1 and 0 <= gs1 <= 1 and 0 <= gs2 <= 1 and 0 <= gs3 <= 1 and 0 <= gs4 <= 1 and 0 <= gs5 <= 1 and 0 <= gs6 <= 1 and 0 <= gs7 <= 1 and
+                gs0 + gs1 + gs2 + gs3 + gs4 + gs5 + gs6 + gs7 = 1 and
+                s = 1*gs1 + 2*gs2 + 3*gs3 + 4*gs4 + 5*gs5 + 6*gs6 + 7*gs7 and 0 <= d < 8
+        }
+    fill: |
+        {
+            noc[gd0, gd1, gd2, gd3, gd4, gd5, gd6, gd7] -> data[s, d] :
+                0 <= gd0 <= 1 and 0 <= gd1 <= 1 and 0 <= gd2 <= 1 and 0 <= gd3 <= 1 and 0 <= gd4 <= 1 and 0 <= gd5 <= 1 and 0 <= gd6 <= 1 and 0 <= gd7 <= 1 and
+                gd0 + gd1 + gd2 + gd3 + gd4 + gd5 + gd6 + gd7 = 1 and
+                d = 1*gd1 + 2*gd2 + 3*gd3 + 4*gd4 + 5*gd5 + 6*gd6 + 7*gd7 and 0 <= s < 8
+        }
+    dims: &8d_onehot_spatial
+        -   type: Spatial
+            spatial_dim: 0
+            target: 0
+        -   type: Spatial
+            spatial_dim: 1
+            target: 0
+        -   type: Spatial
+            spatial_dim: 2
+            target: 0
+        -   type: Spatial
+            spatial_dim: 3
+            target: 0
+        -   type: Spatial
+            spatial_dim: 4
+            target: 0
+        -   type: Spatial
+            spatial_dim: 5
+            target: 0
+        -   type: Spatial
+            spatial_dim: 6
+            target: 0
+        -   type: Spatial
+            spatial_dim: 7
+            target: 0
+    dist_fn: &fully_connected_unit |
+        {
+            [noc[xd0, xd1, xd2, xd3, xd4, xd5, xd6, xd7] -> noc[xs0, xs1, xs2, xs3, xs4, xs5, xs6, xs7]] -> hops[0] :
+                xd0 = xs0 and xd1 = xs1 and xd2 = xs2 and xd3 = xs3 and xd4 = xs4 and xd5 = xs5 and xd6 = xs6 and xd7 = xs7;
+            [noc[xd0, xd1, xd2, xd3, xd4, xd5, xd6, xd7] -> noc[xs0, xs1, xs2, xs3, xs4, xs5, xs6, xs7]] -> hops[1] :
+                (xd0 < xs0) or (xd0 > xs0) or (xd1 < xs1) or (xd1 > xs1) or (xd2 < xs2) or (xd2 > xs2) or (xd3 < xs3) or (xd3 > xs3) or (xd4 < xs4) or (xd4 > xs4) or (xd5 < xs5) or (xd5 > xs5) or (xd6 < xs6) or (xd6 > xs6) or (xd7 < xs7) or (xd7 > xs7)
+        }
+    expected:
+        latency: null
+        total_hops: null
+        multicast_hops: null
+        hypercube_hops: 168
+        extent_DOR_hops: null
+
+# Single chunk GPU0 -> GPU3: one unicast cast, cost (1+1)(1+1)-1 = 3.
+-   occ: |
+        {
+            noc[gs0, gs1, gs2, gs3, gs4, gs5, gs6, gs7] -> data[s, d] :
+                0 <= gs0 <= 1 and 0 <= gs1 <= 1 and 0 <= gs2 <= 1 and 0 <= gs3 <= 1 and 0 <= gs4 <= 1 and 0 <= gs5 <= 1 and 0 <= gs6 <= 1 and 0 <= gs7 <= 1 and
+                gs0 + gs1 + gs2 + gs3 + gs4 + gs5 + gs6 + gs7 = 1 and
+                s = 1*gs1 + 2*gs2 + 3*gs3 + 4*gs4 + 5*gs5 + 6*gs6 + 7*gs7 and s = 0 and d = 3
+        }
+    fill: |
+        {
+            noc[gd0, gd1, gd2, gd3, gd4, gd5, gd6, gd7] -> data[s, d] :
+                0 <= gd0 <= 1 and 0 <= gd1 <= 1 and 0 <= gd2 <= 1 and 0 <= gd3 <= 1 and 0 <= gd4 <= 1 and 0 <= gd5 <= 1 and 0 <= gd6 <= 1 and 0 <= gd7 <= 1 and
+                gd0 + gd1 + gd2 + gd3 + gd4 + gd5 + gd6 + gd7 = 1 and
+                d = 1*gd1 + 2*gd2 + 3*gd3 + 4*gd4 + 5*gd5 + 6*gd6 + 7*gd7 and s = 0 and d = 3
+        }
+    dims: *8d_onehot_spatial
+
+    dist_fn: *fully_connected_unit
+
+    expected:
+        latency: null
+        total_hops: null
+        multicast_hops: null
+        hypercube_hops: 3
+        extent_DOR_hops: null
+
+# Self chunk GPU5 -> GPU5: never crosses the fabric, cost 0.
+-   occ: |
+        {
+            noc[gs0, gs1, gs2, gs3, gs4, gs5, gs6, gs7] -> data[s, d] :
+                0 <= gs0 <= 1 and 0 <= gs1 <= 1 and 0 <= gs2 <= 1 and 0 <= gs3 <= 1 and 0 <= gs4 <= 1 and 0 <= gs5 <= 1 and 0 <= gs6 <= 1 and 0 <= gs7 <= 1 and
+                gs0 + gs1 + gs2 + gs3 + gs4 + gs5 + gs6 + gs7 = 1 and
+                s = 1*gs1 + 2*gs2 + 3*gs3 + 4*gs4 + 5*gs5 + 6*gs6 + 7*gs7 and s = 5 and d = 5
+        }
+    fill: |
+        {
+            noc[gd0, gd1, gd2, gd3, gd4, gd5, gd6, gd7] -> data[s, d] :
+                0 <= gd0 <= 1 and 0 <= gd1 <= 1 and 0 <= gd2 <= 1 and 0 <= gd3 <= 1 and 0 <= gd4 <= 1 and 0 <= gd5 <= 1 and 0 <= gd6 <= 1 and 0 <= gd7 <= 1 and
+                gd0 + gd1 + gd2 + gd3 + gd4 + gd5 + gd6 + gd7 = 1 and
+                d = 1*gd1 + 2*gd2 + 3*gd3 + 4*gd4 + 5*gd5 + 6*gd6 + 7*gd7 and s = 5 and d = 5
+        }
+    dims: *8d_onehot_spatial
+
+    dist_fn: *fully_connected_unit
+
+    expected:
+        latency: null
+        total_hops: null
+        multicast_hops: null
+        hypercube_hops: 0
+        extent_DOR_hops: null
diff --git a/tests/not_working/distribuffers/test_multicast.py b/tests/not_working/distribuffers/test_multicast.py
diff --git a/tests/not_working/distribuffers/util.py b/tests/not_working/distribuffers/util.py
@@ -0,0 +1,56 @@
+"""
+Utility functions common to testing the isl mapper functions.
+"""
+
+from pathlib import Path
+import islpy as isl
+
+from ruamel.yaml import YAML
+
+
+def to_isl_maps(obj: str | list | dict) -> dict:
+    """
+    Walk a nested container and parse every string found in it as an isl.Map.
+
+    Parameters
+    ----------
+    obj:
+        A string, list, or dict (possibly nested) holding isl.Map strings.
+
+    Returns
+    -------
+    Same structure, strings parsed to isl.Map; values under a "type" key kept.
+    """
+
+    def convert(node: str | dict | list) -> isl.Map | dict | list:
+        """Parse strings, rebuild lists and dicts, pass anything else through."""
+        if isinstance(node, list):
+            return list(map(convert, node))
+        if isinstance(node, dict):
+            return {k: v if k == "type" else convert(v) for k, v in node.items()}
+        if isinstance(node, str):
+            return isl.Map.read_from_str(isl.DEFAULT_CONTEXT, node)
+        return node
+
+    return convert(obj)  # type: ignore
+
+
+def load_solutions(path: Path) -> dict:
+    """
+    Read a YAML file of expected ISL solutions and parse its strings to isl.Map.
+
+    Parameters
+    ----------
+    path:
+        Location of the YAML solutions file; it is opened as UTF-8 text.
+
+    Returns
+    -------
+    The loaded YAML document after `to_isl_maps` has converted its strings,
+    e.g. mapper-generated keys such as `BufferTensorEinsum` mapped to isl.Map.
+    """
+    # Expected solutions live in a YAML file whose strings are ISL maps.
+    parser: YAML = YAML(typ="safe")
+    with open(path, mode="r", encoding="utf-8") as stream:
+        document = parser.load(stream)
+    return to_isl_maps(document)