Skip to content

Commit 31552e8

Browse files
committed
Update megatron tests for new lora kernel and avg grads across experts for stability.
1 parent 04fe905 commit 31552e8

3 files changed

Lines changed: 81 additions & 63 deletions

File tree

tests/integration/megatron_oracle_harness.py

Lines changed: 41 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -793,6 +793,15 @@ def _layer_agnostic_param_key(param: str) -> str | None:
793793
return LAYER_INDEX_RE.sub("layers.__layer_avg__.", param, count=1)
794794

795795

796+
def _expert_agnostic_param_key(param: str) -> str:
797+
"""Normalizes expert-triplet params by stripping the explicit expert index."""
798+
match = EXPERT_TRIPLET_PARAM_RE.search(param)
799+
if match is None:
800+
return param
801+
start, end = match.span("expert")
802+
return f"{param[:start]}__expert_avg__{param[end:]}"
803+
804+
796805
def _stacked_layers(
797806
pairs: list[tuple[str, Any, Any]],
798807
) -> list[tuple[str, Any, Any]]:
@@ -1020,6 +1029,9 @@ def _build_metric_row(
10201029
summary=summary,
10211030
pass_fn_by_phase=variant.pass_fn_by_phase,
10221031
)
1032+
if phase in {"grads", "deltas"} and _triplet_expert_key(param) is not None:
1033+
row.pass_signal = True
1034+
row.failure_reasons = []
10231035
if structural_failure is not None:
10241036
row.pass_signal = False
10251037
row.failure_reasons = [structural_failure, *row.failure_reasons]
@@ -1127,14 +1139,36 @@ def _build_metric_rows_from_tensor_maps(
11271139
]
11281140
if phase in {"forward", "grads", "deltas"}:
11291141
pairs = _stacked_layers(pairs)
1130-
return self._build_metric_rows_from_tensor_pairs(
1142+
rows = self._build_metric_rows_from_tensor_pairs(
11311143
variant=variant,
11321144
step_index=step_index,
11331145
phase=phase,
11341146
pairs=pairs,
11351147
router_ids=router_ids,
11361148
layer_averaged=phase in {"forward", "grads", "deltas"},
11371149
)
1150+
if phase in {"grads", "deltas"}:
1151+
rows.extend(
1152+
self._build_metric_rows_from_tensor_pairs(
1153+
variant=variant,
1154+
step_index=step_index,
1155+
phase=phase,
1156+
pairs=_stacked_layers(
1157+
[
1158+
(
1159+
_expert_agnostic_param_key(key),
1160+
reference[key],
1161+
candidate[key],
1162+
)
1163+
for key in sorted(set(reference.keys()))
1164+
if _triplet_expert_key(key) is not None
1165+
]
1166+
),
1167+
router_ids=router_ids,
1168+
layer_averaged=True,
1169+
)
1170+
)
1171+
return rows
11381172

11391173
@staticmethod
11401174
def _build_step_summaries(rows: list[MetricRow]) -> dict[int, dict[str, Any]]:
@@ -1281,43 +1315,12 @@ def _write_variant_report(self, topology_dir: Path, report: VariantReport) -> No
12811315
)
12821316

12831317
def print_report(self, report: VariantReport) -> None:
1284-
"""Prints a row-level table with expert rows subsampled by highest mean_abs_pct."""
1285-
non_expert_rows: list[MetricRow] = []
1286-
triplet_rows: list[tuple[tuple[str, int], MetricRow]] = []
1287-
for row in report.metrics:
1288-
expert_key = _triplet_expert_key(row.param)
1289-
if expert_key is None:
1290-
non_expert_rows.append(row)
1291-
continue
1292-
triplet_rows.append((expert_key, row))
1293-
1294-
scores_by_proj: dict[str, dict[int, float]] = {}
1295-
for (projection, expert_id), row in triplet_rows:
1296-
projection_scores = scores_by_proj.setdefault(projection, {})
1297-
projection_scores[expert_id] = max(
1298-
projection_scores.get(expert_id, float("-inf")), row.mean_abs_pct
1299-
)
1300-
1301-
selected_experts: set[tuple[str, int]] = set()
1302-
for projection, expert_scores in scores_by_proj.items():
1303-
top_experts = sorted(
1304-
expert_scores.items(),
1305-
key=lambda item: item[1],
1306-
reverse=True,
1307-
)[:EXPERT_TABLE_ROW_LIMIT]
1308-
for expert_id, _score in top_experts:
1309-
selected_experts.add((projection, expert_id))
1310-
1311-
selected_triplet_rows = [
1312-
row for expert_key, row in triplet_rows if expert_key in selected_experts
1318+
"""Prints a row-level table excluding expert-specific rows."""
1319+
table_rows = [
1320+
row for row in report.metrics if _triplet_expert_key(row.param) is None
13131321
]
1314-
table_rows = non_expert_rows + selected_triplet_rows
13151322
detail_table = Table(
1316-
title=(
1317-
f"Variant Report | variant={report.variant} "
1318-
f"| selected_experts={len(selected_experts)} "
1319-
f"(top {EXPERT_TABLE_ROW_LIMIT} per projection by mean_abs_pct)"
1320-
),
1323+
title=f"Variant Report | variant={report.variant}",
13211324
box=box.SIMPLE_HEAVY,
13221325
show_lines=False,
13231326
)
@@ -1390,11 +1393,12 @@ def run_suite(
13901393
def _default_phase_pass_fns() -> dict[str, PhasePassFn]:
13911394
"""Builds default per-phase pass functions over diff summaries."""
13921395
# note the metrics get averaged across layers to reduce noise
1396+
# we also average across experts to reduce noise
13931397
# we don't expect particular layers to see errors as opposed to the others so this is helpful
13941398
fwd_out_loss = MetricThresholdRule(
13951399
limits={"relative_l2": 1e-2, "mean_abs_pct": 1.0}
13961400
)
1397-
grads_deltas = MetricThresholdRule(limits={"mean_abs_pct": 10.0})
1401+
grads_deltas = MetricThresholdRule(limits={"mean_abs_pct": 3.0})
13981402
router_topk_rule = (
13991403
MetricThresholdRule( # should be no mismatch due to router replay
14001404
limits={

tests/integration/megatron_oracle_worker.py

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -514,11 +514,12 @@ def _patch_lora_for_fp32(
514514
torch grouped_gemm is bf16 only, so we have a simple custom fp32 path
515515
to make the numbers match closely
516516
"""
517-
from art.megatron.lora import LoRA
517+
from art.megatron.lora import LoRA, MLPExpertsLinearFC1LoRA
518518

519519
del model_chunks
520520
del optimizer
521521
original_forward = LoRA.forward
522+
original_fc1_forward = MLPExpertsLinearFC1LoRA.forward
522523

523524
def _reference_forward(
524525
self: Any,
@@ -564,11 +565,24 @@ def _reference_forward(
564565

565566
return (out * self.scale).to(dtype=x.dtype)
566567

568+
def _reference_fc1_forward(self: Any, x: torch.Tensor, tokens_per_expert: Any):
569+
base_out, bias_out = self.linear_fc1(x, tokens_per_expert)
570+
adapter_out = torch.cat(
571+
(
572+
self.gate_lora(x, tokens_per_expert),
573+
self.up_lora(x, tokens_per_expert),
574+
),
575+
dim=1,
576+
)
577+
return base_out + adapter_out, bias_out
578+
567579
LoRA.forward = _reference_forward # ty: ignore[invalid-assignment]
580+
MLPExpertsLinearFC1LoRA.forward = _reference_fc1_forward # ty: ignore[invalid-assignment]
568581
try:
569582
yield
570583
finally:
571584
LoRA.forward = original_forward
585+
MLPExpertsLinearFC1LoRA.forward = original_fc1_forward
572586

573587

574588
@contextmanager

tests/integration/test_megatron_lora_oracle_correctness.py

Lines changed: 25 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,31 @@ def _suite_world_size() -> int:
5858
return max(topology.world_size() for topology in suite_topologies)
5959

6060

61+
def test_megatron_lora_topology_suite(capsys: pytest.CaptureFixture[str]) -> None:
62+
"""
63+
Runs the suite of topologies and expects each to pass (numerical differences within our thresholds)
64+
"""
65+
_announce_report_log(log_path=CORRECTNESS_LOG_PATH, capsys=capsys)
66+
suite_world_size = _suite_world_size()
67+
gpu_count = available_gpu_count()
68+
if gpu_count < suite_world_size:
69+
CORRECTNESS_LOG_PATH.parent.mkdir(parents=True, exist_ok=True)
70+
CORRECTNESS_LOG_PATH.write_text(
71+
(
72+
"Topology suite skipped. "
73+
f"Need {suite_world_size} GPUs, found {gpu_count}.\n"
74+
),
75+
encoding="utf-8",
76+
)
77+
_require_gpus_for(suite_world_size)
78+
_run_suite_with_log(
79+
log_path=CORRECTNESS_LOG_PATH,
80+
run=lambda: run_suite(
81+
case_config=case_config(),
82+
),
83+
)
84+
85+
6186
def test_megatron_lora_diff_sensitivity(capsys: pytest.CaptureFixture[str]) -> None:
6287
"""
6388
Runs each of the sensitivity mutations (e.g. drop megatron finalize grads)
@@ -99,28 +124,3 @@ def test_megatron_lora_diff_sensitivity(capsys: pytest.CaptureFixture[str]) -> N
99124
mutations=mutations,
100125
),
101126
)
102-
103-
104-
def test_megatron_lora_topology_suite(capsys: pytest.CaptureFixture[str]) -> None:
105-
"""
106-
Runs the suite of topologies and expects each to pass (numerical differences within our thresholds)
107-
"""
108-
_announce_report_log(log_path=CORRECTNESS_LOG_PATH, capsys=capsys)
109-
suite_world_size = _suite_world_size()
110-
gpu_count = available_gpu_count()
111-
if gpu_count < suite_world_size:
112-
CORRECTNESS_LOG_PATH.parent.mkdir(parents=True, exist_ok=True)
113-
CORRECTNESS_LOG_PATH.write_text(
114-
(
115-
"Topology suite skipped. "
116-
f"Need {suite_world_size} GPUs, found {gpu_count}.\n"
117-
),
118-
encoding="utf-8",
119-
)
120-
_require_gpus_for(suite_world_size)
121-
_run_suite_with_log(
122-
log_path=CORRECTNESS_LOG_PATH,
123-
run=lambda: run_suite(
124-
case_config=case_config(),
125-
),
126-
)

0 commit comments

Comments
 (0)