[mpmd] Merge inferred fragments inline in UniquifyFunctionInputsOutputsPass.

petebu · copybara-github · commit f40dce328ad1 · 2026-05-29T11:38:29.000-07:00
Instead of running a separate MergeInferredFragmentsPass after uniquify,
merge each newly created inferred fragment into an existing same-mesh
fragment directly within the pass. Removes the separate pass from the
export pipeline.

PiperOrigin-RevId: 908184894
diff --git a/shardy/dialect/mpmd/transforms/common/passes.td b/shardy/dialect/mpmd/transforms/common/passes.td
@@ -450,6 +450,10 @@ def UniquifyFunctionInputsOutputsPass :
     Similarly, if a function returns a block argument, this pass creates an
     identity fragment for that block argument, guaranteeing that values are
     passed by value to the function, not by reference.
+
+    Additionally, when not using transfers, the pass attempts to merge each
+    newly created inferred fragment into an existing same-mesh fragment to
+    reduce the number of fragments; if none is suitable, it is left unmerged.
   }];
 
   let options = [
diff --git a/shardy/dialect/mpmd/transforms/common/test/uniquify_function_inputs_outputs_with_reshard.mlir b/shardy/dialect/mpmd/transforms/common/test/uniquify_function_inputs_outputs_with_reshard.mlir
@@ -26,11 +26,9 @@ func.func @no_work_needed(%arg0: !mesh_1_tensor, %arg1: !mesh_2_tensor) -> (!mes
 func.func @single_mesh_one_return_operand(%arg0: !mesh_1_tensor) -> (!mesh_1_tensor, !mesh_1_tensor, !mesh_1_tensor) attributes {
   "topology"=#mpmd.topology<<"m1": <["x"=2]>>>
 } {
-  // CHECK-NEXT: %[[F1:.*]] = mpmd.fragment<mesh="m1", origin=["f1"]>
-  // CHECK:      %[[F2:.*]] = mpmd.fragment<mesh="m1", origin=["f2"]>
-  // CHECK:      %[[UF:.*]]:2 = mpmd.fragment<mesh="m1", origin=[]> (%[[F1]]) {mpmd.inferred_by = ["uniquify"]} (%arg1: tensor<4xf32>) {
-  // CHECK:         mpmd.return %arg1, %arg1 : tensor<4xf32>, tensor<4xf32>
-  // CHECK:      %[[F2]], %[[UF]]#0, %[[UF]]#1
+  // CHECK-NEXT: %[[F1:.*]]:3 = mpmd.fragment<mesh="m1", origin=["f1"]>
+  // CHECK:      %[[F2:.*]] = mpmd.fragment<mesh="m1", origin=["f2"]> (%[[F1]]#0)
+  // CHECK:      return %[[F2]], %[[F1]]#1, %[[F1]]#2
   %0 = mpmd.fragment<mesh="m1", origin=["f1"]> (%arg0) (%arg1: tensor<4xf32>) {
     %1 = stablehlo.add %arg1, %arg1 : tensor<4xf32>
     mpmd.return %1 : tensor<4xf32>
@@ -48,11 +46,8 @@ func.func @needs_fragment_for_m1_with_many_values(%arg0: !mesh_1_tensor, %arg1:
 } {
   // CHECK-NEXT: %[[F1:.*]] = mpmd.fragment<mesh="m1", origin=["f1"]>
   // CHECK:      %[[F2:.*]] = mpmd.fragment<mesh="m2", origin=["f2"]>
-  // CHECK:      %[[F3:.*]] = mpmd.fragment<mesh="m1", origin=["f3"]>
-  // CHECK:      %[[UF:.*]]:5 = mpmd.fragment<mesh="m1", origin=[]> (%[[F1]], %[[F3]]) {mpmd.inferred_by = ["uniquify"]} (%[[A1:.*]]: tensor<4xf32>, %[[A2:.*]]: tensor<4xf32>)
-  // CHECK-NEXT:   mpmd.return %[[A1]], %[[A1]], %[[A2]], %[[A2]], %[[A2]]
-  // CHECK-NEXT: }
-  // CHECK-NEXT: return %[[F2]], %[[UF]]#0, %[[UF]]#2, %[[UF]]#1, %[[UF]]#3, %[[UF]]#4
+  // CHECK:      %[[F3:.*]]:5 = mpmd.fragment<mesh="m1", origin=["f3"]> (%[[F1]], %arg0)
+  // CHECK:      return %[[F2]], %[[F3]]#0, %[[F3]]#2, %[[F3]]#1, %[[F3]]#3, %[[F3]]#4
   %0 = mpmd.fragment<mesh="m1", origin=["f1"]> (%arg0) (%arg2: tensor<4xf32>) {
     mpmd.return %arg2 : tensor<4xf32>
   } : (!mesh_1_tensor) -> !mesh_1_tensor
@@ -70,9 +65,10 @@ func.func @needs_fragment_for_m1_and_m2(%arg0: !mesh_1_tensor, %arg1: !mesh_2_te
 ) -> (!mesh_1_tensor, !mesh_2_tensor, !mesh_2_tensor, !mesh_1_tensor, !mesh_1_tensor, !mesh_1_tensor) attributes {
   "topology"=#mpmd.topology<<"m1": <["x"=2]>>, <"m2": <["x"=2]>>>
 } {
-  // CHECK: %[[UF1:.*]]:4 = mpmd.fragment<mesh="m1", origin=[]> ({{.*}}) {mpmd.inferred_by = ["uniquify"]}
-  // CHECK: %[[UF2:.*]]:2 = mpmd.fragment<mesh="m2", origin=[]> ({{.*}}) {mpmd.inferred_by = ["uniquify"]}
-  // CHECK: return %[[UF1]]#0, %[[UF2]]#0, %[[UF2]]#1, %[[UF1]]#2, %[[UF1]]#1, %[[UF1]]#3
+  // CHECK: %[[F1:.*]] = mpmd.fragment<mesh="m1", origin=["f1"]>
+  // CHECK: %[[F2:.*]]:2 = mpmd.fragment<mesh="m2", origin=["f2"]>
+  // CHECK: %[[F3:.*]]:4 = mpmd.fragment<mesh="m1", origin=["f3"]> (%[[F1]], %arg0)
+  // CHECK: return %[[F3]]#0, %[[F2]]#0, %[[F2]]#1, %[[F3]]#2, %[[F3]]#1, %[[F3]]#3
   %0 = mpmd.fragment<mesh="m1", origin=["f1"]> (%arg0) (%arg2: tensor<4xf32>) {
     mpmd.return %arg2 : tensor<4xf32>
   } : (!mesh_1_tensor) -> !mesh_1_tensor
@@ -95,11 +91,9 @@ module {
 func.func @single_mesh_one_return_operand_with_global_view(%arg0: !dist_mesh_tensor) -> (!dist_mesh_tensor, !dist_mesh_tensor, !dist_mesh_tensor) attributes {
   "topology"=#mpmd.topology<<"m1": <["x"=2]>>>
 } {
-  // CHECK-NEXT: %[[F1:.*]] = mpmd.fragment<mesh="m1", origin=["f1"]>
-  // CHECK:      %[[F2:.*]] = mpmd.fragment<mesh="m1", origin=["f2"]>
-  // CHECK:      %[[UF:.*]]:2 = mpmd.fragment<mesh="m1", origin=[]> (%[[F1]]) {mpmd.inferred_by = ["uniquify"]} (%arg1: tensor<4xf32>) {
-  // CHECK:         mpmd.return %arg1, %arg1 : tensor<4xf32>, tensor<4xf32>
-  // CHECK:      %[[F2]], %[[UF]]#0, %[[UF]]#1
+  // CHECK-NEXT: %[[F1:.*]]:3 = mpmd.fragment<mesh="m1", origin=["f1"]>
+  // CHECK:      %[[F2:.*]] = mpmd.fragment<mesh="m1", origin=["f2"]> (%[[F1]]#0)
+  // CHECK:      return %[[F2]], %[[F1]]#1, %[[F1]]#2
   %0 = mpmd.fragment<mesh="m1", origin=["f1"]> (%arg0) (%arg1: tensor<4xf32>) {
     %1 = stablehlo.add %arg1, %arg1 : tensor<4xf32>
     mpmd.return %1 : tensor<4xf32>
diff --git a/shardy/dialect/mpmd/transforms/common/uniquify_function_inputs_outputs.cc b/shardy/dialect/mpmd/transforms/common/uniquify_function_inputs_outputs.cc
@@ -18,18 +18,21 @@ limitations under the License.
 
 #include "llvm/ADT/DenseSet.h"
 #include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallVector.h"
 #include "mlir/Dialect/Func/IR/FuncOps.h"
 #include "mlir/IR/Builders.h"
 #include "mlir/IR/BuiltinAttributes.h"
 #include "mlir/IR/MLIRContext.h"
+#include "mlir/IR/PatternMatch.h"
 #include "mlir/IR/Types.h"
 #include "mlir/IR/Value.h"
 #include "mlir/Support/LLVM.h"
 #include "shardy/common/logging.h"
 #include "shardy/dialect/mpmd/ir/dialect.h"
 #include "shardy/dialect/mpmd/ir/utils.h"
 #include "shardy/dialect/mpmd/transforms/common/passes.h"  // IWYU pragma: keep
+#include "shardy/dialect/mpmd/transforms/common/utils.h"
 #include "shardy/dialect/sdy/ir/dialect.h"
 
 namespace mlir::mpmd {
@@ -41,17 +44,111 @@ namespace {
 
 using ValueToReturnIndices = llvm::MapVector<Value, SmallVector<int64_t>>;
 
+bool CanMoveAfter(Operation* op_to_move, Operation* target_op) {
+  if (op_to_move->getBlock() != target_op->getBlock()) return false;
+  if (!op_to_move->isBeforeInBlock(target_op)) return false;
+
+  Operation* current = op_to_move->getNextNode();
+  while (current) {
+    for (Value result : op_to_move->getResults()) {
+      if (llvm::is_contained(current->getOperands(), result)) {
+        return false;
+      }
+    }
+    for (Value operand : op_to_move->getOperands()) {
+      if (operand.getDefiningOp() == current) {
+        return false;
+      }
+    }
+
+    if (current == target_op) break;
+    current = current->getNextNode();
+  }
+  return true;
+}
+
+// Tries to merge the newly created inferred fragment into an existing
+// same-mesh fragment in the block; a no-op if no suitable fragment exists.
+void MergeInferredFragmentWithExisting(FragmentOp fragment_op,
+                                       StringRef mesh_name,
+                                       Operation* return_op,
+                                       OpBuilder& builder) {
+  // Try to find an existing same-mesh fragment to merge the newly created
+  // inferred fragment into. We track two things:
+  //   - latest_operand_producer: the latest op that produces any operand of
+  //     the inferred fragment (needed for positioning constraints).
+  //   - merge_target: the latest same-mesh fragment we can merge into.
+  FragmentOp merge_target = nullptr;
+  Operation* latest_operand_producer = nullptr;
+
+  // Updates merge_target if `op` is a same-mesh fragment that appears later
+  // in the block than the current candidate.
+  auto updateMergeTarget = [&](Operation* op) {
+    auto frag = dyn_cast<FragmentOp>(op);
+    if (frag && frag.getMeshName() == mesh_name &&
+        (!merge_target || merge_target->isBeforeInBlock(frag))) {
+      merge_target = frag;
+    }
+  };
+
+  // First, scan the operand producers for a merge candidate.
+  for (Value v : fragment_op.getOperands()) {
+    Operation* op = v.getDefiningOp();
+    if (!op) continue;
+    if (!latest_operand_producer ||
+        latest_operand_producer->isBeforeInBlock(op)) {
+      latest_operand_producer = op;
+    }
+    updateMergeTarget(op);
+  }
+
+  // If no producer fragment on the same mesh was found among the operands,
+  // look for any fragment on the same mesh in the block (sideways merge).
+  // Only do this when there are actual producer ops (not just block arguments).
+  if (!merge_target && latest_operand_producer) {
+    for (Operation& op : *return_op->getBlock()) {
+      if (&op == return_op || &op == fragment_op) continue;
+      updateMergeTarget(&op);
+    }
+  }
+
+  // Give up if no merge candidate was found, or if the candidate can't be
+  // moved after the latest operand producer (which would break dominance).
+  if (!merge_target || (merge_target != latest_operand_producer &&
+                        !CanMoveAfter(merge_target, latest_operand_producer))) {
+    return;
+  }
+
+  // Position the merge target after the latest operand producer so all
+  // operands of the inferred fragment are available, then merge.
+  if (merge_target != latest_operand_producer) {
+    merge_target->moveAfter(latest_operand_producer);
+  }
+
+  fragment_op->moveAfter(merge_target);
+  IRRewriter rewriter(builder.getContext());
+  FragmentOp merged_fragment = MergeRegionOps(
+      merge_target, fragment_op, rewriter,
+      /*num_static_args=*/0, /*replace_producer_use_in_consumer_block=*/
+      [](OpOperand&, Value) {
+        SDY_CHECK(false) << "Fragment ops shouldn't have free variables";
+      },
+      GetFragmentOriginUnion(merge_target, fragment_op, rewriter),
+      merge_target.getMeshNameAttr(),
+      /*stage_id=*/merge_target.getStageIdAttr());
+  SetInferredByAttr(merged_fragment, "uniquify", builder);
+}
+
 void CreateReturnFragmentForMesh(StringRef mesh_name, Operation* return_op,
                                  ValueToReturnIndices& value_to_return_indices,
                                  OpBuilder& builder) {
   // We remove any entries that require no work, in order to avoid too many
   // checks.
   value_to_return_indices.remove_if([](const auto& it) {
     if (it.second.size() == 1) {
-      Value v = it.first;
-      return !isa<BlockArgument>(v);
+      return !isa<BlockArgument>(it.first);
     }
-    return it.second.empty();
+    return false;
   });
 
   if (value_to_return_indices.empty()) {
@@ -98,6 +195,8 @@ void CreateReturnFragmentForMesh(StringRef mesh_name, Operation* return_op,
   }
   auto block_builder = OpBuilder::atBlockEnd(&fragment_block);
   ReturnOp::create(block_builder, loc, returned_values);
+
+  MergeInferredFragmentWithExisting(fragment_op, mesh_name, return_op, builder);
 }
 
 // Replaces the return values of the function with transfer ops.
diff --git a/shardy/dialect/mpmd/transforms/export/export_pipeline.cc b/shardy/dialect/mpmd/transforms/export/export_pipeline.cc
@@ -86,11 +86,6 @@ void addExportPipeline(OpPassManager& pm, const ExportOptions& options) {
   // identity fragments, which would be canonicalized away.
   pm.addNestedPass<FuncOp>(createUniquifyFunctionInputsOutputsPass());
 
-  // The fragments created by the pass above maybe slowdown compilation (more
-  // fragments to compile) and may cause performance regressions. Thus, we merge
-  // them with other fragments.
-  pm.addNestedPass<FuncOp>(createMergeInferredFragmentsPass());
-
   // Mark each fragment with the inputs and outputs which are offloaded to host
   // memory.
   pm.addNestedPass<FuncOp>(createMarkOffloadedInputOutputPass());
diff --git a/shardy/dialect/mpmd/transforms/export/test/export_pipeline.mlir b/shardy/dialect/mpmd/transforms/export/test/export_pipeline.mlir
@@ -1,4 +1,4 @@
-// RUN: mpmd_opt %s -mpmd-export-pipeline 2>&1 | FileCheck %s
+// RUN: mpmd_opt %s -mpmd-export-pipeline -split-input-file 2>&1 | FileCheck %s
 
 !mesh_1_tensor_4_8_f32 = !mpmd.mesh_tensor<"m1", tensor<4x8xf32>>
 
@@ -17,3 +17,36 @@ func.func @main(%arg0: !mesh_1_tensor_4_8_f32 {tf.aliasing_output = 0: i32}, %ar
   } : (!mesh_1_tensor_4_8_f32, !mesh_1_tensor_4_8_f32) -> (!mesh_1_tensor_4_8_f32)
   func.return %0 : !mesh_1_tensor_4_8_f32
 }
+
+// -----
+
+!mesh_1_tensor_4_8_f32 = !mpmd.mesh_tensor<"m1", tensor<4x8xf32>>
+!mesh_2_tensor_4_8_f32 = !mpmd.mesh_tensor<"m2", tensor<4x8xf32>>
+
+// This test verifies that an explicit fragment and an inferred fragment
+// (created by the UniquifyFunctionInputsOutputsPass for the duplicated return
+// of the transfer result) are merged sideways. Without sideways merge, the
+// transfer result would produce a separate inferred fragment call on m1.
+// The function-level returns remain unique SSA values (%[[RES]]#0, #1, #2),
+// preserving the invariant established by the uniquify pass, even though the
+// fragment body internally returns the same value in multiple positions.
+// CHECK-LABEL: func.func @test_sideways_merge
+func.func @test_sideways_merge(%arg0: !mesh_1_tensor_4_8_f32, %arg1: !mesh_2_tensor_4_8_f32)
+  -> (!mesh_1_tensor_4_8_f32, !mesh_1_tensor_4_8_f32, !mesh_1_tensor_4_8_f32) attributes {
+      "topology"=#mpmd.topology<
+      <"m1": <["x"=2]>>,
+      <"m2": <["x"=2]>>
+    >} {
+  // CHECK: %[[RES:.*]]:3 = mpmd.fragment_call<mesh="m1", origin=["f1"]> @[[CALLEE_M1:.*]]
+  // CHECK-NOT: mpmd.fragment_call<mesh="m1"
+  // CHECK: return %[[RES]]#0, %[[RES]]#1, %[[RES]]#2
+
+  %0 = mpmd.fragment<mesh="m1", origin=["f1"]> (%arg0) (%arg2: tensor<4x8xf32>) {
+    %4 = stablehlo.add %arg2, %arg2 : tensor<4x8xf32>
+    mpmd.return %4 : tensor<4x8xf32>
+  } : (!mesh_1_tensor_4_8_f32) -> !mesh_1_tensor_4_8_f32
+
+  %1 = mpmd.transfer %arg1 : (!mesh_2_tensor_4_8_f32) -> !mesh_1_tensor_4_8_f32
+
+  func.return %0, %1, %1 : !mesh_1_tensor_4_8_f32, !mesh_1_tensor_4_8_f32, !mesh_1_tensor_4_8_f32
+}