einsum: parallelize hadamard-reduction outer h-tile loop

bimalgaudel · bimalgaudel · commit 17105f2ef6dd · 2026-04-27T10:41:43.000-04:00
diff --git a/src/TiledArray/einsum/tiledarray.h b/src/TiledArray/einsum/tiledarray.h
@@ -687,10 +687,24 @@ auto einsum(expressions::TsrExpr<ArrayA_> A, expressions::TsrExpr<ArrayB_> B,
 
       auto pa = A.permutation;
       auto pb = B.permutation;
-      for (Index h : H.tiles) {
+
+      // Each H-tile iteration produces an independent output tile, so the
+      // loop is parallel-safe. Submit one MADNESS task per local H-tile; each
+      // task writes only its own slot of a pre-sized result vector, so no
+      // locking is needed. Wait on every future before leaving this scope
+      // (the tasks capture locals by reference), then collect in slot order.
+      std::vector<Index> local_hs;
+      {
+        auto const pc = C.permutation;
+        for (Index h : H.tiles) {
+          if (C.array.is_local(apply(pc, h))) local_hs.push_back(h);
+        }
+      }
+      std::vector<std::pair<Index, ResultTensor>> h_results(local_hs.size());
+
+      auto per_h_work = [&, pa, pb](Index h, size_t slot) -> bool {
         auto const pc = C.permutation;
         auto const c = apply(pc, h);
-        if (!C.array.is_local(c)) continue;
         size_t batch = 1;
         for (size_t i = 0; i < h.size(); ++i) {
           batch *= H.batch[i].at(h[i]);
@@ -752,8 +766,18 @@ auto einsum(expressions::TsrExpr<ArrayA_> A, expressions::TsrExpr<ArrayB_> B,
         tile = tile.reshape(shape);
         // then permute to target C layout c = (c1 c2 ...)
         if (pc) tile = tile.permute(pc);
-        // and move to C_local_tiles
-        C_local_tiles.emplace_back(std::move(c), std::move(tile));
+        h_results[slot] = {c, std::move(tile)};
+        return true;
+      };
+
+      std::vector<madness::Future<bool>> h_futures;
+      h_futures.reserve(local_hs.size());
+      for (size_t slot = 0; slot < local_hs.size(); ++slot) {
+        h_futures.push_back(world.taskq.add(per_h_work, local_hs[slot], slot));
+      }
+      for (auto &fut : h_futures) fut.get();
+      for (auto &r : h_results) {
+        C_local_tiles.emplace_back(std::move(r.first), std::move(r.second));
       }
 
       build_C_array();