Skip to content

Commit 003ed71

Browse files
committed
Relax timeout to 180s
1 parent 7f2a703 commit 003ed71

1 file changed

Lines changed: 3 additions & 3 deletions

File tree

deep_gemm/include/deep_gemm/comm/barrier.cuh

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -50,15 +50,15 @@ CUTLASS_DEVICE void nvlink_barrier(const layout::Workspace& workspace,
50 50
ptx::red_add_rel_sys(sym_buffer.map(signal_ptr, thread_idx), signal_sign ? -1 : 1);
51 51
sync_scope();
52 52

53-
// Update status and wait arrival (with 30s timeout, at 2 GHz)
54-
constexpr int64_t kNumTimeoutCycles = 30ll * 2000000000ll;
53+
// Update status and wait arrival (with 180s timeout, at 2 GHz)
54+
constexpr int64_t kNumTimeoutCycles = 180ll * 2000000000ll;
55 55
if (thread_idx == 0) {
56 56
ptx::red_add(counter_ptr, 1);
57 57
const int target = signal_sign ? 0 : static_cast<int>(kNumRanks);
58 58
const auto start_clock = clock64();
59 59
while (ptx::ld_acq_sys(signal_ptr) != target) {
60 60
if (clock64() - start_clock >= kNumTimeoutCycles) {
61-
printf("DeepGEMM NVLink barrier timeout (30s): rank=%d, counter=%d, signal=%d, target=%d, phase=%d, sign=%d, tag=%d\n",
61+
printf("DeepGEMM NVLink barrier timeout (180s): rank=%d, counter=%d, signal=%d, target=%d, phase=%d, sign=%d, tag=%d\n",
62 62
sym_buffer.rank_idx, *counter_ptr, ptx::ld_acq_sys(signal_ptr), target, signal_phase, signal_sign, kTag);
63 63
DG_DEVICE_ASSERT(false and "NVLink barrier timeout");
64 64
}

0 commit comments

Comments
 (0)