Skip to content

Commit a3fb7bb

Browse files
authored
Stabilize MPI test timing (#780)
Synchronize ranks before timed sections so scheduler skew and barrier waits are not counted as task runtime, preventing rare timeout flakes like these:

```
[ RUN ] PicMatrixTests/NesterovARunFuncTestsProcesses3.MatmulFromPic/nesterov_a_test_task_processes_3_mpi_enabled_3_3
unknown file: error: C++ exception with description " Task execute time need to be: time < 1 secs. Original time in secs: 1.21769 " thrown in the test body.
[ OK ] PicMatrixTests/NesterovARunFuncTestsProcesses3.MatmulFromPic/nesterov_a_test_task_processes_3_mpi_enabled_3_3 (1224 ms)
[ FAILED ] PicMatrixTests/NesterovARunFuncTestsProcesses3.MatmulFromPic/nesterov_a_test_task_processes_3_mpi_enabled_3_3, where GetParam() = (64-byte object <20-AA 75-60 F6-7F 00-00 C0-6C 6E-60 F6-7F 00-00 00-00 00-00 00-00 00-00 00-00 00-00 00-00 00-00 00-00 00-00 00-00 00-00 00-00 00-00 00-00 00-00 00-00 00-00 00-00 00-00 88-77 B8-48 FD-01 00-00>, "nesterov_a_test_task_processes_3_mpi_enabled", (3, "3")) (1225 ms)
[ RUN ] PicMatrixTests/NesterovARunFuncTestsProcesses3.MatmulFromPic/nesterov_a_test_task_processes_3_mpi_enabled_7_7
job aborted:
[ranks] message
[0] terminated
[1] application aborted
aborting MPI_COMM_WORLD (comm=0x44000000), error 1, comm rank 1
[2] terminated
---- error analysis -----
[1] on runnervmqq1k9
D:\a\parallel_programming_course\parallel_programming_course\install\bin\ppc_func_tests aborted the job. abort code 1
---- error analysis -----
[ PROCESS 1 ]
[ PROCESS 1 ] Traceback (most recent call last):
  File "D:\a\parallel_programming_course\parallel_programming_course\scripts\run_tests.py", line 308, in <module>
    _execute(args_dict, env_copy)
  File "D:\a\parallel_programming_course\parallel_programming_course\scripts\run_tests.py", line 283, in _execute
    runner.run_processes(args_dict["additional_mpi_args"])
  File "D:\a\parallel_programming_course\parallel_programming_course\scripts\run_tests.py", line 247, in run_processes
    self.__run_exec(
  File "D:\a\parallel_programming_course\parallel_programming_course\scripts\run_tests.py", line 122, in __run_exec
    raise Exception(f"Subprocess return {result.returncode}.")
Exception: Subprocess return 1.
Error: Process completed with exit code 1.
```

Please go to the `Preview` tab and select the appropriate template:

* [Submit Student task (English)](?expand=1&template=task_submission_en.md)
* [Submit Student task (Russian)](?expand=1&template=task_submission_ru.md)
* [Submit Fix for Student task (English)](?expand=1&template=task_fix_submission_en.md)
* [Submit Fix for Student task (Russian)](?expand=1&template=task_fix_submission_ru.md)
1 parent 626e4be commit a3fb7bb

5 files changed

Lines changed: 28 additions & 6 deletions

File tree

modules/runners/src/runners.cpp

Lines changed: 6 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -82,22 +82,22 @@ int RunAllTests() {
8282
}
8383

8484
void SyncGTestSeed() {
85-
unsigned int seed = 0;
8685
int rank = -1;
8786
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
88-
if (rank == 0) {
87+
int seed = ::testing::GTEST_FLAG(random_seed);
88+
if (rank == 0 && seed == 0) {
8989
try {
90-
seed = std::random_device{}();
90+
seed = static_cast<int>((std::random_device{}() % 99999U) + 1U);
9191
} catch (...) {
9292
seed = 0;
9393
}
9494
if (seed == 0) {
9595
const auto now = static_cast<std::uint64_t>(std::chrono::steady_clock::now().time_since_epoch().count());
96-
seed = static_cast<unsigned int>(((now & 0x7fffffffULL) | 1ULL));
96+
seed = static_cast<int>((now % 99999ULL) + 1ULL);
9797
}
9898
}
99-
MPI_Bcast(&seed, 1, MPI_UNSIGNED, 0, MPI_COMM_WORLD);
100-
::testing::GTEST_FLAG(random_seed) = static_cast<int>(seed);
99+
MPI_Bcast(&seed, 1, MPI_INT, 0, MPI_COMM_WORLD);
100+
::testing::GTEST_FLAG(random_seed) = seed;
101101
}
102102

103103
void SyncGTestFilter() {

modules/util/include/func_test_util.hpp

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -103,6 +103,7 @@ class BaseRunFuncTests : public ::testing::TestWithParam<FuncTestParam<InType, O
103103

104104
void ValidateTask() {
105105
EXPECT_TRUE(task_->Validation());
106+
SynchronizeMpiRanks();
106107
EXPECT_TRUE(task_->PreProcessing());
107108
}
108109

modules/util/include/perf_test_util.hpp

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -85,6 +85,7 @@ class BaseRunPerfTests : public ::testing::TestWithParam<PerfTestParam<InType, O
8585
task_ = task_getter(GetTestInputData());
8686
ppc::performance::Perf perf(task_);
8787
ppc::performance::PerfAttr perf_attr;
88+
SynchronizeMpiRanks();
8889
SetPerfAttributes(perf_attr);
8990

9091
if (mode == ppc::performance::PerfResults::TypeOfRunning::kPipeline) {

modules/util/include/util.hpp

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -75,6 +75,7 @@ int GetNumThreads();
7575
int GetNumProc();
7676
double GetTaskMaxTime();
7777
double GetPerfMaxTime();
78+
void SynchronizeMpiRanks();
7879

7980
template <typename T>
8081
std::string GetNamespace() {

modules/util/src/util.cpp

Lines changed: 19 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,7 @@
11
#include "util/include/util.hpp"
22

3+
#include <mpi.h>
4+
35
#include <algorithm>
46
#include <array>
57
#include <filesystem>
@@ -65,3 +67,20 @@ bool ppc::util::IsUnderMpirun() {
6567
return static_cast<bool>(mpi_env.has_value());
6668
});
6769
}
70+
71+
void ppc::util::SynchronizeMpiRanks() {
72+
int initialized = 0;
73+
if (MPI_Initialized(&initialized) != MPI_SUCCESS || initialized == 0) {
74+
return;
75+
}
76+
77+
int finalized = 0;
78+
if (MPI_Finalized(&finalized) != MPI_SUCCESS || finalized != 0) {
79+
return;
80+
}
81+
82+
const int barrier_res = MPI_Barrier(MPI_COMM_WORLD);
83+
if (barrier_res != MPI_SUCCESS) {
84+
MPI_Abort(MPI_COMM_WORLD, barrier_res);
85+
}
86+
}

0 commit comments

Comments
 (0)