11#include < algorithm>
22#include < cctype>
3+ #include < cosma/local_multiply.hpp>
4+ #include < cosma/mpi_mapper.hpp>
5+ #include < cosma/multiply.hpp>
36#include < cstdlib>
47#include < fstream>
58#include < iostream>
9+ #include < random>
610#include < string>
711#include < vector>
8- #include < random>
9- #include < cosma/local_multiply.hpp>
10- #include < cosma/multiply.hpp>
11- #include < cosma/mpi_mapper.hpp>
1212
1313using namespace cosma ;
1414
1515template <typename T>
16- void fill_matrix (T* ptr, size_t size) {
17- static std::random_device dev; // seed
18- static std::mt19937 rng (dev ()); // generator
16+ void fill_matrix (T * ptr, size_t size) {
17+ static std::random_device dev; // seed
18+ static std::mt19937 rng (dev ()); // generator
1919 static std::uniform_real_distribution<T> dist (10.0 ); // distribution
2020
2121 for (unsigned i = 0 ; i < size; ++i) {
@@ -24,9 +24,9 @@ void fill_matrix(T* ptr, size_t size) {
2424}
2525
2626template <typename T>
27- void fill_matrix (std::complex <T>* ptr, size_t size) {
28- static std::random_device dev; // seed
29- static std::mt19937 rng (dev ()); // generator
27+ void fill_matrix (std::complex <T> * ptr, size_t size) {
28+ static std::random_device dev; // seed
29+ static std::mt19937 rng (dev ()); // generator
3030 static std::uniform_real_distribution<T> dist (10.0 ); // distribution
3131
3232 for (unsigned i = 0 ; i < size; ++i) {
@@ -36,10 +36,10 @@ void fill_matrix(std::complex<T>* ptr, size_t size) {
3636
3737template <typename Scalar>
3838bool test_cosma (Strategy s,
39- context<Scalar>& ctx,
40- MPI_Comm comm = MPI_COMM_WORLD,
41- double epsilon = 1e-8 ,
42- int tag = 0 ) {
39+ context<Scalar> & ctx,
40+ MPI_Comm comm = MPI_COMM_WORLD,
41+ double epsilon = 1e-8 ,
42+ int tag = 0 ) {
4343 auto alpha = Scalar{1 };
4444 auto beta = Scalar{1 };
4545
@@ -103,12 +103,15 @@ bool test_cosma(Strategy s,
103103 std::vector<Scalar> As, Bs, Cs;
104104 if (rank == 0 ) {
105105 As = std::vector<Scalar>(m * k);
106- std::memcpy (As.data (), A.matrix_pointer (), A.matrix_size ()*sizeof (Scalar));
106+ std::memcpy (
107+ As.data (), A.matrix_pointer (), A.matrix_size () * sizeof (Scalar));
107108 Bs = std::vector<Scalar>(k * n);
108- std::memcpy (Bs.data (), B.matrix_pointer (), B.matrix_size ()*sizeof (Scalar));
109+ std::memcpy (
110+ Bs.data (), B.matrix_pointer (), B.matrix_size () * sizeof (Scalar));
109111 // copy C in case beta > 0
110112 Cs = std::vector<Scalar>(m * n);
111- std::memcpy (Cs.data (), C.matrix_pointer (), C.matrix_size ()*sizeof (Scalar));
113+ std::memcpy (
114+ Cs.data (), C.matrix_pointer (), C.matrix_size () * sizeof (Scalar));
112115
113116 int offsetA = sizeA;
114117 int offsetB = sizeB;
@@ -124,53 +127,62 @@ bool test_cosma(Strategy s,
124127
125128 // Rank 0 receive data
126129 int info = MPI_Recv (As.data () + offsetA,
127- receive_size_A,
128- mpi_type,
129- i,
130- tag* n_comm_rounds,
131- comm,
132- &status);
130+ receive_size_A,
131+ mpi_type,
132+ i,
133+ tag * n_comm_rounds,
134+ comm,
135+ &status);
133136 if (info != MPI_SUCCESS) {
134137 // check if we received the right amount
135138 MPI_Get_elements (&status, mpi_type, &amount);
136139 if (amount != receive_size_A) {
137- std::cout << " Error: Did not receive all data for matrix A!" << std::endl;
138- std::cout << " Received " << amount << " , instead of " << receive_size_A << std::endl;
139- std::cout << " Message source: " << status.MPI_SOURCE << " , tag = " << status.MPI_TAG << std::endl;
140+ std::cout << " Error: Did not receive all data for matrix A!"
141+ << std::endl;
142+ std::cout << " Received " << amount << " , instead of "
143+ << receive_size_A << std::endl;
144+ std::cout << " Message source: " << status.MPI_SOURCE
145+ << " , tag = " << status.MPI_TAG << std::endl;
140146 }
141147 }
142148
143149 info = MPI_Recv (Bs.data () + offsetB,
144- receive_size_B,
145- mpi_type,
146- i,
147- tag* n_comm_rounds + 1 ,
148- comm,
149- &status);
150+ receive_size_B,
151+ mpi_type,
152+ i,
153+ tag * n_comm_rounds + 1 ,
154+ comm,
155+ &status);
150156 if (info != MPI_SUCCESS) {
151157 // check if we received the right amount
152158 MPI_Get_elements (&status, mpi_type, &amount);
153159 if (amount != receive_size_B) {
154- std::cout << " Error: Did not receive all data for matrix B!" << std::endl;
155- std::cout << " Received " << amount << " , instead of " << receive_size_B << std::endl;
156- std::cout << " Message source: " << status.MPI_SOURCE << " , tag = " << status.MPI_TAG << std::endl;
160+ std::cout << " Error: Did not receive all data for matrix B!"
161+ << std::endl;
162+ std::cout << " Received " << amount << " , instead of "
163+ << receive_size_B << std::endl;
164+ std::cout << " Message source: " << status.MPI_SOURCE
165+ << " , tag = " << status.MPI_TAG << std::endl;
157166 }
158167 }
159168
160169 info = MPI_Recv (Cs.data () + offsetC,
161- receive_size_C,
162- mpi_type,
163- i,
164- tag* n_comm_rounds + 2 ,
165- comm,
166- &status);
170+ receive_size_C,
171+ mpi_type,
172+ i,
173+ tag * n_comm_rounds + 2 ,
174+ comm,
175+ &status);
167176 if (info != MPI_SUCCESS) {
168177 // check if we received the right amount
169178 MPI_Get_elements (&status, mpi_type, &amount);
170179 if (amount != receive_size_C) {
171- std::cout << " Error: Did not receive all data for matrix C!" << std::endl;
172- std::cout << " Received " << amount << " , instead of " << receive_size_C << std::endl;
173- std::cout << " Message source: " << status.MPI_SOURCE << " , tag = " << status.MPI_TAG << std::endl;
180+ std::cout << " Error: Did not receive all data for matrix C!"
181+ << std::endl;
182+ std::cout << " Received " << amount << " , instead of "
183+ << receive_size_C << std::endl;
184+ std::cout << " Message source: " << status.MPI_SOURCE
185+ << " , tag = " << status.MPI_TAG << std::endl;
174186 }
175187 }
176188
@@ -181,17 +193,31 @@ bool test_cosma(Strategy s,
181193 }
182194 // Rank i send data
183195 if (rank > 0 && rank < s.P ) {
184- int info = MPI_Ssend (A.matrix_pointer (), sizeA, mpi_type, 0 , tag*n_comm_rounds, comm);
196+ int info = MPI_Ssend (
197+ A.matrix_pointer (), sizeA, mpi_type, 0 , tag * n_comm_rounds, comm);
185198 if (info != MPI_SUCCESS) {
186- std::cout << " MPI_Send was not successful on rank: " << rank << " , for matrix A" << std::endl;
199+ std::cout << " MPI_Send was not successful on rank: " << rank
200+ << " , for matrix A" << std::endl;
187201 }
188- info = MPI_Ssend (B.matrix_pointer (), sizeB, mpi_type, 0 , tag*n_comm_rounds+1 , comm);
202+ info = MPI_Ssend (B.matrix_pointer (),
203+ sizeB,
204+ mpi_type,
205+ 0 ,
206+ tag * n_comm_rounds + 1 ,
207+ comm);
189208 if (info != MPI_SUCCESS) {
190- std::cout << " MPI_Send was not successful on rank: " << rank << " , for matrix B" << std::endl;
209+ std::cout << " MPI_Send was not successful on rank: " << rank
210+ << " , for matrix B" << std::endl;
191211 }
192- info = MPI_Ssend (C.matrix_pointer (), sizeC, mpi_type, 0 , tag*n_comm_rounds+2 , comm);
212+ info = MPI_Ssend (C.matrix_pointer (),
213+ sizeC,
214+ mpi_type,
215+ 0 ,
216+ tag * n_comm_rounds + 2 ,
217+ comm);
193218 if (info != MPI_SUCCESS) {
194- std::cout << " MPI_Send was not successful on rank: " << rank << " , for matrix C" << std::endl;
219+ std::cout << " MPI_Send was not successful on rank: " << rank
220+ << " , for matrix C" << std::endl;
195221 }
196222 }
197223
@@ -247,15 +273,14 @@ bool test_cosma(Strategy s,
247273 offsetC += local_size_C;
248274 }
249275 // Now compute the result
250- cosma::local_multiply_cpu (
251- globA.data (),
252- globB.data (),
253- globCcheck.data (),
254- m,
255- n,
256- k,
257- alpha,
258- beta);
276+ cosma::local_multiply_cpu (globA.data (),
277+ globB.data (),
278+ globCcheck.data (),
279+ m,
280+ n,
281+ k,
282+ alpha,
283+ beta);
259284#ifdef DEBUG
260285 std::cout << " Complete matrix A: " << std::endl;
261286 for (int i = 0 ; i < m; i++) {
@@ -289,7 +314,8 @@ bool test_cosma(Strategy s,
289314
290315 // Then rank0 asks for other ranks data
291316 if (rank == 0 ) {
292- std::memcpy (Cs.data (), C.matrix_pointer (), C.matrix_size ()*sizeof (Scalar));
317+ std::memcpy (
318+ Cs.data (), C.matrix_pointer (), C.matrix_size () * sizeof (Scalar));
293319
294320 int offsetC = sizeC;
295321
@@ -300,15 +326,20 @@ bool test_cosma(Strategy s,
300326 receive_size_C,
301327 mpi_type,
302328 i,
303- tag* n_comm_rounds + 4 ,
329+ tag * n_comm_rounds + 4 ,
304330 comm,
305331 MPI_STATUSES_IGNORE);
306332 offsetC += receive_size_C;
307333 }
308334 }
309335 // Rank i sends data
310336 if (rank > 0 && rank < s.P ) {
311- MPI_Ssend (C.matrix_pointer (), sizeC, mpi_type, 0 , tag*n_comm_rounds+4 , comm);
337+ MPI_Ssend (C.matrix_pointer (),
338+ sizeC,
339+ mpi_type,
340+ 0 ,
341+ tag * n_comm_rounds + 4 ,
342+ comm);
312343 }
313344
314345 // Then rank 0 must reorder data locally
@@ -333,26 +364,45 @@ bool test_cosma(Strategy s,
333364 // Now Check result
334365 isOK = globCcheck.size () == globC.size ();
335366 for (int i = 0 ; i < globC.size (); ++i) {
336- isOK = isOK && (std::abs (globC[i] - globCcheck[i]) < epsilon);
367+ // Use relative error for large values, absolute error for small
368+ // values
369+ double abs_error = std::abs (globC[i] - globCcheck[i]);
370+ double scale =
371+ std::max (std::abs (globC[i]), std::abs (globCcheck[i]));
372+ double rel_error = (scale > 1e-10 ) ? abs_error / scale : abs_error;
373+ // For float32, relative error tolerance should be ~1e-6
374+ // For float64, relative error tolerance should be ~1e-12
375+ double tolerance = (sizeof (Scalar) == 4 ) ? 1e-5 : epsilon;
376+ isOK = isOK && (rel_error < tolerance);
337377 }
338378
339379 if (!isOK) {
340380 std::cout << " Result is NOT OK" << std::endl;
381+ int error_count = 0 ;
382+ const int MAX_ERRORS_TO_PRINT = 20 ;
341383 for (int i = 0 ; i < m * n; i++) {
342384 if (globCcheck[i] != globC[i]) {
343- int x = i % m;
344- int y = i / m;
345- int locidx, rank;
346- std::tie (locidx, rank) = C.local_coordinates (x, y);
347- std::cout << " global(" << x << " , " << y
348- << " ) = (loc = " << locidx << " , rank = " << rank
349- << " ) = " << globC.at (i) << " and should be "
350- << globCcheck.at (i) << std::endl;
385+ error_count++;
386+ if (error_count <= MAX_ERRORS_TO_PRINT) {
387+ int x = i % m;
388+ int y = i / m;
389+ int locidx, rank;
390+ std::tie (locidx, rank) = C.local_coordinates (x, y);
391+ std::cout
392+ << " global(" << x << " , " << y
393+ << " ) = (loc = " << locidx << " , rank = " << rank
394+ << " ) = " << globC.at (i) << " and should be "
395+ << globCcheck.at (i) << " (diff = "
396+ << std::abs (globC.at (i) - globCcheck.at (i)) << " )"
397+ << std::endl;
398+ }
351399 }
352400 }
353- }
354- else {
355- std::cout <<" Result is OK" <<std::endl;
401+ std::cout << " Total errors: " << error_count << " out of "
402+ << (m * n) << " elements ("
403+ << (100.0 * error_count / (m * n)) << " %)" << std::endl;
404+ } else {
405+ std::cout << " Result is OK" << std::endl;
356406 }
357407 }
358408#ifdef DEBUG
@@ -376,5 +426,9 @@ bool test_cosma(Strategy s,
376426 MPI_Barrier (comm);
377427 }
378428#endif // DEBUG
429+
430+ // Synchronize all ranks before returning to prevent hangs
431+ MPI_Barrier (comm);
432+
379433 return rank > 0 || isOK;
380434}
0 commit comments