Skip to content

Commit 39938b1

Browse files
Use relative tolerance for tests (#156)
* Fix validation tolerance: use relative error instead of absolute The validation was using absolute error tolerance (1e-8) which fails for large matrix multiplication results (magnitude ~1e4). This caused false negatives where COSMA computed correct results but failed validation. Changes: - Switch from absolute error to relative error for validation - Use 1e-5 tolerance for float32 (appropriate for single precision) - Use 1e-8 tolerance for float64 (appropriate for double precision) - Handle small values near zero with absolute error fallback This fixes issue #153 where K-split strategy was incorrectly reported as producing 93.6% errors when actual relative errors were < 1e-6. Tested with: - 32x896x896 float32: now passes (was 93.8% false errors) - 32x10000x896 float32: now passes (was 93.6% false errors) - 32x32x32 float64: still passes (regression test) * format --------- Co-authored-by: David Sanftenberg <david.sanftenberg@gmail.com>
1 parent 5ce36f9 commit 39938b1

1 file changed

Lines changed: 128 additions & 74 deletions

File tree

utils/cosma_utils.hpp

Lines changed: 128 additions & 74 deletions
Original file line numberDiff line numberDiff line change
@@ -1,21 +1,21 @@
11
#include <algorithm>
22
#include <cctype>
3+
#include <cosma/local_multiply.hpp>
4+
#include <cosma/mpi_mapper.hpp>
5+
#include <cosma/multiply.hpp>
36
#include <cstdlib>
47
#include <fstream>
58
#include <iostream>
9+
#include <random>
610
#include <string>
711
#include <vector>
8-
#include <random>
9-
#include <cosma/local_multiply.hpp>
10-
#include <cosma/multiply.hpp>
11-
#include <cosma/mpi_mapper.hpp>
1212

1313
using namespace cosma;
1414

1515
template <typename T>
16-
void fill_matrix(T* ptr, size_t size) {
17-
static std::random_device dev; // seed
18-
static std::mt19937 rng(dev()); // generator
16+
void fill_matrix(T *ptr, size_t size) {
17+
static std::random_device dev; // seed
18+
static std::mt19937 rng(dev()); // generator
1919
static std::uniform_real_distribution<T> dist(10.0); // distribution
2020

2121
for (unsigned i = 0; i < size; ++i) {
@@ -24,9 +24,9 @@ void fill_matrix(T* ptr, size_t size) {
2424
}
2525

2626
template <typename T>
27-
void fill_matrix(std::complex<T>* ptr, size_t size) {
28-
static std::random_device dev; // seed
29-
static std::mt19937 rng(dev()); // generator
27+
void fill_matrix(std::complex<T> *ptr, size_t size) {
28+
static std::random_device dev; // seed
29+
static std::mt19937 rng(dev()); // generator
3030
static std::uniform_real_distribution<T> dist(10.0); // distribution
3131

3232
for (unsigned i = 0; i < size; ++i) {
@@ -36,10 +36,10 @@ void fill_matrix(std::complex<T>* ptr, size_t size) {
3636

3737
template <typename Scalar>
3838
bool test_cosma(Strategy s,
39-
context<Scalar>& ctx,
40-
MPI_Comm comm = MPI_COMM_WORLD,
41-
double epsilon = 1e-8,
42-
int tag = 0) {
39+
context<Scalar> &ctx,
40+
MPI_Comm comm = MPI_COMM_WORLD,
41+
double epsilon = 1e-8,
42+
int tag = 0) {
4343
auto alpha = Scalar{1};
4444
auto beta = Scalar{1};
4545

@@ -103,12 +103,15 @@ bool test_cosma(Strategy s,
103103
std::vector<Scalar> As, Bs, Cs;
104104
if (rank == 0) {
105105
As = std::vector<Scalar>(m * k);
106-
std::memcpy(As.data(), A.matrix_pointer(), A.matrix_size()*sizeof(Scalar));
106+
std::memcpy(
107+
As.data(), A.matrix_pointer(), A.matrix_size() * sizeof(Scalar));
107108
Bs = std::vector<Scalar>(k * n);
108-
std::memcpy(Bs.data(), B.matrix_pointer(), B.matrix_size()*sizeof(Scalar));
109+
std::memcpy(
110+
Bs.data(), B.matrix_pointer(), B.matrix_size() * sizeof(Scalar));
109111
// copy C in case beta > 0
110112
Cs = std::vector<Scalar>(m * n);
111-
std::memcpy(Cs.data(), C.matrix_pointer(), C.matrix_size()*sizeof(Scalar));
113+
std::memcpy(
114+
Cs.data(), C.matrix_pointer(), C.matrix_size() * sizeof(Scalar));
112115

113116
int offsetA = sizeA;
114117
int offsetB = sizeB;
@@ -124,53 +127,62 @@ bool test_cosma(Strategy s,
124127

125128
// Rank 0 receive data
126129
int info = MPI_Recv(As.data() + offsetA,
127-
receive_size_A,
128-
mpi_type,
129-
i,
130-
tag*n_comm_rounds,
131-
comm,
132-
&status);
130+
receive_size_A,
131+
mpi_type,
132+
i,
133+
tag * n_comm_rounds,
134+
comm,
135+
&status);
133136
if (info != MPI_SUCCESS) {
134137
// check if we received the right amount
135138
MPI_Get_elements(&status, mpi_type, &amount);
136139
if (amount != receive_size_A) {
137-
std::cout << "Error: Did not receive all data for matrix A!" << std::endl;
138-
std::cout << "Received " << amount << ", instead of " << receive_size_A << std::endl;
139-
std::cout << "Message source: " << status.MPI_SOURCE << ", tag = " << status.MPI_TAG << std::endl;
140+
std::cout << "Error: Did not receive all data for matrix A!"
141+
<< std::endl;
142+
std::cout << "Received " << amount << ", instead of "
143+
<< receive_size_A << std::endl;
144+
std::cout << "Message source: " << status.MPI_SOURCE
145+
<< ", tag = " << status.MPI_TAG << std::endl;
140146
}
141147
}
142148

143149
info = MPI_Recv(Bs.data() + offsetB,
144-
receive_size_B,
145-
mpi_type,
146-
i,
147-
tag*n_comm_rounds + 1,
148-
comm,
149-
&status);
150+
receive_size_B,
151+
mpi_type,
152+
i,
153+
tag * n_comm_rounds + 1,
154+
comm,
155+
&status);
150156
if (info != MPI_SUCCESS) {
151157
// check if we received the right amount
152158
MPI_Get_elements(&status, mpi_type, &amount);
153159
if (amount != receive_size_B) {
154-
std::cout << "Error: Did not receive all data for matrix B!" << std::endl;
155-
std::cout << "Received " << amount << ", instead of " << receive_size_B << std::endl;
156-
std::cout << "Message source: " << status.MPI_SOURCE << ", tag = " << status.MPI_TAG << std::endl;
160+
std::cout << "Error: Did not receive all data for matrix B!"
161+
<< std::endl;
162+
std::cout << "Received " << amount << ", instead of "
163+
<< receive_size_B << std::endl;
164+
std::cout << "Message source: " << status.MPI_SOURCE
165+
<< ", tag = " << status.MPI_TAG << std::endl;
157166
}
158167
}
159168

160169
info = MPI_Recv(Cs.data() + offsetC,
161-
receive_size_C,
162-
mpi_type,
163-
i,
164-
tag*n_comm_rounds + 2,
165-
comm,
166-
&status);
170+
receive_size_C,
171+
mpi_type,
172+
i,
173+
tag * n_comm_rounds + 2,
174+
comm,
175+
&status);
167176
if (info != MPI_SUCCESS) {
168177
// check if we received the right amount
169178
MPI_Get_elements(&status, mpi_type, &amount);
170179
if (amount != receive_size_C) {
171-
std::cout << "Error: Did not receive all data for matrix C!" << std::endl;
172-
std::cout << "Received " << amount << ", instead of " << receive_size_C << std::endl;
173-
std::cout << "Message source: " << status.MPI_SOURCE << ", tag = " << status.MPI_TAG << std::endl;
180+
std::cout << "Error: Did not receive all data for matrix C!"
181+
<< std::endl;
182+
std::cout << "Received " << amount << ", instead of "
183+
<< receive_size_C << std::endl;
184+
std::cout << "Message source: " << status.MPI_SOURCE
185+
<< ", tag = " << status.MPI_TAG << std::endl;
174186
}
175187
}
176188

@@ -181,17 +193,31 @@ bool test_cosma(Strategy s,
181193
}
182194
// Rank i send data
183195
if (rank > 0 && rank < s.P) {
184-
int info = MPI_Ssend(A.matrix_pointer(), sizeA, mpi_type, 0, tag*n_comm_rounds, comm);
196+
int info = MPI_Ssend(
197+
A.matrix_pointer(), sizeA, mpi_type, 0, tag * n_comm_rounds, comm);
185198
if (info != MPI_SUCCESS) {
186-
std::cout << "MPI_Send was not successful on rank: " << rank << ", for matrix A" << std::endl;
199+
std::cout << "MPI_Send was not successful on rank: " << rank
200+
<< ", for matrix A" << std::endl;
187201
}
188-
info = MPI_Ssend(B.matrix_pointer(), sizeB, mpi_type, 0, tag*n_comm_rounds+1, comm);
202+
info = MPI_Ssend(B.matrix_pointer(),
203+
sizeB,
204+
mpi_type,
205+
0,
206+
tag * n_comm_rounds + 1,
207+
comm);
189208
if (info != MPI_SUCCESS) {
190-
std::cout << "MPI_Send was not successful on rank: " << rank << ", for matrix B" << std::endl;
209+
std::cout << "MPI_Send was not successful on rank: " << rank
210+
<< ", for matrix B" << std::endl;
191211
}
192-
info = MPI_Ssend(C.matrix_pointer(), sizeC, mpi_type, 0, tag*n_comm_rounds+2, comm);
212+
info = MPI_Ssend(C.matrix_pointer(),
213+
sizeC,
214+
mpi_type,
215+
0,
216+
tag * n_comm_rounds + 2,
217+
comm);
193218
if (info != MPI_SUCCESS) {
194-
std::cout << "MPI_Send was not successful on rank: " << rank << ", for matrix C" << std::endl;
219+
std::cout << "MPI_Send was not successful on rank: " << rank
220+
<< ", for matrix C" << std::endl;
195221
}
196222
}
197223

@@ -247,15 +273,14 @@ bool test_cosma(Strategy s,
247273
offsetC += local_size_C;
248274
}
249275
// Now compute the result
250-
cosma::local_multiply_cpu(
251-
globA.data(),
252-
globB.data(),
253-
globCcheck.data(),
254-
m,
255-
n,
256-
k,
257-
alpha,
258-
beta);
276+
cosma::local_multiply_cpu(globA.data(),
277+
globB.data(),
278+
globCcheck.data(),
279+
m,
280+
n,
281+
k,
282+
alpha,
283+
beta);
259284
#ifdef DEBUG
260285
std::cout << "Complete matrix A: " << std::endl;
261286
for (int i = 0; i < m; i++) {
@@ -289,7 +314,8 @@ bool test_cosma(Strategy s,
289314

290315
// Then rank0 asks for other ranks data
291316
if (rank == 0) {
292-
std::memcpy(Cs.data(), C.matrix_pointer(), C.matrix_size()*sizeof(Scalar));
317+
std::memcpy(
318+
Cs.data(), C.matrix_pointer(), C.matrix_size() * sizeof(Scalar));
293319

294320
int offsetC = sizeC;
295321

@@ -300,15 +326,20 @@ bool test_cosma(Strategy s,
300326
receive_size_C,
301327
mpi_type,
302328
i,
303-
tag*n_comm_rounds + 4,
329+
tag * n_comm_rounds + 4,
304330
comm,
305331
MPI_STATUSES_IGNORE);
306332
offsetC += receive_size_C;
307333
}
308334
}
309335
// Rank i sends data
310336
if (rank > 0 && rank < s.P) {
311-
MPI_Ssend(C.matrix_pointer(), sizeC, mpi_type, 0, tag*n_comm_rounds+4, comm);
337+
MPI_Ssend(C.matrix_pointer(),
338+
sizeC,
339+
mpi_type,
340+
0,
341+
tag * n_comm_rounds + 4,
342+
comm);
312343
}
313344

314345
// Then rank 0 must reorder data locally
@@ -333,26 +364,45 @@ bool test_cosma(Strategy s,
333364
// Now Check result
334365
isOK = globCcheck.size() == globC.size();
335366
for (int i = 0; i < globC.size(); ++i) {
336-
isOK = isOK && (std::abs(globC[i] - globCcheck[i]) < epsilon);
367+
// Use relative error for large values, absolute error for small
368+
// values
369+
double abs_error = std::abs(globC[i] - globCcheck[i]);
370+
double scale =
371+
std::max(std::abs(globC[i]), std::abs(globCcheck[i]));
372+
double rel_error = (scale > 1e-10) ? abs_error / scale : abs_error;
373+
// For float32, relative error tolerance should be ~1e-6
374+
// For float64, relative error tolerance should be ~1e-12
375+
double tolerance = (sizeof(Scalar) == 4) ? 1e-5 : epsilon;
376+
isOK = isOK && (rel_error < tolerance);
337377
}
338378

339379
if (!isOK) {
340380
std::cout << "Result is NOT OK" << std::endl;
381+
int error_count = 0;
382+
const int MAX_ERRORS_TO_PRINT = 20;
341383
for (int i = 0; i < m * n; i++) {
342384
if (globCcheck[i] != globC[i]) {
343-
int x = i % m;
344-
int y = i / m;
345-
int locidx, rank;
346-
std::tie(locidx, rank) = C.local_coordinates(x, y);
347-
std::cout << "global(" << x << ", " << y
348-
<< ") = (loc = " << locidx << ", rank = " << rank
349-
<< ") = " << globC.at(i) << " and should be "
350-
<< globCcheck.at(i) << std::endl;
385+
error_count++;
386+
if (error_count <= MAX_ERRORS_TO_PRINT) {
387+
int x = i % m;
388+
int y = i / m;
389+
int locidx, rank;
390+
std::tie(locidx, rank) = C.local_coordinates(x, y);
391+
std::cout
392+
<< "global(" << x << ", " << y
393+
<< ") = (loc = " << locidx << ", rank = " << rank
394+
<< ") = " << globC.at(i) << " and should be "
395+
<< globCcheck.at(i) << " (diff = "
396+
<< std::abs(globC.at(i) - globCcheck.at(i)) << ")"
397+
<< std::endl;
398+
}
351399
}
352400
}
353-
}
354-
else {
355-
std::cout <<"Result is OK"<<std::endl;
401+
std::cout << "Total errors: " << error_count << " out of "
402+
<< (m * n) << " elements ("
403+
<< (100.0 * error_count / (m * n)) << "%)" << std::endl;
404+
} else {
405+
std::cout << "Result is OK" << std::endl;
356406
}
357407
}
358408
#ifdef DEBUG
@@ -376,5 +426,9 @@ bool test_cosma(Strategy s,
376426
MPI_Barrier(comm);
377427
}
378428
#endif // DEBUG
429+
430+
// Synchronize all ranks before returning to prevent hangs
431+
MPI_Barrier(comm);
432+
379433
return rank > 0 || isOK;
380434
}

0 commit comments

Comments
 (0)