Skip to content

Commit 1895a88

Browse files
committed
Parallelize tuple extraction in LAGraph_Matrix_Sum
The validation pass that precedes the per-matrix extraction loop now precomputes an offset (prefix-sum) array so each matrix's tuples occupy a disjoint region of the shared (I, J, X) buffer. That removes the loop-carried offset dependency and lets the extraction run across LG_nthreads_outer threads with OpenMP; each GrB_Matrix_extractTuples is still parallelized internally by GraphBLAS with LG_nthreads_inner threads (the two-level nested model). The public signature is unchanged: the thread count follows the usual LAGraph convention via LAGraph_SetNumThreads. Because GRB_TRY returns from the enclosing function, which is not permitted inside an OpenMP region, the first extraction error to be recorded is captured under a critical section and checked after the loop. Adds test_Matrix_Sum_parallel, which sums many overlapping matrices with multiple outer threads and compares the result against an independently accumulated expected matrix.
1 parent 43ee0ac commit 1895a88

2 files changed

Lines changed: 109 additions & 21 deletions

File tree

src/test/test_Matrix_Sum.c

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -193,6 +193,64 @@ void test_Matrix_Sum_brutal (void)
193193
}
194194
#endif
195195

196+
//------------------------------------------------------------------------------
197+
// test_Matrix_Sum_parallel: exercise the parallel extraction path
198+
//------------------------------------------------------------------------------
199+
200+
// Sum many matrices with multiple outer threads and confirm the result matches
201+
// an independently accumulated expected result. This stresses the disjoint-offset
202+
// extraction under real outer parallelism.
203+
204+
// Test driver: builds NMAT 10-by-10 FP64 matrices, sums them in one call to
// LAGraph_Matrix_Sum (dup = GrB_PLUS_FP64) after requesting 4 outer threads,
// and checks the result against Expected, which is accumulated one matrix at
// a time with GrB_eWiseAdd. Expected, C, and msg are not declared in this
// function; presumably they are file-scope variables of this test file
// (TODO confirm).
void test_Matrix_Sum_parallel (void)
205+
{
206+
setup ( ) ;
207+
208+
// request 4 outer threads (saving and restoring the prior settings); only
// the outer count is changed, the saved inner count is passed back as-is
209+
int save_outer, save_inner ;
210+
OK (LAGraph_GetNumThreads (&save_outer, &save_inner, msg)) ;
211+
OK (LAGraph_SetNumThreads (4, save_inner, msg)) ;
212+
213+
#define NMAT 16
214+
GrB_Matrix Mats [NMAT] ;
215+
// Expected is created here and accumulated into inside the loop below
OK (GrB_Matrix_new (&Expected, GrB_FP64, 10, 10)) ;
216+
217+
for (int k = 0 ; k < NMAT ; k++)
218+
{
219+
// each matrix has 3 distinct (i,j) entries; the (7,7) entry is shared
220+
// by every matrix and others overlap across matrices, so duplicates
221+
// must be summed when the matrices are combined
222+
// the entries are (k%10,3), (2,k%10), and (7,7), with values k+1, 1, and 2
GrB_Index Mi [ ] = { (GrB_Index) (k % 10), 2, 7 } ;
223+
GrB_Index Mj [ ] = { 3, (GrB_Index) (k % 10), 7 } ;
224+
double Mx [ ] = { (double) (k + 1), 1, 2 } ;
225+
Mats [k] = NULL ;
226+
OK (GrB_Matrix_new (&Mats [k], GrB_FP64, 10, 10)) ;
227+
// the last (dup) argument is NULL: the 3 tuples built here never share
// an (i,j) position within a single matrix
OK (GrB_Matrix_build_FP64 (Mats [k], Mi, Mj, Mx, 3, NULL)) ;
228+
// accumulate into Expected independently
229+
OK (GrB_eWiseAdd (Expected, NULL, NULL, GrB_PLUS_FP64, Expected,
230+
Mats [k], NULL)) ;
231+
}
232+
233+
// the call under test: combine all NMAT matrices with GrB_PLUS_FP64
OK (LAGraph_Matrix_Sum (&C, Mats, NMAT, GrB_PLUS_FP64, msg)) ;
234+
235+
// compare C against Expected; the outcome is returned through ok
bool ok ;
236+
OK (LAGraph_Matrix_IsEqual (&ok, C, Expected, msg)) ;
237+
TEST_CHECK (ok) ;
238+
TEST_MSG ("parallel sum of %d matrices did not match expected", NMAT) ;
239+
240+
// free every matrix created by this test
for (int k = 0 ; k < NMAT ; k++)
241+
{
242+
OK (GrB_free (&Mats [k])) ;
243+
}
244+
OK (GrB_free (&C)) ;
245+
OK (GrB_free (&Expected)) ;
246+
#undef NMAT
247+
248+
// restore the original thread settings
249+
OK (LAGraph_SetNumThreads (save_outer, save_inner, msg)) ;
250+
251+
teardown ( ) ;
252+
}
253+
196254
//------------------------------------------------------------------------------
197255
// test_Matrix_Sum_failures: test error handling
198256
//------------------------------------------------------------------------------
@@ -266,6 +324,7 @@ TEST_LIST =
266324
{
267325
{ "Matrix_Sum", test_Matrix_Sum },
268326
{ "Matrix_Sum_types", test_Matrix_Sum_types },
327+
{ "Matrix_Sum_parallel", test_Matrix_Sum_parallel },
269328
{ "Matrix_Sum_failures", test_Matrix_Sum_failures },
270329
#if LG_BRUTAL_TESTS
271330
{ "Matrix_Sum_brutal", test_Matrix_Sum_brutal },

src/utility/LAGraph_Matrix_Sum.c

Lines changed: 50 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -16,10 +16,14 @@
1616
//------------------------------------------------------------------------------
1717

1818
// LAGraph_Matrix_Sum combines an array of matrices into a single matrix C. It
19-
// computes the total number of entries across all inputs, allocates a single
20-
// tuple buffer (I, J, X) large enough to hold every entry, extracts the tuples
21-
// of each input matrix into that buffer, and then calls GrB_Matrix_build with
22-
// the binary operator dup to combine any duplicate (i,j) entries. With dup =
19+
// computes the total number of entries across all inputs and the offset at
20+
// which each matrix's tuples begin in a single shared tuple buffer (I, J, X)
21+
// large enough to hold every entry. Because each matrix writes to a disjoint
22+
// region of that buffer, the per-matrix extraction is parallelized across
23+
// LG_nthreads_outer threads with OpenMP; SuiteSparse:GraphBLAS parallelizes
24+
// each GrB_Matrix_extractTuples internally with LG_nthreads_inner threads. The
25+
// concatenated tuples are then passed to GrB_Matrix_build, using the binary
26+
// operator dup to combine any duplicate (i,j) entries. With dup =
2327
// GrB_PLUS_FP64 (for example) this computes the element-wise sum of all input
2428
// matrices.
2529

@@ -31,6 +35,7 @@
3135
LAGraph_Free ((void **) &I, NULL) ; \
3236
LAGraph_Free ((void **) &J, NULL) ; \
3337
LAGraph_Free ((void **) &X, NULL) ; \
38+
LAGraph_Free ((void **) &Offsets, NULL) ; \
3439
}
3540

3641
#define LG_FREE_ALL \
@@ -58,7 +63,7 @@ int LAGraph_Matrix_Sum
5863
//--------------------------------------------------------------------------
5964

6065
LG_CLEAR_MSG ;
61-
GrB_Index *I = NULL, *J = NULL ;
66+
GrB_Index *I = NULL, *J = NULL, *Offsets = NULL ;
6267
void *X = NULL ;
6368
LG_ASSERT_MSG (C != NULL, GrB_NULL_POINTER, "&C != NULL") ;
6469
LG_ASSERT (Matrices != NULL, GrB_NULL_POINTER) ;
@@ -78,10 +83,17 @@ int LAGraph_Matrix_Sum
7883
GRB_TRY (GrB_get (Matrices [0], &typecode, GrB_EL_TYPE_CODE)) ;
7984

8085
//--------------------------------------------------------------------------
81-
// validate every matrix and accumulate the total number of entries
86+
// validate every matrix and compute where its tuples begin in the buffer
8287
//--------------------------------------------------------------------------
8388

84-
GrB_Index total = 0 ;
89+
// Offsets [k] is the position in (I, J, X) at which the tuples of matrix k
90+
// begin; Offsets [k+1] - Offsets [k] is its number of entries. This prefix
91+
// sum gives each matrix a disjoint buffer region so the extraction below
92+
// can run in parallel without data races on the shared (I, J, X) buffers.
93+
94+
LG_TRY (LAGraph_Malloc ((void **) &Offsets, nmatrices + 1,
95+
sizeof (GrB_Index), msg)) ;
96+
Offsets [0] = 0 ;
8597
for (GrB_Index k = 0 ; k < nmatrices ; k++)
8698
{
8799
GrB_Matrix Ak = Matrices [k] ;
@@ -96,8 +108,9 @@ int LAGraph_Matrix_Sum
96108
LG_ASSERT_MSG (code == typecode, GrB_DOMAIN_MISMATCH,
97109
"all input matrices must have the same type") ;
98110
GRB_TRY (GrB_Matrix_nvals (&n, Ak)) ;
99-
total += n ;
111+
Offsets [k+1] = Offsets [k] + n ;
100112
}
113+
GrB_Index total = Offsets [nmatrices] ;
101114

102115
//--------------------------------------------------------------------------
103116
// allocate the shared row/column index buffers (guard against size 0)
@@ -107,14 +120,25 @@ int LAGraph_Matrix_Sum
107120
LG_TRY (LAGraph_Malloc ((void **) &I, alloc, sizeof (GrB_Index), msg)) ;
108121
LG_TRY (LAGraph_Malloc ((void **) &J, alloc, sizeof (GrB_Index), msg)) ;
109122

123+
//--------------------------------------------------------------------------
124+
// determine the number of threads for the outer extraction loop
125+
//--------------------------------------------------------------------------
126+
127+
int nthreads = LG_nthreads_outer ;
128+
nthreads = LAGRAPH_MIN (nthreads, (int) nmatrices) ;
129+
nthreads = LAGRAPH_MAX (nthreads, 1) ;
130+
110131
//--------------------------------------------------------------------------
111132
// extract tuples from every matrix, then build the result
112133
//--------------------------------------------------------------------------
113134

114135
// For each built-in type: allocate the value buffer X with the correct
115-
// element size, extract the tuples of every input matrix into the shared
116-
// buffer at the running offset, create C, and build it with the dup
117-
// operator to combine duplicate (i,j) entries.
136+
// element size, extract the tuples of every input matrix into its disjoint
137+
// region of the shared buffer (in parallel, since the regions never
138+
// overlap), create C, and build it with the dup operator to combine
139+
// duplicate (i,j) entries. GRB_TRY cannot be used inside an OpenMP region
140+
// (it returns from the function), so the first error is captured into
141+
// sum_status under a critical section and checked after the region.
118142

119143
#define LG_SUM_CASE(code, ctype, gtype, suffix) \
120144
case code : \
@@ -123,18 +147,23 @@ int LAGraph_Matrix_Sum
123147
LG_TRY (LAGraph_Malloc ((void **) &Xt, alloc, sizeof (ctype), \
124148
msg)) ; \
125149
X = (void *) Xt ; \
126-
GrB_Index offset = 0 ; \
127-
for (GrB_Index k = 0 ; k < nmatrices ; k++) \
150+
int sum_status = GrB_SUCCESS ; \
151+
int64_t k ; \
152+
_Pragma ("omp parallel for num_threads(nthreads) schedule(dynamic,1)") \
153+
for (k = 0 ; k < (int64_t) nmatrices ; k++) \
128154
{ \
129-
GrB_Index n, got ; \
130-
GRB_TRY (GrB_Matrix_nvals (&n, Matrices [k])) ; \
131-
if (n == 0) continue ; \
132-
got = n ; \
133-
GRB_TRY (GrB_Matrix_extractTuples_ ## suffix ( \
134-
I + offset, J + offset, Xt + offset, &got, \
135-
Matrices [k])) ; \
136-
offset += n ; \
155+
GrB_Index off = Offsets [k] ; \
156+
GrB_Index got = Offsets [k+1] - off ; \
157+
if (got == 0) continue ; \
158+
GrB_Info info = GrB_Matrix_extractTuples_ ## suffix ( \
159+
I + off, J + off, Xt + off, &got, Matrices [k]) ; \
160+
if (info < GrB_SUCCESS) \
161+
{ \
162+
_Pragma ("omp critical") \
163+
{ if (sum_status >= GrB_SUCCESS) sum_status = info ; } \
164+
} \
137165
} \
166+
GRB_TRY (sum_status) ; \
138167
GRB_TRY (GrB_Matrix_new (C, gtype, nrows, ncols)) ; \
139168
GRB_TRY (GrB_Matrix_build_ ## suffix (*C, I, J, Xt, total, \
140169
dup)) ; \

0 commit comments

Comments
 (0)