1616//------------------------------------------------------------------------------
1717
1818// LAGraph_Matrix_Sum combines an array of matrices into a single matrix C. It
19- // computes the total number of entries across all inputs, allocates a single
20- // tuple buffer (I, J, X) large enough to hold every entry, extracts the tuples
21- // of each input matrix into that buffer, and then calls GrB_Matrix_build with
22- // the binary operator dup to combine any duplicate (i,j) entries. With dup =
19+ // computes the total number of entries across all inputs and the offset at
20+ // which each matrix's tuples begin in a single shared tuple buffer (I, J, X)
21+ // large enough to hold every entry. Because each matrix writes to a disjoint
22+ // region of that buffer, the per-matrix extraction is parallelized across
23+ // LG_nthreads_outer threads with OpenMP; SuiteSparse:GraphBLAS parallelizes
24+ // each GrB_Matrix_extractTuples internally with LG_nthreads_inner threads. The
25+ // concatenated tuples are then passed to GrB_Matrix_build, using the binary
26+ // operator dup to combine any duplicate (i,j) entries. With dup =
2327// GrB_PLUS_FP64 (for example) this computes the element-wise sum of all input
2428// matrices.
2529
3135 LAGraph_Free ((void **) &I, NULL) ; \
3236 LAGraph_Free ((void **) &J, NULL) ; \
3337 LAGraph_Free ((void **) &X, NULL) ; \
38+ LAGraph_Free ((void **) &Offsets, NULL) ; \
3439}
3540
3641#define LG_FREE_ALL \
@@ -58,7 +63,7 @@ int LAGraph_Matrix_Sum
5863 //--------------------------------------------------------------------------
5964
6065 LG_CLEAR_MSG ;
61- GrB_Index * I = NULL , * J = NULL ;
66+ GrB_Index * I = NULL , * J = NULL , * Offsets = NULL ;
6267 void * X = NULL ;
6368 LG_ASSERT_MSG (C != NULL , GrB_NULL_POINTER , "&C != NULL" ) ;
6469 LG_ASSERT (Matrices != NULL , GrB_NULL_POINTER ) ;
@@ -78,10 +83,17 @@ int LAGraph_Matrix_Sum
7883 GRB_TRY (GrB_get (Matrices [0 ], & typecode , GrB_EL_TYPE_CODE )) ;
7984
8085 //--------------------------------------------------------------------------
81- // validate every matrix and accumulate the total number of entries
86+ // validate every matrix and compute where its tuples begin in the buffer
8287 //--------------------------------------------------------------------------
8388
84- GrB_Index total = 0 ;
89+ // Offsets [k] is the position in (I, J, X) at which the tuples of matrix k
90+ // begin; Offsets [k+1] - Offsets [k] is its number of entries. This prefix
91+ // sum gives each matrix a disjoint buffer region so the extraction below
92+ // can run in parallel without any data races.
93+
94+ LG_TRY (LAGraph_Malloc ((void * * ) & Offsets , nmatrices + 1 ,
95+ sizeof (GrB_Index ), msg )) ;
96+ Offsets [0 ] = 0 ;
8597 for (GrB_Index k = 0 ; k < nmatrices ; k ++ )
8698 {
8799 GrB_Matrix Ak = Matrices [k ] ;
@@ -96,8 +108,9 @@ int LAGraph_Matrix_Sum
96108 LG_ASSERT_MSG (code == typecode , GrB_DOMAIN_MISMATCH ,
97109 "all input matrices must have the same type" ) ;
98110 GRB_TRY (GrB_Matrix_nvals (& n , Ak )) ;
99- total += n ;
111+ Offsets [ k + 1 ] = Offsets [ k ] + n ;
100112 }
113+ GrB_Index total = Offsets [nmatrices ] ;
101114
102115 //--------------------------------------------------------------------------
103116 // allocate the shared row/column index buffers (guard against size 0)
@@ -107,14 +120,25 @@ int LAGraph_Matrix_Sum
107120 LG_TRY (LAGraph_Malloc ((void * * ) & I , alloc , sizeof (GrB_Index ), msg )) ;
108121 LG_TRY (LAGraph_Malloc ((void * * ) & J , alloc , sizeof (GrB_Index ), msg )) ;
109122
123+ //--------------------------------------------------------------------------
124+ // determine the number of threads for the outer extraction loop
125+ //--------------------------------------------------------------------------
126+
127+ int nthreads = LG_nthreads_outer ;
128+ nthreads = LAGRAPH_MIN (nthreads , (int ) nmatrices ) ;
129+ nthreads = LAGRAPH_MAX (nthreads , 1 ) ;
130+
110131 //--------------------------------------------------------------------------
111132 // extract tuples from every matrix, then build the result
112133 //--------------------------------------------------------------------------
113134
114135 // For each built-in type: allocate the value buffer X with the correct
115- // element size, extract the tuples of every input matrix into the shared
116- // buffer at the running offset, create C, and build it with the dup
117- // operator to combine duplicate (i,j) entries.
136+ // element size, extract the tuples of every input matrix into its disjoint
137+ // region of the shared buffer (in parallel, since the regions never
138+ // overlap), create C, and build it with the dup operator to combine
139+ // duplicate (i,j) entries. GRB_TRY cannot be used inside an OpenMP region
140+ // (it returns from the function), so the first error is captured into
141+ // sum_status under a critical section and checked after the region.
118142
119143 #define LG_SUM_CASE (code , ctype , gtype , suffix ) \
120144 case code : \
@@ -123,18 +147,23 @@ int LAGraph_Matrix_Sum
123147 LG_TRY (LAGraph_Malloc ((void **) &Xt, alloc, sizeof (ctype), \
124148 msg)) ; \
125149 X = (void *) Xt ; \
126- GrB_Index offset = 0 ; \
127- for (GrB_Index k = 0 ; k < nmatrices ; k++) \
150+ int sum_status = GrB_SUCCESS ; \
151+ int64_t k ; \
152+ _Pragma ("omp parallel for num_threads(nthreads) schedule(dynamic,1)") \
153+ for (k = 0 ; k < (int64_t) nmatrices ; k++) \
128154 { \
129- GrB_Index n, got ; \
130- GRB_TRY (GrB_Matrix_nvals (&n, Matrices [k])) ; \
131- if (n == 0) continue ; \
132- got = n ; \
133- GRB_TRY (GrB_Matrix_extractTuples_ ## suffix ( \
134- I + offset, J + offset, Xt + offset, &got, \
135- Matrices [k])) ; \
136- offset += n ; \
155+ GrB_Index off = Offsets [k] ; \
156+ GrB_Index got = Offsets [k+1] - off ; \
157+ if (got == 0) continue ; \
158+ GrB_Info info = GrB_Matrix_extractTuples_ ## suffix ( \
159+ I + off, J + off, Xt + off, &got, Matrices [k]) ; \
160+ if (info < GrB_SUCCESS) \
161+ { \
162+ _Pragma ("omp critical") \
163+ { if (sum_status >= GrB_SUCCESS) sum_status = info ; } \
164+ } \
137165 } \
166+ GRB_TRY (sum_status) ; \
138167 GRB_TRY (GrB_Matrix_new (C, gtype, nrows, ncols)) ; \
139168 GRB_TRY (GrB_Matrix_build_ ## suffix (*C, I, J, Xt, total, \
140169 dup)) ; \
0 commit comments