Skip to content

Commit b5d9008

Browse files
committed
sync streams
1 parent 6929a08 commit b5d9008

File tree

2 files changed

+68
-30
lines changed

2 files changed

+68
-30
lines changed

include/tensor.cuh

Lines changed: 66 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -156,7 +156,7 @@ private:
156156
m_cublasHandles.resize(m_numCublasHandlesStreams);
157157
m_cublasStreams.resize(m_numCublasHandlesStreams);
158158
m_cusolverHandles.resize(m_numCublasHandlesStreams);
159-
for (size_t i=0; i<m_numCublasHandlesStreams; i++) {
159+
for (size_t i = 0; i < m_numCublasHandlesStreams; i++) {
160160
gpuErrChk(cublasCreate(&m_cublasHandles[i]));
161161
gpuErrChk(cudaStreamCreate(&m_cublasStreams[i]));
162162
gpuErrChk(cublasSetStream(m_cublasHandles[i], m_cublasStreams[i]));
@@ -166,7 +166,7 @@ private:
166166
}
167167

168168
~Session() {
169-
for (size_t i=0; i<m_numCublasHandlesStreams; i++) {
169+
for (size_t i = 0; i < m_numCublasHandlesStreams; i++) {
170170
gpuErrChk(cublasDestroy(m_cublasHandles[i]));
171171
gpuErrChk(cusolverDnDestroy(m_cusolverHandles[i]));
172172
}
@@ -188,14 +188,14 @@ public:
188188
* @param idx index of stream
189189
* @return cuBLAS handle
190190
*/
191-
cublasHandle_t &cuBlasHandle(size_t idx=0) { return m_cublasHandles[idx]; }
191+
cublasHandle_t &cuBlasHandle(size_t idx = 0) { return m_cublasHandles[idx]; }
192192

193193
/**
194194
* cuSolver handle
195195
* @param idx index of stream
196196
* @return cuSolver handle
197197
*/
198-
cusolverDnHandle_t &cuSolverHandle(size_t idx=0) { return m_cusolverHandles[idx]; }
198+
cusolverDnHandle_t &cuSolverHandle(size_t idx = 0) { return m_cusolverHandles[idx]; }
199199

200200
/**
201201
* Preferred method for CUDA memory allocation; it allocated memory on the device
@@ -206,7 +206,7 @@ public:
206206
* @param s size to be allocated
207207
* @return CUDA error
208208
*/
209-
cudaError_t cudaAllocate(void** d, size_t s) {
209+
cudaError_t cudaAllocate(void **d, size_t s) {
210210
cudaError_t err = cudaMalloc(d, s);
211211
if (err == cudaSuccess) {
212212
m_bytesAllocated += s;
@@ -224,6 +224,26 @@ public:
224224
* @param s allocated bytes (can be negative)
225225
*/
226226
void incrementAllocatedBytes(int s) { m_bytesAllocated += s; }
227+
228+
/**
229+
* Synchronize stream
230+
* @param idx stream index
231+
*/
232+
void synchronizeStream(size_t idx = 0) const {
233+
if (idx >= m_numCublasHandlesStreams) {
234+
throw std::runtime_error("stream index out of range");
235+
}
236+
gpuErrChk(cudaStreamSynchronize(m_cublasStreams[idx]));
237+
}
238+
239+
/**
240+
* Synchronize all streams
241+
*/
242+
void synchronizeAllStreams() const {
243+
for (size_t i = 0; i < m_numCublasHandlesStreams; i++) {
244+
synchronizeStream(i);
245+
}
246+
}
227247
};
228248

229249

@@ -974,41 +994,46 @@ inline float DTensor<float>::dotF(const DTensor<float> &other) {
974994
template<>
975995
inline double DTensor<double>::normF() const {
976996
double the_norm;
977-
gpuErrChk(cublasDnrm2(Session::getInstance().cuBlasHandle(m_idxStream), m_numRows * m_numCols * m_numMats, m_d_data, 1,
978-
&the_norm));
997+
gpuErrChk(
998+
cublasDnrm2(Session::getInstance().cuBlasHandle(m_idxStream), m_numRows * m_numCols * m_numMats, m_d_data, 1,
999+
&the_norm));
9791000
return the_norm;
9801001
}
9811002

9821003
template<>
9831004
inline float DTensor<float>::normF() const {
9841005
float the_norm;
985-
gpuErrChk(cublasSnrm2(Session::getInstance().cuBlasHandle(m_idxStream), m_numRows * m_numCols * m_numMats, m_d_data, 1,
986-
&the_norm));
1006+
gpuErrChk(
1007+
cublasSnrm2(Session::getInstance().cuBlasHandle(m_idxStream), m_numRows * m_numCols * m_numMats, m_d_data, 1,
1008+
&the_norm));
9871009
return the_norm;
9881010
}
9891011

9901012
template<>
9911013
inline float DTensor<float>::sumAbs() const {
9921014
float sumAbsAllElements;
993-
gpuErrChk(cublasSasum(Session::getInstance().cuBlasHandle(m_idxStream), m_numRows * m_numCols * m_numMats, m_d_data, 1,
994-
&sumAbsAllElements));
1015+
gpuErrChk(
1016+
cublasSasum(Session::getInstance().cuBlasHandle(m_idxStream), m_numRows * m_numCols * m_numMats, m_d_data, 1,
1017+
&sumAbsAllElements));
9951018
return sumAbsAllElements;
9961019
}
9971020

9981021
template<>
9991022
inline double DTensor<double>::sumAbs() const {
10001023
double sumAbsAllElements;
1001-
gpuErrChk(cublasDasum(Session::getInstance().cuBlasHandle(m_idxStream), m_numRows * m_numCols * m_numMats, m_d_data, 1,
1002-
&sumAbsAllElements));
1024+
gpuErrChk(
1025+
cublasDasum(Session::getInstance().cuBlasHandle(m_idxStream), m_numRows * m_numCols * m_numMats, m_d_data, 1,
1026+
&sumAbsAllElements));
10031027
return sumAbsAllElements;
10041028
}
10051029

10061030
template<>
10071031
inline float DTensor<float>::maxAbs() const {
10081032
int idx;
10091033
float hostDst;
1010-
gpuErrChk(cublasIsamax(Session::getInstance().cuBlasHandle(m_idxStream), m_numRows * m_numCols * m_numMats, m_d_data, 1,
1011-
&idx));
1034+
gpuErrChk(
1035+
cublasIsamax(Session::getInstance().cuBlasHandle(m_idxStream), m_numRows * m_numCols * m_numMats, m_d_data, 1,
1036+
&idx));
10121037
gpuErrChk(cudaMemcpy(&hostDst, m_d_data + idx - 1, sizeof(float), cudaMemcpyDeviceToHost));
10131038
return std::signbit(hostDst) ? -hostDst : hostDst;
10141039
}
@@ -1017,8 +1042,9 @@ template<>
10171042
inline double DTensor<double>::maxAbs() const {
10181043
int idx;
10191044
double hostDst;
1020-
gpuErrChk(cublasIdamax(Session::getInstance().cuBlasHandle(m_idxStream), m_numRows * m_numCols * m_numMats, m_d_data, 1,
1021-
&idx));
1045+
gpuErrChk(
1046+
cublasIdamax(Session::getInstance().cuBlasHandle(m_idxStream), m_numRows * m_numCols * m_numMats, m_d_data, 1,
1047+
&idx));
10221048
gpuErrChk(cudaMemcpy(&hostDst, m_d_data + idx - 1, sizeof(double), cudaMemcpyDeviceToHost));
10231049
return std::signbit(hostDst) ? -hostDst : hostDst;
10241050
}
@@ -1027,8 +1053,9 @@ template<>
10271053
inline float DTensor<float>::minAbs() const {
10281054
int idx;
10291055
float hostDst;
1030-
gpuErrChk(cublasIsamin(Session::getInstance().cuBlasHandle(m_idxStream), m_numRows * m_numCols * m_numMats, m_d_data, 1,
1031-
&idx));
1056+
gpuErrChk(
1057+
cublasIsamin(Session::getInstance().cuBlasHandle(m_idxStream), m_numRows * m_numCols * m_numMats, m_d_data, 1,
1058+
&idx));
10321059
gpuErrChk(cudaMemcpy(&hostDst, m_d_data + idx - 1, sizeof(float), cudaMemcpyDeviceToHost));
10331060
return std::signbit(hostDst) ? -hostDst : hostDst;
10341061
}
@@ -1037,8 +1064,9 @@ template<>
10371064
inline double DTensor<double>::minAbs() const {
10381065
int idx;
10391066
double hostDst;
1040-
gpuErrChk(cublasIdamin(Session::getInstance().cuBlasHandle(m_idxStream), m_numRows * m_numCols * m_numMats, m_d_data, 1,
1041-
&idx));
1067+
gpuErrChk(
1068+
cublasIdamin(Session::getInstance().cuBlasHandle(m_idxStream), m_numRows * m_numCols * m_numMats, m_d_data, 1,
1069+
&idx));
10421070
gpuErrChk(cudaMemcpy(&hostDst, m_d_data + idx - 1, sizeof(double), cudaMemcpyDeviceToHost));
10431071
return std::signbit(hostDst) ? -hostDst : hostDst;
10441072
}
@@ -1087,7 +1115,7 @@ inline void DTensor<T>::allocateOnDevice(size_t size, bool zero) {
10871115

10881116
if (numMats() > 1) {
10891117
m_doDestroyPtrMatrices = true;
1090-
cudaStatus = Session::getInstance().cudaAllocate((void**) &m_d_ptrMatrices, numMats() * sizeof(T *));
1118+
cudaStatus = Session::getInstance().cudaAllocate((void **) &m_d_ptrMatrices, numMats() * sizeof(T *));
10911119
if (cudaStatus != cudaSuccess) {
10921120
gpuErrChk(cudaFree(m_d_data)); // ... free previously allocated memory
10931121
gpuErrChk(cudaStatus); // ... and memento mori
@@ -1183,7 +1211,8 @@ template<>
11831211
inline DTensor<double> &DTensor<double>::operator*=(double scalar) {
11841212
double alpha = scalar;
11851213
gpuErrChk(
1186-
cublasDscal(Session::getInstance().cuBlasHandle(m_idxStream), m_numRows * m_numCols * m_numMats, &alpha, m_d_data, 1));
1214+
cublasDscal(Session::getInstance().cuBlasHandle(m_idxStream), m_numRows * m_numCols * m_numMats, &alpha,
1215+
m_d_data, 1));
11871216
return *this;
11881217
}
11891218

@@ -1202,15 +1231,17 @@ template<>
12021231
inline DTensor<float> &DTensor<float>::operator*=(float scalar) {
12031232
float alpha = scalar;
12041233
gpuErrChk(
1205-
cublasSscal(Session::getInstance().cuBlasHandle(m_idxStream), m_numRows * m_numCols * m_numMats, &alpha, m_d_data, 1));
1234+
cublasSscal(Session::getInstance().cuBlasHandle(m_idxStream), m_numRows * m_numCols * m_numMats, &alpha,
1235+
m_d_data, 1));
12061236
return *this;
12071237
}
12081238

12091239
template<>
12101240
inline DTensor<double> &DTensor<double>::operator+=(const DTensor<double> &rhs) {
12111241
const double alpha = 1.;
12121242
gpuErrChk(
1213-
cublasDaxpy(Session::getInstance().cuBlasHandle(m_idxStream), m_numRows * m_numCols * m_numMats, &alpha, rhs.m_d_data,
1243+
cublasDaxpy(Session::getInstance().cuBlasHandle(m_idxStream), m_numRows * m_numCols * m_numMats, &alpha, rhs.
1244+
m_d_data,
12141245
1, m_d_data, 1));
12151246
return *this;
12161247
}
@@ -1219,15 +1250,17 @@ template<>
12191250
inline DTensor<float> &DTensor<float>::operator+=(const DTensor<float> &rhs) {
12201251
const float alpha = 1.;
12211252
gpuErrChk(
1222-
cublasSaxpy(Session::getInstance().cuBlasHandle(m_idxStream), m_numRows * m_numCols * m_numMats, &alpha, rhs.m_d_data,
1253+
cublasSaxpy(Session::getInstance().cuBlasHandle(m_idxStream), m_numRows * m_numCols * m_numMats, &alpha, rhs.
1254+
m_d_data,
12231255
1, m_d_data, 1));
12241256
return *this;
12251257
}
12261258

12271259
template<>
12281260
inline DTensor<float> &DTensor<float>::operator-=(const DTensor<float> &rhs) {
12291261
const float alpha = -1.;
1230-
cublasSaxpy(Session::getInstance().cuBlasHandle(m_idxStream), m_numRows * m_numCols * m_numMats, &alpha, rhs.m_d_data, 1,
1262+
cublasSaxpy(Session::getInstance().cuBlasHandle(m_idxStream), m_numRows * m_numCols * m_numMats, &alpha,
1263+
rhs.m_d_data, 1,
12311264
m_d_data, 1);
12321265
return *this;
12331266
}
@@ -1236,7 +1269,8 @@ template<>
12361269
inline DTensor<double> &DTensor<double>::operator-=(const DTensor<double> &rhs) {
12371270
const double alpha = -1.;
12381271
gpuErrChk(
1239-
cublasDaxpy(Session::getInstance().cuBlasHandle(m_idxStream), m_numRows * m_numCols * m_numMats, &alpha, rhs.m_d_data,
1272+
cublasDaxpy(Session::getInstance().cuBlasHandle(m_idxStream), m_numRows * m_numCols * m_numMats, &alpha, rhs.
1273+
m_d_data,
12401274
1, m_d_data, 1));
12411275
return *this;
12421276
}
@@ -2259,7 +2293,8 @@ inline void GivensAnnihilator<T>::annihilate(size_t i, size_t k, size_t j) {
22592293
* Pass cosine and sine as device pointers
22602294
* (Avoid having to download first)
22612295
*/
2262-
gpuErrChk(cublasSetPointerMode(Session::getInstance().cuBlasHandle(m_matrix->streamIdx()), CUBLAS_POINTER_MODE_DEVICE));
2296+
gpuErrChk(
2297+
cublasSetPointerMode(Session::getInstance().cuBlasHandle(m_matrix->streamIdx()), CUBLAS_POINTER_MODE_DEVICE));
22632298

22642299
/* Useful definitions */
22652300
T *aux = m_d_rhyp_cos_sin->raw();
@@ -2272,7 +2307,8 @@ inline void GivensAnnihilator<T>::annihilate(size_t i, size_t k, size_t j) {
22722307
m_matrix->applyLeftGivensRotation(i, k, aux + 1, aux + 2);
22732308

22742309
/* Change back to default behaviour */
2275-
gpuErrChk(cublasSetPointerMode(Session::getInstance().cuBlasHandle(m_matrix->streamIdx()), CUBLAS_POINTER_MODE_HOST));
2310+
gpuErrChk(
2311+
cublasSetPointerMode(Session::getInstance().cuBlasHandle(m_matrix->streamIdx()), CUBLAS_POINTER_MODE_HOST));
22762312
}
22772313

22782314

main.cu

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,8 @@ void xyz() {
2121
std::cout << "Memory: " << std::setprecision(3)
2222
<< (float) Session::getInstance().totalAllocatedBytes() / 1e6
2323
<< " MB" << std::endl;
24+
25+
Session::getInstance().synchronizeAllStreams();
2426
}
2527

2628

0 commit comments

Comments (0)