@@ -156,7 +156,7 @@ private:
156156 m_cublasHandles.resize (m_numCublasHandlesStreams);
157157 m_cublasStreams.resize (m_numCublasHandlesStreams);
158158 m_cusolverHandles.resize (m_numCublasHandlesStreams);
159- for (size_t i= 0 ; i< m_numCublasHandlesStreams; i++) {
159+ for (size_t i = 0 ; i < m_numCublasHandlesStreams; i++) {
160160 gpuErrChk (cublasCreate (&m_cublasHandles[i]));
161161 gpuErrChk (cudaStreamCreate (&m_cublasStreams[i]));
162162 gpuErrChk (cublasSetStream (m_cublasHandles[i], m_cublasStreams[i]));
@@ -166,7 +166,7 @@ private:
166166 }
167167
168168 ~Session () {
169- for (size_t i= 0 ; i< m_numCublasHandlesStreams; i++) {
169+ for (size_t i = 0 ; i < m_numCublasHandlesStreams; i++) {
170170 gpuErrChk (cublasDestroy (m_cublasHandles[i]));
171171 gpuErrChk (cusolverDnDestroy (m_cusolverHandles[i]));
172172 }
@@ -188,14 +188,14 @@ public:
188188 * @param idx index of stream
189189 * @return cuBLAS handle
190190 */
191- cublasHandle_t &cuBlasHandle (size_t idx= 0 ) { return m_cublasHandles[idx]; }
191+ cublasHandle_t &cuBlasHandle (size_t idx = 0 ) { return m_cublasHandles[idx]; }
192192
193193 /* *
194194 * cuSolver handle
195195 * @param idx index of stream
196196 * @return cuSolver handle
197197 */
198- cusolverDnHandle_t &cuSolverHandle (size_t idx= 0 ) { return m_cusolverHandles[idx]; }
198+ cusolverDnHandle_t &cuSolverHandle (size_t idx = 0 ) { return m_cusolverHandles[idx]; }
199199
200200 /* *
201201 * Preferred method for CUDA memory allocation; it allocated memory on the device
@@ -206,7 +206,7 @@ public:
206206 * @param s size to be allocated
207207 * @return CUDA error
208208 */
209- cudaError_t cudaAllocate (void ** d, size_t s) {
209+ cudaError_t cudaAllocate (void ** d, size_t s) {
210210 cudaError_t err = cudaMalloc (d, s);
211211 if (err == cudaSuccess) {
212212 m_bytesAllocated += s;
@@ -224,6 +224,26 @@ public:
224224 * @param s allocated bytes (can be negative)
225225 */
226226 void incrementAllocatedBytes (int s) { m_bytesAllocated += s; }
227+
228+ /* *
229+ * Synchronize stream
230+ * @param idx stream index
231+ */
232+ void synchronizeStream (size_t idx = 0 ) const {
233+ if (idx >= m_numCublasHandlesStreams) {
234+ throw std::runtime_error (" stream index out of range" );
235+ }
236+ gpuErrChk (cudaStreamSynchronize (m_cublasStreams[idx]));
237+ }
238+
239+ /* *
240+ * Synchronize all streams
241+ */
242+ void synchronizeAllStreams () const {
243+ for (size_t i = 0 ; i < m_numCublasHandlesStreams; i++) {
244+ synchronizeStream (i);
245+ }
246+ }
227247};
228248
229249
@@ -974,41 +994,46 @@ inline float DTensor<float>::dotF(const DTensor<float> &other) {
template<>
inline double DTensor<double>::normF() const {
    /* Frobenius norm: Euclidean norm over every element of every matrix,
     * computed on the device via cuBLAS nrm2. */
    const size_t numel = m_numRows * m_numCols * m_numMats;
    double nrm;
    gpuErrChk(cublasDnrm2(Session::getInstance().cuBlasHandle(m_idxStream),
                          numel, m_d_data, 1, &nrm));
    return nrm;
}
9811002
template<>
inline float DTensor<float>::normF() const {
    /* Frobenius norm: Euclidean norm over every element of every matrix,
     * computed on the device via cuBLAS nrm2. */
    const size_t numel = m_numRows * m_numCols * m_numMats;
    float nrm;
    gpuErrChk(cublasSnrm2(Session::getInstance().cuBlasHandle(m_idxStream),
                          numel, m_d_data, 1, &nrm));
    return nrm;
}
9891011
template<>
inline float DTensor<float>::sumAbs() const {
    /* Sum of absolute values over all elements of all matrices (cuBLAS asum). */
    const size_t numel = m_numRows * m_numCols * m_numMats;
    float total;
    gpuErrChk(cublasSasum(Session::getInstance().cuBlasHandle(m_idxStream),
                          numel, m_d_data, 1, &total));
    return total;
}
9971020
template<>
inline double DTensor<double>::sumAbs() const {
    /* Sum of absolute values over all elements of all matrices (cuBLAS asum). */
    const size_t numel = m_numRows * m_numCols * m_numMats;
    double total;
    gpuErrChk(cublasDasum(Session::getInstance().cuBlasHandle(m_idxStream),
                          numel, m_d_data, 1, &total));
    return total;
}
10051029
template<>
inline float DTensor<float>::maxAbs() const {
    /* Find the element of largest magnitude and fetch it from the device.
     * Note: cuBLAS amax returns a 1-based index (Fortran convention). */
    const size_t numel = m_numRows * m_numCols * m_numMats;
    int iMax;
    gpuErrChk(cublasIsamax(Session::getInstance().cuBlasHandle(m_idxStream),
                           numel, m_d_data, 1, &iMax));
    float winner;
    gpuErrChk(cudaMemcpy(&winner, m_d_data + iMax - 1, sizeof(float), cudaMemcpyDeviceToHost));
    return std::signbit(winner) ? -winner : winner;
}
template<>
inline double DTensor<double>::maxAbs() const {
    /* Find the element of largest magnitude and fetch it from the device.
     * Note: cuBLAS amax returns a 1-based index (Fortran convention). */
    const size_t numel = m_numRows * m_numCols * m_numMats;
    int iMax;
    gpuErrChk(cublasIdamax(Session::getInstance().cuBlasHandle(m_idxStream),
                           numel, m_d_data, 1, &iMax));
    double winner;
    gpuErrChk(cudaMemcpy(&winner, m_d_data + iMax - 1, sizeof(double), cudaMemcpyDeviceToHost));
    return std::signbit(winner) ? -winner : winner;
}
template<>
inline float DTensor<float>::minAbs() const {
    /* Find the element of smallest magnitude and fetch it from the device.
     * Note: cuBLAS amin returns a 1-based index (Fortran convention). */
    const size_t numel = m_numRows * m_numCols * m_numMats;
    int iMin;
    gpuErrChk(cublasIsamin(Session::getInstance().cuBlasHandle(m_idxStream),
                           numel, m_d_data, 1, &iMin));
    float winner;
    gpuErrChk(cudaMemcpy(&winner, m_d_data + iMin - 1, sizeof(float), cudaMemcpyDeviceToHost));
    return std::signbit(winner) ? -winner : winner;
}
template<>
inline double DTensor<double>::minAbs() const {
    /* Find the element of smallest magnitude and fetch it from the device.
     * Note: cuBLAS amin returns a 1-based index (Fortran convention). */
    const size_t numel = m_numRows * m_numCols * m_numMats;
    int iMin;
    gpuErrChk(cublasIdamin(Session::getInstance().cuBlasHandle(m_idxStream),
                           numel, m_d_data, 1, &iMin));
    double winner;
    gpuErrChk(cudaMemcpy(&winner, m_d_data + iMin - 1, sizeof(double), cudaMemcpyDeviceToHost));
    return std::signbit(winner) ? -winner : winner;
}
@@ -1087,7 +1115,7 @@ inline void DTensor<T>::allocateOnDevice(size_t size, bool zero) {
10871115
10881116 if (numMats () > 1 ) {
10891117 m_doDestroyPtrMatrices = true ;
1090- cudaStatus = Session::getInstance ().cudaAllocate ((void **) &m_d_ptrMatrices, numMats () * sizeof (T *));
1118+ cudaStatus = Session::getInstance ().cudaAllocate ((void **) &m_d_ptrMatrices, numMats () * sizeof (T *));
10911119 if (cudaStatus != cudaSuccess) {
10921120 gpuErrChk (cudaFree (m_d_data)); // ... free previously allocated memory
10931121 gpuErrChk (cudaStatus); // ... and memento mori
template<>
inline DTensor<double> &DTensor<double>::operator*=(double scalar) {
    /* In-place scaling of every element by `scalar` (cuBLAS scal). */
    double alpha = scalar;
    gpuErrChk(cublasDscal(Session::getInstance().cuBlasHandle(m_idxStream),
                          m_numRows * m_numCols * m_numMats, &alpha, m_d_data, 1));
    return *this;
}
11891218
template<>
inline DTensor<float> &DTensor<float>::operator*=(float scalar) {
    /* In-place scaling of every element by `scalar` (cuBLAS scal). */
    float alpha = scalar;
    gpuErrChk(cublasSscal(Session::getInstance().cuBlasHandle(m_idxStream),
                          m_numRows * m_numCols * m_numMats, &alpha, m_d_data, 1));
    return *this;
}
12081238
template<>
inline DTensor<double> &DTensor<double>::operator+=(const DTensor<double> &rhs) {
    /* Elementwise in-place addition: this := this + rhs (axpy with alpha = 1). */
    const double alpha = 1.;
    gpuErrChk(cublasDaxpy(Session::getInstance().cuBlasHandle(m_idxStream),
                          m_numRows * m_numCols * m_numMats, &alpha,
                          rhs.m_d_data, 1, m_d_data, 1));
    return *this;
}
template<>
inline DTensor<float> &DTensor<float>::operator+=(const DTensor<float> &rhs) {
    /* Elementwise in-place addition: this := this + rhs (axpy with alpha = 1). */
    const float alpha = 1.;
    gpuErrChk(cublasSaxpy(Session::getInstance().cuBlasHandle(m_idxStream),
                          m_numRows * m_numCols * m_numMats, &alpha,
                          rhs.m_d_data, 1, m_d_data, 1));
    return *this;
}
12261258
12271259template <>
12281260inline DTensor<float > &DTensor<float >::operator -=(const DTensor<float > &rhs) {
12291261 const float alpha = -1 .;
1230- cublasSaxpy (Session::getInstance ().cuBlasHandle (m_idxStream), m_numRows * m_numCols * m_numMats, &alpha, rhs.m_d_data , 1 ,
1262+ cublasSaxpy (Session::getInstance ().cuBlasHandle (m_idxStream), m_numRows * m_numCols * m_numMats, &alpha,
1263+ rhs.m_d_data , 1 ,
12311264 m_d_data, 1 );
12321265 return *this ;
12331266}
template<>
inline DTensor<double> &DTensor<double>::operator-=(const DTensor<double> &rhs) {
    /* Elementwise in-place subtraction: this := this - rhs (axpy with alpha = -1). */
    const double alpha = -1.;
    gpuErrChk(cublasDaxpy(Session::getInstance().cuBlasHandle(m_idxStream),
                          m_numRows * m_numCols * m_numMats, &alpha,
                          rhs.m_d_data, 1, m_d_data, 1));
    return *this;
}
@@ -2259,7 +2293,8 @@ inline void GivensAnnihilator<T>::annihilate(size_t i, size_t k, size_t j) {
22592293 * Pass cosine and sine as device pointers
22602294 * (Avoid having to download first)
22612295 */
2262- gpuErrChk (cublasSetPointerMode (Session::getInstance ().cuBlasHandle (m_matrix->streamIdx ()), CUBLAS_POINTER_MODE_DEVICE));
2296+ gpuErrChk (
2297+ cublasSetPointerMode (Session::getInstance ().cuBlasHandle (m_matrix->streamIdx ()), CUBLAS_POINTER_MODE_DEVICE));
22632298
22642299 /* Useful definitions */
22652300 T *aux = m_d_rhyp_cos_sin->raw ();
@@ -2272,7 +2307,8 @@ inline void GivensAnnihilator<T>::annihilate(size_t i, size_t k, size_t j) {
22722307 m_matrix->applyLeftGivensRotation (i, k, aux + 1 , aux + 2 );
22732308
22742309 /* Change back to default behaviour */
2275- gpuErrChk (cublasSetPointerMode (Session::getInstance ().cuBlasHandle (m_matrix->streamIdx ()), CUBLAS_POINTER_MODE_HOST));
2310+ gpuErrChk (
2311+ cublasSetPointerMode (Session::getInstance ().cuBlasHandle (m_matrix->streamIdx ()), CUBLAS_POINTER_MODE_HOST));
22762312}
22772313
22782314
0 commit comments