-
Notifications
You must be signed in to change notification settings - Fork 171
Expand file tree
/
Copy path: cusparse_view.hpp
More file actions
331 lines (268 loc) · 13.5 KB
/
cusparse_view.hpp
File metadata and controls
331 lines (268 loc) · 13.5 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
/* clang-format off */
/*
* SPDX-FileCopyrightText: Copyright (c) 2022-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*/
/* clang-format on */
#pragma once
#include <cuopt/linear_programming/pdlp/pdlp_hyper_params.cuh>
#include <pdlp/pdlp_climber_strategy.hpp>
#include <pdlp/saddle_point.hpp>
#include <mip_heuristics/problem/problem.cuh>
#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_uvector.hpp>
#include <raft/sparse/detail/cusparse_wrappers.h>
#include <raft/core/cusparse_macros.hpp>
#include <cusparse_v2.h>
#define CUDA_VER_13_2_UP (CUDART_VERSION >= 13020)
namespace cuopt::linear_programming::detail {
// RAII wrapper owning a cuSPARSE sparse-matrix descriptor (cusparseSpMatDescr_t).
// i_t is the integer index type, f_t the floating-point value type.
// Copy assignment is deleted; need_destruction_ decides whether the destructor
// releases descr_. NOTE(review): the ownership semantics of the copy
// constructor (share vs. re-own the descriptor) are defined out-of-line —
// confirm in the implementation file.
template <typename i_t, typename f_t>
class cusparse_sp_mat_descr_wrapper_t {
public:
// Default-constructs an empty wrapper; the descriptor is created later via create().
cusparse_sp_mat_descr_wrapper_t();
// Destroys the underlying descriptor when this instance owns it.
~cusparse_sp_mat_descr_wrapper_t();
cusparse_sp_mat_descr_wrapper_t(const cusparse_sp_mat_descr_wrapper_t& other);
cusparse_sp_mat_descr_wrapper_t& operator=(const cusparse_sp_mat_descr_wrapper_t& other) = delete;
// Creates the descriptor for an m x n matrix with nnz non-zeros.
// offsets/indices/values are device pointers; the (offsets, indices, values)
// layout suggests CSR — TODO(review): confirm against the out-of-line definition.
void create(int64_t m, int64_t n, int64_t nnz, i_t* offsets, i_t* indices, f_t* values);
// Implicit conversion so the wrapper can be passed directly to cuSPARSE APIs.
operator cusparseSpMatDescr_t() const;
private:
cusparseSpMatDescr_t descr_;  // handle conditionally owned by this wrapper
bool need_destruction_;       // true when the destructor must destroy descr_
};
// RAII wrapper owning a cuSPARSE dense-vector descriptor (cusparseDnVecDescr_t)
// over f_t-valued device data. Move-assignable; copy assignment is deleted.
template <typename f_t>
class cusparse_dn_vec_descr_wrapper_t {
public:
// Default-constructs an empty wrapper; the descriptor is created later via create().
cusparse_dn_vec_descr_wrapper_t();
// Destroys the underlying descriptor when this instance owns it.
~cusparse_dn_vec_descr_wrapper_t();
cusparse_dn_vec_descr_wrapper_t(const cusparse_dn_vec_descr_wrapper_t& other);
// Move assignment; expected to transfer ownership of descr_ — defined out-of-line.
cusparse_dn_vec_descr_wrapper_t& operator=(cusparse_dn_vec_descr_wrapper_t&& other);
cusparse_dn_vec_descr_wrapper_t& operator=(const cusparse_dn_vec_descr_wrapper_t& other) = delete;
// Creates the descriptor over `size` elements located at device pointer `values`.
void create(int64_t size, f_t* values);
// Implicit conversion for direct use in cuSPARSE API calls.
operator cusparseDnVecDescr_t() const;
private:
cusparseDnVecDescr_t descr_;  // handle conditionally owned by this wrapper
bool need_destruction_;       // true when the destructor must destroy descr_
};
// RAII wrapper owning a cuSPARSE dense-matrix descriptor (cusparseDnMatDescr_t)
// over f_t-valued device data. Move-assignable; copy assignment is deleted.
template <typename f_t>
class cusparse_dn_mat_descr_wrapper_t {
public:
// Default-constructs an empty wrapper; the descriptor is created later via create().
cusparse_dn_mat_descr_wrapper_t();
// Destroys the underlying descriptor when this instance owns it.
~cusparse_dn_mat_descr_wrapper_t();
cusparse_dn_mat_descr_wrapper_t(const cusparse_dn_mat_descr_wrapper_t& other);
// Move assignment; expected to transfer ownership of descr_ — defined out-of-line.
cusparse_dn_mat_descr_wrapper_t& operator=(cusparse_dn_mat_descr_wrapper_t&& other);
cusparse_dn_mat_descr_wrapper_t& operator=(const cusparse_dn_mat_descr_wrapper_t& other) = delete;
// Creates a row x col descriptor with leading dimension ld over device pointer
// `values`; `order` selects row- or column-major layout (CUSPARSE_ORDER_*).
void create(int64_t row, int64_t col, int64_t ld, f_t* values, cusparseOrder_t order);
// Implicit conversion for direct use in cuSPARSE API calls.
operator cusparseDnMatDescr_t() const;
private:
cusparseDnMatDescr_t descr_;  // handle conditionally owned by this wrapper
bool need_destruction_;       // true when the destructor must destroy descr_
};
#if CUDA_VER_13_2_UP
// RAII wrapper owning a cusparseSpMVOpDescr_t (fused SpMV operation descriptor;
// this class is only compiled for CUDA >= 13.2, see CUDA_VER_13_2_UP).
// Move-assignable; copy assignment is deleted.
class cusparse_spmvop_descr_wrapper_t {
public:
// Default-constructs an empty wrapper; the descriptor is created later via create().
cusparse_spmvop_descr_wrapper_t();
// Destroys the underlying descriptor when this instance owns it.
~cusparse_spmvop_descr_wrapper_t();
cusparse_spmvop_descr_wrapper_t(const cusparse_spmvop_descr_wrapper_t& other);
cusparse_spmvop_descr_wrapper_t& operator=(cusparse_spmvop_descr_wrapper_t&& other);
cusparse_spmvop_descr_wrapper_t& operator=(const cusparse_spmvop_descr_wrapper_t& other) = delete;
// Initializes the fused-SpMV descriptor for op(A) with dense vectors X, Y, Z
// and an external workspace `buffer`.
// NOTE(review): the exact roles of vecY/vecZ in the fused epilogue are fixed
// by the out-of-line definition — confirm there before relying on them.
void create(cusparseHandle_t handle,
cusparseOperation_t opA,
cusparseSpMatDescr_t matA,
cusparseDnVecDescr_t vecX,
cusparseDnVecDescr_t vecY,
cusparseDnVecDescr_t vecZ,
cudaDataType computeType,
void* buffer);
// Implicit conversion for direct use in cuSPARSE API calls.
operator cusparseSpMVOpDescr_t() const;
private:
cusparseSpMVOpDescr_t descr_;  // handle conditionally owned by this wrapper
bool need_destruction_;        // true when the destructor must destroy descr_
};
// RAII wrapper owning a cusparseSpMVOpPlan_t (execution plan for a fused SpMV
// operation; only compiled for CUDA >= 13.2, see CUDA_VER_13_2_UP).
// Move-assignable; copy assignment is deleted.
class cusparse_spmvop_plan_wrapper_t {
public:
// Default-constructs an empty wrapper; the plan is created later via create().
cusparse_spmvop_plan_wrapper_t();
// Destroys the underlying plan when this instance owns it.
~cusparse_spmvop_plan_wrapper_t();
cusparse_spmvop_plan_wrapper_t(const cusparse_spmvop_plan_wrapper_t& other);
cusparse_spmvop_plan_wrapper_t& operator=(cusparse_spmvop_plan_wrapper_t&& other);
cusparse_spmvop_plan_wrapper_t& operator=(const cusparse_spmvop_plan_wrapper_t& other) = delete;
// Builds the plan from a previously created operation descriptor.
// lto_buffer/lto_buffer_size: presumably JIT-LTO code used by the plan —
// TODO(review): confirm against the out-of-line definition.
void create(cusparseHandle_t handle,
cusparseSpMVOpDescr_t descr,
char* lto_buffer,
size_t lto_buffer_size);
// Implicit conversion for direct use in cuSPARSE API calls.
operator cusparseSpMVOpPlan_t() const;
private:
cusparseSpMVOpPlan_t plan_;  // handle conditionally owned by this wrapper
bool need_destruction_;      // true when the destructor must destroy plan_
};
#endif
// Aggregates the cuSPARSE descriptors (sparse matrices A / A_T, dense vectors,
// dense batch matrices) and the reusable scratch buffers that PDLP needs to
// run SpMV/SpMM against the constraint matrix and its transpose.
template <typename i_t, typename f_t>
class cusparse_view_t {
public:
// Full view over a PDLP saddle-point state, with optional reflected primal
// solution and optional mixed-precision SpMV support.
cusparse_view_t(raft::handle_t const* handle_ptr,
const problem_t<i_t, f_t>& op_problem,
saddle_point_state_t<i_t, f_t>& current_saddle_point_state,
rmm::device_uvector<f_t>& _tmp_primal,
rmm::device_uvector<f_t>& _tmp_dual,
rmm::device_uvector<f_t>& _potential_next_dual_solution,
rmm::device_uvector<f_t>& _reflected_primal_solution,
const std::vector<pdlp_climber_strategy_t>& climber_strategies,
const pdlp_hyper_params::pdlp_hyper_params_t& hyper_params,
bool enable_mixed_precision_spmv);
// Variant that binds explicit solution/gradient vectors and an explicitly
// provided transposed matrix (values/offsets/indices in CSR-style arrays).
cusparse_view_t(raft::handle_t const* handle_ptr,
const problem_t<i_t, f_t>& op_problem,
rmm::device_uvector<f_t>& _primal_solution,
rmm::device_uvector<f_t>& _dual_solution,
rmm::device_uvector<f_t>& _tmp_primal,
rmm::device_uvector<f_t>& _tmp_dual,
rmm::device_uvector<f_t>& _potential_next_primal,
rmm::device_uvector<f_t>& _potential_next_dual,
const rmm::device_uvector<f_t>& _A_T,
const rmm::device_uvector<i_t>& _A_T_offsets,
const rmm::device_uvector<i_t>& _A_T_indices,
const std::vector<pdlp_climber_strategy_t>& climber_strategies,
const pdlp_hyper_params::pdlp_hyper_params_t& hyper_params);
// Builds a view that reuses the descriptors of an existing view but binds
// different raw device pointers for the solutions and gradients.
cusparse_view_t(raft::handle_t const* handle_ptr,
const problem_t<i_t, f_t>& op_problem,
const cusparse_view_t<i_t, f_t>& existing_cusparse_view,
f_t* _primal_solution,
f_t* _dual_solution,
f_t* _primal_gradient,
f_t* _dual_gradient);
// Minimal constructor: only binds the const references below (A_T_,
// A_T_offsets_, A_T_indices_, climber_strategies_) so the object is valid.
cusparse_view_t(raft::handle_t const* handle_ptr,
const rmm::device_uvector<f_t>&, // Empty just to init the const&
const rmm::device_uvector<i_t>&, // Empty just to init the const&
const std::vector<pdlp_climber_strategy_t>&); // Empty just to init the const&
// True when this view drives batched (SpMM) computations.
const bool batch_mode_{false};
// Non-owning RAFT handle used for streams and cuSPARSE handle access.
raft::handle_t const* handle_ptr_{nullptr};
// cusparse view of linear program
cusparse_sp_mat_descr_wrapper_t<i_t, f_t> A;
cusparse_sp_mat_descr_wrapper_t<i_t, f_t> A_T;
cusparse_dn_vec_descr_wrapper_t<f_t> c;
// cusparse view of solutions
cusparse_dn_vec_descr_wrapper_t<f_t> primal_solution;
cusparse_dn_vec_descr_wrapper_t<f_t> dual_solution;
// cusparse view of gradients
cusparse_dn_vec_descr_wrapper_t<f_t> primal_gradient;
cusparse_dn_vec_descr_wrapper_t<f_t> dual_gradient;
// cusparse view of batch gradients
cusparse_dn_mat_descr_wrapper_t<f_t> batch_dual_gradients;
// cusparse view of batch solutions
cusparse_dn_mat_descr_wrapper_t<f_t> batch_primal_solutions;
cusparse_dn_mat_descr_wrapper_t<f_t> batch_dual_solutions;
cusparse_dn_mat_descr_wrapper_t<f_t> batch_potential_next_dual_solution;
cusparse_dn_mat_descr_wrapper_t<f_t> batch_next_AtYs;
cusparse_dn_mat_descr_wrapper_t<f_t> batch_tmp_duals;
cusparse_dn_mat_descr_wrapper_t<f_t> batch_reflected_primal_solutions;
cusparse_dn_mat_descr_wrapper_t<f_t> batch_delta_primal_solutions;
cusparse_dn_mat_descr_wrapper_t<f_t> batch_delta_dual_solutions;
// cusparse view of At * Y batch computation
cusparse_dn_mat_descr_wrapper_t<f_t> batch_current_AtYs;
// cusparse view of auxiliary space needed for some spmm computations
cusparse_dn_mat_descr_wrapper_t<f_t> batch_tmp_primals;
// cusparse view of At * Y computation
cusparse_dn_vec_descr_wrapper_t<f_t>
current_AtY; // Only used at very first iteration and after each restart to average
cusparse_dn_vec_descr_wrapper_t<f_t>
next_AtY; // Next value is swapped out with current after each valid PDHG
// step to save the first AtY SpMV in compute next primal
cusparse_dn_vec_descr_wrapper_t<f_t> potential_next_dual_solution;
// cusparse view of auxiliary space needed for some spmv computations
cusparse_dn_vec_descr_wrapper_t<f_t> tmp_primal;
cusparse_dn_vec_descr_wrapper_t<f_t> tmp_dual;
// reuse buffers for cusparse spmv
rmm::device_uvector<uint8_t> buffer_non_transpose;
rmm::device_uvector<uint8_t> buffer_transpose;
// SpMVOp buffers for A and A_T
// NOTE(review): these default member initializers dereference handle_ptr_,
// whose own default is nullptr. They are only safe if every constructor sets
// handle_ptr_ before these members are initialized (members initialize in
// declaration order, and handle_ptr_ is declared above) — verify that no
// constructor leaves handle_ptr_ at its nullptr default.
rmm::device_uvector<uint8_t> buffer_non_transpose_spmvop{0, handle_ptr_->get_stream()};
rmm::device_uvector<uint8_t> buffer_transpose_spmvop{0, handle_ptr_->get_stream()};
#if CUDA_VER_13_2_UP
// SpMVOp descriptors and plans for A and A_T (descr before plan so dtor destroys plan first)
cusparse_spmvop_descr_wrapper_t spmv_op_descr_A_;
cusparse_spmvop_plan_wrapper_t spmv_op_plan_A_;
cusparse_spmvop_descr_wrapper_t spmv_op_descr_A_t_;
cusparse_spmvop_plan_wrapper_t spmv_op_plan_A_t_;
#endif
// reuse buffers for cusparse spmm
rmm::device_uvector<uint8_t> buffer_transpose_batch;
rmm::device_uvector<uint8_t> buffer_non_transpose_batch;
rmm::device_uvector<uint8_t> buffer_transpose_batch_row_row_;
rmm::device_uvector<uint8_t> buffer_non_transpose_batch_row_row_;
// Only when using reflection
cusparse_dn_vec_descr_wrapper_t<f_t> reflected_primal_solution;
// Ref to the A_T found in either
// Initial problem, we use it to have an unscaled A_T
// PDLP copy of the problem which holds the scaled version
// This works under the assumption that while PDLP is optimizing a problem, the original problem
// is never modified by anyone (including MIP)
const rmm::device_uvector<f_t>& A_T_;
const rmm::device_uvector<i_t>& A_T_offsets_;
const rmm::device_uvector<i_t>& A_T_indices_;
// original A non-transpose matrix
const rmm::device_uvector<f_t>& A_;
const rmm::device_uvector<i_t>& A_offsets_;
const rmm::device_uvector<i_t>& A_indices_;
const std::vector<pdlp_climber_strategy_t>& climber_strategies_;
// Mixed precision SpMV support (FP32 matrix with FP64 vectors/compute)
// Only used when mixed_precision_enabled_ is true and f_t = double
rmm::device_uvector<float> A_float_;    // FP32 copy of A values
rmm::device_uvector<float> A_T_float_;  // FP32 copy of A_T values
cusparse_sp_mat_descr_wrapper_t<i_t, float> A_mixed_;    // FP32 matrix descriptor for A
cusparse_sp_mat_descr_wrapper_t<i_t, float> A_T_mixed_;  // FP32 matrix descriptor for A_T
rmm::device_uvector<uint8_t> buffer_non_transpose_mixed_;  // SpMV buffer for mixed precision A
rmm::device_uvector<uint8_t> buffer_transpose_mixed_;      // SpMV buffer for mixed precision A_T
bool mixed_precision_enabled_{false};
// Update FP32 matrix copies after scaling (must be called after scale_problem())
void update_mixed_precision_matrices();
// Creates SpMVOp plans. Must be called after scale_problem() so plans use the scaled matrix.
void create_spmv_op_plans(bool is_reflected);
};
// Mixed precision SpMV: FP32 matrix with FP64 vectors and FP64 compute type.
// Computes op(A) * x scaled by alpha and accumulated into y scaled by beta
// (the usual cuSPARSE SpMV contract — confirm against the definition), on the
// given stream using the provided external workspace.
void mixed_precision_spmv(cusparseHandle_t handle,
cusparseOperation_t opA,
const double* alpha,
cusparseSpMatDescr_t matA, // FP32 matrix
cusparseDnVecDescr_t vecX, // FP64 vector
const double* beta,
cusparseDnVecDescr_t vecY, // FP64 vector
cusparseSpMVAlg_t alg,
void* externalBuffer,
cudaStream_t stream);
// Returns the size in bytes of the external workspace that mixed_precision_spmv
// requires for the same operation/operands (mirrors cusparseSpMV_bufferSize).
size_t mixed_precision_spmv_buffersize(cusparseHandle_t handle,
cusparseOperation_t opA,
const double* alpha,
cusparseSpMatDescr_t matA, // FP32 matrix
cusparseDnVecDescr_t vecX, // FP64 vector
const double* beta,
cusparseDnVecDescr_t vecY, // FP64 vector
cusparseSpMVAlg_t alg,
cudaStream_t stream);
#if CUDA_VER_12_4_UP
// One-time preprocessing for the mixed-precision SpMV path (mirrors
// cusparseSpMV_preprocess, available from CUDA 12.4).
// NOTE(review): CUDA_VER_12_4_UP is not defined in this header (only
// CUDA_VER_13_2_UP is, at the top of the file). If no included header defines
// it, the preprocessor evaluates it as 0 and this declaration is silently
// compiled out — confirm where the macro comes from.
void mixed_precision_spmv_preprocess(cusparseHandle_t handle,
cusparseOperation_t opA,
const double* alpha,
cusparseSpMatDescr_t matA, // FP32 matrix
cusparseDnVecDescr_t vecX, // FP64 vector
const double* beta,
cusparseDnVecDescr_t vecY, // FP64 vector
cusparseSpMVAlg_t alg,
void* externalBuffer,
cudaStream_t stream);
#endif
#if CUDA_VER_12_4_UP
// SpMM preprocessing wrapper (mirrors cusparseSpMM_preprocess, available from
// CUDA 12.4), restricted via SFINAE to T = float or T = double.
// NOTE(review): CUDA_VER_12_4_UP is not defined in this header — see the note
// on mixed_precision_spmv_preprocess above; confirm an included header
// provides it.
template <
typename T,
typename std::enable_if_t<std::is_same_v<T, float> || std::is_same_v<T, double>>* = nullptr>
void my_cusparsespmm_preprocess(cusparseHandle_t handle,
cusparseOperation_t opA,
cusparseOperation_t opB,
const T* alpha,
const cusparseSpMatDescr_t matA,
const cusparseDnMatDescr_t matB,
const T* beta,
const cusparseDnMatDescr_t matC,
cusparseSpMMAlg_t alg,
void* externalBuffer,
cudaStream_t stream);
#endif
// Returns whether the cuSPARSE runtime in use supports the mixed-precision
// (FP32 matrix / FP64 compute) SpMV path — presumably a runtime version check;
// confirm in the out-of-line definition.
bool is_cusparse_runtime_mixed_precision_supported();
} // namespace cuopt::linear_programming::detail