Skip to content

Commit b796515

Browse files
committed
Remove CUDA use from top-level scope
Move GPU_blocks from a module-level constant to a local variable computed inside each function that launches a kernel
1 parent 1f5326f commit b796515

2 files changed

Lines changed: 29 additions & 5 deletions

File tree

src/BatchPDLP.jl

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -30,11 +30,6 @@ module BatchPDLP
3030
import CUDA: CuArray, sync_threads, unsafe_load, threadIdx, blockIdx, blockDim, gridDim
3131
import CUDA: @cuda, @cuDynamicSharedMem, @cuStaticSharedMem
3232

33-
# This constant checks to see how many streaming multiprocessors the user's GPU has,
34-
# and uses this number to determine block sizes for all CUDA kernels
35-
const GPU_blocks = Int32(CUDA.attribute(CUDA.device(), CUDA.DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT))
36-
export GPU_blocks
37-
3833
# Export the main struct and the PDLP function itself
3934
export PDLPData, PDLP
4035

src/lower_level_subroutines.jl

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,9 @@ function ruiz_rescaling(
99
dims::PDLPDims,
1010
)
1111

12+
# Identify the number of blocks to use
13+
GPU_blocks = Int32(CUDA.attribute(CUDA.device(), CUDA.DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT))
14+
1215
# Set up temporary variables for intermediate scaling results
1316
temp_variable_rescaling = CuArray{Float64}(undef, size(variable_rescaling))
1417
temp_constraint_rescaling = CuArray{Float64}(undef, size(constraint_rescaling))
@@ -57,6 +60,9 @@ function pock_chambolle_rescaling(
5760
dims::PDLPDims,
5861
)
5962

63+
# Identify the number of blocks to use
64+
GPU_blocks = Int32(CUDA.attribute(CUDA.device(), CUDA.DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT))
65+
6066
# Preallocate space for intermediate rescaling terms
6167
temp_variable_rescaling = CuArray{Float64}(undef, size(variable_rescaling))
6268
temp_constraint_rescaling = CuArray{Float64}(undef, size(constraint_rescaling))
@@ -102,6 +108,9 @@ function scale_problem(
102108
dims::PDLPDims,
103109
)
104110

111+
# Identify the number of blocks to use
112+
GPU_blocks = Int32(CUDA.attribute(CUDA.device(), CUDA.DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT))
113+
105114
# Perform the following steps:
106115
# 1) problem.objective_vector = problem.objective_vector ./ variable_rescaling
107116
# 2) problem.variable_lower_bound = problem.variable_lower_bound .* variable_rescaling
@@ -140,6 +149,7 @@ function select_initial_primal_weight(
140149
# Theoretically the primal importance can change, but the default in the MOI_wrapper
141150
# is to set it to 1.0. The other parameters are un-settable in cuPDLP but theoretically
142151
# could be changed as well
152+
GPU_blocks = Int32(CUDA.attribute(CUDA.device(), CUDA.DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT))
143153
CUDA.@sync @cuda blocks=GPU_blocks threads=512 primal_weight_kernel(
144154
primal_weight,
145155
problem.objective_vector,
@@ -152,6 +162,7 @@ function select_initial_primal_weight(
152162
end
153163

154164
function update_step_size(problem::LinearProgramSet, step_size::CuArray{Float64}, dims::PDLPDims)
165+
GPU_blocks = Int32(CUDA.attribute(CUDA.device(), CUDA.DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT))
155166
CUDA.@sync @cuda blocks=GPU_blocks threads=512 group_max_kernel(
156167
step_size,
157168
problem.constraint_matrix,
@@ -177,6 +188,9 @@ function add_LP_objective_constraint(
177188
active_constraint::CuArray{Bool},
178189
dims::PDLPDims
179190
)
191+
192+
# Identify the number of blocks to use
193+
GPU_blocks = Int32(CUDA.attribute(CUDA.device(), CUDA.DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT))
180194

181195
# Add the constraint
182196
CUDA.@sync @cuda blocks=GPU_blocks threads=256 add_LP_constraint_kernel(
@@ -210,6 +224,9 @@ function add_LP_constraint(
210224
dims::PDLPDims;
211225
geq::Bool=true
212226
)
227+
228+
# Identify the number of blocks to use
229+
GPU_blocks = Int32(CUDA.attribute(CUDA.device(), CUDA.DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT))
213230

214231
# Add the constraint
215232
CUDA.@sync @cuda blocks=GPU_blocks threads=256 add_LP_constraint_kernel(
@@ -240,6 +257,9 @@ function add_LP_lower_bound(
240257
active_constraint::CuArray{Bool},
241258
dims::PDLPDims
242259
)
260+
261+
# Identify the number of blocks to use
262+
GPU_blocks = Int32(CUDA.attribute(CUDA.device(), CUDA.DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT))
243263

244264
# Add the constraint
245265
CUDA.@sync @cuda blocks=GPU_blocks threads=256 add_LP_lower_bound_kernel(
@@ -351,6 +371,9 @@ function add_best_obj_LP_constraints(
351371
n_points::Int32,
352372
num_linearizations::Int,
353373
)
374+
375+
# Identify the number of blocks to use
376+
GPU_blocks = Int32(CUDA.attribute(CUDA.device(), CUDA.DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT))
354377

355378
# Add the best constraint, based on the comparison vector, num_linearizations times
356379
CUDA.@sync @cuda blocks=GPU_blocks threads=256 add_best_obj_LP_constraints_kernel(
@@ -393,6 +416,9 @@ function add_best_cons_LP_constraints(
393416
num_linearizations::Int;
394417
geq::Bool=true,
395418
)
419+
420+
# Identify the number of blocks to use
421+
GPU_blocks = Int32(CUDA.attribute(CUDA.device(), CUDA.DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT))
396422

397423
# Add the best constraint, based on the comparison vector, num_linearizations times
398424
CUDA.@sync @cuda blocks=GPU_blocks threads=256 add_best_cons_LP_constraints_kernel(
@@ -431,6 +457,9 @@ function add_multiple_LP_lower_bound(
431457
active_constraint::CuArray{Bool},
432458
n_points::Int32,
433459
)
460+
461+
# Identify the number of blocks to use
462+
GPU_blocks = Int32(CUDA.attribute(CUDA.device(), CUDA.DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT))
434463

435464
# Add the constraint
436465
CUDA.@sync @cuda blocks=GPU_blocks threads=256 add_multiple_LP_lower_bound_kernel(

0 commit comments

Comments (0)