@@ -97,20 +97,25 @@ function Semidiscretization(systems::Union{AbstractSystem, Nothing}...;
9797 sizes_v = [v_nvariables (system) * n_integrated_particles (system)
9898 for system in systems]
9999
100- # Align sizes to 64 bytes by adding padding if necessary.
101- # This ensures that aligned loads can be used on the integration arrays, which can
102- # significantly improve performance on GPUs. Performance benefits on CPUs remain
103- # to be investigated.
100+ start_u = 1
101+ start_v = 1
102+ ranges_u_vec = Vector {UnitRange{Int}} (undef, length (systems))
103+ ranges_v_vec = Vector {UnitRange{Int}} (undef, length (systems))
104104 for i in eachindex (systems)
105+ ranges_u_vec[i] = start_u: (start_u + sizes_u[i] - 1 )
106+ ranges_v_vec[i] = start_v: (start_v + sizes_v[i] - 1 )
107+
108+ # Align sizes to 64 bytes by adding padding if necessary.
109+ # This ensures that aligned loads can be used on the integration arrays, which can
110+ # significantly improve performance on GPUs. Performance benefits on CPUs remain
111+ # to be investigated.
105112 block_size = div (64 , sizeof (eltype (systems[i])))
106- sizes_u[i] = div (sizes_u[i], block_size, RoundUp) * block_size
107- sizes_v[i] = div (sizes_v[i], block_size, RoundUp) * block_size
113+ start_u + = div (sizes_u[i], block_size, RoundUp) * block_size
114+ start_v + = div (sizes_v[i], block_size, RoundUp) * block_size
108115 end
109116
110- ranges_u = Tuple ((sum (sizes_u[1 : (i - 1 )]) + 1 ): sum (sizes_u[1 : i])
111- for i in eachindex (sizes_u))
112- ranges_v = Tuple ((sum (sizes_v[1 : (i - 1 )]) + 1 ): sum (sizes_v[1 : i])
113- for i in eachindex (sizes_v))
117+ ranges_u = Tuple (ranges_u_vec)
118+ ranges_v = Tuple (ranges_v_vec)
114119
115120 # Create a n x n matrix of n neighborhood searches for each of the n systems.
116121 # We will need one neighborhood search for each pair of systems.
@@ -256,13 +261,13 @@ function semidiscretize(semi, tspan; reset_threads=true)
256261 Polyester. reset_threads! ()
257262 end
258263
259- sizes_u = ( u_nvariables (system) * n_integrated_particles (system) for system in systems)
260- sizes_v = ( v_nvariables (system) * n_integrated_particles (system) for system in systems)
264+ size_u_ode = semi . ranges_u[ end ] . stop
265+ size_v_ode = semi . ranges_v[ end ] . stop
261266
262267 # Use either the specified backend, e.g., `CUDABackend` or `MetalBackend` or
263268 # use CPU vectors for all CPU backends.
264- u0_ode_ = allocate (semi. parallelization_backend, cELTYPE, sum (sizes_u) )
265- v0_ode_ = allocate (semi. parallelization_backend, ELTYPE, sum (sizes_v) )
269+ u0_ode_ = allocate (semi. parallelization_backend, cELTYPE, size_u_ode )
270+ v0_ode_ = allocate (semi. parallelization_backend, ELTYPE, size_v_ode )
266271
267272 if semi. parallelization_backend isa KernelAbstractions. GPU
268273 u0_ode = u0_ode_
@@ -277,6 +282,9 @@ function semidiscretize(semi, tspan; reset_threads=true)
277282 parallelization_backend= semi. parallelization_backend)
278283 end
279284
285+ u0_ode .= 0
286+ v0_ode .= 0
287+
280288 # Set initial condition
281289 foreach_system_wrapped (semi, v0_ode, u0_ode) do system, v0_system, u0_system
282290 write_u0! (u0_system, system)
367375 range = ranges_v[system_indices (system, semi)]
368376
369377 @boundscheck begin
370- if length (range) != v_nvariables (system) * n_integrated_particles (system)
371- throw (DimensionMismatch (" `v_ode` range length $range_length does not match " *
378+ expected = v_nvariables (system) * n_integrated_particles (system)
379+ if length (range) != expected
380+ throw (DimensionMismatch (" `v_ode` range length $(length (range)) does not match " *
372381 " expected number of entries $expected " ))
373382 end
374383 end
0 commit comments