@@ -243,9 +243,30 @@ void FC_FUNC_(compute_seismograms_cuda,
243243
244244 // seismogram buffers are 1D and components appended; size for one single component record
245245 int size = mp->nrec_local * nlength_seismogram;
246+ int valid_length = seismo_current + 1 ;
246247
247- // copies from GPU to CPU (note: could use async mem copy in future...)
248- print_CUDA_error_if_any (cudaMemcpy (h_seismo, d_seismo, sizeof (realw) * 2 * size, cudaMemcpyDeviceToHost),72001 );
248+ if (valid_length > nlength_seismogram) valid_length = nlength_seismogram;
249+
250+ // clears host buffers first, then copies only the samples that were actually written.
251+ // This prevents stale tail values from a previous chunk from leaking into the final trace.
252+ for (int i = 0 ; i < 2 * size; i++) {
253+ h_seismo[i] = 0 .0f ;
254+ }
255+
256+ // copies only the valid part from GPU to CPU (note: could use async mem copy in future...)
257+ for (int irec = 0 ; irec < mp->nrec_local ; irec++) {
258+ int device_offset = irec * nlength_seismogram;
259+ int host_offset = irec * nlength_seismogram;
260+
261+ print_CUDA_error_if_any (cudaMemcpy (h_seismo + host_offset,
262+ d_seismo + device_offset,
263+ sizeof (realw) * valid_length,
264+ cudaMemcpyDeviceToHost),72001 );
265+ print_CUDA_error_if_any (cudaMemcpy (h_seismo + size + host_offset,
266+ d_seismo + size + device_offset,
267+ sizeof (realw) * valid_length,
268+ cudaMemcpyDeviceToHost),72002 );
269+ }
249270
250271 // copies values into host array
251272 for (int irec=0 ; irec < mp->nrec_local ; irec++){
@@ -258,3 +279,62 @@ void FC_FUNC_(compute_seismograms_cuda,
258279
259280 GPU_ERROR_CHECKING (" compute_seismograms_cuda" );
260281}
282+
283+ extern " C"
284+ void FC_FUNC_ (flush_seismograms_cuda,
285+ FLUSH_SEISMOGRAMS_CUDA)(long * Mesh_pointer_f,
286+ int * i_sigf,
287+ double * sisux, double * sisuz,
288+ int * seismo_currentf,
289+ int * nlength_seismogramf) {
290+
291+ TRACE (" flush_seismograms_cuda" );
292+
293+ Mesh* mp = (Mesh*)(*Mesh_pointer_f);
294+
295+ synchronize_cuda ();
296+
297+ if (mp->nrec_local == 0 ) return ;
298+
299+ int i_sig = *i_sigf - 1 ;
300+ int seismo_current = *seismo_currentf;
301+ int nlength_seismogram = *nlength_seismogramf;
302+
303+ if (seismo_current <= 0 ) return ;
304+
305+ if (seismo_current > nlength_seismogram) seismo_current = nlength_seismogram;
306+
307+ realw* h_seismo = mp->h_seismograms [i_sig];
308+ realw* d_seismo = mp->d_seismograms [i_sig];
309+
310+ cudaStreamSynchronize (mp->compute_stream );
311+
312+ int size = mp->nrec_local * nlength_seismogram;
313+
314+ for (int i = 0 ; i < 2 * size; i++) {
315+ h_seismo[i] = 0 .0f ;
316+ }
317+
318+ for (int irec = 0 ; irec < mp->nrec_local ; irec++) {
319+ int device_offset = irec * nlength_seismogram;
320+ int host_offset = irec * nlength_seismogram;
321+
322+ print_CUDA_error_if_any (cudaMemcpy (h_seismo + host_offset,
323+ d_seismo + device_offset,
324+ sizeof (realw) * seismo_current,
325+ cudaMemcpyDeviceToHost),73001 );
326+ print_CUDA_error_if_any (cudaMemcpy (h_seismo + size + host_offset,
327+ d_seismo + size + device_offset,
328+ sizeof (realw) * seismo_current,
329+ cudaMemcpyDeviceToHost),73002 );
330+ }
331+
332+ for (int irec = 0 ; irec < mp->nrec_local ; irec++){
333+ for (int j = 0 ; j < nlength_seismogram; j++){
334+ sisux[j + nlength_seismogram * irec] = (double ) h_seismo[j + nlength_seismogram * irec];
335+ sisuz[j + nlength_seismogram * irec] = (double ) h_seismo[j + nlength_seismogram * irec + size];
336+ }
337+ }
338+
339+ GPU_ERROR_CHECKING (" flush_seismograms_cuda" );
340+ }
0 commit comments