|
12 | 12 |
|
13 | 13 | #include <cuda_runtime.h> |
14 | 14 |
|
| 15 | +#include <algorithm> |
15 | 16 | #include <unistd.h> |
16 | 17 | #include <vector> |
17 | 18 |
|
@@ -532,6 +533,87 @@ void TimeFrameGPU<NLayers>::createTrackITSExtDevice(const size_t nSeeds) |
532 | 533 | GPUChkErrS(cudaMemset(mTrackITSExtDevice, 0, mNTracks * sizeof(o2::its::TrackITSExt))); |
533 | 534 | } |
534 | 535 |
|
| 536 | +template <int NLayers> |
| 537 | +void TimeFrameGPU<NLayers>::loadTrackExtensionStartStatesDevice() |
| 538 | +{ |
| 539 | + GPUTimer timer("loading track extension start states"); |
| 540 | + GPULog("gpu-transfer: loading {} track extension start states, for {:.2f} MB.", this->mTracks.size(), this->mTracks.size() * sizeof(o2::its::TrackExtensionStartState<NLayers>) / constants::MB); |
| 541 | + mTrackExtensionStartStatesDevice = nullptr; |
| 542 | + mTrackExtensionStartStates = bounded_vector<TrackExtensionStartState<NLayers>>(this->mTracks.size(), {}, this->getMemoryPool().get()); |
| 543 | + if (this->mTracks.empty()) { |
| 544 | + return; |
| 545 | + } |
| 546 | + for (size_t iTrack{0}; iTrack < this->mTracks.size(); ++iTrack) { |
| 547 | + const auto& track = this->mTracks[iTrack]; |
| 548 | + auto& state = mTrackExtensionStartStates[iTrack]; |
| 549 | + state.paramIn = track.getParamIn(); |
| 550 | + state.paramOut = track.getParamOut(); |
| 551 | + state.time = track.getTimeStamp(); |
| 552 | + state.chi2 = track.getChi2(); |
| 553 | + state.nClusters = track.getNClusters(); |
| 554 | + state.firstClusterLayer = static_cast<int>(track.getFirstClusterLayer()); |
| 555 | + state.lastClusterLayer = static_cast<int>(track.getLastClusterLayer()); |
| 556 | + for (int iLayer{0}; iLayer < NLayers; ++iLayer) { |
| 557 | + state.clusters[iLayer] = track.getClusterIndex(iLayer); |
| 558 | + } |
| 559 | + } |
| 560 | + allocMem(reinterpret_cast<void**>(&mTrackExtensionStartStatesDevice), mTrackExtensionStartStates.size() * sizeof(o2::its::TrackExtensionStartState<NLayers>), this->hasFrameworkAllocator(), (o2::gpu::GPUMemoryResource::MEMORY_GPU | o2::gpu::GPUMemoryResource::MEMORY_STACK)); |
| 561 | + GPUChkErrS(cudaMemcpy(mTrackExtensionStartStatesDevice, mTrackExtensionStartStates.data(), mTrackExtensionStartStates.size() * sizeof(o2::its::TrackExtensionStartState<NLayers>), cudaMemcpyHostToDevice)); |
| 562 | +} |
| 563 | + |
| 564 | +template <int NLayers> |
| 565 | +void TimeFrameGPU<NLayers>::createTrackExtensionCandidatesDevice(const size_t nTracks) |
| 566 | +{ |
| 567 | + GPUTimer timer("reserving track extension candidates"); |
| 568 | + const size_t nCandidates = nTracks * MaxTrackExtensionCandidatesPerTrack; |
| 569 | + GPULog("gpu-allocation: reserving {} track extension candidates, for {:.2f} MB.", nCandidates, nCandidates * sizeof(o2::its::TrackExtensionCandidate<NLayers>) / constants::MB); |
| 570 | + mTrackExtensionCandidates = bounded_vector<TrackExtensionCandidate<NLayers>>(nCandidates, {}, this->getMemoryPool().get()); |
| 571 | + mTrackExtensionCandidatesDevice = nullptr; |
| 572 | + mTrackExtensionCandidateOffsetsDevice = nullptr; |
| 573 | + if (mTrackExtensionCandidates.empty()) { |
| 574 | + return; |
| 575 | + } |
| 576 | + allocMem(reinterpret_cast<void**>(&mTrackExtensionCandidatesDevice), nCandidates * sizeof(o2::its::TrackExtensionCandidate<NLayers>), this->hasFrameworkAllocator(), (o2::gpu::GPUMemoryResource::MEMORY_GPU | o2::gpu::GPUMemoryResource::MEMORY_STACK)); |
| 577 | + allocMem(reinterpret_cast<void**>(&mTrackExtensionCandidateOffsetsDevice), (nTracks + 1) * sizeof(int), this->hasFrameworkAllocator(), (o2::gpu::GPUMemoryResource::MEMORY_GPU | o2::gpu::GPUMemoryResource::MEMORY_STACK)); |
| 578 | +} |
| 579 | + |
| 580 | +template <int NLayers> |
| 581 | +void TimeFrameGPU<NLayers>::createTrackExtensionScratchDevice(const int nThreads, const int beamWidth) |
| 582 | +{ |
| 583 | + GPUTimer timer("reserving track extension scratch"); |
| 584 | + const size_t nHypotheses = static_cast<size_t>(std::max(1, nThreads)) * std::max(1, beamWidth); |
| 585 | + GPULog("gpu-allocation: reserving {} track extension hypotheses per scratch buffer, for {:.2f} MB each.", nHypotheses, nHypotheses * sizeof(o2::its::TrackExtensionHypothesis<NLayers>) / constants::MB); |
| 586 | + mActiveTrackExtensionHypothesesDevice = nullptr; |
| 587 | + mNextTrackExtensionHypothesesDevice = nullptr; |
| 588 | + if (nHypotheses == 0) { |
| 589 | + return; |
| 590 | + } |
| 591 | + allocMem(reinterpret_cast<void**>(&mActiveTrackExtensionHypothesesDevice), nHypotheses * sizeof(o2::its::TrackExtensionHypothesis<NLayers>), this->hasFrameworkAllocator(), (o2::gpu::GPUMemoryResource::MEMORY_GPU | o2::gpu::GPUMemoryResource::MEMORY_STACK)); |
| 592 | + allocMem(reinterpret_cast<void**>(&mNextTrackExtensionHypothesesDevice), nHypotheses * sizeof(o2::its::TrackExtensionHypothesis<NLayers>), this->hasFrameworkAllocator(), (o2::gpu::GPUMemoryResource::MEMORY_GPU | o2::gpu::GPUMemoryResource::MEMORY_STACK)); |
| 593 | +} |
| 594 | + |
| 595 | +template <int NLayers> |
| 596 | +void TimeFrameGPU<NLayers>::createTrackExtensionResultsDevice(const size_t nTracks) |
| 597 | +{ |
| 598 | + GPUTimer timer("reserving fitted track extension results"); |
| 599 | + mNTrackExtensionResults = 0; |
| 600 | + if (nTracks == 0 || mTrackExtensionCandidateOffsetsDevice == nullptr) { |
| 601 | + mTrackExtensionResults = bounded_vector<TrackExtensionResult<NLayers>>(0, {}, this->getMemoryPool().get()); |
| 602 | + mTrackExtensionResultsDevice = nullptr; |
| 603 | + return; |
| 604 | + } |
| 605 | + int nResults{0}; |
| 606 | + GPUChkErrS(cudaMemcpy(&nResults, mTrackExtensionCandidateOffsetsDevice + nTracks, sizeof(int), cudaMemcpyDeviceToHost)); |
| 607 | + mNTrackExtensionResults = nResults; |
| 608 | + GPULog("gpu-allocation: reserving {} fitted track extension results, for {:.2f} MB.", mNTrackExtensionResults, mNTrackExtensionResults * sizeof(o2::its::TrackExtensionResult<NLayers>) / constants::MB); |
| 609 | + mTrackExtensionResults = bounded_vector<TrackExtensionResult<NLayers>>(mNTrackExtensionResults, {}, this->getMemoryPool().get()); |
| 610 | + mTrackExtensionResultsDevice = nullptr; |
| 611 | + if (mTrackExtensionResults.empty()) { |
| 612 | + return; |
| 613 | + } |
| 614 | + allocMem(reinterpret_cast<void**>(&mTrackExtensionResultsDevice), mNTrackExtensionResults * sizeof(o2::its::TrackExtensionResult<NLayers>), this->hasFrameworkAllocator(), (o2::gpu::GPUMemoryResource::MEMORY_GPU | o2::gpu::GPUMemoryResource::MEMORY_STACK)); |
| 615 | +} |
| 616 | + |
535 | 617 | template <int NLayers> |
536 | 618 | void TimeFrameGPU<NLayers>::downloadCellsDevice() |
537 | 619 | { |
@@ -578,6 +660,28 @@ void TimeFrameGPU<NLayers>::downloadTrackITSExtDevice() |
578 | 660 | GPUChkErrS(cudaMemcpy(mTrackITSExt.data(), mTrackITSExtDevice, mTrackITSExt.size() * sizeof(o2::its::TrackITSExt), cudaMemcpyDeviceToHost)); |
579 | 661 | } |
580 | 662 |
|
| 663 | +template <int NLayers> |
| 664 | +void TimeFrameGPU<NLayers>::downloadTrackExtensionCandidatesDevice() |
| 665 | +{ |
| 666 | + GPUTimer timer("downloading track extension candidates"); |
| 667 | + GPULog("gpu-transfer: downloading {} track extension candidates, for {:.2f} MB.", mTrackExtensionCandidates.size(), mTrackExtensionCandidates.size() * sizeof(o2::its::TrackExtensionCandidate<NLayers>) / constants::MB); |
| 668 | + if (mTrackExtensionCandidates.empty()) { |
| 669 | + return; |
| 670 | + } |
| 671 | + GPUChkErrS(cudaMemcpy(mTrackExtensionCandidates.data(), mTrackExtensionCandidatesDevice, mTrackExtensionCandidates.size() * sizeof(o2::its::TrackExtensionCandidate<NLayers>), cudaMemcpyDeviceToHost)); |
| 672 | +} |
| 673 | + |
| 674 | +template <int NLayers> |
| 675 | +void TimeFrameGPU<NLayers>::downloadTrackExtensionResultsDevice() |
| 676 | +{ |
| 677 | + GPUTimer timer("downloading fitted track extension results"); |
| 678 | + GPULog("gpu-transfer: downloading {} fitted track extension results, for {:.2f} MB.", mTrackExtensionResults.size(), mTrackExtensionResults.size() * sizeof(o2::its::TrackExtensionResult<NLayers>) / constants::MB); |
| 679 | + if (mTrackExtensionResults.empty()) { |
| 680 | + return; |
| 681 | + } |
| 682 | + GPUChkErrS(cudaMemcpy(mTrackExtensionResults.data(), mTrackExtensionResultsDevice, mTrackExtensionResults.size() * sizeof(o2::its::TrackExtensionResult<NLayers>), cudaMemcpyDeviceToHost)); |
| 683 | +} |
| 684 | + |
581 | 685 | template <int NLayers> |
582 | 686 | void TimeFrameGPU<NLayers>::unregisterHostMemory(const int maxLayers) |
583 | 687 | { |
|
0 commit comments