@@ -270,6 +270,7 @@ CUpti_PCSamplingData allocPCSamplingData(size_t collectNumPCs,
270270 static_cast <CUpti_PCSamplingPCData *>(
271271 std::calloc (collectNumPCs, sizeof (CUpti_PCSamplingPCData)))};
272272 for (size_t i = 0 ; i < collectNumPCs; ++i) {
273+ pcSamplingData.pPcData [i].size = sizeof (CUpti_PCSamplingPCData);
273274 pcSamplingData.pPcData [i].stallReason =
274275 static_cast <CUpti_PCSamplingStallReason *>(std::calloc (
275276 numValidStallReasons, sizeof (CUpti_PCSamplingStallReason)));
@@ -647,16 +648,22 @@ void PCSampling::collectData(CUcontext context) {
647648 contextId, configureData->pcSamplingData .totalNumPcs ,
648649 configureData->pcSamplingData .remainingNumPcs );
649650
650- // Use the separate output buffer for getData — the configured
651- // pcSamplingData buffer is owned by CUPTI.
652- bool ok = getPCSamplingData (context, &configureData->outputData );
653- DEBUG_PRINTF (" getData: ok=%d output total=%zu remaining=%zu "
654- " cfg total=%zu remaining=%zu\n " ,
655- ok, configureData->outputData .totalNumPcs ,
656- configureData->outputData .remainingNumPcs ,
657- configureData->pcSamplingData .totalNumPcs ,
658- configureData->pcSamplingData .remainingNumPcs );
659- processPCSamplingData (configureData);
651+ // Drain all available PCs in a loop. Each getData call returns at most
652+ // DataBufferPCCount (1024) PCs; a single sampling window can produce
653+ // tens of thousands. Failing to drain leaves data in CUPTI's internal
654+ // buffers, which eventually causes CUPTI_ERROR_OUT_OF_MEMORY (error 8).
655+ do {
656+ bool ok = getPCSamplingData (context, &configureData->outputData );
657+ DEBUG_PRINTF (" getData: ok=%d output total=%zu remaining=%zu "
658+ " cfg total=%zu remaining=%zu\n " ,
659+ ok, configureData->outputData .totalNumPcs ,
660+ configureData->outputData .remainingNumPcs ,
661+ configureData->pcSamplingData .totalNumPcs ,
662+ configureData->pcSamplingData .remainingNumPcs );
663+ if (!ok)
664+ break ;
665+ processPCSamplingData (configureData);
666+ } while (configureData->outputData .remainingNumPcs > 0 );
660667}
661668
662669void PCSampling::collectAllData () {
@@ -671,7 +678,14 @@ void PCSampling::collectAllData() {
671678 }
672679 auto *configureData = &result->get ();
673680 DEBUG_PRINTF (" Draining PC sampling data for context %u\n " , contextId);
674- processPCSamplingData (configureData);
681+ // Fetch and drain all pending data from CUPTI.
682+ do {
683+ bool ok = getPCSamplingData (configureData->context ,
684+ &configureData->outputData );
685+ if (!ok)
686+ break ;
687+ processPCSamplingData (configureData);
688+ } while (configureData->outputData .remainingNumPcs > 0 );
675689 }
676690}
677691
@@ -706,14 +720,16 @@ void PCSampling::finalize(CUcontext context) {
706720 }
707721 }
708722
709- // Drain remaining data before disabling
723+ // Drain all remaining PC data before disabling.
710724 auto *configureData = getConfigureData (contextId);
711- processPCSamplingData (configureData);
712-
713- // After disable, CUPTI may fill remaining records — drain once more
714- if (configureData-> pcSamplingData . totalNumPcs > 0 ) {
725+ do {
726+ bool ok = getPCSamplingData (context, &configureData-> outputData );
727+ if (!ok)
728+ break ;
715729 processPCSamplingData (configureData);
716- }
730+ } while (configureData->outputData .remainingNumPcs > 0 );
731+
732+ disablePCSampling (context);
717733
718734 contextIdToConfigureData.erase (contextId);
719735 contextInitialized.erase (contextId);
0 commit comments