From d6319078d0175c25f4e526a15d6f08829afe3bb6 Mon Sep 17 00:00:00 2001
From: Tommy Reilly <gnurizen@gmail.com>
Date: Thu, 22 Jan 2026 14:46:01 -0600
Subject: [PATCH 01/14] Rewrite parcagpu to use Proton's CUPTI infrastructure

Major changes:
- Use Proton as a git submodule for CUPTI callback handling
- Rewrite in C++ using Proton's CuptiApi and callback patterns
- Add PC sampling support for continuous GPU profiling
- Simplify build to single library (works with any CUDA version at runtime)
- Use CMake build system
- Consolidate GitHub workflows into single build.yml
- Update Dockerfile to Ubuntu 24.04 (fixes USDT probe generation)

The library now uses Proton's dynamic CUPTI loading, so a single build
works with CUDA 12.x and 13.x at runtime.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
---
 .dockerignore                   |  17 +
 .github/workflows/build.yml     | 118 +++++++
 .github/workflows/container.yml | 135 --------
 .gitmodules                     |   3 +
 CMakeLists.txt                  | 142 ++++++++
 Dockerfile                      |  84 ++---
 Dockerfile.test                 |  18 +-
 Makefile                        | 190 +++--------
 README_TEST.md                  |   8 +-
 cupti/CMakeLists.txt            |  68 ----
 cupti/cupti-prof.c              | 575 --------------------------------
 cupti/toolchain-arm64.cmake     |  19 --
 parcagpu.bt                     |  72 ++--
 proton                          |   1 +
 src/correlation_filter.cpp      |  97 ++++++
 src/correlation_filter.h        |  60 ++++
 src/parcagpu_cupti.cpp          | 424 +++++++++++++++++++++++
 src/parcagpu_pc_sampling.cpp    | 522 +++++++++++++++++++++++++++++
 src/parcagpu_pc_sampling.h      | 107 ++++++
 test.sh                         |  26 +-
 test/README.md                  |  25 +-
 test/mock_cuda.c                |  26 ++
 test/mock_cupti.c               |   7 +
 23 files changed, 1663 insertions(+), 1081 deletions(-)
 create mode 100644 .dockerignore
 create mode 100644 .github/workflows/build.yml
 delete mode 100644 .github/workflows/container.yml
 create mode 100644 CMakeLists.txt
 delete mode 100644 cupti/CMakeLists.txt
 delete mode 100644 cupti/cupti-prof.c
 delete mode 100644 cupti/toolchain-arm64.cmake
 create mode 160000 proton
 create mode 100644 src/correlation_filter.cpp
 create mode 100644 src/correlation_filter.h
 create mode 100644 src/parcagpu_cupti.cpp
 create mode 100644 src/parcagpu_pc_sampling.cpp
 create mode 100644 src/parcagpu_pc_sampling.h
 create mode 100644 test/mock_cuda.c

diff --git a/.dockerignore b/.dockerignore
new file mode 100644
index 0000000..fe8a30f
--- /dev/null
+++ b/.dockerignore
@@ -0,0 +1,17 @@
+# Git metadata (especially important for proton submodule)
+.git/
+.gitmodules
+.github/
+
+# Build artifacts (in case they exist in directories we copy)
+build/
+build-*/
+*.o
+*.so
+*.so.*
+
+# CMake artifacts
+CMakeCache.txt
+CMakeFiles/
+cmake_install.cmake
+compile_commands.json
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
new file mode 100644
index 0000000..b98edf8
--- /dev/null
+++ b/.github/workflows/build.yml
@@ -0,0 +1,118 @@
+name: Build
+
+on:
+  push:
+    branches: [main, proton]
+    tags: ['v*']
+  pull_request:
+    branches: [main, proton]
+
+env:
+  REGISTRY: ghcr.io
+  IMAGE_NAME: ${{ github.repository }}
+  CUDA_HEADERS: ghcr.io/parca-dev/cuda-headers:12
+
+jobs:
+  build:
+    runs-on: ubuntu-latest
+    permissions:
+      contents: read
+      packages: write
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+        with:
+          submodules: recursive
+
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3
+
+      - name: Build library (amd64)
+        run: |
+          mkdir -p build/amd64
+          docker buildx build -f Dockerfile \
+            --build-arg CUDA_HEADERS=${{ env.CUDA_HEADERS }} \
+            --target export \
+            --output type=local,dest=build/amd64 \
+            --platform linux/amd64 \
+            .
+
+      - name: Verify build artifacts
+        run: |
+          echo "=== Build artifacts ==="
+          ls -lh build/amd64/
+          file build/amd64/libparcagpucupti.so
+          readelf -n build/amd64/libparcagpucupti.so | grep -A2 "stapsdt" | head -20
+
+      - name: Log in to Container Registry
+        if: github.event_name != 'pull_request'
+        uses: docker/login-action@v3
+        with:
+          registry: ${{ env.REGISTRY }}
+          username: ${{ github.actor }}
+          password: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Extract metadata (tags, labels)
+        id: meta
+        uses: docker/metadata-action@v5
+        with:
+          images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}
+          tags: |
+            type=semver,pattern={{version}}
+            type=semver,pattern={{major}}.{{minor}}
+            type=raw,value={{date 'YYYYMMDD'}}-{{sha}},enable={{is_default_branch}}
+            type=raw,value=latest,enable={{is_default_branch}}
+
+      - name: Build and push multi-arch image
+        if: github.event_name != 'pull_request'
+        uses: docker/build-push-action@v5
+        with:
+          context: .
+          file: ./Dockerfile
+          build-args: |
+            CUDA_HEADERS=${{ env.CUDA_HEADERS }}
+          platforms: linux/amd64,linux/arm64
+          target: runtime
+          push: true
+          tags: ${{ steps.meta.outputs.tags }}
+          labels: ${{ steps.meta.outputs.labels }}
+          cache-from: type=gha
+          cache-to: type=gha,mode=max
+
+  release-binaries:
+    if: startsWith(github.ref, 'refs/tags/v')
+    needs: build
+    runs-on: ubuntu-latest
+    permissions:
+      contents: write
+
+    strategy:
+      matrix:
+        arch: [amd64, arm64]
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+        with:
+          submodules: recursive
+
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3
+
+      - name: Build and extract binary for ${{ matrix.arch }}
+        run: |
+          mkdir -p build/${{ matrix.arch }}
+          docker buildx build -f Dockerfile \
+            --build-arg CUDA_HEADERS=${{ env.CUDA_HEADERS }} \
+            --target export \
+            --output type=local,dest=build/${{ matrix.arch }} \
+            --platform linux/${{ matrix.arch }} \
+            .
+          mv build/${{ matrix.arch }}/libparcagpucupti.so \
+             build/${{ matrix.arch }}/libparcagpucupti-${{ matrix.arch }}.so
+
+      - name: Upload to GitHub Release
+        uses: softprops/action-gh-release@v1
+        with:
+          files: build/${{ matrix.arch }}/libparcagpucupti-${{ matrix.arch }}.so
diff --git a/.github/workflows/container.yml b/.github/workflows/container.yml
deleted file mode 100644
index 7e21e01..0000000
--- a/.github/workflows/container.yml
+++ /dev/null
@@ -1,135 +0,0 @@
-name: Container Image Build
-
-on:
-  push:
-    branches:
-      - main
-    tags:
-      - 'v*'
-  pull_request:
-    branches:
-      - main
-
-env:
-  REGISTRY: ghcr.io
-  IMAGE_NAME: ${{ github.repository }}
-  CUDA_HEADERS_REGISTRY: ghcr.io/parca-dev/cuda-headers
-
-jobs:
-  build-and-push:
-    runs-on: ubuntu-latest
-    permissions:
-      contents: read
-      packages: write
-
-    steps:
-      - name: Checkout repository
-        uses: actions/checkout@v4
-
-      - name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@v3
-
-      - name: Log in to Container Registry
-        if: github.event_name != 'pull_request'
-        uses: docker/login-action@v3
-        with:
-          registry: ${{ env.REGISTRY }}
-          username: ${{ github.actor }}
-          password: ${{ secrets.GITHUB_TOKEN }}
-
-      - name: Extract metadata (tags, labels)
-        id: meta
-        uses: docker/metadata-action@v5
-        with:
-          images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}
-          tags: |
-            # For releases: use semantic version tags
-            type=semver,pattern={{version}}
-            type=semver,pattern={{major}}.{{minor}}
-            # For main branch: use date and commit SHA
-            type=raw,value={{date 'YYYYMMDD'}}-{{sha}},enable={{is_default_branch}}
-            # Tag as latest for main branch
-            type=raw,value=latest,enable={{is_default_branch}}
-
-      - name: Build and push multi-arch image with both CUDA versions
-        uses: docker/build-push-action@v5
-        with:
-          context: .
-          file: ./Dockerfile
-          build-args: |
-            CUDA_12_HEADERS=${{ env.CUDA_HEADERS_REGISTRY }}:12
-            CUDA_13_HEADERS=${{ env.CUDA_HEADERS_REGISTRY }}:13
-          platforms: linux/amd64,linux/arm64
-          target: runtime
-          push: ${{ github.event_name != 'pull_request' }}
-          tags: ${{ steps.meta.outputs.tags }}
-          labels: ${{ steps.meta.outputs.labels }}
-          cache-from: type=gha
-          cache-to: type=gha,mode=max
-
-  extract-release-binaries:
-    # Only run this job for tagged releases
-    if: startsWith(github.ref, 'refs/tags/v')
-    needs: build-and-push
-    runs-on: ubuntu-latest
-    permissions:
-      contents: write
-
-    strategy:
-      matrix:
-        include:
-          - cuda_major: "12"
-            cuda_full: "12.9.1"
-            arch: amd64
-            platform: linux/amd64
-          - cuda_major: "12"
-            cuda_full: "12.9.1"
-            arch: arm64
-            platform: linux/arm64
-          - cuda_major: "13"
-            cuda_full: "13.0.2"
-            arch: amd64
-            platform: linux/amd64
-          - cuda_major: "13"
-            cuda_full: "13.0.2"
-            arch: arm64
-            platform: linux/arm64
-
-    steps:
-      - name: Checkout repository
-        uses: actions/checkout@v4
-
-      - name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@v3
-
-      - name: Build and extract binary for ${{ matrix.arch }} (CUDA ${{ matrix.cuda_major }})
-        env:
-          LIB_NAME: libparcagpucupti.so.${{ matrix.cuda_major }}
-        run: |
-          mkdir -p build/${{ matrix.arch }}
-          docker buildx build -f Dockerfile \
-            --build-arg CUDA_12_HEADERS=${{ env.CUDA_HEADERS_REGISTRY }}:12 \
-            --build-arg CUDA_13_HEADERS=${{ env.CUDA_HEADERS_REGISTRY }}:13 \
-            --target export-cuda${{ matrix.cuda_major }} \
-            --output type=local,dest=build/${{ matrix.arch }} \
-            --platform ${{ matrix.platform }} \
-            .
-
-      - name: Rename binary with arch and CUDA version suffix
-        env:
-          LIB_NAME: libparcagpucupti.so.${{ matrix.cuda_major }}
-          OUTPUT_NAME: libparcagpucupti.so.${{ matrix.cuda_major }}-${{ matrix.arch }}
-        run: |
-          mv build/${{ matrix.arch }}/${LIB_NAME} \
-             build/${{ matrix.arch }}/${OUTPUT_NAME}
-
-      - name: Upload binary as artifact
-        uses: actions/upload-artifact@v4
-        with:
-          name: libparcagpucupti.so.${{ matrix.cuda_major }}-${{ matrix.arch }}
-          path: build/${{ matrix.arch }}/libparcagpucupti.so.${{ matrix.cuda_major }}-${{ matrix.arch }}
-
-      - name: Upload to GitHub Release
-        uses: softprops/action-gh-release@v1
-        with:
-          files: build/${{ matrix.arch }}/libparcagpucupti.so.${{ matrix.cuda_major }}-${{ matrix.arch }}
diff --git a/.gitmodules b/.gitmodules
index e7d7dd2..73f0eb4 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,3 +1,6 @@
 [submodule "vendor/opentelemetry-ebpf-profiler"]
 	path = vendor/opentelemetry-ebpf-profiler
 	url = https://github.com/parca-dev/opentelemetry-ebpf-profiler.git
+[submodule "proton"]
+	path = proton
+	url = https://github.com/parca-dev/proton.git
diff --git a/CMakeLists.txt b/CMakeLists.txt
new file mode 100644
index 0000000..6b01b3a
--- /dev/null
+++ b/CMakeLists.txt
@@ -0,0 +1,142 @@
+cmake_minimum_required(VERSION 3.18)
+project(parcagpu LANGUAGES C CXX)
+
+set(CMAKE_CXX_STANDARD 17)
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
+
+# Default to Release with debug symbols
+if(NOT CMAKE_BUILD_TYPE)
+    set(CMAKE_BUILD_TYPE RelWithDebInfo)
+endif()
+
+# Compiler flags for different build types
+set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -g -O0")
+set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} -g -O0")
+set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} -g -O2")
+set(CMAKE_C_FLAGS_RELWITHDEBINFO "${CMAKE_C_FLAGS_RELWITHDEBINFO} -g -O2")
+
+# Set proton directory (git submodule or Docker build)
+if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/proton")
+    set(PROTON_DIR "${CMAKE_CURRENT_SOURCE_DIR}/proton")
+elseif(EXISTS "/build/proton")
+    set(PROTON_DIR "/build/proton")
+else()
+    message(FATAL_ERROR "Could not find proton directory. Expected at ${CMAKE_CURRENT_SOURCE_DIR}/proton or /build/proton")
+endif()
+message(STATUS "Using Proton from: ${PROTON_DIR}")
+
+# Find CUDA headers (allow fallback to manual path for header-only builds)
+if(NOT DEFINED CUDA_INCLUDE_DIR)
+    find_package(CUDAToolkit QUIET)
+    if(CUDAToolkit_FOUND)
+        set(CUDA_INCLUDE_DIR ${CUDAToolkit_INCLUDE_DIRS})
+    else()
+        # Fallback for header-only builds (e.g., Docker with ghcr.io/parca-dev/cuda-headers)
+        set(CUDA_INCLUDE_DIR "/usr/local/cuda/include")
+        if(NOT EXISTS "${CUDA_INCLUDE_DIR}/cuda.h")
+            message(FATAL_ERROR "Could not find CUDA headers. Set CUDA_INCLUDE_DIR or install CUDA Toolkit.")
+        endif()
+        message(STATUS "Using CUDA headers from: ${CUDA_INCLUDE_DIR}")
+    endif()
+endif()
+
+# Include directories from proton
+include_directories(
+    ${PROTON_DIR}/csrc/include
+    ${PROTON_DIR}/common/include
+    ${CUDA_INCLUDE_DIR}
+    ${CMAKE_CURRENT_SOURCE_DIR}/src
+)
+
+# Collect proton driver source files we need
+# Note: We only link driver APIs and callback utilities (no profiler dependencies)
+set(PROTON_SOURCES
+    ${PROTON_DIR}/csrc/lib/Driver/GPU/CuptiApi.cpp
+    ${PROTON_DIR}/csrc/lib/Driver/GPU/CudaApi.cpp
+    ${PROTON_DIR}/csrc/lib/Profiler/Cupti/CuptiCallbacks.cpp
+)
+
+# Build the shared library
+add_library(parcagpucupti SHARED
+    src/parcagpu_cupti.cpp
+    src/parcagpu_pc_sampling.cpp
+    src/correlation_filter.cpp
+    ${PROTON_SOURCES}
+)
+
+# Link against system libraries only (NOT libcupti - loaded dynamically by Proton)
+target_link_libraries(parcagpucupti
+    dl
+    pthread
+)
+
+# Set output directory
+set_target_properties(parcagpucupti PROPERTIES
+    LIBRARY_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib"
+    OUTPUT_NAME "parcagpucupti"
+)
+
+# Install target
+install(TARGETS parcagpucupti
+    LIBRARY DESTINATION lib
+)
+
+# Build test infrastructure
+option(BUILD_TESTS "Build test infrastructure" ON)
+
+if(BUILD_TESTS)
+    # Mock CUDA library for testing (minimal - just cuDriverGetVersion)
+    add_library(cuda_mock SHARED
+        test/mock_cuda.c
+    )
+    target_include_directories(cuda_mock PRIVATE
+        ${CUDA_INCLUDE_DIR}
+    )
+    set_target_properties(cuda_mock PROPERTIES
+        OUTPUT_NAME "cuda"
+        LIBRARY_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib"
+    )
+
+    # Mock CUPTI library for testing
+    add_library(cupti_mock SHARED
+        test/mock_cupti.c
+    )
+    target_include_directories(cupti_mock PRIVATE
+        ${CUDA_INCLUDE_DIR}
+    )
+    set_target_properties(cupti_mock PROPERTIES
+        OUTPUT_NAME "cupti"
+        LIBRARY_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib"
+    )
+
+    # Test executable
+    add_executable(test_cupti_prof
+        test/test_cupti_prof.c
+    )
+    target_include_directories(test_cupti_prof PRIVATE
+        ${CUDA_INCLUDE_DIR}
+    )
+    target_link_libraries(test_cupti_prof
+        dl
+    )
+    set_target_properties(test_cupti_prof PROPERTIES
+        RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/bin"
+    )
+
+    # Create libcuda.so.1 symlink for test (Proton expects libcuda.so.1)
+    add_custom_command(TARGET cuda_mock POST_BUILD
+        COMMAND ${CMAKE_COMMAND} -E create_symlink
+            libcuda.so
+            ${CMAKE_BINARY_DIR}/lib/libcuda.so.1
+        COMMENT "Creating libcuda.so.1 symlink"
+    )
+
+    # Create libcupti.so.12 symlink for test
+    add_custom_command(TARGET cupti_mock POST_BUILD
+        COMMAND ${CMAKE_COMMAND} -E create_symlink
+            libcupti.so
+            ${CMAKE_BINARY_DIR}/lib/libcupti.so.12
+        COMMENT "Creating libcupti.so.12 symlink"
+    )
+endif()
diff --git a/Dockerfile b/Dockerfile
index b691c6e..b8eddd2 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,83 +1,47 @@
 # Slim multi-platform build for libparcagpucupti.so
 # Uses pre-built CUDA header images instead of full CUDA development images
 # This significantly reduces build time and disk space requirements
+#
+# Thanks to Proton's dynamic CUPTI loading, we only need to build once
+# and the library works with any CUDA version at runtime.
 
-# CUDA header images (can be overridden at build time)
-ARG CUDA_12_HEADERS=ghcr.io/parca-dev/cuda-headers:12
-ARG CUDA_13_HEADERS=ghcr.io/parca-dev/cuda-headers:13
+# CUDA header image (can be overridden at build time)
+ARG CUDA_HEADERS=ghcr.io/parca-dev/cuda-headers:12
 
-# Import CUDA 12 headers
-FROM ${CUDA_12_HEADERS} AS cuda12-headers
+# Import CUDA headers
+FROM ${CUDA_HEADERS} AS cuda-headers
 
-# Import CUDA 13 headers
-FROM ${CUDA_13_HEADERS} AS cuda13-headers
-
-# Build stage for CUDA 12
-FROM ubuntu:22.04 AS builder-cuda12
+# Build stage
+FROM ubuntu:24.04 AS builder
 
 # Install only build tools (no CUDA toolkit needed)
 RUN apt-get update && apt-get install -y \
     cmake \
     make \
-    gcc \
     g++ \
     systemtap-sdt-dev \
     && rm -rf /var/lib/apt/lists/*
 
-WORKDIR /build/cupti
+WORKDIR /build
 
-# Copy CUDA headers and libraries from header image
-COPY --from=cuda12-headers /usr/local/cuda /usr/local/cuda
+# Copy CUDA headers from header image
+COPY --from=cuda-headers /usr/local/cuda /usr/local/cuda
 
-# Copy source code
-COPY cupti/cupti-prof.c cupti/correlation_filter.cpp cupti/correlation_filter.h cupti/CMakeLists.txt ./
+# Copy parcagpu source files and proton submodule
+COPY src /build/src
+COPY proton /build/proton
+COPY CMakeLists.txt /build/
 
-# Build the library for CUDA 12
-ENV CUDA_ROOT=/usr/local/cuda
+# Build the library (disable tests for Docker build)
 RUN mkdir -p build && \
     cd build && \
-    cmake -DCUDA_ROOT=${CUDA_ROOT} -DCMAKE_BUILD_TYPE=RelWithDebInfo .. && \
-    make VERBOSE=1 && \
-    mv libparcagpucupti.so libparcagpucupti.so.12
-
-# Build stage for CUDA 13
-FROM ubuntu:22.04 AS builder-cuda13
-
-# Install only build tools (no CUDA toolkit needed)
-RUN apt-get update && apt-get install -y \
-    cmake \
-    make \
-    gcc \
-    g++ \
-    systemtap-sdt-dev \
-    && rm -rf /var/lib/apt/lists/*
-
-WORKDIR /build/cupti
-
-# Copy CUDA headers and libraries from header image
-COPY --from=cuda13-headers /usr/local/cuda /usr/local/cuda
-
-# Copy source code
-COPY cupti/cupti-prof.c cupti/correlation_filter.cpp cupti/correlation_filter.h cupti/CMakeLists.txt ./
-
-# Build the library for CUDA 13
-ENV CUDA_ROOT=/usr/local/cuda
-RUN mkdir -p build && \
-    cd build && \
-    cmake -DCUDA_ROOT=${CUDA_ROOT} .. && \
-    make VERBOSE=1 && \
-    mv libparcagpucupti.so libparcagpucupti.so.13
-
-# Export stages for extracting single libraries (used by Makefile and release binaries)
-FROM scratch AS export-cuda12
-COPY --from=builder-cuda12 /build/cupti/build/libparcagpucupti.so.12 /
+    cmake -DCUDA_INCLUDE_DIR=/usr/local/cuda/include -DBUILD_TESTS=OFF .. && \
+    make -j$(nproc)
 
-FROM scratch AS export-cuda13
-COPY --from=builder-cuda13 /build/cupti/build/libparcagpucupti.so.13 /
+# Export stage for extracting the library (used by Makefile and release binaries)
+FROM scratch AS export
+COPY --from=builder /build/build/lib/libparcagpucupti.so /
 
-# Runtime image with both CUDA versions (for container registry)
+# Runtime image (for container registry)
 FROM busybox:latest AS runtime
-COPY --from=builder-cuda12 /build/cupti/build/libparcagpucupti.so.12 /usr/lib/
-COPY --from=builder-cuda13 /build/cupti/build/libparcagpucupti.so.13 /usr/lib/
-# Default symlink points to CUDA 12
-RUN ln -s /usr/lib/libparcagpucupti.so.12 /usr/lib/libparcagpucupti.so
+COPY --from=builder /build/build/lib/libparcagpucupti.so /usr/lib/libparcagpucupti.so
diff --git a/Dockerfile.test b/Dockerfile.test
index 4137314..65bf375 100644
--- a/Dockerfile.test
+++ b/Dockerfile.test
@@ -9,20 +9,22 @@ RUN apt-get update && apt-get install -y \
 WORKDIR /test
 
 # Create directory structure
-RUN mkdir -p cupti/build-amd64 zig-out/bin zig-out/lib
+RUN mkdir -p build/bin build/lib
 
 # Copy the built library
-COPY cupti/build-amd64/libparcagpucupti.so ./cupti/build-amd64/
+COPY build/lib/libparcagpucupti.so ./build/lib/
 
-# Copy the zig-built test files
-COPY zig-out/bin/test_cupti_prof ./zig-out/bin/
-COPY zig-out/lib/libcupti.so ./zig-out/lib/
-COPY zig-out/lib/libcupti.so.12 ./zig-out/lib/
+# Copy the test files
+COPY build/bin/test_cupti_prof ./build/bin/
+COPY build/lib/libcuda.so ./build/lib/
+COPY build/lib/libcuda.so.1 ./build/lib/
+COPY build/lib/libcupti.so ./build/lib/
+COPY build/lib/libcupti.so.12 ./build/lib/
 
 # Set environment
-ENV LD_LIBRARY_PATH=/test/zig-out/lib
+ENV LD_LIBRARY_PATH=/test/build/lib
 ENV PARCAGPU_DEBUG=1
 
 # Run tests by default
-ENTRYPOINT ["/test/zig-out/bin/test_cupti_prof", "/test/cupti/build-amd64/libparcagpucupti.so"]
+ENTRYPOINT ["/test/build/bin/test_cupti_prof", "/test/build/lib/libparcagpucupti.so"]
 CMD []
diff --git a/Makefile b/Makefile
index fedd0c8..13c91a3 100644
--- a/Makefile
+++ b/Makefile
@@ -1,153 +1,89 @@
-.PHONY: all clean test cupti-amd64 cupti-arm64 cupti-all cupti-all-versions cross test-infra docker-push push-cuda-headers docker-test-build docker-test-run format bpf-test test-multi
+.PHONY: all clean test build-amd64 build-arm64 build-all cross docker-push docker-test-build docker-test-run format local debug
 
-# CUDA version configuration
-CUDA_MAJOR ?= 12
-CUDA_FULL_VERSION ?= 12.9.1
-LIB_NAME = libparcagpucupti.so.$(CUDA_MAJOR)
+LIB_NAME = libparcagpucupti.so
 
-# Default target: build all CUDA versions (12 & 13) for both architectures and test infrastructure
-all: cupti-all-versions test-infra
-
-local:
-	mkdir -p build && \
-    cmake -DCMAKE_BUILD_TYPE=RelWithDebInfo -B build -S cupti && \
-    cd build && make -j$(nproc)
+# Default target: build for both architectures
+all: build-all
 
 # Build libparcagpucupti.so for AMD64 using Docker
-cupti-amd64:
-	@echo "=== Building $(LIB_NAME) for AMD64 with Docker (CUDA $(CUDA_MAJOR)) ==="
+build-amd64:
+	@echo "=== Building $(LIB_NAME) for AMD64 with Docker ==="
 	@mkdir -p /tmp/parcagpu-build-amd64
 	@docker buildx create --name parcagpu-builder --use --bootstrap 2>/dev/null || docker buildx use parcagpu-builder
 	@docker buildx build -f Dockerfile \
-		--build-arg CUDA_12_HEADERS=$(CUDA_12_HEADERS) \
-		--build-arg CUDA_13_HEADERS=$(CUDA_13_HEADERS) \
-		--target export-cuda$(CUDA_MAJOR) \
+		--target export \
 		--output type=local,dest=/tmp/parcagpu-build-amd64 \
 		--platform linux/amd64 .
-	@mkdir -p build/$(CUDA_MAJOR)/amd64
-	@cp /tmp/parcagpu-build-amd64/$(LIB_NAME) build/$(CUDA_MAJOR)/amd64/
-	@ln -sf $(LIB_NAME) build/$(CUDA_MAJOR)/amd64/libparcagpucupti.so
-	@echo "AMD64 library built: build/$(CUDA_MAJOR)/amd64/$(LIB_NAME)"
+	@mkdir -p build/amd64
+	@cp /tmp/parcagpu-build-amd64/$(LIB_NAME) build/amd64/
+	@echo "AMD64 library built: build/amd64/$(LIB_NAME)"
 
 # Build libparcagpucupti.so for ARM64 using Docker
-cupti-arm64:
-	@echo "=== Building $(LIB_NAME) for ARM64 with Docker (CUDA $(CUDA_MAJOR)) ==="
+build-arm64:
+	@echo "=== Building $(LIB_NAME) for ARM64 with Docker ==="
 	@mkdir -p /tmp/parcagpu-build-arm64
 	@docker buildx create --name parcagpu-builder --use --bootstrap 2>/dev/null || docker buildx use parcagpu-builder
 	@docker buildx build -f Dockerfile \
-		--build-arg CUDA_12_HEADERS=$(CUDA_12_HEADERS) \
-		--build-arg CUDA_13_HEADERS=$(CUDA_13_HEADERS) \
-		--target export-cuda$(CUDA_MAJOR) \
+		--target export \
 		--output type=local,dest=/tmp/parcagpu-build-arm64 \
 		--platform linux/arm64 .
-	@mkdir -p build/$(CUDA_MAJOR)/arm64
-	@cp /tmp/parcagpu-build-arm64/$(LIB_NAME) build/$(CUDA_MAJOR)/arm64/
-	@ln -sf $(LIB_NAME) build/$(CUDA_MAJOR)/arm64/libparcagpucupti.so
-	@echo "ARM64 library built: build/$(CUDA_MAJOR)/arm64/$(LIB_NAME)"
-
-# Build both architectures for current CUDA version (controlled by CUDA_MAJOR variable)
-# Example: make cupti-all CUDA_MAJOR=13
-cupti-all: cupti-amd64 cupti-arm64
+	@mkdir -p build/arm64
+	@cp /tmp/parcagpu-build-arm64/$(LIB_NAME) build/arm64/
+	@echo "ARM64 library built: build/arm64/$(LIB_NAME)"
 
-# Build all local artifacts: CUDA 12 & 13 for both amd64 and arm64
-# This extracts the .so files to build/12/{amd64,arm64}/ and build/13/{amd64,arm64}/
-# Use this target when you want local build artifacts for testing
-cupti-all-versions:
-	@echo "=== Building all CUDA versions (12 and 13) for both architectures ==="
-	@$(MAKE) cupti-amd64 CUDA_MAJOR=12 CUDA_FULL_VERSION=12.9.1
-	@$(MAKE) cupti-arm64 CUDA_MAJOR=12 CUDA_FULL_VERSION=12.9.1
-	@$(MAKE) cupti-amd64 CUDA_MAJOR=13 CUDA_FULL_VERSION=13.0.2
-	@$(MAKE) cupti-arm64 CUDA_MAJOR=13 CUDA_FULL_VERSION=13.0.2
+# Build both architectures
+build-all: build-amd64 build-arm64
 	@echo "=== All artifacts built ==="
-	@echo "CUDA 12 amd64: build/12/amd64/libparcagpucupti.so.12"
-	@echo "CUDA 12 arm64: build/12/arm64/libparcagpucupti.so.12"
-	@echo "CUDA 13 amd64: build/13/amd64/libparcagpucupti.so.13"
-	@echo "CUDA 13 arm64: build/13/arm64/libparcagpucupti.so.13"
+	@echo "AMD64: build/amd64/$(LIB_NAME)"
+	@echo "ARM64: build/arm64/$(LIB_NAME)"
 
-# Build runtime container image with both CUDA versions for both architectures
-# Note: This builds a Docker image but doesn't extract local artifacts or load into Docker
+# Build runtime container image for both architectures
 # Multi-platform images stay in buildx cache. Use docker-push to push to registry.
 cross:
-	@echo "=== Building runtime container for AMD64 and ARM64 (includes CUDA 12 and 13) ==="
+	@echo "=== Building runtime container for AMD64 and ARM64 ==="
 	@docker buildx create --name parcagpu-builder --use --bootstrap 2>/dev/null || docker buildx use parcagpu-builder
 	@docker buildx build -f Dockerfile \
-		--build-arg CUDA_12_HEADERS=$(CUDA_12_HEADERS) \
-		--build-arg CUDA_13_HEADERS=$(CUDA_13_HEADERS) \
 		--target runtime \
 		--platform linux/amd64,linux/arm64 \
 		.
 	@echo "Runtime container built for both platforms (cached, not loaded into Docker)"
 
-# CUDA header image configuration
-# Can be overridden to use local images (e.g., make cupti-all CUDA_12_HEADERS=cuda-headers:12)
-CUDA_HEADERS_REGISTRY ?= ghcr.io/parca-dev/cuda-headers
-CUDA_12_HEADERS ?= $(CUDA_HEADERS_REGISTRY):12
-CUDA_13_HEADERS ?= $(CUDA_HEADERS_REGISTRY):13
-
-# Build and push CUDA header images to registry
-# These are lightweight images (~35MB each) containing only CUDA headers and libcupti
-# Note: Only needs to be run manually when:
-#   - CUDA versions are updated (12.9.1 -> 12.x.x, 13.0.2 -> 13.x.x)
-#   - New CUDA major versions are added
-#   - CUPTI API changes require header updates
-push-cuda-headers:
-	@echo "=== Building and pushing CUDA header images ==="
-	@docker buildx create --name parcagpu-builder --use --bootstrap 2>/dev/null || docker buildx use parcagpu-builder
-	@echo "Building CUDA 12 headers..."
-	@docker buildx build -f Dockerfile.cuda-headers \
-		--build-arg CUDA_VERSION=12.9.1 \
-		--platform linux/amd64,linux/arm64 \
-		--tag $(CUDA_HEADERS_REGISTRY):12 \
-		--push \
-		.
-	@echo "Building CUDA 13 headers..."
-	@docker buildx build -f Dockerfile.cuda-headers \
-		--build-arg CUDA_VERSION=13.0.2 \
-		--platform linux/amd64,linux/arm64 \
-		--tag $(CUDA_HEADERS_REGISTRY):13 \
-		--push \
-		.
-	@echo "CUDA header images pushed to $(CUDA_HEADERS_REGISTRY):12 and :13"
-
-# Build test infrastructure with CMake
-test-infra:
-	@echo "=== Building test infrastructure with CMake ==="
-	@mkdir -p test/build
-	@cd test/build && cmake .. -DCUDA_MAJOR_VERSION=$(CUDA_MAJOR) && make
-
-# Run tests (using AMD64 library)
-test: cupti-amd64 test-infra
-	@./test.sh
-
-# Run advanced test (8 GPUs @ 2500 launches/s, multi-threaded)
-test-advanced: cupti-amd64 test-infra
-	@echo "=== Running advanced test (8 GPUs @ 2500 launches/s) ==="
-	@cd test/build && LD_LIBRARY_PATH=.:$$LD_LIBRARY_PATH \
-		./test_cupti_prof ../../build/$(CUDA_MAJOR)/amd64/libparcagpucupti.so \
-		--threads=4 --num-gpus=8 --launch-rate=2500 --duration=10
+# Local build with CMake (for development/testing) - default is Release with symbols
+local:
+	@echo "=== Building locally with CMake (RelWithDebInfo) ==="
+	@cmake -B build-local -S . -DCMAKE_BUILD_TYPE=RelWithDebInfo
+	@cmake --build build-local
+	@echo "Local build complete: build-local/lib/$(LIB_NAME)"
+
+# Debug build with CMake (full debug, no optimizations)
+debug:
+	@echo "=== Building debug version with CMake ==="
+	@cmake -B build-local -S . -DCMAKE_BUILD_TYPE=Debug
+	@cmake --build build-local
+	@echo "Debug build complete: build-local/lib/$(LIB_NAME)"
+
+# Run local tests
+test: local
+	@echo "=== Running tests ==="
+	@cd build-local && ctest --output-on-failure
 
 # Clean build artifacts
 clean:
 	@echo "=== Cleaning build artifacts ==="
-	@rm -rf cupti/build cupti/build-amd64 cupti/build-arm64 build
-	@rm -rf test/build
-	@rm -f test/bpf/activity_parser test/bpf/activityparser_*.go test/bpf/activityparser_*.o
+	@rm -rf build build-local bin lib zig-out .zig-cache
+	@rm -rf CMakeCache.txt CMakeFiles/ cmake_install.cmake compile_commands.json
 	@echo "Clean complete"
 
 # Build and push multi-arch Docker images to ghcr.io
 # Set IMAGE_TAG to override the default tag (e.g., make docker-push IMAGE_TAG=v1.0.0)
 # Set IMAGE to override the image name (e.g., make docker-push IMAGE=ghcr.io/myuser/parcagpu)
-# Set CUDA_12_HEADERS and CUDA_13_HEADERS to override header images (e.g., cuda-headers:12 for local)
-# Note: Runtime image includes both CUDA 12 and 13
 IMAGE ?= ghcr.io/parca-dev/parcagpu
 IMAGE_TAG ?= latest
 docker-push:
 	@echo "=== Setting up buildx builder ==="
 	@docker buildx create --name parcagpu-builder --use --bootstrap 2>/dev/null || docker buildx use parcagpu-builder
-	@echo "=== Building and pushing multi-arch Docker images to $(IMAGE):$(IMAGE_TAG) (includes CUDA 12 and 13) ==="
+	@echo "=== Building and pushing multi-arch Docker images to $(IMAGE):$(IMAGE_TAG) ==="
 	@docker buildx build -f Dockerfile \
-		--build-arg CUDA_12_HEADERS=$(CUDA_12_HEADERS) \
-		--build-arg CUDA_13_HEADERS=$(CUDA_13_HEADERS) \
 		--target runtime \
 		--platform linux/amd64,linux/arm64 \
 		--tag $(IMAGE):$(IMAGE_TAG) \
@@ -156,7 +92,7 @@ docker-push:
 	@echo "Images pushed successfully to $(IMAGE):$(IMAGE_TAG)"
 
 # Build test container image
-docker-test-build: cupti-amd64 test-infra
+docker-test-build: build-amd64
 	@echo "=== Building test container image ==="
 	@docker build -f Dockerfile.test -t parcagpu-test:latest .
 	@echo "Test container built: parcagpu-test:latest"
@@ -167,40 +103,6 @@ docker-test-run: docker-test-build
 	@echo "=== Running tests in container ==="
 	@docker run --rm parcagpu-test:latest $(ARGS)
 
-# Build the BPF activity parser test program
-# Requires: clang, libbpf-dev, bpftool (for vmlinux.h), Go 1.21+
-bpf-test:
-	@echo "=== Building BPF activity parser test ==="
-	@if [ ! -f test/bpf/vmlinux.h ]; then \
-		echo "Generating vmlinux.h from kernel BTF..."; \
-		bpftool btf dump file /sys/kernel/btf/vmlinux format c > test/bpf/vmlinux.h; \
-	fi
-	@cd test/bpf && go generate ./... && CGO_ENABLED=0 go build -o activity_parser .
-	@echo "BPF test built: test/bpf/activity_parser"
-
-# Run test_cupti_prof and BPF activity parser in parallel.
-# The BPF test attaches to the test program's PID and logs kernel activities.
-# Requires root (sudo) for BPF.
-test-multi: cupti-amd64 test-infra bpf-test
-	@echo "=== Running test with BPF activity parser ==="
-	@case "$$(uname -m)" in \
-		aarch64|arm64) ARCH=arm64 ;; \
-		*) ARCH=amd64 ;; \
-	esac; \
-	LIB_PATH="build/$(CUDA_MAJOR)/$${ARCH}/libparcagpucupti.so"; \
-	export LD_LIBRARY_PATH="$$(pwd)/test/build:$$LD_LIBRARY_PATH"; \
-	test/build/test_cupti_prof "$${LIB_PATH}" --kernel-names=kernel_names.txt --duration=10 & \
-	TEST_PID=$$!; \
-	sleep 1; \
-	echo "test_cupti_prof PID: $${TEST_PID}"; \
-	echo "Starting BPF activity parser (requires root)..."; \
-	sudo test/bpf/activity_parser -pid $${TEST_PID} -lib "$$(pwd)/$${LIB_PATH}" -v & \
-	BPF_PID=$$!; \
-	wait $${TEST_PID}; \
-	TEST_EXIT=$$?; \
-	sleep 1; \
-	sudo kill $${BPF_PID} 2>/dev/null; wait $${BPF_PID} 2>/dev/null; \
-	echo "=== test-multi completed (test exit: $${TEST_EXIT}) ==="
-
 format:
-	clang-format -i -style=file cupti/*.[ch]
+	@echo "=== Formatting source files ==="
+	@clang-format -i -style=file src/*.cpp src/*.h test/*.c
diff --git a/README_TEST.md b/README_TEST.md
index 1c90651..f797556 100644
--- a/README_TEST.md
+++ b/README_TEST.md
@@ -13,9 +13,9 @@ make test
 ```
 
 This builds:
-- `cupti/build/libparcagpucupti.so` - Production library (CMake + real CUPTI)
-- `zig-out/lib/libcupti.so` - Mock CUPTI for test infrastructure
-- `zig-out/bin/test_cupti_prof` - Test program
+- `build/lib/libparcagpucupti.so` - Production library (CMake)
+- `build/lib/libcupti.so` - Mock CUPTI for test infrastructure
+- `build/bin/test_cupti_prof` - Test program
 
 ## Quick Start
 
@@ -39,7 +39,7 @@ This will:
 
 For extended testing or continuous probe monitoring:
 ```bash
-LD_LIBRARY_PATH=zig-out/lib zig-out/bin/test_cupti_prof zig-out/lib/libparcagpucupti.so --forever
+LD_LIBRARY_PATH=build/lib build/bin/test_cupti_prof build/lib/libparcagpucupti.so --forever
 ```
 
 This runs indefinitely at 1000 events/second until interrupted (Ctrl-C).
diff --git a/cupti/CMakeLists.txt b/cupti/CMakeLists.txt
deleted file mode 100644
index ead5ead..0000000
--- a/cupti/CMakeLists.txt
+++ /dev/null
@@ -1,68 +0,0 @@
-cmake_minimum_required(VERSION 3.18)
-project(parcagpucupti C CXX)
-
-# CUDA root directory (can be overridden via -DCUDA_ROOT=...)
-if(NOT DEFINED CUDA_ROOT)
-    set(CUDA_ROOT "/usr/local/cuda" CACHE PATH "CUDA installation directory")
-endif()
-
-# CUDA library directory
-if(NOT DEFINED CUDA_LIBDIR)
-    set(CUDA_LIBDIR "${CUDA_ROOT}/lib64" CACHE PATH "CUDA library directory")
-endif()
-
-# Set include directories for CUDA headers
-set(CUDAToolkit_INCLUDE_DIRS
-    "${CUDA_ROOT}/include"
-    "${CUDA_ROOT}/extras/CUPTI/include"
-)
-
-# Add CUDA library directory to link directories
-link_directories(${CUDA_LIBDIR})
-
-# Create shared library with both C and C++ sources
-add_library(parcagpucupti SHARED cupti-prof.c correlation_filter.cpp)
-
-# Set properties
-set_target_properties(parcagpucupti PROPERTIES
-    C_STANDARD 11
-    C_STANDARD_REQUIRED ON
-    CXX_STANDARD 17
-    CXX_STANDARD_REQUIRED ON
-    POSITION_INDEPENDENT_CODE ON
-)
-
-# Add debug symbols and disable C++ exceptions
-target_compile_options(parcagpucupti PRIVATE -g -fno-exceptions)
-
-# Include directories
-target_include_directories(parcagpucupti PRIVATE
-    ${CUDAToolkit_INCLUDE_DIRS}
-)
-
-# systemtap headers are now in /usr/include/sys/ (copied in Dockerfile for cross-compilation)
-
-# Find the CUPTI library explicitly
-find_library(CUPTI_LIBRARY
-    NAMES cupti
-    PATHS ${CUDA_LIBDIR}
-    REQUIRED
-)
-
-# Find the CUDA driver library
-find_library(CUDA_LIBRARY
-    NAMES cuda
-    PATHS ${CUDA_LIBDIR} ${CUDA_LIBDIR}/stubs
-    REQUIRED
-)
-
-# Link libraries
-target_link_libraries(parcagpucupti PRIVATE
-    ${CUPTI_LIBRARY}
-    ${CUDA_LIBRARY}
-)
-
-# Installation
-install(TARGETS parcagpucupti
-    LIBRARY DESTINATION lib
-)
\ No newline at end of file
diff --git a/cupti/cupti-prof.c b/cupti/cupti-prof.c
deleted file mode 100644
index 11b61b0..0000000
--- a/cupti/cupti-prof.c
+++ /dev/null
@@ -1,575 +0,0 @@
-#define _POSIX_C_SOURCE 199309L
-#include <execinfo.h>
-#include <stdbool.h>
-#include <stdint.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <sys/sdt.h>
-#include <time.h>
-#include <unistd.h>
-
-#include <cupti.h>
-
-#include "correlation_filter.h"
-
-// Debug logging control
-static bool debug_enabled = false;
-
-// Activity buffer management
-// A kernel activity is around 224 bytes so a 128kb buffer
-// will hold ~500 activities, we want to flush regularly since
-// we are a continuous profiler so we don't need a huge buffer
-// like most CUPTI profilers.  Also a small size avoid malloc
-// just going to mmap every time so the allocator should cache
-// and re-use these for us.
-static size_t activityBufferSize = 128 * 1024;
-
-// Global variables
-static CUpti_SubscriberHandle subscriber = 0;
-
-static size_t outstandingEvents = 0;
-
-// Thread-local tracking: store correlation ID from runtime ENTER
-// so we can skip driver EXIT probe when it matches (driver calls happen under
-// runtime calls)
-static __thread uint32_t runtimeEnterCorrelationId = 0;
-
-// Rate limiting - token bucket algorithm (configurable via PARCAGPU_RATE_LIMIT)
-static double rateLimitPerSec = 100.0;
-
-// Thread-local token bucket state
-static __thread uint64_t lastRefillNs = 0;
-static __thread double tokens = 0;
-
-// Returns true if the sample should be emitted, false if rate limited
-static bool rateLimiterTryAcquire(uint64_t nowNs) {
-  // Refill tokens based on elapsed time
-  if (lastRefillNs > 0) {
-    double elapsedSec = (nowNs - lastRefillNs) / 1e9;
-    tokens = tokens + elapsedSec * rateLimitPerSec;
-    if (tokens > rateLimitPerSec) {
-      tokens = rateLimitPerSec;
-    }
-  } else {
-    tokens = rateLimitPerSec; // Start with full bucket
-  }
-  lastRefillNs = nowNs;
-
-  if (tokens >= 1.0) {
-    tokens -= 1.0;
-    return true;
-  }
-  return false;
-}
-
-// Correlation ID filter (for regular kernel launches)
-static CorrelationFilterHandle correlationFilter = NULL;
-
-// Graph correlation map (for graph launches with multiple kernels per
-// correlation ID)
-static GraphCorrelationMapHandle graphCorrelationMap = NULL;
-
-// Buffer processing cycle counter (for graph map state machine)
-static uint32_t bufferCycle = 0;
-
-static void init_debug(void) {
-  static bool initialized = false;
-  if (!initialized) {
-    debug_enabled = getenv("PARCAGPU_DEBUG") != NULL;
-    const char *rateEnv = getenv("PARCAGPU_RATE_LIMIT");
-    if (rateEnv != NULL) {
-      double rate = atof(rateEnv);
-      if (rate > 0) {
-        rateLimitPerSec = rate;
-      }
-    }
-    initialized = true;
-  }
-}
-
-#define DEBUG_PRINTF(...)                                                      \
-  do {                                                                         \
-    init_debug();                                                              \
-    if (debug_enabled) {                                                       \
-      struct timespec ts;                                                      \
-      clock_gettime(CLOCK_REALTIME, &ts);                                      \
-      printf("[%ld.%09ld] ", ts.tv_sec, ts.tv_nsec);                           \
-      printf(__VA_ARGS__);                                                     \
-    }                                                                          \
-  } while (0)
-
-// Forward declarations
-static void parcagpuCuptiCallback(void *userdata, CUpti_CallbackDomain domain,
-                                  CUpti_CallbackId cbid,
-                                  const CUpti_CallbackData *cbdata);
-static void parcagpuBufferRequested(uint8_t **buffer, size_t *size,
-                                    size_t *maxNumRecords);
-static void parcagpuBufferCompleted(CUcontext ctx, uint32_t streamId,
-                                    uint8_t *buffer, size_t size,
-                                    size_t validSize);
-
-void cleanup(void);
-
-// CUPTI initialization function required for CUDA_INJECTION64_PATH
-int InitializeInjection(void) {
-  DEBUG_PRINTF("[CUPTI] InitializeInjection called\n");
-  CUptiResult result;
-
-  // Set flush period BEFORE enabling activities (in milliseconds)
-  // Try a larger value like 1000ms (1 second) for better compatibility
-  result = cuptiActivityFlushPeriod(1000);
-  if (result != CUPTI_SUCCESS) {
-    const char *errstr;
-    cuptiGetResultString(result, &errstr);
-    fprintf(stderr, "[CUPTI] Failed to set activity flush period: %s\n",
-            errstr);
-  } else {
-    DEBUG_PRINTF("[CUPTI] Set activity flush period to 1000ms\n");
-  }
-
-  // Try to subscribe to callbacks
-  result = cuptiSubscribe(&subscriber,
-                          (CUpti_CallbackFunc)parcagpuCuptiCallback, NULL);
-  if (result != CUPTI_SUCCESS) {
-    const char *errstr;
-    cuptiGetResultString(result, &errstr);
-    fprintf(stderr, "[CUPTI] Failed to subscribe to callbacks: %s\n", errstr);
-    return 1; // Still return success to not break the injection
-  }
-
-  // Enable all runtime API kernel launch callbacks
-  CUpti_CallbackId runtimeCallbacks[] = {
-      CUPTI_RUNTIME_TRACE_CBID_cudaLaunch_v3020,
-      CUPTI_RUNTIME_TRACE_CBID_cudaLaunchKernel_v7000,
-      CUPTI_RUNTIME_TRACE_CBID_cudaLaunch_ptsz_v7000,
-      CUPTI_RUNTIME_TRACE_CBID_cudaLaunchKernel_ptsz_v7000,
-      CUPTI_RUNTIME_TRACE_CBID_cudaLaunchKernelExC_v11060,
-      CUPTI_RUNTIME_TRACE_CBID_cudaLaunchKernelExC_ptsz_v11060,
-      CUPTI_RUNTIME_TRACE_CBID_cudaLaunchCooperativeKernel_v9000,
-      CUPTI_RUNTIME_TRACE_CBID_cudaLaunchCooperativeKernel_ptsz_v9000,
-      CUPTI_RUNTIME_TRACE_CBID_cudaLaunchCooperativeKernelMultiDevice_v9000,
-      CUPTI_RUNTIME_TRACE_CBID_cudaGraphLaunch_v10000,
-      CUPTI_RUNTIME_TRACE_CBID_cudaGraphLaunch_ptsz_v10000,
-  };
-  for (size_t i = 0; i < sizeof(runtimeCallbacks) / sizeof(runtimeCallbacks[0]);
-       i++) {
-    result = cuptiEnableCallback(1, subscriber, CUPTI_CB_DOMAIN_RUNTIME_API,
-                                 runtimeCallbacks[i]);
-    if (result != CUPTI_SUCCESS) {
-      const char *errstr;
-      cuptiGetResultString(result, &errstr);
-      fprintf(stderr, "[CUPTI] Failed to enable runtime callback %d: %s\n",
-              runtimeCallbacks[i], errstr);
-    }
-  }
-
-  // Enable all driver API kernel launch callbacks
-  CUpti_CallbackId driverCallbacks[] = {
-      CUPTI_DRIVER_TRACE_CBID_cuLaunch,
-      CUPTI_DRIVER_TRACE_CBID_cuLaunchGrid,
-      CUPTI_DRIVER_TRACE_CBID_cuLaunchGridAsync,
-      CUPTI_DRIVER_TRACE_CBID_cuLaunchKernel,
-      CUPTI_DRIVER_TRACE_CBID_cuLaunchKernel_ptsz,
-      CUPTI_DRIVER_TRACE_CBID_cuLaunchKernelEx,
-      CUPTI_DRIVER_TRACE_CBID_cuLaunchKernelEx_ptsz,
-      CUPTI_DRIVER_TRACE_CBID_cuLaunchCooperativeKernel,
-      CUPTI_DRIVER_TRACE_CBID_cuLaunchCooperativeKernel_ptsz,
-      CUPTI_DRIVER_TRACE_CBID_cuLaunchCooperativeKernelMultiDevice,
-      CUPTI_DRIVER_TRACE_CBID_cuGraphLaunch,
-      CUPTI_DRIVER_TRACE_CBID_cuGraphLaunch_ptsz,
-  };
-  for (size_t i = 0; i < sizeof(driverCallbacks) / sizeof(driverCallbacks[0]);
-       i++) {
-    result = cuptiEnableCallback(1, subscriber, CUPTI_CB_DOMAIN_DRIVER_API,
-                                 driverCallbacks[i]);
-    if (result != CUPTI_SUCCESS) {
-      const char *errstr;
-      cuptiGetResultString(result, &errstr);
-      fprintf(stderr, "[CUPTI] Failed to enable driver callback %d: %s\n",
-              driverCallbacks[i], errstr);
-    }
-  }
-
-  // Register activity buffer callbacks
-  result = cuptiActivityRegisterCallbacks(parcagpuBufferRequested,
-                                          parcagpuBufferCompleted);
-  if (result != CUPTI_SUCCESS) {
-    const char *errstr;
-    cuptiGetResultString(result, &errstr);
-    fprintf(stderr, "[CUPTI] Failed to register activity callbacks: %s\n",
-            errstr);
-    return 1; // Still return success to not break the injection
-  }
-
-  result = cuptiActivityEnable(CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL);
-  if (result != CUPTI_SUCCESS) {
-    const char *errstr;
-    cuptiGetResultString(result, &errstr);
-    fprintf(stderr, "[CUPTI] Failed to enable concurrent kernel activity: %s\n",
-            errstr);
-  } else {
-    DEBUG_PRINTF("[CUPTI] Enabled CONCURRENT_KERNEL activity\n");
-  }
-
-  // Create correlation filter
-  correlationFilter = correlation_filter_create();
-  if (correlationFilter) {
-    DEBUG_PRINTF("[CUPTI] Correlation filter created and enabled\n");
-  } else {
-    fprintf(stderr, "[CUPTI] Warning: Failed to create correlation filter\n");
-  }
-
-  // Create graph correlation map
-  graphCorrelationMap = graph_correlation_map_create();
-  if (graphCorrelationMap) {
-    DEBUG_PRINTF("[CUPTI] Graph correlation map created and enabled\n");
-  } else {
-    fprintf(stderr,
-            "[CUPTI] Warning: Failed to create graph correlation map\n");
-  }
-
-  atexit(cleanup);
-
-  DEBUG_PRINTF("[CUPTI] Successfully initialized CUPTI callbacks with external "
-               "correlation and activity API\n");
-
-  return 1;
-}
-
-// Helper function to print stack trace
-static void print_backtrace(const char *prefix) {
-  void *array[20];
-  size_t size;
-  char **strings;
-
-  size = backtrace(array, 20);
-  strings = backtrace_symbols(array, size);
-
-  if (strings != NULL) {
-    printf("%s Stack trace (%zu frames):\n", prefix, size);
-    for (size_t i = 0; i < size; i++) {
-      printf("  [%zu] %s\n", i, strings[i]);
-    }
-    free(strings);
-  }
-}
-
-// Callback handler for driver and runtime API
-static void parcagpuCuptiCallback(void *userdata, CUpti_CallbackDomain domain,
-                                  CUpti_CallbackId cbid,
-                                  const CUpti_CallbackData *cbdata) {
-  uint32_t correlationId = cbdata->correlationId;
-
-  // Track runtime ENTER so we can skip driver EXIT when they match
-  if (domain == CUPTI_CB_DOMAIN_RUNTIME_API &&
-      cbdata->callbackSite == CUPTI_API_ENTER) {
-    runtimeEnterCorrelationId = correlationId;
-    DEBUG_PRINTF("[CUPTI] Runtime API ENTER: correlationId=%u\n",
-                 correlationId);
-    return;
-  }
-
-  // We hook on EXIT because that makes our probe overhead not add to GPU
-  // launch latency and hopefully covers some of the overhead in the shadow of
-  // GPU async work.
-  if (cbdata->callbackSite != CUPTI_API_EXIT) {
-    if (cbdata->callbackSite == CUPTI_API_ENTER &&
-        domain == CUPTI_CB_DOMAIN_DRIVER_API) {
-      DEBUG_PRINTF(
-          "[CUPTI] Driver API ENTER: correlationId=%u (will check on EXIT)\n",
-          correlationId);
-    }
-    return;
-  }
-
-  const char *name =
-      cbdata->symbolName ? cbdata->symbolName : cbdata->functionName;
-  int signedCbid;
-
-  if (domain == CUPTI_CB_DOMAIN_DRIVER_API) {
-    // Skip if this driver call is under a runtime call (same correlation ID)
-    if (correlationId == runtimeEnterCorrelationId) {
-      DEBUG_PRINTF("[CUPTI] Skipping driver EXIT correlationId=%u "
-                   "(runtimeEnter=%u) - runtime "
-                   "will handle\n",
-                   correlationId, runtimeEnterCorrelationId);
-      return;
-    }
-    // Pure driver call (no runtime wrapper) - use negative cbid
-    signedCbid = -(int)cbid;
-    DEBUG_PRINTF("[CUPTI] Driver API EXIT callback: cbid=%d, correlationId=%u, "
-                 "runtimeEnter=%u, func=%s\n",
-                 cbid, correlationId, runtimeEnterCorrelationId, name);
-  } else if (domain == CUPTI_CB_DOMAIN_RUNTIME_API) {
-    signedCbid = (int)cbid;
-    DEBUG_PRINTF("[CUPTI] Runtime API EXIT callback: cbid=%d, "
-                 "correlationId=%u, runtimeEnter=%u, func=%s\n",
-                 cbid, correlationId, runtimeEnterCorrelationId, name);
-    runtimeEnterCorrelationId = 0; // Clear after use
-  } else {
-    return;
-  }
-
-  // Rate limit probes
-  struct timespec ts;
-  clock_gettime(CLOCK_MONOTONIC, &ts);
-  uint64_t nowNs = (uint64_t)ts.tv_sec * 1000000000ULL + ts.tv_nsec;
-  if (!rateLimiterTryAcquire(nowNs)) {
-    DEBUG_PRINTF("[CUPTI] Rate limited: skipping probe for correlationId=%u\n",
-                 correlationId);
-    return;
-  }
-
-  outstandingEvents++;
-  DTRACE_PROBE3(parcagpu, cuda_correlation, correlationId, signedCbid, name);
-
-  // Detect graph launches by callback ID
-  bool is_graph_launch =
-      (cbid == CUPTI_RUNTIME_TRACE_CBID_cudaGraphLaunch_v10000) ||
-      (cbid == CUPTI_RUNTIME_TRACE_CBID_cudaGraphLaunch_ptsz_v10000) ||
-      (cbid == CUPTI_DRIVER_TRACE_CBID_cuGraphLaunch) ||
-      (cbid == CUPTI_DRIVER_TRACE_CBID_cuGraphLaunch_ptsz);
-
-  // Insert into appropriate map based on launch type
-  if (is_graph_launch) {
-    // Graph launch - will generate multiple kernels with same correlation ID
-    if (graphCorrelationMap) {
-      graph_correlation_map_insert(graphCorrelationMap, correlationId);
-      DEBUG_PRINTF(
-          "[CUPTI] Inserted correlationId=%u into graph map (size=%zu)\n",
-          correlationId, graph_correlation_map_size(graphCorrelationMap));
-    }
-  } else {
-    // Regular kernel launch - single kernel per correlation ID
-    if (correlationFilter) {
-      correlation_filter_insert(correlationFilter, correlationId);
-      DEBUG_PRINTF("[CUPTI] Inserted correlationId=%u into filter (size=%zu)\n",
-                   correlationId, correlation_filter_size(correlationFilter));
-    }
-  }
-
-  // If we let too many events pile up it overwhelms the perf_event buffers,
-  // just another reason to explore just passing the activity buffer through to
-  // eBPF.
-  if (outstandingEvents > 3000) {
-    DEBUG_PRINTF("[CUPTI] Flushing: outstandingEvents=%zu\n",
-                 outstandingEvents);
-    cuptiActivityFlushAll(0);
-    outstandingEvents = 0;
-  }
-}
-
-// Out-of-line USDT probe site for activity batches.
-// Single call site ensures one probe location in the ELF .note.stapsdt section.
-__attribute__((noinline)) void parcagpuActivityBatch(const void **ptrs,
-                                                     uint32_t count) {
-  DTRACE_PROBE2(parcagpu, activity_batch, ptrs, count);
-}
-
-// Buffer request callback
-static void parcagpuBufferRequested(uint8_t **buffer, size_t *size,
-                                    size_t *maxNumRecords) {
-  *buffer = (uint8_t *)aligned_alloc(8, activityBufferSize);
-  *size = activityBufferSize;
-  *maxNumRecords = 0; // Let CUPTI decide
-
-  DEBUG_PRINTF("[CUPTI:bufferRequested] Allocated buffer %p, size=%zu\n",
-               *buffer, *size);
-}
-
-// Buffer completion callback
-static void parcagpuBufferCompleted(CUcontext ctx, uint32_t streamId,
-                                    uint8_t *buffer, size_t size,
-                                    size_t validSize) {
-  CUptiResult result;
-  CUpti_Activity *record = NULL;
-  int recordCount = 0;
-  static int calls = 0;
-
-  // Batch probe: collect pointers to activity records and pass them to
-  // BPF/USDT every ACTIVITY_BATCH_SIZE records. Stack-allocated array
-  // of pointers — no heap allocation, no copying, version-independent.
-  // BPF consumers filter by activity kind (kernel, memcpy, etc.).
-#define ACTIVITY_BATCH_SIZE 128
-  const void *batchPtrs[ACTIVITY_BATCH_SIZE];
-  uint32_t batchCount = 0;
-
-  DEBUG_PRINTF("[CUPTI] bufferCompleted called: buffer=%p validSize=%zu (%d)\n",
-               buffer, validSize, calls++);
-
-  // Start new cycle for graph correlation map
-  uint32_t currentCycle = bufferCycle++;
-  if (graphCorrelationMap) {
-    graph_correlation_map_cycle_start(graphCorrelationMap, currentCycle);
-    DEBUG_PRINTF("[CUPTI] Started graph correlation map cycle %u\n",
-                 currentCycle);
-  }
-
-  while (1) {
-    result = cuptiActivityGetNextRecord(buffer, validSize, &record);
-    if (result == CUPTI_ERROR_MAX_LIMIT_REACHED) {
-      break;
-    } else if (result != CUPTI_SUCCESS) {
-      const char *errstr;
-      cuptiGetResultString(result, &errstr);
-      fprintf(stderr, "[CUPTI] Error reading activity record: %s\n", errstr);
-      break;
-    }
-
-    recordCount++;
-    if (record->kind == CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL ||
-        record->kind == CUPTI_ACTIVITY_KIND_KERNEL) {
-      CUpti_ActivityKernel5 *k = (CUpti_ActivityKernel5 *)record;
-
-      DEBUG_PRINTF("[CUPTI] Kernel activity: graphId=%u graphNodeId=%lu "
-                   "name=%s, correlationId=%u, deviceId=%u, "
-                   "streamId=%u, start=%lu, end=%lu, duration=%lu ns\n",
-                   k->graphId, k->graphNodeId, k->name, k->correlationId,
-                   k->deviceId, k->streamId, k->start, k->end,
-                   k->end - k->start);
-
-      // Route to appropriate map based on whether this is a graph kernel
-      bool should_fire = true;
-      if (k->graphId != 0) {
-        // Graph kernel - check graph correlation map
-        if (graphCorrelationMap) {
-          should_fire = graph_correlation_map_check_and_mark_seen(
-              graphCorrelationMap, k->correlationId, currentCycle);
-          if (!should_fire) {
-            DEBUG_PRINTF(
-                "[CUPTI] Filtered out graph correlationId=%u (not tracked)\n",
-                k->correlationId);
-          } else {
-            DEBUG_PRINTF("[CUPTI] Matched graph correlationId=%u - firing "
-                         "kernel_executed (map size=%zu)\n",
-                         k->correlationId,
-                         graph_correlation_map_size(graphCorrelationMap));
-          }
-        }
-      } else {
-        // Regular kernel - check regular correlation filter
-        if (correlationFilter) {
-          should_fire = correlation_filter_check_and_remove(correlationFilter,
-                                                            k->correlationId);
-          if (!should_fire) {
-            DEBUG_PRINTF(
-                "[CUPTI] Filtered out correlationId=%u (not tracked)\n",
-                k->correlationId);
-          } else {
-            DEBUG_PRINTF("[CUPTI] Matched correlationId=%u - firing "
-                         "kernel_executed (filter size=%zu)\n",
-                         k->correlationId,
-                         correlation_filter_size(correlationFilter));
-          }
-        }
-      }
-
-      // Only fire probe if correlation ID was tracked (or filters disabled)
-      if (should_fire) {
-        DTRACE_PROBE8(parcagpu, kernel_executed, k->start, k->end,
-                      k->correlationId, k->deviceId, k->streamId, k->graphId,
-                      k->graphNodeId, k->name);
-      }
-    }
-
-    // Collect pointer for batch probe (all activity kinds).
-    // BPF consumers inspect the kind field to filter types they care about.
-    batchPtrs[batchCount++] = record;
-    if (batchCount < ACTIVITY_BATCH_SIZE)
-      continue;
-
-    parcagpuActivityBatch(batchPtrs, batchCount);
-    batchCount = 0;
-  }
-
-  if (batchCount > 0)
-    parcagpuActivityBatch(batchPtrs, batchCount);
-
-  DEBUG_PRINTF("[CUPTI] Processed %d activity records from buffer %p\n",
-               recordCount, buffer);
-
-  // End cycle for graph correlation map - clean up completed graph launches
-  if (graphCorrelationMap) {
-    graph_correlation_map_cycle_end(graphCorrelationMap);
-
-    size_t map_size = 0;
-    size_t oldest_age = 0;
-    graph_correlation_map_get_stats(graphCorrelationMap, &map_size,
-                                    &oldest_age);
-
-    DEBUG_PRINTF("[CUPTI] Ended graph correlation map cycle %u (map size=%zu, "
-                 "oldest_age=%zu cycles)\n",
-                 currentCycle, map_size, oldest_age);
-
-    // Log warning if we have old entries (potential leaked graph launches)
-    if (oldest_age > 50 && currentCycle % 10 == 0) {
-      DEBUG_PRINTF("[CUPTI] WARNING: Graph map has entries aged %zu cycles "
-                   "(may be dropped launches)\n",
-                   oldest_age);
-    }
-  }
-
-  // Reset to 0 rather than decrement - one API callback can produce N
-  // activities so decrementing by recordCount can cause underflow (size_t wraps
-  // to huge value)
-  outstandingEvents = 0;
-
-  // Free the buffer
-  DEBUG_PRINTF("[CUPTI] Freeing buffer %p\n", buffer);
-  free(buffer);
-
-  // Report any records dropped due to buffer overflow
-  size_t dropped;
-  result = cuptiActivityGetNumDroppedRecords(ctx, streamId, &dropped);
-  if (result == CUPTI_SUCCESS && dropped > 0) {
-    fprintf(stderr, "[CUPTI] Warning: %zu activity records dropped\n", dropped);
-  }
-}
-
-// Cleanup function (destructor disabled to prevent early cleanup)
-void cleanup(void) {
-  static bool cleanup_done = false;
-
-  // Make cleanup idempotent - safe to call multiple times
-  if (cleanup_done) {
-    return;
-  }
-  cleanup_done = true;
-
-  DEBUG_PRINTF("[CUPTI] Cleanup started\n");
-  // Flush any remaining activity records
-  cuptiActivityFlushAll(CUPTI_ACTIVITY_FLAG_FLUSH_FORCED);
-
-  // Unsubscribe from callbacks
-  if (subscriber) {
-    cuptiUnsubscribe(subscriber);
-    subscriber = 0;
-  }
-
-  // Destroy correlation filter
-  if (correlationFilter) {
-    size_t remaining = correlation_filter_size(correlationFilter);
-    if (remaining > 0) {
-      DEBUG_PRINTF(
-          "[CUPTI] Warning: %zu correlation IDs still in filter at cleanup\n",
-          remaining);
-    }
-    correlation_filter_destroy(correlationFilter);
-    correlationFilter = NULL;
-  }
-
-  // Destroy graph correlation map
-  if (graphCorrelationMap) {
-    size_t remaining = graph_correlation_map_size(graphCorrelationMap);
-    if (remaining > 0) {
-      DEBUG_PRINTF("[CUPTI] Warning: %zu correlation IDs still in graph map at "
-                   "cleanup\n",
-                   remaining);
-    }
-    graph_correlation_map_destroy(graphCorrelationMap);
-    graphCorrelationMap = NULL;
-  }
-
-  DEBUG_PRINTF("[CUPTI] Cleanup completed\n");
-}
diff --git a/cupti/toolchain-arm64.cmake b/cupti/toolchain-arm64.cmake
deleted file mode 100644
index 5542249..0000000
--- a/cupti/toolchain-arm64.cmake
+++ /dev/null
@@ -1,19 +0,0 @@
-# CMake toolchain file for cross-compiling to ARM64
-set(CMAKE_SYSTEM_NAME Linux)
-set(CMAKE_SYSTEM_PROCESSOR aarch64)
-
-# Specify the cross compiler
-set(CMAKE_C_COMPILER aarch64-linux-gnu-gcc)
-set(CMAKE_CXX_COMPILER aarch64-linux-gnu-g++)
-
-# Where to look for the target environment
-set(CMAKE_FIND_ROOT_PATH /usr/aarch64-linux-gnu)
-
-# Adjust the default behavior of the FIND_XXX() commands:
-# search headers and libraries in the target environment
-set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER)
-set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)
-set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)
-
-# Force position-independent code for shared libraries
-set(CMAKE_POSITION_INDEPENDENT_CODE ON)
diff --git a/parcagpu.bt b/parcagpu.bt
index c35ca43..757c1eb 100755
--- a/parcagpu.bt
+++ b/parcagpu.bt
@@ -1,50 +1,42 @@
 #!/usr/bin/env bpftrace
 
-/*
- * Simple CUPTI USDT Test
- * Tests the three parcagpu USDT probes
- */
-
-BEGIN
-{
-    printf("Simple CUPTI USDT Test Started\n");
-    printf("Monitoring parcagpu USDT probes...\n\n");
+BEGIN {
+    printf("Monitoring CUDA kernel executions via parcagpu USDT probes...\n");
+    printf("%-20s %-12s %-10s %-10s %-10s %-10s %-10s %-10s %s\n",
+           "TIME", "DURATION_NS", "CORR_ID", "DEVICE", "STREAM", "GRAPH", "NODE", "START", "KERNEL");
 }
 
+usdt:*:parcagpu:kernel_executed {
+    $start = arg0;
+    $end = arg1;
+    $duration = $end - $start;
+    $correlation_id = arg2;
+    $device_id = arg3;
+    $stream_id = arg4;
+    $graph_id = arg5;
+    $graph_node_id = arg6;
+    $kernel_name = str(arg7);
 
-
-// Track CUDA API correlation events
-// Arguments: correlationId (uint32), signedCbid (int32), name (char*)
-usdt:*:parcagpu:cuda_correlation
-{
-    $corId = (uint32)arg0;
-    $cbid = (int32)arg1;
-    printf("[%d] cuda_correlation: correlationId=%u, cbid=%d, name=%s\n",
-           pid, $corId, $cbid, str(arg2));
-    @cuda_correlations = count();
+    printf("%-20s %-12lu %-10u %-10u %-10u %-10u %-10lu %-10lu %s\n",
+           strftime("%H:%M:%S.%f", nsecs),
+           $duration,
+           $correlation_id,
+           $device_id,
+           $stream_id,
+           $graph_id,
+           $graph_node_id,
+           $start,
+           $kernel_name);
 }
 
-// Track kernel executions
-// Arguments: start, end, correlationId, deviceId, streamId, name (pointer)
-usdt:*:parcagpu:kernel_executed
-{
-    $start = arg0;
-    $end = arg1;
-    printf("[%d] Kernel executed:\n", pid);
-    printf("  start=%lu, end=%lu, duration=%lu ns\n", $start, $end, $end - $start);
-    $devId = arg2 >>32;
-    $corId = (uint64)arg2 & 0xFFFFFFFF;		
-    printf("  correlationId=%u, deviceId=%u, streamId=%u\n",
-           $corId, $devId, arg3);
-    printf("  name=%s\n", str(arg4));
-    @kernel_executions = count();
+usdt:*:parcagpu:cuda_correlation {
+    $correlation_id = arg0;
+    $cbid = arg1;
+    $name = str(arg2);
+
+    printf("%-20s [CORR] %u: cbid=%u %s\n", strftime("%H:%M:%S.%f", nsecs),$correlation_id, $cbid, $name);
 }
 
-END
-{
-    printf("\n=== Summary ===\n");
-    printf("CUDA correlations: ");
-    print(@cuda_correlations);
-    printf("Kernel executions: ");
-    print(@kernel_executions);
+END {
+    printf("\nMonitoring complete.\n");
 }
diff --git a/proton b/proton
new file mode 160000
index 0000000..b0ffb66
--- /dev/null
+++ b/proton
@@ -0,0 +1 @@
+Subproject commit b0ffb664cef804328c872bf02f7961de660c18a9
diff --git a/src/correlation_filter.cpp b/src/correlation_filter.cpp
new file mode 100644
index 0000000..c246c77
--- /dev/null
+++ b/src/correlation_filter.cpp
@@ -0,0 +1,97 @@
+#include "correlation_filter.h"
+
+namespace parcagpu {
+
+//=============================================================================
+// CorrelationFilter implementation
+//=============================================================================
+
+void CorrelationFilter::insert(uint32_t correlation_id) {
+  std::lock_guard<std::mutex> lock(mutex_);
+  set_.insert(correlation_id);
+}
+
+bool CorrelationFilter::check_and_remove(uint32_t correlation_id) {
+  std::lock_guard<std::mutex> lock(mutex_);
+  auto it = set_.find(correlation_id);
+  if (it != set_.end()) {
+    set_.erase(it);
+    return true;
+  }
+  return false;
+}
+
+size_t CorrelationFilter::size() const {
+  std::lock_guard<std::mutex> lock(mutex_);
+  return set_.size();
+}
+
+//=============================================================================
+// GraphCorrelationEntry implementation
+//=============================================================================
+
+GraphCorrelationEntry::GraphCorrelationEntry(uint32_t cycle)
+    : state{GRAPH_STATE_UNINITIALIZED, GRAPH_STATE_UNINITIALIZED},
+      ever_seen_kernel(false), insertion_cycle(cycle) {}
+
+//=============================================================================
+// GraphCorrelationMap implementation
+//=============================================================================
+
+GraphCorrelationMap::GraphCorrelationMap() : current_cycle_(0) {}
+
+void GraphCorrelationMap::insert(uint32_t correlation_id) {
+  std::lock_guard<std::mutex> lock(mutex_);
+  map_.emplace(correlation_id, GraphCorrelationEntry(current_cycle_));
+}
+
+void GraphCorrelationMap::cycle_start(uint32_t cycle) {
+  std::lock_guard<std::mutex> lock(mutex_);
+  current_cycle_ = cycle;
+  uint32_t slot = cycle % 2;
+  for (auto &pair : map_) {
+    pair.second.state[slot] = GRAPH_STATE_CYCLE_CLEARED;
+  }
+}
+
+bool GraphCorrelationMap::check_and_mark_seen(uint32_t correlation_id,
+                                               uint32_t cycle) {
+  std::lock_guard<std::mutex> lock(mutex_);
+  auto it = map_.find(correlation_id);
+  if (it != map_.end()) {
+    uint32_t slot = cycle % 2;
+    it->second.state[slot] = GRAPH_STATE_KERNEL_SEEN;
+    it->second.ever_seen_kernel = true;
+    return true;
+  }
+  return false;
+}
+
+void GraphCorrelationMap::cycle_end() {
+  std::lock_guard<std::mutex> lock(mutex_);
+  for (auto it = map_.begin(); it != map_.end();) {
+    bool should_remove = false;
+    if (it->second.state[0] == GRAPH_STATE_CYCLE_CLEARED &&
+        it->second.state[1] == GRAPH_STATE_CYCLE_CLEARED) {
+      if (it->second.ever_seen_kernel) {
+        // Graph completed normally
+        should_remove = true;
+      } else if ((current_cycle_ - it->second.insertion_cycle) > 100) {
+        // Fallback: never saw kernels and entry is very old
+        should_remove = true;
+      }
+    }
+    if (should_remove) {
+      it = map_.erase(it);
+    } else {
+      ++it;
+    }
+  }
+}
+
+size_t GraphCorrelationMap::size() const {
+  std::lock_guard<std::mutex> lock(mutex_);
+  return map_.size();
+}
+
+} // namespace parcagpu
diff --git a/src/correlation_filter.h b/src/correlation_filter.h
new file mode 100644
index 0000000..953733f
--- /dev/null
+++ b/src/correlation_filter.h
@@ -0,0 +1,60 @@
+#pragma once
+
+#include <cstdint>
+#include <mutex>
+#include <unordered_map>
+#include <unordered_set>
+
+namespace parcagpu {
+
+//=============================================================================
+// Correlation Filter - tracks which kernel launches we've sampled
+//=============================================================================
+
+// CorrelationFilter for non-graph kernel launches
+// Insert on API callback, check-and-remove on kernel activity
+class CorrelationFilter {
+public:
+  void insert(uint32_t correlation_id);
+  bool check_and_remove(uint32_t correlation_id);
+  size_t size() const;
+
+private:
+  std::unordered_set<uint32_t> set_;
+  mutable std::mutex mutex_;
+};
+
+// Graph correlation state values
+enum GraphCorrelationState {
+  GRAPH_STATE_UNINITIALIZED = 0, // Entry just created, slot not yet processed
+  GRAPH_STATE_CYCLE_CLEARED = 1, // Cycle started, no kernels seen yet
+  GRAPH_STATE_KERNEL_SEEN = 2    // At least one kernel seen this cycle
+};
+
+// GraphCorrelationMap for graph launches (multiple kernels per launch)
+// Uses 2-slot state machine to detect when all kernels from a graph have arrived
+struct GraphCorrelationEntry {
+  uint8_t state[2];         // State for alternating cycles
+  bool ever_seen_kernel;    // True once we've seen at least one kernel activity
+  uint32_t insertion_cycle; // Buffer cycle when entry was created
+
+  GraphCorrelationEntry(uint32_t cycle);
+};
+
+class GraphCorrelationMap {
+public:
+  GraphCorrelationMap();
+
+  void insert(uint32_t correlation_id);
+  void cycle_start(uint32_t cycle);
+  bool check_and_mark_seen(uint32_t correlation_id, uint32_t cycle);
+  void cycle_end();
+  size_t size() const;
+
+private:
+  std::unordered_map<uint32_t, GraphCorrelationEntry> map_;
+  uint32_t current_cycle_;
+  mutable std::mutex mutex_;
+};
+
+} // namespace parcagpu
diff --git a/src/parcagpu_cupti.cpp b/src/parcagpu_cupti.cpp
new file mode 100644
index 0000000..839fde6
--- /dev/null
+++ b/src/parcagpu_cupti.cpp
@@ -0,0 +1,424 @@
+#include <array>
+#include <atomic>
+#include <cstdint>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <memory>
+#include <sys/sdt.h>
+#include <time.h>
+#include <unistd.h>
+
+// Include proton headers
+#include "Driver/GPU/CuptiApi.h"
+#include "Profiler/Cupti/CuptiCallbacks.h"
+#include "Utility/Singleton.h"
+#include "correlation_filter.h"
+#include "parcagpu_pc_sampling.h"
+
+#include <cupti.h>
+
+namespace parcagpu {
+
+// Debug logging control
+bool debug_enabled = false;
+bool limiter_disabled = false;
+
+// Global correlation tracking instances
+static CorrelationFilter g_correlationFilter;
+static GraphCorrelationMap g_graphCorrelationMap;
+static std::atomic<uint32_t> g_bufferCycle{0};
+
+// Thread-local tracking: store correlation ID from runtime ENTER
+// so we can skip driver EXIT probe when it matches (driver calls happen under runtime calls)
+thread_local uint32_t runtimeEnterCorrelationId = 0;
+
+// Thread-local rate limiting
+thread_local uint64_t lastProbeTimeNs = 0;
+
+void init_debug() {
+  static bool initialized = false;
+  if (!initialized) {
+    debug_enabled = getenv("PARCAGPU_DEBUG") != nullptr;
+    limiter_disabled = getenv("PARCAGPU_LIMITER_DISABLE") != nullptr;
+    initialized = true;
+  }
+}
+
+#define DEBUG_PRINTF(...)                                                      \
+  do {                                                                         \
+    parcagpu::init_debug();                                                    \
+    if (parcagpu::debug_enabled) {                                             \
+      struct timespec ts;                                                      \
+      clock_gettime(CLOCK_REALTIME, &ts);                                      \
+      fprintf(stderr, "[%ld.%09ld] ", ts.tv_sec, ts.tv_nsec);                  \
+      fprintf(stderr, __VA_ARGS__);                                            \
+    }                                                                          \
+  } while (0)
+
+// Simplified profiler using Proton's patterns
+class CuptiProfiler : public proton::Singleton<CuptiProfiler> {
+public:
+  CuptiProfiler() {
+    DEBUG_PRINTF("[PARCAGPU] Initializing ParcaGPUProfiler\n");
+  }
+
+  ~CuptiProfiler() { cleanup(); }
+
+  bool initialize() {
+    if (initialized.exchange(true)) {
+      return true; // Already initialized
+    }
+
+    DEBUG_PRINTF("[PARCAGPU] Starting initialization\n");
+
+    // Check if PC sampling is supported
+    pcSamplingEnabled = parcagpu::PCSampling::isSupported();
+    if (pcSamplingEnabled) {
+      DEBUG_PRINTF("[PARCAGPU] PC sampling enabled (continuous mode)\n");
+    } else {
+      DEBUG_PRINTF("[PARCAGPU] PC sampling disabled, using kernel activity only\n");
+    }
+
+    // Subscribe to callbacks
+    auto result = proton::cupti::subscribe<true>(&subscriber, callbackHandler,
+                                            nullptr);
+    if (result != CUPTI_SUCCESS) {
+      DEBUG_PRINTF("[PARCAGPU] Failed to subscribe to callbacks: error %d\n", result);
+      return false;
+    }
+
+    // Enable runtime and driver API callbacks (using Proton's utilities)
+    proton::setRuntimeCallbacks(subscriber, /*enable=*/true);
+    proton::setDriverCallbacks(subscriber, /*enable=*/true);
+
+    // Enable resource callbacks only if PC sampling is enabled
+    if (pcSamplingEnabled) {
+      proton::setResourceCallbacks(subscriber, /*enable=*/true);
+    }
+
+    // Register activity buffer callbacks (using Proton's pattern)
+    result = proton::cupti::activityRegisterCallbacks<true>(allocBuffer,
+                                                            completeBuffer);
+    if (result != CUPTI_SUCCESS) {
+      DEBUG_PRINTF("[PARCAGPU] Failed to register activity callbacks: error %d\n", result);
+      return false;
+    }
+
+    // Enable kernel activity recording
+    result = proton::cupti::activityEnable<true>(
+        CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL);
+    if (result != CUPTI_SUCCESS) {
+      DEBUG_PRINTF("[PARCAGPU] Failed to enable concurrent kernel activity: error %d\n", result);
+    } else {
+      DEBUG_PRINTF("[PARCAGPU] Enabled CONCURRENT_KERNEL activity\n");
+    }
+
+    DEBUG_PRINTF("[PARCAGPU] Successfully initialized CUPTI callbacks\n");
+    return true;
+  }
+
+  void cleanup() {
+    if (!initialized.exchange(false)) {
+      return; // Already cleaned up
+    }
+
+    DEBUG_PRINTF("[PARCAGPU] Cleanup started\n");
+
+    // Flush any remaining activity records
+    proton::cupti::activityFlushAll<true>(CUPTI_ACTIVITY_FLAG_FLUSH_FORCED);
+
+    // Disable activity recording
+    proton::cupti::activityDisable<true>(CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL);
+
+    // Disable all callbacks (using Proton's utilities)
+    if (subscriber) {
+      proton::setRuntimeCallbacks(subscriber, /*enable=*/false);
+      proton::setDriverCallbacks(subscriber, /*enable=*/false);
+      if (pcSamplingEnabled) {
+        proton::setResourceCallbacks(subscriber, /*enable=*/false);
+      }
+      proton::cupti::unsubscribe<true>(subscriber);
+      subscriber = nullptr;
+    }
+
+    DEBUG_PRINTF("[PARCAGPU] Cleanup completed\n");
+  }
+
+private:
+  std::atomic<bool> initialized{false};
+  bool pcSamplingEnabled = false;
+  CUpti_SubscriberHandle subscriber = nullptr;
+
+  // Outstanding event counter for flushing
+  size_t outstandingEvents = 0;
+
+  // Buffer management - using Proton's pattern (static methods)
+  // A kernel activity is around 224 bytes so a 128kb buffer
+  // will hold ~500 activities, we want to flush regularly since
+  // we are a continuous profiler so we don't need a huge buffer
+  // like most CUPTI profilers.  Also a small size avoids malloc
+  // just going to mmap every time so the allocator should cache
+  // and re-use these for us.
+  static constexpr size_t AlignSize = 8;
+  static constexpr size_t BufferSize = 128 * 1024;
+
+  static void allocBuffer(uint8_t **buffer, size_t *bufferSize,
+                          size_t *maxNumRecords) {
+    *buffer = static_cast<uint8_t *>(aligned_alloc(AlignSize, BufferSize));
+    if (*buffer == nullptr) {
+      DEBUG_PRINTF("[PARCAGPU] ERROR: aligned_alloc failed\n");
+      return;
+    }
+    *bufferSize = BufferSize;
+    *maxNumRecords = 0;
+    DEBUG_PRINTF("[PARCAGPU:allocBuffer] Allocated buffer at %p size %zu\n",
+                 *buffer, *bufferSize);
+  }
+
+  static void completeBuffer(CUcontext ctx, uint32_t streamId,
+                              uint8_t *buffer, size_t size, size_t validSize) {
+    CUpti_Activity *record = nullptr;
+    int recordCount = 0;
+    int filteredCount = 0;
+
+    DEBUG_PRINTF(
+        "[PARCAGPU] completeBuffer called: buffer=%p validSize=%zu\n",
+        buffer, validSize);
+
+    // Start a new buffer cycle for graph correlation tracking
+    uint32_t cycle = g_bufferCycle.fetch_add(1);
+    g_graphCorrelationMap.cycle_start(cycle);
+
+    while (true) {
+      CUptiResult result =
+          proton::cupti::activityGetNextRecord<false>(buffer, validSize, &record);
+      if (result == CUPTI_ERROR_MAX_LIMIT_REACHED) {
+        break;
+      } else if (result != CUPTI_SUCCESS) {
+        DEBUG_PRINTF("[PARCAGPU] Error reading activity record: error %d\n", result);
+        break;
+      }
+
+      recordCount++;
+      switch (record->kind) {
+      case CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL:
+      case CUPTI_ACTIVITY_KIND_KERNEL: {
+        auto *k = reinterpret_cast<CUpti_ActivityKernel5 *>(record);
+
+        // Check correlation filter - only emit probe if this kernel was sampled
+        bool shouldEmit = false;
+        if (k->graphId != 0) {
+          // Graph kernel - check graph correlation map
+          shouldEmit = g_graphCorrelationMap.check_and_mark_seen(k->correlationId, cycle);
+        } else {
+          // Regular kernel - check and remove from correlation filter
+          shouldEmit = g_correlationFilter.check_and_remove(k->correlationId);
+        }
+
+        if (!shouldEmit) {
+          filteredCount++;
+          DEBUG_PRINTF("[PARCAGPU] Filtered kernel activity: correlationId=%u "
+                       "graphId=%u (not in filter)\n",
+                       k->correlationId, k->graphId);
+          break;
+        }
+
+        DEBUG_PRINTF("[PARCAGPU] Kernel activity: graphId=%u graphNodeId=%lu "
+                     "name=%s, correlationId=%u, deviceId=%u, "
+                     "streamId=%u, start=%lu, end=%lu, duration=%lu ns\n",
+                     k->graphId, k->graphNodeId, k->name, k->correlationId,
+                     k->deviceId, k->streamId, k->start, k->end,
+                     k->end - k->start);
+
+        // Emit USDT probe for kernel execution
+        STAP_PROBE8(parcagpu, kernel_executed, k->start, k->end,
+                    k->correlationId, k->deviceId, k->streamId, k->graphId,
+                    k->graphNodeId, k->name);
+        break;
+      }
+      default:
+        DEBUG_PRINTF("[PARCAGPU] Activity record %d: kind=%d\n", recordCount,
+                     record->kind);
+        break;
+      }
+    }
+
+    // End cycle - cleanup completed graph entries
+    g_graphCorrelationMap.cycle_end();
+
+    DEBUG_PRINTF("[PARCAGPU] Processed %d activity records (%d filtered) from buffer %p\n",
+                 recordCount, filteredCount, buffer);
+
+    // Free the buffer (Proton's pattern)
+    std::free(buffer);
+  }
+
+  static void callbackHandler(void *userdata, CUpti_CallbackDomain domain,
+                              CUpti_CallbackId cbid,
+                              const void *cbdata_void) {
+    auto &profiler = CuptiProfiler::instance();
+
+    if (domain == CUPTI_CB_DOMAIN_RESOURCE) {
+      // Handle resource callbacks for PC sampling (only if enabled)
+      if (!profiler.pcSamplingEnabled) {
+        return;
+      }
+
+      const CUpti_ResourceData *resData =
+          static_cast<const CUpti_ResourceData *>(cbdata_void);
+
+      switch (cbid) {
+      case CUPTI_CBID_RESOURCE_MODULE_LOADED: {
+        const CUpti_ModuleResourceData *modData =
+            static_cast<const CUpti_ModuleResourceData *>(resData->resourceDescriptor);
+        if (modData && modData->pCubin && modData->cubinSize > 0) {
+          DEBUG_PRINTF("[PARCAGPU] Module loaded: cubin=%p size=%zu\n",
+                       modData->pCubin, modData->cubinSize);
+          parcagpu::PCSampling::instance().loadModule(
+              modData->pCubin, modData->cubinSize);
+        }
+        break;
+      }
+      case CUPTI_CBID_RESOURCE_MODULE_UNLOAD_STARTING: {
+        const CUpti_ModuleResourceData *modData =
+            static_cast<const CUpti_ModuleResourceData *>(resData->resourceDescriptor);
+        if (modData && modData->pCubin && modData->cubinSize > 0) {
+          DEBUG_PRINTF("[PARCAGPU] Module unloading: cubin=%p size=%zu\n",
+                       modData->pCubin, modData->cubinSize);
+          parcagpu::PCSampling::instance().unloadModule(
+              modData->pCubin, modData->cubinSize);
+        }
+        break;
+      }
+      case CUPTI_CBID_RESOURCE_CONTEXT_CREATED: {
+        CUcontext ctx = resData->context;
+        DEBUG_PRINTF("[PARCAGPU] Context created: %p\n", ctx);
+        parcagpu::PCSampling::instance().initialize(ctx);
+        break;
+      }
+      case CUPTI_CBID_RESOURCE_CONTEXT_DESTROY_STARTING: {
+        CUcontext ctx = resData->context;
+        DEBUG_PRINTF("[PARCAGPU] Context destroying: %p\n", ctx);
+        parcagpu::PCSampling::instance().finalize(ctx);
+        break;
+      }
+      default:
+        break;
+      }
+    } else {
+      // Handle both Runtime and Driver API callbacks
+      // Track runtime ENTER so we can skip driver EXIT when they match
+      const CUpti_CallbackData *cbdata =
+          static_cast<const CUpti_CallbackData *>(cbdata_void);
+      uint32_t correlationId = cbdata->correlationId;
+
+      if (domain == CUPTI_CB_DOMAIN_RUNTIME_API &&
+          cbdata->callbackSite == CUPTI_API_ENTER) {
+        runtimeEnterCorrelationId = correlationId;
+        return;
+      }
+
+      // Process on EXIT to avoid adding latency to GPU launch
+      if (cbdata->callbackSite != CUPTI_API_EXIT) {
+        return;
+      }
+
+      const char *name = cbdata->symbolName ? cbdata->symbolName : cbdata->functionName;
+      int signedCbid;
+
+      if (domain == CUPTI_CB_DOMAIN_DRIVER_API) {
+        // Skip if this driver call is under a runtime call (same correlation ID)
+        if (correlationId == runtimeEnterCorrelationId) {
+          DEBUG_PRINTF("[PARCAGPU] Skipping driver EXIT correlationId=%u - runtime will handle\n",
+                       correlationId);
+          return;
+        }
+        // Pure driver call (no runtime wrapper) - use negative cbid
+        signedCbid = -(int)cbid;
+        DEBUG_PRINTF("[PARCAGPU] Driver API callback: cbid=%d, correlationId=%u, func=%s\n",
+                     cbid, correlationId, name);
+      } else if (domain == CUPTI_CB_DOMAIN_RUNTIME_API) {
+        signedCbid = (int)cbid;
+        runtimeEnterCorrelationId = 0; // Clear after use
+        DEBUG_PRINTF("[PARCAGPU] Runtime API callback: cbid=%d, correlationId=%u, func=%s\n",
+                     cbid, correlationId, name);
+      } else {
+        return;
+      }
+
+      // Check if this is a graph launch (never rate limit these)
+      bool isGraphLaunch = false;
+      if (signedCbid < 0) {
+        // Driver API: cuGraphLaunch = 514, cuGraphLaunch_ptsz = 515
+        int driverCbid = -signedCbid;
+        isGraphLaunch = (driverCbid == CUPTI_DRIVER_TRACE_CBID_cuGraphLaunch ||
+                         driverCbid == CUPTI_DRIVER_TRACE_CBID_cuGraphLaunch_ptsz);
+      } else {
+        // Runtime API: cudaGraphLaunch = 311, cudaGraphLaunch_ptsz = 312
+        isGraphLaunch = (signedCbid == CUPTI_RUNTIME_TRACE_CBID_cudaGraphLaunch_v10000 ||
+                         signedCbid == CUPTI_RUNTIME_TRACE_CBID_cudaGraphLaunch_ptsz_v10000);
+      }
+
+      // Rate limit probes (skip for graph launches)
+      if (!limiter_disabled && !isGraphLaunch) {
+        struct timespec ts;
+        clock_gettime(CLOCK_MONOTONIC, &ts);
+        uint64_t nowNs = (uint64_t)ts.tv_sec * 1000000000ULL + ts.tv_nsec;
+        constexpr uint64_t PROBE_MIN_INTERVAL_NS = 500000; // 500μs
+        if (nowNs - lastProbeTimeNs < PROBE_MIN_INTERVAL_NS) {
+          DEBUG_PRINTF("[PARCAGPU] Rate limited: skipping probe for correlationId=%u\n",
+                       correlationId);
+          return;
+        }
+        lastProbeTimeNs = nowNs;
+      }
+
+      profiler.outstandingEvents++;
+      // Emit USDT probe with signed cbid (negative for driver, positive for runtime)
+      STAP_PROBE3(parcagpu, cuda_correlation, correlationId, signedCbid, name);
+
+      // Insert into correlation filter so we can match kernel activities later
+      if (isGraphLaunch) {
+        g_graphCorrelationMap.insert(correlationId);
+        DEBUG_PRINTF("[PARCAGPU] Inserted correlationId=%u into graph map\n",
+                     correlationId);
+      } else {
+        g_correlationFilter.insert(correlationId);
+        DEBUG_PRINTF("[PARCAGPU] Inserted correlationId=%u into correlation filter\n",
+                     correlationId);
+      }
+
+      // Flush if too many events pile up
+      if (profiler.outstandingEvents > 3000) {
+        DEBUG_PRINTF("[PARCAGPU] Flushing: outstandingEvents=%zu\n",
+                     profiler.outstandingEvents);
+        proton::cupti::activityFlushAll<true>(0);
+        profiler.outstandingEvents = 0;
+      }
+
+      // Collect PC sampling data after kernel launch (continuous mode)
+      if (profiler.pcSamplingEnabled) {
+        parcagpu::PCSampling::instance().collectData(cbdata->context,
+                                                   correlationId);
+      }
+    }
+  }
+};
+
+} // namespace parcagpu
+
+// CUPTI initialization function required for CUDA_INJECTION64_PATH
+extern "C" int InitializeInjection(void) {
+  DEBUG_PRINTF("[PARCAGPU] InitializeInjection called\n");
+
+  auto &profiler = parcagpu::CuptiProfiler::instance();
+  if (!profiler.initialize()) {
+    return 0; // Return 0 on failure, but don't break injection
+  }
+
+  // Register cleanup at exit
+  atexit([]() { parcagpu::CuptiProfiler::instance().cleanup(); });
+
+  return 1; // Success
+}
diff --git a/src/parcagpu_pc_sampling.cpp b/src/parcagpu_pc_sampling.cpp
new file mode 100644
index 0000000..7eb766d
--- /dev/null
+++ b/src/parcagpu_pc_sampling.cpp
@@ -0,0 +1,522 @@
+#include "parcagpu_pc_sampling.h"
+#include "Driver/GPU/CudaApi.h"
+#include "Driver/GPU/CuptiApi.h"
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <memory>
+#include <stdexcept>
+#include <tuple>
+
+namespace parcagpu {
+
+// CUDA driver version for 12.8.1 (when continuous PC sampling became stable)
+// Version format: major * 1000 + minor * 10 + patch
+#define CUDA_VERSION_12_8_1 12081
+
+// Debug logging (reuse from main file)
+extern bool debug_enabled;
+extern void init_debug();
+
+#define DEBUG_PRINTF(...)                                                      \
+  do {                                                                         \
+    parcagpu::init_debug();                                                    \
+    if (parcagpu::debug_enabled) {                                             \
+      fprintf(stderr, "[PARCAGPU:PCSampling] ");                               \
+      fprintf(stderr, __VA_ARGS__);                                            \
+    }                                                                          \
+  } while (0)
+
+namespace {
+
+// CUPTI helper functions (adapted from Proton's CuptiPCSamplingUtils.h)
+// These wrap Proton's cupti API calls with PARCAGPU-specific setup
+
+uint64_t getCubinCrc(const char *cubin, size_t size) {
+  CUpti_GetCubinCrcParams cubinCrcParams = {
+      /*size=*/CUpti_GetCubinCrcParamsSize,
+      /*cubinSize=*/size,
+      /*cubin=*/cubin,
+      /*cubinCrc=*/0,
+  };
+  proton::cupti::getCubinCrc<true>(&cubinCrcParams);
+  return cubinCrcParams.cubinCrc;
+}
+
+void enablePCSampling(CUcontext context) {
+  CUpti_PCSamplingEnableParams params = {
+      /*size=*/CUpti_PCSamplingEnableParamsSize,
+      /*pPriv=*/NULL,
+      /*ctx=*/context,
+  };
+  proton::cupti::pcSamplingEnable<true>(&params);
+}
+
+void disablePCSampling(CUcontext context) {
+  CUpti_PCSamplingDisableParams params = {
+      /*size=*/CUpti_PCSamplingDisableParamsSize,
+      /*pPriv=*/NULL,
+      /*ctx=*/context,
+  };
+  proton::cupti::pcSamplingDisable<true>(&params);
+}
+
+void getPCSamplingData(CUcontext context,
+                       CUpti_PCSamplingData *pcSamplingData) {
+  CUpti_PCSamplingGetDataParams params = {
+      /*size=*/CUpti_PCSamplingGetDataParamsSize,
+      /*pPriv=*/NULL,
+      /*ctx=*/context,
+      /*pcSamplingData=*/pcSamplingData,
+  };
+  proton::cupti::pcSamplingGetData<true>(&params);
+}
+
+void setConfigurationAttribute(
+    CUcontext context,
+    std::vector<CUpti_PCSamplingConfigurationInfo> &configurationInfos) {
+  CUpti_PCSamplingConfigurationInfoParams infoParams = {
+      /*size=*/CUpti_PCSamplingConfigurationInfoParamsSize,
+      /*pPriv=*/NULL,
+      /*ctx=*/context,
+      /*numAttributes=*/configurationInfos.size(),
+      /*pPCSamplingConfigurationInfo=*/configurationInfos.data(),
+  };
+  proton::cupti::pcSamplingSetConfigurationAttribute<true>(&infoParams);
+}
+
+std::tuple<uint32_t, std::string, std::string>
+getSassToSourceCorrelation(const char *functionName, uint64_t pcOffset,
+                           const char *cubin, size_t cubinSize) {
+  CUpti_GetSassToSourceCorrelationParams sassToSourceParams = {
+      /*size=*/CUpti_GetSassToSourceCorrelationParamsSize,
+      /*cubin=*/cubin,
+      /*functionName=*/functionName,
+      /*cubinSize=*/cubinSize,
+      /*lineNumber=*/0,
+      /*pcOffset=*/pcOffset,
+      /*fileName=*/NULL,
+      /*dirName=*/NULL,
+  };
+  // Get source can fail if the line mapping is not available
+  proton::cupti::getSassToSourceCorrelation<false>(&sassToSourceParams);
+  auto fileNameStr = sassToSourceParams.fileName
+                         ? std::string(sassToSourceParams.fileName)
+                         : "";
+  auto dirNameStr =
+      sassToSourceParams.dirName ? std::string(sassToSourceParams.dirName) : "";
+  // Free the memory
+  if (sassToSourceParams.fileName)
+    std::free(sassToSourceParams.fileName);
+  if (sassToSourceParams.dirName)
+    std::free(sassToSourceParams.dirName);
+  return std::make_tuple(sassToSourceParams.lineNumber, fileNameStr,
+                         dirNameStr);
+}
+
+// Double-checked locking helper
+template <typename CheckFn, typename ActionFn>
+void doubleCheckedLock(CheckFn check, std::mutex &mutex, ActionFn action) {
+  if (check()) {
+    std::lock_guard<std::mutex> lock(mutex);
+    if (check()) {
+      action();
+    }
+  }
+}
+
+// Helper to get PARCAGPU's custom sampling frequency from environment
+uint32_t getParcaGPUSamplingFrequency() {
+  // Default frequency for PARCAGPU is 18 (Proton uses 10)
+  constexpr uint32_t PARCAGPU_DEFAULT_FREQUENCY = 18;
+
+  uint32_t samplingPeriod = PARCAGPU_DEFAULT_FREQUENCY;
+  const char *sampling_factor_env = getenv("PARCAGPU_SAMPLING_FACTOR");
+  if (sampling_factor_env) {
+    int factor = atoi(sampling_factor_env);
+    if (factor >= 5 && factor <= 31) {
+      samplingPeriod = factor;
+      DEBUG_PRINTF("Using PARCAGPU_SAMPLING_FACTOR=%u\n", samplingPeriod);
+    } else if (factor != 0) { // 0 is handled in isSupported()
+      fprintf(stderr, "[PARCAGPU] Warning: PARCAGPU_SAMPLING_FACTOR=%d out of range [5,31], using default %u\n",
+              factor, PARCAGPU_DEFAULT_FREQUENCY);
+    }
+  }
+  return samplingPeriod;
+}
+
+// Get number of stall reasons
+size_t getNumStallReasons(CUcontext context) {
+  size_t numStallReasons = 0;
+  CUpti_PCSamplingGetNumStallReasonsParams numStallReasonsParams = {
+      /*size=*/CUpti_PCSamplingGetNumStallReasonsParamsSize,
+      /*pPriv=*/NULL,
+      /*ctx=*/context,
+      /*numStallReasons=*/&numStallReasons};
+  proton::cupti::pcSamplingGetNumStallReasons<true>(&numStallReasonsParams);
+  return numStallReasons;
+}
+
+// Get stall reason names and indices
+std::pair<char **, uint32_t *>
+getStallReasonNamesAndIndices(CUcontext context, size_t numStallReasons) {
+  char **stallReasonNames =
+      static_cast<char **>(std::calloc(numStallReasons, sizeof(char *)));
+  for (size_t i = 0; i < numStallReasons; i++) {
+    stallReasonNames[i] = static_cast<char *>(
+        std::calloc(CUPTI_STALL_REASON_STRING_SIZE, sizeof(char)));
+  }
+  uint32_t *stallReasonIndices =
+      static_cast<uint32_t *>(std::calloc(numStallReasons, sizeof(uint32_t)));
+  CUpti_PCSamplingGetStallReasonsParams stallReasonsParams = {
+      /*size=*/CUpti_PCSamplingGetStallReasonsParamsSize,
+      /*pPriv=*/NULL,
+      /*ctx=*/context,
+      /*numStallReasons=*/numStallReasons,
+      /*stallReasonIndex=*/stallReasonIndices,
+      /*stallReasons=*/stallReasonNames,
+  };
+  proton::cupti::pcSamplingGetStallReasons<true>(&stallReasonsParams);
+  return std::make_pair(stallReasonNames, stallReasonIndices);
+}
+
+// Match stall reasons to indices (PARCAGPU emits all stall reasons)
+size_t matchStallReasonsToIndices(
+    size_t numStallReasons, char **stallReasonNames,
+    uint32_t *stallReasonIndices,
+    std::map<size_t, size_t> &stallReasonIndexToMetricIndex,
+    std::set<size_t> &notIssuedStallReasonIndices) {
+  // PARCAGPU emits all stall reasons
+  size_t numValidStalls = 0;
+  for (size_t i = 0; i < numStallReasons; i++) {
+    std::string cuptiStallName = std::string(stallReasonNames[i]);
+    bool notIssued = cuptiStallName.find("not_issued") != std::string::npos ||
+                     cuptiStallName.find("Not Issued") != std::string::npos;
+
+    if (notIssued)
+      notIssuedStallReasonIndices.insert(stallReasonIndices[i]);
+    stallReasonIndexToMetricIndex[stallReasonIndices[i]] = i;
+    numValidStalls++;
+  }
+  return numValidStalls;
+}
+
+// Allocate PC sampling data buffer
+CUpti_PCSamplingData allocPCSamplingData(size_t collectNumPCs,
+                                         size_t numValidStallReasons) {
+  CUpti_PCSamplingData pcSamplingData{
+      /*size=*/sizeof(CUpti_PCSamplingData),
+      /*collectNumPcs=*/collectNumPCs,
+      /*totalSamples=*/0,
+      /*droppedSamples=*/0,
+      /*totalNumPcs=*/0,
+      /*remainingNumPcs=*/0,
+      /*rangeId=*/0,
+      /*pPcData=*/
+      static_cast<CUpti_PCSamplingPCData *>(
+          std::calloc(collectNumPCs, sizeof(CUpti_PCSamplingPCData)))};
+  for (size_t i = 0; i < collectNumPCs; ++i) {
+    pcSamplingData.pPcData[i].stallReason =
+        static_cast<CUpti_PCSamplingStallReason *>(std::calloc(
+            numValidStallReasons, sizeof(CUpti_PCSamplingStallReason)));
+  }
+  return pcSamplingData;
+}
+
+} // namespace
+
+// ConfigureData implementation
+
+CUpti_PCSamplingConfigurationInfo ConfigureData::configureStallReasons() {
+  numStallReasons = getNumStallReasons(context);
+  std::tie(this->stallReasonNames, this->stallReasonIndices) =
+      getStallReasonNamesAndIndices(context, numStallReasons);
+  numValidStallReasons = matchStallReasonsToIndices(
+      numStallReasons, stallReasonNames, stallReasonIndices,
+      stallReasonIndexToMetricIndex, notIssuedStallReasonIndices);
+
+  CUpti_PCSamplingConfigurationInfo stallReasonInfo{};
+  stallReasonInfo.attributeType =
+      CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_STALL_REASON;
+  stallReasonInfo.attributeData.stallReasonData.stallReasonCount =
+      numValidStallReasons;
+  stallReasonInfo.attributeData.stallReasonData.pStallReasonIndex =
+      stallReasonIndices;
+  return stallReasonInfo;
+}
+
+CUpti_PCSamplingConfigurationInfo ConfigureData::configureSamplingPeriod() {
+  CUpti_PCSamplingConfigurationInfo samplingPeriodInfo{};
+  samplingPeriodInfo.attributeType =
+      CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_SAMPLING_PERIOD;
+
+  // Use PARCAGPU's custom sampling frequency
+  uint32_t frequency = getParcaGPUSamplingFrequency();
+
+  samplingPeriodInfo.attributeData.samplingPeriodData.samplingPeriod = frequency;
+  return samplingPeriodInfo;
+}
+
+CUpti_PCSamplingConfigurationInfo ConfigureData::configureSamplingBuffer() {
+  CUpti_PCSamplingConfigurationInfo samplingBufferInfo{};
+  samplingBufferInfo.attributeType =
+      CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_SAMPLING_DATA_BUFFER;
+  this->pcSamplingData =
+      allocPCSamplingData(DataBufferPCCount, numValidStallReasons);
+  samplingBufferInfo.attributeData.samplingDataBufferData.samplingDataBuffer =
+      &this->pcSamplingData;
+  return samplingBufferInfo;
+}
+
+CUpti_PCSamplingConfigurationInfo ConfigureData::configureScratchBuffer() {
+  CUpti_PCSamplingConfigurationInfo scratchBufferInfo{};
+  scratchBufferInfo.attributeType =
+      CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_SCRATCH_BUFFER_SIZE;
+  scratchBufferInfo.attributeData.scratchBufferSizeData.scratchBufferSize =
+      ScratchBufferSize;
+  return scratchBufferInfo;
+}
+
+CUpti_PCSamplingConfigurationInfo ConfigureData::configureHardwareBufferSize() {
+  CUpti_PCSamplingConfigurationInfo hardwareBufferInfo{};
+  hardwareBufferInfo.attributeType =
+      CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_HARDWARE_BUFFER_SIZE;
+  hardwareBufferInfo.attributeData.hardwareBufferSizeData.hardwareBufferSize =
+      HardwareBufferSize;
+  return hardwareBufferInfo;
+}
+
+CUpti_PCSamplingConfigurationInfo ConfigureData::configureCollectionMode() {
+  CUpti_PCSamplingConfigurationInfo collectionModeInfo{};
+  collectionModeInfo.attributeType =
+      CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_COLLECTION_MODE;
+  collectionModeInfo.attributeData.collectionModeData.collectionMode =
+      CUPTI_PC_SAMPLING_COLLECTION_MODE_CONTINUOUS;
+  return collectionModeInfo;
+}
+
+void ConfigureData::initialize(CUcontext context) {
+  this->context = context;
+  proton::cupti::getContextId<true>(context, &contextId);
+
+  DEBUG_PRINTF("Initializing PC sampling for context %p (id %u)\n",
+               context, contextId);
+
+  configurationInfos.emplace_back(configureStallReasons());
+  configurationInfos.emplace_back(configureSamplingPeriod());
+  configurationInfos.emplace_back(configureHardwareBufferSize());
+  configurationInfos.emplace_back(configureScratchBuffer());
+  configurationInfos.emplace_back(configureSamplingBuffer());
+  configurationInfos.emplace_back(configureCollectionMode());
+
+  setConfigurationAttribute(context, configurationInfos);
+
+  DEBUG_PRINTF("PC sampling configured with %u stall reasons (%u valid)\n",
+               numStallReasons, numValidStallReasons);
+}
+
+// GPUPCSampling implementation
+
+bool PCSampling::isSupported() {
+  // Check PARCAGPU_SAMPLING_FACTOR - 0 means disabled
+  const char *sampling_factor_env = getenv("PARCAGPU_SAMPLING_FACTOR");
+  if (sampling_factor_env && atoi(sampling_factor_env) == 0) {
+    DEBUG_PRINTF("PC sampling disabled via PARCAGPU_SAMPLING_FACTOR=0\n");
+    return false;
+  }
+
+  // Check CUDA driver version >= 12.8.1
+  int driverVersion = 0;
+  proton::cuda::driverGetVersion<true>(&driverVersion);
+
+  if (driverVersion < CUDA_VERSION_12_8_1) {
+    int major = driverVersion / 1000;
+    int minor = (driverVersion % 1000) / 10;
+    int patch = driverVersion % 10;
+    DEBUG_PRINTF("PC sampling not supported: CUDA driver version %d.%d.%d < required 12.8.1\n",
+                 major, minor, patch);
+    return false;
+  }
+
+  int major = driverVersion / 1000;
+  int minor = (driverVersion % 1000) / 10;
+  int patch = driverVersion % 10;
+  DEBUG_PRINTF("PC sampling supported: CUDA driver version %d.%d.%d\n", major, minor, patch);
+  return true;
+}
+
+ConfigureData *PCSampling::getConfigureData(uint32_t contextId) {
+  return &contextIdToConfigureData[contextId];
+}
+
+CubinData *PCSampling::getCubinData(uint64_t cubinCrc) {
+  return &(cubinCrcToCubinData[cubinCrc].first);
+}
+
+void PCSampling::initialize(CUcontext context) {
+  uint32_t contextId = 0;
+  proton::cupti::getContextId<true>(context, &contextId);
+
+  doubleCheckedLock([&]() { return !contextInitialized.contain(contextId); },
+                    contextMutex,
+                    [&]() {
+                      enablePCSampling(context);
+                      getConfigureData(contextId)->initialize(context);
+
+                      contextInitialized.insert(contextId);
+                      DEBUG_PRINTF("PC sampling started in continuous mode for context %u\n", contextId);
+                    });
+}
+
+void PCSampling::processPCSamplingData(ConfigureData *configureData,
+                                                uint32_t correlationId) {
+  auto *pcSamplingData = &configureData->pcSamplingData;
+
+  // Collect data in rounds
+  bool firstRound = true;
+  while (pcSamplingData->totalNumPcs > 0 ||
+         pcSamplingData->remainingNumPcs > 0 || firstRound) {
+
+    DEBUG_PRINTF("Processing %zu PCs (remaining: %zu)\n",
+                 pcSamplingData->totalNumPcs, pcSamplingData->remainingNumPcs);
+
+    // Process each PC sample
+    for (size_t i = 0; i < pcSamplingData->totalNumPcs; ++i) {
+      auto *pcData = pcSamplingData->pPcData + i;
+      auto *cubinData = getCubinData(pcData->cubinCrc);
+
+      // Get or cache source location
+      auto key =
+          CubinData::LineInfoKey{pcData->functionIndex, pcData->pcOffset};
+      if (cubinData->lineInfo.find(key) == cubinData->lineInfo.end()) {
+        auto [lineNumber, fileName, dirName] =
+            getSassToSourceCorrelation(pcData->functionName, pcData->pcOffset,
+                                       cubinData->cubin, cubinData->cubinSize);
+        cubinData->lineInfo.try_emplace(key, lineNumber,
+                                        std::string(pcData->functionName),
+                                        dirName, fileName);
+      }
+      auto &lineInfo = cubinData->lineInfo[key];
+
+      // Calculate total and stalled samples
+      uint64_t totalSamples = 0;
+      uint64_t stalledSamples = 0;
+
+      for (size_t j = 0; j < pcData->stallReasonCount; ++j) {
+        auto *stallReason = &pcData->stallReason[j];
+        totalSamples += stallReason->samples;
+
+        // Check if this is a "not_issued" stall (not really stalled)
+        bool isNotIssued =
+            configureData->notIssuedStallReasonIndices.count(
+                stallReason->pcSamplingStallReasonIndex) > 0;
+
+        if (!isNotIssued) {
+          stalledSamples += stallReason->samples;
+        }
+      }
+
+      // Emit summary probe
+      std::string fullPath = lineInfo.fileName.size()
+                                 ? lineInfo.dirName + "/" + lineInfo.fileName
+                                 : "";
+      STAP_PROBE8(parcagpu, pc_sample_summary, correlationId, pcData->functionIndex,
+                  pcData->pcOffset, totalSamples, stalledSamples,
+                  fullPath.c_str(), lineInfo.lineNumber, lineInfo.functionName.c_str());
+
+      // Emit detailed stall reason probes
+      for (size_t j = 0; j < pcData->stallReasonCount; ++j) {
+        auto *stallReason = &pcData->stallReason[j];
+        auto stallReasonIndex = stallReason->pcSamplingStallReasonIndex;
+
+        // Get stall reason name
+        const char *stallReasonName = "";
+        for (size_t k = 0; k < configureData->numStallReasons; k++) {
+          if (configureData->stallReasonIndices[k] == stallReasonIndex) {
+            stallReasonName = configureData->stallReasonNames[k];
+            break;
+          }
+        }
+
+        STAP_PROBE6(parcagpu, pc_stall_reason, correlationId, pcData->functionIndex,
+                    pcData->pcOffset, stallReasonIndex, stallReasonName, stallReason->samples);
+      }
+    }
+
+    // Get next round of data if available
+    if (pcSamplingData->remainingNumPcs > 0 || firstRound) {
+      getPCSamplingData(configureData->context, pcSamplingData);
+      firstRound = false;
+    } else {
+      break;
+    }
+  }
+}
+
+void PCSampling::collectData(CUcontext context, uint32_t correlationId) {
+  uint32_t contextId = 0;
+  proton::cupti::getContextId<true>(context, &contextId);
+
+  if (!contextInitialized.contain(contextId)) {
+    DEBUG_PRINTF("Context %u not initialized, skipping data collection\n", contextId);
+    return;
+  }
+
+  auto *configureData = getConfigureData(contextId);
+  DEBUG_PRINTF("Collecting PC sampling data for correlation %u\n", correlationId);
+
+  // Collect data without stopping - continuous mode
+  processPCSamplingData(configureData, correlationId);
+}
+
+void PCSampling::finalize(CUcontext context) {
+  uint32_t contextId = 0;
+  proton::cupti::getContextId<true>(context, &contextId);
+
+  if (!contextInitialized.contain(contextId))
+    return;
+
+  DEBUG_PRINTF("Finalizing PC sampling for context %p\n", context);
+
+  contextIdToConfigureData.erase(contextId);
+  contextInitialized.erase(contextId);
+}
+
+void PCSampling::loadModule(const char *cubin, size_t cubinSize) {
+  auto cubinCrc =  getCubinCrc(cubin, cubinSize);
+  auto *cubinData = getCubinData(cubinCrc);
+
+  if (cubinCrcToCubinData.contain(cubinCrc)) {
+    // Increment reference count
+    cubinCrcToCubinData[cubinCrc].second++;
+    DEBUG_PRINTF("Module 0x%lx loaded (refcount=%zu)\n",
+                 cubinCrc, cubinCrcToCubinData[cubinCrc].second);
+  } else {
+    // New module
+    cubinData->cubinCrc = cubinCrc;
+    cubinData->cubinSize = cubinSize;
+    cubinData->cubin = cubin;
+    cubinCrcToCubinData[cubinCrc].second = 1;
+    DEBUG_PRINTF("Module 0x%lx loaded (new)\n", cubinCrc);
+    STAP_PROBE3(parcagpu, cubin_loaded, cubinCrc, 0, 0);
+  }
+}
+
+void PCSampling::unloadModule(const char *cubin, size_t cubinSize) {
+  auto cubinCrc = getCubinCrc(cubin, cubinSize);
+
+  if (!cubinCrcToCubinData.contain(cubinCrc))
+    return;
+
+  auto count = cubinCrcToCubinData[cubinCrc].second;
+  if (count > 1) {
+    cubinCrcToCubinData[cubinCrc].second = count - 1;
+    DEBUG_PRINTF("Module 0x%lx unloaded (refcount=%zu)\n", cubinCrc, count - 1);
+  } else {
+    cubinCrcToCubinData.erase(cubinCrc);
+    DEBUG_PRINTF("Module 0x%lx unloaded (removed)\n", cubinCrc);
+    STAP_PROBE2(parcagpu, cubin_unloaded, cubinCrc, 0);
+  }
+}
+
+} // namespace parcagpu
diff --git a/src/parcagpu_pc_sampling.h b/src/parcagpu_pc_sampling.h
new file mode 100644
index 0000000..01f9413
--- /dev/null
+++ b/src/parcagpu_pc_sampling.h
@@ -0,0 +1,107 @@
+#ifndef PARCAGPU_PC_SAMPLING_H_
+#define PARCAGPU_PC_SAMPLING_H_
+
+#include <atomic>
+#include <map>
+#include <memory>
+#include <mutex>
+#include <set>
+#include <string>
+#include <sys/sdt.h>
+#include <vector>
+
+#include <cuda.h>
+#include <cupti.h>
+
+#include "Driver/GPU/CuptiApi.h"
+#include "Profiler/Cupti/CuptiPCSampling.h"
+#include "Utility/Map.h"
+#include "Utility/Set.h"
+#include "Utility/Singleton.h"
+
+namespace parcagpu {
+
+// Use Proton's CubinData directly
+using proton::CubinData;
+
+// ConfigureData for PARCAGPU (based on Proton's but standalone)
+// We don't inherit to avoid linking Proton's profiler dependencies
+struct ConfigureData {
+  ConfigureData() = default;
+
+  ~ConfigureData() {
+    if (stallReasonNames) {
+      for (size_t i = 0; i < numStallReasons; i++) {
+        if (stallReasonNames[i])
+          std::free(stallReasonNames[i]);
+      }
+      std::free(stallReasonNames);
+    }
+    if (stallReasonIndices)
+      std::free(stallReasonIndices);
+    if (pcSamplingData.pPcData) {
+      for (size_t i = 0; i < numValidStallReasons; ++i) {
+        std::free(pcSamplingData.pPcData[i].stallReason);
+      }
+      std::free(pcSamplingData.pPcData);
+    }
+  }
+
+  void initialize(CUcontext context);
+
+  CUpti_PCSamplingConfigurationInfo configureStallReasons();
+  CUpti_PCSamplingConfigurationInfo configureSamplingPeriod();
+  CUpti_PCSamplingConfigurationInfo configureSamplingBuffer();
+  CUpti_PCSamplingConfigurationInfo configureScratchBuffer();
+  CUpti_PCSamplingConfigurationInfo configureHardwareBufferSize();
+  CUpti_PCSamplingConfigurationInfo configureCollectionMode();
+
+  // Buffer size constants (from Proton)
+  static constexpr size_t HardwareBufferSize = 128 * 1024 * 1024;
+  static constexpr size_t ScratchBufferSize = 16 * 1024 * 1024;
+  static constexpr size_t DataBufferPCCount = 1024;
+
+  CUcontext context{};
+  uint32_t contextId;
+  uint32_t numStallReasons{};
+  uint32_t numValidStallReasons{};
+  char **stallReasonNames{};
+  uint32_t *stallReasonIndices{};
+  std::map<size_t, size_t> stallReasonIndexToMetricIndex{};
+  std::set<size_t> notIssuedStallReasonIndices{};
+  CUpti_PCSamplingData pcSamplingData{};
+  std::vector<CUpti_PCSamplingConfigurationInfo> configurationInfos;
+};
+
+// PC Sampling singleton class (adapted from Proton's CuptiPCSampling)
+class PCSampling : public proton::Singleton<PCSampling> {
+public:
+  PCSampling() = default;
+  virtual ~PCSampling() = default;
+
+  // Check if PC sampling is supported (CUPTI >= 12.8.1)
+  static bool isSupported();
+
+  void initialize(CUcontext context);
+  void collectData(CUcontext context, uint32_t correlationId);
+  void finalize(CUcontext context);
+  void loadModule(const char *cubin, size_t cubinSize);
+  void unloadModule(const char *cubin, size_t cubinSize);
+
+private:
+  ConfigureData *getConfigureData(uint32_t contextId);
+  CubinData *getCubinData(uint64_t cubinCrc);
+  void processPCSamplingData(ConfigureData *configureData,
+                             uint32_t correlationId);
+
+  proton::ThreadSafeMap<uint32_t, ConfigureData> contextIdToConfigureData;
+  proton::ThreadSafeMap<size_t, std::pair<CubinData, size_t>>
+      cubinCrcToCubinData;
+  proton::ThreadSafeSet<uint32_t> contextInitialized;
+
+  std::mutex contextMutex{};
+};
+
+} // namespace parcagpu
+
+#endif // PARCAGPU_PC_SAMPLING_H_
diff --git a/test.sh b/test.sh
index 56f4e5a..d486cf7 100755
--- a/test.sh
+++ b/test.sh
@@ -6,12 +6,7 @@ cd "$(dirname "$0")"
 
 # Parse arguments
 USE_BPFTRACE=0
-# Auto-detect architecture
-case "$(uname -m)" in
-    aarch64|arm64) DEFAULT_ARCH="arm64" ;;
-    *)             DEFAULT_ARCH="amd64" ;;
-esac
-ARCH="${ARCH:-$DEFAULT_ARCH}"
+ARCH="${ARCH:-amd64}"
 CUDA_MAJOR="${CUDA_MAJOR:-12}"
 for arg in "$@"; do
     case $arg in
@@ -27,8 +22,11 @@ done
 
 echo ""
 echo "=== Building test infrastructure with CMake ==="
-mkdir -p test/build
-cd test/build && cmake .. -DCUDA_MAJOR_VERSION="${CUDA_MAJOR}" && make && cd ../..
+mkdir -p build-local
+cd build-local
+cmake ..
+make -j$(nproc)
+cd ..
 
 # Start bpftrace if requested
 if [ "$USE_BPFTRACE" -eq 1 ]; then
@@ -53,12 +51,12 @@ fi
 
 echo ""
 echo "=== Running test program ==="
-# Set LD_LIBRARY_PATH so the test can find libcupti.so at runtime
-# Set PARCAGPU_DEBUG externally to enable debug output
-# Set PARCAGPU_RATE_LIMIT externally to override default (100/s)
-export LD_LIBRARY_PATH="$(pwd)/test/build:$LD_LIBRARY_PATH"
-# Use the CMake-built library with mock CUPTI
-test/build/test_cupti_prof "build/${CUDA_MAJOR}/${ARCH}/libparcagpucupti.so" --kernel-names=kernel_names.txt "$@"
+# Set LD_LIBRARY_PATH so the test can find libcupti.so and libparcagpucupti.so at runtime
+# Set PARCAGPU_DEBUG to enable debug output
+export LD_LIBRARY_PATH="$(pwd)/build-local/lib:$LD_LIBRARY_PATH"
+export PARCAGPU_DEBUG=1
+# Run the test program with path to library
+./build-local/bin/test_cupti_prof build-local/lib/libparcagpucupti.so "$@"
 
 # If bpftrace was started, stop it and show results
 if [ "$USE_BPFTRACE" -eq 1 ]; then
diff --git a/test/README.md b/test/README.md
index 4cd932a..ff74c58 100644
--- a/test/README.md
+++ b/test/README.md
@@ -1,19 +1,19 @@
 # CUPTI Profiler Test Infrastructure
 
-This directory contains test infrastructure for `libparcagpucupti.so` using Zig as the build system.
+This directory contains test infrastructure for `libparcagpucupti.so` using CMake as the build system.
 
 ## Components
 
-- **test/mock_cupti.c**: Mock CUPTI library that provides stub implementations of all CUPTI APIs used by cupti-prof.c
+- **test/mock_cupti.c**: Mock CUPTI library that provides stub implementations of all CUPTI APIs used by the profiler
 - **test/test_cupti_prof.c**: Test program that dynamically loads libparcagpucupti.so and simulates CUPTI callbacks
-- **build.zig**: Zig build configuration (at project root)
+- **CMakeLists.txt**: CMake build configuration (at project root)
 - **test.sh**: Test script (at project root)
 
 ## Building
 
 From the project root:
 ```bash
-zig build
+make
 ```
 
 This builds:
@@ -21,32 +21,31 @@ This builds:
 2. `libparcagpucupti.so` - The profiler library linked against the mock CUPTI
 3. `test_cupti_prof` - Test executable that loads and exercises the profiler
 
-All outputs go to `zig-out/lib/` and `zig-out/bin/`.
+All outputs go to `build/lib/` and `build/bin/`.
 
 ## Running
 
 Using the test script (recommended):
 ```bash
-cd /home/tpr/src/gpu/parcagpu
 ./test.sh
 ```
 
-Using Zig directly:
+Using Make directly:
 ```bash
-zig build run
+make test
 ```
 
 Or manually:
 ```bash
-zig build
-LD_LIBRARY_PATH=zig-out/lib zig-out/bin/test_cupti_prof zig-out/lib/libparcagpucupti.so
+make
+LD_LIBRARY_PATH=build/lib build/bin/test_cupti_prof build/lib/libparcagpucupti.so
 ```
 
 ### Running Continuously
 
 To run the test in continuous mode (useful for monitoring probes with bpftrace):
 ```bash
-LD_LIBRARY_PATH=zig-out/lib zig-out/bin/test_cupti_prof zig-out/lib/libparcagpucupti.so --forever
+LD_LIBRARY_PATH=build/lib build/bin/test_cupti_prof build/lib/libparcagpucupti.so --forever
 ```
 
 In this mode, the test will:
@@ -83,7 +82,7 @@ The test script automatically enables `PARCAGPU_DEBUG=1` to show detailed debug
 
 To run without debug output:
 ```bash
-LD_LIBRARY_PATH=zig-out/lib zig-out/bin/test_cupti_prof zig-out/lib/libparcagpucupti.so
+LD_LIBRARY_PATH=build/lib build/bin/test_cupti_prof build/lib/libparcagpucupti.so
 ```
 
 ## Verifying DTRACE Probes
@@ -92,13 +91,11 @@ To verify that the DTRACE/USDT probes are firing correctly, use the provided bpf
 
 **Terminal 1** - Run bpftrace to monitor probes:
 ```bash
-cd /home/tpr/src/gpu/parcagpu
 sudo bpftrace parcagpu.bt
 ```
 
 **Terminal 2** - Run the test:
 ```bash
-cd /home/tpr/src/gpu/parcagpu
 ./test.sh
 ```
 
diff --git a/test/mock_cuda.c b/test/mock_cuda.c
new file mode 100644
index 0000000..09caa88
--- /dev/null
+++ b/test/mock_cuda.c
@@ -0,0 +1,26 @@
+/*
+ * Mock CUDA Driver API for testing
+ * Provides minimal implementation of cuDriverGetVersion for test environment
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <cuda.h>
+
+// Mock implementation of cuDriverGetVersion
+// Returns CUDA 12.8.1 (12081) to enable PC sampling in tests
+CUresult cuDriverGetVersion(int *driverVersion) {
+    if (driverVersion == NULL) {
+        return CUDA_ERROR_INVALID_VALUE;
+    }
+
+    // Version format: major * 1000 + minor * 10 + patch
+    // 12.8.1 = 12 * 1000 + 8 * 10 + 1 = 12081
+    *driverVersion = 12081;
+
+    if (getenv("PARCAGPU_DEBUG") != NULL) {
+        fprintf(stderr, "[MOCK_CUDA] cuDriverGetVersion() -> 12.8.1 (12081)\n");
+    }
+
+    return CUDA_SUCCESS;
+}
diff --git a/test/mock_cupti.c b/test/mock_cupti.c
index 3c04589..b4901f1 100644
--- a/test/mock_cupti.c
+++ b/test/mock_cupti.c
@@ -134,6 +134,13 @@ CUptiResult cuptiActivityGetNumDroppedRecords(CUcontext context,
     return CUPTI_SUCCESS;
 }
 
+CUptiResult cuptiGetContextId(CUcontext context, uint32_t *contextId) {
+    (void)context;  // Mark as intentionally unused
+    // Return a fixed context ID for testing
+    *contextId = 1;
+    return CUPTI_SUCCESS;
+}
+
 CUptiResult cuptiUnsubscribe(CUpti_SubscriberHandle subscriber) {
     (void)subscriber;  // Mark as intentionally unused
     fprintf(stderr, "[MOCK_CUPTI] cuptiUnsubscribe()\n");

From 5951652060e333a1d73812628dbf4db50019aa40 Mon Sep 17 00:00:00 2001
From: Tommy Reilly <gnurizen@gmail.com>
Date: Thu, 12 Mar 2026 11:14:11 -0400
Subject: [PATCH 02/14] Update proton callback API names for upstream sync

setDriverCallbacks renamed to setLaunchCallbacks in upstream proton.
---
 src/parcagpu_cupti.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/parcagpu_cupti.cpp b/src/parcagpu_cupti.cpp
index 839fde6..e18cfa5 100644
--- a/src/parcagpu_cupti.cpp
+++ b/src/parcagpu_cupti.cpp
@@ -90,7 +90,7 @@ class CuptiProfiler : public proton::Singleton<CuptiProfiler> {
 
     // Enable runtime and driver API callbacks (using Proton's utilities)
     proton::setRuntimeCallbacks(subscriber, /*enable=*/true);
-    proton::setDriverCallbacks(subscriber, /*enable=*/true);
+    proton::setLaunchCallbacks(subscriber, /*enable=*/true);
 
     // Enable resource callbacks only if PC sampling is enabled
     if (pcSamplingEnabled) {
@@ -134,7 +134,7 @@ class CuptiProfiler : public proton::Singleton<CuptiProfiler> {
     // Disable all callbacks (using Proton's utilities)
     if (subscriber) {
       proton::setRuntimeCallbacks(subscriber, /*enable=*/false);
-      proton::setDriverCallbacks(subscriber, /*enable=*/false);
+      proton::setLaunchCallbacks(subscriber, /*enable=*/false);
       if (pcSamplingEnabled) {
         proton::setResourceCallbacks(subscriber, /*enable=*/false);
       }

From 5fbd8ebeb70590ef4d99eb9806202cc8caca04af Mon Sep 17 00:00:00 2001
From: Tommy Reilly <gnurizen@gmail.com>
Date: Thu, 12 Mar 2026 11:17:43 -0400
Subject: [PATCH 03/14] Update proton submodule to latest upstream sync

---
 proton | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/proton b/proton
index b0ffb66..bfc06a7 160000
--- a/proton
+++ b/proton
@@ -1 +1 @@
-Subproject commit b0ffb664cef804328c872bf02f7961de660c18a9
+Subproject commit bfc06a7e18f129da8832a433f993700036121daa

From 1736e287f526a50b0b16528820272399c7876fce Mon Sep 17 00:00:00 2001
From: Tommy Reilly <gnurizen@gmail.com>
Date: Mon, 23 Mar 2026 17:14:52 -0400
Subject: [PATCH 04/14] Replace interval-based rate limiter with token bucket
 algorithm
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The simple 500μs interval check could only pass 2000 samples/sec
regardless of actual load. The token bucket (configurable via
PARCAGPU_RATE_LIMIT, default 100/sec) smooths bursts while
maintaining a predictable average rate.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 src/parcagpu_cupti.cpp | 41 +++++++++++++++++++++++++++++++++++------
 1 file changed, 35 insertions(+), 6 deletions(-)

diff --git a/src/parcagpu_cupti.cpp b/src/parcagpu_cupti.cpp
index e18cfa5..98cd44b 100644
--- a/src/parcagpu_cupti.cpp
+++ b/src/parcagpu_cupti.cpp
@@ -33,14 +33,45 @@ static std::atomic<uint32_t> g_bufferCycle{0};
 // so we can skip driver EXIT probe when it matches (driver calls happen under runtime calls)
 thread_local uint32_t runtimeEnterCorrelationId = 0;
 
-// Thread-local rate limiting
-thread_local uint64_t lastProbeTimeNs = 0;
+// Token bucket rate limiter (configurable via PARCAGPU_RATE_LIMIT)
+double rateLimitPerSec = 100.0;
+
+// Thread-local token bucket state
+thread_local uint64_t lastRefillNs = 0;
+thread_local double tokens = 0;
+
+// Returns true if the sample should be emitted, false if rate limited
+bool rateLimiterTryAcquire(uint64_t nowNs) {
+  if (lastRefillNs > 0) {
+    double elapsedSec = (nowNs - lastRefillNs) / 1e9;
+    tokens = tokens + elapsedSec * rateLimitPerSec;
+    if (tokens > rateLimitPerSec) {
+      tokens = rateLimitPerSec;
+    }
+  } else {
+    tokens = rateLimitPerSec; // Start with full bucket
+  }
+  lastRefillNs = nowNs;
+
+  if (tokens >= 1.0) {
+    tokens -= 1.0;
+    return true;
+  }
+  return false;
+}
 
 void init_debug() {
   static bool initialized = false;
   if (!initialized) {
     debug_enabled = getenv("PARCAGPU_DEBUG") != nullptr;
     limiter_disabled = getenv("PARCAGPU_LIMITER_DISABLE") != nullptr;
+    const char *rateEnv = getenv("PARCAGPU_RATE_LIMIT");
+    if (rateEnv != nullptr) {
+      double rate = atof(rateEnv);
+      if (rate > 0) {
+        rateLimitPerSec = rate;
+      }
+    }
     initialized = true;
   }
 }
@@ -360,18 +391,16 @@ class CuptiProfiler : public proton::Singleton<CuptiProfiler> {
                          signedCbid == CUPTI_RUNTIME_TRACE_CBID_cudaGraphLaunch_ptsz_v10000);
       }
 
-      // Rate limit probes (skip for graph launches)
+      // Rate limit probes using token bucket (skip for graph launches)
       if (!limiter_disabled && !isGraphLaunch) {
         struct timespec ts;
         clock_gettime(CLOCK_MONOTONIC, &ts);
         uint64_t nowNs = (uint64_t)ts.tv_sec * 1000000000ULL + ts.tv_nsec;
-        constexpr uint64_t PROBE_MIN_INTERVAL_NS = 500000; // 500μs
-        if (nowNs - lastProbeTimeNs < PROBE_MIN_INTERVAL_NS) {
+        if (!rateLimiterTryAcquire(nowNs)) {
           DEBUG_PRINTF("[PARCAGPU] Rate limited: skipping probe for correlationId=%u\n",
                        correlationId);
           return;
         }
-        lastProbeTimeNs = nowNs;
       }
 
       profiler.outstandingEvents++;

From 86728f6822cf362714a6daf2cc32ad52a44fa113 Mon Sep 17 00:00:00 2001
From: Tommy Reilly <gnurizen@gmail.com>
Date: Mon, 23 Mar 2026 17:15:14 -0400
Subject: [PATCH 05/14] Add activity_batch USDT probe and fix test
 infrastructure

Add parcagpuActivityBatch() probe that fires with batches of up to 128
activity record pointers, enabling BPF consumers to read kernel timing
data directly from CUPTI buffers without per-record probe overhead.

Build/test changes:
- Link test binary against mock CUPTI/CUDA with --no-as-needed so
  Proton's dlopen(RTLD_NOLOAD) finds the mocks at runtime
- Fix make test to run the test binary directly with LD_LIBRARY_PATH
  (ctest had no tests registered)
- Add make bpf-test and make test-multi targets for BPF activity
  parser integration testing

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 CMakeLists.txt         |  9 +++++++++
 Makefile               | 36 ++++++++++++++++++++++++++++++++++--
 src/parcagpu_cupti.cpp | 36 ++++++++++++++++++++++++++++++++++++
 3 files changed, 79 insertions(+), 2 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 6b01b3a..0c06f5b 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -117,8 +117,17 @@ if(BUILD_TESTS)
     target_include_directories(test_cupti_prof PRIVATE
         ${CUDA_INCLUDE_DIR}
     )
+    # Link against mock libraries so their symbols are globally visible.
+    # This lets Proton's dlopen(RTLD_NOLOAD) find them, and lets the test
+    # access __cupti_runtime_api_callback etc. via dlsym(RTLD_DEFAULT).
+    # --no-as-needed forces the linker to keep them even though the test
+    # only accesses their symbols via dlsym, not direct references.
     target_link_libraries(test_cupti_prof
         dl
+        -Wl,--no-as-needed
+        cuda_mock
+        cupti_mock
+        -Wl,--as-needed
     )
     set_target_properties(test_cupti_prof PROPERTIES
         RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/bin"
diff --git a/Makefile b/Makefile
index 13c91a3..7b46cd5 100644
--- a/Makefile
+++ b/Makefile
@@ -1,4 +1,4 @@
-.PHONY: all clean test build-amd64 build-arm64 build-all cross docker-push docker-test-build docker-test-run format local debug
+.PHONY: all clean test build-amd64 build-arm64 build-all cross docker-push docker-test-build docker-test-run format local debug bpf-test test-multi
 
 LIB_NAME = libparcagpucupti.so
 
@@ -65,7 +65,8 @@ debug:
 # Run local tests
 test: local
 	@echo "=== Running tests ==="
-	@cd build-local && ctest --output-on-failure
+	@LD_LIBRARY_PATH="$(CURDIR)/build-local/lib:$$LD_LIBRARY_PATH" \
+		./build-local/bin/test_cupti_prof build-local/lib/libparcagpucupti.so --duration=5
 
 # Clean build artifacts
 clean:
@@ -103,6 +104,37 @@ docker-test-run: docker-test-build
 	@echo "=== Running tests in container ==="
 	@docker run --rm parcagpu-test:latest $(ARGS)
 
+# Build the BPF activity parser test program
+# Requires: clang, libbpf-dev, bpftool (for vmlinux.h), Go 1.21+
+bpf-test:
+	@echo "=== Building BPF activity parser test ==="
+	@if [ ! -f test/bpf/vmlinux.h ]; then \
+		echo "Generating vmlinux.h from kernel BTF..."; \
+		bpftool btf dump file /sys/kernel/btf/vmlinux format c > test/bpf/vmlinux.h; \
+	fi
+	@cd test/bpf && go generate ./... && CGO_ENABLED=0 go build -o activity_parser .
+	@echo "BPF test built: test/bpf/activity_parser"
+
+# Run test_cupti_prof and BPF activity parser in parallel.
+# The BPF test attaches to the activity_batch USDT probe and logs kernel activities.
+# Requires root (sudo) for BPF.
+test-multi: local bpf-test
+	@echo "=== Running test with BPF activity parser ==="
+	@LIB_PATH="build-local/lib/libparcagpucupti.so"; \
+	export LD_LIBRARY_PATH="$(CURDIR)/build-local/lib:$$LD_LIBRARY_PATH"; \
+	./build-local/bin/test_cupti_prof "$${LIB_PATH}" --kernel-names=kernel_names.txt --duration=10 & \
+	TEST_PID=$$!; \
+	sleep 1; \
+	echo "test_cupti_prof PID: $${TEST_PID}"; \
+	echo "Starting BPF activity parser (requires root)..."; \
+	sudo test/bpf/activity_parser -pid $${TEST_PID} -lib "$$(pwd)/$${LIB_PATH}" -v & \
+	BPF_PID=$$!; \
+	wait $${TEST_PID}; \
+	TEST_EXIT=$$?; \
+	sleep 1; \
+	sudo kill $${BPF_PID} 2>/dev/null; wait $${BPF_PID} 2>/dev/null; \
+	echo "=== test-multi completed (test exit: $${TEST_EXIT}) ==="
+
 format:
 	@echo "=== Formatting source files ==="
 	@clang-format -i -style=file src/*.cpp src/*.h test/*.c
diff --git a/src/parcagpu_cupti.cpp b/src/parcagpu_cupti.cpp
index 98cd44b..41303d1 100644
--- a/src/parcagpu_cupti.cpp
+++ b/src/parcagpu_cupti.cpp
@@ -87,6 +87,19 @@ void init_debug() {
     }                                                                          \
   } while (0)
 
+// Out-of-line USDT probe site for activity batches.
+// Single call site ensures one probe location in the ELF .note.stapsdt section.
+static constexpr int ACTIVITY_BATCH_SIZE = 128;
+
+} // namespace parcagpu
+
+__attribute__((noinline)) void parcagpuActivityBatch(const void **ptrs,
+                                                     uint32_t count) {
+  DTRACE_PROBE2(parcagpu, activity_batch, ptrs, count);
+}
+
+namespace parcagpu {
+
 // Simplified profiler using Proton's patterns
 class CuptiProfiler : public proton::Singleton<CuptiProfiler> {
 public:
@@ -213,6 +226,12 @@ class CuptiProfiler : public proton::Singleton<CuptiProfiler> {
     int recordCount = 0;
     int filteredCount = 0;
 
+    // Batch probe: collect pointers to activity records and pass them to
+    // BPF/USDT every ACTIVITY_BATCH_SIZE records. Stack-allocated array
+    // of pointers — no heap allocation, no copying, version-independent.
+    const void *batchPtrs[ACTIVITY_BATCH_SIZE];
+    uint32_t batchCount = 0;
+
     DEBUG_PRINTF(
         "[PARCAGPU] completeBuffer called: buffer=%p validSize=%zu\n",
         buffer, validSize);
@@ -273,6 +292,19 @@ class CuptiProfiler : public proton::Singleton<CuptiProfiler> {
                      record->kind);
         break;
       }
+
+      // Collect pointer for batch probe (all activity kinds).
+      // BPF consumers inspect the kind field to filter types they care about.
+      batchPtrs[batchCount++] = record;
+      if (batchCount >= ACTIVITY_BATCH_SIZE) {
+        parcagpuActivityBatch(batchPtrs, batchCount);
+        batchCount = 0;
+      }
+    }
+
+    // Flush remaining batch
+    if (batchCount > 0) {
+      parcagpuActivityBatch(batchPtrs, batchCount);
     }
 
     // End cycle - cleanup completed graph entries
@@ -281,6 +313,10 @@ class CuptiProfiler : public proton::Singleton<CuptiProfiler> {
     DEBUG_PRINTF("[PARCAGPU] Processed %d activity records (%d filtered) from buffer %p\n",
                  recordCount, filteredCount, buffer);
 
+    // Reset to 0 rather than decrement - one API callback can produce N
+    // activities so decrementing by recordCount can cause underflow
+    CuptiProfiler::instance().outstandingEvents = 0;
+
     // Free the buffer (Proton's pattern)
     std::free(buffer);
   }

From 59e0b35baca8f4d6ec12990aa207a7e15552f6d2 Mon Sep 17 00:00:00 2001
From: Tommy Reilly <gnurizen@gmail.com>
Date: Tue, 24 Mar 2026 13:31:41 -0500
Subject: [PATCH 06/14] Various fixes to make arm64 work

---
 CMakeLists.txt               |   5 +
 cupti/correlation_filter.cpp | 240 -----------------------------------
 cupti/correlation_filter.h   |  83 ------------
 test/bpf/go.mod              |   2 +-
 test/bpf/go.sum              |   5 +
 test/bpf/main.go             |   2 +-
 test/bpf/usdt_compat.h       |   9 +-
 7 files changed, 20 insertions(+), 326 deletions(-)
 delete mode 100644 cupti/correlation_filter.cpp
 delete mode 100644 cupti/correlation_filter.h

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 0c06f5b..aaa85ec 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -28,6 +28,11 @@ message(STATUS "Using Proton from: ${PROTON_DIR}")
 
 # Find CUDA headers (allow fallback to manual path for header-only builds)
 if(NOT DEFINED CUDA_INCLUDE_DIR)
+    # Prefer /usr/local/cuda over system-installed CUDA packages, which may
+    # ship older headers (e.g. CUDA 12.0) missing symbols that Proton needs.
+    if(NOT DEFINED CUDAToolkit_ROOT AND EXISTS "/usr/local/cuda/include/cuda.h")
+        set(CUDAToolkit_ROOT "/usr/local/cuda")
+    endif()
     find_package(CUDAToolkit QUIET)
     if(CUDAToolkit_FOUND)
         set(CUDA_INCLUDE_DIR ${CUDAToolkit_INCLUDE_DIRS})
diff --git a/cupti/correlation_filter.cpp b/cupti/correlation_filter.cpp
deleted file mode 100644
index 8d48dfa..0000000
--- a/cupti/correlation_filter.cpp
+++ /dev/null
@@ -1,240 +0,0 @@
-#include "correlation_filter.h"
-#include <unordered_set>
-#include <unordered_map>
-#include <mutex>
-#include <memory>
-
-// CorrelationFilter implementation using std::unordered_set with mutex protection
-// This provides thread-safe access with minimal overhead for our use case
-class CorrelationFilter {
-public:
-    CorrelationFilter() = default;
-
-    // Insert a correlation ID into the filter
-    // Thread-safe
-    void insert(uint32_t correlation_id) {
-        std::lock_guard<std::mutex> lock(mutex_);
-        set_.insert(correlation_id);
-    }
-
-    // Check if correlation ID exists and remove it atomically
-    // Returns true if found and removed, false if not found
-    // Thread-safe
-    bool check_and_remove(uint32_t correlation_id) {
-        std::lock_guard<std::mutex> lock(mutex_);
-        auto it = set_.find(correlation_id);
-        if (it != set_.end()) {
-            set_.erase(it);
-            return true;
-        }
-        return false;
-    }
-
-    // Get current size
-    // Thread-safe
-    size_t size() const {
-        std::lock_guard<std::mutex> lock(mutex_);
-        return set_.size();
-    }
-
-private:
-    std::unordered_set<uint32_t> set_;
-    mutable std::mutex mutex_;
-};
-
-// GraphCorrelationMap implementation for tracking graph launches across buffer cycles
-// Uses a 2-slot state machine per correlation ID to detect when graph launches are complete
-struct GraphCorrelationEntry {
-    uint8_t state[2];         // State for alternating cycles
-    bool ever_seen_kernel;    // True once we've seen at least one kernel activity
-    uint32_t insertion_cycle; // Buffer cycle when entry was created (for fallback cleanup)
-
-    GraphCorrelationEntry(uint32_t cycle)
-        : state{GRAPH_STATE_UNINITIALIZED, GRAPH_STATE_UNINITIALIZED}
-        , ever_seen_kernel(false)
-        , insertion_cycle(cycle) {}
-};
-
-class GraphCorrelationMap {
-public:
-    GraphCorrelationMap() : current_cycle_(0) {}
-
-    // Insert a correlation ID (called when sampling a graph launch)
-    // Thread-safe
-    void insert(uint32_t correlation_id) {
-        std::lock_guard<std::mutex> lock(mutex_);
-        map_.emplace(correlation_id, GraphCorrelationEntry(current_cycle_));
-    }
-
-    // Start a new processing cycle - clear the appropriate slot for all entries
-    // Thread-safe
-    void cycle_start(uint32_t cycle) {
-        std::lock_guard<std::mutex> lock(mutex_);
-        current_cycle_ = cycle;
-        uint32_t slot = cycle % 2;
-        for (auto& pair : map_) {
-            pair.second.state[slot] = GRAPH_STATE_CYCLE_CLEARED;
-        }
-    }
-
-    // Check if correlation ID is tracked and mark as seen for this cycle
-    // Returns true if tracked (should fire probe)
-    // Thread-safe
-    bool check_and_mark_seen(uint32_t correlation_id, uint32_t cycle) {
-        std::lock_guard<std::mutex> lock(mutex_);
-        auto it = map_.find(correlation_id);
-        if (it != map_.end()) {
-            uint32_t slot = cycle % 2;
-            it->second.state[slot] = GRAPH_STATE_KERNEL_SEEN;
-            it->second.ever_seen_kernel = true;  // Mark that we've seen at least one kernel
-            return true;
-        }
-        return false;
-    }
-
-    // End processing cycle - remove entries based on two conditions:
-    // 1. Primary: Both slots CYCLE_CLEARED AND we've seen at least one kernel (graph completed)
-    // 2. Fallback: Both slots CYCLE_CLEARED AND never seen kernel AND age > 100 cycles
-    //    (handles GPU reset, failed launches, etc.)
-    // Thread-safe
-    void cycle_end() {
-        std::lock_guard<std::mutex> lock(mutex_);
-        size_t removed_normal = 0;
-        size_t removed_fallback = 0;
-
-        for (auto it = map_.begin(); it != map_.end(); ) {
-            bool should_remove = false;
-            bool is_fallback = false;
-
-            if (it->second.state[0] == GRAPH_STATE_CYCLE_CLEARED &&
-                it->second.state[1] == GRAPH_STATE_CYCLE_CLEARED) {
-
-                if (it->second.ever_seen_kernel) {
-                    // Primary: Graph completed normally (saw kernels, then stopped)
-                    should_remove = true;
-                    removed_normal++;
-                } else if ((current_cycle_ - it->second.insertion_cycle) > 100) {
-                    // Fallback: Never saw kernels and entry is very old (>100 cycles)
-                    // Prevents leaking entries when GPU resets or launches fail
-                    should_remove = true;
-                    is_fallback = true;
-                    removed_fallback++;
-                }
-            }
-
-            if (should_remove) {
-                it = map_.erase(it);
-            } else {
-                ++it;
-            }
-        }
-    }
-
-    // Get cleanup stats (for debugging)
-    void get_stats(size_t& size, size_t& oldest_age) const {
-        std::lock_guard<std::mutex> lock(mutex_);
-        size = map_.size();
-        oldest_age = 0;
-        for (const auto& pair : map_) {
-            uint32_t age = current_cycle_ - pair.second.insertion_cycle;
-            if (age > oldest_age) {
-                oldest_age = age;
-            }
-        }
-    }
-
-    // Get current size
-    // Thread-safe
-    size_t size() const {
-        std::lock_guard<std::mutex> lock(mutex_);
-        return map_.size();
-    }
-
-private:
-    std::unordered_map<uint32_t, GraphCorrelationEntry> map_;
-    uint32_t current_cycle_;
-    mutable std::mutex mutex_;
-};
-
-// C API implementation
-extern "C" {
-
-CorrelationFilterHandle correlation_filter_create(void) {
-    return new CorrelationFilter();
-}
-
-void correlation_filter_destroy(CorrelationFilterHandle filter) {
-    if (filter) {
-        delete static_cast<CorrelationFilter*>(filter);
-    }
-}
-
-void correlation_filter_insert(CorrelationFilterHandle filter, uint32_t correlation_id) {
-    if (filter) {
-        static_cast<CorrelationFilter*>(filter)->insert(correlation_id);
-    }
-}
-
-bool correlation_filter_check_and_remove(CorrelationFilterHandle filter, uint32_t correlation_id) {
-    if (filter) {
-        return static_cast<CorrelationFilter*>(filter)->check_and_remove(correlation_id);
-    }
-    return false;
-}
-
-size_t correlation_filter_size(CorrelationFilterHandle filter) {
-    if (filter) {
-        return static_cast<CorrelationFilter*>(filter)->size();
-    }
-    return 0;
-}
-
-GraphCorrelationMapHandle graph_correlation_map_create(void) {
-    return new GraphCorrelationMap();
-}
-
-void graph_correlation_map_destroy(GraphCorrelationMapHandle map) {
-    if (map) {
-        delete static_cast<GraphCorrelationMap*>(map);
-    }
-}
-
-void graph_correlation_map_insert(GraphCorrelationMapHandle map, uint32_t correlation_id) {
-    if (map) {
-        static_cast<GraphCorrelationMap*>(map)->insert(correlation_id);
-    }
-}
-
-void graph_correlation_map_cycle_start(GraphCorrelationMapHandle map, uint32_t cycle) {
-    if (map) {
-        static_cast<GraphCorrelationMap*>(map)->cycle_start(cycle);
-    }
-}
-
-bool graph_correlation_map_check_and_mark_seen(GraphCorrelationMapHandle map, uint32_t correlation_id, uint32_t cycle) {
-    if (map) {
-        return static_cast<GraphCorrelationMap*>(map)->check_and_mark_seen(correlation_id, cycle);
-    }
-    return false;
-}
-
-void graph_correlation_map_cycle_end(GraphCorrelationMapHandle map) {
-    if (map) {
-        static_cast<GraphCorrelationMap*>(map)->cycle_end();
-    }
-}
-
-size_t graph_correlation_map_size(GraphCorrelationMapHandle map) {
-    if (map) {
-        return static_cast<GraphCorrelationMap*>(map)->size();
-    }
-    return 0;
-}
-
-void graph_correlation_map_get_stats(GraphCorrelationMapHandle map, size_t* size, size_t* oldest_age) {
-    if (map && size && oldest_age) {
-        static_cast<GraphCorrelationMap*>(map)->get_stats(*size, *oldest_age);
-    }
-}
-
-} // extern "C"
diff --git a/cupti/correlation_filter.h b/cupti/correlation_filter.h
deleted file mode 100644
index 53a2c78..0000000
--- a/cupti/correlation_filter.h
+++ /dev/null
@@ -1,83 +0,0 @@
-#pragma once
-
-#include <stdbool.h>
-#include <stddef.h>
-#include <stdint.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-// Opaque handle to correlation filter
-typedef void *CorrelationFilterHandle;
-
-// Create a new correlation filter
-CorrelationFilterHandle correlation_filter_create(void);
-
-// Destroy the correlation filter
-void correlation_filter_destroy(CorrelationFilterHandle filter);
-
-// Insert a correlation ID into the filter
-// Thread-safe: can be called from multiple threads concurrently
-void correlation_filter_insert(CorrelationFilterHandle filter,
-                               uint32_t correlation_id);
-
-// Check if a correlation ID exists and remove it if found
-// Returns true if the correlation ID was found and removed, false otherwise
-// Thread-safe: safe to call concurrently with inserts
-bool correlation_filter_check_and_remove(CorrelationFilterHandle filter,
-                                         uint32_t correlation_id);
-
-// Get the current size of the filter (number of tracked correlation IDs)
-// Note: This is an approximate count in concurrent scenarios
-size_t correlation_filter_size(CorrelationFilterHandle filter);
-
-// Graph correlation state values
-enum GraphCorrelationState {
-  GRAPH_STATE_UNINITIALIZED = 0, // Entry just created, slot not yet processed
-  GRAPH_STATE_CYCLE_CLEARED = 1, // Cycle started, no kernels seen yet
-  GRAPH_STATE_KERNEL_SEEN = 2    // At least one kernel seen this cycle
-};
-
-// Opaque handle to graph correlation map
-typedef void *GraphCorrelationMapHandle;
-
-// Create a new graph correlation map
-GraphCorrelationMapHandle graph_correlation_map_create(void);
-
-// Destroy the graph correlation map
-void graph_correlation_map_destroy(GraphCorrelationMapHandle map);
-
-// Insert a correlation ID into the map (called when sampling a graph launch)
-// Thread-safe: can be called from multiple threads concurrently
-void graph_correlation_map_insert(GraphCorrelationMapHandle map,
-                                  uint32_t correlation_id);
-
-// Start a new processing cycle - clears the appropriate slot for all entries
-// Thread-safe
-void graph_correlation_map_cycle_start(GraphCorrelationMapHandle map,
-                                       uint32_t cycle);
-
-// Check if correlation ID should fire probe and mark as seen for this cycle
-// Returns true if the correlation ID is tracked (should fire probe)
-// Thread-safe
-bool graph_correlation_map_check_and_mark_seen(GraphCorrelationMapHandle map,
-                                               uint32_t correlation_id,
-                                               uint32_t cycle);
-
-// End processing cycle - removes entries that haven't seen kernels in 2
-// consecutive cycles Thread-safe
-void graph_correlation_map_cycle_end(GraphCorrelationMapHandle map);
-
-// Get the current size of the map (number of tracked correlation IDs)
-// Note: This is an approximate count in concurrent scenarios
-size_t graph_correlation_map_size(GraphCorrelationMapHandle map);
-
-// Get statistics about the map (for debugging)
-// Returns the current size and age of the oldest entry (in cycles)
-void graph_correlation_map_get_stats(GraphCorrelationMapHandle map,
-                                     size_t *size, size_t *oldest_age);
-
-#ifdef __cplusplus
-}
-#endif
diff --git a/test/bpf/go.mod b/test/bpf/go.mod
index ae3ac5b..46ea536 100644
--- a/test/bpf/go.mod
+++ b/test/bpf/go.mod
@@ -3,7 +3,7 @@ module github.com/parca-dev/batch-kernel/test/bpf
 go 1.25.1
 
 require (
-	github.com/cilium/ebpf v0.20.0
+	github.com/cilium/ebpf v0.21.0
 	go.opentelemetry.io/ebpf-profiler v0.0.0-00010101000000-000000000000
 	golang.org/x/sys v0.41.0
 )
diff --git a/test/bpf/go.sum b/test/bpf/go.sum
index 6530420..7eaf9ca 100644
--- a/test/bpf/go.sum
+++ b/test/bpf/go.sum
@@ -2,9 +2,12 @@ github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UF
 github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs=
 github.com/cilium/ebpf v0.20.0 h1:atwWj9d3NffHyPZzVlx3hmw1on5CLe9eljR8VuHTwhM=
 github.com/cilium/ebpf v0.20.0/go.mod h1:pzLjFymM+uZPLk/IXZUL63xdx5VXEo+enTzxkZXdycw=
+github.com/cilium/ebpf v0.21.0 h1:4dpx1J/B/1apeTmWBH5BkVLayHTkFrMovVPnHEk+l3k=
+github.com/cilium/ebpf v0.21.0/go.mod h1:1kHKv6Kvh5a6TePP5vvvoMa1bclRyzUXELSs272fmIQ=
 github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
 github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
 github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
+github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc h1:U9qPSI2PIWSS1VwoXQT9A3Wy9MM3WgvqSxFWenqJduM=
 github.com/go-logr/logr v1.2.2/go.mod h1:jdQByPbusPIv2/zmleS9BjJVeZ6kBagPoEUsqbVz/1A=
 github.com/go-logr/logr v1.4.3 h1:CjnDlHq8ikf6E492q6eKboGOC0T8CDaOvkHCIg8idEI=
 github.com/go-logr/logr v1.4.3/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY=
@@ -30,10 +33,12 @@ github.com/mdlayher/netlink v1.7.2 h1:/UtM3ofJap7Vl4QWCPDGXY8d3GIY2UGSDbK+QWmY8/
 github.com/mdlayher/netlink v1.7.2/go.mod h1:xraEF7uJbxLhc5fpHL4cPe221LI2bdttWlU+ZGLfQSw=
 github.com/mdlayher/socket v0.4.1 h1:eM9y2/jlbs1M615oshPQOHZzj6R6wMT7bX5NPiQvn2U=
 github.com/mdlayher/socket v0.4.1/go.mod h1:cAqeGjoufqdxWkD7DkpyS+wcefOtmu5OQ8KuoJGIReA=
+github.com/mdlayher/socket v0.5.1 h1:VZaqt6RkGkt2OE9l3GcC6nZkqD3xKeQLyfleW/uBcos=
 github.com/minio/sha256-simd v1.0.1 h1:6kaan5IFmwTNynnKKpDHe6FWHohJOHhCPchzK49dzMM=
 github.com/minio/sha256-simd v1.0.1/go.mod h1:Pz6AKMiUdngCLpeTL/RJY1M9rUuPMYujV5xJjtbRSN8=
 github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
 github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
+github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 h1:Jamvg5psRIccs7FGNTlIRMkT8wgtp5eCXdBlqhYGL6U=
 github.com/rogpeppe/go-internal v1.14.1 h1:UQB4HGPB6osV0SQTLymcB4TgvyWu6ZyliaW0tI/otEQ=
 github.com/rogpeppe/go-internal v1.14.1/go.mod h1:MaRKkUm5W0goXpeCfT7UZI6fk/L7L7so1lCWt35ZSgc=
 github.com/sirupsen/logrus v1.9.3 h1:dueUQJ1C2q9oE3F7wvmSGAaVtTmUizReu6fjN8uqzbQ=
diff --git a/test/bpf/main.go b/test/bpf/main.go
index f6e9738..85c6ff9 100644
--- a/test/bpf/main.go
+++ b/test/bpf/main.go
@@ -34,7 +34,7 @@ import (
 	"golang.org/x/sys/unix"
 )
 
-//go:generate bpf2go -cc clang -cflags "-O2 -g -Wall -target bpf -D__TARGET_ARCH_x86 -D__x86_64__ -I../../vendor/opentelemetry-ebpf-profiler/support/ebpf" activityParser activity_parser.bpf.c
+//go:generate go run github.com/cilium/ebpf/cmd/bpf2go -target $GOARCH -cflags "-I../../vendor/opentelemetry-ebpf-profiler/support/ebpf" activityParser activity_parser.bpf.c
 
 // KernelEvent matches struct kernel_event in the BPF program.
 type KernelEvent struct {
diff --git a/test/bpf/usdt_compat.h b/test/bpf/usdt_compat.h
index ca5a15e..fcd127e 100644
--- a/test/bpf/usdt_compat.h
+++ b/test/bpf/usdt_compat.h
@@ -8,7 +8,8 @@
 #ifndef USDT_COMPAT_H
 #define USDT_COMPAT_H
 
-// Skip vendor bpfdefs.h and types.h — we already have everything from vmlinux.h.
+// Skip vendor bpfdefs.h and types.h — we already have everything from
+// vmlinux.h.
 #define OPTI_BPFDEFS_H
 #define OPTI_TYPES_H
 
@@ -26,4 +27,10 @@ typedef struct bpf_map_def {
   unsigned int map_flags;
 } bpf_map_def;
 
+// Hack: even with -target arm64 bpf2go doesn't invoke clang so that __arch64__
+// is defined, but __TARGET_ARCH_arm64 is
+#if defined(__TARGET_ARCH_arm64)
+#define __aarch64__
+#endif
+
 #endif // USDT_COMPAT_H

From 73410e7ddf475966a8e0ecaa0aa16edb8e6c87e0 Mon Sep 17 00:00:00 2001
From: Tommy Reilly <gnurizen@gmail.com>
Date: Tue, 24 Mar 2026 14:50:25 -0400
Subject: [PATCH 07/14] And make amd64 compile too

---
 test/bpf/usdt_compat.h | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/test/bpf/usdt_compat.h b/test/bpf/usdt_compat.h
index fcd127e..4dc35bd 100644
--- a/test/bpf/usdt_compat.h
+++ b/test/bpf/usdt_compat.h
@@ -27,9 +27,13 @@ typedef struct bpf_map_def {
   unsigned int map_flags;
 } bpf_map_def;
 
-// Hack: even with -target arm64 bpf2go doesn't invoke clang so that __arch64__
-// is defined, but __TARGET_ARCH_arm64 is
-#if defined(__TARGET_ARCH_arm64)
+// bpf2go passes -target bpfel (not the real platform triple), so the
+// usual compiler builtins (__x86_64__, __aarch64__) are never set.
+// Bridge from bpf2go's __TARGET_ARCH_* defines to the builtins that
+// usdt_args.h checks for pt_regs layout.
+#if defined(__TARGET_ARCH_x86)
+#define __x86_64__
+#elif defined(__TARGET_ARCH_arm64)
 #define __aarch64__
 #endif
 

From 16a9e54e77a6aec90fe2894da114602cfa564eee Mon Sep 17 00:00:00 2001
From: Tommy Reilly <gnurizen@gmail.com>
Date: Wed, 25 Mar 2026 09:08:40 -0500
Subject: [PATCH 08/14] Small cleanups/formatting

---
 .clangd                      |   2 +
 src/parcagpu_cupti.cpp       | 124 +++++++++++++++++++----------------
 src/parcagpu_pc_sampling.cpp |  90 ++++++++++++-------------
 src/parcagpu_pc_sampling.h   |  16 ++++-
 4 files changed, 127 insertions(+), 105 deletions(-)
 create mode 100644 .clangd

diff --git a/.clangd b/.clangd
new file mode 100644
index 0000000..1e71e2d
--- /dev/null
+++ b/.clangd
@@ -0,0 +1,2 @@
+CompileFlags:
+  CompilationDatabase: build-local
diff --git a/src/parcagpu_cupti.cpp b/src/parcagpu_cupti.cpp
index 41303d1..486bf88 100644
--- a/src/parcagpu_cupti.cpp
+++ b/src/parcagpu_cupti.cpp
@@ -30,7 +30,8 @@ static GraphCorrelationMap g_graphCorrelationMap;
 static std::atomic<uint32_t> g_bufferCycle{0};
 
 // Thread-local tracking: store correlation ID from runtime ENTER
-// so we can skip driver EXIT probe when it matches (driver calls happen under runtime calls)
+// so we can skip driver EXIT probe when it matches (driver calls happen under
+// runtime calls)
 thread_local uint32_t runtimeEnterCorrelationId = 0;
 
 // Token bucket rate limiter (configurable via PARCAGPU_RATE_LIMIT)
@@ -76,17 +77,6 @@ void init_debug() {
   }
 }
 
-#define DEBUG_PRINTF(...)                                                      \
-  do {                                                                         \
-    parcagpu::init_debug();                                                    \
-    if (parcagpu::debug_enabled) {                                             \
-      struct timespec ts;                                                      \
-      clock_gettime(CLOCK_REALTIME, &ts);                                      \
-      fprintf(stderr, "[%ld.%09ld] ", ts.tv_sec, ts.tv_nsec);                  \
-      fprintf(stderr, __VA_ARGS__);                                            \
-    }                                                                          \
-  } while (0)
-
 // Out-of-line USDT probe site for activity batches.
 // Single call site ensures one probe location in the ELF .note.stapsdt section.
 static constexpr int ACTIVITY_BATCH_SIZE = 128;
@@ -95,7 +85,7 @@ static constexpr int ACTIVITY_BATCH_SIZE = 128;
 
 __attribute__((noinline)) void parcagpuActivityBatch(const void **ptrs,
                                                      uint32_t count) {
-  DTRACE_PROBE2(parcagpu, activity_batch, ptrs, count);
+  STAP_PROBE2(parcagpu, activity_batch, ptrs, count);
 }
 
 namespace parcagpu {
@@ -121,14 +111,16 @@ class CuptiProfiler : public proton::Singleton<CuptiProfiler> {
     if (pcSamplingEnabled) {
       DEBUG_PRINTF("[PARCAGPU] PC sampling enabled (continuous mode)\n");
     } else {
-      DEBUG_PRINTF("[PARCAGPU] PC sampling disabled, using kernel activity only\n");
+      DEBUG_PRINTF(
+          "[PARCAGPU] PC sampling disabled, using kernel activity only\n");
     }
 
     // Subscribe to callbacks
-    auto result = proton::cupti::subscribe<true>(&subscriber, callbackHandler,
-                                            nullptr);
+    auto result =
+        proton::cupti::subscribe<true>(&subscriber, callbackHandler, nullptr);
     if (result != CUPTI_SUCCESS) {
-      DEBUG_PRINTF("[PARCAGPU] Failed to subscribe to callbacks: error %d\n", result);
+      DEBUG_PRINTF("[PARCAGPU] Failed to subscribe to callbacks: error %d\n",
+                   result);
       return false;
     }
 
@@ -145,7 +137,9 @@ class CuptiProfiler : public proton::Singleton<CuptiProfiler> {
     result = proton::cupti::activityRegisterCallbacks<true>(allocBuffer,
                                                             completeBuffer);
     if (result != CUPTI_SUCCESS) {
-      DEBUG_PRINTF("[PARCAGPU] Failed to register activity callbacks: error %d\n", result);
+      DEBUG_PRINTF(
+          "[PARCAGPU] Failed to register activity callbacks: error %d\n",
+          result);
       return false;
     }
 
@@ -153,7 +147,9 @@ class CuptiProfiler : public proton::Singleton<CuptiProfiler> {
     result = proton::cupti::activityEnable<true>(
         CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL);
     if (result != CUPTI_SUCCESS) {
-      DEBUG_PRINTF("[PARCAGPU] Failed to enable concurrent kernel activity: error %d\n", result);
+      DEBUG_PRINTF(
+          "[PARCAGPU] Failed to enable concurrent kernel activity: error %d\n",
+          result);
     } else {
       DEBUG_PRINTF("[PARCAGPU] Enabled CONCURRENT_KERNEL activity\n");
     }
@@ -220,8 +216,8 @@ class CuptiProfiler : public proton::Singleton<CuptiProfiler> {
                  *buffer, *bufferSize);
   }
 
-  static void completeBuffer(CUcontext ctx, uint32_t streamId,
-                              uint8_t *buffer, size_t size, size_t validSize) {
+  static void completeBuffer(CUcontext ctx, uint32_t streamId, uint8_t *buffer,
+                             size_t size, size_t validSize) {
     CUpti_Activity *record = nullptr;
     int recordCount = 0;
     int filteredCount = 0;
@@ -232,21 +228,21 @@ class CuptiProfiler : public proton::Singleton<CuptiProfiler> {
     const void *batchPtrs[ACTIVITY_BATCH_SIZE];
     uint32_t batchCount = 0;
 
-    DEBUG_PRINTF(
-        "[PARCAGPU] completeBuffer called: buffer=%p validSize=%zu\n",
-        buffer, validSize);
+    DEBUG_PRINTF("[PARCAGPU] completeBuffer called: buffer=%p validSize=%zu\n",
+                 buffer, validSize);
 
     // Start a new buffer cycle for graph correlation tracking
     uint32_t cycle = g_bufferCycle.fetch_add(1);
     g_graphCorrelationMap.cycle_start(cycle);
 
     while (true) {
-      CUptiResult result =
-          proton::cupti::activityGetNextRecord<false>(buffer, validSize, &record);
+      CUptiResult result = proton::cupti::activityGetNextRecord<false>(
+          buffer, validSize, &record);
       if (result == CUPTI_ERROR_MAX_LIMIT_REACHED) {
         break;
       } else if (result != CUPTI_SUCCESS) {
-        DEBUG_PRINTF("[PARCAGPU] Error reading activity record: error %d\n", result);
+        DEBUG_PRINTF("[PARCAGPU] Error reading activity record: error %d\n",
+                     result);
         break;
       }
 
@@ -260,7 +256,8 @@ class CuptiProfiler : public proton::Singleton<CuptiProfiler> {
         bool shouldEmit = false;
         if (k->graphId != 0) {
           // Graph kernel - check graph correlation map
-          shouldEmit = g_graphCorrelationMap.check_and_mark_seen(k->correlationId, cycle);
+          shouldEmit = g_graphCorrelationMap.check_and_mark_seen(
+              k->correlationId, cycle);
         } else {
           // Regular kernel - check and remove from correlation filter
           shouldEmit = g_correlationFilter.check_and_remove(k->correlationId);
@@ -310,20 +307,26 @@ class CuptiProfiler : public proton::Singleton<CuptiProfiler> {
     // End cycle - cleanup completed graph entries
     g_graphCorrelationMap.cycle_end();
 
-    DEBUG_PRINTF("[PARCAGPU] Processed %d activity records (%d filtered) from buffer %p\n",
+    DEBUG_PRINTF("[PARCAGPU] Processed %d activity records (%d filtered) from "
+                 "buffer %p\n",
                  recordCount, filteredCount, buffer);
 
     // Reset to 0 rather than decrement - one API callback can produce N
     // activities so decrementing by recordCount can cause underflow
     CuptiProfiler::instance().outstandingEvents = 0;
 
+    // Drain PC sampling data on the buffer-completion thread (off the
+    // application's launch path).
+    if (CuptiProfiler::instance().pcSamplingEnabled && ctx != nullptr) {
+      parcagpu::PCSampling::instance().collectData(ctx);
+    }
+
     // Free the buffer (Proton's pattern)
     std::free(buffer);
   }
 
   static void callbackHandler(void *userdata, CUpti_CallbackDomain domain,
-                              CUpti_CallbackId cbid,
-                              const void *cbdata_void) {
+                              CUpti_CallbackId cbid, const void *cbdata_void) {
     auto &profiler = CuptiProfiler::instance();
 
     if (domain == CUPTI_CB_DOMAIN_RESOURCE) {
@@ -338,23 +341,25 @@ class CuptiProfiler : public proton::Singleton<CuptiProfiler> {
       switch (cbid) {
       case CUPTI_CBID_RESOURCE_MODULE_LOADED: {
         const CUpti_ModuleResourceData *modData =
-            static_cast<const CUpti_ModuleResourceData *>(resData->resourceDescriptor);
+            static_cast<const CUpti_ModuleResourceData *>(
+                resData->resourceDescriptor);
         if (modData && modData->pCubin && modData->cubinSize > 0) {
           DEBUG_PRINTF("[PARCAGPU] Module loaded: cubin=%p size=%zu\n",
                        modData->pCubin, modData->cubinSize);
-          parcagpu::PCSampling::instance().loadModule(
-              modData->pCubin, modData->cubinSize);
+          parcagpu::PCSampling::instance().loadModule(modData->pCubin,
+                                                      modData->cubinSize);
         }
         break;
       }
       case CUPTI_CBID_RESOURCE_MODULE_UNLOAD_STARTING: {
         const CUpti_ModuleResourceData *modData =
-            static_cast<const CUpti_ModuleResourceData *>(resData->resourceDescriptor);
+            static_cast<const CUpti_ModuleResourceData *>(
+                resData->resourceDescriptor);
         if (modData && modData->pCubin && modData->cubinSize > 0) {
           DEBUG_PRINTF("[PARCAGPU] Module unloading: cubin=%p size=%zu\n",
                        modData->pCubin, modData->cubinSize);
-          parcagpu::PCSampling::instance().unloadModule(
-              modData->pCubin, modData->cubinSize);
+          parcagpu::PCSampling::instance().unloadModule(modData->pCubin,
+                                                        modData->cubinSize);
         }
         break;
       }
@@ -391,24 +396,29 @@ class CuptiProfiler : public proton::Singleton<CuptiProfiler> {
         return;
       }
 
-      const char *name = cbdata->symbolName ? cbdata->symbolName : cbdata->functionName;
+      const char *name =
+          cbdata->symbolName ? cbdata->symbolName : cbdata->functionName;
       int signedCbid;
 
       if (domain == CUPTI_CB_DOMAIN_DRIVER_API) {
-        // Skip if this driver call is under a runtime call (same correlation ID)
+        // Skip if this driver call is under a runtime call (same correlation
+        // ID)
         if (correlationId == runtimeEnterCorrelationId) {
-          DEBUG_PRINTF("[PARCAGPU] Skipping driver EXIT correlationId=%u - runtime will handle\n",
+          DEBUG_PRINTF("[PARCAGPU] Skipping driver EXIT correlationId=%u - "
+                       "runtime will handle\n",
                        correlationId);
           return;
         }
         // Pure driver call (no runtime wrapper) - use negative cbid
         signedCbid = -(int)cbid;
-        DEBUG_PRINTF("[PARCAGPU] Driver API callback: cbid=%d, correlationId=%u, func=%s\n",
+        DEBUG_PRINTF("[PARCAGPU] Driver API callback: cbid=%d, "
+                     "correlationId=%u, func=%s\n",
                      cbid, correlationId, name);
       } else if (domain == CUPTI_CB_DOMAIN_RUNTIME_API) {
         signedCbid = (int)cbid;
         runtimeEnterCorrelationId = 0; // Clear after use
-        DEBUG_PRINTF("[PARCAGPU] Runtime API callback: cbid=%d, correlationId=%u, func=%s\n",
+        DEBUG_PRINTF("[PARCAGPU] Runtime API callback: cbid=%d, "
+                     "correlationId=%u, func=%s\n",
                      cbid, correlationId, name);
       } else {
         return;
@@ -419,12 +429,15 @@ class CuptiProfiler : public proton::Singleton<CuptiProfiler> {
       if (signedCbid < 0) {
         // Driver API: cuGraphLaunch = 514, cuGraphLaunch_ptsz = 515
         int driverCbid = -signedCbid;
-        isGraphLaunch = (driverCbid == CUPTI_DRIVER_TRACE_CBID_cuGraphLaunch ||
-                         driverCbid == CUPTI_DRIVER_TRACE_CBID_cuGraphLaunch_ptsz);
+        isGraphLaunch =
+            (driverCbid == CUPTI_DRIVER_TRACE_CBID_cuGraphLaunch ||
+             driverCbid == CUPTI_DRIVER_TRACE_CBID_cuGraphLaunch_ptsz);
       } else {
         // Runtime API: cudaGraphLaunch = 311, cudaGraphLaunch_ptsz = 312
-        isGraphLaunch = (signedCbid == CUPTI_RUNTIME_TRACE_CBID_cudaGraphLaunch_v10000 ||
-                         signedCbid == CUPTI_RUNTIME_TRACE_CBID_cudaGraphLaunch_ptsz_v10000);
+        isGraphLaunch =
+            (signedCbid == CUPTI_RUNTIME_TRACE_CBID_cudaGraphLaunch_v10000 ||
+             signedCbid ==
+                 CUPTI_RUNTIME_TRACE_CBID_cudaGraphLaunch_ptsz_v10000);
       }
 
       // Rate limit probes using token bucket (skip for graph launches)
@@ -433,14 +446,16 @@ class CuptiProfiler : public proton::Singleton<CuptiProfiler> {
         clock_gettime(CLOCK_MONOTONIC, &ts);
         uint64_t nowNs = (uint64_t)ts.tv_sec * 1000000000ULL + ts.tv_nsec;
         if (!rateLimiterTryAcquire(nowNs)) {
-          DEBUG_PRINTF("[PARCAGPU] Rate limited: skipping probe for correlationId=%u\n",
-                       correlationId);
+          DEBUG_PRINTF(
+              "[PARCAGPU] Rate limited: skipping probe for correlationId=%u\n",
+              correlationId);
           return;
         }
       }
 
       profiler.outstandingEvents++;
-      // Emit USDT probe with signed cbid (negative for driver, positive for runtime)
+      // Emit USDT probe with signed cbid (negative for driver, positive for
+      // runtime)
       STAP_PROBE3(parcagpu, cuda_correlation, correlationId, signedCbid, name);
 
       // Insert into correlation filter so we can match kernel activities later
@@ -450,8 +465,9 @@ class CuptiProfiler : public proton::Singleton<CuptiProfiler> {
                      correlationId);
       } else {
         g_correlationFilter.insert(correlationId);
-        DEBUG_PRINTF("[PARCAGPU] Inserted correlationId=%u into correlation filter\n",
-                     correlationId);
+        DEBUG_PRINTF(
+            "[PARCAGPU] Inserted correlationId=%u into correlation filter\n",
+            correlationId);
       }
 
       // Flush if too many events pile up
@@ -461,12 +477,6 @@ class CuptiProfiler : public proton::Singleton<CuptiProfiler> {
         proton::cupti::activityFlushAll<true>(0);
         profiler.outstandingEvents = 0;
       }
-
-      // Collect PC sampling data after kernel launch (continuous mode)
-      if (profiler.pcSamplingEnabled) {
-        parcagpu::PCSampling::instance().collectData(cbdata->context,
-                                                   correlationId);
-      }
     }
   }
 };
diff --git a/src/parcagpu_pc_sampling.cpp b/src/parcagpu_pc_sampling.cpp
index 7eb766d..35ef021 100644
--- a/src/parcagpu_pc_sampling.cpp
+++ b/src/parcagpu_pc_sampling.cpp
@@ -1,6 +1,6 @@
-#include "parcagpu_pc_sampling.h"
 #include "Driver/GPU/CudaApi.h"
 #include "Driver/GPU/CuptiApi.h"
+#include "parcagpu_pc_sampling.h"
 #include <cstdio>
 #include <cstdlib>
 #include <cstring>
@@ -18,15 +18,6 @@ namespace parcagpu {
 extern bool debug_enabled;
 extern void init_debug();
 
-#define DEBUG_PRINTF(...)                                                      \
-  do {                                                                         \
-    parcagpu::init_debug();                                                    \
-    if (parcagpu::debug_enabled) {                                             \
-      fprintf(stderr, "[PARCAGPU:PCSampling] ");                               \
-      fprintf(stderr, __VA_ARGS__);                                            \
-    }                                                                          \
-  } while (0)
-
 namespace {
 
 // CUPTI helper functions (adapted from Proton's CuptiPCSamplingUtils.h)
@@ -126,7 +117,7 @@ void doubleCheckedLock(CheckFn check, std::mutex &mutex, ActionFn action) {
 }
 
 // Helper to get PARCAGPU's custom sampling frequency from environment
-uint32_t getParcaGPUSamplingFrequency() {
+uint32_t getGPUSamplingFrequency() {
   // Default frequency for PARCAGPU is 18 (Proton uses 10)
   constexpr uint32_t PARCAGPU_DEFAULT_FREQUENCY = 18;
 
@@ -138,9 +129,13 @@ uint32_t getParcaGPUSamplingFrequency() {
       samplingPeriod = factor;
       DEBUG_PRINTF("Using PARCAGPU_SAMPLING_FACTOR=%u\n", samplingPeriod);
     } else if (factor != 0) { // 0 is handled in isSupported()
-      fprintf(stderr, "[PARCAGPU] Warning: PARCAGPU_SAMPLING_FACTOR=%d out of range [5,31], using default %u\n",
+      fprintf(stderr,
+              "[PARCAGPU] Warning: PARCAGPU_SAMPLING_FACTOR=%d out of range "
+              "[5,31], using default %u\n",
               factor, PARCAGPU_DEFAULT_FREQUENCY);
     }
+  } else {
+    return 0;
   }
   return samplingPeriod;
 }
@@ -251,9 +246,10 @@ CUpti_PCSamplingConfigurationInfo ConfigureData::configureSamplingPeriod() {
       CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_SAMPLING_PERIOD;
 
   // Use PARCAGPU's custom sampling frequency
-  uint32_t frequency = getParcaGPUSamplingFrequency();
+  uint32_t frequency = getGPUSamplingFrequency();
 
-  samplingPeriodInfo.attributeData.samplingPeriodData.samplingPeriod = frequency;
+  samplingPeriodInfo.attributeData.samplingPeriodData.samplingPeriod =
+      frequency;
   return samplingPeriodInfo;
 }
 
@@ -299,8 +295,8 @@ void ConfigureData::initialize(CUcontext context) {
   this->context = context;
   proton::cupti::getContextId<true>(context, &contextId);
 
-  DEBUG_PRINTF("Initializing PC sampling for context %p (id %u)\n",
-               context, contextId);
+  DEBUG_PRINTF("Initializing PC sampling for context %p (id %u)\n", context,
+               contextId);
 
   configurationInfos.emplace_back(configureStallReasons());
   configurationInfos.emplace_back(configureSamplingPeriod());
@@ -319,9 +315,9 @@ void ConfigureData::initialize(CUcontext context) {
 
 bool PCSampling::isSupported() {
   // Check PARCAGPU_SAMPLING_FACTOR - 0 means disabled
-  const char *sampling_factor_env = getenv("PARCAGPU_SAMPLING_FACTOR");
-  if (sampling_factor_env && atoi(sampling_factor_env) == 0) {
-    DEBUG_PRINTF("PC sampling disabled via PARCAGPU_SAMPLING_FACTOR=0\n");
+  int sampling_factor = getGPUSamplingFrequency();
+  if (sampling_factor == 0) {
+    DEBUG_PRINTF("PC sampling not enabled via PARCAGPU_SAMPLING_FACTOR\n");
     return false;
   }
 
@@ -333,7 +329,8 @@ bool PCSampling::isSupported() {
     int major = driverVersion / 1000;
     int minor = (driverVersion % 1000) / 10;
     int patch = driverVersion % 10;
-    DEBUG_PRINTF("PC sampling not supported: CUDA driver version %d.%d.%d < required 12.8.1\n",
+    DEBUG_PRINTF("PC sampling not supported: CUDA driver version %d.%d.%d < "
+                 "required 12.8.1\n",
                  major, minor, patch);
     return false;
   }
@@ -341,7 +338,8 @@ bool PCSampling::isSupported() {
   int major = driverVersion / 1000;
   int minor = (driverVersion % 1000) / 10;
   int patch = driverVersion % 10;
-  DEBUG_PRINTF("PC sampling supported: CUDA driver version %d.%d.%d\n", major, minor, patch);
+  DEBUG_PRINTF("PC sampling supported: CUDA driver version %d.%d.%d\n", major,
+               minor, patch);
   return true;
 }
 
@@ -357,19 +355,19 @@ void PCSampling::initialize(CUcontext context) {
   uint32_t contextId = 0;
   proton::cupti::getContextId<true>(context, &contextId);
 
-  doubleCheckedLock([&]() { return !contextInitialized.contain(contextId); },
-                    contextMutex,
-                    [&]() {
-                      enablePCSampling(context);
-                      getConfigureData(contextId)->initialize(context);
+  doubleCheckedLock(
+      [&]() { return !contextInitialized.contain(contextId); }, contextMutex,
+      [&]() {
+        enablePCSampling(context);
+        getConfigureData(contextId)->initialize(context);
 
-                      contextInitialized.insert(contextId);
-                      DEBUG_PRINTF("PC sampling started in continuous mode for context %u\n", contextId);
-                    });
+        contextInitialized.insert(contextId);
+        DEBUG_PRINTF("PC sampling started in continuous mode for context %u\n",
+                     contextId);
+      });
 }
 
-void PCSampling::processPCSamplingData(ConfigureData *configureData,
-                                                uint32_t correlationId) {
+void PCSampling::processPCSamplingData(ConfigureData *configureData) {
   auto *pcSamplingData = &configureData->pcSamplingData;
 
   // Collect data in rounds
@@ -407,9 +405,8 @@ void PCSampling::processPCSamplingData(ConfigureData *configureData,
         totalSamples += stallReason->samples;
 
         // Check if this is a "not_issued" stall (not really stalled)
-        bool isNotIssued =
-            configureData->notIssuedStallReasonIndices.count(
-                stallReason->pcSamplingStallReasonIndex) > 0;
+        bool isNotIssued = configureData->notIssuedStallReasonIndices.count(
+                               stallReason->pcSamplingStallReasonIndex) > 0;
 
         if (!isNotIssued) {
           stalledSamples += stallReason->samples;
@@ -420,9 +417,10 @@ void PCSampling::processPCSamplingData(ConfigureData *configureData,
       std::string fullPath = lineInfo.fileName.size()
                                  ? lineInfo.dirName + "/" + lineInfo.fileName
                                  : "";
-      STAP_PROBE8(parcagpu, pc_sample_summary, correlationId, pcData->functionIndex,
+      STAP_PROBE7(parcagpu, pc_sample_summary, pcData->functionIndex,
                   pcData->pcOffset, totalSamples, stalledSamples,
-                  fullPath.c_str(), lineInfo.lineNumber, lineInfo.functionName.c_str());
+                  fullPath.c_str(), lineInfo.lineNumber,
+                  lineInfo.functionName.c_str());
 
       // Emit detailed stall reason probes
       for (size_t j = 0; j < pcData->stallReasonCount; ++j) {
@@ -438,8 +436,9 @@ void PCSampling::processPCSamplingData(ConfigureData *configureData,
           }
         }
 
-        STAP_PROBE6(parcagpu, pc_stall_reason, correlationId, pcData->functionIndex,
-                    pcData->pcOffset, stallReasonIndex, stallReasonName, stallReason->samples);
+        STAP_PROBE5(parcagpu, pc_stall_reason, pcData->functionIndex,
+                    pcData->pcOffset, stallReasonIndex, stallReasonName,
+                    stallReason->samples);
       }
     }
 
@@ -453,20 +452,21 @@ void PCSampling::processPCSamplingData(ConfigureData *configureData,
   }
 }
 
-void PCSampling::collectData(CUcontext context, uint32_t correlationId) {
+void PCSampling::collectData(CUcontext context) {
   uint32_t contextId = 0;
   proton::cupti::getContextId<true>(context, &contextId);
 
   if (!contextInitialized.contain(contextId)) {
-    DEBUG_PRINTF("Context %u not initialized, skipping data collection\n", contextId);
+    DEBUG_PRINTF("Context %u not initialized, skipping data collection\n",
+                 contextId);
     return;
   }
 
   auto *configureData = getConfigureData(contextId);
-  DEBUG_PRINTF("Collecting PC sampling data for correlation %u\n", correlationId);
+  DEBUG_PRINTF("Collecting PC sampling data for context %u\n", contextId);
 
   // Collect data without stopping - continuous mode
-  processPCSamplingData(configureData, correlationId);
+  processPCSamplingData(configureData);
 }
 
 void PCSampling::finalize(CUcontext context) {
@@ -483,14 +483,14 @@ void PCSampling::finalize(CUcontext context) {
 }
 
 void PCSampling::loadModule(const char *cubin, size_t cubinSize) {
-  auto cubinCrc =  getCubinCrc(cubin, cubinSize);
+  auto cubinCrc = getCubinCrc(cubin, cubinSize);
   auto *cubinData = getCubinData(cubinCrc);
 
   if (cubinCrcToCubinData.contain(cubinCrc)) {
     // Increment reference count
     cubinCrcToCubinData[cubinCrc].second++;
-    DEBUG_PRINTF("Module 0x%lx loaded (refcount=%zu)\n",
-                 cubinCrc, cubinCrcToCubinData[cubinCrc].second);
+    DEBUG_PRINTF("Module 0x%lx loaded (refcount=%zu)\n", cubinCrc,
+                 cubinCrcToCubinData[cubinCrc].second);
   } else {
     // New module
     cubinData->cubinCrc = cubinCrc;
diff --git a/src/parcagpu_pc_sampling.h b/src/parcagpu_pc_sampling.h
index 01f9413..0f2144b 100644
--- a/src/parcagpu_pc_sampling.h
+++ b/src/parcagpu_pc_sampling.h
@@ -19,6 +19,17 @@
 #include "Utility/Set.h"
 #include "Utility/Singleton.h"
 
+#define DEBUG_PRINTF(...)                                                      \
+  do {                                                                         \
+    parcagpu::init_debug();                                                    \
+    if (parcagpu::debug_enabled) {                                             \
+      struct timespec ts;                                                      \
+      clock_gettime(CLOCK_REALTIME, &ts);                                      \
+      fprintf(stderr, "[%ld.%09ld] ", ts.tv_sec, ts.tv_nsec);                  \
+      fprintf(stderr, __VA_ARGS__);                                            \
+    }                                                                          \
+  } while (0)
+
 namespace parcagpu {
 
 // Use Proton's CubinData directly
@@ -83,7 +94,7 @@ class PCSampling : public proton::Singleton<PCSampling> {
   static bool isSupported();
 
   void initialize(CUcontext context);
-  void collectData(CUcontext context, uint32_t correlationId);
+  void collectData(CUcontext context);
   void finalize(CUcontext context);
   void loadModule(const char *cubin, size_t cubinSize);
   void unloadModule(const char *cubin, size_t cubinSize);
@@ -91,8 +102,7 @@ class PCSampling : public proton::Singleton<PCSampling> {
 private:
   ConfigureData *getConfigureData(uint32_t contextId);
   CubinData *getCubinData(uint64_t cubinCrc);
-  void processPCSamplingData(ConfigureData *configureData,
-                             uint32_t correlationId);
+  void processPCSamplingData(ConfigureData *configureData);
 
   proton::ThreadSafeMap<uint32_t, ConfigureData> contextIdToConfigureData;
   proton::ThreadSafeMap<size_t, std::pair<CubinData, size_t>>

From 8b50a7768a7dd2595e7455d9fab54a45ae8d108c Mon Sep 17 00:00:00 2001
From: Tommy Reilly <gnurizen@gmail.com>
Date: Tue, 31 Mar 2026 07:00:12 -0500
Subject: [PATCH 09/14] Shorten names

---
 src/{parcagpu_cupti.cpp => cupti.cpp}             | 0
 src/{parcagpu_pc_sampling.cpp => pc_sampling.cpp} | 0
 src/{parcagpu_pc_sampling.h => pc_sampling.h}     | 0
 3 files changed, 0 insertions(+), 0 deletions(-)
 rename src/{parcagpu_cupti.cpp => cupti.cpp} (100%)
 rename src/{parcagpu_pc_sampling.cpp => pc_sampling.cpp} (100%)
 rename src/{parcagpu_pc_sampling.h => pc_sampling.h} (100%)

diff --git a/src/parcagpu_cupti.cpp b/src/cupti.cpp
similarity index 100%
rename from src/parcagpu_cupti.cpp
rename to src/cupti.cpp
diff --git a/src/parcagpu_pc_sampling.cpp b/src/pc_sampling.cpp
similarity index 100%
rename from src/parcagpu_pc_sampling.cpp
rename to src/pc_sampling.cpp
diff --git a/src/parcagpu_pc_sampling.h b/src/pc_sampling.h
similarity index 100%
rename from src/parcagpu_pc_sampling.h
rename to src/pc_sampling.h

From d32344dd5c9a7b6e73a91b74170ea6d0e557d419 Mon Sep 17 00:00:00 2001
From: Tommy Reilly <gnurizen@gmail.com>
Date: Tue, 31 Mar 2026 15:56:44 -0500
Subject: [PATCH 10/14] Checkpoint PC sampling tweaking

---
 .gitignore          |   6 +-
 CMakeLists.txt      |  31 ++++++-
 parcagpu.bt         |  69 ++++++++++++--
 src/cupti.cpp       |  79 +++++++++-------
 src/pc_sampling.cpp | 216 ++++++++++++++++++++++++++++++--------------
 src/pc_sampling.h   |  22 +++--
 src/probes.d        |  16 ++++
 7 files changed, 321 insertions(+), 118 deletions(-)
 create mode 100644 src/probes.d

diff --git a/.gitignore b/.gitignore
index 07d7b12..01d5342 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,9 +1,7 @@
-cupti/build-*
 build/
-.zig-cache/
-zig-out/
 test/build/
 test/bpf/vmlinux.h
 test/bpf/activity_parser
 test/bpf/activityparser_*.go
-test/bpf/activityparser_*.o
\ No newline at end of file
+test/bpf/activityparser_*.o
+src/probes.h
diff --git a/CMakeLists.txt b/CMakeLists.txt
index aaa85ec..10aef9a 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -54,6 +54,25 @@ include_directories(
     ${CMAKE_CURRENT_SOURCE_DIR}/src
 )
 
+# Generate USDT probe header and object from probes.d
+find_program(DTRACE dtrace REQUIRED)
+
+add_custom_command(
+    OUTPUT ${CMAKE_CURRENT_SOURCE_DIR}/src/probes.h
+    COMMAND ${DTRACE} -h -s ${CMAKE_CURRENT_SOURCE_DIR}/src/probes.d
+                         -o ${CMAKE_CURRENT_SOURCE_DIR}/src/probes.h
+    DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/src/probes.d
+    COMMENT "Generating USDT probe header (probes.h)"
+)
+
+add_custom_command(
+    OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/probes.o
+    COMMAND ${DTRACE} -G -s ${CMAKE_CURRENT_SOURCE_DIR}/src/probes.d
+                         -o ${CMAKE_CURRENT_BINARY_DIR}/probes.o
+    DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/src/probes.d
+    COMMENT "Generating USDT probe object (probes.o)"
+)
+
 # Collect proton driver source files we need
 # Note: We only link driver APIs and callback utilities (no profiler dependencies)
 set(PROTON_SOURCES
@@ -64,12 +83,20 @@ set(PROTON_SOURCES
 
 # Build the shared library
 add_library(parcagpucupti SHARED
-    src/parcagpu_cupti.cpp
-    src/parcagpu_pc_sampling.cpp
+    src/cupti.cpp
+    src/pc_sampling.cpp
     src/correlation_filter.cpp
     ${PROTON_SOURCES}
 )
 
+# Wire USDT probe artifacts into the build
+add_custom_target(probes_generated
+    DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/src/probes.h
+            ${CMAKE_CURRENT_BINARY_DIR}/probes.o
+)
+add_dependencies(parcagpucupti probes_generated)
+target_link_libraries(parcagpucupti ${CMAKE_CURRENT_BINARY_DIR}/probes.o)
+
 # Link against system libraries only (NOT libcupti - loaded dynamically by Proton)
 target_link_libraries(parcagpucupti
     dl
diff --git a/parcagpu.bt b/parcagpu.bt
index 757c1eb..8995b32 100755
--- a/parcagpu.bt
+++ b/parcagpu.bt
@@ -1,7 +1,7 @@
 #!/usr/bin/env bpftrace
 
 BEGIN {
-    printf("Monitoring CUDA kernel executions via parcagpu USDT probes...\n");
+    printf("Monitoring CUDA activity via parcagpu USDT probes...\n");
     printf("%-20s %-12s %-10s %-10s %-10s %-10s %-10s %-10s %s\n",
            "TIME", "DURATION_NS", "CORR_ID", "DEVICE", "STREAM", "GRAPH", "NODE", "START", "KERNEL");
 }
@@ -17,16 +17,17 @@ usdt:*:parcagpu:kernel_executed {
     $graph_node_id = arg6;
     $kernel_name = str(arg7);
 
-    printf("%-20s %-12lu %-10u %-10u %-10u %-10u %-10lu %-10lu %s\n",
-           strftime("%H:%M:%S.%f", nsecs),
+    printf("%-12s.%-6u %-12lu %-10u %-10u %-10u %-10u %-10lu %-10lu ",
+           strftime("%H:%M:%S", nsecs),
+           (nsecs % 1000000000) / 1000,
            $duration,
            $correlation_id,
            $device_id,
            $stream_id,
            $graph_id,
            $graph_node_id,
-           $start,
-           $kernel_name);
+           $start);
+    printf("%s\n", $kernel_name);
 }
 
 usdt:*:parcagpu:cuda_correlation {
@@ -34,7 +35,63 @@ usdt:*:parcagpu:cuda_correlation {
     $cbid = arg1;
     $name = str(arg2);
 
-    printf("%-20s [CORR] %u: cbid=%u %s\n", strftime("%H:%M:%S.%f", nsecs),$correlation_id, $cbid, $name);
+    printf("%-12s.%-6u [CORR] %u: cbid=%u %s\n", strftime("%H:%M:%S", nsecs), (nsecs % 1000000000) / 1000, $correlation_id, $cbid, $name);
+}
+
+usdt:*:parcagpu:pc_sample_summary {
+    $function_index = arg0;
+    $pc_offset = arg1;
+    $total_samples = arg2;
+    $stalled_samples = arg3;
+    $source_path = str(arg4);
+    $line_number = arg5;
+    $function_name = str(arg6);
+
+    printf("%-12s.%-6u [PC_SAMPLE] func=%u pc=0x%lx total=%lu stalled=%lu %s:%u %s\n",
+           strftime("%H:%M:%S", nsecs),
+           (nsecs % 1000000000) / 1000,
+           $function_index,
+           $pc_offset,
+           $total_samples,
+           $stalled_samples,
+           $source_path,
+           $line_number,
+           $function_name);
+}
+
+usdt:*:parcagpu:pc_stall_reason {
+    $function_index = arg0;
+    $pc_offset = arg1;
+    $stall_reason_index = arg2;
+    $stall_reason_name = str(arg3);
+    $samples = arg4;
+
+    printf("%-12s.%-6u [STALL]     func=%u pc=0x%lx reason[%u]=%s samples=%lu\n",
+           strftime("%H:%M:%S", nsecs),
+           (nsecs % 1000000000) / 1000,
+           $function_index,
+           $pc_offset,
+           $stall_reason_index,
+           $stall_reason_name,
+           $samples);
+}
+
+usdt:*:parcagpu:cubin_loaded {
+    $cubin_crc = arg0;
+
+    printf("%-12s.%-6u [CUBIN_LOAD] crc=0x%lx\n",
+           strftime("%H:%M:%S", nsecs),
+           (nsecs % 1000000000) / 1000,
+           $cubin_crc);
+}
+
+usdt:*:parcagpu:cubin_unloaded {
+    $cubin_crc = arg0;
+
+    printf("%-12s.%-6u [CUBIN_UNLOAD] crc=0x%lx\n",
+           strftime("%H:%M:%S", nsecs),
+           (nsecs % 1000000000) / 1000,
+           $cubin_crc);
 }
 
 END {
diff --git a/src/cupti.cpp b/src/cupti.cpp
index 486bf88..ed7c513 100644
--- a/src/cupti.cpp
+++ b/src/cupti.cpp
@@ -1,22 +1,21 @@
-#include <array>
 #include <atomic>
 #include <cstdint>
 #include <cstdio>
 #include <cstdlib>
 #include <cstring>
-#include <memory>
-#include <sys/sdt.h>
 #include <time.h>
 #include <unistd.h>
 
+// USDT probes — must come before any header that might include <sys/sdt.h>,
+// so that _SDT_HAS_SEMAPHORES is defined first.
+#include "probes.h"
+
 // Include proton headers
 #include "Driver/GPU/CuptiApi.h"
 #include "Profiler/Cupti/CuptiCallbacks.h"
 #include "Utility/Singleton.h"
 #include "correlation_filter.h"
-#include "parcagpu_pc_sampling.h"
-
-#include <cupti.h>
+#include "pc_sampling.h"
 
 namespace parcagpu {
 
@@ -85,7 +84,7 @@ static constexpr int ACTIVITY_BATCH_SIZE = 128;
 
 __attribute__((noinline)) void parcagpuActivityBatch(const void **ptrs,
                                                      uint32_t count) {
-  STAP_PROBE2(parcagpu, activity_batch, ptrs, count);
+  PARCAGPU_ACTIVITY_BATCH(ptrs, count);
 }
 
 namespace parcagpu {
@@ -165,19 +164,27 @@ class CuptiProfiler : public proton::Singleton<CuptiProfiler> {
 
     DEBUG_PRINTF("[PARCAGPU] Cleanup started\n");
 
-    // Flush any remaining activity records
-    proton::cupti::activityFlushAll<true>(CUPTI_ACTIVITY_FLAG_FLUSH_FORCED);
-
-    // Disable activity recording
-    proton::cupti::activityDisable<true>(CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL);
+    // PC sampling data is drained in finalize() during CONTEXT_DESTROY_STARTING
+    // when the CUDA context is still valid. By the time cleanup() runs, the
+    // context may already be dead, so we don't drain here.
 
-    // Disable all callbacks (using Proton's utilities)
+    // Disable all callbacks
     if (subscriber) {
       proton::setRuntimeCallbacks(subscriber, /*enable=*/false);
       proton::setLaunchCallbacks(subscriber, /*enable=*/false);
       if (pcSamplingEnabled) {
         proton::setResourceCallbacks(subscriber, /*enable=*/false);
       }
+    }
+
+    // Flush any remaining activity records
+    proton::cupti::activityFlushAll<true>(CUPTI_ACTIVITY_FLAG_FLUSH_FORCED);
+
+    // Disable activity recording
+    proton::cupti::activityDisable<true>(CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL);
+
+    // Unsubscribe
+    if (subscriber) {
       proton::cupti::unsubscribe<true>(subscriber);
       subscriber = nullptr;
     }
@@ -190,6 +197,9 @@ class CuptiProfiler : public proton::Singleton<CuptiProfiler> {
   bool pcSamplingEnabled = false;
   CUpti_SubscriberHandle subscriber = nullptr;
 
+  // PC sampling state — owned by this profiler, destroyed with it.
+  parcagpu::PCSampling pcSampling;
+
   // Outstanding event counter for flushing
   size_t outstandingEvents = 0;
 
@@ -205,6 +215,10 @@ class CuptiProfiler : public proton::Singleton<CuptiProfiler> {
 
   static void allocBuffer(uint8_t **buffer, size_t *bufferSize,
                           size_t *maxNumRecords) {
+    if (!PARCAGPU_CUDA_CORRELATION_ENABLED()) {
+      *buffer = nullptr;
+      return;
+    }
     *buffer = static_cast<uint8_t *>(aligned_alloc(AlignSize, BufferSize));
     if (*buffer == nullptr) {
       DEBUG_PRINTF("[PARCAGPU] ERROR: aligned_alloc failed\n");
@@ -228,8 +242,9 @@ class CuptiProfiler : public proton::Singleton<CuptiProfiler> {
     const void *batchPtrs[ACTIVITY_BATCH_SIZE];
     uint32_t batchCount = 0;
 
-    DEBUG_PRINTF("[PARCAGPU] completeBuffer called: buffer=%p validSize=%zu\n",
-                 buffer, validSize);
+    DEBUG_PRINTF(
+        "[PARCAGPU] completeBuffer called: ctx=%p buffer=%p validSize=%zu\n",
+        ctx, buffer, validSize);
 
     // Start a new buffer cycle for graph correlation tracking
     uint32_t cycle = g_bufferCycle.fetch_add(1);
@@ -279,9 +294,9 @@ class CuptiProfiler : public proton::Singleton<CuptiProfiler> {
                      k->end - k->start);
 
         // Emit USDT probe for kernel execution
-        STAP_PROBE8(parcagpu, kernel_executed, k->start, k->end,
-                    k->correlationId, k->deviceId, k->streamId, k->graphId,
-                    k->graphNodeId, k->name);
+        PARCAGPU_KERNEL_EXECUTED(k->start, k->end, k->correlationId,
+                                 k->deviceId, k->streamId, k->graphId,
+                                 k->graphNodeId, k->name);
         break;
       }
       default:
@@ -315,12 +330,6 @@ class CuptiProfiler : public proton::Singleton<CuptiProfiler> {
     // activities so decrementing by recordCount can cause underflow
     CuptiProfiler::instance().outstandingEvents = 0;
 
-    // Drain PC sampling data on the buffer-completion thread (off the
-    // application's launch path).
-    if (CuptiProfiler::instance().pcSamplingEnabled && ctx != nullptr) {
-      parcagpu::PCSampling::instance().collectData(ctx);
-    }
-
     // Free the buffer (Proton's pattern)
     std::free(buffer);
   }
@@ -346,8 +355,7 @@ class CuptiProfiler : public proton::Singleton<CuptiProfiler> {
         if (modData && modData->pCubin && modData->cubinSize > 0) {
           DEBUG_PRINTF("[PARCAGPU] Module loaded: cubin=%p size=%zu\n",
                        modData->pCubin, modData->cubinSize);
-          parcagpu::PCSampling::instance().loadModule(modData->pCubin,
-                                                      modData->cubinSize);
+          profiler.pcSampling.loadModule(modData->pCubin, modData->cubinSize);
         }
         break;
       }
@@ -358,27 +366,30 @@ class CuptiProfiler : public proton::Singleton<CuptiProfiler> {
         if (modData && modData->pCubin && modData->cubinSize > 0) {
           DEBUG_PRINTF("[PARCAGPU] Module unloading: cubin=%p size=%zu\n",
                        modData->pCubin, modData->cubinSize);
-          parcagpu::PCSampling::instance().unloadModule(modData->pCubin,
-                                                        modData->cubinSize);
+          profiler.pcSampling.unloadModule(modData->pCubin, modData->cubinSize);
         }
         break;
       }
       case CUPTI_CBID_RESOURCE_CONTEXT_CREATED: {
         CUcontext ctx = resData->context;
         DEBUG_PRINTF("[PARCAGPU] Context created: %p\n", ctx);
-        parcagpu::PCSampling::instance().initialize(ctx);
+        profiler.pcSampling.initialize(ctx);
         break;
       }
       case CUPTI_CBID_RESOURCE_CONTEXT_DESTROY_STARTING: {
         CUcontext ctx = resData->context;
         DEBUG_PRINTF("[PARCAGPU] Context destroying: %p\n", ctx);
-        parcagpu::PCSampling::instance().finalize(ctx);
+        profiler.pcSampling.finalize(ctx);
         break;
       }
       default:
         break;
       }
     } else {
+      // No profiler attached — skip all correlation/rate-limiter work.
+      if (!PARCAGPU_CUDA_CORRELATION_ENABLED())
+        return;
+
       // Handle both Runtime and Driver API callbacks
       // Track runtime ENTER so we can skip driver EXIT when they match
       const CUpti_CallbackData *cbdata =
@@ -456,7 +467,13 @@ class CuptiProfiler : public proton::Singleton<CuptiProfiler> {
       profiler.outstandingEvents++;
       // Emit USDT probe with signed cbid (negative for driver, positive for
       // runtime)
-      STAP_PROBE3(parcagpu, cuda_correlation, correlationId, signedCbid, name);
+      PARCAGPU_CUDA_CORRELATION(correlationId, signedCbid, name);
+
+      // Drain PC sampling data on kernel launch EXIT — the context from
+      // cbdata is always valid here, unlike in completeBuffer or at shutdown.
+      if (profiler.pcSamplingEnabled) {
+        profiler.pcSampling.collectData(cbdata->context);
+      }
 
       // Insert into correlation filter so we can match kernel activities later
       if (isGraphLaunch) {
diff --git a/src/pc_sampling.cpp b/src/pc_sampling.cpp
index 35ef021..a36bb20 100644
--- a/src/pc_sampling.cpp
+++ b/src/pc_sampling.cpp
@@ -1,6 +1,7 @@
+#include "probes.h"
 #include "Driver/GPU/CudaApi.h"
 #include "Driver/GPU/CuptiApi.h"
-#include "parcagpu_pc_sampling.h"
+#include "pc_sampling.h"
 #include <cstdio>
 #include <cstdlib>
 #include <cstring>
@@ -43,6 +44,24 @@ void enablePCSampling(CUcontext context) {
   proton::cupti::pcSamplingEnable<true>(&params);
 }
 
+void startPCSampling(CUcontext context) {
+  CUpti_PCSamplingStartParams params = {
+      /*size=*/CUpti_PCSamplingStartParamsSize,
+      /*pPriv=*/NULL,
+      /*ctx=*/context,
+  };
+  proton::cupti::pcSamplingStart<true>(&params);
+}
+
+void stopPCSampling(CUcontext context) {
+  CUpti_PCSamplingStopParams params = {
+      /*size=*/CUpti_PCSamplingStopParamsSize,
+      /*pPriv=*/NULL,
+      /*ctx=*/context,
+  };
+  proton::cupti::pcSamplingStop<true>(&params);
+}
+
 void disablePCSampling(CUcontext context) {
   CUpti_PCSamplingDisableParams params = {
       /*size=*/CUpti_PCSamplingDisableParamsSize,
@@ -52,7 +71,8 @@ void disablePCSampling(CUcontext context) {
   proton::cupti::pcSamplingDisable<true>(&params);
 }
 
-void getPCSamplingData(CUcontext context,
+// Returns true if data was retrieved successfully, false on error.
+bool getPCSamplingData(CUcontext context,
                        CUpti_PCSamplingData *pcSamplingData) {
   CUpti_PCSamplingGetDataParams params = {
       /*size=*/CUpti_PCSamplingGetDataParamsSize,
@@ -60,7 +80,13 @@ void getPCSamplingData(CUcontext context,
       /*ctx=*/context,
       /*pcSamplingData=*/pcSamplingData,
   };
-  proton::cupti::pcSamplingGetData<true>(&params);
+  auto result = proton::cupti::pcSamplingGetData<false>(&params);
+  if (result != CUPTI_SUCCESS) {
+    DEBUG_PRINTF("cuptiPCSamplingGetData failed: error %d (ctx=%p)\n",
+                 result, context);
+    return false;
+  }
+  return true;
 }
 
 void setConfigurationAttribute(
@@ -299,14 +325,19 @@ void ConfigureData::initialize(CUcontext context) {
                contextId);
 
   configurationInfos.emplace_back(configureStallReasons());
-  configurationInfos.emplace_back(configureSamplingPeriod());
-  configurationInfos.emplace_back(configureHardwareBufferSize());
-  configurationInfos.emplace_back(configureScratchBuffer());
-  configurationInfos.emplace_back(configureSamplingBuffer());
   configurationInfos.emplace_back(configureCollectionMode());
+  configurationInfos.emplace_back(configureSamplingBuffer());
+  // Don't set sampling period — let CUPTI use its default.
+  // Explicit period values silently break sampling on some GPUs (e.g. Blackwell).
 
   setConfigurationAttribute(context, configurationInfos);
 
+  // Allocate a separate output buffer for getPCSamplingData calls.
+  // The configured pcSamplingData buffer is owned by CUPTI internally;
+  // we must pass a different buffer to getPCSamplingData.
+  this->outputData =
+      allocPCSamplingData(DataBufferPCCount, numValidStallReasons);
+
   DEBUG_PRINTF("PC sampling configured with %u stall reasons (%u valid)\n",
                numStallReasons, numValidStallReasons);
 }
@@ -314,10 +345,17 @@ void ConfigureData::initialize(CUcontext context) {
 // GPUPCSampling implementation
 
 bool PCSampling::isSupported() {
-  // Check PARCAGPU_SAMPLING_FACTOR - 0 means disabled
-  int sampling_factor = getGPUSamplingFrequency();
-  if (sampling_factor == 0) {
-    DEBUG_PRINTF("PC sampling not enabled via PARCAGPU_SAMPLING_FACTOR\n");
+  // PARCAGPU_SAMPLING_FACTOR must be set to enable PC sampling.
+  // Any non-zero value enables it; 0 disables.
+  // If unset, PC sampling is disabled.
+  const char *env = getenv("PARCAGPU_SAMPLING_FACTOR");
+  if (!env) {
+    DEBUG_PRINTF("PC sampling not enabled (PARCAGPU_SAMPLING_FACTOR not set)\n");
+    return false;
+  }
+  int factor = atoi(env);
+  if (factor == 0) {
+    DEBUG_PRINTF("PC sampling disabled via PARCAGPU_SAMPLING_FACTOR=0\n");
     return false;
   }
 
@@ -362,28 +400,47 @@ void PCSampling::initialize(CUcontext context) {
         getConfigureData(contextId)->initialize(context);
 
         contextInitialized.insert(contextId);
+        initializedContextIds.push_back(contextId);
         DEBUG_PRINTF("PC sampling started in continuous mode for context %u\n",
                      contextId);
       });
 }
 
 void PCSampling::processPCSamplingData(ConfigureData *configureData) {
-  auto *pcSamplingData = &configureData->pcSamplingData;
+  auto *pcSamplingData = &configureData->outputData;
 
-  // Collect data in rounds
-  bool firstRound = true;
-  while (pcSamplingData->totalNumPcs > 0 ||
-         pcSamplingData->remainingNumPcs > 0 || firstRound) {
+  if (pcSamplingData->totalNumPcs == 0) {
+    return;
+  }
 
-    DEBUG_PRINTF("Processing %zu PCs (remaining: %zu)\n",
-                 pcSamplingData->totalNumPcs, pcSamplingData->remainingNumPcs);
+  DEBUG_PRINTF("Processing %zu PCs (remaining: %zu)\n",
+               pcSamplingData->totalNumPcs, pcSamplingData->remainingNumPcs);
 
-    // Process each PC sample
-    for (size_t i = 0; i < pcSamplingData->totalNumPcs; ++i) {
-      auto *pcData = pcSamplingData->pPcData + i;
-      auto *cubinData = getCubinData(pcData->cubinCrc);
+  // Process each PC sample
+  for (size_t i = 0; i < pcSamplingData->totalNumPcs; ++i) {
+    auto *pcData = pcSamplingData->pPcData + i;
+
+    // Calculate total and stalled samples
+    uint64_t totalSamples = 0;
+    uint64_t stalledSamples = 0;
+
+    for (size_t j = 0; j < pcData->stallReasonCount; ++j) {
+      auto *stallReason = &pcData->stallReason[j];
+      totalSamples += stallReason->samples;
 
-      // Get or cache source location
+      // Check if this is a "not_issued" stall (not really stalled)
+      bool isNotIssued = configureData->notIssuedStallReasonIndices.count(
+                             stallReason->pcSamplingStallReasonIndex) > 0;
+
+      if (!isNotIssued) {
+        stalledSamples += stallReason->samples;
+      }
+    }
+
+    // Source correlation only for debug logging — backend resolves
+    // file/line from the cubin using pcOffset.
+    if (debug_enabled) {
+      auto *cubinData = getCubinData(pcData->cubinCrc);
       auto key =
           CubinData::LineInfoKey{pcData->functionIndex, pcData->pcOffset};
       if (cubinData->lineInfo.find(key) == cubinData->lineInfo.end()) {
@@ -395,39 +452,25 @@ void PCSampling::processPCSamplingData(ConfigureData *configureData) {
                                         dirName, fileName);
       }
       auto &lineInfo = cubinData->lineInfo[key];
-
-      // Calculate total and stalled samples
-      uint64_t totalSamples = 0;
-      uint64_t stalledSamples = 0;
-
-      for (size_t j = 0; j < pcData->stallReasonCount; ++j) {
-        auto *stallReason = &pcData->stallReason[j];
-        totalSamples += stallReason->samples;
-
-        // Check if this is a "not_issued" stall (not really stalled)
-        bool isNotIssued = configureData->notIssuedStallReasonIndices.count(
-                               stallReason->pcSamplingStallReasonIndex) > 0;
-
-        if (!isNotIssued) {
-          stalledSamples += stallReason->samples;
-        }
-      }
-
-      // Emit summary probe
       std::string fullPath = lineInfo.fileName.size()
                                  ? lineInfo.dirName + "/" + lineInfo.fileName
                                  : "";
-      STAP_PROBE7(parcagpu, pc_sample_summary, pcData->functionIndex,
-                  pcData->pcOffset, totalSamples, stalledSamples,
-                  fullPath.c_str(), lineInfo.lineNumber,
-                  lineInfo.functionName.c_str());
+      DEBUG_PRINTF("  [%zu] func=%s pc=0x%lx total=%lu stalled=%lu %s:%u\n",
+                   i, lineInfo.functionName.c_str(), pcData->pcOffset,
+                   totalSamples, stalledSamples, fullPath.c_str(),
+                   lineInfo.lineNumber);
+    }
 
-      // Emit detailed stall reason probes
-      for (size_t j = 0; j < pcData->stallReasonCount; ++j) {
-        auto *stallReason = &pcData->stallReason[j];
-        auto stallReasonIndex = stallReason->pcSamplingStallReasonIndex;
+    PARCAGPU_PC_SAMPLE_SUMMARY(pcData->functionIndex, pcData->pcOffset,
+                               totalSamples, stalledSamples,
+                               pcData->functionName);
 
-        // Get stall reason name
+    // Emit detailed stall reason probes
+    for (size_t j = 0; j < pcData->stallReasonCount; ++j) {
+      auto *stallReason = &pcData->stallReason[j];
+      auto stallReasonIndex = stallReason->pcSamplingStallReasonIndex;
+
+      if (debug_enabled) {
         const char *stallReasonName = "";
         for (size_t k = 0; k < configureData->numStallReasons; k++) {
           if (configureData->stallReasonIndices[k] == stallReasonIndex) {
@@ -435,19 +478,14 @@ void PCSampling::processPCSamplingData(ConfigureData *configureData) {
             break;
           }
         }
-
-        STAP_PROBE5(parcagpu, pc_stall_reason, pcData->functionIndex,
-                    pcData->pcOffset, stallReasonIndex, stallReasonName,
-                    stallReason->samples);
+        if (stallReason->samples > 0) {
+          DEBUG_PRINTF("    stall: %s = %u\n", stallReasonName,
+                       stallReason->samples);
+        }
       }
-    }
 
-    // Get next round of data if available
-    if (pcSamplingData->remainingNumPcs > 0 || firstRound) {
-      getPCSamplingData(configureData->context, pcSamplingData);
-      firstRound = false;
-    } else {
-      break;
+      PARCAGPU_PC_STALL_REASON(pcData->functionIndex, pcData->pcOffset,
+                               stallReasonIndex, stallReason->samples);
     }
   }
 }
@@ -463,12 +501,37 @@ void PCSampling::collectData(CUcontext context) {
   }
 
   auto *configureData = getConfigureData(contextId);
-  DEBUG_PRINTF("Collecting PC sampling data for context %u\n", contextId);
-
-  // Collect data without stopping - continuous mode
+  DEBUG_PRINTF("Collecting PC sampling data for context %u (cfg total=%zu remaining=%zu)\n",
+               contextId, configureData->pcSamplingData.totalNumPcs,
+               configureData->pcSamplingData.remainingNumPcs);
+
+  // Use the separate output buffer for getData — the configured
+  // pcSamplingData buffer is owned by CUPTI.
+  bool ok = getPCSamplingData(context, &configureData->outputData);
+  DEBUG_PRINTF("getData: ok=%d output total=%zu remaining=%zu "
+               "cfg total=%zu remaining=%zu\n",
+               ok, configureData->outputData.totalNumPcs,
+               configureData->outputData.remainingNumPcs,
+               configureData->pcSamplingData.totalNumPcs,
+               configureData->pcSamplingData.remainingNumPcs);
   processPCSamplingData(configureData);
 }
 
+void PCSampling::collectAllData() {
+  std::lock_guard<std::mutex> lock(contextMutex);
+  for (auto contextId : initializedContextIds) {
+    auto result = contextIdToConfigureData.find(contextId);
+    if (!result) {
+      DEBUG_PRINTF("Context %u in initializedContextIds but not in map, "
+                   "skipping\n", contextId);
+      continue;
+    }
+    auto *configureData = &result->get();
+    DEBUG_PRINTF("Draining PC sampling data for context %u\n", contextId);
+    processPCSamplingData(configureData);
+  }
+}
+
 void PCSampling::finalize(CUcontext context) {
   uint32_t contextId = 0;
   proton::cupti::getContextId<true>(context, &contextId);
@@ -476,8 +539,27 @@ void PCSampling::finalize(CUcontext context) {
   if (!contextInitialized.contain(contextId))
     return;
 
+  // Hold contextMutex for the entire finalize to prevent collectAllData
+  // from racing with us (it iterates initializedContextIds under this lock).
+  std::lock_guard<std::mutex> lock(contextMutex);
+
   DEBUG_PRINTF("Finalizing PC sampling for context %p\n", context);
 
+  // Remove from iteration list first so collectAllData won't touch this context
+  initializedContextIds.erase(
+      std::remove(initializedContextIds.begin(),
+                  initializedContextIds.end(), contextId),
+      initializedContextIds.end());
+
+  // Drain remaining data before disabling
+  auto *configureData = getConfigureData(contextId);
+  processPCSamplingData(configureData);
+
+  // After disable, CUPTI may fill remaining records — drain once more
+  if (configureData->pcSamplingData.totalNumPcs > 0) {
+    processPCSamplingData(configureData);
+  }
+
   contextIdToConfigureData.erase(contextId);
   contextInitialized.erase(contextId);
 }
@@ -498,7 +580,7 @@ void PCSampling::loadModule(const char *cubin, size_t cubinSize) {
     cubinData->cubin = cubin;
     cubinCrcToCubinData[cubinCrc].second = 1;
     DEBUG_PRINTF("Module 0x%lx loaded (new)\n", cubinCrc);
-    STAP_PROBE3(parcagpu, cubin_loaded, cubinCrc, 0, 0);
+    PARCAGPU_CUBIN_LOADED(cubinCrc, 0, 0);
   }
 }
 
@@ -515,7 +597,7 @@ void PCSampling::unloadModule(const char *cubin, size_t cubinSize) {
   } else {
     cubinCrcToCubinData.erase(cubinCrc);
     DEBUG_PRINTF("Module 0x%lx unloaded (removed)\n", cubinCrc);
-    STAP_PROBE2(parcagpu, cubin_unloaded, cubinCrc, 0);
+    PARCAGPU_CUBIN_UNLOADED(cubinCrc, 0);
   }
 }
 
diff --git a/src/pc_sampling.h b/src/pc_sampling.h
index 0f2144b..26acb45 100644
--- a/src/pc_sampling.h
+++ b/src/pc_sampling.h
@@ -1,5 +1,5 @@
-#ifndef PARCAGPU_PC_SAMPLING_H_
-#define PARCAGPU_PC_SAMPLING_H_
+#ifndef PC_SAMPLING_H_
+#define PC_SAMPLING_H_
 
 #include <atomic>
 #include <map>
@@ -7,7 +7,6 @@
 #include <mutex>
 #include <set>
 #include <string>
-#include <sys/sdt.h>
 #include <vector>
 
 #include <cuda.h>
@@ -17,7 +16,6 @@
 #include "Profiler/Cupti/CuptiPCSampling.h"
 #include "Utility/Map.h"
 #include "Utility/Set.h"
-#include "Utility/Singleton.h"
 
 #define DEBUG_PRINTF(...)                                                      \
   do {                                                                         \
@@ -80,21 +78,25 @@ struct ConfigureData {
   uint32_t *stallReasonIndices{};
   std::map<size_t, size_t> stallReasonIndexToMetricIndex{};
   std::set<size_t> notIssuedStallReasonIndices{};
-  CUpti_PCSamplingData pcSamplingData{};
+  CUpti_PCSamplingData pcSamplingData{};       // registered with CUPTI config
+  CUpti_PCSamplingData outputData{};           // separate buffer for getData calls
   std::vector<CUpti_PCSamplingConfigurationInfo> configurationInfos;
 };
 
-// PC Sampling singleton class (adapted from Proton's CuptiPCSampling)
-class PCSampling : public proton::Singleton<PCSampling> {
+// PC Sampling class (adapted from Proton's CuptiPCSampling)
+// Owned by CuptiProfiler — not a standalone singleton, so lifetime
+// is tied to the profiler and there are no static destruction order issues.
+class PCSampling {
 public:
   PCSampling() = default;
-  virtual ~PCSampling() = default;
+  ~PCSampling() = default;
 
   // Check if PC sampling is supported (CUPTI >= 12.8.1)
   static bool isSupported();
 
   void initialize(CUcontext context);
   void collectData(CUcontext context);
+  void collectAllData();
   void finalize(CUcontext context);
   void loadModule(const char *cubin, size_t cubinSize);
   void unloadModule(const char *cubin, size_t cubinSize);
@@ -109,6 +111,10 @@ class PCSampling : public proton::Singleton<PCSampling> {
       cubinCrcToCubinData;
   proton::ThreadSafeSet<uint32_t> contextInitialized;
 
+  // Plain vector of initialized context IDs for iteration in collectAllData.
+  // Protected by contextMutex.
+  std::vector<uint32_t> initializedContextIds;
+
   std::mutex contextMutex{};
 };
 
diff --git a/src/probes.d b/src/probes.d
new file mode 100644
index 0000000..bb5ad79
--- /dev/null
+++ b/src/probes.d
@@ -0,0 +1,16 @@
+provider parcagpu {
+  probe cuda_correlation(uint32_t correlationId, int signedCbid,
+                          const char *name);
+  probe kernel_executed(uint64_t start, uint64_t end,
+                        uint32_t correlationId, uint32_t deviceId,
+                        uint32_t streamId, uint32_t graphId,
+                        uint64_t graphNodeId, const char *name);
+  probe activity_batch(const void **ptrs, uint32_t count);
+  probe pc_sample_summary(uint32_t functionIndex, uint64_t pcOffset,
+                           uint64_t totalSamples, uint64_t stalledSamples,
+                           const char *functionName);
+  probe pc_stall_reason(uint32_t functionIndex, uint64_t pcOffset,
+                         uint32_t stallReasonIndex, uint32_t samples);
+  probe cubin_loaded(uint64_t cubinCrc, int reserved1, int reserved2);
+  probe cubin_unloaded(uint64_t cubinCrc, int reserved);
+};

From f87ed73b9007bb4a6942f1d6849916ba74f33766 Mon Sep 17 00:00:00 2001
From: Tommy Reilly <gnurizen@gmail.com>
Date: Tue, 31 Mar 2026 17:09:49 -0500
Subject: [PATCH 11/14] Stall reason map handling, prepping for batched pc
 samples

---
 Makefile                       | 33 +++++++++++-
 bpft_test.sh                   | 22 ++++++++
 parcagpu.bt                    | 25 +++++----
 src/cupti.cpp                  | 59 ++++++---------------
 src/pc_sampling.cpp            | 15 +++++-
 src/pc_sampling.h              |  9 ++++
 src/probes.d                   |  1 +
 src/stall_reason_map.h         | 56 ++++++++++++++++++++
 src/token_bucket.h             | 59 +++++++++++++++++++++
 test/bpf/activity_parser.bpf.c | 51 ++++++++++++++++++
 test/bpf/main.go               | 95 +++++++++++++++++++++++-----------
 11 files changed, 341 insertions(+), 84 deletions(-)
 create mode 100755 bpft_test.sh
 create mode 100644 src/stall_reason_map.h
 create mode 100644 src/token_bucket.h

diff --git a/Makefile b/Makefile
index 7b46cd5..7d5bcba 100644
--- a/Makefile
+++ b/Makefile
@@ -1,4 +1,4 @@
-.PHONY: all clean test build-amd64 build-arm64 build-all cross docker-push docker-test-build docker-test-run format local debug bpf-test test-multi
+.PHONY: all clean test build-amd64 build-arm64 build-all cross docker-push docker-test-build docker-test-run format local debug bpf-test test-multi test-pc-real
 
 LIB_NAME = libparcagpucupti.so
 
@@ -135,6 +135,37 @@ test-multi: local bpf-test
 	sudo kill $${BPF_PID} 2>/dev/null; wait $${BPF_PID} 2>/dev/null; \
 	echo "=== test-multi completed (test exit: $${TEST_EXIT}) ==="
 
+# Run pc_sample_toy with BPF activity parser and verify stall reason map is received.
+# Requires: real GPU, root (sudo) for BPF, pc_sample_toy compiled separately.
+test-pc-real: local bpf-test
+	@echo "=== Running PC sampling smoke test ==="
+	@LIB_PATH="$$(pwd)/build-local/lib/libparcagpucupti.so"; \
+	TOY="$$(pwd)/microbenchmarks/pc_sample_toy"; \
+	if [ ! -x "$$TOY" ]; then \
+		echo "error: $$TOY not found — compile with: /usr/local/cuda/bin/nvcc -o microbenchmarks/pc_sample_toy microbenchmarks/pc_sample_toy.cu" >&2; \
+		exit 1; \
+	fi; \
+	PARCAGPU_SAMPLING_FACTOR=18 CUDA_INJECTION64_PATH="$$LIB_PATH" "$$TOY" 3 & \
+	TOY_PID=$$!; \
+	echo "pc_sample_toy PID: $${TOY_PID}"; \
+	while kill -0 $${TOY_PID} 2>/dev/null && ! grep -q libparcagpucupti "/proc/$${TOY_PID}/maps" 2>/dev/null; do \
+		sleep 0.1; \
+	done; \
+	echo "Starting BPF activity parser (requires root)..."; \
+	sudo test/bpf/activity_parser -pid $${TOY_PID} -lib "$$LIB_PATH" -v 2>&1 | tee /tmp/parcagpu-pc-test.log & \
+	BPF_PID=$$!; \
+	wait $${TOY_PID} 2>/dev/null; \
+	sleep 1; \
+	sudo kill $${BPF_PID} 2>/dev/null; wait $${BPF_PID} 2>/dev/null; \
+	echo; \
+	if grep -q 'stall reason map:' /tmp/parcagpu-pc-test.log && \
+	   grep -q 'smsp__pcsamp' /tmp/parcagpu-pc-test.log; then \
+		echo "=== PASS: stall reason map received ==="; \
+	else \
+		echo "=== FAIL: stall reason map not found in output ===" >&2; \
+		exit 1; \
+	fi
+
 format:
 	@echo "=== Formatting source files ==="
 	@clang-format -i -style=file src/*.cpp src/*.h test/*.c
diff --git a/bpft_test.sh b/bpft_test.sh
new file mode 100755
index 0000000..2c2a285
--- /dev/null
+++ b/bpft_test.sh
@@ -0,0 +1,22 @@
+#!/bin/bash
+set -eu
+
+SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
+LIB="$SCRIPT_DIR/build-local/lib/libparcagpucupti.so"
+
+if [ ! -f "$LIB" ]; then
+    echo "error: $LIB not found" >&2
+    exit 1
+fi
+
+export PARCAGPU_DEBUG=1
+PARCAGPU_SAMPLING_FACTOR=18 CUDA_INJECTION64_PATH="$LIB" "$SCRIPT_DIR/microbenchmarks/pc_sample_toy" 4 &
+TOY_PID=$!
+trap "kill $TOY_PID 2>/dev/null; wait $TOY_PID 2>/dev/null" EXIT
+
+# Wait for the injection library to be loaded (CUDA must initialize first)
+while kill -0 "$TOY_PID" 2>/dev/null && ! grep -q libparcagpucupti "/proc/$TOY_PID/maps" 2>/dev/null; do
+    sleep 0.1
+done
+
+exec bpftrace -p "$TOY_PID" "$SCRIPT_DIR/parcagpu.bt"
diff --git a/parcagpu.bt b/parcagpu.bt
index 8995b32..9898a2a 100755
--- a/parcagpu.bt
+++ b/parcagpu.bt
@@ -43,19 +43,15 @@ usdt:*:parcagpu:pc_sample_summary {
     $pc_offset = arg1;
     $total_samples = arg2;
     $stalled_samples = arg3;
-    $source_path = str(arg4);
-    $line_number = arg5;
-    $function_name = str(arg6);
+    $function_name = str(arg4);
 
-    printf("%-12s.%-6u [PC_SAMPLE] func=%u pc=0x%lx total=%lu stalled=%lu %s:%u %s\n",
+    printf("%-12s.%-6u [PC_SAMPLE] func=%u pc=0x%lx total=%lu stalled=%lu %s\n",
            strftime("%H:%M:%S", nsecs),
            (nsecs % 1000000000) / 1000,
            $function_index,
            $pc_offset,
            $total_samples,
            $stalled_samples,
-           $source_path,
-           $line_number,
            $function_name);
 }
 
@@ -63,19 +59,28 @@ usdt:*:parcagpu:pc_stall_reason {
     $function_index = arg0;
     $pc_offset = arg1;
     $stall_reason_index = arg2;
-    $stall_reason_name = str(arg3);
-    $samples = arg4;
+    $samples = arg3;
 
-    printf("%-12s.%-6u [STALL]     func=%u pc=0x%lx reason[%u]=%s samples=%lu\n",
+    printf("%-12s.%-6u [STALL]     func=%u pc=0x%lx reason[%u] samples=%lu\n",
            strftime("%H:%M:%S", nsecs),
            (nsecs % 1000000000) / 1000,
            $function_index,
            $pc_offset,
            $stall_reason_index,
-           $stall_reason_name,
            $samples);
 }
 
+usdt:*:parcagpu:stall_reason_map {
+    $names = arg0;
+    $count = arg1;
+
+    printf("%-12s.%-6u [STALL_MAP] count=%u base=%p\n",
+           strftime("%H:%M:%S", nsecs),
+           (nsecs % 1000000000) / 1000,
+           $count,
+           $names);
+}
+
 usdt:*:parcagpu:cubin_loaded {
     $cubin_crc = arg0;
 
diff --git a/src/cupti.cpp b/src/cupti.cpp
index ed7c513..a40448c 100644
--- a/src/cupti.cpp
+++ b/src/cupti.cpp
@@ -16,6 +16,7 @@
 #include "Utility/Singleton.h"
 #include "correlation_filter.h"
 #include "pc_sampling.h"
+#include "token_bucket.h"
 
 namespace parcagpu {
 
@@ -33,32 +34,9 @@ static std::atomic<uint32_t> g_bufferCycle{0};
 // runtime calls)
 thread_local uint32_t runtimeEnterCorrelationId = 0;
 
-// Token bucket rate limiter (configurable via PARCAGPU_RATE_LIMIT)
-double rateLimitPerSec = 100.0;
-
-// Thread-local token bucket state
-thread_local uint64_t lastRefillNs = 0;
-thread_local double tokens = 0;
-
-// Returns true if the sample should be emitted, false if rate limited
-bool rateLimiterTryAcquire(uint64_t nowNs) {
-  if (lastRefillNs > 0) {
-    double elapsedSec = (nowNs - lastRefillNs) / 1e9;
-    tokens = tokens + elapsedSec * rateLimitPerSec;
-    if (tokens > rateLimitPerSec) {
-      tokens = rateLimitPerSec;
-    }
-  } else {
-    tokens = rateLimitPerSec; // Start with full bucket
-  }
-  lastRefillNs = nowNs;
-
-  if (tokens >= 1.0) {
-    tokens -= 1.0;
-    return true;
-  }
-  return false;
-}
+// Thread-local rate limiter for callback probes (default 100/sec,
+// configurable via PARCAGPU_RATE_LIMIT).
+thread_local TokenBucket callbackLimiter(100.0);
 
 void init_debug() {
   static bool initialized = false;
@@ -69,7 +47,7 @@ void init_debug() {
     if (rateEnv != nullptr) {
       double rate = atof(rateEnv);
       if (rate > 0) {
-        rateLimitPerSec = rate;
+        callbackLimiter.setRate(rate);
       }
     }
     initialized = true;
@@ -386,12 +364,7 @@ class CuptiProfiler : public proton::Singleton<CuptiProfiler> {
         break;
       }
     } else {
-      // No profiler attached — skip all correlation/rate-limiter work.
-      if (!PARCAGPU_CUDA_CORRELATION_ENABLED())
-        return;
-
       // Handle both Runtime and Driver API callbacks
-      // Track runtime ENTER so we can skip driver EXIT when they match
       const CUpti_CallbackData *cbdata =
           static_cast<const CUpti_CallbackData *>(cbdata_void);
       uint32_t correlationId = cbdata->correlationId;
@@ -407,6 +380,17 @@ class CuptiProfiler : public proton::Singleton<CuptiProfiler> {
         return;
       }
 
+      // Drain PC sampling data on kernel launch EXIT — the context from
+      // cbdata is always valid here, unlike in completeBuffer or at shutdown.
+      // Runs regardless of which USDT probes are attached.
+      if (profiler.pcSamplingEnabled) {
+        profiler.pcSampling.collectData(cbdata->context);
+      }
+
+      // Skip correlation/rate-limiter work when no profiler is attached.
+      if (!PARCAGPU_CUDA_CORRELATION_ENABLED())
+        return;
+
       const char *name =
           cbdata->symbolName ? cbdata->symbolName : cbdata->functionName;
       int signedCbid;
@@ -453,10 +437,7 @@ class CuptiProfiler : public proton::Singleton<CuptiProfiler> {
 
       // Rate limit probes using token bucket (skip for graph launches)
       if (!limiter_disabled && !isGraphLaunch) {
-        struct timespec ts;
-        clock_gettime(CLOCK_MONOTONIC, &ts);
-        uint64_t nowNs = (uint64_t)ts.tv_sec * 1000000000ULL + ts.tv_nsec;
-        if (!rateLimiterTryAcquire(nowNs)) {
+        if (!callbackLimiter.tryAcquire()) {
           DEBUG_PRINTF(
               "[PARCAGPU] Rate limited: skipping probe for correlationId=%u\n",
               correlationId);
@@ -469,12 +450,6 @@ class CuptiProfiler : public proton::Singleton<CuptiProfiler> {
       // runtime)
       PARCAGPU_CUDA_CORRELATION(correlationId, signedCbid, name);
 
-      // Drain PC sampling data on kernel launch EXIT — the context from
-      // cbdata is always valid here, unlike in completeBuffer or at shutdown.
-      if (profiler.pcSamplingEnabled) {
-        profiler.pcSampling.collectData(cbdata->context);
-      }
-
       // Insert into correlation filter so we can match kernel activities later
       if (isGraphLaunch) {
         g_graphCorrelationMap.insert(correlationId);
diff --git a/src/pc_sampling.cpp b/src/pc_sampling.cpp
index a36bb20..64b9753 100644
--- a/src/pc_sampling.cpp
+++ b/src/pc_sampling.cpp
@@ -397,7 +397,13 @@ void PCSampling::initialize(CUcontext context) {
       [&]() { return !contextInitialized.contain(contextId); }, contextMutex,
       [&]() {
         enablePCSampling(context);
-        getConfigureData(contextId)->initialize(context);
+        auto *configData = getConfigureData(contextId);
+        configData->initialize(context);
+
+        // Build contiguous stall reason map for USDT probe emission.
+        stallReasonMap.build(configData->numStallReasons,
+                            configData->stallReasonIndices,
+                            configData->stallReasonNames);
 
         contextInitialized.insert(contextId);
         initializedContextIds.push_back(contextId);
@@ -505,6 +511,13 @@ void PCSampling::collectData(CUcontext context) {
                contextId, configureData->pcSamplingData.totalNumPcs,
                configureData->pcSamplingData.remainingNumPcs);
 
+  // Re-emit stall reason map periodically so the profiler backend can
+  // join stall reason indices to human-readable names.
+  if (stallReasonMap.data() && stallReasonMapLimiter.tryAcquire()) {
+    PARCAGPU_STALL_REASON_MAP(stallReasonMap.data(),
+                              stallReasonMap.numEntries());
+  }
+
   // Use the separate output buffer for getData — the configured
   // pcSamplingData buffer is owned by CUPTI.
   bool ok = getPCSamplingData(context, &configureData->outputData);
diff --git a/src/pc_sampling.h b/src/pc_sampling.h
index 26acb45..910a9f7 100644
--- a/src/pc_sampling.h
+++ b/src/pc_sampling.h
@@ -9,6 +9,9 @@
 #include <string>
 #include <vector>
 
+#include "stall_reason_map.h"
+#include "token_bucket.h"
+
 #include <cuda.h>
 #include <cupti.h>
 
@@ -116,6 +119,12 @@ class PCSampling {
   std::vector<uint32_t> initializedContextIds;
 
   std::mutex contextMutex{};
+
+  // Contiguous stall reason map for USDT probe emission.
+  StallReasonMap stallReasonMap;
+
+  // Rate limiter for stall reason map re-emission (1 token per 10 seconds).
+  TokenBucket stallReasonMapLimiter{0.1};
 };
 
 } // namespace parcagpu
diff --git a/src/probes.d b/src/probes.d
index bb5ad79..914b73e 100644
--- a/src/probes.d
+++ b/src/probes.d
@@ -11,6 +11,7 @@ provider parcagpu {
                            const char *functionName);
   probe pc_stall_reason(uint32_t functionIndex, uint64_t pcOffset,
                          uint32_t stallReasonIndex, uint32_t samples);
+  probe stall_reason_map(const char *names, uint32_t count);
   probe cubin_loaded(uint64_t cubinCrc, int reserved1, int reserved2);
   probe cubin_unloaded(uint64_t cubinCrc, int reserved);
 };
diff --git a/src/stall_reason_map.h b/src/stall_reason_map.h
new file mode 100644
index 0000000..ca64ed9
--- /dev/null
+++ b/src/stall_reason_map.h
@@ -0,0 +1,56 @@
+#ifndef PARCAGPU_STALL_REASON_MAP_H_
+#define PARCAGPU_STALL_REASON_MAP_H_
+
+#include <cstdint>
+#include <cstdlib>
+#include <cstring>
+
+namespace parcagpu {
+
+// Contiguous, BPF-friendly stall reason name table.
+// Array of fixed-width 64-byte name slots indexed directly by stall reason
+// index. BPF reads names[stallReasonIndex * 64] — no pointer chasing.
+static constexpr uint32_t STALL_REASON_NAME_LEN = 64;
+
+class StallReasonMap {
+public:
+  StallReasonMap() = default;
+  ~StallReasonMap() { std::free(buf); }
+
+  StallReasonMap(const StallReasonMap &) = delete;
+  StallReasonMap &operator=(const StallReasonMap &) = delete;
+
+  // Build from parallel arrays (as returned by CUPTI).
+  // Indices must be dense 0..N-1.
+  void build(uint32_t numReasons, const uint32_t *indices, char **names) {
+    // Find max index to size the array.
+    uint32_t maxIdx = 0;
+    for (uint32_t i = 0; i < numReasons; i++) {
+      if (indices[i] > maxIdx)
+        maxIdx = indices[i];
+    }
+    count = maxIdx + 1;
+
+    std::free(buf);
+    bufSize = count * STALL_REASON_NAME_LEN;
+    buf = static_cast<char *>(std::calloc(1, bufSize));
+
+    for (uint32_t i = 0; i < numReasons; i++) {
+      char *slot = buf + indices[i] * STALL_REASON_NAME_LEN;
+      strncpy(slot, names[i], STALL_REASON_NAME_LEN - 1);
+    }
+  }
+
+  const char *data() const { return buf; }
+  uint32_t size() const { return bufSize; }
+  uint32_t numEntries() const { return count; }
+
+private:
+  char *buf = nullptr;
+  uint32_t bufSize = 0;
+  uint32_t count = 0;
+};
+
+} // namespace parcagpu
+
+#endif // PARCAGPU_STALL_REASON_MAP_H_
diff --git a/src/token_bucket.h b/src/token_bucket.h
new file mode 100644
index 0000000..88d6d7c
--- /dev/null
+++ b/src/token_bucket.h
@@ -0,0 +1,59 @@
+#ifndef PARCAGPU_TOKEN_BUCKET_H_
+#define PARCAGPU_TOKEN_BUCKET_H_
+
+#include <cstdint>
+#include <time.h>
+
+namespace parcagpu {
+
+// Simple token bucket rate limiter. Not thread-safe — use one instance per
+// thread, or a thread_local instance when shared across call sites.
+class TokenBucket {
+public:
+  // startFull=true: first tryAcquire succeeds immediately.
+  // startFull=false: must wait for refill before first success.
+  explicit TokenBucket(double tokensPerSec, bool startFull = true)
+      : rate(tokensPerSec), tokens(startFull ? 1.0 : 0.0) {}
+
+  void setRate(double tokensPerSec) {
+    rate = tokensPerSec;
+    if (tokens > rate)
+      tokens = rate;
+  }
+
+  // Returns true if a token was available and consumed.
+  bool tryAcquire() {
+    refill();
+    if (tokens >= 1.0) {
+      tokens -= 1.0;
+      return true;
+    }
+    return false;
+  }
+
+private:
+  void refill() {
+    uint64_t now = nowNs();
+    if (lastRefillNs > 0) {
+      double elapsed = (now - lastRefillNs) / 1e9;
+      tokens += elapsed * rate;
+      if (tokens > rate)
+        tokens = rate;
+    }
+    lastRefillNs = now;
+  }
+
+  static uint64_t nowNs() {
+    struct timespec ts;
+    clock_gettime(CLOCK_MONOTONIC, &ts);
+    return (uint64_t)ts.tv_sec * 1000000000ULL + ts.tv_nsec;
+  }
+
+  double rate;
+  double tokens;
+  uint64_t lastRefillNs = 0;
+};
+
+} // namespace parcagpu
+
+#endif // PARCAGPU_TOKEN_BUCKET_H_
diff --git a/test/bpf/activity_parser.bpf.c b/test/bpf/activity_parser.bpf.c
index 33aa754..6eddf79 100644
--- a/test/bpf/activity_parser.bpf.c
+++ b/test/bpf/activity_parser.bpf.c
@@ -26,6 +26,8 @@
 
 #define MAX_BATCH_SIZE 128
 #define MAX_KERNEL_NAME 128
+#define STALL_REASON_NAME_LEN 64
+#define MAX_STALL_REASONS 64
 
 // USDT spec map — populated by Go loader before uprobe attachment.
 // Keyed by spec ID (uint32); value is struct bpf_usdt_spec.
@@ -55,6 +57,23 @@ struct bpf_map_def events SEC("maps") = {
     .max_entries = 1 << 20, // 1 MB
 };
 
+// Stall reason name table — indexed by stall reason index.
+// Value is a 64-byte null-terminated string.
+struct bpf_map_def stall_reasons SEC("maps") = {
+    .type = BPF_MAP_TYPE_ARRAY,
+    .key_size = sizeof(u32),
+    .value_size = STALL_REASON_NAME_LEN,
+    .max_entries = MAX_STALL_REASONS,
+};
+
+// Whether the stall reason map has been populated.
+struct bpf_map_def stall_map_loaded SEC("maps") = {
+    .type = BPF_MAP_TYPE_ARRAY,
+    .key_size = sizeof(u32),
+    .value_size = sizeof(u32),
+    .max_entries = 1,
+};
+
 // Stats counters.
 struct bpf_map_def stats SEC("maps") = {
     .type = BPF_MAP_TYPE_ARRAY,
@@ -148,4 +167,36 @@ int BPF_USDT(handle_activity_batch, u64 ptrs_base, u32 num_activities) {
   return 0;
 }
 
+SEC("usdt/parcagpu/stall_reason_map")
+int BPF_USDT(handle_stall_reason_map, u64 names_base, u32 count) {
+  // Only load the map once.
+  u32 zero = 0;
+  u32 *loaded = bpf_map_lookup_elem(&stall_map_loaded, &zero);
+  if (!loaded)
+    return 0;
+  if (*loaded)
+    return 0;
+
+  if (count > MAX_STALL_REASONS)
+    count = MAX_STALL_REASONS;
+
+  // Read each 64-byte name slot and store in the BPF map.
+  for (u32 i = 0; i < MAX_STALL_REASONS; i++) {
+    if (i >= count)
+      break;
+
+    char name[STALL_REASON_NAME_LEN] = {};
+    int ret = bpf_probe_read_user(name, sizeof(name),
+                                  (void *)(names_base + (u64)i * STALL_REASON_NAME_LEN));
+    if (ret != 0)
+      continue;
+
+    bpf_map_update_elem(&stall_reasons, &i, name, BPF_ANY);
+  }
+
+  u32 one = 1;
+  bpf_map_update_elem(&stall_map_loaded, &zero, &one, BPF_ANY);
+  return 0;
+}
+
 char LICENSE[] SEC("license") = "GPL";
diff --git a/test/bpf/main.go b/test/bpf/main.go
index 85c6ff9..f4dde41 100644
--- a/test/bpf/main.go
+++ b/test/bpf/main.go
@@ -107,52 +107,63 @@ func main() {
 		log.Fatalf("Parsing USDT probes: %v", err)
 	}
 
-	// Find parcagpu:activity_batch probe(s) and attach uprobe at each site.
+	// Find USDT probes and attach uprobes at each site.
 	ex, err := link.OpenExecutable(realLib)
 	if err != nil {
 		log.Fatalf("Opening executable %s: %v", realLib, err)
 	}
 
+	type probeTarget struct {
+		name    string
+		handler *ebpf.Program
+	}
+	targets := []probeTarget{
+		{"activity_batch", objs.HandleActivityBatch},
+		{"stall_reason_map", objs.HandleStallReasonMap},
+	}
+
 	var links []link.Link
 	var specID uint32
-	for _, probe := range probes {
-		if probe.Provider != "parcagpu" || probe.Name != "activity_batch" {
-			continue
-		}
+	for _, t := range targets {
+		for _, probe := range probes {
+			if probe.Provider != "parcagpu" || probe.Name != t.name {
+				continue
+			}
 
-		// Parse the stapsdt argument spec into a bpf_usdt_spec.
-		spec, err := pfelf.ParseUSDTArguments(probe.Arguments)
-		if err != nil {
-			log.Fatalf("Parsing USDT args %q: %v", probe.Arguments, err)
-		}
+			// Parse the stapsdt argument spec into a bpf_usdt_spec.
+			spec, err := pfelf.ParseUSDTArguments(probe.Arguments)
+			if err != nil {
+				log.Fatalf("Parsing USDT args %q: %v", probe.Arguments, err)
+			}
 
-		// Store spec in the BPF map so bpf_usdt_arg() can look it up.
-		specBytes := pfelf.USDTSpecToBytes(spec)
-		if err := objs.BpfUsdtSpecs.Put(specID, specBytes); err != nil {
-			log.Fatalf("Populating USDT spec map: %v", err)
-		}
+			// Store spec in the BPF map so bpf_usdt_arg() can look it up.
+			specBytes := pfelf.USDTSpecToBytes(spec)
+			if err := objs.BpfUsdtSpecs.Put(specID, specBytes); err != nil {
+				log.Fatalf("Populating USDT spec map: %v", err)
+			}
 
-		// Cookie: spec_id in high 32 bits (bpf_usdt_arg reads it via bpf_get_attach_cookie).
-		cookie := uint64(specID) << 32
+			// Cookie: spec_id in high 32 bits (bpf_usdt_arg reads it via bpf_get_attach_cookie).
+			cookie := uint64(specID) << 32
 
-		log.Printf("USDT probe parcagpu:activity_batch at offset 0x%x, args=%q, spec_id=%d",
-			probe.Location, probe.Arguments, specID)
+			log.Printf("USDT probe parcagpu:%s at offset 0x%x, args=%q, spec_id=%d",
+				t.name, probe.Location, probe.Arguments, specID)
 
-		up, err := ex.Uprobe("activity_batch", objs.HandleActivityBatch, &link.UprobeOptions{
-			Address:      probe.Location,
-			PID:          *pid,
-			Cookie:       cookie,
-			RefCtrOffset: probe.SemaphoreOffset,
-		})
-		if err != nil {
-			log.Fatalf("Attaching uprobe at offset 0x%x: %v", probe.Location, err)
+			up, err := ex.Uprobe(t.name, t.handler, &link.UprobeOptions{
+				Address:      probe.Location,
+				PID:          *pid,
+				Cookie:       cookie,
+				RefCtrOffset: probe.SemaphoreOffset,
+			})
+			if err != nil {
+				log.Fatalf("Attaching uprobe for %s at offset 0x%x: %v", t.name, probe.Location, err)
+			}
+			links = append(links, up)
+			specID++
 		}
-		links = append(links, up)
-		specID++
 	}
 
 	if len(links) == 0 {
-		log.Fatalf("No parcagpu:activity_batch USDT probes found in %s", realLib)
+		log.Fatalf("No parcagpu USDT probes found in %s", realLib)
 	}
 	defer func() {
 		for _, l := range links {
@@ -233,6 +244,7 @@ func main() {
 	fmt.Println()
 	log.Printf("Final stats:")
 	printStats(&objs, eventCount)
+	printStallReasonMap(&objs)
 }
 
 func printStats(objs *activityParserObjects, eventCount uint64) {
@@ -251,6 +263,29 @@ func printStats(objs *activityParserObjects, eventCount uint64) {
 		batches, activities, kernels, eventCount, drops)
 }
 
+func printStallReasonMap(objs *activityParserObjects) {
+	// Check if the stall reason map was populated by BPF.
+	var loaded uint32
+	loadedKey := uint32(0)
+	if err := objs.StallMapLoaded.Lookup(&loadedKey, &loaded); err != nil || loaded == 0 {
+		log.Printf("  stall reason map: not received")
+		return
+	}
+
+	log.Printf("  stall reason map:")
+	for i := uint32(0); i < 64; i++ {
+		var name [64]byte
+		if err := objs.StallReasons.Lookup(&i, &name); err != nil {
+			continue
+		}
+		s := cString(name[:])
+		if s == "" {
+			continue
+		}
+		log.Printf("    [%2d] %s", i, s)
+	}
+}
+
 func raiseMemlock() error {
 	return unix.Setrlimit(unix.RLIMIT_MEMLOCK, &unix.Rlimit{
 		Cur: unix.RLIM_INFINITY,

From 6496aa7d3f5bbca0d281f800b52d23eb6b0f5b4d Mon Sep 17 00:00:00 2001
From: Tommy Reilly <gnurizen@gmail.com>
Date: Thu, 2 Apr 2026 17:47:27 -0500
Subject: [PATCH 12/14] Flush out cubin processing, sass lookup and pc sampling
 probe batching

---
 .gitignore                         |   2 +
 Makefile                           |  43 +--
 microbenchmarks/pc_sample_toy.cu   | 110 +++++++
 microbenchmarks/rapid_launch.cu    |  44 +++
 parcagpu.bt                        |  39 +--
 src/pc_sampling.cpp                | 165 ++++++----
 src/pc_sampling.h                  |  14 +
 src/probes.d                       |  10 +-
 test/bpf/activity_parser.bpf.c     | 182 ++++++++++-
 test/bpf/go.mod                    |  13 +-
 test/bpf/go.sum                    |  53 ++--
 test/bpf/main.go                   | 477 +++++++++++++++++++++++++++--
 test/test-pc-real.sh               | 114 +++++++
 test/test_cupti_prof.c             |   7 +
 vendor/opentelemetry-ebpf-profiler |   2 +-
 15 files changed, 1069 insertions(+), 206 deletions(-)
 create mode 100644 microbenchmarks/pc_sample_toy.cu
 create mode 100644 microbenchmarks/rapid_launch.cu
 create mode 100755 test/test-pc-real.sh

diff --git a/.gitignore b/.gitignore
index 01d5342..1fb4839 100644
--- a/.gitignore
+++ b/.gitignore
@@ -5,3 +5,5 @@ test/bpf/activity_parser
 test/bpf/activityparser_*.go
 test/bpf/activityparser_*.o
 src/probes.h
+microbenchmarks/rapid_launch
+microbenchmarks/pc_sample_toy
diff --git a/Makefile b/Makefile
index 7d5bcba..cf30250 100644
--- a/Makefile
+++ b/Makefile
@@ -1,4 +1,4 @@
-.PHONY: all clean test build-amd64 build-arm64 build-all cross docker-push docker-test-build docker-test-run format local debug bpf-test test-multi test-pc-real
+.PHONY: all clean test build-amd64 build-arm64 build-all cross docker-push docker-test-build docker-test-run format local debug bpf-test microbenchmarks test-multi test-pc-real
 
 LIB_NAME = libparcagpucupti.so
 
@@ -104,6 +104,17 @@ docker-test-run: docker-test-build
 	@echo "=== Running tests in container ==="
 	@docker run --rm parcagpu-test:latest $(ARGS)
 
+# Build microbenchmark CUDA toys (with DWARF debug info for cubin symbolization)
+NVCC ?= nvcc
+CUDA_ARCH ?= native
+MICROBENCH_SRCS := $(wildcard microbenchmarks/*.cu)
+MICROBENCH_BINS := $(MICROBENCH_SRCS:.cu=)
+
+microbenchmarks: $(MICROBENCH_BINS)
+
+microbenchmarks/%: microbenchmarks/%.cu
+	$(NVCC) -g -lineinfo -arch=$(CUDA_ARCH) -o $@ $<
+
 # Build the BPF activity parser test program
 # Requires: clang, libbpf-dev, bpftool (for vmlinux.h), Go 1.21+
 bpf-test:
@@ -137,34 +148,8 @@ test-multi: local bpf-test
 
 # Run pc_sample_toy with BPF activity parser and verify stall reason map is received.
 # Requires: real GPU, root (sudo) for BPF, pc_sample_toy compiled separately.
-test-pc-real: local bpf-test
-	@echo "=== Running PC sampling smoke test ==="
-	@LIB_PATH="$$(pwd)/build-local/lib/libparcagpucupti.so"; \
-	TOY="$$(pwd)/microbenchmarks/pc_sample_toy"; \
-	if [ ! -x "$$TOY" ]; then \
-		echo "error: $$TOY not found — compile with: /usr/local/cuda/bin/nvcc -o microbenchmarks/pc_sample_toy microbenchmarks/pc_sample_toy.cu" >&2; \
-		exit 1; \
-	fi; \
-	PARCAGPU_SAMPLING_FACTOR=18 CUDA_INJECTION64_PATH="$$LIB_PATH" "$$TOY" 3 & \
-	TOY_PID=$$!; \
-	echo "pc_sample_toy PID: $${TOY_PID}"; \
-	while kill -0 $${TOY_PID} 2>/dev/null && ! grep -q libparcagpucupti "/proc/$${TOY_PID}/maps" 2>/dev/null; do \
-		sleep 0.1; \
-	done; \
-	echo "Starting BPF activity parser (requires root)..."; \
-	sudo test/bpf/activity_parser -pid $${TOY_PID} -lib "$$LIB_PATH" -v 2>&1 | tee /tmp/parcagpu-pc-test.log & \
-	BPF_PID=$$!; \
-	wait $${TOY_PID} 2>/dev/null; \
-	sleep 1; \
-	sudo kill $${BPF_PID} 2>/dev/null; wait $${BPF_PID} 2>/dev/null; \
-	echo; \
-	if grep -q 'stall reason map:' /tmp/parcagpu-pc-test.log && \
-	   grep -q 'smsp__pcsamp' /tmp/parcagpu-pc-test.log; then \
-		echo "=== PASS: stall reason map received ==="; \
-	else \
-		echo "=== FAIL: stall reason map not found in output ===" >&2; \
-		exit 1; \
-	fi
+test-pc-real: local bpf-test microbenchmarks
+	sudo -E test/test-pc-real.sh
 
 format:
 	@echo "=== Formatting source files ==="
diff --git a/microbenchmarks/pc_sample_toy.cu b/microbenchmarks/pc_sample_toy.cu
new file mode 100644
index 0000000..beef66b
--- /dev/null
+++ b/microbenchmarks/pc_sample_toy.cu
@@ -0,0 +1,110 @@
+// pc_sample_toy.cu — a simple GPU busy-loop for testing PC sampling
+// Compile: make microbenchmarks  (or: nvcc -g -lineinfo -arch=native -o pc_sample_toy pc_sample_toy.cu)
+// Run:     ./pc_sample_toy
+
+#include <cuda_runtime.h>
+#include <stdio.h>
+#include <unistd.h>
+
+#define CHECK(call)                                                            \
+  do {                                                                         \
+    cudaError_t err = (call);                                                  \
+    if (err != cudaSuccess) {                                                  \
+      fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__, __LINE__,         \
+              cudaGetErrorString(err));                                        \
+      exit(1);                                                                 \
+    }                                                                          \
+  } while (0)
+
+// Kernel A: heavy FP math (sin/cos chain)
+__global__ void trig_storm(float *out, int n, unsigned long long iters) {
+  int idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (idx >= n)
+    return;
+
+  float x = (float)idx * 0.001f;
+  for (unsigned long long i = 0; i < iters; i++) {
+    x = sinf(x) * cosf(x) + 0.1f;
+  }
+  out[idx] = x;
+}
+
+// Kernel B: integer bit-twiddling
+__global__ void hash_churn(unsigned int *out, int n, unsigned long long iters) {
+  int idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (idx >= n)
+    return;
+
+  unsigned int h = idx ^ 0xdeadbeef;
+  for (unsigned long long i = 0; i < iters; i++) {
+    h ^= h << 13;
+    h ^= h >> 17;
+    h ^= h << 5;
+    h += (unsigned int)i;
+  }
+  out[idx] = h;
+}
+
+// Kernel C: shared-memory bouncing
+__global__ void shmem_bounce(float *out, int n, unsigned long long iters) {
+  __shared__ float tile[256];
+  int idx = blockIdx.x * blockDim.x + threadIdx.x;
+  int tid = threadIdx.x;
+
+  tile[tid] = (float)idx;
+  __syncthreads();
+
+  for (unsigned long long i = 0; i < iters; i++) {
+    tile[tid] += tile[(tid + 1) % blockDim.x] * 0.01f;
+    __syncthreads();
+  }
+
+  if (idx < n)
+    out[idx] = tile[tid];
+}
+
+void go() {
+  const int N = 1 << 18; // 256K elements
+  const int threads = 256;
+  const int blocks = (N + threads - 1) / threads;
+
+  float *d_float;
+  unsigned int *d_uint;
+
+  CHECK(cudaMalloc(&d_float, N * sizeof(float)));
+  CHECK(cudaMalloc(&d_uint, N * sizeof(unsigned int)));
+
+  printf("Launching GPU kernels — attach your profiler now.\n");
+  printf("PID: %d\n\n", getpid());
+
+  sleep(1);
+  // Each kernel runs for roughly 0.5–1 second depending on GPU.
+  // Tune the iteration count up/down as needed.
+
+  printf("  [1/3] trig_storm ...\n");
+  trig_storm<<<blocks, threads>>>(d_float, N, 500000ULL);
+  CHECK(cudaDeviceSynchronize());
+
+  printf("  [2/3] hash_churn ...\n");
+  hash_churn<<<blocks, threads>>>(d_uint, N, 2000000ULL);
+  CHECK(cudaDeviceSynchronize());
+
+  printf("  [3/3] shmem_bounce ...\n");
+  shmem_bounce<<<blocks, threads>>>(d_float, N, 50000ULL);
+  CHECK(cudaDeviceSynchronize());
+
+  printf("\nDone.\n");
+
+  CHECK(cudaFree(d_float));
+  CHECK(cudaFree(d_uint));
+}
+
+int main(int argc, char **argv) {
+  int loops = 1;
+  if (argc > 1) {
+    loops = atoi(argv[1]);
+  }
+  while (loops-- > 0) {
+    go();
+  }
+}
diff --git a/microbenchmarks/rapid_launch.cu b/microbenchmarks/rapid_launch.cu
new file mode 100644
index 0000000..aa02679
--- /dev/null
+++ b/microbenchmarks/rapid_launch.cu
@@ -0,0 +1,44 @@
+// rapid_launch.cu — measures per-kernel-launch overhead from CUPTI injection.
+// Launches many tiny kernels to stress the callback path.
+//
+// Compile: nvcc -o rapid_launch rapid_launch.cu
+// Run:     ./rapid_launch [num_launches]
+//
+// Compare:
+//   ./rapid_launch 50000                                          # baseline
+//   CUDA_INJECTION64_PATH=.../libparcagpucupti.so ./rapid_launch 50000  # injected
+
+#include <cuda_runtime.h>
+#include <stdio.h>
+#include <time.h>
+
+__global__ void empty_kernel() {}
+
+static double now_sec() {
+  struct timespec ts;
+  clock_gettime(CLOCK_MONOTONIC, &ts);
+  return ts.tv_sec + ts.tv_nsec * 1e-9;
+}
+
+int main(int argc, char **argv) {
+  int n = 50000;
+  if (argc > 1)
+    n = atoi(argv[1]);
+
+  // Warm up the CUDA context and any injection library init.
+  empty_kernel<<<1, 1>>>();
+  cudaDeviceSynchronize();
+
+  // Synchronous launches — each one round-trips through CUPTI callbacks.
+  double t0 = now_sec();
+  for (int i = 0; i < n; i++) {
+    empty_kernel<<<1, 1>>>();
+    cudaDeviceSynchronize();
+  }
+  double t1 = now_sec();
+
+  double elapsed = t1 - t0;
+  printf("%d launches in %.3f s  (%.1f us/launch)\n", n, elapsed,
+         elapsed / n * 1e6);
+  return 0;
+}
diff --git a/parcagpu.bt b/parcagpu.bt
index 9898a2a..4132c47 100755
--- a/parcagpu.bt
+++ b/parcagpu.bt
@@ -38,36 +38,15 @@ usdt:*:parcagpu:cuda_correlation {
     printf("%-12s.%-6u [CORR] %u: cbid=%u %s\n", strftime("%H:%M:%S", nsecs), (nsecs % 1000000000) / 1000, $correlation_id, $cbid, $name);
 }
 
-usdt:*:parcagpu:pc_sample_summary {
-    $function_index = arg0;
-    $pc_offset = arg1;
-    $total_samples = arg2;
-    $stalled_samples = arg3;
-    $function_name = str(arg4);
-
-    printf("%-12s.%-6u [PC_SAMPLE] func=%u pc=0x%lx total=%lu stalled=%lu %s\n",
-           strftime("%H:%M:%S", nsecs),
-           (nsecs % 1000000000) / 1000,
-           $function_index,
-           $pc_offset,
-           $total_samples,
-           $stalled_samples,
-           $function_name);
-}
-
-usdt:*:parcagpu:pc_stall_reason {
-    $function_index = arg0;
-    $pc_offset = arg1;
-    $stall_reason_index = arg2;
-    $samples = arg3;
+usdt:*:parcagpu:pc_sample_batch {
+    $records = arg0;
+    $count = arg1;
 
-    printf("%-12s.%-6u [STALL]     func=%u pc=0x%lx reason[%u] samples=%lu\n",
+    printf("%-12s.%-6u [PC_BATCH] count=%u records=%p\n",
            strftime("%H:%M:%S", nsecs),
            (nsecs % 1000000000) / 1000,
-           $function_index,
-           $pc_offset,
-           $stall_reason_index,
-           $samples);
+           $count,
+           $records);
 }
 
 usdt:*:parcagpu:stall_reason_map {
@@ -83,11 +62,13 @@ usdt:*:parcagpu:stall_reason_map {
 
 usdt:*:parcagpu:cubin_loaded {
     $cubin_crc = arg0;
+    $cubin_size = arg2;
 
-    printf("%-12s.%-6u [CUBIN_LOAD] crc=0x%lx\n",
+    printf("%-12s.%-6u [CUBIN_LOAD] crc=0x%lx size=%lu\n",
            strftime("%H:%M:%S", nsecs),
            (nsecs % 1000000000) / 1000,
-           $cubin_crc);
+           $cubin_crc,
+           $cubin_size);
 }
 
 usdt:*:parcagpu:cubin_unloaded {
diff --git a/src/pc_sampling.cpp b/src/pc_sampling.cpp
index 64b9753..2e3fa3a 100644
--- a/src/pc_sampling.cpp
+++ b/src/pc_sampling.cpp
@@ -1,7 +1,7 @@
-#include "probes.h"
 #include "Driver/GPU/CudaApi.h"
 #include "Driver/GPU/CuptiApi.h"
 #include "pc_sampling.h"
+#include "probes.h"
 #include <cstdio>
 #include <cstdlib>
 #include <cstring>
@@ -19,6 +19,26 @@ namespace parcagpu {
 extern bool debug_enabled;
 extern void init_debug();
 
+// noinline wrappers so each USDT probe has exactly one call site in the
+// binary.  Multiple call sites produce multiple .note.stapsdt entries with
+// different argument encodings, which complicates BPF attachment.
+__attribute__((noinline)) void fireCubinLoaded(uint64_t crc, const char *cubin,
+                                               uint64_t size) {
+  PARCAGPU_CUBIN_LOADED(crc, cubin, size);
+}
+
+__attribute__((noinline)) void fireCubinUnloaded(uint64_t crc) {
+  PARCAGPU_CUBIN_UNLOADED(crc);
+}
+
+// Max records per pc_sample_batch probe invocation.
+static constexpr uint32_t PCSampleBatchSize = 128;
+
+__attribute__((noinline)) void firePCSampleBatch(
+    const void **ptrs, uint32_t count) {
+  PARCAGPU_PC_SAMPLE_BATCH(ptrs, count);
+}
+
 namespace {
 
 // CUPTI helper functions (adapted from Proton's CuptiPCSamplingUtils.h)
@@ -82,8 +102,8 @@ bool getPCSamplingData(CUcontext context,
   };
   auto result = proton::cupti::pcSamplingGetData<false>(&params);
   if (result != CUPTI_SUCCESS) {
-    DEBUG_PRINTF("cuptiPCSamplingGetData failed: error %d (ctx=%p)\n",
-                 result, context);
+    DEBUG_PRINTF("cuptiPCSamplingGetData failed: error %d (ctx=%p)\n", result,
+                 context);
     return false;
   }
   return true;
@@ -328,7 +348,8 @@ void ConfigureData::initialize(CUcontext context) {
   configurationInfos.emplace_back(configureCollectionMode());
   configurationInfos.emplace_back(configureSamplingBuffer());
   // Don't set sampling period — let CUPTI use its default.
-  // Explicit period values silently break sampling on some GPUs (e.g. Blackwell).
+  // Explicit period values silently break sampling on some GPUs (e.g.
+  // Blackwell).
 
   setConfigurationAttribute(context, configurationInfos);
 
@@ -350,7 +371,8 @@ bool PCSampling::isSupported() {
   // If unset, PC sampling is disabled.
   const char *env = getenv("PARCAGPU_SAMPLING_FACTOR");
   if (!env) {
-    DEBUG_PRINTF("PC sampling not enabled (PARCAGPU_SAMPLING_FACTOR not set)\n");
+    DEBUG_PRINTF(
+        "PC sampling not enabled (PARCAGPU_SAMPLING_FACTOR not set)\n");
     return false;
   }
   int factor = atoi(env);
@@ -402,8 +424,8 @@ void PCSampling::initialize(CUcontext context) {
 
         // Build contiguous stall reason map for USDT probe emission.
         stallReasonMap.build(configData->numStallReasons,
-                            configData->stallReasonIndices,
-                            configData->stallReasonNames);
+                             configData->stallReasonIndices,
+                             configData->stallReasonNames);
 
         contextInitialized.insert(contextId);
         initializedContextIds.push_back(contextId);
@@ -422,30 +444,21 @@ void PCSampling::processPCSamplingData(ConfigureData *configureData) {
   DEBUG_PRINTF("Processing %zu PCs (remaining: %zu)\n",
                pcSamplingData->totalNumPcs, pcSamplingData->remainingNumPcs);
 
-  // Process each PC sample
-  for (size_t i = 0; i < pcSamplingData->totalNumPcs; ++i) {
-    auto *pcData = pcSamplingData->pPcData + i;
-
-    // Calculate total and stalled samples
-    uint64_t totalSamples = 0;
-    uint64_t stalledSamples = 0;
-
-    for (size_t j = 0; j < pcData->stallReasonCount; ++j) {
-      auto *stallReason = &pcData->stallReason[j];
-      totalSamples += stallReason->samples;
-
-      // Check if this is a "not_issued" stall (not really stalled)
-      bool isNotIssued = configureData->notIssuedStallReasonIndices.count(
-                             stallReason->pcSamplingStallReasonIndex) > 0;
-
-      if (!isNotIssued) {
-        stalledSamples += stallReason->samples;
+  if (debug_enabled) {
+    for (size_t i = 0; i < pcSamplingData->totalNumPcs; ++i) {
+      auto *pcData = pcSamplingData->pPcData + i;
+
+      uint64_t totalSamples = 0;
+      uint64_t stalledSamples = 0;
+      for (size_t j = 0; j < pcData->stallReasonCount; ++j) {
+        auto *stallReason = &pcData->stallReason[j];
+        totalSamples += stallReason->samples;
+        bool isNotIssued = configureData->notIssuedStallReasonIndices.count(
+                               stallReason->pcSamplingStallReasonIndex) > 0;
+        if (!isNotIssued)
+          stalledSamples += stallReason->samples;
       }
-    }
 
-    // Source correlation only for debug logging — backend resolves
-    // file/line from the cubin using pcOffset.
-    if (debug_enabled) {
       auto *cubinData = getCubinData(pcData->cubinCrc);
       auto key =
           CubinData::LineInfoKey{pcData->functionIndex, pcData->pcOffset};
@@ -461,39 +474,29 @@ void PCSampling::processPCSamplingData(ConfigureData *configureData) {
       std::string fullPath = lineInfo.fileName.size()
                                  ? lineInfo.dirName + "/" + lineInfo.fileName
                                  : "";
-      DEBUG_PRINTF("  [%zu] func=%s pc=0x%lx total=%lu stalled=%lu %s:%u\n",
-                   i, lineInfo.functionName.c_str(), pcData->pcOffset,
+      DEBUG_PRINTF("  [%zu] func=%s pc=0x%lx total=%lu stalled=%lu %s:%u\n", i,
+                   lineInfo.functionName.c_str(), pcData->pcOffset,
                    totalSamples, stalledSamples, fullPath.c_str(),
                    lineInfo.lineNumber);
     }
+  }
 
-    PARCAGPU_PC_SAMPLE_SUMMARY(pcData->functionIndex, pcData->pcOffset,
-                               totalSamples, stalledSamples,
-                               pcData->functionName);
-
-    // Emit detailed stall reason probes
-    for (size_t j = 0; j < pcData->stallReasonCount; ++j) {
-      auto *stallReason = &pcData->stallReason[j];
-      auto stallReasonIndex = stallReason->pcSamplingStallReasonIndex;
-
-      if (debug_enabled) {
-        const char *stallReasonName = "";
-        for (size_t k = 0; k < configureData->numStallReasons; k++) {
-          if (configureData->stallReasonIndices[k] == stallReasonIndex) {
-            stallReasonName = configureData->stallReasonNames[k];
-            break;
-          }
-        }
-        if (stallReason->samples > 0) {
-          DEBUG_PRINTF("    stall: %s = %u\n", stallReasonName,
-                       stallReason->samples);
-        }
-      }
+  // Emit batched PC sample probes as a bag of pointers (like activity_batch).
+  // Using pointers avoids depending on the CUPTI struct stride, which can
+  // change across CUDA versions.
+  const void *batchPtrs[PCSampleBatchSize];
+  uint32_t batchCount = 0;
 
-      PARCAGPU_PC_STALL_REASON(pcData->functionIndex, pcData->pcOffset,
-                               stallReasonIndex, stallReason->samples);
+  for (size_t i = 0; i < pcSamplingData->totalNumPcs; ++i) {
+    batchPtrs[batchCount++] = &pcSamplingData->pPcData[i];
+    if (batchCount == PCSampleBatchSize) {
+      firePCSampleBatch(batchPtrs, batchCount);
+      batchCount = 0;
     }
   }
+  if (batchCount > 0) {
+    firePCSampleBatch(batchPtrs, batchCount);
+  }
 }
 
 void PCSampling::collectData(CUcontext context) {
@@ -507,7 +510,8 @@ void PCSampling::collectData(CUcontext context) {
   }
 
   auto *configureData = getConfigureData(contextId);
-  DEBUG_PRINTF("Collecting PC sampling data for context %u (cfg total=%zu remaining=%zu)\n",
+  DEBUG_PRINTF("Collecting PC sampling data for context %u (cfg total=%zu "
+               "remaining=%zu)\n",
                contextId, configureData->pcSamplingData.totalNumPcs,
                configureData->pcSamplingData.remainingNumPcs);
 
@@ -518,6 +522,22 @@ void PCSampling::collectData(CUcontext context) {
                               stallReasonMap.numEntries());
   }
 
+  // Replay cubin_loaded probes for late-attaching tracers.
+  // When the semaphore transitions to non-zero, re-emit all known cubins.
+  if (PARCAGPU_CUBIN_LOADED_ENABLED()) {
+    if (!cubinsEmitted) {
+      DEBUG_PRINTF("Emitting cubins");
+      std::lock_guard<std::mutex> lock(contextMutex);
+      for (const auto &ref : loadedCubins) {
+        fireCubinLoaded(ref.crc, ref.data, ref.size);
+      }
+      cubinsEmitted = true;
+    }
+  } else {
+    // Tracer detached — reset so we replay again on next attach.
+    cubinsEmitted = false;
+  }
+
   // Use the separate output buffer for getData — the configured
   // pcSamplingData buffer is owned by CUPTI.
   bool ok = getPCSamplingData(context, &configureData->outputData);
@@ -536,7 +556,8 @@ void PCSampling::collectAllData() {
     auto result = contextIdToConfigureData.find(contextId);
     if (!result) {
       DEBUG_PRINTF("Context %u in initializedContextIds but not in map, "
-                   "skipping\n", contextId);
+                   "skipping\n",
+                   contextId);
       continue;
     }
     auto *configureData = &result->get();
@@ -559,10 +580,10 @@ void PCSampling::finalize(CUcontext context) {
   DEBUG_PRINTF("Finalizing PC sampling for context %p\n", context);
 
   // Remove from iteration list first so collectAllData won't touch this context
-  initializedContextIds.erase(
-      std::remove(initializedContextIds.begin(),
-                  initializedContextIds.end(), contextId),
-      initializedContextIds.end());
+  initializedContextIds.erase(std::remove(initializedContextIds.begin(),
+                                          initializedContextIds.end(),
+                                          contextId),
+                              initializedContextIds.end());
 
   // Drain remaining data before disabling
   auto *configureData = getConfigureData(contextId);
@@ -579,7 +600,6 @@ void PCSampling::finalize(CUcontext context) {
 
 void PCSampling::loadModule(const char *cubin, size_t cubinSize) {
   auto cubinCrc = getCubinCrc(cubin, cubinSize);
-  auto *cubinData = getCubinData(cubinCrc);
 
   if (cubinCrcToCubinData.contain(cubinCrc)) {
     // Increment reference count
@@ -587,13 +607,19 @@ void PCSampling::loadModule(const char *cubin, size_t cubinSize) {
     DEBUG_PRINTF("Module 0x%lx loaded (refcount=%zu)\n", cubinCrc,
                  cubinCrcToCubinData[cubinCrc].second);
   } else {
-    // New module
+    // New module — getCubinData after the contain() check so operator[]
+    // doesn't auto-insert before we test.
+    auto *cubinData = getCubinData(cubinCrc);
     cubinData->cubinCrc = cubinCrc;
     cubinData->cubinSize = cubinSize;
     cubinData->cubin = cubin;
     cubinCrcToCubinData[cubinCrc].second = 1;
     DEBUG_PRINTF("Module 0x%lx loaded (new)\n", cubinCrc);
-    PARCAGPU_CUBIN_LOADED(cubinCrc, 0, 0);
+    fireCubinLoaded(cubinCrc, cubin, cubinSize);
+    {
+      std::lock_guard<std::mutex> lock(contextMutex);
+      loadedCubins.push_back({cubinCrc, cubin, cubinSize});
+    }
   }
 }
 
@@ -610,7 +636,16 @@ void PCSampling::unloadModule(const char *cubin, size_t cubinSize) {
   } else {
     cubinCrcToCubinData.erase(cubinCrc);
     DEBUG_PRINTF("Module 0x%lx unloaded (removed)\n", cubinCrc);
-    PARCAGPU_CUBIN_UNLOADED(cubinCrc, 0);
+    fireCubinUnloaded(cubinCrc);
+    {
+      std::lock_guard<std::mutex> lock(contextMutex);
+      loadedCubins.erase(std::remove_if(loadedCubins.begin(),
+                                        loadedCubins.end(),
+                                        [cubinCrc](const CubinRef &r) {
+                                          return r.crc == cubinCrc;
+                                        }),
+                         loadedCubins.end());
+    }
   }
 }
 
diff --git a/src/pc_sampling.h b/src/pc_sampling.h
index 910a9f7..bd237cd 100644
--- a/src/pc_sampling.h
+++ b/src/pc_sampling.h
@@ -123,6 +123,20 @@ class PCSampling {
   // Contiguous stall reason map for USDT probe emission.
   StallReasonMap stallReasonMap;
 
+  // Lightweight cubin metadata for replaying cubin_loaded probes to
+  // late-attaching tracers. Protected by contextMutex.
+  struct CubinRef {
+    uint64_t crc;
+    const char *data;
+    size_t size;
+  };
+  std::vector<CubinRef> loadedCubins;
+
+  // Tracks whether we've replayed cubin_loaded probes for a late-attaching
+  // tracer. Reset to false when the cubin_loaded semaphore transitions to
+  // non-zero.
+  bool cubinsEmitted = false;
+
   // Rate limiter for stall reason map re-emission (1 token per 10 seconds).
   TokenBucket stallReasonMapLimiter{0.1};
 };
diff --git a/src/probes.d b/src/probes.d
index 914b73e..f5cb3a8 100644
--- a/src/probes.d
+++ b/src/probes.d
@@ -6,12 +6,8 @@ provider parcagpu {
                         uint32_t streamId, uint32_t graphId,
                         uint64_t graphNodeId, const char *name);
   probe activity_batch(const void **ptrs, uint32_t count);
-  probe pc_sample_summary(uint32_t functionIndex, uint64_t pcOffset,
-                           uint64_t totalSamples, uint64_t stalledSamples,
-                           const char *functionName);
-  probe pc_stall_reason(uint32_t functionIndex, uint64_t pcOffset,
-                         uint32_t stallReasonIndex, uint32_t samples);
+  probe pc_sample_batch(const void **records, uint32_t count);
   probe stall_reason_map(const char *names, uint32_t count);
-  probe cubin_loaded(uint64_t cubinCrc, int reserved1, int reserved2);
-  probe cubin_unloaded(uint64_t cubinCrc, int reserved);
+  probe cubin_loaded(uint64_t cubinCrc, const char *cubin, uint64_t cubinSize);
+  probe cubin_unloaded(uint64_t cubinCrc);
 };
diff --git a/test/bpf/activity_parser.bpf.c b/test/bpf/activity_parser.bpf.c
index 6eddf79..9396941 100644
--- a/test/bpf/activity_parser.bpf.c
+++ b/test/bpf/activity_parser.bpf.c
@@ -28,19 +28,21 @@
 #define MAX_KERNEL_NAME 128
 #define STALL_REASON_NAME_LEN 64
 #define MAX_STALL_REASONS 64
+#define MAX_CUBIN_SIZE (64 * 1024 * 1024) // 64MB safety cap
 
 // USDT spec map — populated by Go loader before uprobe attachment.
 // Keyed by spec ID (uint32); value is struct bpf_usdt_spec.
-// Old-style SEC("maps") definition to match the extern in usdt_args.h.
-struct bpf_map_def __bpf_usdt_specs SEC("maps") = {
-    .type = BPF_MAP_TYPE_HASH,
-    .key_size = sizeof(u32),
-    .value_size = sizeof(struct bpf_usdt_spec),
-    .max_entries = 256,
-};
+struct usdt_specs_t {
+    __uint(type, BPF_MAP_TYPE_HASH);
+    __type(key, u32);
+    __type(value, struct bpf_usdt_spec);
+    __uint(max_entries, 256);
+} __bpf_usdt_specs SEC(".maps");
 
 // Event sent to user-space for each kernel activity found.
 struct kernel_event {
+  u32 event_type; // EVENT_TYPE_KERNEL
+  u32 _pad;
   u64 start;
   u64 end;
   u32 correlation_id;
@@ -51,6 +53,59 @@ struct kernel_event {
   char name[MAX_KERNEL_NAME];
 };
 
+// Cubin load/unload events — Go reads actual bytes via /proc/pid/mem.
+struct cubin_event {
+  u32 event_type; // EVENT_TYPE_CUBIN_LOADED or EVENT_TYPE_CUBIN_UNLOADED
+  u32 _pad;
+  u64 cubin_crc;
+  u64 cubin_ptr; // user-space address (for /proc/pid/mem read)
+  u64 cubin_size;
+};
+
+// Event type tags for the ring buffer.
+#define EVENT_TYPE_KERNEL 1
+#define EVENT_TYPE_CUBIN_LOADED 2
+#define EVENT_TYPE_CUBIN_UNLOADED 3
+#define EVENT_TYPE_PC_SAMPLE 4
+
+// Matches CUpti_PCSamplingStallReason (packed, aligned 8).
+struct cupti_stall_reason {
+  u32 stall_reason_index;
+  u32 samples;
+};
+
+// Matches CUpti_PCSamplingPCData (packed, aligned 8).
+// Contains user-space pointers that BPF chases with bpf_probe_read_user.
+// We read the base 56-byte struct, then conditionally read correlationId
+// if the size field indicates CUDA 13+ (size > 56).
+struct cupti_pc_data {
+  u64 size;               // struct size (56 = CUDA 12, 60+ = CUDA 13)
+  u64 cubin_crc;
+  u64 pc_offset;
+  u32 function_index;
+  u32 _pad;
+  u64 function_name_ptr;  // const char* in user-space
+  u64 stall_reason_count;
+  u64 stall_reason_ptr;   // CUpti_PCSamplingStallReason* in user-space
+} __attribute__((__packed__)) __attribute__((aligned(8)));
+
+#define CUPTI_PC_DATA_BASE_SIZE 56
+
+#define MAX_PC_BATCH_SIZE 512
+#define MAX_FUNC_NAME 128
+
+// PC sample event sent to user-space.
+struct pc_sample_event {
+  u32 event_type; // EVENT_TYPE_PC_SAMPLE
+  u32 stall_reason_count;
+  u64 cubin_crc;
+  u64 pc_offset;
+  u32 function_index;
+  u32 correlation_id; // kernel correlation ID (CUDA 13+ serialized mode, else 0)
+  char function_name[MAX_FUNC_NAME];
+  struct cupti_stall_reason stall_reasons[MAX_STALL_REASONS];
+};
+
 // Ring buffer for sending events to user-space.
 struct bpf_map_def events SEC("maps") = {
     .type = BPF_MAP_TYPE_RINGBUF,
@@ -145,6 +200,7 @@ int BPF_USDT(handle_activity_batch, u64 ptrs_base, u32 num_activities) {
       continue;
     }
 
+    evt->event_type = EVENT_TYPE_KERNEL;
     evt->start = record.start;
     evt->end = record.end;
     evt->correlation_id = record.correlation_id;
@@ -186,8 +242,9 @@ int BPF_USDT(handle_stall_reason_map, u64 names_base, u32 count) {
       break;
 
     char name[STALL_REASON_NAME_LEN] = {};
-    int ret = bpf_probe_read_user(name, sizeof(name),
-                                  (void *)(names_base + (u64)i * STALL_REASON_NAME_LEN));
+    int ret = bpf_probe_read_user(
+        name, sizeof(name),
+        (void *)(names_base + (u64)i * STALL_REASON_NAME_LEN));
     if (ret != 0)
       continue;
 
@@ -199,4 +256,111 @@ int BPF_USDT(handle_stall_reason_map, u64 names_base, u32 count) {
   return 0;
 }
 
+SEC("usdt/parcagpu/cubin_loaded")
+int BPF_USDT(handle_cubin_loaded, u64 cubin_crc, u64 cubin_ptr,
+             u64 cubin_size) {
+  if (cubin_size == 0 || cubin_size > MAX_CUBIN_SIZE)
+    return 0;
+
+  struct cubin_event *evt = bpf_ringbuf_reserve(&events, sizeof(*evt), 0);
+  if (!evt) {
+    bump_stat(STAT_DROPS);
+    return 0;
+  }
+
+  evt->event_type = EVENT_TYPE_CUBIN_LOADED;
+  evt->cubin_crc = cubin_crc;
+  evt->cubin_ptr = cubin_ptr;
+  evt->cubin_size = cubin_size;
+  bpf_ringbuf_submit(evt, 0);
+  return 0;
+}
+
+SEC("usdt/parcagpu/cubin_unloaded")
+int BPF_USDT(handle_cubin_unloaded, u64 cubin_crc) {
+  struct cubin_event *evt = bpf_ringbuf_reserve(&events, sizeof(*evt), 0);
+  if (!evt) {
+    bump_stat(STAT_DROPS);
+    return 0;
+  }
+
+  evt->event_type = EVENT_TYPE_CUBIN_UNLOADED;
+  evt->cubin_crc = cubin_crc;
+  evt->cubin_ptr = 0;
+  evt->cubin_size = 0;
+  bpf_ringbuf_submit(evt, 0);
+  return 0;
+}
+
+SEC("usdt/parcagpu/pc_sample_batch")
+int BPF_USDT(handle_pc_sample_batch, u64 ptrs_base, u32 count) {
+  if (count > MAX_PC_BATCH_SIZE)
+    count = MAX_PC_BATCH_SIZE;
+
+  for (u32 i = 0; i < MAX_PC_BATCH_SIZE; i++) {
+    if (i >= count)
+      break;
+
+    // Read the i-th pointer from the array.
+    u64 rec_ptr = 0;
+    int ret = bpf_probe_read_user(&rec_ptr, sizeof(rec_ptr),
+                                  (void *)(ptrs_base + (u64)i * sizeof(u64)));
+    if (ret != 0 || rec_ptr == 0)
+      continue;
+
+    // Chase the pointer to read the CUPTI PC data record.
+    struct cupti_pc_data rec = {};
+    ret = bpf_probe_read_user(&rec, sizeof(rec), (void *)rec_ptr);
+    if (ret != 0)
+      continue;
+
+    // Reserve ring buffer space for the event.
+    struct pc_sample_event *evt =
+        bpf_ringbuf_reserve(&events, sizeof(*evt), 0);
+    if (!evt) {
+      bump_stat(STAT_DROPS);
+      continue;
+    }
+
+    evt->event_type = EVENT_TYPE_PC_SAMPLE;
+    evt->cubin_crc = rec.cubin_crc;
+    evt->pc_offset = rec.pc_offset;
+    evt->function_index = rec.function_index;
+
+    // Read correlationId if the struct is large enough (CUDA 13+).
+    // It sits right after the stallReason pointer at offset 56.
+    evt->correlation_id = 0;
+    if (rec.size > CUPTI_PC_DATA_BASE_SIZE) {
+      u32 corr = 0;
+      bpf_probe_read_user(&corr, sizeof(corr),
+                          (void *)(rec_ptr + CUPTI_PC_DATA_BASE_SIZE));
+      evt->correlation_id = corr;
+    }
+
+    // Chase the function name pointer.
+    if (rec.function_name_ptr) {
+      bpf_probe_read_user_str(evt->function_name, sizeof(evt->function_name),
+                              (void *)rec.function_name_ptr);
+    } else {
+      evt->function_name[0] = '\0';
+    }
+
+    // Chase the stall reason pointer.
+    u32 sr_count = rec.stall_reason_count;
+    if (sr_count > MAX_STALL_REASONS)
+      sr_count = MAX_STALL_REASONS;
+    evt->stall_reason_count = sr_count;
+
+    if (rec.stall_reason_ptr && sr_count > 0) {
+      bpf_probe_read_user(evt->stall_reasons,
+                          sr_count * sizeof(struct cupti_stall_reason),
+                          (void *)rec.stall_reason_ptr);
+    }
+
+    bpf_ringbuf_submit(evt, 0);
+  }
+
+  return 0;
+}
+
 char LICENSE[] SEC("license") = "GPL";
diff --git a/test/bpf/go.mod b/test/bpf/go.mod
index 46ea536..c674309 100644
--- a/test/bpf/go.mod
+++ b/test/bpf/go.mod
@@ -4,22 +4,19 @@ go 1.25.1
 
 require (
 	github.com/cilium/ebpf v0.21.0
+	github.com/gnurizen/sass-table v0.0.1
 	go.opentelemetry.io/ebpf-profiler v0.0.0-00010101000000-000000000000
 	golang.org/x/sys v0.41.0
 )
 
 require (
 	github.com/cespare/xxhash/v2 v2.3.0 // indirect
-	github.com/go-logr/logr v1.4.3 // indirect
-	github.com/go-logr/stdr v1.2.2 // indirect
 	github.com/google/uuid v1.6.0 // indirect
-	github.com/klauspost/cpuid/v2 v2.2.8 // indirect
+	github.com/klauspost/cpuid/v2 v2.2.10 // indirect
 	github.com/minio/sha256-simd v1.0.1 // indirect
-	github.com/sirupsen/logrus v1.9.3 // indirect
-	go.opentelemetry.io/auto/sdk v1.2.1 // indirect
-	go.opentelemetry.io/otel v1.39.0 // indirect
-	go.opentelemetry.io/otel/metric v1.39.0 // indirect
-	go.opentelemetry.io/otel/trace v1.39.0 // indirect
+	github.com/rogpeppe/go-internal v1.14.1 // indirect
+	go.opentelemetry.io/otel v1.41.0 // indirect
+	go.opentelemetry.io/otel/metric v1.41.0 // indirect
 )
 
 replace go.opentelemetry.io/ebpf-profiler => ../../vendor/opentelemetry-ebpf-profiler
diff --git a/test/bpf/go.sum b/test/bpf/go.sum
index 7eaf9ca..aea53ac 100644
--- a/test/bpf/go.sum
+++ b/test/bpf/go.sum
@@ -1,14 +1,11 @@
 github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs=
 github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs=
-github.com/cilium/ebpf v0.20.0 h1:atwWj9d3NffHyPZzVlx3hmw1on5CLe9eljR8VuHTwhM=
-github.com/cilium/ebpf v0.20.0/go.mod h1:pzLjFymM+uZPLk/IXZUL63xdx5VXEo+enTzxkZXdycw=
 github.com/cilium/ebpf v0.21.0 h1:4dpx1J/B/1apeTmWBH5BkVLayHTkFrMovVPnHEk+l3k=
 github.com/cilium/ebpf v0.21.0/go.mod h1:1kHKv6Kvh5a6TePP5vvvoMa1bclRyzUXELSs272fmIQ=
-github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
-github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
-github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
 github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc h1:U9qPSI2PIWSS1VwoXQT9A3Wy9MM3WgvqSxFWenqJduM=
-github.com/go-logr/logr v1.2.2/go.mod h1:jdQByPbusPIv2/zmleS9BjJVeZ6kBagPoEUsqbVz/1A=
+github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
+github.com/gnurizen/sass-table v0.0.1 h1:LvV2GIUAIv6ZXLzr6TbC51Domg/5iVihhc7JXVGZXe0=
+github.com/gnurizen/sass-table v0.0.1/go.mod h1:epXL4VyR6Yuec1rY5RV9awBdWm6ibGUKRB5zajgyUjo=
 github.com/go-logr/logr v1.4.3 h1:CjnDlHq8ikf6E492q6eKboGOC0T8CDaOvkHCIg8idEI=
 github.com/go-logr/logr v1.4.3/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY=
 github.com/go-logr/stdr v1.2.2 h1:hSWxHoqTgW2S2qGc0LTAI563KZ5YKYRhT3MFKZMbjag=
@@ -23,49 +20,41 @@ github.com/josharian/native v1.1.0 h1:uuaP0hAbW7Y4l0ZRQ6C9zfb7Mg1mbFKry/xzDAfmtL
 github.com/josharian/native v1.1.0/go.mod h1:7X/raswPFr05uY3HiLlYeyQntB6OO7E/d2Cu7qoaN2w=
 github.com/jsimonetti/rtnetlink/v2 v2.0.3 h1:Jcp7GTnTPepoUAJ9+LhTa7ZiebvNS56T1GtlEUaPNFE=
 github.com/jsimonetti/rtnetlink/v2 v2.0.3/go.mod h1:atIkksp/9fqtf6rpAw45JnttnP2gtuH9X88WPfWfS9A=
-github.com/klauspost/cpuid/v2 v2.2.8 h1:+StwCXwm9PdpiEkPyzBXIy+M9KUb4ODm0Zarf1kS5BM=
-github.com/klauspost/cpuid/v2 v2.2.8/go.mod h1:Lcz8mBdAVJIBVzewtcLocK12l3Y+JytZYpaMropDUws=
+github.com/klauspost/cpuid/v2 v2.2.10 h1:tBs3QSyvjDyFTq3uoc/9xFpCuOsJQFNPiAhYdw2skhE=
+github.com/klauspost/cpuid/v2 v2.2.10/go.mod h1:hqwkgyIinND0mEev00jJYCxPNVRVXFQeu1XKlok6oO0=
 github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE=
 github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk=
 github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY=
 github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE=
 github.com/mdlayher/netlink v1.7.2 h1:/UtM3ofJap7Vl4QWCPDGXY8d3GIY2UGSDbK+QWmY8/g=
 github.com/mdlayher/netlink v1.7.2/go.mod h1:xraEF7uJbxLhc5fpHL4cPe221LI2bdttWlU+ZGLfQSw=
-github.com/mdlayher/socket v0.4.1 h1:eM9y2/jlbs1M615oshPQOHZzj6R6wMT7bX5NPiQvn2U=
-github.com/mdlayher/socket v0.4.1/go.mod h1:cAqeGjoufqdxWkD7DkpyS+wcefOtmu5OQ8KuoJGIReA=
 github.com/mdlayher/socket v0.5.1 h1:VZaqt6RkGkt2OE9l3GcC6nZkqD3xKeQLyfleW/uBcos=
+github.com/mdlayher/socket v0.5.1/go.mod h1:TjPLHI1UgwEv5J1B5q0zTZq12A/6H7nKmtTanQE37IQ=
 github.com/minio/sha256-simd v1.0.1 h1:6kaan5IFmwTNynnKKpDHe6FWHohJOHhCPchzK49dzMM=
 github.com/minio/sha256-simd v1.0.1/go.mod h1:Pz6AKMiUdngCLpeTL/RJY1M9rUuPMYujV5xJjtbRSN8=
-github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
-github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
 github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 h1:Jamvg5psRIccs7FGNTlIRMkT8wgtp5eCXdBlqhYGL6U=
+github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
 github.com/rogpeppe/go-internal v1.14.1 h1:UQB4HGPB6osV0SQTLymcB4TgvyWu6ZyliaW0tI/otEQ=
 github.com/rogpeppe/go-internal v1.14.1/go.mod h1:MaRKkUm5W0goXpeCfT7UZI6fk/L7L7so1lCWt35ZSgc=
-github.com/sirupsen/logrus v1.9.3 h1:dueUQJ1C2q9oE3F7wvmSGAaVtTmUizReu6fjN8uqzbQ=
-github.com/sirupsen/logrus v1.9.3/go.mod h1:naHLuLoDiP4jHNo9R0sCBMtWGeIprob74mVsIT4qYEQ=
-github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
-github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
+github.com/sirupsen/logrus v1.9.4 h1:TsZE7l11zFCLZnZ+teH4Umoq5BhEIfIzfRDZ1Uzql2w=
+github.com/sirupsen/logrus v1.9.4/go.mod h1:ftWc9WdOfJ0a92nsE2jF5u5ZwH8Bv2zdeOC42RjbV2g=
 github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U=
 github.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U=
 go.opentelemetry.io/auto/sdk v1.2.1 h1:jXsnJ4Lmnqd11kwkBV2LgLoFMZKizbCi5fNZ/ipaZ64=
 go.opentelemetry.io/auto/sdk v1.2.1/go.mod h1:KRTj+aOaElaLi+wW1kO/DZRXwkF4C5xPbEe3ZiIhN7Y=
-go.opentelemetry.io/otel v1.39.0 h1:8yPrr/S0ND9QEfTfdP9V+SiwT4E0G7Y5MO7p85nis48=
-go.opentelemetry.io/otel v1.39.0/go.mod h1:kLlFTywNWrFyEdH0oj2xK0bFYZtHRYUdv1NklR/tgc8=
-go.opentelemetry.io/otel/metric v1.39.0 h1:d1UzonvEZriVfpNKEVmHXbdf909uGTOQjA0HF0Ls5Q0=
-go.opentelemetry.io/otel/metric v1.39.0/go.mod h1:jrZSWL33sD7bBxg1xjrqyDjnuzTUB0x1nBERXd7Ftcs=
-go.opentelemetry.io/otel/trace v1.39.0 h1:2d2vfpEDmCJ5zVYz7ijaJdOF59xLomrvj7bjt6/qCJI=
-go.opentelemetry.io/otel/trace v1.39.0/go.mod h1:88w4/PnZSazkGzz/w84VHpQafiU4EtqqlVdxWy+rNOA=
-golang.org/x/arch v0.20.0 h1:dx1zTU0MAE98U+TQ8BLl7XsJbgze2WnNKF/8tGp/Q6c=
-golang.org/x/arch v0.20.0/go.mod h1:bdwinDaKcfZUGpH09BB7ZmOfhalA8lQdzl62l8gGWsk=
-golang.org/x/net v0.46.0 h1:giFlY12I07fugqwPuWJi68oOnpfqFnJIJzaIIm2JVV4=
-golang.org/x/net v0.46.0/go.mod h1:Q9BGdFy1y4nkUwiLvT5qtyhAnEHgnQ/zd8PfU6nc210=
-golang.org/x/sync v0.17.0 h1:l60nONMj9l5drqw6jlhIELNv9I0A4OFgRsG9k2oT9Ug=
-golang.org/x/sync v0.17.0/go.mod h1:9KTHXmSnoGruLpwFjVSX0lNNA75CykiMECbovNTZqGI=
-golang.org/x/sys v0.0.0-20220715151400-c0bba94af5f8/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
-golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
+go.opentelemetry.io/otel v1.41.0 h1:YlEwVsGAlCvczDILpUXpIpPSL/VPugt7zHThEMLce1c=
+go.opentelemetry.io/otel v1.41.0/go.mod h1:Yt4UwgEKeT05QbLwbyHXEwhnjxNO6D8L5PQP51/46dE=
+go.opentelemetry.io/otel/metric v1.41.0 h1:rFnDcs4gRzBcsO9tS8LCpgR0dxg4aaxWlJxCno7JlTQ=
+go.opentelemetry.io/otel/metric v1.41.0/go.mod h1:xPvCwd9pU0VN8tPZYzDZV/BMj9CM9vs00GuBjeKhJps=
+go.opentelemetry.io/otel/trace v1.41.0 h1:Vbk2co6bhj8L59ZJ6/xFTskY+tGAbOnCtQGVVa9TIN0=
+go.opentelemetry.io/otel/trace v1.41.0/go.mod h1:U1NU4ULCoxeDKc09yCWdWe+3QoyweJcISEVa1RBzOis=
+golang.org/x/arch v0.24.0 h1:qlJ3M9upxvFfwRM51tTg3Yl+8CP9vCC1E7vlFpgv99Y=
+golang.org/x/arch v0.24.0/go.mod h1:dNHoOeKiyja7GTvF9NJS1l3Z2yntpQNzgrjh1cU103A=
+golang.org/x/net v0.51.0 h1:94R/GTO7mt3/4wIKpcR5gkGmRLOuE/2hNGeWq/GBIFo=
+golang.org/x/net v0.51.0/go.mod h1:aamm+2QF5ogm02fjy5Bb7CQ0WMt1/WVM7FtyaTLlA9Y=
+golang.org/x/sync v0.19.0 h1:vV+1eWNmZ5geRlYjzm2adRgW2/mcpevXNg50YZtPCE4=
+golang.org/x/sync v0.19.0/go.mod h1:9KTHXmSnoGruLpwFjVSX0lNNA75CykiMECbovNTZqGI=
 golang.org/x/sys v0.41.0 h1:Ivj+2Cp/ylzLiEU89QhWblYnOE9zerudt9Ftecq2C6k=
 golang.org/x/sys v0.41.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks=
-gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
-gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
 gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
 gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
diff --git a/test/bpf/main.go b/test/bpf/main.go
index f4dde41..e6befbd 100644
--- a/test/bpf/main.go
+++ b/test/bpf/main.go
@@ -1,10 +1,7 @@
 // Test program that loads the activity_parser BPF program, attaches it
-// to the parcagpu:activity_batch USDT probe in the target shared library,
-// and logs kernel activities received through the ring buffer.
-//
-// The USDT probe location and argument specs are parsed from the ELF
-// .note.stapsdt section using pfelf, then populated into the BPF
-// __bpf_usdt_specs map so bpf_usdt_arg() reads the correct registers.
+// to parcagpu USDT probes in the target shared library, and logs kernel
+// activities received through the ring buffer. Also captures cubin modules
+// and resolves PC addresses to source lines using llvm-dwarfdump.
 //
 // Usage:
 //
@@ -14,30 +11,50 @@
 package main
 
 import (
+	"bufio"
 	"bytes"
+	"debug/elf"
 	"encoding/binary"
 	"errors"
 	"flag"
 	"fmt"
+	"io"
 	"log"
 	"os"
+	"os/exec"
 	"os/signal"
 	"path/filepath"
+	"sort"
+	"strconv"
+	"strings"
+	"sync"
 	"syscall"
 	"time"
 	"unsafe"
 
-	"github.com/cilium/ebpf"
+	ebpf2 "github.com/cilium/ebpf"
 	"github.com/cilium/ebpf/link"
 	"github.com/cilium/ebpf/ringbuf"
 	"go.opentelemetry.io/ebpf-profiler/libpf/pfelf"
 	"golang.org/x/sys/unix"
+
+	sasstable "github.com/gnurizen/sass-table"
 )
 
 //go:generate go run github.com/cilium/ebpf/cmd/bpf2go -target $GOARCH -cflags "-I../../vendor/opentelemetry-ebpf-profiler/support/ebpf" activityParser activity_parser.bpf.c
 
+// Event type tags — must match BPF #defines.
+const (
+	eventTypeKernel        = 1
+	eventTypeCubinLoaded   = 2
+	eventTypeCubinUnloaded = 3
+	eventTypePCSample      = 4
+)
+
 // KernelEvent matches struct kernel_event in the BPF program.
 type KernelEvent struct {
+	EventType     uint32
+	_             uint32
 	Start         uint64
 	End           uint64
 	CorrelationID uint32
@@ -48,6 +65,33 @@ type KernelEvent struct {
 	Name          [128]byte
 }
 
+// CubinEvent matches struct cubin_event in the BPF program.
+type CubinEvent struct {
+	EventType uint32
+	_         uint32
+	CubinCRC  uint64
+	CubinPtr  uint64
+	CubinSize uint64
+}
+
+// StallReason matches struct cupti_stall_reason in the BPF program.
+type StallReason struct {
+	Index   uint32
+	Samples uint32
+}
+
+// PCSampleEvent matches struct pc_sample_event in the BPF program.
+type PCSampleEvent struct {
+	EventType        uint32
+	StallReasonCount uint32
+	CubinCRC         uint64
+	PCOffset         uint64
+	FunctionIndex    uint32
+	CorrelationID    uint32
+	FunctionName     [128]byte
+	StallReasons     [64]StallReason
+}
+
 const (
 	statBatches    = 0
 	statActivities = 1
@@ -55,6 +99,277 @@ const (
 	statDrops      = 3
 )
 
+// lineEntry is a single address→source mapping from the DWARF line table.
+type lineEntry struct {
+	addr uint64
+	file string
+	line int
+}
+
+// cubinStore holds loaded cubins and their parsed line tables.
+type cubinStore struct {
+	mu     sync.RWMutex
+	cubins map[uint64]*cubinInfo // keyed by CRC
+	pid    int
+}
+
+// textSection holds one .text._Zfuncname section from the cubin ELF.
+type textSection struct {
+	name string
+	data []byte
+}
+
+type cubinInfo struct {
+	crc       uint64
+	size      uint64
+	lines     []lineEntry // sorted by addr
+	files     []string    // file table from line table header
+	archSM    int              // SM version from ELF e_flags (e.g. 121)
+	texts     []textSection    // .text sections for instruction decoding
+	tmpPath   string           // temp file for llvm-dwarfdump
+}
+
+func newCubinStore(pid int) *cubinStore {
+	return &cubinStore{
+		cubins: make(map[uint64]*cubinInfo),
+		pid:    pid,
+	}
+}
+
+func (cs *cubinStore) load(crc, ptr, size uint64) {
+	cs.mu.Lock()
+	defer cs.mu.Unlock()
+
+	if _, ok := cs.cubins[crc]; ok {
+		return // already loaded
+	}
+
+	data, err := readProcessMemory(cs.pid, ptr, size)
+	if err != nil {
+		log.Printf("  [CUBIN] failed to read cubin 0x%x (%d bytes) from pid %d: %v",
+			crc, size, cs.pid, err)
+		return
+	}
+
+	info := &cubinInfo{crc: crc, size: size}
+
+	// Write to temp file for llvm-dwarfdump.
+	tmp, err := os.CreateTemp("", fmt.Sprintf("cubin_%x_*.elf", crc))
+	if err != nil {
+		log.Printf("  [CUBIN] 0x%x loaded (%d bytes), no temp file: %v", crc, size, err)
+		cs.cubins[crc] = info
+		return
+	}
+	tmp.Write(data)
+	tmp.Close()
+	info.tmpPath = tmp.Name()
+
+	// Parse line table with llvm-dwarfdump.
+	lines, files, err := parseLinesWithDwarfdump(info.tmpPath)
+	if err != nil {
+		log.Printf("  [CUBIN] 0x%x loaded (%d bytes), no line info: %v", crc, size, err)
+	} else {
+		info.lines = lines
+		info.files = files
+		log.Printf("  [CUBIN] 0x%x loaded (%d bytes), %d line entries, %d files",
+			crc, size, len(lines), len(files))
+	}
+
+	// Parse ELF to extract SM version and .text sections for SASS decoding.
+	archSM, texts := parseCubinELF(data)
+	info.archSM = archSM
+	info.texts = texts
+	if archSM > 0 {
+		log.Printf("  [CUBIN] 0x%x SM%d, %d text sections", crc, archSM, len(texts))
+	}
+
+	cs.cubins[crc] = info
+}
+
+func (cs *cubinStore) unload(crc uint64) {
+	cs.mu.Lock()
+	defer cs.mu.Unlock()
+	if info, ok := cs.cubins[crc]; ok {
+		if info.tmpPath != "" {
+			os.Remove(info.tmpPath)
+		}
+		delete(cs.cubins, crc)
+	}
+	log.Printf("  [CUBIN] 0x%x unloaded", crc)
+}
+
+func (cs *cubinStore) cleanup() {
+	cs.mu.Lock()
+	defer cs.mu.Unlock()
+	for _, info := range cs.cubins {
+		if info.tmpPath != "" {
+			os.Remove(info.tmpPath)
+		}
+	}
+}
+
+// resolveInstruction looks up the SASS mnemonic at a PC offset.
+// Uses the sass-table opcode decoder first, falls back to nvdisasm cache.
+func (cs *cubinStore) resolveInstruction(cubinCRC uint64, pcOffset uint64) string {
+	cs.mu.RLock()
+	info, ok := cs.cubins[cubinCRC]
+	cs.mu.RUnlock()
+	if !ok {
+		return ""
+	}
+
+	if info.archSM == 0 || len(info.texts) == 0 {
+		return ""
+	}
+
+	// pcOffset is function-relative. Try all text sections — the offset
+	// should only produce a valid decode in the correct one.
+	for _, ts := range info.texts {
+		if int(pcOffset)+16 <= len(ts.data) {
+			m := sasstable.DecodeMnemonicFromSlice(info.archSM, ts.data[pcOffset:])
+			if m != "" {
+				return m
+			}
+		}
+	}
+	return ""
+}
+
+// parseCubinELF extracts the SM version and .text section data from a cubin ELF.
+func parseCubinELF(data []byte) (archSM int, texts []textSection) {
+	f, err := elf.NewFile(bytes.NewReader(data))
+	if err != nil {
+		return 0, nil
+	}
+	defer f.Close()
+
+	// Go's debug/elf doesn't expose e_flags. Read it directly from the ELF header.
+	// For ELF64: e_flags is at offset 48, 4 bytes little-endian.
+	if len(data) >= 52 {
+		flags := binary.LittleEndian.Uint32(data[48:52])
+		archSM = int((flags >> 8) & 0xFF)
+	}
+
+	for _, s := range f.Sections {
+		if s.Type == elf.SHT_PROGBITS && s.Flags&elf.SHF_EXECINSTR != 0 &&
+			strings.HasPrefix(s.Name, ".text") {
+			d, err := s.Data()
+			if err != nil {
+				continue
+			}
+			texts = append(texts, textSection{name: s.Name, data: d})
+		}
+	}
+	return archSM, texts
+}
+
+// resolvePC looks up a PC offset in a cubin's line table.
+func (cs *cubinStore) resolvePC(cubinCRC uint64, pcOffset uint64) (file string, line int) {
+	cs.mu.RLock()
+	info, ok := cs.cubins[cubinCRC]
+	cs.mu.RUnlock()
+	if !ok || len(info.lines) == 0 {
+		return
+	}
+
+	// Binary search for the largest address <= pcOffset.
+	i := sort.Search(len(info.lines), func(i int) bool {
+		return info.lines[i].addr > pcOffset
+	})
+	if i == 0 {
+		return
+	}
+	e := info.lines[i-1]
+	return e.file, e.line
+}
+
+// parseLinesWithDwarfdump runs llvm-dwarfdump --debug-line on the cubin ELF
+// and parses the output into a sorted line table.
+func parseLinesWithDwarfdump(path string) ([]lineEntry, []string, error) {
+	cmd := exec.Command("llvm-dwarfdump", "--debug-line", path)
+	out, err := cmd.Output()
+	if err != nil {
+		return nil, nil, fmt.Errorf("llvm-dwarfdump: %w", err)
+	}
+
+	var entries []lineEntry
+	files := map[string]bool{}
+	fileList := []string{}
+
+	scanner := bufio.NewScanner(bytes.NewReader(out))
+	for scanner.Scan() {
+		line := scanner.Text()
+
+		// Parse file table entries: file_names[  N]:
+		//            name: "foo.cu"
+		if strings.HasPrefix(line, "           name: ") {
+			name := strings.Trim(strings.TrimPrefix(line, "           name: "), "\"")
+			if !files[name] {
+				files[name] = true
+				fileList = append(fileList, name)
+			}
+			continue
+		}
+
+		// Parse line table rows: 0xADDR  LINE  COL  FILE ...
+		if !strings.HasPrefix(line, "0x") {
+			continue
+		}
+
+		fields := strings.Fields(line)
+		if len(fields) < 4 {
+			continue
+		}
+
+		addr, err := strconv.ParseUint(strings.TrimPrefix(fields[0], "0x"), 16, 64)
+		if err != nil {
+			continue
+		}
+
+		lineNum, err := strconv.Atoi(fields[1])
+		if err != nil {
+			continue
+		}
+
+		// fields[3] is the file index (1-based)
+		fileIdx, err := strconv.Atoi(fields[3])
+		if err != nil || fileIdx < 1 || fileIdx > len(fileList) {
+			continue
+		}
+
+		entries = append(entries, lineEntry{
+			addr: addr,
+			file: fileList[fileIdx-1],
+			line: lineNum,
+		})
+	}
+
+	if len(entries) == 0 {
+		return nil, nil, fmt.Errorf("no line entries found")
+	}
+
+	sort.Slice(entries, func(i, j int) bool {
+		return entries[i].addr < entries[j].addr
+	})
+
+	return entries, fileList, nil
+}
+
+func readProcessMemory(pid int, addr, size uint64) ([]byte, error) {
+	f, err := os.Open(fmt.Sprintf("/proc/%d/mem", pid))
+	if err != nil {
+		return nil, err
+	}
+	defer f.Close()
+
+	data := make([]byte, size)
+	_, err = f.ReadAt(data, int64(addr))
+	if err != nil && !errors.Is(err, io.EOF) {
+		return nil, err
+	}
+	return data, nil
+}
+
 func main() {
 	pid := flag.Int("pid", 0, "PID of the target process")
 	libPath := flag.String("lib", "", "Path to the shared library containing the USDT probe")
@@ -83,7 +398,7 @@ func main() {
 	// Load pre-compiled BPF objects.
 	objs := activityParserObjects{}
 	if err := loadActivityParserObjects(&objs, nil); err != nil {
-		var ve *ebpf.VerifierError
+		var ve *ebpf2.VerifierError
 		if errors.As(err, &ve) {
 			log.Fatalf("Verifier error loading BPF objects:\n%+v", ve)
 		}
@@ -115,11 +430,14 @@ func main() {
 
 	type probeTarget struct {
 		name    string
-		handler *ebpf.Program
+		handler *ebpf2.Program
 	}
 	targets := []probeTarget{
 		{"activity_batch", objs.HandleActivityBatch},
 		{"stall_reason_map", objs.HandleStallReasonMap},
+		{"cubin_loaded", objs.HandleCubinLoaded},
+		{"cubin_unloaded", objs.HandleCubinUnloaded},
+		{"pc_sample_batch", objs.HandlePcSampleBatch},
 	}
 
 	var links []link.Link
@@ -130,19 +448,16 @@ func main() {
 				continue
 			}
 
-			// Parse the stapsdt argument spec into a bpf_usdt_spec.
 			spec, err := pfelf.ParseUSDTArguments(probe.Arguments)
 			if err != nil {
 				log.Fatalf("Parsing USDT args %q: %v", probe.Arguments, err)
 			}
 
-			// Store spec in the BPF map so bpf_usdt_arg() can look it up.
 			specBytes := pfelf.USDTSpecToBytes(spec)
 			if err := objs.BpfUsdtSpecs.Put(specID, specBytes); err != nil {
 				log.Fatalf("Populating USDT spec map: %v", err)
 			}
 
-			// Cookie: spec_id in high 32 bits (bpf_usdt_arg reads it via bpf_get_attach_cookie).
 			cookie := uint64(specID) << 32
 
 			log.Printf("USDT probe parcagpu:%s at offset 0x%x, args=%q, spec_id=%d",
@@ -171,6 +486,12 @@ func main() {
 		}
 	}()
 
+	cubins := newCubinStore(*pid)
+	defer cubins.cleanup()
+
+	// Stall reason index → name cache, populated lazily from BPF map.
+	stallReasonNames := map[uint32]string{}
+
 	// Open ring buffer reader.
 	rd, err := ringbuf.NewReader(objs.Events)
 	if err != nil {
@@ -182,7 +503,6 @@ func main() {
 	sig := make(chan os.Signal, 1)
 	signal.Notify(sig, syscall.SIGINT, syscall.SIGTERM)
 
-	// Also watch for the target process to exit.
 	done := make(chan struct{})
 	go func() {
 		for {
@@ -204,9 +524,10 @@ func main() {
 	}()
 
 	log.Printf("Attached %d USDT probe(s) in %s (PID %d)", len(links), realLib, *pid)
-	log.Printf("Waiting for kernel activity events...")
+	log.Printf("Waiting for events...")
 
 	var eventCount uint64
+	var pcSampleCount uint64
 	ticker := time.NewTicker(2 * time.Second)
 	defer ticker.Stop()
 
@@ -216,7 +537,6 @@ func main() {
 		}
 	}()
 
-	var event KernelEvent
 	for {
 		record, err := rd.Read()
 		if err != nil {
@@ -227,24 +547,103 @@ func main() {
 			continue
 		}
 
-		if err := binary.Read(bytes.NewBuffer(record.RawSample), binary.LittleEndian, &event); err != nil {
-			log.Printf("Parsing event: %v", err)
+		raw := record.RawSample
+		if len(raw) < 4 {
 			continue
 		}
 
-		eventCount++
-		if *verbose {
-			name := cString(event.Name[:])
-			duration := event.End - event.Start
-			fmt.Printf("kernel: name=%-40s corr=%-6d dev=%d stream=%d graph=%-3d duration=%dns\n",
-				name, event.CorrelationID, event.DeviceID, event.StreamID, event.GraphID, duration)
+		eventType := binary.LittleEndian.Uint32(raw[:4])
+
+		switch eventType {
+		case eventTypeKernel:
+			var event KernelEvent
+			if err := binary.Read(bytes.NewReader(raw), binary.LittleEndian, &event); err != nil {
+				log.Printf("Parsing kernel event: %v", err)
+				continue
+			}
+			eventCount++
+			if *verbose {
+				name := cString(event.Name[:])
+				duration := event.End - event.Start
+				fmt.Printf("kernel: name=%-40s corr=%-6d dev=%d stream=%d graph=%-3d duration=%dns\n",
+					name, event.CorrelationID, event.DeviceID, event.StreamID, event.GraphID, duration)
+			}
+
+		case eventTypeCubinLoaded:
+			var event CubinEvent
+			if err := binary.Read(bytes.NewReader(raw), binary.LittleEndian, &event); err != nil {
+				log.Printf("Parsing cubin event: %v", err)
+				continue
+			}
+			cubins.load(event.CubinCRC, event.CubinPtr, event.CubinSize)
+
+		case eventTypeCubinUnloaded:
+			var event CubinEvent
+			if err := binary.Read(bytes.NewReader(raw), binary.LittleEndian, &event); err != nil {
+				log.Printf("Parsing cubin event: %v", err)
+				continue
+			}
+			cubins.unload(event.CubinCRC)
+
+		case eventTypePCSample:
+			var event PCSampleEvent
+			if err := binary.Read(bytes.NewReader(raw), binary.LittleEndian, &event); err != nil {
+				log.Printf("Parsing pc sample event: %v", err)
+				continue
+			}
+			pcSampleCount++
+
+			// Lazily populate stall reason name cache.
+			if len(stallReasonNames) == 0 {
+				for i := uint32(0); i < 64; i++ {
+					var name [64]byte
+					if err := objs.StallReasons.Lookup(&i, &name); err != nil {
+						continue
+					}
+					s := cString(name[:])
+					if s != "" {
+						stallReasonNames[i] = s
+					}
+				}
+			}
+
+			name := cString(event.FunctionName[:])
+			file, line := cubins.resolvePC(event.CubinCRC, event.PCOffset)
+			insn := cubins.resolveInstruction(event.CubinCRC, event.PCOffset)
+
+			src := ""
+			if file != "" {
+				src = fmt.Sprintf("  %s:%d", file, line)
+			}
+			insnStr := ""
+			if insn != "" {
+				insnStr = fmt.Sprintf("  [%s]", insn)
+			}
+			corrStr := ""
+			if event.CorrelationID != 0 {
+				corrStr = fmt.Sprintf("  corr=%d", event.CorrelationID)
+			}
+			fmt.Printf("pc_sample: %s  pc=0x%04x%s%s%s\n", name, event.PCOffset, src, insnStr, corrStr)
+			for i := uint32(0); i < event.StallReasonCount; i++ {
+				sr := event.StallReasons[i]
+				if sr.Samples == 0 {
+					continue
+				}
+				srName := stallReasonNames[sr.Index]
+				if srName == "" {
+					srName = fmt.Sprintf("reason[%d]", sr.Index)
+				}
+				fmt.Printf("    %s = %d\n", srName, sr.Samples)
+			}
 		}
 	}
 
 	fmt.Println()
 	log.Printf("Final stats:")
 	printStats(&objs, eventCount)
+	log.Printf("  pc_samples=%d", pcSampleCount)
 	printStallReasonMap(&objs)
+	printCubins(cubins)
 }
 
 func printStats(objs *activityParserObjects, eventCount uint64) {
@@ -264,7 +663,6 @@ func printStats(objs *activityParserObjects, eventCount uint64) {
 }
 
 func printStallReasonMap(objs *activityParserObjects) {
-	// Check if the stall reason map was populated by BPF.
 	var loaded uint32
 	loadedKey := uint32(0)
 	if err := objs.StallMapLoaded.Lookup(&loadedKey, &loaded); err != nil || loaded == 0 {
@@ -286,6 +684,31 @@ func printStallReasonMap(objs *activityParserObjects) {
 	}
 }
 
+func printCubins(cs *cubinStore) {
+	cs.mu.RLock()
+	defer cs.mu.RUnlock()
+
+	if len(cs.cubins) == 0 {
+		log.Printf("  cubins: none loaded")
+		return
+	}
+
+	log.Printf("  cubins loaded: %d", len(cs.cubins))
+	for crc, info := range cs.cubins {
+		log.Printf("    crc=0x%x size=%d lines=%d files=%v",
+			crc, info.size, len(info.lines), info.files)
+
+		// Print first 10 line entries as demo.
+		for i, e := range info.lines {
+			if i >= 10 {
+				log.Printf("      ... and %d more entries", len(info.lines)-10)
+				break
+			}
+			log.Printf("      0x%04x -> %s:%d", e.addr, e.file, e.line)
+		}
+	}
+}
+
 func raiseMemlock() error {
 	return unix.Setrlimit(unix.RLIMIT_MEMLOCK, &unix.Rlimit{
 		Cur: unix.RLIM_INFINITY,
@@ -300,5 +723,7 @@ func cString(b []byte) string {
 	return string(b)
 }
 
-// Ensure KernelEvent matches the BPF struct size.
-var _ = [1]struct{}{}[unsafe.Sizeof(KernelEvent{})-168]
+// Ensure struct sizes match BPF.
+var _ = [1]struct{}{}[unsafe.Sizeof(KernelEvent{})-176]
+var _ = [1]struct{}{}[unsafe.Sizeof(CubinEvent{})-32]
+var _ = [1]struct{}{}[unsafe.Sizeof(PCSampleEvent{})-672]
diff --git a/test/test-pc-real.sh b/test/test-pc-real.sh
new file mode 100755
index 0000000..6fc4d99
--- /dev/null
+++ b/test/test-pc-real.sh
@@ -0,0 +1,114 @@
+#!/bin/bash
+# Smoke test: runs pc_sample_toy under parcagpu with BPF activity parser.
+# Verifies stall reason map, PC samples, and cubin loading.
+#
+# Prerequisites:
+#   make local bpf-test
+#   nvcc -g -lineinfo -o microbenchmarks/pc_sample_toy microbenchmarks/pc_sample_toy.cu
+#
+# Usage:
+#   sudo -E test/test-pc-real.sh          # default
+#   sudo -E test/test-pc-real.sh -v       # verbose (print every event)
+
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
+ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
+
+LIB="$ROOT/build-local/lib/libparcagpucupti.so"
+TOY="$ROOT/microbenchmarks/pc_sample_toy"
+BPF="$ROOT/test/bpf/activity_parser"
+BPF_LOG="/tmp/parcagpu-pc-test-bpf.log"
+TOY_LOG="/tmp/parcagpu-pc-test-toy.log"
+VERBOSE=""
+
+for arg in "$@"; do
+  case "$arg" in
+    -v) VERBOSE="-v" ;;
+  esac
+done
+
+# --- Preflight checks ---
+for f in "$LIB" "$TOY" "$BPF"; do
+  if [ ! -x "$f" ] && [ ! -f "$f" ]; then
+    echo "error: $f not found" >&2
+    exit 1
+  fi
+done
+
+cleanup() {
+  [ -n "${TOY_PID:-}" ] && kill "$TOY_PID" 2>/dev/null || true
+  [ -n "${BPF_PID:-}" ] && kill "$BPF_PID" 2>/dev/null || true
+  wait 2>/dev/null || true
+}
+trap cleanup EXIT
+
+# --- Launch the toy workload ---
+echo "=== Starting pc_sample_toy ==="
+PARCAGPU_DEBUG=1 PARCAGPU_SAMPLING_FACTOR=18 CUDA_INJECTION64_PATH="$LIB" "$TOY" 8 > "$TOY_LOG" 2>&1 &
+TOY_PID=$!
+echo "pc_sample_toy PID: $TOY_PID"
+
+# Wait for library to be loaded into the process.
+while kill -0 "$TOY_PID" 2>/dev/null &&
+      ! grep -q libparcagpucupti "/proc/$TOY_PID/maps" 2>/dev/null; do
+  sleep 0.1
+done
+
+if ! kill -0 "$TOY_PID" 2>/dev/null; then
+  echo "error: pc_sample_toy exited before library loaded" >&2
+  exit 1
+fi
+
+# --- Attach BPF parser ---
+echo "=== Starting BPF activity parser ==="
+"$BPF" -pid "$TOY_PID" -lib "$LIB" $VERBOSE > "$BPF_LOG" 2>&1 &
+BPF_PID=$!
+echo "activity_parser PID: $BPF_PID"
+
+# --- Wait for workload to finish ---
+wait "$TOY_PID" 2>/dev/null || true
+TOY_PID=""
+sleep 2
+
+# --- Stop BPF parser ---
+kill "$BPF_PID" 2>/dev/null || true
+wait "$BPF_PID" 2>/dev/null || true
+BPF_PID=""
+
+# --- Results ---
+echo
+echo "=== Toy output (parcagpu debug) ==="
+cat "$TOY_LOG"
+echo
+echo "=== BPF parser output ==="
+cat "$BPF_LOG"
+echo
+
+# --- Checks ---
+PASS=true
+
+check() {
+  local label="$1" pattern="$2" file="$3"
+  if grep -q "$pattern" "$file"; then
+    echo "PASS: $label"
+  else
+    echo "FAIL: $label" >&2
+    PASS=false
+  fi
+}
+
+check "modules loaded (parcagpu)" "Module 0x.*loaded" "$TOY_LOG"
+check "stall reason map received" "stall reason map:" "$BPF_LOG"
+check "PC samples contain stall reasons" "smsp__pcsamp" "$BPF_LOG"
+check "cubins loaded (bpf)" "\[CUBIN\].*loaded" "$BPF_LOG"
+check "PC sample events received" "pc_samples=[1-9]" "$BPF_LOG"
+
+if $PASS; then
+  echo
+  echo "=== ALL CHECKS PASSED ==="
+else
+  echo
+  echo "=== SOME CHECKS FAILED ===" >&2
+  exit 1
+fi
diff --git a/test/test_cupti_prof.c b/test/test_cupti_prof.c
index 04810a0..30b763a 100644
--- a/test/test_cupti_prof.c
+++ b/test/test_cupti_prof.c
@@ -628,6 +628,13 @@ void *cupti_thread(void *arg) {
             size_t maxNumRecords;
             bufferRequestedCallback(&buffer, &bufferSize, &maxNumRecords);
 
+            // The profiler may return NULL when no tracer is attached
+            // (semaphore-gated short circuit). Skip this flush cycle.
+            if (buffer == NULL) {
+                usleep(100000);
+                continue;
+            }
+
             // Fill the buffer with activity records for launched kernels
             size_t offset = 0;
             size_t recordSize = sizeof(CUpti_ActivityKernel5);
diff --git a/vendor/opentelemetry-ebpf-profiler b/vendor/opentelemetry-ebpf-profiler
index d13351c..96c49d2 160000
--- a/vendor/opentelemetry-ebpf-profiler
+++ b/vendor/opentelemetry-ebpf-profiler
@@ -1 +1 @@
-Subproject commit d13351cf54ac5cad54bbf313f321539f4ba9f70b
+Subproject commit 96c49d2f29166767c8847260d489b63bc2d4bdbd

From fea1c74a276fcfdfb12f81bfa22f6a5f71432a12 Mon Sep 17 00:00:00 2001
From: Tommy Reilly <gnurizen@gmail.com>
Date: Mon, 6 Apr 2026 12:57:19 -0500
Subject: [PATCH 13/14] PC sampling: probabilistic windowed start/stop with
 KERNEL_SERIALIZED mode
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Implement interval-gated probabilistic PC sampling that only serializes
kernels during active sampling windows, not for the entire process lifetime.

Architecture:
- CUPTI lifecycle: enable (once) → start/stop (per window) → disable (once)
- Enable START_STOP_CONTROL attribute so start/stop work from CUPTI callbacks
- Collection mode is KERNEL_SERIALIZED for per-kernel correlation
- Probabilistic window: every PARCAGPU_PC_SAMPLING_INTERVAL seconds, roll a
  PARCAGPU_PC_SAMPLING_PROBABILITY die; if it hits, start sampling until the
  window closes, then stop and drain data
- start()/stop() are mutex-guarded and idempotent (no double-start/stop races)
- ctxSynchronize before start to satisfy CUPTI's GPU-idle requirement

Key changes:
- pc_sampling.cpp: Session-based enable with per-window start/stop, semaphore-
  gated stall reason map replay (replaces rate-limited emission), CUPTI 12.4
  ABI version check (v22 correlationId boundary), graceful permission failure
  handling in enable
- cupti.cpp: Probabilistic window state machine in ENTER/EXIT callbacks,
  env var config (probability, interval), env_config validation
- probes.d: Add error USDT probe for surfacing CUPTI failures to BPF
- test/mock_cupti.c: Full PC sampling mock with real cubin from pc_sample_toy,
  real SASS offsets for source-line correlation, 11-entry sample table cycling
  through shmem_bounce/hash_churn/trig_storm kernels
- test/mock_cuda.c: Add cuCtxSynchronize stub
- test/test-pc-mock.sh: New GPU-less test using mock libs and real cubin
- test/test-pc-real.sh: Set probability=1 interval=0.5 for reliable test hits
- test/bpf/: Move CUPTI struct defs to shared cupti_bpf.h, add error event
  handling, CUDA 12.4+ correlationId support
- test/CMakeLists.txt: Build mock CUDA driver library

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 CMakeLists.txt                 |   1 +
 Makefile                       |   6 +-
 src/cupti.cpp                  | 111 ++++++++++++-
 src/env_config.cpp             | 103 ++++++++++++
 src/env_config.h               |  16 ++
 src/pc_sampling.cpp            | 214 +++++++++++++++++++------
 src/pc_sampling.h              |  35 ++++-
 src/probes.d                   |   1 +
 test/CMakeLists.txt            |   9 ++
 test/bpf/activity_parser.bpf.c |  67 ++++----
 test/bpf/main.go               |  20 +++
 test/mock_cuda.c               |   5 +
 test/mock_cupti.c              | 278 +++++++++++++++++++++++++++++++++
 test/pc_sample_toy.cubin       | Bin 0 -> 39864 bytes
 test/test-pc-mock.sh           | 129 +++++++++++++++
 test/test-pc-real.sh           |   7 +-
 test/test_cupti_prof.c         |  10 ++
 17 files changed, 918 insertions(+), 94 deletions(-)
 create mode 100644 src/env_config.cpp
 create mode 100644 src/env_config.h
 create mode 100644 test/pc_sample_toy.cubin
 create mode 100755 test/test-pc-mock.sh

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 10aef9a..be64c53 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -86,6 +86,7 @@ add_library(parcagpucupti SHARED
     src/cupti.cpp
     src/pc_sampling.cpp
     src/correlation_filter.cpp
+    src/env_config.cpp
     ${PROTON_SOURCES}
 )
 
diff --git a/Makefile b/Makefile
index cf30250..9408e80 100644
--- a/Makefile
+++ b/Makefile
@@ -1,4 +1,4 @@
-.PHONY: all clean test build-amd64 build-arm64 build-all cross docker-push docker-test-build docker-test-run format local debug bpf-test microbenchmarks test-multi test-pc-real
+.PHONY: all clean test build-amd64 build-arm64 build-all cross docker-push docker-test-build docker-test-run format local debug bpf-test microbenchmarks test-multi test-pc-real test-pc-mock
 
 LIB_NAME = libparcagpucupti.so
 
@@ -151,6 +151,10 @@ test-multi: local bpf-test
 test-pc-real: local bpf-test microbenchmarks
 	sudo -E test/test-pc-real.sh
 
+# Mock PC sampling test — no GPU required, uses mock CUPTI/CUDA.
+test-pc-mock: local bpf-test
+	sudo -E test/test-pc-mock.sh
+
 format:
 	@echo "=== Formatting source files ==="
 	@clang-format -i -style=file src/*.cpp src/*.h test/*.c
diff --git a/src/cupti.cpp b/src/cupti.cpp
index a40448c..51bd1f5 100644
--- a/src/cupti.cpp
+++ b/src/cupti.cpp
@@ -15,6 +15,7 @@
 #include "Profiler/Cupti/CuptiCallbacks.h"
 #include "Utility/Singleton.h"
 #include "correlation_filter.h"
+#include "env_config.h"
 #include "pc_sampling.h"
 #include "token_bucket.h"
 
@@ -38,6 +39,51 @@ thread_local uint32_t runtimeEnterCorrelationId = 0;
 // configurable via PARCAGPU_RATE_LIMIT).
 thread_local TokenBucket callbackLimiter(100.0);
 
+// ---------------------------------------------------------------------------
+// PC sampling probabilistic control.
+// Sampling is gated by an interval + probability: at most once per interval,
+// roll the probability die; if it hits, sample all kernels until the interval
+// window closes.
+// ---------------------------------------------------------------------------
+
+// Config — read from env at startup. Stored in a struct so it can be made
+// runtime-adjustable later (e.g. via signal or control file).
+struct PCSamplingConfig {
+  double probability;  // PARCAGPU_PC_SAMPLING_PROBABILITY (default 1/100)
+  uint64_t intervalNs; // PARCAGPU_PC_SAMPLING_INTERVAL in nanoseconds
+};
+static PCSamplingConfig g_pcSamplingConfig = {0.01, 1000000000ULL};
+
+// Per-thread sampling state.
+struct PCSamplingState {
+  bool active = false;        // Currently sampling
+  uint64_t windowStartNs = 0; // When the current window opened
+  uint64_t lastCheckNs = 0;   // Last time we checked the probability
+  unsigned int rngSeed = 0;   // Thread-local RNG state
+};
+thread_local PCSamplingState g_pcSamplingState;
+
+static uint64_t nowNs() {
+  struct timespec ts;
+  clock_gettime(CLOCK_MONOTONIC, &ts);
+  return (uint64_t)ts.tv_sec * 1000000000ULL + ts.tv_nsec;
+}
+
+// Seed the per-thread RNG lazily.
+static void ensureRngSeeded(PCSamplingState &s) {
+  if (s.rngSeed == 0) {
+    uint64_t t = nowNs();
+    s.rngSeed = (unsigned int)(t ^ (uintptr_t)&s);
+    if (s.rngSeed == 0)
+      s.rngSeed = 1;
+  }
+}
+
+static double threadRandom(PCSamplingState &s) {
+  ensureRngSeeded(s);
+  return (double)rand_r(&s.rngSeed) / RAND_MAX;
+}
+
 void init_debug() {
   static bool initialized = false;
   if (!initialized) {
@@ -50,6 +96,21 @@ void init_debug() {
         callbackLimiter.setRate(rate);
       }
     }
+
+    const char *probEnv = getenv("PARCAGPU_PC_SAMPLING_PROBABILITY");
+    if (probEnv) {
+      double p = atof(probEnv);
+      if (p > 0.0 && p <= 1.0)
+        g_pcSamplingConfig.probability = p;
+    }
+    const char *intervalEnv = getenv("PARCAGPU_PC_SAMPLING_INTERVAL");
+    if (intervalEnv) {
+      double s = atof(intervalEnv);
+      if (s > 0.0)
+        g_pcSamplingConfig.intervalNs = (uint64_t)(s * 1e9);
+    }
+
+    validateEnvVars();
     initialized = true;
   }
 }
@@ -86,7 +147,7 @@ class CuptiProfiler : public proton::Singleton<CuptiProfiler> {
     // Check if PC sampling is supported
     pcSamplingEnabled = parcagpu::PCSampling::isSupported();
     if (pcSamplingEnabled) {
-      DEBUG_PRINTF("[PARCAGPU] PC sampling enabled (continuous mode)\n");
+      DEBUG_PRINTF("[PARCAGPU] PC sampling enabled (serialized mode)\n");
     } else {
       DEBUG_PRINTF(
           "[PARCAGPU] PC sampling disabled, using kernel activity only\n");
@@ -369,9 +430,36 @@ class CuptiProfiler : public proton::Singleton<CuptiProfiler> {
           static_cast<const CUpti_CallbackData *>(cbdata_void);
       uint32_t correlationId = cbdata->correlationId;
 
-      if (domain == CUPTI_CB_DOMAIN_RUNTIME_API &&
-          cbdata->callbackSite == CUPTI_API_ENTER) {
-        runtimeEnterCorrelationId = correlationId;
+      // ENTER: manage probabilistic sampling windows.
+      // start() begins CUPTI PC sampling (kernels serialized).
+      // stop() ends it and drains data (kernels concurrent again).
+      if (cbdata->callbackSite == CUPTI_API_ENTER) {
+        if (domain == CUPTI_CB_DOMAIN_RUNTIME_API)
+          runtimeEnterCorrelationId = correlationId;
+
+        if (profiler.pcSamplingEnabled) {
+          auto &st = g_pcSamplingState;
+          auto &cfg = g_pcSamplingConfig;
+          uint64_t now = nowNs();
+
+          // If sampling and the window has closed, stop + drain.
+          if (st.active && (now - st.windowStartNs >= cfg.intervalNs)) {
+            profiler.pcSampling.stop(cbdata->context);
+            st.active = false;
+          }
+
+          // If not sampling, check interval then probability.
+          if (!st.active && (now - st.lastCheckNs >= cfg.intervalNs)) {
+            st.lastCheckNs = now;
+            if (threadRandom(st) < cfg.probability) {
+              st.active = true;
+              st.windowStartNs = now;
+              profiler.pcSampling.start(cbdata->context);
+            }
+          }
+
+          profiler.pcSampling.emitMetadata();
+        }
         return;
       }
 
@@ -380,11 +468,18 @@ class CuptiProfiler : public proton::Singleton<CuptiProfiler> {
         return;
       }
 
-      // Drain PC sampling data on kernel launch EXIT — the context from
-      // cbdata is always valid here, unlike in completeBuffer or at shutdown.
-      // Runs regardless of which USDT probes are attached.
+      // EXIT: stop sampling if the window has closed.
       if (profiler.pcSamplingEnabled) {
-        profiler.pcSampling.collectData(cbdata->context);
+        auto &st = g_pcSamplingState;
+        auto &cfg = g_pcSamplingConfig;
+        uint64_t now = nowNs();
+
+        if (st.active && (now - st.windowStartNs >= cfg.intervalNs)) {
+          profiler.pcSampling.stop(cbdata->context);
+          st.active = false;
+        }
+
+        profiler.pcSampling.emitMetadata();
       }
 
       // Skip correlation/rate-limiter work when no profiler is attached.
diff --git a/src/env_config.cpp b/src/env_config.cpp
new file mode 100644
index 0000000..7d0bac7
--- /dev/null
+++ b/src/env_config.cpp
@@ -0,0 +1,103 @@
+#include "env_config.h"
+#include "pc_sampling.h" // DEBUG_PRINTF, fireError
+#include "probes.h"
+
+#include <cstdlib>
+#include <cstring>
+
+extern char **environ;
+
+namespace parcagpu {
+
+// Known PARCAGPU_* environment variables.
+static const char *knownVars[] = {
+    "PARCAGPU_DEBUG",
+    "PARCAGPU_LIMITER_DISABLE",
+    "PARCAGPU_RATE_LIMIT",
+    "PARCAGPU_SAMPLING_FACTOR",
+    "PARCAGPU_PC_SAMPLING_PROBABILITY",
+    "PARCAGPU_PC_SAMPLING_INTERVAL",
+};
+static constexpr size_t numKnownVars =
+    sizeof(knownVars) / sizeof(knownVars[0]);
+
+static bool isKnown(const char *name, size_t nameLen) {
+  for (size_t i = 0; i < numKnownVars; ++i) {
+    if (std::strlen(knownVars[i]) == nameLen &&
+        std::strncmp(knownVars[i], name, nameLen) == 0)
+      return true;
+  }
+  return false;
+}
+
+void validateEnvVars() {
+  // Scan environment for unrecognized PARCAGPU_* variables.
+  static constexpr const char prefix[] = "PARCAGPU_";
+  static constexpr size_t prefixLen = sizeof(prefix) - 1;
+
+  for (char **ep = environ; *ep; ++ep) {
+    if (std::strncmp(*ep, prefix, prefixLen) != 0)
+      continue;
+
+    // Extract variable name (everything before '=').
+    const char *eq = std::strchr(*ep, '=');
+    size_t nameLen = eq ? (size_t)(eq - *ep) : std::strlen(*ep);
+
+    if (!isKnown(*ep, nameLen)) {
+      // Null-terminate for printing.
+      char nameBuf[128] = {};
+      size_t copyLen = nameLen < sizeof(nameBuf) - 1 ? nameLen : sizeof(nameBuf) - 1;
+      std::memcpy(nameBuf, *ep, copyLen);
+
+      DEBUG_PRINTF("[PARCAGPU] Warning: unrecognized env var '%s'\n", nameBuf);
+      fireError(0, nameBuf, "env_config: unrecognized variable");
+    }
+  }
+
+  // Validate specific variables.
+  const char *val;
+
+  val = std::getenv("PARCAGPU_RATE_LIMIT");
+  if (val) {
+    double rate = std::atof(val);
+    if (rate <= 0) {
+      DEBUG_PRINTF("[PARCAGPU] Warning: PARCAGPU_RATE_LIMIT=%s invalid "
+                   "(must be > 0), using default\n", val);
+      fireError(0, val, "env_config: PARCAGPU_RATE_LIMIT invalid");
+    }
+  }
+
+  val = std::getenv("PARCAGPU_SAMPLING_FACTOR");
+  if (val) {
+    int factor = std::atoi(val);
+    if (factor != 0 && (factor < 5 || factor > 31)) {
+      DEBUG_PRINTF("[PARCAGPU] Warning: PARCAGPU_SAMPLING_FACTOR=%s out of "
+                   "range [0, 5-31], using default\n", val);
+      fireError(0, val, "env_config: PARCAGPU_SAMPLING_FACTOR out of range");
+    }
+  }
+
+  val = std::getenv("PARCAGPU_PC_SAMPLING_PROBABILITY");
+  if (val) {
+    double p = std::atof(val);
+    if (p <= 0.0 || p > 1.0) {
+      DEBUG_PRINTF("[PARCAGPU] Warning: PARCAGPU_PC_SAMPLING_PROBABILITY=%s "
+                   "invalid (must be in (0, 1]), using default\n", val);
+      fireError(0, val,
+                "env_config: PARCAGPU_PC_SAMPLING_PROBABILITY invalid");
+    }
+  }
+
+  val = std::getenv("PARCAGPU_PC_SAMPLING_INTERVAL");
+  if (val) {
+    double s = std::atof(val);
+    if (s <= 0.0) {
+      DEBUG_PRINTF("[PARCAGPU] Warning: PARCAGPU_PC_SAMPLING_INTERVAL=%s "
+                   "invalid (must be > 0), using default\n", val);
+      fireError(0, val,
+                "env_config: PARCAGPU_PC_SAMPLING_INTERVAL invalid");
+    }
+  }
+}
+
+} // namespace parcagpu
diff --git a/src/env_config.h b/src/env_config.h
new file mode 100644
index 0000000..2075c85
--- /dev/null
+++ b/src/env_config.h
@@ -0,0 +1,16 @@
+#ifndef PARCAGPU_ENV_CONFIG_H_
+#define PARCAGPU_ENV_CONFIG_H_
+
+namespace parcagpu {
+
+// Scan the process environment for all PARCAGPU_* variables.
+// Warns (via DEBUG_PRINTF and the error probe) about unrecognized names.
+// Validates types and ranges for known variables; on invalid values, fires
+// the error probe, prints a debug warning, and uses the default.
+//
+// Call once at startup (e.g. from init_debug).
+void validateEnvVars();
+
+} // namespace parcagpu
+
+#endif // PARCAGPU_ENV_CONFIG_H_
diff --git a/src/pc_sampling.cpp b/src/pc_sampling.cpp
index 2e3fa3a..230b494 100644
--- a/src/pc_sampling.cpp
+++ b/src/pc_sampling.cpp
@@ -11,13 +11,13 @@
 
 namespace parcagpu {
 
-// CUDA driver version for 12.8.1 (when continuous PC sampling became stable)
+// CUDA driver version for 12.8.1 (minimum for PC sampling)
 // Version format: major * 1000 + minor * 10 + patch
 #define CUDA_VERSION_12_8_1 12081
 
-// Debug logging (reuse from main file)
-extern bool debug_enabled;
-extern void init_debug();
+// CUPTI version that added correlationId to CUpti_PCSamplingPCData,
+// breaking ABI compatibility.
+#define CUPTI_CUDA12_4_VERSION 22
 
 // noinline wrappers so each USDT probe has exactly one call site in the
 // binary.  Multiple call sites produce multiple .note.stapsdt entries with
@@ -64,22 +64,34 @@ void enablePCSampling(CUcontext context) {
   proton::cupti::pcSamplingEnable<true>(&params);
 }
 
-void startPCSampling(CUcontext context) {
+bool startPCSampling(CUcontext context) {
+  // CUPTI requires the GPU to be idle before starting PC sampling.
+  proton::cuda::ctxSynchronize<false>();
   CUpti_PCSamplingStartParams params = {
       /*size=*/CUpti_PCSamplingStartParamsSize,
       /*pPriv=*/NULL,
       /*ctx=*/context,
   };
-  proton::cupti::pcSamplingStart<true>(&params);
+  auto ret = proton::cupti::pcSamplingStart<false>(&params);
+  if (ret != CUPTI_SUCCESS) {
+    DEBUG_PRINTF("cuptiPCSamplingStart failed: %d\n", ret);
+    return false;
+  }
+  return true;
 }
 
-void stopPCSampling(CUcontext context) {
+bool stopPCSampling(CUcontext context) {
   CUpti_PCSamplingStopParams params = {
       /*size=*/CUpti_PCSamplingStopParamsSize,
       /*pPriv=*/NULL,
       /*ctx=*/context,
   };
-  proton::cupti::pcSamplingStop<true>(&params);
+  auto ret = proton::cupti::pcSamplingStop<false>(&params);
+  if (ret != CUPTI_SUCCESS) {
+    DEBUG_PRINTF("cuptiPCSamplingStop failed: %d\n", ret);
+    return false;
+  }
+  return true;
 }
 
 void disablePCSampling(CUcontext context) {
@@ -333,10 +345,19 @@ CUpti_PCSamplingConfigurationInfo ConfigureData::configureCollectionMode() {
   collectionModeInfo.attributeType =
       CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_COLLECTION_MODE;
   collectionModeInfo.attributeData.collectionModeData.collectionMode =
-      CUPTI_PC_SAMPLING_COLLECTION_MODE_CONTINUOUS;
+      CUPTI_PC_SAMPLING_COLLECTION_MODE_KERNEL_SERIALIZED;
   return collectionModeInfo;
 }
 
+CUpti_PCSamplingConfigurationInfo ConfigureData::configureStartStopControl() {
+  CUpti_PCSamplingConfigurationInfo startStopControlInfo{};
+  startStopControlInfo.attributeType =
+      CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_ENABLE_START_STOP_CONTROL;
+  startStopControlInfo.attributeData.enableStartStopControlData
+      .enableStartStopControl = true;
+  return startStopControlInfo;
+}
+
 void ConfigureData::initialize(CUcontext context) {
   this->context = context;
   proton::cupti::getContextId<true>(context, &contextId);
@@ -346,6 +367,7 @@ void ConfigureData::initialize(CUcontext context) {
 
   configurationInfos.emplace_back(configureStallReasons());
   configurationInfos.emplace_back(configureCollectionMode());
+  configurationInfos.emplace_back(configureStartStopControl());
   configurationInfos.emplace_back(configureSamplingBuffer());
   // Don't set sampling period — let CUPTI use its default.
   // Explicit period values silently break sampling on some GPUs (e.g.
@@ -366,17 +388,10 @@ void ConfigureData::initialize(CUcontext context) {
 // GPUPCSampling implementation
 
 bool PCSampling::isSupported() {
-  // PARCAGPU_SAMPLING_FACTOR must be set to enable PC sampling.
-  // Any non-zero value enables it; 0 disables.
-  // If unset, PC sampling is disabled.
+  // PC sampling is enabled by default.
+  // Set PARCAGPU_SAMPLING_FACTOR=0 to disable.
   const char *env = getenv("PARCAGPU_SAMPLING_FACTOR");
-  if (!env) {
-    DEBUG_PRINTF(
-        "PC sampling not enabled (PARCAGPU_SAMPLING_FACTOR not set)\n");
-    return false;
-  }
-  int factor = atoi(env);
-  if (factor == 0) {
+  if (env && atoi(env) == 0) {
     DEBUG_PRINTF("PC sampling disabled via PARCAGPU_SAMPLING_FACTOR=0\n");
     return false;
   }
@@ -392,14 +407,46 @@ bool PCSampling::isSupported() {
     DEBUG_PRINTF("PC sampling not supported: CUDA driver version %d.%d.%d < "
                  "required 12.8.1\n",
                  major, minor, patch);
+    fireError(driverVersion,
+              "CUDA driver version too low for PC sampling (need >= 12.8.1)",
+              "pc_sampling");
+    return false;
+  }
+
+  // Check CUPTI API/driver version compatibility.
+  // CUPTI 12.4 (v22) added correlationId to CUpti_PCSamplingPCData, breaking
+  // ABI. Mixing compile-time and runtime versions across this boundary crashes.
+  uint32_t cuptiVersion = 0;
+  proton::cupti::getVersion<true>(&cuptiVersion);
+
+  if ((cuptiVersion < CUPTI_CUDA12_4_VERSION &&
+       CUPTI_API_VERSION >= CUPTI_CUDA12_4_VERSION) ||
+      (cuptiVersion >= CUPTI_CUDA12_4_VERSION &&
+       CUPTI_API_VERSION < CUPTI_CUDA12_4_VERSION)) {
+    DEBUG_PRINTF(
+        "PC sampling disabled: CUPTI API version %d and driver version %d "
+        "are incompatible across the 12.4 (v22) ABI boundary\n",
+        CUPTI_API_VERSION, cuptiVersion);
+    fireError((int32_t)cuptiVersion,
+              "CUPTI API/driver version mismatch (12.4 ABI boundary)",
+              "pc_sampling");
     return false;
   }
 
+  // Attempt a lightweight permission probe. CUPTI PC sampling requires
+  // either root, CAP_SYS_ADMIN, or the NVIDIA module parameter
+  // NVreg_RestrictProfilingToAdminUsers=0.
+  // We cannot easily pre-check permissions without attempting CUPTI calls,
+  // so we defer the real check to initialize() where enablePCSampling()
+  // will fail with a CUPTI error if permissions are insufficient.
+  // TODO: Add explicit permission pre-check.
+  // Reference: https://developer.nvidia.com/nvidia-development-tools-solutions-err_nvgpuctrperm-permission-issue-performance-counters
+
   int major = driverVersion / 1000;
   int minor = (driverVersion % 1000) / 10;
   int patch = driverVersion % 10;
-  DEBUG_PRINTF("PC sampling supported: CUDA driver version %d.%d.%d\n", major,
-               minor, patch);
+  DEBUG_PRINTF("PC sampling supported: CUDA %d.%d.%d, CUPTI v%u (API v%d)\n",
+               major, minor, patch, cuptiVersion, CUPTI_API_VERSION);
   return true;
 }
 
@@ -416,9 +463,35 @@ void PCSampling::initialize(CUcontext context) {
   proton::cupti::getContextId<true>(context, &contextId);
 
   doubleCheckedLock(
-      [&]() { return !contextInitialized.contain(contextId); }, contextMutex,
       [&]() {
-        enablePCSampling(context);
+        return !contextInitialized.contain(contextId) &&
+               !contextFailed.contain(contextId);
+      },
+      contextMutex,
+      [&]() {
+        // enablePCSampling can fail due to insufficient permissions
+        // (ERR_NVGPUCTRPERM). Catch and degrade gracefully.
+        CUpti_PCSamplingEnableParams enableParams = {
+            /*size=*/CUpti_PCSamplingEnableParamsSize,
+            /*pPriv=*/NULL,
+            /*ctx=*/context,
+        };
+        auto result =
+            proton::cupti::pcSamplingEnable<false>(&enableParams);
+        if (result != CUPTI_SUCCESS) {
+          DEBUG_PRINTF(
+              "Failed to enable PC sampling for context %u: CUPTI error %d\n"
+              "This may be a permission issue. See:\n"
+              "https://developer.nvidia.com/nvidia-development-tools-solutions-"
+              "err_nvgpuctrperm-permission-issue-performance-counters\n",
+              contextId, result);
+          fireError((int32_t)result,
+                    "Failed to enable PC sampling (possible permission issue)",
+                    "pc_sampling");
+          contextFailed.insert(contextId);
+          return;
+        }
+
         auto *configData = getConfigureData(contextId);
         configData->initialize(context);
 
@@ -429,11 +502,39 @@ void PCSampling::initialize(CUcontext context) {
 
         contextInitialized.insert(contextId);
         initializedContextIds.push_back(contextId);
-        DEBUG_PRINTF("PC sampling started in continuous mode for context %u\n",
-                     contextId);
+        DEBUG_PRINTF(
+            "PC sampling initialized (serialized mode) for context %u\n",
+            contextId);
       });
 }
 
+void PCSampling::start(CUcontext context) {
+  std::lock_guard<std::mutex> lock(pcSamplingMutex);
+  if (samplingActive)
+    return;
+  if (startPCSampling(context)) {
+    samplingActive = true;
+    samplingContext = context;
+    DEBUG_PRINTF("PC sampling started (kernels serialized)\n");
+  }
+}
+
+void PCSampling::stop(CUcontext context) {
+  std::lock_guard<std::mutex> lock(pcSamplingMutex);
+  if (!samplingActive)
+    return;
+  stopPCSampling(context);
+  samplingActive = false;
+  DEBUG_PRINTF("PC sampling stopped (kernels concurrent)\n");
+  // Drain data collected during this window.
+  collectData(context);
+}
+
+__attribute__((noinline)) void fireError(int32_t code, const char *message,
+                                         const char *component) {
+  PARCAGPU_ERROR(code, message, component);
+}
+
 void PCSampling::processPCSamplingData(ConfigureData *configureData) {
   auto *pcSamplingData = &configureData->outputData;
 
@@ -499,27 +600,17 @@ void PCSampling::processPCSamplingData(ConfigureData *configureData) {
   }
 }
 
-void PCSampling::collectData(CUcontext context) {
-  uint32_t contextId = 0;
-  proton::cupti::getContextId<true>(context, &contextId);
-
-  if (!contextInitialized.contain(contextId)) {
-    DEBUG_PRINTF("Context %u not initialized, skipping data collection\n",
-                 contextId);
-    return;
-  }
-
-  auto *configureData = getConfigureData(contextId);
-  DEBUG_PRINTF("Collecting PC sampling data for context %u (cfg total=%zu "
-               "remaining=%zu)\n",
-               contextId, configureData->pcSamplingData.totalNumPcs,
-               configureData->pcSamplingData.remainingNumPcs);
-
-  // Re-emit stall reason map periodically so the profiler backend can
-  // join stall reason indices to human-readable names.
-  if (stallReasonMap.data() && stallReasonMapLimiter.tryAcquire()) {
-    PARCAGPU_STALL_REASON_MAP(stallReasonMap.data(),
-                              stallReasonMap.numEntries());
+void PCSampling::emitMetadata() {
+  // Re-emit stall reason map when a tracer attaches (semaphore transitions
+  // to non-zero), using the same pattern as cubin replay.
+  if (stallReasonMap.data() && PARCAGPU_STALL_REASON_MAP_ENABLED()) {
+    if (!stallMapEmitted) {
+      PARCAGPU_STALL_REASON_MAP(stallReasonMap.data(),
+                                stallReasonMap.numEntries());
+      stallMapEmitted = true;
+    }
+  } else {
+    stallMapEmitted = false;
   }
 
   // Replay cubin_loaded probes for late-attaching tracers.
@@ -537,6 +628,23 @@ void PCSampling::collectData(CUcontext context) {
     // Tracer detached — reset so we replay again on next attach.
     cubinsEmitted = false;
   }
+}
+
+void PCSampling::collectData(CUcontext context) {
+  uint32_t contextId = 0;
+  proton::cupti::getContextId<true>(context, &contextId);
+
+  if (!contextInitialized.contain(contextId)) {
+    DEBUG_PRINTF("Context %u not initialized, skipping data collection\n",
+                 contextId);
+    return;
+  }
+
+  auto *configureData = getConfigureData(contextId);
+  DEBUG_PRINTF("Collecting PC sampling data for context %u (cfg total=%zu "
+               "remaining=%zu)\n",
+               contextId, configureData->pcSamplingData.totalNumPcs,
+               configureData->pcSamplingData.remainingNumPcs);
 
   // Use the separate output buffer for getData — the configured
   // pcSamplingData buffer is owned by CUPTI.
@@ -570,8 +678,11 @@ void PCSampling::finalize(CUcontext context) {
   uint32_t contextId = 0;
   proton::cupti::getContextId<true>(context, &contextId);
 
-  if (!contextInitialized.contain(contextId))
+  if (!contextInitialized.contain(contextId)) {
+    // Clean up failed context tracking if applicable.
+    contextFailed.erase(contextId);
     return;
+  }
 
   // Hold contextMutex for the entire finalize to prevent collectAllData
   // from racing with us (it iterates initializedContextIds under this lock).
@@ -585,6 +696,15 @@ void PCSampling::finalize(CUcontext context) {
                                           contextId),
                               initializedContextIds.end());
 
+  // Stop sampling if it was started on this context.
+  {
+    std::lock_guard<std::mutex> lock2(pcSamplingMutex);
+    if (samplingActive) {
+      stopPCSampling(context);
+      samplingActive = false;
+    }
+  }
+
   // Drain remaining data before disabling
   auto *configureData = getConfigureData(contextId);
   processPCSamplingData(configureData);
diff --git a/src/pc_sampling.h b/src/pc_sampling.h
index bd237cd..0bcd56d 100644
--- a/src/pc_sampling.h
+++ b/src/pc_sampling.h
@@ -10,7 +10,6 @@
 #include <vector>
 
 #include "stall_reason_map.h"
-#include "token_bucket.h"
 
 #include <cuda.h>
 #include <cupti.h>
@@ -33,6 +32,10 @@
 
 namespace parcagpu {
 
+// Debug logging control — defined in cupti.cpp.
+extern bool debug_enabled;
+extern void init_debug();
+
 // Use Proton's CubinData directly
 using proton::CubinData;
 
@@ -67,6 +70,7 @@ struct ConfigureData {
   CUpti_PCSamplingConfigurationInfo configureScratchBuffer();
   CUpti_PCSamplingConfigurationInfo configureHardwareBufferSize();
   CUpti_PCSamplingConfigurationInfo configureCollectionMode();
+  CUpti_PCSamplingConfigurationInfo configureStartStopControl();
 
   // Buffer size constants (from Proton)
   static constexpr size_t HardwareBufferSize = 128 * 1024 * 1024;
@@ -94,10 +98,24 @@ class PCSampling {
   PCSampling() = default;
   ~PCSampling() = default;
 
-  // Check if PC sampling is supported (CUPTI >= 12.8.1)
+  // Check if PC sampling is supported (CUPTI >= 12.8.1).
+  // Enabled by default; set PARCAGPU_SAMPLING_FACTOR=0 to disable.
   static bool isSupported();
 
   void initialize(CUcontext context);
+
+  // Start PC sampling — kernels become serialized until stop().
+  // No-op if already started. Thread-safe.
+  void start(CUcontext context);
+
+  // Stop PC sampling, drain accumulated data, and emit probes.
+  // Kernels resume concurrent execution. No-op if not started.
+  void stop(CUcontext context);
+
+  // Emit stall reason map and replay cubin probes for late-attaching tracers.
+  // Call periodically regardless of sampling state.
+  void emitMetadata();
+
   void collectData(CUcontext context);
   void collectAllData();
   void finalize(CUcontext context);
@@ -113,11 +131,17 @@ class PCSampling {
   proton::ThreadSafeMap<size_t, std::pair<CubinData, size_t>>
       cubinCrcToCubinData;
   proton::ThreadSafeSet<uint32_t> contextInitialized;
+  proton::ThreadSafeSet<uint32_t> contextFailed; // contexts where enable failed
 
   // Plain vector of initialized context IDs for iteration in collectAllData.
   // Protected by contextMutex.
   std::vector<uint32_t> initializedContextIds;
 
+  // Tracks whether CUPTI PC sampling is currently active (start/stop).
+  // Only one context can be sampling at a time in KERNEL_SERIALIZED mode.
+  std::atomic<bool> samplingActive{false};
+  CUcontext samplingContext{};
+  std::mutex pcSamplingMutex{};
   std::mutex contextMutex{};
 
   // Contiguous stall reason map for USDT probe emission.
@@ -136,11 +160,12 @@ class PCSampling {
   // tracer. Reset to false when the cubin_loaded semaphore transitions to
   // non-zero.
   bool cubinsEmitted = false;
-
-  // Rate limiter for stall reason map re-emission (1 token per 10 seconds).
-  TokenBucket stallReasonMapLimiter{0.1};
+  bool stallMapEmitted = false;
 };
 
+// Fire the error USDT probe. Callable from any translation unit.
+void fireError(int32_t code, const char *message, const char *component);
+
 } // namespace parcagpu
 
 #endif // PARCAGPU_PC_SAMPLING_H_
diff --git a/src/probes.d b/src/probes.d
index f5cb3a8..dba3db4 100644
--- a/src/probes.d
+++ b/src/probes.d
@@ -10,4 +10,5 @@ provider parcagpu {
   probe stall_reason_map(const char *names, uint32_t count);
   probe cubin_loaded(uint64_t cubinCrc, const char *cubin, uint64_t cubinSize);
   probe cubin_unloaded(uint64_t cubinCrc);
+  probe error(int32_t code, const char *message, const char *component);
 };
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index cf805ef..8a0a46b 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -20,6 +20,15 @@ set_target_properties(cupti PROPERTIES
     POSITION_INDEPENDENT_CODE ON
 )
 
+# Mock CUDA driver library
+add_library(cuda SHARED mock_cuda.c)
+target_include_directories(cuda PRIVATE ${CUDA_INCLUDE_DIRS})
+set_target_properties(cuda PROPERTIES
+    C_STANDARD 11
+    C_STANDARD_REQUIRED ON
+    POSITION_INDEPENDENT_CODE ON
+)
+
 # CUDA major version - prefer explicitly passed value, fall back to detection
 if(NOT DEFINED CUDA_MAJOR_VERSION)
     execute_process(
diff --git a/test/bpf/activity_parser.bpf.c b/test/bpf/activity_parser.bpf.c
index 9396941..2327842 100644
--- a/test/bpf/activity_parser.bpf.c
+++ b/test/bpf/activity_parser.bpf.c
@@ -22,12 +22,10 @@
 
 #include "usdt_args.h"
 
-#include "cupti_activity_bpf.h"
+#include "cupti_bpf.h"
 
 #define MAX_BATCH_SIZE 128
 #define MAX_KERNEL_NAME 128
-#define STALL_REASON_NAME_LEN 64
-#define MAX_STALL_REASONS 64
 #define MAX_CUBIN_SIZE (64 * 1024 * 1024) // 64MB safety cap
 
 // USDT spec map — populated by Go loader before uprobe attachment.
@@ -67,32 +65,7 @@ struct cubin_event {
 #define EVENT_TYPE_CUBIN_LOADED 2
 #define EVENT_TYPE_CUBIN_UNLOADED 3
 #define EVENT_TYPE_PC_SAMPLE 4
-
-// Matches CUpti_PCSamplingStallReason (packed, aligned 8).
-struct cupti_stall_reason {
-  u32 stall_reason_index;
-  u32 samples;
-};
-
-// Matches CUpti_PCSamplingPCData (packed, aligned 8).
-// Contains user-space pointers that BPF chases with bpf_probe_read_user.
-// We read the base 56-byte struct, then conditionally read correlationId
-// if the size field indicates CUDA 13+ (size > 56).
-struct cupti_pc_data {
-  u64 size;               // struct size (56 = CUDA 12, 60+ = CUDA 13)
-  u64 cubin_crc;
-  u64 pc_offset;
-  u32 function_index;
-  u32 _pad;
-  u64 function_name_ptr;  // const char* in user-space
-  u64 stall_reason_count;
-  u64 stall_reason_ptr;   // CUpti_PCSamplingStallReason* in user-space
-} __attribute__((__packed__)) __attribute__((aligned(8)));
-
-#define CUPTI_PC_DATA_BASE_SIZE 56
-
-#define MAX_PC_BATCH_SIZE 512
-#define MAX_FUNC_NAME 128
+#define EVENT_TYPE_ERROR 5
 
 // PC sample event sent to user-space.
 struct pc_sample_event {
@@ -101,11 +74,21 @@ struct pc_sample_event {
   u64 cubin_crc;
   u64 pc_offset;
   u32 function_index;
-  u32 correlation_id; // kernel correlation ID (CUDA 13+ serialized mode, else 0)
+  u32 correlation_id; // kernel correlation ID (CUDA 12.4+ / CUPTI v22+, else 0)
   char function_name[MAX_FUNC_NAME];
   struct cupti_stall_reason stall_reasons[MAX_STALL_REASONS];
 };
 
+// Error event sent to user-space.
+#define MAX_ERROR_MSG 256
+#define MAX_ERROR_COMPONENT 64
+struct error_event {
+  u32 event_type; // EVENT_TYPE_ERROR
+  s32 error_code;
+  char message[MAX_ERROR_MSG];
+  char component[MAX_ERROR_COMPONENT];
+};
+
 // Ring buffer for sending events to user-space.
 struct bpf_map_def events SEC("maps") = {
     .type = BPF_MAP_TYPE_RINGBUF,
@@ -327,7 +310,7 @@ int BPF_USDT(handle_pc_sample_batch, u64 ptrs_base, u32 count) {
     evt->pc_offset = rec.pc_offset;
     evt->function_index = rec.function_index;
 
-    // Read correlationId if the struct is large enough (CUDA 13+).
+    // Read correlationId if the struct is large enough (CUDA 12.4+).
     // It sits right after the stallReason pointer at offset 56.
     evt->correlation_id = 0;
     if (rec.size > CUPTI_PC_DATA_BASE_SIZE) {
@@ -363,4 +346,26 @@ int BPF_USDT(handle_pc_sample_batch, u64 ptrs_base, u32 count) {
   return 0;
 }
 
+SEC("usdt/parcagpu/error")
+int BPF_USDT(handle_error, s32 code, u64 message_ptr, u64 component_ptr) {
+  struct error_event *evt = bpf_ringbuf_reserve(&events, sizeof(*evt), 0);
+  if (!evt) {
+    bump_stat(STAT_DROPS);
+    return 0;
+  }
+
+  evt->event_type = EVENT_TYPE_ERROR;
+  evt->error_code = code;
+  evt->message[0] = '\0';
+  evt->component[0] = '\0';
+  if (message_ptr)
+    bpf_probe_read_user_str(evt->message, sizeof(evt->message),
+                            (void *)message_ptr);
+  if (component_ptr)
+    bpf_probe_read_user_str(evt->component, sizeof(evt->component),
+                            (void *)component_ptr);
+  bpf_ringbuf_submit(evt, 0);
+  return 0;
+}
+
 char LICENSE[] SEC("license") = "GPL";
diff --git a/test/bpf/main.go b/test/bpf/main.go
index e6befbd..3a55551 100644
--- a/test/bpf/main.go
+++ b/test/bpf/main.go
@@ -49,6 +49,7 @@ const (
 	eventTypeCubinLoaded   = 2
 	eventTypeCubinUnloaded = 3
 	eventTypePCSample      = 4
+	eventTypeError         = 5
 )
 
 // KernelEvent matches struct kernel_event in the BPF program.
@@ -80,6 +81,14 @@ type StallReason struct {
 	Samples uint32
 }
 
+// ErrorEvent matches struct error_event in the BPF program.
+type ErrorEvent struct {
+	EventType uint32
+	ErrorCode int32
+	Message   [256]byte
+	Component [64]byte
+}
+
 // PCSampleEvent matches struct pc_sample_event in the BPF program.
 type PCSampleEvent struct {
 	EventType        uint32
@@ -438,6 +447,7 @@ func main() {
 		{"cubin_loaded", objs.HandleCubinLoaded},
 		{"cubin_unloaded", objs.HandleCubinUnloaded},
 		{"pc_sample_batch", objs.HandlePcSampleBatch},
+		{"error", objs.HandleError},
 	}
 
 	var links []link.Link
@@ -585,6 +595,16 @@ func main() {
 			}
 			cubins.unload(event.CubinCRC)
 
+		case eventTypeError:
+			var event ErrorEvent
+			if err := binary.Read(bytes.NewReader(raw), binary.LittleEndian, &event); err != nil {
+				log.Printf("Parsing error event: %v", err)
+				continue
+			}
+			msg := cString(event.Message[:])
+			comp := cString(event.Component[:])
+			log.Printf("ERROR [%s] code=%d: %s", comp, event.ErrorCode, msg)
+
 		case eventTypePCSample:
 			var event PCSampleEvent
 			if err := binary.Read(bytes.NewReader(raw), binary.LittleEndian, &event); err != nil {
diff --git a/test/mock_cuda.c b/test/mock_cuda.c
index 09caa88..a512637 100644
--- a/test/mock_cuda.c
+++ b/test/mock_cuda.c
@@ -24,3 +24,8 @@ CUresult cuDriverGetVersion(int *driverVersion) {
 
     return CUDA_SUCCESS;
 }
+
+// Mock cuCtxSynchronize — no-op in test (no real GPU work to wait for)
+CUresult cuCtxSynchronize(void) {
+    return CUDA_SUCCESS;
+}
diff --git a/test/mock_cupti.c b/test/mock_cupti.c
index b4901f1..01092fd 100644
--- a/test/mock_cupti.c
+++ b/test/mock_cupti.c
@@ -1,7 +1,10 @@
 #include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
 #include <stdint.h>
 #include <cuda.h>
 #include <cupti.h>
+#include <cupti_pcsampling.h>
 
 // Define callback function types if not already defined by CUPTI headers
 #ifndef CUpti_BufferRequestFunc
@@ -146,3 +149,278 @@ CUptiResult cuptiUnsubscribe(CUpti_SubscriberHandle subscriber) {
     fprintf(stderr, "[MOCK_CUPTI] cuptiUnsubscribe()\n");
     return CUPTI_SUCCESS;
 }
+
+// =========================================================================
+// PC Sampling mock — uses a real cubin from pc_sample_toy for realistic
+// CRC, offsets, and source-line correlation.
+// =========================================================================
+
+// Real cubin loaded from file (set MOCK_CUBIN_PATH, or auto-detected).
+static char *__cubin_data = NULL;
+static size_t __cubin_size = 0;
+static uint64_t __cubin_crc = 0;
+
+// CRC function shared between cuptiGetCubinCrc and internal use.
+static uint64_t __compute_crc(const void *data, size_t size) {
+    const uint8_t *bytes = (const uint8_t *)data;
+    uint64_t crc = 0xDEADBEEFULL;
+    for (size_t i = 0; i < size; i++)
+        crc = crc * 31 + bytes[i];
+    return crc;
+}
+
+static void __load_cubin(void) {
+    if (__cubin_data)
+        return;
+    const char *path = getenv("MOCK_CUBIN_PATH");
+    if (!path) {
+        fprintf(stderr, "[MOCK_CUPTI] MOCK_CUBIN_PATH not set, no cubin loaded\n");
+        return;
+    }
+    FILE *f = fopen(path, "rb");
+    if (!f) {
+        fprintf(stderr, "[MOCK_CUPTI] Failed to open cubin: %s\n", path);
+        return;
+    }
+    fseek(f, 0, SEEK_END);
+    __cubin_size = (size_t)ftell(f);
+    fseek(f, 0, SEEK_SET);
+    __cubin_data = (char *)malloc(__cubin_size);
+    if (fread(__cubin_data, 1, __cubin_size, f) != __cubin_size) {
+        fprintf(stderr, "[MOCK_CUPTI] Short read on cubin: %s\n", path);
+        free(__cubin_data);
+        __cubin_data = NULL;
+        __cubin_size = 0;
+        fclose(f);
+        return;
+    }
+    fclose(f);
+    __cubin_crc = __compute_crc(__cubin_data, __cubin_size);
+    fprintf(stderr, "[MOCK_CUPTI] Loaded cubin: %s (%zu bytes, crc=0x%lx)\n",
+            path, __cubin_size, __cubin_crc);
+}
+
+CUptiResult cuptiGetVersion(uint32_t *version) {
+    *version = 24;
+    return CUPTI_SUCCESS;
+}
+
+static int __pc_sampling_started = 0;
+
+CUptiResult cuptiPCSamplingEnable(CUpti_PCSamplingEnableParams *params) {
+    (void)params;
+    fprintf(stderr, "[MOCK_CUPTI] cuptiPCSamplingEnable()\n");
+    return CUPTI_SUCCESS;
+}
+
+CUptiResult cuptiPCSamplingDisable(CUpti_PCSamplingDisableParams *params) {
+    (void)params;
+    fprintf(stderr, "[MOCK_CUPTI] cuptiPCSamplingDisable()\n");
+    return CUPTI_SUCCESS;
+}
+
+CUptiResult cuptiPCSamplingStart(CUpti_PCSamplingStartParams *params) {
+    (void)params;
+    __pc_sampling_started = 1;
+    return CUPTI_SUCCESS;
+}
+
+CUptiResult cuptiPCSamplingStop(CUpti_PCSamplingStopParams *params) {
+    (void)params;
+    __pc_sampling_started = 0;
+    return CUPTI_SUCCESS;
+}
+
+CUptiResult cuptiPCSamplingSetConfigurationAttribute(
+    CUpti_PCSamplingConfigurationInfoParams *params) {
+    (void)params;
+    fprintf(stderr, "[MOCK_CUPTI] cuptiPCSamplingSetConfigurationAttribute()\n");
+    return CUPTI_SUCCESS;
+}
+
+CUptiResult cuptiPCSamplingGetNumStallReasons(
+    CUpti_PCSamplingGetNumStallReasonsParams *params) {
+    *params->numStallReasons = 3;
+    return CUPTI_SUCCESS;
+}
+
+static const char *__mock_stall_names[] = {
+    "smsp__pcsamp_warps_issue_stalled_not_selected",
+    "smsp__pcsamp_warps_issue_stalled_math_pipe_throttle",
+    "smsp__pcsamp_warps_issue_stalled_barrier",
+};
+
+CUptiResult cuptiPCSamplingGetStallReasons(
+    CUpti_PCSamplingGetStallReasonsParams *params) {
+    size_t n = params->numStallReasons < 3 ? params->numStallReasons : 3;
+    for (size_t i = 0; i < n; i++) {
+        strncpy(params->stallReasons[i], __mock_stall_names[i],
+                CUPTI_STALL_REASON_STRING_SIZE - 1);
+        params->stallReasons[i][CUPTI_STALL_REASON_STRING_SIZE - 1] = '\0';
+        params->stallReasonIndex[i] = (uint32_t)i;
+    }
+    return CUPTI_SUCCESS;
+}
+
+// ---------------------------------------------------------------------------
+// Synthetic PC samples using real offsets from pc_sample_toy.cu kernels.
+// Offsets extracted from nvdisasm -g on the sm_121 cubin.
+// ---------------------------------------------------------------------------
+
+typedef struct {
+    const char *functionName;
+    uint32_t functionIndex;
+    uint64_t pcOffset;
+    // Expected source correlation for this offset:
+    uint32_t lineNumber;
+    const char *fileName;
+    const char *dirName;
+} MockPCSample;
+
+// Representative offsets covering all three kernels and distinct source lines.
+// Offsets and line numbers from: nvdisasm -g -c pc_sample_toy.sm_121.cubin
+static MockPCSample __mock_samples[] = {
+    // shmem_bounce — shared-memory bouncing kernel
+    {"_Z12shmem_bouncePfiy", 0, 0x00b0, 54,
+     "pc_sample_toy.cu", "/home/tpr/src/parcagpu-proton/microbenchmarks"},  // STS [R2], R3
+    {"_Z12shmem_bouncePfiy", 0, 0x00d0, 55,
+     "pc_sample_toy.cu", "/home/tpr/src/parcagpu-proton/microbenchmarks"},  // BAR.SYNC
+    {"_Z12shmem_bouncePfiy", 0, 0x01f0, 58,
+     "pc_sample_toy.cu", "/home/tpr/src/parcagpu-proton/microbenchmarks"},  // LDS (inner loop)
+    {"_Z12shmem_bouncePfiy", 0, 0x0230, 58,
+     "pc_sample_toy.cu", "/home/tpr/src/parcagpu-proton/microbenchmarks"},  // FFMA (inner loop)
+    {"_Z12shmem_bouncePfiy", 0, 0x0250, 59,
+     "pc_sample_toy.cu", "/home/tpr/src/parcagpu-proton/microbenchmarks"},  // BAR.SYNC (inner loop)
+
+    // hash_churn — integer bit-twiddling kernel
+    {"_Z10hash_churnPjiy", 1, 0x0050, 34,
+     "pc_sample_toy.cu", "/home/tpr/src/parcagpu-proton/microbenchmarks"},  // IMAD (idx calc)
+    {"_Z10hash_churnPjiy", 1, 0x0080, 39,
+     "pc_sample_toy.cu", "/home/tpr/src/parcagpu-proton/microbenchmarks"},  // LDCU (loop start)
+    {"_Z10hash_churnPjiy", 1, 0x0180, 40,
+     "pc_sample_toy.cu", "/home/tpr/src/parcagpu-proton/microbenchmarks"},  // IMAD.SHL (h ^= h << 13)
+
+    // trig_storm — FP math kernel
+    {"_Z10trig_stormPfiy", 2, 0x0050, 21,
+     "pc_sample_toy.cu", "/home/tpr/src/parcagpu-proton/microbenchmarks"},  // IMAD (idx calc)
+    {"_Z10trig_stormPfiy", 2, 0x00b0, 25,
+     "pc_sample_toy.cu", "/home/tpr/src/parcagpu-proton/microbenchmarks"},  // FMUL (x * 0.001f)
+    {"_Z10trig_stormPfiy", 2, 0x0f30, 27,
+     "pc_sample_toy.cu", "/home/tpr/src/parcagpu-proton/microbenchmarks"},  // FMUL (sinf*cosf inner)
+};
+#define NUM_MOCK_SAMPLES (sizeof(__mock_samples) / sizeof(__mock_samples[0]))
+
+static int __pc_get_data_calls = 0;
+
+CUptiResult cuptiPCSamplingGetData(CUpti_PCSamplingGetDataParams *params) {
+    CUpti_PCSamplingData *data = params->pcSamplingData;
+
+    __pc_get_data_calls++;
+    if (__pc_get_data_calls % 3 != 0 || data->collectNumPcs == 0) {
+        data->totalNumPcs = 0;
+        data->remainingNumPcs = 0;
+        return CUPTI_SUCCESS;
+    }
+
+    // Cycle through the sample table, emitting a batch each time.
+    static size_t sample_cursor = 0;
+    size_t count = data->collectNumPcs < 4 ? data->collectNumPcs : 4;
+    if (count > NUM_MOCK_SAMPLES)
+        count = NUM_MOCK_SAMPLES;
+
+    data->totalNumPcs = count;
+    data->remainingNumPcs = 0;
+    data->totalSamples = count * 9;
+
+    for (size_t i = 0; i < count; i++) {
+        MockPCSample *s = &__mock_samples[(sample_cursor + i) % NUM_MOCK_SAMPLES];
+        CUpti_PCSamplingPCData *pc = &data->pPcData[i];
+        pc->size = sizeof(CUpti_PCSamplingPCData);
+        pc->cubinCrc = __cubin_crc;
+        pc->pcOffset = s->pcOffset;
+        pc->functionIndex = s->functionIndex;
+        pc->functionName = (char *)s->functionName;
+        pc->stallReasonCount = 3;
+        for (size_t j = 0; j < 3; j++) {
+            pc->stallReason[j].pcSamplingStallReasonIndex = (uint32_t)j;
+            pc->stallReason[j].samples = (uint32_t)(5 - j * 2);
+        }
+    }
+    sample_cursor = (sample_cursor + count) % NUM_MOCK_SAMPLES;
+    return CUPTI_SUCCESS;
+}
+
+CUptiResult cuptiGetCubinCrc(CUpti_GetCubinCrcParams *params) {
+    params->cubinCrc = __compute_crc(params->cubin, params->cubinSize);
+    return CUPTI_SUCCESS;
+}
+
+// Source correlation: look up the offset in our known table.
+// Falls back to zeros if the offset isn't in the table (same as real CUPTI
+// when debug info is missing).
+CUptiResult cuptiGetSassToSourceCorrelation(
+    CUpti_GetSassToSourceCorrelationParams *params) {
+    for (size_t i = 0; i < NUM_MOCK_SAMPLES; i++) {
+        MockPCSample *s = &__mock_samples[i];
+        if (params->pcOffset == s->pcOffset &&
+            params->functionName && strcmp(params->functionName, s->functionName) == 0) {
+            params->lineNumber = s->lineNumber;
+            params->fileName = strdup(s->fileName);
+            params->dirName = strdup(s->dirName);
+            return CUPTI_SUCCESS;
+        }
+    }
+    // Unknown offset — no source info available.
+    params->lineNumber = 0;
+    params->fileName = NULL;
+    params->dirName = NULL;
+    return CUPTI_SUCCESS;
+}
+
+// =========================================================================
+// Resource callback helper — called from test harness after init.
+// Fires CONTEXT_CREATED and MODULE_LOADED with the real cubin.
+// =========================================================================
+
+void __mock_cupti_fire_resource_callbacks(void) {
+    if (!__cupti_runtime_api_callback)
+        return;
+
+    __load_cubin();
+
+    fprintf(stderr, "[MOCK_CUPTI] Firing resource callbacks\n");
+
+    // 1. CONTEXT_CREATED
+    CUpti_ResourceData resData;
+    memset(&resData, 0, sizeof(resData));
+    resData.context = (CUcontext)(uintptr_t)0x1;
+    __cupti_runtime_api_callback(
+        __cupti_runtime_api_userdata,
+        CUPTI_CB_DOMAIN_RESOURCE,
+        CUPTI_CBID_RESOURCE_CONTEXT_CREATED,
+        &resData);
+
+    // 2. MODULE_LOADED with the real cubin.
+    CUpti_ModuleResourceData modData;
+    memset(&modData, 0, sizeof(modData));
+    if (__cubin_data) {
+        modData.pCubin = __cubin_data;
+        modData.cubinSize = __cubin_size;
+    } else {
+        // Fallback: minimal fake cubin if no file was loaded.
+        static const char fake[] = {0x7f, 'E', 'L', 'F', 0,0,0,0, 0,0,0,0, 0,0,0,0};
+        modData.pCubin = fake;
+        modData.cubinSize = sizeof(fake);
+    }
+    resData.resourceDescriptor = &modData;
+    __cupti_runtime_api_callback(
+        __cupti_runtime_api_userdata,
+        CUPTI_CB_DOMAIN_RESOURCE,
+        CUPTI_CBID_RESOURCE_MODULE_LOADED,
+        &resData);
+}
+
+CUptiResult cuptiActivityDisable(CUpti_ActivityKind kind) {
+    (void)kind;
+    return CUPTI_SUCCESS;
+}
diff --git a/test/pc_sample_toy.cubin b/test/pc_sample_toy.cubin
new file mode 100644
index 0000000000000000000000000000000000000000..849a3353d6ed62bb8419035853897f981c29691c
GIT binary patch
literal 39864
zcmeHw3vgUlnda$xyQP-oNV1YZ%p-2g){-!`+}&zDObjFWnamLI7-oirVin0!+X7i?
zu3AYOJ4DDZlK?wpva=ATW{MO=?NCg%U~6{SndLENVW|uoC=Ug-Gt2`o6}H$s99eO|
zJSN)j`_DP|-IfQ!Fzoo+y7!*P|DXSL{{KJccK5w!|Emu;nao&T>BCW<QyFv5%dy|r
zg^&GPeRWQ#wox^rTGXf-V9)-AV+(b!J~4?qdp&Vnm*qOXaBOa>q59|M>(l*nclOuc
zIx|<D56VXtP5I7#|K`a2@!CXP_1C8x_5SfUmdXpqj!z#SpPXNuJ2HK!I&%Wxx_k51
zg<})m^whfT-t@w>cPBa)1?SP(`N@e{P%?wzF~*VkxrO?~T)jf}`@QMe%h819iOaQH
zc8bR)7LJV{IkxD{9ePWQnY6Zml|-wLvhK|s9bc%=d&d*<DW<fAP{v!QrY9GVj#s^j
z<I}2N7iMSXa8K@I<%vke+@0e=`}o4d!h!~BQa=F!CjtDrTD>t|Z$J$kH>8UG+RPED
zBz%g3BJ_(&A~l4Wn4LZ9P1KHwifi6{eZGF81`b2`ip?#@+JtmTXe5p!P5MVBYR9L&
zBbT!kAv4-5b*LDow9#a!xC}y-TDurf>U{aL+9)y6=IbVC31+WBBNff181I$FU7}Ci
zR_*ZX&Adpzz(z&mGKe6!9P6foZ*STPQ}CLJV_R^w_7Vj_L)_U|cFp9Bxh82{(7shW
z<8{uzu&&uM{yI4Q{`qQkVY;rpb5bzl<4~I4|Mva2kL&*9#qo*RnF%<cn4HJQk1S42
zc=hp_@_cP(yc(JGzJ`2^qyV#3Z5tn-nX6BGa}%>bjS{hpFB~ryG5YcG#nO=WQ`R4+
zM6C5})Pkl>1V%<qrL?w?E04z&A7#qC@lmF<g;2&@BYq@2lfYgi7zn%J4r#NCKuzkO
z*XZEK3`3%sq#+uEY1=G1q~;+hI$>2@Y9TLBiCEmvEX=Iq8{*A3qrz1-3k+I!Nqt*T
zVeZ~6^)E=&e?aPAl&Jp{YPR5S)U;&H^*gv09(u7~Z>m)bQcG8DvPJ*;0@P<h^*5N7
z_qyzVz2DE?jG#ZqRc1gt?(L}WRWDFQ!}1|VLwo}Du6m&b_pc7xU1dk}tJSYdy~P^T
zPfC3*s4#c$m-?1O{nw?wHBny?zPb|ihfv?s9ms#FU3Zz5`hfl=LqbF91NxUX>i!=R
z{4Yu1|BTeXG*SO|Qh#HjehtSqQ_FiEuF9U`wQIvWmzHi)`L%a0-ISMGrC7f`fkjtX
zon&{TtYNc{LN7cB$7&wD=fQE7mX~gd50k$S!@htm-Jn_@53mlJC!8Jsm}#@)T3g88
ziiS1<yo*3`5bh`C^Sfpod$GF77EojO%!c^<Nyj;4aQNdGT}Q?-CYyuZIqEmjHIw;K
zXFJXrLT7d!n?F9ivtILdE_g?F)+W3o6Gv-{J0w@0+j)HEh&MkuJ$K~T@d@v(3rf|F
zAX7YEo1Gr7&!6Z&vZykTJo2YsZTT=Ja2rO@M>gXYGZ$9fyPZPDb$R05vro$XQ?BEz
zsEnI&KAFj!Upc>myF%tX&S%s=;^Mx;DH6$9LDdWty)b&lIhWpR6-^e4b1n;^M7bzr
z0_uP#YX}vTXNCF;z#zp!=E>kz?$#?mfnKD0<-9>6;%d<Zp}|?nBnY}Togm=>r+b{U
zFF}ucfP-g<c?HteA}Sh$zl|y4sBY>$VhE@RX;8WU;AGq`r#o}eJpm3^o?Us|ZAZ~<
z;PlKhPq{6qz2N4^!V{}cU06BqenGK)l=7gY+z}RU<H^5Uxo}}+WxbQz2KvvtTZrQb
zPAL%X3)whF+}H9{BN=|b@AHnj1%(&gue^gj{v?C*%4$-}%3T7d8_@dH#V0@>SQEV0
z5z6847qlV2k`)Ju^C_^6;dip)Fi%B$w{l@+^#V@#r!S_ox(ay}GtbC4?jpN$ME6dd
zq_c?gSs{5fsd(jHpK(c$Ch_8vZk5n4#_58{di8s&?p?%%<07yWGpiTeJcxY!qI(LL
zXJqP8B<Dm<?%M@4XC_85<8}}sb^ASO15xO_L>q@8K?t0w{CD93$mjf}F|4cz-j!zp
z@C)aKDYx6=dqUs7jpYr-7GaUALiaO|L)oyW3+@6gs~4VFg|sP@i_bt46eh-js86s}
zCiAnXI}g2pkH=RrI<CTRKtY%R3`EbO{R_^QXX7b%BgPRaJb7XDya+<{^o*NM{(1L$
z0=$LPt-77Or(orDe)at0-_umhqwag)v4gCT8mnUCFZ=JD_7-O5=L);HIaQzVj!xGL
z3&+Px<&x^3n3}>OWnp|_=IzskA-z4N*OP^QtbC5n6)J`P$&o^Qz4NAW<;{gx7ETTg
zZZDK7mF<P0Vf+<Q7#v{DuI+_gBijq*5^hSn%D5|+wigC@Qvpc1i24zL4B%R3bp_4r
zG6bk${FO>XHo#gUD-D(Jz6D!ncg`%#OwLXh`lsjW-ibmoNxn(-Bg{-3FZ3@Cf%ID4
zvy7BddXG!r@#6LXYM|V_Un$fN*ps6hd}VdBI;r|+=dlQ|!)rOVjgLz$Hmqv%_2Qe#
zBX7QiOTMF!GBCFFz3x!F{AycK!$u@})OMGPud;QMAh)g0+dWc=RaJKnhN=X$H?_Mm
z9CVYq>eTK*R4H|Q{!Wn?DR}jE=sQg2Eoyd3i`De;rb^J_o0<xs2(AJUO=B5qYz|S1
zKp1*ULze?70|fRPkhHy^i7$eoa0ATCZxB8;K?AU`z9aRCnW_H9Em{Dk0an*HS5MXx
zk*%JXf`0oKGz0-W!ROIrabE@v*l?)zADuRxYSJL5@~&IdXkV>VnDizJZQEWwKE7v9
zalBjt%#l0m{oY)Eb)am(swK>>J`Zij7iU=yeOEcj`lO{;9T*!KD3yl}*a4^C*1sU~
ztC3u23POZLFkU$}9_2)NXv_1aDgZ!TU<%$8d5X{dTa=}E;1xg+P(_>2w1KJv<??}D
zBLlY>j%LV?A>SL(<PX}$2)iJ+4?S*C)#DRJ6lA#sZ-FzJsqQ(jcUSSizA-&qq9@#k
z-*aFLo`+6_7~(zYcx=zH;l1U7%3fiiGy=<-T(Hun0wD5Rv_fa=i>9F_Y@oGRtqkwo
zUl|^;($iKk42##SP^4CMw1Da$>;oDZh-#z+EsN32P5WcW;4;SWfsrB)fviL!hqT(v
z7<8n?s&tnA)R8tSF&<5U5Ko&E69^~<ewbzd;@nJNnk39BedrZo2*Dz<IX2sGywLz@
z_Rs<`&U;czC#7p~-cc@9hASfjLltz07$Br?L}*YAYvT=@WQ9Z0GNYEtAcShj)e<`V
z3V3yI;^^^-Lhl@&`FQiQv%TV%%#P{Yk^V)^_W->apeqCLDaSFj@0yvKwvAI|nMmwo
zn3$@-bQg|!zUyI3<KXDXnZHoizF%Vy#FaOj9@=Lazd`o;Jf0z7x-30&L5SvtmrATf
zvrj3Y#aN4VF+v8>!5&~<gHa=a{R`2CXBiu4%CzhR#pPc@h!>j4?8MxZuSZTqZvfB(
zBs$bE;J1{^mTEH=C!ojn9i=jq9wtL3$pAWF7#eC+^ujH(1Twog=?jSI0VS@_-~i!9
zLYCFqFb1RKOGPr0(vA^rJHwSolD<<Yu8DAfLeI9Ls31$IT{u?l2eEkmW6ho;LYcy<
zxlayT0o`WU(h!GDq7ek_=OYVjx>S5cSa{5u&<g`|IwWz?qx)FUi?$ieQ$3SNu++nx
zwYIeJ5mq!zvosBQ3W#l1p!~V%qkfYJOy&in-fZ0pQiK96lD_HLKv<&oih%-^HCSgO
z>@5UPGe85yY_n8o0U7clFdC_8kK!8?fmU=I5HiprMYKFml{Ht0%pxI^ft1a}Mk6P7
z0!Zi|1aG7qF!Xa%Wg&ga!eTDksE+aU46ja!=Fn9eeO1P*iakqPacW@CfnDX&fjxtj
zN~OH71i~y922$Oh>2)8Vv?{FkC#O{ch)oBo(7b~<tBXiTDjP8B0#np_(NYN(r&a35
zBpDSViNw7~j1iYv9n_H~<|ZWSdJK+eu+vWa`<kBC>>+Ki7t0h)2w8;=fn-pS10WvM
zMUhoaKv|%IKp0IaL_yLMn1}c78QWWg43J)o?HMW#@83u2LKINfRl=ahuFAfOEOn5@
z#iU8?1K+;#*x1;fJ#xRdI5IN0p9YpjFV#{lYwVn_RT@QPWQYo5?mr-hz9;rELZ)Dp
zM6_Dgcr`{CLzH((Q>&&YDXTIJguS`sn4%_n2U<Yj9ZO=p>;dLO$;|*p3nw1**hYZ^
z=-h{UImeVMlp2cIt;T#~w6C@+=60<~hti{xAQW4S<VU4gvRgv@2onl%hY5{kNP<)+
z*#joEG6@M&SMZ$7kU*P1Wh#*ri8-su9|PYzD2|0q(IiRIT#v2Ivn0~4xv@miY(wJ@
zre{9M=*o%gjR4w)YvcCxB1W=*`ZMr`@S4b)20;j8*#H-BiL6Of?XeG5FWMtkAj?SF
zB#l$-N$(m?fW%x7y$LM@Br}wO0iB=tpgKsA9nF+=FLS{XjMgO#>jbG3Prz+nr%??G
z*;G<`j#LVAB;X5A9&x4~;p0itAdOvG;z`n=XoKyEB#ApK078efBuVGn>=RFtgk2-V
z0W`*wq!GwhvXO!GBxwYivF#~Il3tpUS6CnBYbmxW;W{drlyFcefpg5$_ho>`NgE1C
zYmz8w4D4sI5syC!M<SWhkUv~{rZl82!|z3(7tGcndPpHz>itZK+h2ViVPl!nkXW{9
z223bZ;!YW0I2EFqk}RJ+S>duR$dzQd>{&QK*Ge#haN;8iT%iMV%}g?rEe(lPSrWrQ
z!(vpnJ&JHxq-xq=IPt_sFNTI8?{K!nMFIT*q>`2`>Gi=dre!!=V(L#I1B=*a^PEt&
zqz!oh_CzOUvn7dJQ)s)gY)R%xbqefI9X2jeg;h?DxH_~7K^?gaB?5ZaKqLS-Q3pis
z+ug2LXh)`9$gs96oWXW9%7{3y6TtDg9}xB*!`k39m<_%*M-l^K4{>loh9Z&_gpo|C
zmfZN@f(lF%QI$~PP#Db<WE~<suOvs}dL)t^RRvxs2#Zj5nyLeSUKGxkVBN5R7!{Oa
zRaQHO>Ix@fNGV*iFh{a^Q8-^p#VBf*6_CXU6NrsU3#iK4zG@@}1B|Fi=09~T<V15M
zvFl;<fc=D0j|!w7mUk#elGO?Jl4MwM5P@>J3l3s5Nkm<vca0LPGZr>h*_~3|%LAEg
zB)IBTmF)MygtXx^NJ{jt$)ZB<24Y^)ozS(%axo-}*5b=plXA}DL#y*PN#N!?mqH>o
zAc4OM7Wc+8P3bi*Oa_t<$9}!4wXhXVzq&;%vz8&LQeyO%Z8@J1ELD^@UB_5=r#~x*
zezIx3X9dyT&7T#d_DgwIklN2YE09M3{tka=n_jz--9tmMoqO!&m)K1)a>`>2e>VyS
zARFze<E3rQGd*h#k-9Yx^J)&kop~{Nv!PI;Nv7?zi>amTv`hNbe5YOTdD>1H^R)%1
z?X=UC(0aoQiPAWiY|;2B;SQ%AgM+(@10%x&fYVsx+v%KFf-|zfD2*dW7S8~~7t5Ed
z-<v{r?p~Nkucw9V8%q-mpPs>Pz%v8T$Rl}aX&lohid_5-k?*i5OvZ{T1{(iq4(6VO
z5}AT}qLjw5J_%<xTqPqT0sAG;8cq@Prig8)NVw^#htk^{>_H(%pd@nIc7gKuhV|YE
zSG%zkMwY9dZ3d8d1Tz%4(&=xX$L+&VlYZ#JXG=T8l?6GQc@TpnC(e~#nb`IyS9;yS
z9&xVBuC~aE*-co2Ivrw<7*{&0vCUDgbULJa48nXO2svY9!!$v>$U-Mg+DZeJXopIl
z8z)}7RNEdMOGg9th!L;%2W@kdc)drcmn;?T9Hi|M#xxS__Qjc!kmuQUA<wrnvt<Vt
zm0(IlVcVljne9IGh%qJGMxJesGNpH>%sLFjhnUjK40fRErNM+aJHMcOPR0<7K?E95
zFD@VmvSw1^q9=T0<%Ah(!z<>#*?$Whk)Hf!p#bj|+2$8dB<VV~2~8N*i<^iGX!39*
zY=e3OD9tSl3iY-<sxG~;!5(pS>8$|U9#a={9@#92sY~{EWVy^{D|+n~v`#d0C0{M3
zP5U&Qtsbqq#5u`RQF@G+T7pKcmdM6`X`E3ZtYDd?n#7^#Brx`Z4jjcgUaSq8WlDNW
zZ3^BknJDQIFhQ+-Szd(6DFaFo1c{P(X}xxrZIYo78GMRx*+ePvx$|{Wr9}Ux$x^eP
z$#kh%&(wseR0`6g(hvjif_$*Se6&K_#O_i#=p|Lv-IbA`%1Cv0X+ZKSeDQ(b{fd5o
zWBiRN-y_>!sn5(#zo}HTPnx3n7>tU!QZt6qGO~3*Qd2UM#5Efxo|JLMrRHR)PfN*A
zotlw}4a)R|fg(P`fb9Z4^dSZ|F%b&lv&<dsPbFQ^I)byjjEE!dRh*E708APfzWKU=
zT_MQR%POp1O-|t1(TTYug*Hs;cvdGwPl0|t;Uk%186T1|?BJ^zHt|`oe2OP9*2Y!5
z#}*7I-Z#`=B!qf=#iJj1#5#re1q}tc1Y5JZT|Wm|KrcJxwRpt!3(u0nMw3F4Ku+ML
zVhN{=!zM@y0V9&SmKkiKi2Uw5&|mwm@;E;1k@h&9%a5?+@gM=_4rh%uK%xnfKocLw
zMi;J>2H~(`#iua#4HxC39kiDwY6ZzLwI;@bsG5<%*_B2Ht+)!fC>dOcHz9-ZAZlF+
zGGOM-$bcSG$q?mN0T`tNDPl7^#2D5ggi)dyB@8g>q|kP&1dY;SVog}_0x!LxxAmmL
zq^T(}T60M&ktLP=)Kkb#xVz9)GRzs)lMM9Ulo+kQ6k<3F<x@#9$z0AEG0R_v7%qLA
z5yQ{*h?wK^$2{%iMto!j@#8(sBvYnqTaOq$VN>TaSAOZl&`IHOVuBb<+NQ)ng)w3x
zIkxslG*h~ySkgut#X6V%L)=?r@eIcm_%JMd1vbc~>3INaK_h9ncK;FANY=#1(l<pH
z^|}d|z-BxtFH7c_@C{rBH`TQ^%B10$=ehum>6&i8iJ#)4IVRpDG1?ua>YA(UR9(v=
zy2|Bulo)x)(u5d3Dp`*hF2I{fhD-ExVkC?ZG)By20tq+%*Cj?fq;$zNizQ}r9iTB{
zE)z~<Js+8pHuz$>7Pnjs3{xb2vA4)dMO$vn)EV6b0P=@{4u|)|0KnWRVHDhFXsU;1
z@k_QD%yWfEJikl?<1X)ddf*mCGh(<)k!lori=UuTJ;(!uCKk#KjVLiz53+KKcmxPY
zdjz^#^C=0vpI6gOIHL)_TY>CJe}J2jgCDVh5+b|mCf!UZfqN}Y`PR0XbQ9)D0YGDX
zUnW@c`Fs<;%hmuxyR2{uNa}e49GEE;V<yG9mE<OHY|Ylf6p%MXV*I?=J~PmXF$M+h
zNQ8ib0iz*l%W#F}0}FORfOWDwGN`~&`ugNnsQ<`7ZNPT=gZMj(uTZ|`H*VW^+gM@m
zyjR0ln)Jahwej1PL-@M`e+s`~fNzKI#kaT%{I!vp*@-&8YEhq`pIz8qz!!t3Cl;o$
z0oe~({RS`nBO_|h;>_$+LB9GeRUXefHT<;KkI%)HE5oJAcv0=(uP5*;)`h|je0lDe
z`QAoh$8mgFqkymbfrLE%zKgG2Md>}EmQkPGnU`;~@*A7kjrtz9Qhyz;gk?SNd9eSr
zxDqexJ6MT>_pIlU?TOy|u&B4+oOSS3U&qNPCzn%hYbN9HYhm~@ZnlFX>LWXx)il14
zqnWRbt!z<QQ_hgOm5qK`Iz78Zxrgu_+83`W=hYeIjsy3vfTow^)fp(J0FdmBS?TI-
zx<O^z^Z0hIQcEuGhjjf<J8pQ)8gSnaw010hWA<k{7QR^*=a0G#kBOpuFO}El@I6yr
z|G`{;Y_3zR#6i2ZdS&_L<~yIPw|Zs$r$hBt$E>&C?2Y4T4fPM>`QuQ17!PsrAfCWN
zl={6U{kMne!}z}_SOUK6cX9W{>P=BS)TG|FU+H!EWM)l$3%yDFx4YD)6FCjK1(*I#
z^@S6Lt9wzN>{QDQ7!Cf*@zpN0oBn;M`6Sv;X4lkT;htb0=~Tb#<@4;mcIJ1w)T2Aq
z##2s?O>5mKe|xjqc1mT5A?L9CTJ`W<fcH?*4h%PSs{I}&Adl+`N`1LY-MgSTB@Ml<
zdY`8N0C#ryLI?4^^duX3K+r2)>fqwTc}?L`hkAUc>Tb9>s$lKhTB~}jOZ7CeSy~$C
zYbF0qqfL#n1HW>Pa!<o`{dVxX@5E;e`)6GBco*nx3Fz(ZWdFS1AMg*~)u!9?;HTON
z{#`fVze_Ej$|*5)a)kcxYp52s*Y0>m_<5j_ZKK~7`8~2T%I_oO??1PZT+E~nb)Mt3
zt}Ri191rw7&}eD9gt;T|f7PYlbt0<;%0V#R2OBLd(td8OOMPiG+WC!WfyWO7baFf}
zNcobUF159h%eMcot-D30U*0VGkb8qKBL632<9(Oq)&T#ue&A~reB@CXd~$!q_{_I+
z*?(In@U;m(`Nal<PwuZ6Ux<If_k8k8`mXXD=+}+TSJ=zJ#n)%(^Vil+W4^NdnY?e`
zC}VzP=}&5RaK3kmebb=DzENI2)f&vFFQI?fer3KK3h-gRMCOCo=g(6Azv%X!Nz+&0
zf1n@wEAg-N_s|FZ8Oz!~cd1f6C+=>Im5*#z&Z(^Sr(N(@U+z*HpkL{K=z>fCd~orm
zKwokH^>f!(QXcQR&kg)5>ANo*_`8o{{@<H(=|3fee)+ZP-qT1mAqsF0f1Fb<eJ=|<
zAO*|^iHC1Pk^cO#PPO5b4!^a@l@9gI_p96epQc%oc=WClZZO`rO*}pac>X?0@4wOb
z-YR9pn@-h-curcf4&opD<!_x(5+Jghx;LnA1OEv$%AxVMEabGlx{=T!J~$2jmdnzw
z+IZQ~P+(^XysWy2$5~T;huX8^sz;z?#2d8#GXApEw<*60NV2q-4K|)1()|lwYfk@K
z%&4M0aC{N)S&R?)<`-IJT_eBT0DoQZT3W=vDD($E-58Gok$8{s(5~^FLHna!D!0&r
z-*UvUQjqx3)5x{>@f-Hp?d8N^*Zw#11;B6f@{+H(lo#lAysQ!xn0z5(-)z4(uX9?q
z!yg?w@lf`n__r^Jz7C%Fv0`ouf9I+*UFrs3-st}?pq$J<_4JV+Et33GW{OgMkO%DT
z;k%#?J0Ip~-*ERf--P~=Pr{I}u3<h2{d2x>e(CwLGcjKtzAGQg-*W2d&;7^0U)!zq
z`|w@uYs$`-Ny&!*VEozFHp4$X7WgORm(0H;e>M3R{Ga5%I-g7Cd&{R<f_Mdcf&Xmv
z;~DHPlHZCyyg%@_s}lcO#eR&xE!7=yt#SfB=AV8(_;SeWR9@RVJqqo)0dp>3ekjfF
z2OIgk;A1|B@q4_i*t?Yn=rQ~wAA~(`YPf3~A&)G7m<sc|$8%lEU;1nOF3@ZID5G8c
zi?pLZ@^OdO?=l}Z{e60*e>T$prydXVh5Z5ecYmhifj>w_{gd(6$nmGT^mvbTWxE@W
zBl@6xVGjovmxBCC@~uk9e&Mf9IoduLkH4B`A6zft5c{A$6ZR3T$DR-S@N$8^VE^QY
z@)UnA{Cr*Z2qa$wKi!QS)@y5P4*3QB%O{;Rbq)ZGfA9}|4M+Tg6T~}-zwHw5x)Sm3
z|9AhK^uLf3k1hDSBLC~g{I8z>D)7JV<^6AHy$}DsAMsQo0N4BCzu<1H|2>BIj`jL{
z&uQ#^j!JvPKZ-v-m%j$2E3Sv(kC%~OO2Vl!;_nK`FHirV7x{<4Ge5~L9sKG+HHmx!
z<LyWM{Uj3t2l)>2iyVI3OO8@O>}S76Zg9wc#AdALT5{j)`ino8C<TAU^06%+`4h=0
zq@2(FkIH}lBPj#FW8s+(pxyDf(SgI{4+rnNSL_q?yUhPcAMWo_Uju#LME<hez9wTC
z{+(s|)-b#PphCmT{sfar4g>jaV>2d@_4kfv;!hVUF~1ydgTaR~=5J4f$ko!?k4?UF
zs@?bp$n&wykZ&k{A)k8Zi8kz9;`q>w8`MXzAM2?N5-)p2zFFk=8%Texm-%6rl-a-5
zd(kg*SGCsrJ->`h59LdKnfqq7-XBW&e&lzcl>R{08-tAvf*<xL`#Xb;_I8mM%U?u1
z#d?Igs2xWB41M%@Et0;qF#dh%wb=Xh{U7Zs@AO=6{h==RUKcHwuZ@=X^hC>tk+Whv
zt6$1n@>4%wM81H-#%t_XMf};OhFas^ti)Hb&x~4fP(l0rKh#f~xHs9qg1_%+tf|e!
zcji69e|Mv;ZAts<R@&R9M%x-KxQ+2z;%!Hp#C!6OGW65&UGxv>qvJ^_`}*iuk+QG9
z4(7A9K))TtSL^+;)|aO`wEm&Lv#_t;HrXpR^2l>M?myY_oN{}(T{-18y*IDrCH=Mi
z2tCT_X%Dxf+@|+`O@GH}50|0eHob4mcG#=a+a8oTKS5s)nk;?JWP7-*oXIw6^4s^c
zhs(;@(-ziup8SOMfAFuj6FUw)Kb@Y|D8Ei?gdfUB&kyaNoZi+bKa;Id`uDU(=`FWL
z=;M6W_5pp<9=taB<;=ED*aQ6$#`o!kJ#6yY#a!Fjj(9qDg0l?=?eB5g(>3ihM}fzD
z-e%^T^)HzJIv+6pb-CWMX~(|T|7gn(Zjt>?+BfOB<~luo5BwnslKhbN>~$#mb5q86
zYK_0XDfSD0)r0k$Fz}M1tPgs;wrgk~7Jj2A;A$=Ze#~!KUt&WR^M6w87wfw=@ppE8
zM}O+C=l(vn9$T*r`P_7>Mf?kf$g3M~jIQU_gO9gEAKi9++714;-Kg?jyT+ec%Brt!
zhCVP752KHsMl0RBzn@*IWBc~u|2ZuF1@bC*9ql47H7fCw``ws+uQK)we}d;z0>3Qz
zBIDr-4TAVFKVXmYh@{5#Im%n>8#<n9{nURGo8kPy_@Dmx%U>z@An$yG>fw4H2io)S
z&#xQ1{m)R6^?tt5+4|VK^ZZFuQ{LG2v5&u;FJX{!JU(yruIV5u>JafNyVcv!!FbWh
zGUR2;T$AW>=Qa8%oi1<b>0CqKB{q|ugZC|AARc-?CgW{B7H`ddz46}%@2f@Nmrr87
zsN=2JV<g@}{)zp8M7)*gspBo|sjtzUBYo}k&yQ|KeBIa`#8-*u(r)G(^8H-!d;{{p
z{1aJ);;qnY{Vnt<`9~<;$}`kZyp^pL8*kZP$6xX9v<KNwK|B?E=<_z<i7TJOlHS+C
zA3K82*jL`^y*^qlUl%R!xi(rp+!HNNf+yg&_R4%k@;##u$GgVlFX%)4wmlr|uOOe=
z)M#HPo~rh901d^{_RUg$pNXfi6igW88z!DAiKi-vr+U2SJ)Y`(2lDQqJ&1lJo?>dE
zY~{=LXgtk@+YwJyIKJdzA0|HAc$y2hLx1LZsvl2t(fEVG<vFY$Pjf*T@^^aiJIx{b
zoWr?rSviL_>6U({oReE$zo0jy@4T}o7cQ%V|93mF<FN5GALZ9E&-RUdP(IOkYCc0?
z?dP!hjDRhd^HF;D<RkR;Al_;Fu<=yo#J~FSG#}6_@idobd#-6b-KZZxu>IRz*KNam
zroL@Fwee5v4f0LK)4asfjf|&g58|(mcUZ4)6nxy@V*K<r$n#MfFMGUe^7Nfj{$iKf
z=;g$fO8jL1g?7!4iMIiMJ^#Bs71=L!9x&@A$}=9{)+=Mb|E5#z?Gjs_w|J`a3+2bt
zypD&AH>vSdX@7bK@<M)t_?Z{~t42k>@#nu0{|kA+-XtCe@v<cnFF%O&ti)@xenI>d
z{jZ%t`~^JXr;U#?9{0;l{50h@8!t_~Yw0lZGVOHl5`VMIf3|wMU2+`o^FHKf>~Ei+
zZ}mEY_$u+VBfy99mY$9^bXWq29<QG5{K}(`zEXhbZx^?z3-phCeg}WMd=mM`IrOE`
z>-`tmA8ZNjztA4@|3dd)#_r#{`$PYJ@^8-kUFF%o{LPpB_~z^Py=Lz84?Y5sTs^Kz
z;Hm_!O5mylu1esl1g=Wpssye|;1^B;{0uwRH)lSC>my(J#5cZq^6Afh|5HzY_P^f!
z^e%kPPh~o??U%6hQfiDlfZ;MeAkT73`wyi(<7|(V$1?n6RRE6{_W<7M&A0qeVN*f5
znNhtHfO~!LInr47+uJh--}nMOrqC9GIh_ilaqc0slZMmjZgl=oVWR@i7?(5h^b+@a
zUyWCe8Q}PQVhrx5IUCx6gNvlonari-jQs+Gd^Q)WsWiIdeEh7-AwCt}4MW!Cp!Rq?
zA2jMQehJtzYB?jTO2N%FG7qP_-g!yA9C!GRMu-O6ekC&CQlAH50=;Asdf__X>FsQ}
z)Wn_yM@J{@lY1O)W*6;mq_ccUjmn;hlYu^+%o+Bx$05CWV2kpmHPCV9S5b5B{!jnf
zAAjTVgP)Dq^;m3Pd|t)oh0KG1kFiufLi0q$=7}?!GEbbFv*4R@%3K-~LHQ)h=c6({
z-=`h&y`GJ8?uG0p@90nQ3sS9(IF2zVPFHlE;Bz&X@UYgHh&;o@JjvXwvZOn6DTCKU
zjb_9q{kqSqGhis_^LdocsGv-kaQWxcKkx-Ed;vc0VtC}P3XdqgSH@#t$EQ_Xu2e$U
zZkOqYmYq?m-5!Tp_1z7)p#Dq;PUk+p_U9KrEF`Vn^VeIUf^YF_J3Qu3z3b`cK6hC=
zhNz|8_wWBmJH|ZVd|Sug2X^ZEb~@H^lkvxl6Q?r=WA;x@$2tn_`fppiv*9p&4t=xh
zLLaWxhqc?F{WNTH*|$mCzNh0~bp|?nC%9g6g7W849(9BAm^13C%vlxw7t)^7jN0fZ
z@Me5(n{WAWl~8C<9%DHh9^-6CmQCWIAETmnY3)S&0P8Q4I3U2JonZ=Fc(xyErTJ{*
zEe_uEfIv?1+U!k=72Q2??v}@baG-QQ*#ZVKuNge+KNFNO`1p|r$Kb*7ewsHb)-*b0
z$3JUW%r4>Pir%PrW&Ey~aqOjFW~t_P3Dt(*C4Wy?c>GD_dq-GRz<$4Z(Pm=2mEg^s
zM`7RpbZ=Nx&I5eQA}%WSrp03BS^MV2Vuo*9Jd@#D$Ic*&;G5j!1!Vf2Zmd=Eg0v;P
z=nbVSyWl-+`jvv{ei<P9hDN;RF|8M0Mu*XHz8?wjIap#Q`~pI9mT_UpV34n(1K|Pn
z*-)`a6lcTC$@>(g8Yg+FqIt`sspE?oZL{2^^pUsbp&#FL=Z|X<-%aqAKnPBED)HB@
zPrMcY&v<p?%gBlFc@7o!HY4wFgp5#wdKqggto#bz;V2@+dmJgl=(`-<lN`vr&ylYO
zq(u2nM=<8FDJPus)q(PcKm8s}w&>RZ@?DQ;8fE{isBwhl+juf}-G&7Mh2D~>`TF@+
z;=O}3V$tmuvT=kb5qfZX9>~O-DuH*7g1MHbng})+LhhQ5^Zhi6^jk~$?oHss+bwaH
zw_EDIF3QR21be}BLO2#%&~LjG8|ND@wRVL@^VUl%3KC}Deu=w-<VEUa)EtXD+2t@#
z{#!7G5Uul7-JM+O3UiJWq<@p1e8(ejx6t<a`b^v}^5nlhGc*|9P1z#G$aj;Xp7Tn?
z5ucx08rN%K==W+`VO-Sg`tR4&^k>28s@qDmLXLjdrVOo5qKjZezI+q1ky$29GLyt@
z3A~)-4Y(m3VhV5P7$Rc(a!#75yg8C@=hQSvC*N``igV^|oyE}GIyI_AdUkn}rg>wh
z1`&G9i$Be%{@XiI=6HO2CvMH_FERb~dppHnX!hj&oq8C)!&A$L3_PK412XR<CXt2=
zEKEu(i`pOrgzx->@c7zUjY3}aYjkSs1X^aV!xsr_T8wP9`=s`-NNx`S-1vCDca|iD
zUKDy+3Cc9sNJ1MtF_45_97@NUkerNUUK?te5xP5xK{P8T=_d_xC#EC@C_M3%qS_X;
z@H$K^jhOt;>$fHR-*0&1sCv&IzUU8@Z!L9y;0rAsU9Z9$oi;BV{^8%g<GGjpyQOb*
z-t-!EP5a!tdq4CmuiyR5$=y%?(a1;NcJk!yAO7rf&;8l&{?6<E@|*AXe_KMr-B0n`
z6L;{#Q^=C-Z!`RJpfUVA70h9~k=6clq-zP;ua(;=lNsk#Mn3;^#+2>K!bLq-koLf`
zM-Hxj!oMf6-@vh+`yKX}GB>Ob|L(=G*@lbbxjBSCj3@ZrizpsD4)KJys6J}wd1Z{w
zN~k`J|51a8{Yj@?0qu*`+wnfuq~5m2uQ#K@jZmn2?2>x<R7(Wp%~H=#Xhn|?py;X>
zMJrMI6#lk^YP074JGd4eDur6~Ex&x=s$!y^9|ydq5Y%(ugvZOI|4v;9okvi=B~-TK
z-j4cSd}7<m(XJ#vA$STQ2z^VtawC{@{jW=XZ=(OC)Nf7HzhCONCF;K}^?ix@3hdC;
zpNxyN9YTFicOd_B{2UmLX{j&BJx_M!+K~D{5_W~@cl-%&ATH3CJVs5`UAbr*`-cy1
z!KK0fje&bE))<H9T!82j$BHr8H<(6VHsR0CL2EZ(1Hw2?BD>XWWc+EYF;-OHY67o6
zIE_t-7;$m<4QP+oas0M!o|Ss<I{g1y6ZqdV&J3^gjvVo^IPcBoMugHwjAQv^1R8!h
zR;GU-{Ai4g#(v$gpf4hK>N*a8n{Ka}ZvN##1K%(;Rt#}??lk@uK4#|wnw#ze`es|K
zAr5cw*G6OgDh570KKEYY@FuP#;2$>dvj#8em+u?J#(7XHBmsZJLEW()f~U>K$Nx`S
z{|WxzXW$DVc-~rgI}bKseAbP!F?4*=z}qU)PyQ_&Y2opsb>MH+-DHk)O2^?xK!4i!
zm(2KK_?8rSeC;$9e$)V3I!QnCf%y0zZUX-X10U8u>5RkwE$XO3potu34ZQ7d8{=ih
zQNokA$YCbNl1XkzKd1Ckq?1zo-K#YCrPt|>OU929p83R*uJ+IOG=W$DT6bBJCvMtD
zQ+`W6q`qNZ!tseed@Q|1m#qA48)X<im@iN!d&8Buf6&1D0!Lli{QDH@2_GNd!jGaY
ze2~`g@f9eJb$h~<xVP{Rh7hNXztRN0IHwVY;nU{-M&M5y|B<&P;VH|w{K|j_U7Y{j
z8bY7%Sk%`ZLw^}X+E1ht%5P`_zhn&m<HoQFPy30(pK1bMG`|<dJctM3X+LrJhX8MM
z5@~wiZbIRA7h?*G71e)f0zY|=hJULWUzhx`Y*GD16Zo&ZQ+Ih;Y+}a8|4tM5D~-Qv
zfIpq0zl*=E$G856^5YTLe>W-!KN=J5?7h-meN<mW{>&ER@caqHH2CiOG~k0F_(BT&
z8=Js?%)mbsg5Qz?{|>+}r_g`vdo*A%1W)>{|KwcY@gJJNHw^r(x|{I7758y?e!)C#
z{4*)=+fv}qHG$8+R}XCIr2e=%iI4wW6ZnFG59_}-1^x#3cTANU%seuGt~CCX01ujK
zZVdiP{r7Ev$GlF!KWOOJQV7oR`EwNT?*=@2MCiBh-C&FdFZ5q=`EhzegzDJu<CJn7
zG6UMHg=h6o183zMAJXEFUpXFFRm#NmINZ_}U1P5no^a&R>LU(k?duH&APh(TY#fWj
MJ<_gg?A5~kKYv*kCjbBd

literal 0
HcmV?d00001

diff --git a/test/test-pc-mock.sh b/test/test-pc-mock.sh
new file mode 100755
index 0000000..2d8772a
--- /dev/null
+++ b/test/test-pc-mock.sh
@@ -0,0 +1,129 @@
+#!/bin/bash
+# Mock PC sampling test: runs test_cupti_prof (mock CUPTI) under parcagpu
+# with BPF activity parser. Verifies stall reason map, PC samples, and
+# cubin loading WITHOUT requiring a real GPU.
+#
+# Prerequisites:
+#   make local bpf-test
+#
+# Usage:
+#   sudo -E test/test-pc-mock.sh          # default
+#   sudo -E test/test-pc-mock.sh -v       # verbose
+
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
+ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
+
+LIB="$ROOT/build-local/lib/libparcagpucupti.so"
+TEST_BIN="$ROOT/build-local/bin/test_cupti_prof"
+BPF="$ROOT/test/bpf/activity_parser"
+CUBIN="$ROOT/test/pc_sample_toy.cubin"
+BPF_LOG="/tmp/parcagpu-pc-mock-bpf.log"
+TEST_LOG="/tmp/parcagpu-pc-mock-test.log"
+VERBOSE=""
+
+for arg in "$@"; do
+  case "$arg" in
+    -v) VERBOSE="-v" ;;
+  esac
+done
+
+# --- Preflight checks ---
+for f in "$LIB" "$TEST_BIN" "$BPF" "$CUBIN"; do
+  if [ ! -x "$f" ] && [ ! -f "$f" ]; then
+    echo "error: $f not found" >&2
+    exit 1
+  fi
+done
+
+cleanup() {
+  [ -n "${TEST_PID:-}" ] && kill "$TEST_PID" 2>/dev/null || true
+  [ -n "${BPF_PID:-}" ] && kill "$BPF_PID" 2>/dev/null || true
+  wait 2>/dev/null || true
+}
+trap cleanup EXIT
+
+# --- Launch mock workload ---
+# Mock CUPTI/CUDA libs from build-local so proton's dynamic loader finds them
+# instead of real libcupti.so / libcuda.so.
+# Set probability=1 so every interval check triggers sampling.
+echo "=== Starting test_cupti_prof (mock) ==="
+LD_LIBRARY_PATH="$ROOT/build-local/lib:${LD_LIBRARY_PATH:-}" \
+  PARCAGPU_DEBUG=1 \
+  PARCAGPU_PC_SAMPLING_PROBABILITY=1 \
+  PARCAGPU_PC_SAMPLING_INTERVAL=0.1 \
+  MOCK_CUBIN_PATH="$CUBIN" \
+  "$TEST_BIN" "$LIB" --duration=5 > "$TEST_LOG" 2>&1 &
+TEST_PID=$!
+echo "test_cupti_prof PID: $TEST_PID"
+
+# Wait for library to be loaded into the process.
+while kill -0 "$TEST_PID" 2>/dev/null &&
+      ! grep -q libparcagpucupti "/proc/$TEST_PID/maps" 2>/dev/null; do
+  sleep 0.1
+done
+
+if ! kill -0 "$TEST_PID" 2>/dev/null; then
+  echo "error: test_cupti_prof exited before library loaded" >&2
+  cat "$TEST_LOG" >&2
+  exit 1
+fi
+
+# --- Attach BPF parser ---
+echo "=== Starting BPF activity parser ==="
+"$BPF" -pid "$TEST_PID" -lib "$LIB" $VERBOSE > "$BPF_LOG" 2>&1 &
+BPF_PID=$!
+echo "activity_parser PID: $BPF_PID"
+
+# --- Wait for workload to finish ---
+wait "$TEST_PID" 2>/dev/null || true
+TEST_PID=""
+sleep 2
+
+# --- Stop BPF parser ---
+kill "$BPF_PID" 2>/dev/null || true
+wait "$BPF_PID" 2>/dev/null || true
+BPF_PID=""
+
+# --- Results ---
+echo
+echo "=== Mock test output (parcagpu debug) ==="
+cat "$TEST_LOG"
+echo
+echo "=== BPF parser output ==="
+cat "$BPF_LOG"
+echo
+
+# --- Checks ---
+PASS=true
+
+check() {
+  local label="$1" pattern="$2" file="$3"
+  if grep -q "$pattern" "$file"; then
+    echo "PASS: $label"
+  else
+    echo "FAIL: $label" >&2
+    PASS=false
+  fi
+}
+
+check "PC sampling initialized" "PC sampling initialized" "$TEST_LOG"
+check "real cubin loaded (mock)" "Loaded cubin.*pc_sample_toy" "$TEST_LOG"
+check "modules loaded (parcagpu)" "Module 0x.*loaded" "$TEST_LOG"
+check "source correlation: shmem_bounce" "func=_Z12shmem_bounce.*pc_sample_toy.cu" "$TEST_LOG"
+check "source correlation: hash_churn"  "func=_Z10hash_churn.*pc_sample_toy.cu"  "$TEST_LOG"
+check "source correlation: trig_storm"  "func=_Z10trig_storm.*pc_sample_toy.cu"  "$TEST_LOG"
+check "stall reason map received" "\[ 0\] smsp__pcsamp" "$BPF_LOG"
+check "PC samples contain stall reasons" "smsp__pcsamp" "$BPF_LOG"
+check "cubins loaded (bpf)" "\[CUBIN\].*loaded" "$BPF_LOG"
+check "PC sample events received" "pc_samples=[1-9]" "$BPF_LOG"
+
+if $PASS; then
+  echo
+  echo "=== ALL CHECKS PASSED ==="
+else
+  echo
+  echo "=== SOME CHECKS FAILED ===" >&2
+  exit 1
+fi
diff --git a/test/test-pc-real.sh b/test/test-pc-real.sh
index 6fc4d99..5892957 100755
--- a/test/test-pc-real.sh
+++ b/test/test-pc-real.sh
@@ -45,7 +45,10 @@ trap cleanup EXIT
 
 # --- Launch the toy workload ---
 echo "=== Starting pc_sample_toy ==="
-PARCAGPU_DEBUG=1 PARCAGPU_SAMPLING_FACTOR=18 CUDA_INJECTION64_PATH="$LIB" "$TOY" 8 > "$TOY_LOG" 2>&1 &
+PARCAGPU_DEBUG=1 PARCAGPU_SAMPLING_FACTOR=18 \
+  PARCAGPU_PC_SAMPLING_PROBABILITY=1 \
+  PARCAGPU_PC_SAMPLING_INTERVAL=0.5 \
+  CUDA_INJECTION64_PATH="$LIB" "$TOY" 8 > "$TOY_LOG" 2>&1 &
 TOY_PID=$!
 echo "pc_sample_toy PID: $TOY_PID"
 
@@ -99,7 +102,7 @@ check() {
 }
 
 check "modules loaded (parcagpu)" "Module 0x.*loaded" "$TOY_LOG"
-check "stall reason map received" "stall reason map:" "$BPF_LOG"
+check "stall reason map received" "\[ 0\] smsp__pcsamp" "$BPF_LOG"
 check "PC samples contain stall reasons" "smsp__pcsamp" "$BPF_LOG"
 check "cubins loaded (bpf)" "\[CUBIN\].*loaded" "$BPF_LOG"
 check "PC sample events received" "pc_samples=[1-9]" "$BPF_LOG"
diff --git a/test/test_cupti_prof.c b/test/test_cupti_prof.c
index 30b763a..c16649c 100644
--- a/test/test_cupti_prof.c
+++ b/test/test_cupti_prof.c
@@ -901,6 +901,16 @@ int main(int argc, char **argv) {
         return 0;
     }
 
+    // Fire resource callbacks (CONTEXT_CREATED, MODULE_LOADED) so that
+    // PC sampling initialization runs in the profiler library.
+    typedef void (*FireResourceCallbacksFunc)(void);
+    FireResourceCallbacksFunc fireResourceCbs =
+        (FireResourceCallbacksFunc)dlsym(RTLD_DEFAULT,
+                                         "__mock_cupti_fire_resource_callbacks");
+    if (fireResourceCbs) {
+        fireResourceCbs();
+    }
+
     // Load kernel names if specified
     KernelNameList *kernel_names = NULL;
     if (config.kernel_names) {

From eece2f440b74b2b53c72c10cd203864bf1b38b4b Mon Sep 17 00:00:00 2001
From: Tommy Reilly <gnurizen@gmail.com>
Date: Thu, 9 Apr 2026 09:04:57 -0500
Subject: [PATCH 14/14] Cleanup related to usdt/cupti extraction

usdt headers now live in parca-dev/usdt and the cupti bpf
headers now live in this project.  So we don't need to vendor
otel anymore.
---
 Makefile                           |  5 +-
 ebpf/cupti_bpf.h                   | 87 ++++++++++++++++++++++++++++++
 go.mod                             | 10 ++++
 test/bpf/activity_parser.bpf.c     | 44 +++++++--------
 test/bpf/cupti_activity_bpf.h      | 51 ------------------
 test/bpf/go.mod                    | 16 ++----
 test/bpf/go.sum                    | 46 +++-------------
 test/bpf/main.go                   | 36 +++++--------
 test/bpf/usdt_compat.h             | 25 ++-------
 vendor/opentelemetry-ebpf-profiler |  1 -
 10 files changed, 152 insertions(+), 169 deletions(-)
 create mode 100644 ebpf/cupti_bpf.h
 create mode 100644 go.mod
 delete mode 100644 test/bpf/cupti_activity_bpf.h
 delete mode 160000 vendor/opentelemetry-ebpf-profiler

diff --git a/Makefile b/Makefile
index 9408e80..f64bb34 100644
--- a/Makefile
+++ b/Makefile
@@ -123,7 +123,10 @@ bpf-test:
 		echo "Generating vmlinux.h from kernel BTF..."; \
 		bpftool btf dump file /sys/kernel/btf/vmlinux format c > test/bpf/vmlinux.h; \
 	fi
-	@cd test/bpf && go generate ./... && CGO_ENABLED=0 go build -o activity_parser .
+	@cd test/bpf && \
+		export USDT_HEADERS=$$(go mod download github.com/parca-dev/usdt >/dev/null && \
+			go list -m -f '{{.Dir}}' github.com/parca-dev/usdt)/ebpf && \
+		go generate ./... && CGO_ENABLED=0 go build -o activity_parser .
 	@echo "BPF test built: test/bpf/activity_parser"
 
 # Run test_cupti_prof and BPF activity parser in parallel.
diff --git a/ebpf/cupti_bpf.h b/ebpf/cupti_bpf.h
new file mode 100644
index 0000000..edc8df6
--- /dev/null
+++ b/ebpf/cupti_bpf.h
@@ -0,0 +1,87 @@
+// CUPTI BPF definitions — shared between test/bpf/ and production cuda.ebpf.c.
+//
+// Contains BPF-side layouts for:
+//   - CUpti_ActivityKernel5 (kernel activity records)
+//   - CUpti_PCSamplingPCData (PC sampling records)
+//   - CUpti_PCSamplingStallReason (stall reason entries)
+
+#ifndef CUPTI_BPF_H
+#define CUPTI_BPF_H
+
+// ---------------------------------------------------------------------------
+// Kernel activity records
+// ---------------------------------------------------------------------------
+
+// CUpti_ActivityKind values we care about
+#define CUPTI_ACTIVITY_KIND_KERNEL            3
+#define CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL 10
+
+// Matches the layout of CUpti_ActivityKernel5 exactly.
+// Explicit padding replaces __packed__ to avoid unnecessary unaligned-access
+// handling in BPF.  The struct uses aligned(8) and a static size assert.
+struct cupti_activity_kernel5 {
+  u32 kind;            // offset 0   - CUpti_ActivityKind
+  u8 _pad1[12];        // offset 4   - cacheConfig, sharedMemConfig,
+                       //              registersPerThread,
+                       //              partitionedGlobalCache x2
+  u64 start;           // offset 16  - kernel start timestamp (ns)
+  u64 end;             // offset 24  - kernel end timestamp (ns)
+  u64 completed;       // offset 32  - completion timestamp
+  u32 device_id;       // offset 40
+  u32 context_id;      // offset 44
+  u32 stream_id;       // offset 48
+  u8 _pad2[40];        // offset 52  - gridX/Y/Z, blockX/Y/Z,
+                       //              staticSharedMemory,
+                       //              dynamicSharedMemory,
+                       //              localMemoryPerThread,
+                       //              localMemoryTotal
+  u32 correlation_id;  // offset 92
+  s64 grid_id;         // offset 96
+  u64 name_ptr;        // offset 104 - const char* (user-space pointer)
+  u64 _reserved0;      // offset 112
+  u64 queued;          // offset 120
+  u64 submitted;       // offset 128
+  u8 _pad3[8];         // offset 136 - launchType, isSharedMemoryCarveout,
+                       //              sharedMemoryCarveoutRequested,
+                       //              padding, sharedMemoryExecuted
+  u64 graph_node_id;   // offset 144
+  u32 shmem_limit_cfg; // offset 152 - CUpti_FuncShmemLimitConfig
+  u32 graph_id;        // offset 156
+} __attribute__((aligned(8)));
+
+_Static_assert(
+  sizeof(struct cupti_activity_kernel5) == 160, "cupti_activity_kernel5 size mismatch");
+
+// ---------------------------------------------------------------------------
+// PC sampling records
+// ---------------------------------------------------------------------------
+
+#define STALL_REASON_NAME_LEN 64
+#define MAX_STALL_REASONS     64
+#define MAX_PC_BATCH_SIZE     512
+#define MAX_FUNC_NAME         128
+
+// Matches CUpti_PCSamplingStallReason (packed, aligned 8).
+struct cupti_stall_reason {
+  u32 stall_reason_index;
+  u32 samples;
+};
+
+// Matches CUpti_PCSamplingPCData (packed, aligned 8).
+// Contains user-space pointers that BPF chases with bpf_probe_read_user.
+// We read the base 56-byte struct, then conditionally read correlationId
+// if the size field indicates CUPTI 12.4+ / v22+ (size > 56).
+struct cupti_pc_data {
+  u64 size; // struct size (56 = pre-12.4, 60+ = CUDA 12.4+)
+  u64 cubin_crc;
+  u64 pc_offset;
+  u32 function_index;
+  u32 _pc_pad;
+  u64 function_name_ptr; // const char* in user-space
+  u64 stall_reason_count;
+  u64 stall_reason_ptr; // CUpti_PCSamplingStallReason* in user-space
+} __attribute__((__packed__)) __attribute__((aligned(8)));
+
+#define CUPTI_PC_DATA_BASE_SIZE 56
+
+#endif // CUPTI_BPF_H
diff --git a/go.mod b/go.mod
new file mode 100644
index 0000000..5114572
--- /dev/null
+++ b/go.mod
@@ -0,0 +1,10 @@
+// parcagpu — host-side CUDA profiler sources and shared BPF headers.
+//
+// This module exists primarily so external projects (e.g.
+// opentelemetry-ebpf-profiler) can pull in the canonical BPF-side
+// CUPTI definitions at ebpf/cupti_bpf.h via `go mod download`, the same
+// way they consume github.com/parca-dev/usdt. The Go module has no Go
+// source of its own — ebpf/ is a plain header directory.
+module github.com/parca-dev/parcagpu
+
+go 1.25.0
diff --git a/test/bpf/activity_parser.bpf.c b/test/bpf/activity_parser.bpf.c
index 2327842..cb8c495 100644
--- a/test/bpf/activity_parser.bpf.c
+++ b/test/bpf/activity_parser.bpf.c
@@ -90,35 +90,35 @@ struct error_event {
 };
 
 // Ring buffer for sending events to user-space.
-struct bpf_map_def events SEC("maps") = {
-    .type = BPF_MAP_TYPE_RINGBUF,
-    .max_entries = 1 << 20, // 1 MB
-};
+struct {
+    __uint(type, BPF_MAP_TYPE_RINGBUF);
+    __uint(max_entries, 1 << 20); // 1 MB
+} events SEC(".maps");
 
 // Stall reason name table — indexed by stall reason index.
 // Value is a 64-byte null-terminated string.
-struct bpf_map_def stall_reasons SEC("maps") = {
-    .type = BPF_MAP_TYPE_ARRAY,
-    .key_size = sizeof(u32),
-    .value_size = STALL_REASON_NAME_LEN,
-    .max_entries = MAX_STALL_REASONS,
-};
+struct {
+    __uint(type, BPF_MAP_TYPE_ARRAY);
+    __type(key, u32);
+    __type(value, char[STALL_REASON_NAME_LEN]);
+    __uint(max_entries, MAX_STALL_REASONS);
+} stall_reasons SEC(".maps");
 
 // Whether the stall reason map has been populated.
-struct bpf_map_def stall_map_loaded SEC("maps") = {
-    .type = BPF_MAP_TYPE_ARRAY,
-    .key_size = sizeof(u32),
-    .value_size = sizeof(u32),
-    .max_entries = 1,
-};
+struct {
+    __uint(type, BPF_MAP_TYPE_ARRAY);
+    __type(key, u32);
+    __type(value, u32);
+    __uint(max_entries, 1);
+} stall_map_loaded SEC(".maps");
 
 // Stats counters.
-struct bpf_map_def stats SEC("maps") = {
-    .type = BPF_MAP_TYPE_ARRAY,
-    .key_size = sizeof(u32),
-    .value_size = sizeof(u64),
-    .max_entries = 4,
-};
+struct {
+    __uint(type, BPF_MAP_TYPE_ARRAY);
+    __type(key, u32);
+    __type(value, u64);
+    __uint(max_entries, 4);
+} stats SEC(".maps");
 
 enum stat_key {
   STAT_BATCHES = 0,    // number of batch probe invocations
diff --git a/test/bpf/cupti_activity_bpf.h b/test/bpf/cupti_activity_bpf.h
deleted file mode 100644
index 38086c3..0000000
--- a/test/bpf/cupti_activity_bpf.h
+++ /dev/null
@@ -1,51 +0,0 @@
-// Minimal CUPTI activity buffer definitions for BPF programs.
-// Stripped-down version of CUpti_ActivityKernel5 with exact layout
-// matching the CUDA 12.x CUPTI headers (sizeof = 160, packed aligned 8).
-//
-// Only the fields needed by BPF are named; the rest are padding.
-// Field offsets verified against the real struct with offsetof().
-
-#ifndef CUPTI_ACTIVITY_BPF_H
-#define CUPTI_ACTIVITY_BPF_H
-
-// CUpti_ActivityKind values we care about
-#define CUPTI_ACTIVITY_KIND_KERNEL 3
-#define CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL 10
-
-// Matches the layout of CUpti_ActivityKernel5 exactly.
-// PACKED_ALIGNMENT in CUPTI is: __attribute__((__packed__)) __attribute__((aligned(8)))
-struct cupti_activity_kernel5 {
-  u32 kind;            // offset 0   - CUpti_ActivityKind
-  u8 _pad1[12];        // offset 4   - cacheConfig, sharedMemConfig,
-                        //              registersPerThread,
-                        //              partitionedGlobalCache x2
-  u64 start;            // offset 16  - kernel start timestamp (ns)
-  u64 end;              // offset 24  - kernel end timestamp (ns)
-  u64 completed;        // offset 32  - completion timestamp
-  u32 device_id;        // offset 40
-  u32 context_id;       // offset 44
-  u32 stream_id;        // offset 48
-  u8 _pad2[40];         // offset 52  - gridX/Y/Z, blockX/Y/Z,
-                         //              staticSharedMemory,
-                         //              dynamicSharedMemory,
-                         //              localMemoryPerThread,
-                         //              localMemoryTotal
-  u32 correlation_id;   // offset 92
-  s64 grid_id;          // offset 96
-  u64 name_ptr;         // offset 104 - const char* (user-space pointer)
-  u64 _reserved0;       // offset 112
-  u64 queued;           // offset 120
-  u64 submitted;        // offset 128
-  u8 _pad3[8];          // offset 136 - launchType, isSharedMemoryCarveout,
-                         //              sharedMemoryCarveoutRequested,
-                         //              padding, sharedMemoryExecuted
-  u64 graph_node_id;   // offset 144
-  u32 shmem_limit_cfg; // offset 152 - CUpti_FuncShmemLimitConfig
-  u32 graph_id;         // offset 156
-} __attribute__((__packed__)) __attribute__((aligned(8)));
-
-// Verify expected size at compile time
-_Static_assert(sizeof(struct cupti_activity_kernel5) == 160,
-               "cupti_activity_kernel5 size mismatch");
-
-#endif // CUPTI_ACTIVITY_BPF_H
diff --git a/test/bpf/go.mod b/test/bpf/go.mod
index c674309..b3ab1bf 100644
--- a/test/bpf/go.mod
+++ b/test/bpf/go.mod
@@ -1,22 +1,12 @@
-module github.com/parca-dev/batch-kernel/test/bpf
+module parcagpu/test/bpf
 
 go 1.25.1
 
 require (
 	github.com/cilium/ebpf v0.21.0
 	github.com/gnurizen/sass-table v0.0.1
-	go.opentelemetry.io/ebpf-profiler v0.0.0-00010101000000-000000000000
+	github.com/parca-dev/usdt v0.0.2
 	golang.org/x/sys v0.41.0
 )
 
-require (
-	github.com/cespare/xxhash/v2 v2.3.0 // indirect
-	github.com/google/uuid v1.6.0 // indirect
-	github.com/klauspost/cpuid/v2 v2.2.10 // indirect
-	github.com/minio/sha256-simd v1.0.1 // indirect
-	github.com/rogpeppe/go-internal v1.14.1 // indirect
-	go.opentelemetry.io/otel v1.41.0 // indirect
-	go.opentelemetry.io/otel/metric v1.41.0 // indirect
-)
-
-replace go.opentelemetry.io/ebpf-profiler => ../../vendor/opentelemetry-ebpf-profiler
+require github.com/rogpeppe/go-internal v1.14.1 // indirect
diff --git a/test/bpf/go.sum b/test/bpf/go.sum
index aea53ac..f1dfcbd 100644
--- a/test/bpf/go.sum
+++ b/test/bpf/go.sum
@@ -1,27 +1,15 @@
-github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs=
-github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs=
 github.com/cilium/ebpf v0.21.0 h1:4dpx1J/B/1apeTmWBH5BkVLayHTkFrMovVPnHEk+l3k=
 github.com/cilium/ebpf v0.21.0/go.mod h1:1kHKv6Kvh5a6TePP5vvvoMa1bclRyzUXELSs272fmIQ=
-github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc h1:U9qPSI2PIWSS1VwoXQT9A3Wy9MM3WgvqSxFWenqJduM=
-github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
 github.com/gnurizen/sass-table v0.0.1 h1:LvV2GIUAIv6ZXLzr6TbC51Domg/5iVihhc7JXVGZXe0=
 github.com/gnurizen/sass-table v0.0.1/go.mod h1:epXL4VyR6Yuec1rY5RV9awBdWm6ibGUKRB5zajgyUjo=
-github.com/go-logr/logr v1.4.3 h1:CjnDlHq8ikf6E492q6eKboGOC0T8CDaOvkHCIg8idEI=
-github.com/go-logr/logr v1.4.3/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY=
-github.com/go-logr/stdr v1.2.2 h1:hSWxHoqTgW2S2qGc0LTAI563KZ5YKYRhT3MFKZMbjag=
-github.com/go-logr/stdr v1.2.2/go.mod h1:mMo/vtBO5dYbehREoey6XUKy/eSumjCCveDpRre4VKE=
 github.com/go-quicktest/qt v1.101.1-0.20240301121107-c6c8733fa1e6 h1:teYtXy9B7y5lHTp8V9KPxpYRAVA7dozigQcMiBust1s=
 github.com/go-quicktest/qt v1.101.1-0.20240301121107-c6c8733fa1e6/go.mod h1:p4lGIVX+8Wa6ZPNDvqcxq36XpUDLh42FLetFU7odllI=
 github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8=
 github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU=
-github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0=
-github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
 github.com/josharian/native v1.1.0 h1:uuaP0hAbW7Y4l0ZRQ6C9zfb7Mg1mbFKry/xzDAfmtLA=
 github.com/josharian/native v1.1.0/go.mod h1:7X/raswPFr05uY3HiLlYeyQntB6OO7E/d2Cu7qoaN2w=
-github.com/jsimonetti/rtnetlink/v2 v2.0.3 h1:Jcp7GTnTPepoUAJ9+LhTa7ZiebvNS56T1GtlEUaPNFE=
-github.com/jsimonetti/rtnetlink/v2 v2.0.3/go.mod h1:atIkksp/9fqtf6rpAw45JnttnP2gtuH9X88WPfWfS9A=
-github.com/klauspost/cpuid/v2 v2.2.10 h1:tBs3QSyvjDyFTq3uoc/9xFpCuOsJQFNPiAhYdw2skhE=
-github.com/klauspost/cpuid/v2 v2.2.10/go.mod h1:hqwkgyIinND0mEev00jJYCxPNVRVXFQeu1XKlok6oO0=
+github.com/jsimonetti/rtnetlink/v2 v2.0.1 h1:xda7qaHDSVOsADNouv7ukSuicKZO7GgVUCXxpaIEIlM=
+github.com/jsimonetti/rtnetlink/v2 v2.0.1/go.mod h1:7MoNYNbb3UaDHtF8udiJo/RH6VsTKP1pqKLUTVCvToE=
 github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE=
 github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk=
 github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY=
@@ -30,31 +18,13 @@ github.com/mdlayher/netlink v1.7.2 h1:/UtM3ofJap7Vl4QWCPDGXY8d3GIY2UGSDbK+QWmY8/
 github.com/mdlayher/netlink v1.7.2/go.mod h1:xraEF7uJbxLhc5fpHL4cPe221LI2bdttWlU+ZGLfQSw=
 github.com/mdlayher/socket v0.5.1 h1:VZaqt6RkGkt2OE9l3GcC6nZkqD3xKeQLyfleW/uBcos=
 github.com/mdlayher/socket v0.5.1/go.mod h1:TjPLHI1UgwEv5J1B5q0zTZq12A/6H7nKmtTanQE37IQ=
-github.com/minio/sha256-simd v1.0.1 h1:6kaan5IFmwTNynnKKpDHe6FWHohJOHhCPchzK49dzMM=
-github.com/minio/sha256-simd v1.0.1/go.mod h1:Pz6AKMiUdngCLpeTL/RJY1M9rUuPMYujV5xJjtbRSN8=
-github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 h1:Jamvg5psRIccs7FGNTlIRMkT8wgtp5eCXdBlqhYGL6U=
-github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
+github.com/parca-dev/usdt v0.0.2 h1:bpKQycQ++zV8pwkMaJSxZS07XnEXqO3rkHcLYFJDTl4=
+github.com/parca-dev/usdt v0.0.2/go.mod h1:bjh3OTksk+pyP7WsHWlRKWaMSJTUr0gx0piZ/tAv6/w=
 github.com/rogpeppe/go-internal v1.14.1 h1:UQB4HGPB6osV0SQTLymcB4TgvyWu6ZyliaW0tI/otEQ=
 github.com/rogpeppe/go-internal v1.14.1/go.mod h1:MaRKkUm5W0goXpeCfT7UZI6fk/L7L7so1lCWt35ZSgc=
-github.com/sirupsen/logrus v1.9.4 h1:TsZE7l11zFCLZnZ+teH4Umoq5BhEIfIzfRDZ1Uzql2w=
-github.com/sirupsen/logrus v1.9.4/go.mod h1:ftWc9WdOfJ0a92nsE2jF5u5ZwH8Bv2zdeOC42RjbV2g=
-github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U=
-github.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U=
-go.opentelemetry.io/auto/sdk v1.2.1 h1:jXsnJ4Lmnqd11kwkBV2LgLoFMZKizbCi5fNZ/ipaZ64=
-go.opentelemetry.io/auto/sdk v1.2.1/go.mod h1:KRTj+aOaElaLi+wW1kO/DZRXwkF4C5xPbEe3ZiIhN7Y=
-go.opentelemetry.io/otel v1.41.0 h1:YlEwVsGAlCvczDILpUXpIpPSL/VPugt7zHThEMLce1c=
-go.opentelemetry.io/otel v1.41.0/go.mod h1:Yt4UwgEKeT05QbLwbyHXEwhnjxNO6D8L5PQP51/46dE=
-go.opentelemetry.io/otel/metric v1.41.0 h1:rFnDcs4gRzBcsO9tS8LCpgR0dxg4aaxWlJxCno7JlTQ=
-go.opentelemetry.io/otel/metric v1.41.0/go.mod h1:xPvCwd9pU0VN8tPZYzDZV/BMj9CM9vs00GuBjeKhJps=
-go.opentelemetry.io/otel/trace v1.41.0 h1:Vbk2co6bhj8L59ZJ6/xFTskY+tGAbOnCtQGVVa9TIN0=
-go.opentelemetry.io/otel/trace v1.41.0/go.mod h1:U1NU4ULCoxeDKc09yCWdWe+3QoyweJcISEVa1RBzOis=
-golang.org/x/arch v0.24.0 h1:qlJ3M9upxvFfwRM51tTg3Yl+8CP9vCC1E7vlFpgv99Y=
-golang.org/x/arch v0.24.0/go.mod h1:dNHoOeKiyja7GTvF9NJS1l3Z2yntpQNzgrjh1cU103A=
-golang.org/x/net v0.51.0 h1:94R/GTO7mt3/4wIKpcR5gkGmRLOuE/2hNGeWq/GBIFo=
-golang.org/x/net v0.51.0/go.mod h1:aamm+2QF5ogm02fjy5Bb7CQ0WMt1/WVM7FtyaTLlA9Y=
-golang.org/x/sync v0.19.0 h1:vV+1eWNmZ5geRlYjzm2adRgW2/mcpevXNg50YZtPCE4=
-golang.org/x/sync v0.19.0/go.mod h1:9KTHXmSnoGruLpwFjVSX0lNNA75CykiMECbovNTZqGI=
+golang.org/x/net v0.46.0 h1:giFlY12I07fugqwPuWJi68oOnpfqFnJIJzaIIm2JVV4=
+golang.org/x/net v0.46.0/go.mod h1:Q9BGdFy1y4nkUwiLvT5qtyhAnEHgnQ/zd8PfU6nc210=
+golang.org/x/sync v0.17.0 h1:l60nONMj9l5drqw6jlhIELNv9I0A4OFgRsG9k2oT9Ug=
+golang.org/x/sync v0.17.0/go.mod h1:9KTHXmSnoGruLpwFjVSX0lNNA75CykiMECbovNTZqGI=
 golang.org/x/sys v0.41.0 h1:Ivj+2Cp/ylzLiEU89QhWblYnOE9zerudt9Ftecq2C6k=
 golang.org/x/sys v0.41.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks=
-gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
-gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
diff --git a/test/bpf/main.go b/test/bpf/main.go
index 3a55551..681e4a0 100644
--- a/test/bpf/main.go
+++ b/test/bpf/main.go
@@ -35,13 +35,13 @@ import (
 	ebpf2 "github.com/cilium/ebpf"
 	"github.com/cilium/ebpf/link"
 	"github.com/cilium/ebpf/ringbuf"
-	"go.opentelemetry.io/ebpf-profiler/libpf/pfelf"
+	"github.com/parca-dev/usdt"
 	"golang.org/x/sys/unix"
 
 	sasstable "github.com/gnurizen/sass-table"
 )
 
-//go:generate go run github.com/cilium/ebpf/cmd/bpf2go -target $GOARCH -cflags "-I../../vendor/opentelemetry-ebpf-profiler/support/ebpf" activityParser activity_parser.bpf.c
+//go:generate go run github.com/cilium/ebpf/cmd/bpf2go -target $GOARCH -cflags "-I../../ebpf -I$USDT_HEADERS" activityParser activity_parser.bpf.c
 
 // Event type tags — must match BPF #defines.
 const (
@@ -129,13 +129,13 @@ type textSection struct {
 }
 
 type cubinInfo struct {
-	crc       uint64
-	size      uint64
-	lines     []lineEntry // sorted by addr
-	files     []string    // file table from line table header
-	archSM    int              // SM version from ELF e_flags (e.g. 121)
-	texts     []textSection    // .text sections for instruction decoding
-	tmpPath   string           // temp file for llvm-dwarfdump
+	crc     uint64
+	size    uint64
+	lines   []lineEntry   // sorted by addr
+	files   []string      // file table from line table header
+	archSM  int           // SM version from ELF e_flags (e.g. 121)
+	texts   []textSection // .text sections for instruction decoding
+	tmpPath string        // temp file for llvm-dwarfdump
 }
 
 func newCubinStore(pid int) *cubinStore {
@@ -416,19 +416,9 @@ func main() {
 	defer objs.Close()
 
 	// Parse USDT probes from the shared library's .note.stapsdt section.
-	ef, err := pfelf.Open(realLib)
+	probes, err := usdt.ParseProbesFromFile(realLib)
 	if err != nil {
-		log.Fatalf("Opening ELF %s: %v", realLib, err)
-	}
-	defer ef.Close()
-
-	if err := ef.LoadSections(); err != nil {
-		log.Fatalf("Loading ELF sections: %v", err)
-	}
-
-	probes, err := ef.ParseUSDTProbes()
-	if err != nil {
-		log.Fatalf("Parsing USDT probes: %v", err)
+		log.Fatalf("Parsing USDT probes from %s: %v", realLib, err)
 	}
 
 	// Find USDT probes and attach uprobes at each site.
@@ -458,12 +448,12 @@ func main() {
 				continue
 			}
 
-			spec, err := pfelf.ParseUSDTArguments(probe.Arguments)
+			spec, err := usdt.ParseUSDTArguments(probe.Arguments)
 			if err != nil {
 				log.Fatalf("Parsing USDT args %q: %v", probe.Arguments, err)
 			}
 
-			specBytes := pfelf.USDTSpecToBytes(spec)
+			specBytes := usdt.SpecToBytes(spec)
 			if err := objs.BpfUsdtSpecs.Put(specID, specBytes); err != nil {
 				log.Fatalf("Populating USDT spec map: %v", err)
 			}
diff --git a/test/bpf/usdt_compat.h b/test/bpf/usdt_compat.h
index 4dc35bd..fc3793e 100644
--- a/test/bpf/usdt_compat.h
+++ b/test/bpf/usdt_compat.h
@@ -1,32 +1,17 @@
-// Compatibility shim for including the otel-ebpf-profiler usdt.h and
-// usdt_args.h from our vmlinux.h-based BPF program.
+// Compatibility shim for including parca-dev/usdt's usdt_args.h from our
+// vmlinux.h-based BPF program.
 //
 // Must be included AFTER vmlinux.h + bpf_helpers.h but BEFORE usdt_args.h.
-// Defines the include guards for bpfdefs.h and types.h so those vendor
-// headers (and their deep dependency trees) are skipped entirely.
+// Provides macros that usdt_args.h expects (normally from kernel.h) and
+// bridges bpf2go architecture defines to standard compiler builtins.
 
 #ifndef USDT_COMPAT_H
 #define USDT_COMPAT_H
 
-// Skip vendor bpfdefs.h and types.h — we already have everything from
-// vmlinux.h.
-#define OPTI_BPFDEFS_H
-#define OPTI_TYPES_H
-
-// Macros expected by usdt_args.h (normally from bpfdefs.h).
+// Macros expected by usdt_args.h (normally from kernel.h).
 #define EBPF_INLINE __always_inline
 #define UNUSED __attribute__((unused))
 
-// bpf_map_def — usdt_args.h uses "extern bpf_map_def __bpf_usdt_specs"
-// (C++ style, no struct keyword), so we need a typedef.
-typedef struct bpf_map_def {
-  unsigned int type;
-  unsigned int key_size;
-  unsigned int value_size;
-  unsigned int max_entries;
-  unsigned int map_flags;
-} bpf_map_def;
-
 // bpf2go passes -target bpfel (not the real platform triple), so the
 // usual compiler builtins (__x86_64__, __aarch64__) are never set.
 // Bridge from bpf2go's __TARGET_ARCH_* defines to the builtins that
diff --git a/vendor/opentelemetry-ebpf-profiler b/vendor/opentelemetry-ebpf-profiler
deleted file mode 160000
index 96c49d2..0000000
--- a/vendor/opentelemetry-ebpf-profiler
+++ /dev/null
@@ -1 +0,0 @@
-Subproject commit 96c49d2f29166767c8847260d489b63bc2d4bdbd