diff --git a/.clangd b/.clangd
new file mode 100644
index 0000000..1e71e2d
--- /dev/null
+++ b/.clangd
@@ -0,0 +1,2 @@
+CompileFlags:
+  CompilationDatabase: build-local
diff --git a/.dockerignore b/.dockerignore
new file mode 100644
index 0000000..fe8a30f
--- /dev/null
+++ b/.dockerignore
@@ -0,0 +1,17 @@
+# Git metadata (especially important for proton submodule)
+.git/
+.gitmodules
+.github/
+
+# Build artifacts (in case they exist in directories we copy)
+build/
+build-*/
+*.o
+*.so
+*.so.*
+
+# CMake artifacts
+CMakeCache.txt
+CMakeFiles/
+cmake_install.cmake
+compile_commands.json
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
new file mode 100644
index 0000000..b98edf8
--- /dev/null
+++ b/.github/workflows/build.yml
@@ -0,0 +1,118 @@
+name: Build
+
+on:
+  push:
+    branches: [main, proton]
+    tags: ['v*']
+  pull_request:
+    branches: [main, proton]
+
+env:
+  REGISTRY: ghcr.io
+  IMAGE_NAME: ${{ github.repository }}
+  CUDA_HEADERS: ghcr.io/parca-dev/cuda-headers:12
+
+jobs:
+  build:
+    runs-on: ubuntu-latest
+    permissions:
+      contents: read
+      packages: write
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+        with:
+          submodules: recursive
+
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3
+
+      - name: Build library (amd64)
+        run: |
+          mkdir -p build/amd64
+          docker buildx build -f Dockerfile \
+            --build-arg CUDA_HEADERS=${{ env.CUDA_HEADERS }} \
+            --target export \
+            --output type=local,dest=build/amd64 \
+            --platform linux/amd64 \
+            .
+
+      - name: Verify build artifacts
+        run: |
+          echo "=== Build artifacts ==="
+          ls -lh build/amd64/
+          file build/amd64/libparcagpucupti.so
+          readelf -n build/amd64/libparcagpucupti.so | grep -A2 "stapsdt" | head -20
+
+      - name: Log in to Container Registry
+        if: github.event_name != 'pull_request'
+        uses: docker/login-action@v3
+        with:
+          registry: ${{ env.REGISTRY }}
+          username: ${{ github.actor }}
+          password: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Extract metadata (tags, labels)
+        id: meta
+        uses: docker/metadata-action@v5
+        with:
+          images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}
+          tags: |
+            type=semver,pattern={{version}}
+            type=semver,pattern={{major}}.{{minor}}
+            type=raw,value={{date 'YYYYMMDD'}}-{{sha}},enable={{is_default_branch}}
+            type=raw,value=latest,enable={{is_default_branch}}
+
+      - name: Build and push multi-arch image
+        if: github.event_name != 'pull_request'
+        uses: docker/build-push-action@v5
+        with:
+          context: .
+          file: ./Dockerfile
+          build-args: |
+            CUDA_HEADERS=${{ env.CUDA_HEADERS }}
+          platforms: linux/amd64,linux/arm64
+          target: runtime
+          push: true
+          tags: ${{ steps.meta.outputs.tags }}
+          labels: ${{ steps.meta.outputs.labels }}
+          cache-from: type=gha
+          cache-to: type=gha,mode=max
+
+  release-binaries:
+    if: startsWith(github.ref, 'refs/tags/v')
+    needs: build
+    runs-on: ubuntu-latest
+    permissions:
+      contents: write
+
+    strategy:
+      matrix:
+        arch: [amd64, arm64]
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+        with:
+          submodules: recursive
+
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3
+
+      - name: Build and extract binary for ${{ matrix.arch }}
+        run: |
+          mkdir -p build/${{ matrix.arch }}
+          docker buildx build -f Dockerfile \
+            --build-arg CUDA_HEADERS=${{ env.CUDA_HEADERS }} \
+            --target export \
+            --output type=local,dest=build/${{ matrix.arch }} \
+            --platform linux/${{ matrix.arch }} \
+            .
+          mv build/${{ matrix.arch }}/libparcagpucupti.so \
+             build/${{ matrix.arch }}/libparcagpucupti-${{ matrix.arch }}.so
+
+      - name: Upload to GitHub Release
+        uses: softprops/action-gh-release@v1
+        with:
+          files: build/${{ matrix.arch }}/libparcagpucupti-${{ matrix.arch }}.so
diff --git a/.github/workflows/container.yml b/.github/workflows/container.yml
deleted file mode 100644
index 7e21e01..0000000
--- a/.github/workflows/container.yml
+++ /dev/null
@@ -1,135 +0,0 @@
-name: Container Image Build
-
-on:
-  push:
-    branches:
-      - main
-    tags:
-      - 'v*'
-  pull_request:
-    branches:
-      - main
-
-env:
-  REGISTRY: ghcr.io
-  IMAGE_NAME: ${{ github.repository }}
-  CUDA_HEADERS_REGISTRY: ghcr.io/parca-dev/cuda-headers
-
-jobs:
-  build-and-push:
-    runs-on: ubuntu-latest
-    permissions:
-      contents: read
-      packages: write
-
-    steps:
-      - name: Checkout repository
-        uses: actions/checkout@v4
-
-      - name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@v3
-
-      - name: Log in to Container Registry
-        if: github.event_name != 'pull_request'
-        uses: docker/login-action@v3
-        with:
-          registry: ${{ env.REGISTRY }}
-          username: ${{ github.actor }}
-          password: ${{ secrets.GITHUB_TOKEN }}
-
-      - name: Extract metadata (tags, labels)
-        id: meta
-        uses: docker/metadata-action@v5
-        with:
-          images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}
-          tags: |
-            # For releases: use semantic version tags
-            type=semver,pattern={{version}}
-            type=semver,pattern={{major}}.{{minor}}
-            # For main branch: use date and commit SHA
-            type=raw,value={{date 'YYYYMMDD'}}-{{sha}},enable={{is_default_branch}}
-            # Tag as latest for main branch
-            type=raw,value=latest,enable={{is_default_branch}}
-
-      - name: Build and push multi-arch image with both CUDA versions
-        uses: docker/build-push-action@v5
-        with:
-          context: .
-          file: ./Dockerfile
-          build-args: |
-            CUDA_12_HEADERS=${{ env.CUDA_HEADERS_REGISTRY }}:12
-            CUDA_13_HEADERS=${{ env.CUDA_HEADERS_REGISTRY }}:13
-          platforms: linux/amd64,linux/arm64
-          target: runtime
-          push: ${{ github.event_name != 'pull_request' }}
-          tags: ${{ steps.meta.outputs.tags }}
-          labels: ${{ steps.meta.outputs.labels }}
-          cache-from: type=gha
-          cache-to: type=gha,mode=max
-
-  extract-release-binaries:
-    # Only run this job for tagged releases
-    if: startsWith(github.ref, 'refs/tags/v')
-    needs: build-and-push
-    runs-on: ubuntu-latest
-    permissions:
-      contents: write
-
-    strategy:
-      matrix:
-        include:
-          - cuda_major: "12"
-            cuda_full: "12.9.1"
-            arch: amd64
-            platform: linux/amd64
-          - cuda_major: "12"
-            cuda_full: "12.9.1"
-            arch: arm64
-            platform: linux/arm64
-          - cuda_major: "13"
-            cuda_full: "13.0.2"
-            arch: amd64
-            platform: linux/amd64
-          - cuda_major: "13"
-            cuda_full: "13.0.2"
-            arch: arm64
-            platform: linux/arm64
-
-    steps:
-      - name: Checkout repository
-        uses: actions/checkout@v4
-
-      - name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@v3
-
-      - name: Build and extract binary for ${{ matrix.arch }} (CUDA ${{ matrix.cuda_major }})
-        env:
-          LIB_NAME: libparcagpucupti.so.${{ matrix.cuda_major }}
-        run: |
-          mkdir -p build/${{ matrix.arch }}
-          docker buildx build -f Dockerfile \
-            --build-arg CUDA_12_HEADERS=${{ env.CUDA_HEADERS_REGISTRY }}:12 \
-            --build-arg CUDA_13_HEADERS=${{ env.CUDA_HEADERS_REGISTRY }}:13 \
-            --target export-cuda${{ matrix.cuda_major }} \
-            --output type=local,dest=build/${{ matrix.arch }} \
-            --platform ${{ matrix.platform }} \
-            .
-
-      - name: Rename binary with arch and CUDA version suffix
-        env:
-          LIB_NAME: libparcagpucupti.so.${{ matrix.cuda_major }}
-          OUTPUT_NAME: libparcagpucupti.so.${{ matrix.cuda_major }}-${{ matrix.arch }}
-        run: |
-          mv build/${{ matrix.arch }}/${LIB_NAME} \
-             build/${{ matrix.arch }}/${OUTPUT_NAME}
-
-      - name: Upload binary as artifact
-        uses: actions/upload-artifact@v4
-        with:
-          name: libparcagpucupti.so.${{ matrix.cuda_major }}-${{ matrix.arch }}
-          path: build/${{ matrix.arch }}/libparcagpucupti.so.${{ matrix.cuda_major }}-${{ matrix.arch }}
-
-      - name: Upload to GitHub Release
-        uses: softprops/action-gh-release@v1
-        with:
-          files: build/${{ matrix.arch }}/libparcagpucupti.so.${{ matrix.cuda_major }}-${{ matrix.arch }}
diff --git a/.gitignore b/.gitignore
index 07d7b12..1fb4839 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,9 +1,9 @@
-cupti/build-*
 build/
-.zig-cache/
-zig-out/
 test/build/
 test/bpf/vmlinux.h
 test/bpf/activity_parser
 test/bpf/activityparser_*.go
-test/bpf/activityparser_*.o
\ No newline at end of file
+test/bpf/activityparser_*.o
+src/probes.h
+microbenchmarks/rapid_launch
+microbenchmarks/pc_sample_toy
diff --git a/.gitmodules b/.gitmodules
index e7d7dd2..73f0eb4 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,3 +1,6 @@
 [submodule "vendor/opentelemetry-ebpf-profiler"]
 	path = vendor/opentelemetry-ebpf-profiler
 	url = https://github.com/parca-dev/opentelemetry-ebpf-profiler.git
+[submodule "proton"]
+	path = proton
+	url = https://github.com/parca-dev/proton.git
diff --git a/CMakeLists.txt b/CMakeLists.txt
new file mode 100644
index 0000000..be64c53
--- /dev/null
+++ b/CMakeLists.txt
@@ -0,0 +1,184 @@
+cmake_minimum_required(VERSION 3.18)
+project(parcagpu LANGUAGES C CXX)
+
+set(CMAKE_CXX_STANDARD 17)
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
+
+# Default to Release with debug symbols
+if(NOT CMAKE_BUILD_TYPE)
+    set(CMAKE_BUILD_TYPE RelWithDebInfo)
+endif()
+
+# Compiler flags for different build types
+set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -g -O0")
+set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} -g -O0")
+set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} -g -O2")
+set(CMAKE_C_FLAGS_RELWITHDEBINFO "${CMAKE_C_FLAGS_RELWITHDEBINFO} -g -O2")
+
+# Set proton directory (git submodule or Docker build)
+if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/proton")
+    set(PROTON_DIR "${CMAKE_CURRENT_SOURCE_DIR}/proton")
+elseif(EXISTS "/build/proton")
+    set(PROTON_DIR "/build/proton")
+else()
+    message(FATAL_ERROR "Could not find proton directory. Expected at ${CMAKE_CURRENT_SOURCE_DIR}/proton or /build/proton")
+endif()
+message(STATUS "Using Proton from: ${PROTON_DIR}")
+
+# Find CUDA headers (allow fallback to manual path for header-only builds)
+if(NOT DEFINED CUDA_INCLUDE_DIR)
+    # Prefer /usr/local/cuda over system-installed CUDA packages, which may
+    # ship older headers (e.g. CUDA 12.0) missing symbols that Proton needs.
+    if(NOT DEFINED CUDAToolkit_ROOT AND EXISTS "/usr/local/cuda/include/cuda.h")
+        set(CUDAToolkit_ROOT "/usr/local/cuda")
+    endif()
+    find_package(CUDAToolkit QUIET)
+    if(CUDAToolkit_FOUND)
+        set(CUDA_INCLUDE_DIR ${CUDAToolkit_INCLUDE_DIRS})
+    else()
+        # Fallback for header-only builds (e.g., Docker with ghcr.io/parca-dev/cuda-headers)
+        set(CUDA_INCLUDE_DIR "/usr/local/cuda/include")
+        if(NOT EXISTS "${CUDA_INCLUDE_DIR}/cuda.h")
+            message(FATAL_ERROR "Could not find CUDA headers. Set CUDA_INCLUDE_DIR or install CUDA Toolkit.")
+        endif()
+        message(STATUS "Using CUDA headers from: ${CUDA_INCLUDE_DIR}")
+    endif()
+endif()
+
+# Include directories from proton
+include_directories(
+    ${PROTON_DIR}/csrc/include
+    ${PROTON_DIR}/common/include
+    ${CUDA_INCLUDE_DIR}
+    ${CMAKE_CURRENT_SOURCE_DIR}/src
+)
+
+# Generate USDT probe header and object from probes.d
+find_program(DTRACE dtrace REQUIRED)
+
+add_custom_command(
+    OUTPUT ${CMAKE_CURRENT_SOURCE_DIR}/src/probes.h
+    COMMAND ${DTRACE} -h -s ${CMAKE_CURRENT_SOURCE_DIR}/src/probes.d
+                         -o ${CMAKE_CURRENT_SOURCE_DIR}/src/probes.h
+    DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/src/probes.d
+    COMMENT "Generating USDT probe header (probes.h)"
+)
+
+add_custom_command(
+    OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/probes.o
+    COMMAND ${DTRACE} -G -s ${CMAKE_CURRENT_SOURCE_DIR}/src/probes.d
+                         -o ${CMAKE_CURRENT_BINARY_DIR}/probes.o
+    DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/src/probes.d
+    COMMENT "Generating USDT probe object (probes.o)"
+)
+
+# Collect proton driver source files we need
+# Note: We only link driver APIs and callback utilities (no profiler dependencies)
+set(PROTON_SOURCES
+    ${PROTON_DIR}/csrc/lib/Driver/GPU/CuptiApi.cpp
+    ${PROTON_DIR}/csrc/lib/Driver/GPU/CudaApi.cpp
+    ${PROTON_DIR}/csrc/lib/Profiler/Cupti/CuptiCallbacks.cpp
+)
+
+# Build the shared library
+add_library(parcagpucupti SHARED
+    src/cupti.cpp
+    src/pc_sampling.cpp
+    src/correlation_filter.cpp
+    src/env_config.cpp
+    ${PROTON_SOURCES}
+)
+
+# Wire USDT probe artifacts into the build
+add_custom_target(probes_generated
+    DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/src/probes.h
+            ${CMAKE_CURRENT_BINARY_DIR}/probes.o
+)
+add_dependencies(parcagpucupti probes_generated)
+target_link_libraries(parcagpucupti ${CMAKE_CURRENT_BINARY_DIR}/probes.o)
+
+# Link against system libraries only (NOT libcupti - loaded dynamically by Proton)
+target_link_libraries(parcagpucupti
+    dl
+    pthread
+)
+
+# Set output directory
+set_target_properties(parcagpucupti PROPERTIES
+    LIBRARY_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib"
+    OUTPUT_NAME "parcagpucupti"
+)
+
+# Install target
+install(TARGETS parcagpucupti
+    LIBRARY DESTINATION lib
+)
+
+# Build test infrastructure
+option(BUILD_TESTS "Build test infrastructure" ON)
+
+if(BUILD_TESTS)
+    # Mock CUDA library for testing (minimal - just cuDriverGetVersion)
+    add_library(cuda_mock SHARED
+        test/mock_cuda.c
+    )
+    target_include_directories(cuda_mock PRIVATE
+        ${CUDA_INCLUDE_DIR}
+    )
+    set_target_properties(cuda_mock PROPERTIES
+        OUTPUT_NAME "cuda"
+        LIBRARY_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib"
+    )
+
+    # Mock CUPTI library for testing
+    add_library(cupti_mock SHARED
+        test/mock_cupti.c
+    )
+    target_include_directories(cupti_mock PRIVATE
+        ${CUDA_INCLUDE_DIR}
+    )
+    set_target_properties(cupti_mock PROPERTIES
+        OUTPUT_NAME "cupti"
+        LIBRARY_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib"
+    )
+
+    # Test executable
+    add_executable(test_cupti_prof
+        test/test_cupti_prof.c
+    )
+    target_include_directories(test_cupti_prof PRIVATE
+        ${CUDA_INCLUDE_DIR}
+    )
+    # Link against mock libraries so their symbols are globally visible.
+    # This lets Proton's dlopen(RTLD_NOLOAD) find them, and lets the test
+    # access __cupti_runtime_api_callback etc. via dlsym(RTLD_DEFAULT).
+    # --no-as-needed forces the linker to keep them even though the test
+    # only accesses their symbols via dlsym, not direct references.
+    target_link_libraries(test_cupti_prof
+        dl
+        -Wl,--no-as-needed
+        cuda_mock
+        cupti_mock
+        -Wl,--as-needed
+    )
+    set_target_properties(test_cupti_prof PROPERTIES
+        RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/bin"
+    )
+
+    # Create libcuda.so.1 symlink for test (Proton expects libcuda.so.1)
+    add_custom_command(TARGET cuda_mock POST_BUILD
+        COMMAND ${CMAKE_COMMAND} -E create_symlink
+            libcuda.so
+            ${CMAKE_BINARY_DIR}/lib/libcuda.so.1
+        COMMENT "Creating libcuda.so.1 symlink"
+    )
+
+    # Create libcupti.so.12 symlink for test
+    add_custom_command(TARGET cupti_mock POST_BUILD
+        COMMAND ${CMAKE_COMMAND} -E create_symlink
+            libcupti.so
+            ${CMAKE_BINARY_DIR}/lib/libcupti.so.12
+        COMMENT "Creating libcupti.so.12 symlink"
+    )
+endif()
diff --git a/Dockerfile b/Dockerfile
index b691c6e..b8eddd2 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,83 +1,47 @@
 # Slim multi-platform build for libparcagpucupti.so
 # Uses pre-built CUDA header images instead of full CUDA development images
 # This significantly reduces build time and disk space requirements
+#
+# Thanks to Proton's dynamic CUPTI loading, we only need to build once
+# and the library works with any CUDA version at runtime.
 
-# CUDA header images (can be overridden at build time)
-ARG CUDA_12_HEADERS=ghcr.io/parca-dev/cuda-headers:12
-ARG CUDA_13_HEADERS=ghcr.io/parca-dev/cuda-headers:13
+# CUDA header image (can be overridden at build time)
+ARG CUDA_HEADERS=ghcr.io/parca-dev/cuda-headers:12
 
-# Import CUDA 12 headers
-FROM ${CUDA_12_HEADERS} AS cuda12-headers
+# Import CUDA headers
+FROM ${CUDA_HEADERS} AS cuda-headers
 
-# Import CUDA 13 headers
-FROM ${CUDA_13_HEADERS} AS cuda13-headers
-
-# Build stage for CUDA 12
-FROM ubuntu:22.04 AS builder-cuda12
+# Build stage
+FROM ubuntu:24.04 AS builder
 
 # Install only build tools (no CUDA toolkit needed)
 RUN apt-get update && apt-get install -y \
     cmake \
     make \
-    gcc \
     g++ \
     systemtap-sdt-dev \
     && rm -rf /var/lib/apt/lists/*
 
-WORKDIR /build/cupti
+WORKDIR /build
 
-# Copy CUDA headers and libraries from header image
-COPY --from=cuda12-headers /usr/local/cuda /usr/local/cuda
+# Copy CUDA headers from header image
+COPY --from=cuda-headers /usr/local/cuda /usr/local/cuda
 
-# Copy source code
-COPY cupti/cupti-prof.c cupti/correlation_filter.cpp cupti/correlation_filter.h cupti/CMakeLists.txt ./
+# Copy parcagpu source files and proton submodule
+COPY src /build/src
+COPY proton /build/proton
+COPY CMakeLists.txt /build/
 
-# Build the library for CUDA 12
-ENV CUDA_ROOT=/usr/local/cuda
+# Build the library (disable tests for Docker build)
 RUN mkdir -p build && \
     cd build && \
-    cmake -DCUDA_ROOT=${CUDA_ROOT} -DCMAKE_BUILD_TYPE=RelWithDebInfo .. && \
-    make VERBOSE=1 && \
-    mv libparcagpucupti.so libparcagpucupti.so.12
-
-# Build stage for CUDA 13
-FROM ubuntu:22.04 AS builder-cuda13
-
-# Install only build tools (no CUDA toolkit needed)
-RUN apt-get update && apt-get install -y \
-    cmake \
-    make \
-    gcc \
-    g++ \
-    systemtap-sdt-dev \
-    && rm -rf /var/lib/apt/lists/*
-
-WORKDIR /build/cupti
-
-# Copy CUDA headers and libraries from header image
-COPY --from=cuda13-headers /usr/local/cuda /usr/local/cuda
-
-# Copy source code
-COPY cupti/cupti-prof.c cupti/correlation_filter.cpp cupti/correlation_filter.h cupti/CMakeLists.txt ./
-
-# Build the library for CUDA 13
-ENV CUDA_ROOT=/usr/local/cuda
-RUN mkdir -p build && \
-    cd build && \
-    cmake -DCUDA_ROOT=${CUDA_ROOT} .. && \
-    make VERBOSE=1 && \
-    mv libparcagpucupti.so libparcagpucupti.so.13
-
-# Export stages for extracting single libraries (used by Makefile and release binaries)
-FROM scratch AS export-cuda12
-COPY --from=builder-cuda12 /build/cupti/build/libparcagpucupti.so.12 /
+    cmake -DCUDA_INCLUDE_DIR=/usr/local/cuda/include -DBUILD_TESTS=OFF .. && \
+    make -j$(nproc)
 
-FROM scratch AS export-cuda13
-COPY --from=builder-cuda13 /build/cupti/build/libparcagpucupti.so.13 /
+# Export stage for extracting the library (used by Makefile and release binaries)
+FROM scratch AS export
+COPY --from=builder /build/build/lib/libparcagpucupti.so /
 
-# Runtime image with both CUDA versions (for container registry)
+# Runtime image (for container registry)
 FROM busybox:latest AS runtime
-COPY --from=builder-cuda12 /build/cupti/build/libparcagpucupti.so.12 /usr/lib/
-COPY --from=builder-cuda13 /build/cupti/build/libparcagpucupti.so.13 /usr/lib/
-# Default symlink points to CUDA 12
-RUN ln -s /usr/lib/libparcagpucupti.so.12 /usr/lib/libparcagpucupti.so
+COPY --from=builder /build/build/lib/libparcagpucupti.so /usr/lib/libparcagpucupti.so
diff --git a/Dockerfile.test b/Dockerfile.test
index 4137314..65bf375 100644
--- a/Dockerfile.test
+++ b/Dockerfile.test
@@ -9,20 +9,22 @@ RUN apt-get update && apt-get install -y \
 WORKDIR /test
 
 # Create directory structure
-RUN mkdir -p cupti/build-amd64 zig-out/bin zig-out/lib
+RUN mkdir -p build/bin build/lib
 
 # Copy the built library
-COPY cupti/build-amd64/libparcagpucupti.so ./cupti/build-amd64/
+COPY build/lib/libparcagpucupti.so ./build/lib/
 
-# Copy the zig-built test files
-COPY zig-out/bin/test_cupti_prof ./zig-out/bin/
-COPY zig-out/lib/libcupti.so ./zig-out/lib/
-COPY zig-out/lib/libcupti.so.12 ./zig-out/lib/
+# Copy the test files
+COPY build/bin/test_cupti_prof ./build/bin/
+COPY build/lib/libcuda.so ./build/lib/
+COPY build/lib/libcuda.so.1 ./build/lib/
+COPY build/lib/libcupti.so ./build/lib/
+COPY build/lib/libcupti.so.12 ./build/lib/
 
 # Set environment
-ENV LD_LIBRARY_PATH=/test/zig-out/lib
+ENV LD_LIBRARY_PATH=/test/build/lib
 ENV PARCAGPU_DEBUG=1
 
 # Run tests by default
-ENTRYPOINT ["/test/zig-out/bin/test_cupti_prof", "/test/cupti/build-amd64/libparcagpucupti.so"]
+ENTRYPOINT ["/test/build/bin/test_cupti_prof", "/test/build/lib/libparcagpucupti.so"]
 CMD []
diff --git a/Makefile b/Makefile
index fedd0c8..f64bb34 100644
--- a/Makefile
+++ b/Makefile
@@ -1,153 +1,90 @@
-.PHONY: all clean test cupti-amd64 cupti-arm64 cupti-all cupti-all-versions cross test-infra docker-push push-cuda-headers docker-test-build docker-test-run format bpf-test test-multi
+.PHONY: all clean test build-amd64 build-arm64 build-all cross docker-push docker-test-build docker-test-run format local debug bpf-test microbenchmarks test-multi test-pc-real test-pc-mock
 
-# CUDA version configuration
-CUDA_MAJOR ?= 12
-CUDA_FULL_VERSION ?= 12.9.1
-LIB_NAME = libparcagpucupti.so.$(CUDA_MAJOR)
+LIB_NAME = libparcagpucupti.so
 
-# Default target: build all CUDA versions (12 & 13) for both architectures and test infrastructure
-all: cupti-all-versions test-infra
-
-local:
-	mkdir -p build && \
-    cmake -DCMAKE_BUILD_TYPE=RelWithDebInfo -B build -S cupti && \
-    cd build && make -j$(nproc)
+# Default target: build for both architectures
+all: build-all
 
 # Build libparcagpucupti.so for AMD64 using Docker
-cupti-amd64:
-	@echo "=== Building $(LIB_NAME) for AMD64 with Docker (CUDA $(CUDA_MAJOR)) ==="
+build-amd64:
+	@echo "=== Building $(LIB_NAME) for AMD64 with Docker ==="
 	@mkdir -p /tmp/parcagpu-build-amd64
 	@docker buildx create --name parcagpu-builder --use --bootstrap 2>/dev/null || docker buildx use parcagpu-builder
 	@docker buildx build -f Dockerfile \
-		--build-arg CUDA_12_HEADERS=$(CUDA_12_HEADERS) \
-		--build-arg CUDA_13_HEADERS=$(CUDA_13_HEADERS) \
-		--target export-cuda$(CUDA_MAJOR) \
+		--target export \
 		--output type=local,dest=/tmp/parcagpu-build-amd64 \
 		--platform linux/amd64 .
-	@mkdir -p build/$(CUDA_MAJOR)/amd64
-	@cp /tmp/parcagpu-build-amd64/$(LIB_NAME) build/$(CUDA_MAJOR)/amd64/
-	@ln -sf $(LIB_NAME) build/$(CUDA_MAJOR)/amd64/libparcagpucupti.so
-	@echo "AMD64 library built: build/$(CUDA_MAJOR)/amd64/$(LIB_NAME)"
+	@mkdir -p build/amd64
+	@cp /tmp/parcagpu-build-amd64/$(LIB_NAME) build/amd64/
+	@echo "AMD64 library built: build/amd64/$(LIB_NAME)"
 
 # Build libparcagpucupti.so for ARM64 using Docker
-cupti-arm64:
-	@echo "=== Building $(LIB_NAME) for ARM64 with Docker (CUDA $(CUDA_MAJOR)) ==="
+build-arm64:
+	@echo "=== Building $(LIB_NAME) for ARM64 with Docker ==="
 	@mkdir -p /tmp/parcagpu-build-arm64
 	@docker buildx create --name parcagpu-builder --use --bootstrap 2>/dev/null || docker buildx use parcagpu-builder
 	@docker buildx build -f Dockerfile \
-		--build-arg CUDA_12_HEADERS=$(CUDA_12_HEADERS) \
-		--build-arg CUDA_13_HEADERS=$(CUDA_13_HEADERS) \
-		--target export-cuda$(CUDA_MAJOR) \
+		--target export \
 		--output type=local,dest=/tmp/parcagpu-build-arm64 \
 		--platform linux/arm64 .
-	@mkdir -p build/$(CUDA_MAJOR)/arm64
-	@cp /tmp/parcagpu-build-arm64/$(LIB_NAME) build/$(CUDA_MAJOR)/arm64/
-	@ln -sf $(LIB_NAME) build/$(CUDA_MAJOR)/arm64/libparcagpucupti.so
-	@echo "ARM64 library built: build/$(CUDA_MAJOR)/arm64/$(LIB_NAME)"
-
-# Build both architectures for current CUDA version (controlled by CUDA_MAJOR variable)
-# Example: make cupti-all CUDA_MAJOR=13
-cupti-all: cupti-amd64 cupti-arm64
-
-# Build all local artifacts: CUDA 12 & 13 for both amd64 and arm64
-# This extracts the .so files to build/12/{amd64,arm64}/ and build/13/{amd64,arm64}/
-# Use this target when you want local build artifacts for testing
-cupti-all-versions:
-	@echo "=== Building all CUDA versions (12 and 13) for both architectures ==="
-	@$(MAKE) cupti-amd64 CUDA_MAJOR=12 CUDA_FULL_VERSION=12.9.1
-	@$(MAKE) cupti-arm64 CUDA_MAJOR=12 CUDA_FULL_VERSION=12.9.1
-	@$(MAKE) cupti-amd64 CUDA_MAJOR=13 CUDA_FULL_VERSION=13.0.2
-	@$(MAKE) cupti-arm64 CUDA_MAJOR=13 CUDA_FULL_VERSION=13.0.2
+	@mkdir -p build/arm64
+	@cp /tmp/parcagpu-build-arm64/$(LIB_NAME) build/arm64/
+	@echo "ARM64 library built: build/arm64/$(LIB_NAME)"
+
+# Build both architectures
+build-all: build-amd64 build-arm64
 	@echo "=== All artifacts built ==="
-	@echo "CUDA 12 amd64: build/12/amd64/libparcagpucupti.so.12"
-	@echo "CUDA 12 arm64: build/12/arm64/libparcagpucupti.so.12"
-	@echo "CUDA 13 amd64: build/13/amd64/libparcagpucupti.so.13"
-	@echo "CUDA 13 arm64: build/13/arm64/libparcagpucupti.so.13"
+	@echo "AMD64: build/amd64/$(LIB_NAME)"
+	@echo "ARM64: build/arm64/$(LIB_NAME)"
 
-# Build runtime container image with both CUDA versions for both architectures
-# Note: This builds a Docker image but doesn't extract local artifacts or load into Docker
+# Build runtime container image for both architectures
 # Multi-platform images stay in buildx cache. Use docker-push to push to registry.
 cross:
-	@echo "=== Building runtime container for AMD64 and ARM64 (includes CUDA 12 and 13) ==="
+	@echo "=== Building runtime container for AMD64 and ARM64 ==="
 	@docker buildx create --name parcagpu-builder --use --bootstrap 2>/dev/null || docker buildx use parcagpu-builder
 	@docker buildx build -f Dockerfile \
-		--build-arg CUDA_12_HEADERS=$(CUDA_12_HEADERS) \
-		--build-arg CUDA_13_HEADERS=$(CUDA_13_HEADERS) \
 		--target runtime \
 		--platform linux/amd64,linux/arm64 \
 		.
 	@echo "Runtime container built for both platforms (cached, not loaded into Docker)"
 
-# CUDA header image configuration
-# Can be overridden to use local images (e.g., make cupti-all CUDA_12_HEADERS=cuda-headers:12)
-CUDA_HEADERS_REGISTRY ?= ghcr.io/parca-dev/cuda-headers
-CUDA_12_HEADERS ?= $(CUDA_HEADERS_REGISTRY):12
-CUDA_13_HEADERS ?= $(CUDA_HEADERS_REGISTRY):13
-
-# Build and push CUDA header images to registry
-# These are lightweight images (~35MB each) containing only CUDA headers and libcupti
-# Note: Only needs to be run manually when:
-#   - CUDA versions are updated (12.9.1 -> 12.x.x, 13.0.2 -> 13.x.x)
-#   - New CUDA major versions are added
-#   - CUPTI API changes require header updates
-push-cuda-headers:
-	@echo "=== Building and pushing CUDA header images ==="
-	@docker buildx create --name parcagpu-builder --use --bootstrap 2>/dev/null || docker buildx use parcagpu-builder
-	@echo "Building CUDA 12 headers..."
-	@docker buildx build -f Dockerfile.cuda-headers \
-		--build-arg CUDA_VERSION=12.9.1 \
-		--platform linux/amd64,linux/arm64 \
-		--tag $(CUDA_HEADERS_REGISTRY):12 \
-		--push \
-		.
-	@echo "Building CUDA 13 headers..."
-	@docker buildx build -f Dockerfile.cuda-headers \
-		--build-arg CUDA_VERSION=13.0.2 \
-		--platform linux/amd64,linux/arm64 \
-		--tag $(CUDA_HEADERS_REGISTRY):13 \
-		--push \
-		.
-	@echo "CUDA header images pushed to $(CUDA_HEADERS_REGISTRY):12 and :13"
-
-# Build test infrastructure with CMake
-test-infra:
-	@echo "=== Building test infrastructure with CMake ==="
-	@mkdir -p test/build
-	@cd test/build && cmake .. -DCUDA_MAJOR_VERSION=$(CUDA_MAJOR) && make
-
-# Run tests (using AMD64 library)
-test: cupti-amd64 test-infra
-	@./test.sh
-
-# Run advanced test (8 GPUs @ 2500 launches/s, multi-threaded)
-test-advanced: cupti-amd64 test-infra
-	@echo "=== Running advanced test (8 GPUs @ 2500 launches/s) ==="
-	@cd test/build && LD_LIBRARY_PATH=.:$$LD_LIBRARY_PATH \
-		./test_cupti_prof ../../build/$(CUDA_MAJOR)/amd64/libparcagpucupti.so \
-		--threads=4 --num-gpus=8 --launch-rate=2500 --duration=10
+# Local build with CMake (for development/testing) - default is Release with symbols
+local:
+	@echo "=== Building locally with CMake (RelWithDebInfo) ==="
+	@cmake -B build-local -S . -DCMAKE_BUILD_TYPE=RelWithDebInfo
+	@cmake --build build-local
+	@echo "Local build complete: build-local/lib/$(LIB_NAME)"
+
+# Debug build with CMake (full debug, no optimizations)
+debug:
+	@echo "=== Building debug version with CMake ==="
+	@cmake -B build-local -S . -DCMAKE_BUILD_TYPE=Debug
+	@cmake --build build-local
+	@echo "Debug build complete: build-local/lib/$(LIB_NAME)"
+
+# Run local tests
+test: local
+	@echo "=== Running tests ==="
+	@LD_LIBRARY_PATH="$(CURDIR)/build-local/lib:$$LD_LIBRARY_PATH" \
+		./build-local/bin/test_cupti_prof build-local/lib/libparcagpucupti.so --duration=5
 
 # Clean build artifacts
 clean:
 	@echo "=== Cleaning build artifacts ==="
-	@rm -rf cupti/build cupti/build-amd64 cupti/build-arm64 build
-	@rm -rf test/build
-	@rm -f test/bpf/activity_parser test/bpf/activityparser_*.go test/bpf/activityparser_*.o
+	@rm -rf build build-local bin lib zig-out .zig-cache
+	@rm -rf CMakeCache.txt CMakeFiles/ cmake_install.cmake compile_commands.json
 	@echo "Clean complete"
 
 # Build and push multi-arch Docker images to ghcr.io
 # Set IMAGE_TAG to override the default tag (e.g., make docker-push IMAGE_TAG=v1.0.0)
 # Set IMAGE to override the image name (e.g., make docker-push IMAGE=ghcr.io/myuser/parcagpu)
-# Set CUDA_12_HEADERS and CUDA_13_HEADERS to override header images (e.g., cuda-headers:12 for local)
-# Note: Runtime image includes both CUDA 12 and 13
 IMAGE ?= ghcr.io/parca-dev/parcagpu
 IMAGE_TAG ?= latest
 docker-push:
 	@echo "=== Setting up buildx builder ==="
 	@docker buildx create --name parcagpu-builder --use --bootstrap 2>/dev/null || docker buildx use parcagpu-builder
-	@echo "=== Building and pushing multi-arch Docker images to $(IMAGE):$(IMAGE_TAG) (includes CUDA 12 and 13) ==="
+	@echo "=== Building and pushing multi-arch Docker images to $(IMAGE):$(IMAGE_TAG) ==="
 	@docker buildx build -f Dockerfile \
-		--build-arg CUDA_12_HEADERS=$(CUDA_12_HEADERS) \
-		--build-arg CUDA_13_HEADERS=$(CUDA_13_HEADERS) \
 		--target runtime \
 		--platform linux/amd64,linux/arm64 \
 		--tag $(IMAGE):$(IMAGE_TAG) \
@@ -156,7 +93,7 @@ docker-push:
 	@echo "Images pushed successfully to $(IMAGE):$(IMAGE_TAG)"
 
 # Build test container image
-docker-test-build: cupti-amd64 test-infra
+docker-test-build: build-amd64
 	@echo "=== Building test container image ==="
 	@docker build -f Dockerfile.test -t parcagpu-test:latest .
 	@echo "Test container built: parcagpu-test:latest"
@@ -167,6 +104,17 @@ docker-test-run: docker-test-build
 	@echo "=== Running tests in container ==="
 	@docker run --rm parcagpu-test:latest $(ARGS)
 
+# Build microbenchmark CUDA toys (with DWARF debug info for cubin symbolization)
+NVCC ?= nvcc
+CUDA_ARCH ?= native
+MICROBENCH_SRCS := $(wildcard microbenchmarks/*.cu)
+MICROBENCH_BINS := $(MICROBENCH_SRCS:.cu=)
+
+microbenchmarks: $(MICROBENCH_BINS)
+
+microbenchmarks/%: microbenchmarks/%.cu
+	$(NVCC) -g -lineinfo -arch=$(CUDA_ARCH) -o $@ $<
+
 # Build the BPF activity parser test program
 # Requires: clang, libbpf-dev, bpftool (for vmlinux.h), Go 1.21+
 bpf-test:
@@ -175,21 +123,20 @@ bpf-test:
 		echo "Generating vmlinux.h from kernel BTF..."; \
 		bpftool btf dump file /sys/kernel/btf/vmlinux format c > test/bpf/vmlinux.h; \
 	fi
-	@cd test/bpf && go generate ./... && CGO_ENABLED=0 go build -o activity_parser .
+	@cd test/bpf && \
+		export USDT_HEADERS=$$(go mod download github.com/parca-dev/usdt >/dev/null && \
+			go list -m -f '{{.Dir}}' github.com/parca-dev/usdt)/ebpf && \
+		go generate ./... && CGO_ENABLED=0 go build -o activity_parser .
 	@echo "BPF test built: test/bpf/activity_parser"
 
 # Run test_cupti_prof and BPF activity parser in parallel.
-# The BPF test attaches to the test program's PID and logs kernel activities.
+# The BPF test attaches to the activity_batch USDT probe and logs kernel activities.
 # Requires root (sudo) for BPF.
-test-multi: cupti-amd64 test-infra bpf-test
+test-multi: local bpf-test
 	@echo "=== Running test with BPF activity parser ==="
-	@case "$$(uname -m)" in \
-		aarch64|arm64) ARCH=arm64 ;; \
-		*) ARCH=amd64 ;; \
-	esac; \
-	LIB_PATH="build/$(CUDA_MAJOR)/$${ARCH}/libparcagpucupti.so"; \
-	export LD_LIBRARY_PATH="$$(pwd)/test/build:$$LD_LIBRARY_PATH"; \
-	test/build/test_cupti_prof "$${LIB_PATH}" --kernel-names=kernel_names.txt --duration=10 & \
+	@LIB_PATH="build-local/lib/libparcagpucupti.so"; \
+	export LD_LIBRARY_PATH="$(CURDIR)/build-local/lib:$$LD_LIBRARY_PATH"; \
+	./build-local/bin/test_cupti_prof "$${LIB_PATH}" --kernel-names=kernel_names.txt --duration=10 & \
 	TEST_PID=$$!; \
 	sleep 1; \
 	echo "test_cupti_prof PID: $${TEST_PID}"; \
@@ -202,5 +149,15 @@ test-multi: cupti-amd64 test-infra bpf-test
 	sudo kill $${BPF_PID} 2>/dev/null; wait $${BPF_PID} 2>/dev/null; \
 	echo "=== test-multi completed (test exit: $${TEST_EXIT}) ==="
 
+# Run pc_sample_toy with BPF activity parser and verify stall reason map is received.
+# Requires: real GPU, root (sudo) for BPF, pc_sample_toy compiled separately.
+test-pc-real: local bpf-test microbenchmarks
+	sudo -E test/test-pc-real.sh
+
+# Mock PC sampling test — no GPU required, uses mock CUPTI/CUDA.
+test-pc-mock: local bpf-test
+	sudo -E test/test-pc-mock.sh
+
 format:
-	clang-format -i -style=file cupti/*.[ch]
+	@echo "=== Formatting source files ==="
+	@clang-format -i -style=file src/*.cpp src/*.h test/*.c
diff --git a/README_TEST.md b/README_TEST.md
index 1c90651..f797556 100644
--- a/README_TEST.md
+++ b/README_TEST.md
@@ -13,9 +13,9 @@ make test
 ```
 
 This builds:
-- `cupti/build/libparcagpucupti.so` - Production library (CMake + real CUPTI)
-- `zig-out/lib/libcupti.so` - Mock CUPTI for test infrastructure
-- `zig-out/bin/test_cupti_prof` - Test program
+- `build/lib/libparcagpucupti.so` - Production library (CMake)
+- `build/lib/libcupti.so` - Mock CUPTI for test infrastructure
+- `build/bin/test_cupti_prof` - Test program
 
 ## Quick Start
 
@@ -39,7 +39,7 @@ This will:
 
 For extended testing or continuous probe monitoring:
 ```bash
-LD_LIBRARY_PATH=zig-out/lib zig-out/bin/test_cupti_prof zig-out/lib/libparcagpucupti.so --forever
+LD_LIBRARY_PATH=build/lib build/bin/test_cupti_prof build/lib/libparcagpucupti.so --forever
 ```
 
 This runs indefinitely at 1000 events/second until interrupted (Ctrl-C).
diff --git a/bpft_test.sh b/bpft_test.sh
new file mode 100755
index 0000000..2c2a285
--- /dev/null
+++ b/bpft_test.sh
@@ -0,0 +1,22 @@
+#!/bin/bash
+set -eu
+
+SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
+LIB="$SCRIPT_DIR/build-local/lib/libparcagpucupti.so"
+
+if [ ! -f "$LIB" ]; then
+    echo "error: $LIB not found" >&2
+    exit 1
+fi
+
+export PARCAGPU_DEBUG=1
+PARCAGPU_SAMPLING_FACTOR=18 CUDA_INJECTION64_PATH="$LIB" "$SCRIPT_DIR/microbenchmarks/pc_sample_toy" 4 &
+TOY_PID=$!
+trap "kill $TOY_PID 2>/dev/null; wait $TOY_PID 2>/dev/null" EXIT
+
+# Wait for the injection library to be loaded (CUDA must initialize first)
+while kill -0 "$TOY_PID" 2>/dev/null && ! grep -q libparcagpucupti "/proc/$TOY_PID/maps" 2>/dev/null; do
+    sleep 0.1
+done
+
+exec bpftrace -p "$TOY_PID" "$SCRIPT_DIR/parcagpu.bt"
diff --git a/cupti/CMakeLists.txt b/cupti/CMakeLists.txt
deleted file mode 100644
index ead5ead..0000000
--- a/cupti/CMakeLists.txt
+++ /dev/null
@@ -1,68 +0,0 @@
-cmake_minimum_required(VERSION 3.18)
-project(parcagpucupti C CXX)
-
-# CUDA root directory (can be overridden via -DCUDA_ROOT=...)
-if(NOT DEFINED CUDA_ROOT)
-    set(CUDA_ROOT "/usr/local/cuda" CACHE PATH "CUDA installation directory")
-endif()
-
-# CUDA library directory
-if(NOT DEFINED CUDA_LIBDIR)
-    set(CUDA_LIBDIR "${CUDA_ROOT}/lib64" CACHE PATH "CUDA library directory")
-endif()
-
-# Set include directories for CUDA headers
-set(CUDAToolkit_INCLUDE_DIRS
-    "${CUDA_ROOT}/include"
-    "${CUDA_ROOT}/extras/CUPTI/include"
-)
-
-# Add CUDA library directory to link directories
-link_directories(${CUDA_LIBDIR})
-
-# Create shared library with both C and C++ sources
-add_library(parcagpucupti SHARED cupti-prof.c correlation_filter.cpp)
-
-# Set properties
-set_target_properties(parcagpucupti PROPERTIES
-    C_STANDARD 11
-    C_STANDARD_REQUIRED ON
-    CXX_STANDARD 17
-    CXX_STANDARD_REQUIRED ON
-    POSITION_INDEPENDENT_CODE ON
-)
-
-# Add debug symbols and disable C++ exceptions
-target_compile_options(parcagpucupti PRIVATE -g -fno-exceptions)
-
-# Include directories
-target_include_directories(parcagpucupti PRIVATE
-    ${CUDAToolkit_INCLUDE_DIRS}
-)
-
-# systemtap headers are now in /usr/include/sys/ (copied in Dockerfile for cross-compilation)
-
-# Find the CUPTI library explicitly
-find_library(CUPTI_LIBRARY
-    NAMES cupti
-    PATHS ${CUDA_LIBDIR}
-    REQUIRED
-)
-
-# Find the CUDA driver library
-find_library(CUDA_LIBRARY
-    NAMES cuda
-    PATHS ${CUDA_LIBDIR} ${CUDA_LIBDIR}/stubs
-    REQUIRED
-)
-
-# Link libraries
-target_link_libraries(parcagpucupti PRIVATE
-    ${CUPTI_LIBRARY}
-    ${CUDA_LIBRARY}
-)
-
-# Installation
-install(TARGETS parcagpucupti
-    LIBRARY DESTINATION lib
-)
\ No newline at end of file
diff --git a/cupti/correlation_filter.cpp b/cupti/correlation_filter.cpp
deleted file mode 100644
index 8d48dfa..0000000
--- a/cupti/correlation_filter.cpp
+++ /dev/null
@@ -1,240 +0,0 @@
-#include "correlation_filter.h"
-#include <unordered_set>
-#include <unordered_map>
-#include <mutex>
-#include <memory>
-
-// CorrelationFilter implementation using std::unordered_set with mutex protection
-// This provides thread-safe access with minimal overhead for our use case
-class CorrelationFilter {
-public:
-    CorrelationFilter() = default;
-
-    // Insert a correlation ID into the filter
-    // Thread-safe
-    void insert(uint32_t correlation_id) {
-        std::lock_guard<std::mutex> lock(mutex_);
-        set_.insert(correlation_id);
-    }
-
-    // Check if correlation ID exists and remove it atomically
-    // Returns true if found and removed, false if not found
-    // Thread-safe
-    bool check_and_remove(uint32_t correlation_id) {
-        std::lock_guard<std::mutex> lock(mutex_);
-        auto it = set_.find(correlation_id);
-        if (it != set_.end()) {
-            set_.erase(it);
-            return true;
-        }
-        return false;
-    }
-
-    // Get current size
-    // Thread-safe
-    size_t size() const {
-        std::lock_guard<std::mutex> lock(mutex_);
-        return set_.size();
-    }
-
-private:
-    std::unordered_set<uint32_t> set_;
-    mutable std::mutex mutex_;
-};
-
-// GraphCorrelationMap implementation for tracking graph launches across buffer cycles
-// Uses a 2-slot state machine per correlation ID to detect when graph launches are complete
-struct GraphCorrelationEntry {
-    uint8_t state[2];         // State for alternating cycles
-    bool ever_seen_kernel;    // True once we've seen at least one kernel activity
-    uint32_t insertion_cycle; // Buffer cycle when entry was created (for fallback cleanup)
-
-    GraphCorrelationEntry(uint32_t cycle)
-        : state{GRAPH_STATE_UNINITIALIZED, GRAPH_STATE_UNINITIALIZED}
-        , ever_seen_kernel(false)
-        , insertion_cycle(cycle) {}
-};
-
-class GraphCorrelationMap {
-public:
-    GraphCorrelationMap() : current_cycle_(0) {}
-
-    // Insert a correlation ID (called when sampling a graph launch)
-    // Thread-safe
-    void insert(uint32_t correlation_id) {
-        std::lock_guard<std::mutex> lock(mutex_);
-        map_.emplace(correlation_id, GraphCorrelationEntry(current_cycle_));
-    }
-
-    // Start a new processing cycle - clear the appropriate slot for all entries
-    // Thread-safe
-    void cycle_start(uint32_t cycle) {
-        std::lock_guard<std::mutex> lock(mutex_);
-        current_cycle_ = cycle;
-        uint32_t slot = cycle % 2;
-        for (auto& pair : map_) {
-            pair.second.state[slot] = GRAPH_STATE_CYCLE_CLEARED;
-        }
-    }
-
-    // Check if correlation ID is tracked and mark as seen for this cycle
-    // Returns true if tracked (should fire probe)
-    // Thread-safe
-    bool check_and_mark_seen(uint32_t correlation_id, uint32_t cycle) {
-        std::lock_guard<std::mutex> lock(mutex_);
-        auto it = map_.find(correlation_id);
-        if (it != map_.end()) {
-            uint32_t slot = cycle % 2;
-            it->second.state[slot] = GRAPH_STATE_KERNEL_SEEN;
-            it->second.ever_seen_kernel = true;  // Mark that we've seen at least one kernel
-            return true;
-        }
-        return false;
-    }
-
-    // End processing cycle - remove entries based on two conditions:
-    // 1. Primary: Both slots CYCLE_CLEARED AND we've seen at least one kernel (graph completed)
-    // 2. Fallback: Both slots CYCLE_CLEARED AND never seen kernel AND age > 100 cycles
-    //    (handles GPU reset, failed launches, etc.)
-    // Thread-safe
-    void cycle_end() {
-        std::lock_guard<std::mutex> lock(mutex_);
-        size_t removed_normal = 0;
-        size_t removed_fallback = 0;
-
-        for (auto it = map_.begin(); it != map_.end(); ) {
-            bool should_remove = false;
-            bool is_fallback = false;
-
-            if (it->second.state[0] == GRAPH_STATE_CYCLE_CLEARED &&
-                it->second.state[1] == GRAPH_STATE_CYCLE_CLEARED) {
-
-                if (it->second.ever_seen_kernel) {
-                    // Primary: Graph completed normally (saw kernels, then stopped)
-                    should_remove = true;
-                    removed_normal++;
-                } else if ((current_cycle_ - it->second.insertion_cycle) > 100) {
-                    // Fallback: Never saw kernels and entry is very old (>100 cycles)
-                    // Prevents leaking entries when GPU resets or launches fail
-                    should_remove = true;
-                    is_fallback = true;
-                    removed_fallback++;
-                }
-            }
-
-            if (should_remove) {
-                it = map_.erase(it);
-            } else {
-                ++it;
-            }
-        }
-    }
-
-    // Get cleanup stats (for debugging)
-    void get_stats(size_t& size, size_t& oldest_age) const {
-        std::lock_guard<std::mutex> lock(mutex_);
-        size = map_.size();
-        oldest_age = 0;
-        for (const auto& pair : map_) {
-            uint32_t age = current_cycle_ - pair.second.insertion_cycle;
-            if (age > oldest_age) {
-                oldest_age = age;
-            }
-        }
-    }
-
-    // Get current size
-    // Thread-safe
-    size_t size() const {
-        std::lock_guard<std::mutex> lock(mutex_);
-        return map_.size();
-    }
-
-private:
-    std::unordered_map<uint32_t, GraphCorrelationEntry> map_;
-    uint32_t current_cycle_;
-    mutable std::mutex mutex_;
-};
-
-// C API implementation
-extern "C" {
-
-CorrelationFilterHandle correlation_filter_create(void) {
-    return new CorrelationFilter();
-}
-
-void correlation_filter_destroy(CorrelationFilterHandle filter) {
-    if (filter) {
-        delete static_cast<CorrelationFilter*>(filter);
-    }
-}
-
-void correlation_filter_insert(CorrelationFilterHandle filter, uint32_t correlation_id) {
-    if (filter) {
-        static_cast<CorrelationFilter*>(filter)->insert(correlation_id);
-    }
-}
-
-bool correlation_filter_check_and_remove(CorrelationFilterHandle filter, uint32_t correlation_id) {
-    if (filter) {
-        return static_cast<CorrelationFilter*>(filter)->check_and_remove(correlation_id);
-    }
-    return false;
-}
-
-size_t correlation_filter_size(CorrelationFilterHandle filter) {
-    if (filter) {
-        return static_cast<CorrelationFilter*>(filter)->size();
-    }
-    return 0;
-}
-
-GraphCorrelationMapHandle graph_correlation_map_create(void) {
-    return new GraphCorrelationMap();
-}
-
-void graph_correlation_map_destroy(GraphCorrelationMapHandle map) {
-    if (map) {
-        delete static_cast<GraphCorrelationMap*>(map);
-    }
-}
-
-void graph_correlation_map_insert(GraphCorrelationMapHandle map, uint32_t correlation_id) {
-    if (map) {
-        static_cast<GraphCorrelationMap*>(map)->insert(correlation_id);
-    }
-}
-
-void graph_correlation_map_cycle_start(GraphCorrelationMapHandle map, uint32_t cycle) {
-    if (map) {
-        static_cast<GraphCorrelationMap*>(map)->cycle_start(cycle);
-    }
-}
-
-bool graph_correlation_map_check_and_mark_seen(GraphCorrelationMapHandle map, uint32_t correlation_id, uint32_t cycle) {
-    if (map) {
-        return static_cast<GraphCorrelationMap*>(map)->check_and_mark_seen(correlation_id, cycle);
-    }
-    return false;
-}
-
-void graph_correlation_map_cycle_end(GraphCorrelationMapHandle map) {
-    if (map) {
-        static_cast<GraphCorrelationMap*>(map)->cycle_end();
-    }
-}
-
-size_t graph_correlation_map_size(GraphCorrelationMapHandle map) {
-    if (map) {
-        return static_cast<GraphCorrelationMap*>(map)->size();
-    }
-    return 0;
-}
-
-void graph_correlation_map_get_stats(GraphCorrelationMapHandle map, size_t* size, size_t* oldest_age) {
-    if (map && size && oldest_age) {
-        static_cast<GraphCorrelationMap*>(map)->get_stats(*size, *oldest_age);
-    }
-}
-
-} // extern "C"
diff --git a/cupti/correlation_filter.h b/cupti/correlation_filter.h
deleted file mode 100644
index 53a2c78..0000000
--- a/cupti/correlation_filter.h
+++ /dev/null
@@ -1,83 +0,0 @@
-#pragma once
-
-#include <stdbool.h>
-#include <stddef.h>
-#include <stdint.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-// Opaque handle to correlation filter
-typedef void *CorrelationFilterHandle;
-
-// Create a new correlation filter
-CorrelationFilterHandle correlation_filter_create(void);
-
-// Destroy the correlation filter
-void correlation_filter_destroy(CorrelationFilterHandle filter);
-
-// Insert a correlation ID into the filter
-// Thread-safe: can be called from multiple threads concurrently
-void correlation_filter_insert(CorrelationFilterHandle filter,
-                               uint32_t correlation_id);
-
-// Check if a correlation ID exists and remove it if found
-// Returns true if the correlation ID was found and removed, false otherwise
-// Thread-safe: safe to call concurrently with inserts
-bool correlation_filter_check_and_remove(CorrelationFilterHandle filter,
-                                         uint32_t correlation_id);
-
-// Get the current size of the filter (number of tracked correlation IDs)
-// Note: This is an approximate count in concurrent scenarios
-size_t correlation_filter_size(CorrelationFilterHandle filter);
-
-// Graph correlation state values
-enum GraphCorrelationState {
-  GRAPH_STATE_UNINITIALIZED = 0, // Entry just created, slot not yet processed
-  GRAPH_STATE_CYCLE_CLEARED = 1, // Cycle started, no kernels seen yet
-  GRAPH_STATE_KERNEL_SEEN = 2    // At least one kernel seen this cycle
-};
-
-// Opaque handle to graph correlation map
-typedef void *GraphCorrelationMapHandle;
-
-// Create a new graph correlation map
-GraphCorrelationMapHandle graph_correlation_map_create(void);
-
-// Destroy the graph correlation map
-void graph_correlation_map_destroy(GraphCorrelationMapHandle map);
-
-// Insert a correlation ID into the map (called when sampling a graph launch)
-// Thread-safe: can be called from multiple threads concurrently
-void graph_correlation_map_insert(GraphCorrelationMapHandle map,
-                                  uint32_t correlation_id);
-
-// Start a new processing cycle - clears the appropriate slot for all entries
-// Thread-safe
-void graph_correlation_map_cycle_start(GraphCorrelationMapHandle map,
-                                       uint32_t cycle);
-
-// Check if correlation ID should fire probe and mark as seen for this cycle
-// Returns true if the correlation ID is tracked (should fire probe)
-// Thread-safe
-bool graph_correlation_map_check_and_mark_seen(GraphCorrelationMapHandle map,
-                                               uint32_t correlation_id,
-                                               uint32_t cycle);
-
-// End processing cycle - removes entries that haven't seen kernels in 2
-// consecutive cycles Thread-safe
-void graph_correlation_map_cycle_end(GraphCorrelationMapHandle map);
-
-// Get the current size of the map (number of tracked correlation IDs)
-// Note: This is an approximate count in concurrent scenarios
-size_t graph_correlation_map_size(GraphCorrelationMapHandle map);
-
-// Get statistics about the map (for debugging)
-// Returns the current size and age of the oldest entry (in cycles)
-void graph_correlation_map_get_stats(GraphCorrelationMapHandle map,
-                                     size_t *size, size_t *oldest_age);
-
-#ifdef __cplusplus
-}
-#endif
diff --git a/cupti/cupti-prof.c b/cupti/cupti-prof.c
deleted file mode 100644
index 11b61b0..0000000
--- a/cupti/cupti-prof.c
+++ /dev/null
@@ -1,575 +0,0 @@
-#define _POSIX_C_SOURCE 199309L
-#include <execinfo.h>
-#include <stdbool.h>
-#include <stdint.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <sys/sdt.h>
-#include <time.h>
-#include <unistd.h>
-
-#include <cupti.h>
-
-#include "correlation_filter.h"
-
-// Debug logging control
-static bool debug_enabled = false;
-
-// Activity buffer management
-// A kernel activity is around 224 bytes so a 128kb buffer
-// will hold ~500 activities, we want to flush regularly since
-// we are a continuous profiler so we don't need a huge buffer
-// like most CUPTI profilers.  Also a small size avoid malloc
-// just going to mmap every time so the allocator should cache
-// and re-use these for us.
-static size_t activityBufferSize = 128 * 1024;
-
-// Global variables
-static CUpti_SubscriberHandle subscriber = 0;
-
-static size_t outstandingEvents = 0;
-
-// Thread-local tracking: store correlation ID from runtime ENTER
-// so we can skip driver EXIT probe when it matches (driver calls happen under
-// runtime calls)
-static __thread uint32_t runtimeEnterCorrelationId = 0;
-
-// Rate limiting - token bucket algorithm (configurable via PARCAGPU_RATE_LIMIT)
-static double rateLimitPerSec = 100.0;
-
-// Thread-local token bucket state
-static __thread uint64_t lastRefillNs = 0;
-static __thread double tokens = 0;
-
-// Returns true if the sample should be emitted, false if rate limited
-static bool rateLimiterTryAcquire(uint64_t nowNs) {
-  // Refill tokens based on elapsed time
-  if (lastRefillNs > 0) {
-    double elapsedSec = (nowNs - lastRefillNs) / 1e9;
-    tokens = tokens + elapsedSec * rateLimitPerSec;
-    if (tokens > rateLimitPerSec) {
-      tokens = rateLimitPerSec;
-    }
-  } else {
-    tokens = rateLimitPerSec; // Start with full bucket
-  }
-  lastRefillNs = nowNs;
-
-  if (tokens >= 1.0) {
-    tokens -= 1.0;
-    return true;
-  }
-  return false;
-}
-
-// Correlation ID filter (for regular kernel launches)
-static CorrelationFilterHandle correlationFilter = NULL;
-
-// Graph correlation map (for graph launches with multiple kernels per
-// correlation ID)
-static GraphCorrelationMapHandle graphCorrelationMap = NULL;
-
-// Buffer processing cycle counter (for graph map state machine)
-static uint32_t bufferCycle = 0;
-
-static void init_debug(void) {
-  static bool initialized = false;
-  if (!initialized) {
-    debug_enabled = getenv("PARCAGPU_DEBUG") != NULL;
-    const char *rateEnv = getenv("PARCAGPU_RATE_LIMIT");
-    if (rateEnv != NULL) {
-      double rate = atof(rateEnv);
-      if (rate > 0) {
-        rateLimitPerSec = rate;
-      }
-    }
-    initialized = true;
-  }
-}
-
-#define DEBUG_PRINTF(...)                                                      \
-  do {                                                                         \
-    init_debug();                                                              \
-    if (debug_enabled) {                                                       \
-      struct timespec ts;                                                      \
-      clock_gettime(CLOCK_REALTIME, &ts);                                      \
-      printf("[%ld.%09ld] ", ts.tv_sec, ts.tv_nsec);                           \
-      printf(__VA_ARGS__);                                                     \
-    }                                                                          \
-  } while (0)
-
-// Forward declarations
-static void parcagpuCuptiCallback(void *userdata, CUpti_CallbackDomain domain,
-                                  CUpti_CallbackId cbid,
-                                  const CUpti_CallbackData *cbdata);
-static void parcagpuBufferRequested(uint8_t **buffer, size_t *size,
-                                    size_t *maxNumRecords);
-static void parcagpuBufferCompleted(CUcontext ctx, uint32_t streamId,
-                                    uint8_t *buffer, size_t size,
-                                    size_t validSize);
-
-void cleanup(void);
-
-// CUPTI initialization function required for CUDA_INJECTION64_PATH
-int InitializeInjection(void) {
-  DEBUG_PRINTF("[CUPTI] InitializeInjection called\n");
-  CUptiResult result;
-
-  // Set flush period BEFORE enabling activities (in milliseconds)
-  // Try a larger value like 1000ms (1 second) for better compatibility
-  result = cuptiActivityFlushPeriod(1000);
-  if (result != CUPTI_SUCCESS) {
-    const char *errstr;
-    cuptiGetResultString(result, &errstr);
-    fprintf(stderr, "[CUPTI] Failed to set activity flush period: %s\n",
-            errstr);
-  } else {
-    DEBUG_PRINTF("[CUPTI] Set activity flush period to 1000ms\n");
-  }
-
-  // Try to subscribe to callbacks
-  result = cuptiSubscribe(&subscriber,
-                          (CUpti_CallbackFunc)parcagpuCuptiCallback, NULL);
-  if (result != CUPTI_SUCCESS) {
-    const char *errstr;
-    cuptiGetResultString(result, &errstr);
-    fprintf(stderr, "[CUPTI] Failed to subscribe to callbacks: %s\n", errstr);
-    return 1; // Still return success to not break the injection
-  }
-
-  // Enable all runtime API kernel launch callbacks
-  CUpti_CallbackId runtimeCallbacks[] = {
-      CUPTI_RUNTIME_TRACE_CBID_cudaLaunch_v3020,
-      CUPTI_RUNTIME_TRACE_CBID_cudaLaunchKernel_v7000,
-      CUPTI_RUNTIME_TRACE_CBID_cudaLaunch_ptsz_v7000,
-      CUPTI_RUNTIME_TRACE_CBID_cudaLaunchKernel_ptsz_v7000,
-      CUPTI_RUNTIME_TRACE_CBID_cudaLaunchKernelExC_v11060,
-      CUPTI_RUNTIME_TRACE_CBID_cudaLaunchKernelExC_ptsz_v11060,
-      CUPTI_RUNTIME_TRACE_CBID_cudaLaunchCooperativeKernel_v9000,
-      CUPTI_RUNTIME_TRACE_CBID_cudaLaunchCooperativeKernel_ptsz_v9000,
-      CUPTI_RUNTIME_TRACE_CBID_cudaLaunchCooperativeKernelMultiDevice_v9000,
-      CUPTI_RUNTIME_TRACE_CBID_cudaGraphLaunch_v10000,
-      CUPTI_RUNTIME_TRACE_CBID_cudaGraphLaunch_ptsz_v10000,
-  };
-  for (size_t i = 0; i < sizeof(runtimeCallbacks) / sizeof(runtimeCallbacks[0]);
-       i++) {
-    result = cuptiEnableCallback(1, subscriber, CUPTI_CB_DOMAIN_RUNTIME_API,
-                                 runtimeCallbacks[i]);
-    if (result != CUPTI_SUCCESS) {
-      const char *errstr;
-      cuptiGetResultString(result, &errstr);
-      fprintf(stderr, "[CUPTI] Failed to enable runtime callback %d: %s\n",
-              runtimeCallbacks[i], errstr);
-    }
-  }
-
-  // Enable all driver API kernel launch callbacks
-  CUpti_CallbackId driverCallbacks[] = {
-      CUPTI_DRIVER_TRACE_CBID_cuLaunch,
-      CUPTI_DRIVER_TRACE_CBID_cuLaunchGrid,
-      CUPTI_DRIVER_TRACE_CBID_cuLaunchGridAsync,
-      CUPTI_DRIVER_TRACE_CBID_cuLaunchKernel,
-      CUPTI_DRIVER_TRACE_CBID_cuLaunchKernel_ptsz,
-      CUPTI_DRIVER_TRACE_CBID_cuLaunchKernelEx,
-      CUPTI_DRIVER_TRACE_CBID_cuLaunchKernelEx_ptsz,
-      CUPTI_DRIVER_TRACE_CBID_cuLaunchCooperativeKernel,
-      CUPTI_DRIVER_TRACE_CBID_cuLaunchCooperativeKernel_ptsz,
-      CUPTI_DRIVER_TRACE_CBID_cuLaunchCooperativeKernelMultiDevice,
-      CUPTI_DRIVER_TRACE_CBID_cuGraphLaunch,
-      CUPTI_DRIVER_TRACE_CBID_cuGraphLaunch_ptsz,
-  };
-  for (size_t i = 0; i < sizeof(driverCallbacks) / sizeof(driverCallbacks[0]);
-       i++) {
-    result = cuptiEnableCallback(1, subscriber, CUPTI_CB_DOMAIN_DRIVER_API,
-                                 driverCallbacks[i]);
-    if (result != CUPTI_SUCCESS) {
-      const char *errstr;
-      cuptiGetResultString(result, &errstr);
-      fprintf(stderr, "[CUPTI] Failed to enable driver callback %d: %s\n",
-              driverCallbacks[i], errstr);
-    }
-  }
-
-  // Register activity buffer callbacks
-  result = cuptiActivityRegisterCallbacks(parcagpuBufferRequested,
-                                          parcagpuBufferCompleted);
-  if (result != CUPTI_SUCCESS) {
-    const char *errstr;
-    cuptiGetResultString(result, &errstr);
-    fprintf(stderr, "[CUPTI] Failed to register activity callbacks: %s\n",
-            errstr);
-    return 1; // Still return success to not break the injection
-  }
-
-  result = cuptiActivityEnable(CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL);
-  if (result != CUPTI_SUCCESS) {
-    const char *errstr;
-    cuptiGetResultString(result, &errstr);
-    fprintf(stderr, "[CUPTI] Failed to enable concurrent kernel activity: %s\n",
-            errstr);
-  } else {
-    DEBUG_PRINTF("[CUPTI] Enabled CONCURRENT_KERNEL activity\n");
-  }
-
-  // Create correlation filter
-  correlationFilter = correlation_filter_create();
-  if (correlationFilter) {
-    DEBUG_PRINTF("[CUPTI] Correlation filter created and enabled\n");
-  } else {
-    fprintf(stderr, "[CUPTI] Warning: Failed to create correlation filter\n");
-  }
-
-  // Create graph correlation map
-  graphCorrelationMap = graph_correlation_map_create();
-  if (graphCorrelationMap) {
-    DEBUG_PRINTF("[CUPTI] Graph correlation map created and enabled\n");
-  } else {
-    fprintf(stderr,
-            "[CUPTI] Warning: Failed to create graph correlation map\n");
-  }
-
-  atexit(cleanup);
-
-  DEBUG_PRINTF("[CUPTI] Successfully initialized CUPTI callbacks with external "
-               "correlation and activity API\n");
-
-  return 1;
-}
-
-// Helper function to print stack trace
-static void print_backtrace(const char *prefix) {
-  void *array[20];
-  size_t size;
-  char **strings;
-
-  size = backtrace(array, 20);
-  strings = backtrace_symbols(array, size);
-
-  if (strings != NULL) {
-    printf("%s Stack trace (%zu frames):\n", prefix, size);
-    for (size_t i = 0; i < size; i++) {
-      printf("  [%zu] %s\n", i, strings[i]);
-    }
-    free(strings);
-  }
-}
-
-// Callback handler for driver and runtime API
-static void parcagpuCuptiCallback(void *userdata, CUpti_CallbackDomain domain,
-                                  CUpti_CallbackId cbid,
-                                  const CUpti_CallbackData *cbdata) {
-  uint32_t correlationId = cbdata->correlationId;
-
-  // Track runtime ENTER so we can skip driver EXIT when they match
-  if (domain == CUPTI_CB_DOMAIN_RUNTIME_API &&
-      cbdata->callbackSite == CUPTI_API_ENTER) {
-    runtimeEnterCorrelationId = correlationId;
-    DEBUG_PRINTF("[CUPTI] Runtime API ENTER: correlationId=%u\n",
-                 correlationId);
-    return;
-  }
-
-  // We hook on EXIT because that makes our probe overhead not add to GPU
-  // launch latency and hopefully covers some of the overhead in the shadow of
-  // GPU async work.
-  if (cbdata->callbackSite != CUPTI_API_EXIT) {
-    if (cbdata->callbackSite == CUPTI_API_ENTER &&
-        domain == CUPTI_CB_DOMAIN_DRIVER_API) {
-      DEBUG_PRINTF(
-          "[CUPTI] Driver API ENTER: correlationId=%u (will check on EXIT)\n",
-          correlationId);
-    }
-    return;
-  }
-
-  const char *name =
-      cbdata->symbolName ? cbdata->symbolName : cbdata->functionName;
-  int signedCbid;
-
-  if (domain == CUPTI_CB_DOMAIN_DRIVER_API) {
-    // Skip if this driver call is under a runtime call (same correlation ID)
-    if (correlationId == runtimeEnterCorrelationId) {
-      DEBUG_PRINTF("[CUPTI] Skipping driver EXIT correlationId=%u "
-                   "(runtimeEnter=%u) - runtime "
-                   "will handle\n",
-                   correlationId, runtimeEnterCorrelationId);
-      return;
-    }
-    // Pure driver call (no runtime wrapper) - use negative cbid
-    signedCbid = -(int)cbid;
-    DEBUG_PRINTF("[CUPTI] Driver API EXIT callback: cbid=%d, correlationId=%u, "
-                 "runtimeEnter=%u, func=%s\n",
-                 cbid, correlationId, runtimeEnterCorrelationId, name);
-  } else if (domain == CUPTI_CB_DOMAIN_RUNTIME_API) {
-    signedCbid = (int)cbid;
-    DEBUG_PRINTF("[CUPTI] Runtime API EXIT callback: cbid=%d, "
-                 "correlationId=%u, runtimeEnter=%u, func=%s\n",
-                 cbid, correlationId, runtimeEnterCorrelationId, name);
-    runtimeEnterCorrelationId = 0; // Clear after use
-  } else {
-    return;
-  }
-
-  // Rate limit probes
-  struct timespec ts;
-  clock_gettime(CLOCK_MONOTONIC, &ts);
-  uint64_t nowNs = (uint64_t)ts.tv_sec * 1000000000ULL + ts.tv_nsec;
-  if (!rateLimiterTryAcquire(nowNs)) {
-    DEBUG_PRINTF("[CUPTI] Rate limited: skipping probe for correlationId=%u\n",
-                 correlationId);
-    return;
-  }
-
-  outstandingEvents++;
-  DTRACE_PROBE3(parcagpu, cuda_correlation, correlationId, signedCbid, name);
-
-  // Detect graph launches by callback ID
-  bool is_graph_launch =
-      (cbid == CUPTI_RUNTIME_TRACE_CBID_cudaGraphLaunch_v10000) ||
-      (cbid == CUPTI_RUNTIME_TRACE_CBID_cudaGraphLaunch_ptsz_v10000) ||
-      (cbid == CUPTI_DRIVER_TRACE_CBID_cuGraphLaunch) ||
-      (cbid == CUPTI_DRIVER_TRACE_CBID_cuGraphLaunch_ptsz);
-
-  // Insert into appropriate map based on launch type
-  if (is_graph_launch) {
-    // Graph launch - will generate multiple kernels with same correlation ID
-    if (graphCorrelationMap) {
-      graph_correlation_map_insert(graphCorrelationMap, correlationId);
-      DEBUG_PRINTF(
-          "[CUPTI] Inserted correlationId=%u into graph map (size=%zu)\n",
-          correlationId, graph_correlation_map_size(graphCorrelationMap));
-    }
-  } else {
-    // Regular kernel launch - single kernel per correlation ID
-    if (correlationFilter) {
-      correlation_filter_insert(correlationFilter, correlationId);
-      DEBUG_PRINTF("[CUPTI] Inserted correlationId=%u into filter (size=%zu)\n",
-                   correlationId, correlation_filter_size(correlationFilter));
-    }
-  }
-
-  // If we let too many events pile up it overwhelms the perf_event buffers,
-  // just another reason to explore just passing the activity buffer through to
-  // eBPF.
-  if (outstandingEvents > 3000) {
-    DEBUG_PRINTF("[CUPTI] Flushing: outstandingEvents=%zu\n",
-                 outstandingEvents);
-    cuptiActivityFlushAll(0);
-    outstandingEvents = 0;
-  }
-}
-
-// Out-of-line USDT probe site for activity batches.
-// Single call site ensures one probe location in the ELF .note.stapsdt section.
-__attribute__((noinline)) void parcagpuActivityBatch(const void **ptrs,
-                                                     uint32_t count) {
-  DTRACE_PROBE2(parcagpu, activity_batch, ptrs, count);
-}
-
-// Buffer request callback
-static void parcagpuBufferRequested(uint8_t **buffer, size_t *size,
-                                    size_t *maxNumRecords) {
-  *buffer = (uint8_t *)aligned_alloc(8, activityBufferSize);
-  *size = activityBufferSize;
-  *maxNumRecords = 0; // Let CUPTI decide
-
-  DEBUG_PRINTF("[CUPTI:bufferRequested] Allocated buffer %p, size=%zu\n",
-               *buffer, *size);
-}
-
-// Buffer completion callback
-static void parcagpuBufferCompleted(CUcontext ctx, uint32_t streamId,
-                                    uint8_t *buffer, size_t size,
-                                    size_t validSize) {
-  CUptiResult result;
-  CUpti_Activity *record = NULL;
-  int recordCount = 0;
-  static int calls = 0;
-
-  // Batch probe: collect pointers to activity records and pass them to
-  // BPF/USDT every ACTIVITY_BATCH_SIZE records. Stack-allocated array
-  // of pointers — no heap allocation, no copying, version-independent.
-  // BPF consumers filter by activity kind (kernel, memcpy, etc.).
-#define ACTIVITY_BATCH_SIZE 128
-  const void *batchPtrs[ACTIVITY_BATCH_SIZE];
-  uint32_t batchCount = 0;
-
-  DEBUG_PRINTF("[CUPTI] bufferCompleted called: buffer=%p validSize=%zu (%d)\n",
-               buffer, validSize, calls++);
-
-  // Start new cycle for graph correlation map
-  uint32_t currentCycle = bufferCycle++;
-  if (graphCorrelationMap) {
-    graph_correlation_map_cycle_start(graphCorrelationMap, currentCycle);
-    DEBUG_PRINTF("[CUPTI] Started graph correlation map cycle %u\n",
-                 currentCycle);
-  }
-
-  while (1) {
-    result = cuptiActivityGetNextRecord(buffer, validSize, &record);
-    if (result == CUPTI_ERROR_MAX_LIMIT_REACHED) {
-      break;
-    } else if (result != CUPTI_SUCCESS) {
-      const char *errstr;
-      cuptiGetResultString(result, &errstr);
-      fprintf(stderr, "[CUPTI] Error reading activity record: %s\n", errstr);
-      break;
-    }
-
-    recordCount++;
-    if (record->kind == CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL ||
-        record->kind == CUPTI_ACTIVITY_KIND_KERNEL) {
-      CUpti_ActivityKernel5 *k = (CUpti_ActivityKernel5 *)record;
-
-      DEBUG_PRINTF("[CUPTI] Kernel activity: graphId=%u graphNodeId=%lu "
-                   "name=%s, correlationId=%u, deviceId=%u, "
-                   "streamId=%u, start=%lu, end=%lu, duration=%lu ns\n",
-                   k->graphId, k->graphNodeId, k->name, k->correlationId,
-                   k->deviceId, k->streamId, k->start, k->end,
-                   k->end - k->start);
-
-      // Route to appropriate map based on whether this is a graph kernel
-      bool should_fire = true;
-      if (k->graphId != 0) {
-        // Graph kernel - check graph correlation map
-        if (graphCorrelationMap) {
-          should_fire = graph_correlation_map_check_and_mark_seen(
-              graphCorrelationMap, k->correlationId, currentCycle);
-          if (!should_fire) {
-            DEBUG_PRINTF(
-                "[CUPTI] Filtered out graph correlationId=%u (not tracked)\n",
-                k->correlationId);
-          } else {
-            DEBUG_PRINTF("[CUPTI] Matched graph correlationId=%u - firing "
-                         "kernel_executed (map size=%zu)\n",
-                         k->correlationId,
-                         graph_correlation_map_size(graphCorrelationMap));
-          }
-        }
-      } else {
-        // Regular kernel - check regular correlation filter
-        if (correlationFilter) {
-          should_fire = correlation_filter_check_and_remove(correlationFilter,
-                                                            k->correlationId);
-          if (!should_fire) {
-            DEBUG_PRINTF(
-                "[CUPTI] Filtered out correlationId=%u (not tracked)\n",
-                k->correlationId);
-          } else {
-            DEBUG_PRINTF("[CUPTI] Matched correlationId=%u - firing "
-                         "kernel_executed (filter size=%zu)\n",
-                         k->correlationId,
-                         correlation_filter_size(correlationFilter));
-          }
-        }
-      }
-
-      // Only fire probe if correlation ID was tracked (or filters disabled)
-      if (should_fire) {
-        DTRACE_PROBE8(parcagpu, kernel_executed, k->start, k->end,
-                      k->correlationId, k->deviceId, k->streamId, k->graphId,
-                      k->graphNodeId, k->name);
-      }
-    }
-
-    // Collect pointer for batch probe (all activity kinds).
-    // BPF consumers inspect the kind field to filter types they care about.
-    batchPtrs[batchCount++] = record;
-    if (batchCount < ACTIVITY_BATCH_SIZE)
-      continue;
-
-    parcagpuActivityBatch(batchPtrs, batchCount);
-    batchCount = 0;
-  }
-
-  if (batchCount > 0)
-    parcagpuActivityBatch(batchPtrs, batchCount);
-
-  DEBUG_PRINTF("[CUPTI] Processed %d activity records from buffer %p\n",
-               recordCount, buffer);
-
-  // End cycle for graph correlation map - clean up completed graph launches
-  if (graphCorrelationMap) {
-    graph_correlation_map_cycle_end(graphCorrelationMap);
-
-    size_t map_size = 0;
-    size_t oldest_age = 0;
-    graph_correlation_map_get_stats(graphCorrelationMap, &map_size,
-                                    &oldest_age);
-
-    DEBUG_PRINTF("[CUPTI] Ended graph correlation map cycle %u (map size=%zu, "
-                 "oldest_age=%zu cycles)\n",
-                 currentCycle, map_size, oldest_age);
-
-    // Log warning if we have old entries (potential leaked graph launches)
-    if (oldest_age > 50 && currentCycle % 10 == 0) {
-      DEBUG_PRINTF("[CUPTI] WARNING: Graph map has entries aged %zu cycles "
-                   "(may be dropped launches)\n",
-                   oldest_age);
-    }
-  }
-
-  // Reset to 0 rather than decrement - one API callback can produce N
-  // activities so decrementing by recordCount can cause underflow (size_t wraps
-  // to huge value)
-  outstandingEvents = 0;
-
-  // Free the buffer
-  DEBUG_PRINTF("[CUPTI] Freeing buffer %p\n", buffer);
-  free(buffer);
-
-  // Report any records dropped due to buffer overflow
-  size_t dropped;
-  result = cuptiActivityGetNumDroppedRecords(ctx, streamId, &dropped);
-  if (result == CUPTI_SUCCESS && dropped > 0) {
-    fprintf(stderr, "[CUPTI] Warning: %zu activity records dropped\n", dropped);
-  }
-}
-
-// Cleanup function (destructor disabled to prevent early cleanup)
-void cleanup(void) {
-  static bool cleanup_done = false;
-
-  // Make cleanup idempotent - safe to call multiple times
-  if (cleanup_done) {
-    return;
-  }
-  cleanup_done = true;
-
-  DEBUG_PRINTF("[CUPTI] Cleanup started\n");
-  // Flush any remaining activity records
-  cuptiActivityFlushAll(CUPTI_ACTIVITY_FLAG_FLUSH_FORCED);
-
-  // Unsubscribe from callbacks
-  if (subscriber) {
-    cuptiUnsubscribe(subscriber);
-    subscriber = 0;
-  }
-
-  // Destroy correlation filter
-  if (correlationFilter) {
-    size_t remaining = correlation_filter_size(correlationFilter);
-    if (remaining > 0) {
-      DEBUG_PRINTF(
-          "[CUPTI] Warning: %zu correlation IDs still in filter at cleanup\n",
-          remaining);
-    }
-    correlation_filter_destroy(correlationFilter);
-    correlationFilter = NULL;
-  }
-
-  // Destroy graph correlation map
-  if (graphCorrelationMap) {
-    size_t remaining = graph_correlation_map_size(graphCorrelationMap);
-    if (remaining > 0) {
-      DEBUG_PRINTF("[CUPTI] Warning: %zu correlation IDs still in graph map at "
-                   "cleanup\n",
-                   remaining);
-    }
-    graph_correlation_map_destroy(graphCorrelationMap);
-    graphCorrelationMap = NULL;
-  }
-
-  DEBUG_PRINTF("[CUPTI] Cleanup completed\n");
-}
diff --git a/cupti/toolchain-arm64.cmake b/cupti/toolchain-arm64.cmake
deleted file mode 100644
index 5542249..0000000
--- a/cupti/toolchain-arm64.cmake
+++ /dev/null
@@ -1,19 +0,0 @@
-# CMake toolchain file for cross-compiling to ARM64
-set(CMAKE_SYSTEM_NAME Linux)
-set(CMAKE_SYSTEM_PROCESSOR aarch64)
-
-# Specify the cross compiler
-set(CMAKE_C_COMPILER aarch64-linux-gnu-gcc)
-set(CMAKE_CXX_COMPILER aarch64-linux-gnu-g++)
-
-# Where to look for the target environment
-set(CMAKE_FIND_ROOT_PATH /usr/aarch64-linux-gnu)
-
-# Adjust the default behavior of the FIND_XXX() commands:
-# search headers and libraries in the target environment
-set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER)
-set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)
-set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)
-
-# Force position-independent code for shared libraries
-set(CMAKE_POSITION_INDEPENDENT_CODE ON)
diff --git a/ebpf/cupti_bpf.h b/ebpf/cupti_bpf.h
new file mode 100644
index 0000000..edc8df6
--- /dev/null
+++ b/ebpf/cupti_bpf.h
@@ -0,0 +1,87 @@
+// CUPTI BPF definitions — shared between test/bpf/ and production cuda.ebpf.c.
+//
+// Contains BPF-side layouts for:
+//   - CUpti_ActivityKernel5 (kernel activity records)
+//   - CUpti_PCSamplingPCData (PC sampling records)
+//   - CUpti_PCSamplingStallReason (stall reason entries)
+
+#ifndef CUPTI_BPF_H
+#define CUPTI_BPF_H
+
+// ---------------------------------------------------------------------------
+// Kernel activity records
+// ---------------------------------------------------------------------------
+
+// CUpti_ActivityKind values we care about
+#define CUPTI_ACTIVITY_KIND_KERNEL            3
+#define CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL 10
+
+// Matches the layout of CUpti_ActivityKernel5 exactly.
+// Explicit padding replaces __packed__ to avoid unnecessary unaligned-access
+// handling in BPF.  The struct uses aligned(8) and a static size assert.
+struct cupti_activity_kernel5 {
+  u32 kind;            // offset 0   - CUpti_ActivityKind
+  u8 _pad1[12];        // offset 4   - cacheConfig, sharedMemConfig,
+                       //              registersPerThread,
+                       //              partitionedGlobalCache x2
+  u64 start;           // offset 16  - kernel start timestamp (ns)
+  u64 end;             // offset 24  - kernel end timestamp (ns)
+  u64 completed;       // offset 32  - completion timestamp
+  u32 device_id;       // offset 40
+  u32 context_id;      // offset 44
+  u32 stream_id;       // offset 48
+  u8 _pad2[40];        // offset 52  - gridX/Y/Z, blockX/Y/Z,
+                       //              staticSharedMemory,
+                       //              dynamicSharedMemory,
+                       //              localMemoryPerThread,
+                       //              localMemoryTotal
+  u32 correlation_id;  // offset 92
+  s64 grid_id;         // offset 96
+  u64 name_ptr;        // offset 104 - const char* (user-space pointer)
+  u64 _reserved0;      // offset 112
+  u64 queued;          // offset 120
+  u64 submitted;       // offset 128
+  u8 _pad3[8];         // offset 136 - launchType, isSharedMemoryCarveout,
+                       //              sharedMemoryCarveoutRequested,
+                       //              padding, sharedMemoryExecuted
+  u64 graph_node_id;   // offset 144
+  u32 shmem_limit_cfg; // offset 152 - CUpti_FuncShmemLimitConfig
+  u32 graph_id;        // offset 156
+} __attribute__((aligned(8)));
+
+_Static_assert(
+  sizeof(struct cupti_activity_kernel5) == 160, "cupti_activity_kernel5 size mismatch");
+
+// ---------------------------------------------------------------------------
+// PC sampling records
+// ---------------------------------------------------------------------------
+
+#define STALL_REASON_NAME_LEN 64
+#define MAX_STALL_REASONS     64
+#define MAX_PC_BATCH_SIZE     512
+#define MAX_FUNC_NAME         128
+
+// Matches CUpti_PCSamplingStallReason (packed, aligned 8).
+struct cupti_stall_reason {
+  u32 stall_reason_index;
+  u32 samples;
+};
+
+// Matches CUpti_PCSamplingPCData (packed, aligned 8).
+// Contains user-space pointers that BPF chases with bpf_probe_read_user.
+// We read the base 56-byte struct, then conditionally read correlationId
+// if the size field indicates CUPTI 12.4+ / v22+ (size > 56).
+struct cupti_pc_data {
+  u64 size; // struct size (56 = pre-12.4, 60+ = CUDA 12.4+)
+  u64 cubin_crc;
+  u64 pc_offset;
+  u32 function_index;
+  u32 _pc_pad;
+  u64 function_name_ptr; // const char* in user-space
+  u64 stall_reason_count;
+  u64 stall_reason_ptr; // CUpti_PCSamplingStallReason* in user-space
+} __attribute__((__packed__)) __attribute__((aligned(8)));
+
+#define CUPTI_PC_DATA_BASE_SIZE 56
+
+#endif // CUPTI_BPF_H
diff --git a/go.mod b/go.mod
new file mode 100644
index 0000000..5114572
--- /dev/null
+++ b/go.mod
@@ -0,0 +1,10 @@
+// parcagpu — host-side CUDA profiler sources and shared BPF headers.
+//
+// This module exists primarily so external projects (e.g.
+// opentelemetry-ebpf-profiler) can pull in the canonical BPF-side
+// CUPTI definitions at ebpf/cupti_bpf.h via `go mod download`, the same
+// way they consume github.com/parca-dev/usdt. The Go module has no Go
+// source of its own — ebpf/ is a plain header directory.
+module github.com/parca-dev/parcagpu
+
+go 1.25.0
diff --git a/microbenchmarks/pc_sample_toy.cu b/microbenchmarks/pc_sample_toy.cu
new file mode 100644
index 0000000..beef66b
--- /dev/null
+++ b/microbenchmarks/pc_sample_toy.cu
@@ -0,0 +1,110 @@
+// pc_sample_toy.cu — a simple GPU busy-loop for testing PC sampling
+// Compile: make microbenchmarks  (or: nvcc -g -lineinfo -arch=native -o pc_sample_toy pc_sample_toy.cu)
+// Run:     ./pc_sample_toy
+
+#include <cuda_runtime.h>
+#include <stdio.h>
+#include <unistd.h>
+
+#define CHECK(call)                                                            \
+  do {                                                                         \
+    cudaError_t err = (call);                                                  \
+    if (err != cudaSuccess) {                                                  \
+      fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__, __LINE__,         \
+              cudaGetErrorString(err));                                        \
+      exit(1);                                                                 \
+    }                                                                          \
+  } while (0)
+
+// Kernel A: heavy FP math (sin/cos chain)
+__global__ void trig_storm(float *out, int n, unsigned long long iters) {
+  int idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (idx >= n)
+    return;
+
+  float x = (float)idx * 0.001f;
+  for (unsigned long long i = 0; i < iters; i++) {
+    x = sinf(x) * cosf(x) + 0.1f;
+  }
+  out[idx] = x;
+}
+
+// Kernel B: integer bit-twiddling
+__global__ void hash_churn(unsigned int *out, int n, unsigned long long iters) {
+  int idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (idx >= n)
+    return;
+
+  unsigned int h = idx ^ 0xdeadbeef;
+  for (unsigned long long i = 0; i < iters; i++) {
+    h ^= h << 13;
+    h ^= h >> 17;
+    h ^= h << 5;
+    h += (unsigned int)i;
+  }
+  out[idx] = h;
+}
+
+// Kernel C: shared-memory bouncing
+__global__ void shmem_bounce(float *out, int n, unsigned long long iters) {
+  __shared__ float tile[256];
+  int idx = blockIdx.x * blockDim.x + threadIdx.x;
+  int tid = threadIdx.x;
+
+  tile[tid] = (float)idx;
+  __syncthreads();
+
+  for (unsigned long long i = 0; i < iters; i++) {
+    tile[tid] += tile[(tid + 1) % blockDim.x] * 0.01f;
+    __syncthreads();
+  }
+
+  if (idx < n)
+    out[idx] = tile[tid];
+}
+
+void go() {
+  const int N = 1 << 18; // 256K elements
+  const int threads = 256;
+  const int blocks = (N + threads - 1) / threads;
+
+  float *d_float;
+  unsigned int *d_uint;
+
+  CHECK(cudaMalloc(&d_float, N * sizeof(float)));
+  CHECK(cudaMalloc(&d_uint, N * sizeof(unsigned int)));
+
+  printf("Launching GPU kernels — attach your profiler now.\n");
+  printf("PID: %d\n\n", getpid());
+
+  sleep(1);
+  // Each kernel runs for roughly 0.5–1 second depending on GPU.
+  // Tune the iteration count up/down as needed.
+
+  printf("  [1/3] trig_storm ...\n");
+  trig_storm<<<blocks, threads>>>(d_float, N, 500000ULL);
+  CHECK(cudaDeviceSynchronize());
+
+  printf("  [2/3] hash_churn ...\n");
+  hash_churn<<<blocks, threads>>>(d_uint, N, 2000000ULL);
+  CHECK(cudaDeviceSynchronize());
+
+  printf("  [3/3] shmem_bounce ...\n");
+  shmem_bounce<<<blocks, threads>>>(d_float, N, 50000ULL);
+  CHECK(cudaDeviceSynchronize());
+
+  printf("\nDone.\n");
+
+  CHECK(cudaFree(d_float));
+  CHECK(cudaFree(d_uint));
+}
+
+int main(int argc, char **argv) {
+  int loops = 1;
+  if (argc > 1) {
+    loops = atoi(argv[1]);
+  }
+  while (loops-- > 0) {
+    go();
+  }
+}
diff --git a/microbenchmarks/rapid_launch.cu b/microbenchmarks/rapid_launch.cu
new file mode 100644
index 0000000..aa02679
--- /dev/null
+++ b/microbenchmarks/rapid_launch.cu
@@ -0,0 +1,44 @@
+// rapid_launch.cu — measures per-kernel-launch overhead from CUPTI injection.
+// Launches many tiny kernels to stress the callback path.
+//
+// Compile: nvcc -o rapid_launch rapid_launch.cu
+// Run:     ./rapid_launch [num_launches]
+//
+// Compare:
+//   ./rapid_launch 50000                                          # baseline
+//   CUDA_INJECTION64_PATH=.../libparcagpucupti.so ./rapid_launch 50000  # injected
+
+#include <cuda_runtime.h>
+#include <stdio.h>
+#include <time.h>
+
+__global__ void empty_kernel() {}
+
+static double now_sec() {
+  struct timespec ts;
+  clock_gettime(CLOCK_MONOTONIC, &ts);
+  return ts.tv_sec + ts.tv_nsec * 1e-9;
+}
+
+int main(int argc, char **argv) {
+  int n = 50000;
+  if (argc > 1)
+    n = atoi(argv[1]);
+
+  // Warm up the CUDA context and any injection library init.
+  empty_kernel<<<1, 1>>>();
+  cudaDeviceSynchronize();
+
+  // Synchronous launches — each one round-trips through CUPTI callbacks.
+  double t0 = now_sec();
+  for (int i = 0; i < n; i++) {
+    empty_kernel<<<1, 1>>>();
+    cudaDeviceSynchronize();
+  }
+  double t1 = now_sec();
+
+  double elapsed = t1 - t0;
+  printf("%d launches in %.3f s  (%.1f us/launch)\n", n, elapsed,
+         elapsed / n * 1e6);
+  return 0;
+}
diff --git a/parcagpu.bt b/parcagpu.bt
index c35ca43..4132c47 100755
--- a/parcagpu.bt
+++ b/parcagpu.bt
@@ -1,50 +1,85 @@
 #!/usr/bin/env bpftrace
 
-/*
- * Simple CUPTI USDT Test
- * Tests the three parcagpu USDT probes
- */
+BEGIN {
+    printf("Monitoring CUDA activity via parcagpu USDT probes...\n");
+    printf("%-20s %-12s %-10s %-10s %-10s %-10s %-10s %-10s %s\n",
+           "TIME", "DURATION_NS", "CORR_ID", "DEVICE", "STREAM", "GRAPH", "NODE", "START", "KERNEL");
+}
 
-BEGIN
-{
-    printf("Simple CUPTI USDT Test Started\n");
-    printf("Monitoring parcagpu USDT probes...\n\n");
+usdt:*:parcagpu:kernel_executed {
+    $start = arg0;
+    $end = arg1;
+    $duration = $end - $start;
+    $correlation_id = arg2;
+    $device_id = arg3;
+    $stream_id = arg4;
+    $graph_id = arg5;
+    $graph_node_id = arg6;
+    $kernel_name = str(arg7);
+
+    printf("%-12s.%-6u %-12lu %-10u %-10u %-10u %-10u %-10lu %-10lu ",
+           strftime("%H:%M:%S", nsecs),
+           (nsecs % 1000000000) / 1000,
+           $duration,
+           $correlation_id,
+           $device_id,
+           $stream_id,
+           $graph_id,
+           $graph_node_id,
+           $start);
+    printf("%s\n", $kernel_name);
 }
 
+usdt:*:parcagpu:cuda_correlation {
+    $correlation_id = arg0;
+    $cbid = arg1;
+    $name = str(arg2);
 
+    printf("%-12s.%-6u [CORR] %u: cbid=%u %s\n", strftime("%H:%M:%S", nsecs), (nsecs % 1000000000) / 1000, $correlation_id, $cbid, $name);
+}
 
-// Track CUDA API correlation events
-// Arguments: correlationId (uint32), signedCbid (int32), name (char*)
-usdt:*:parcagpu:cuda_correlation
-{
-    $corId = (uint32)arg0;
-    $cbid = (int32)arg1;
-    printf("[%d] cuda_correlation: correlationId=%u, cbid=%d, name=%s\n",
-           pid, $corId, $cbid, str(arg2));
-    @cuda_correlations = count();
+usdt:*:parcagpu:pc_sample_batch {
+    $records = arg0;
+    $count = arg1;
+
+    printf("%-12s.%-6u [PC_BATCH] count=%u records=%p\n",
+           strftime("%H:%M:%S", nsecs),
+           (nsecs % 1000000000) / 1000,
+           $count,
+           $records);
 }
 
-// Track kernel executions
-// Arguments: start, end, correlationId, deviceId, streamId, name (pointer)
-usdt:*:parcagpu:kernel_executed
-{
-    $start = arg0;
-    $end = arg1;
-    printf("[%d] Kernel executed:\n", pid);
-    printf("  start=%lu, end=%lu, duration=%lu ns\n", $start, $end, $end - $start);
-    $devId = arg2 >>32;
-    $corId = (uint64)arg2 & 0xFFFFFFFF;		
-    printf("  correlationId=%u, deviceId=%u, streamId=%u\n",
-           $corId, $devId, arg3);
-    printf("  name=%s\n", str(arg4));
-    @kernel_executions = count();
-}
-
-END
-{
-    printf("\n=== Summary ===\n");
-    printf("CUDA correlations: ");
-    print(@cuda_correlations);
-    printf("Kernel executions: ");
-    print(@kernel_executions);
+usdt:*:parcagpu:stall_reason_map {
+    $names = arg0;
+    $count = arg1;
+
+    printf("%-12s.%-6u [STALL_MAP] count=%u base=%p\n",
+           strftime("%H:%M:%S", nsecs),
+           (nsecs % 1000000000) / 1000,
+           $count,
+           $names);
+}
+
+usdt:*:parcagpu:cubin_loaded {
+    $cubin_crc = arg0;
+    $cubin_size = arg2;
+
+    printf("%-12s.%-6u [CUBIN_LOAD] crc=0x%lx size=%lu\n",
+           strftime("%H:%M:%S", nsecs),
+           (nsecs % 1000000000) / 1000,
+           $cubin_crc,
+           $cubin_size);
+}
+
+usdt:*:parcagpu:cubin_unloaded {
+    $cubin_crc = arg0;
+
+    printf("%-12s.%-6u [CUBIN_UNLOAD] crc=0x%lx\n",
+           strftime("%H:%M:%S", nsecs),
+           (nsecs % 1000000000) / 1000,
+           $cubin_crc);
+}
+
+END {
+    printf("\nMonitoring complete.\n");
 }
diff --git a/proton b/proton
new file mode 160000
index 0000000..bfc06a7
--- /dev/null
+++ b/proton
@@ -0,0 +1 @@
+Subproject commit bfc06a7e18f129da8832a433f993700036121daa
diff --git a/src/correlation_filter.cpp b/src/correlation_filter.cpp
new file mode 100644
index 0000000..c246c77
--- /dev/null
+++ b/src/correlation_filter.cpp
@@ -0,0 +1,97 @@
+#include "correlation_filter.h"
+
+namespace parcagpu {
+
+//=============================================================================
+// CorrelationFilter implementation
+//=============================================================================
+
+void CorrelationFilter::insert(uint32_t correlation_id) {
+  std::lock_guard<std::mutex> lock(mutex_);
+  set_.insert(correlation_id);
+}
+
+bool CorrelationFilter::check_and_remove(uint32_t correlation_id) {
+  std::lock_guard<std::mutex> lock(mutex_);
+  auto it = set_.find(correlation_id);
+  if (it != set_.end()) {
+    set_.erase(it);
+    return true;
+  }
+  return false;
+}
+
+size_t CorrelationFilter::size() const {
+  std::lock_guard<std::mutex> lock(mutex_);
+  return set_.size();
+}
+
+//=============================================================================
+// GraphCorrelationEntry implementation
+//=============================================================================
+
+GraphCorrelationEntry::GraphCorrelationEntry(uint32_t cycle)
+    : state{GRAPH_STATE_UNINITIALIZED, GRAPH_STATE_UNINITIALIZED},
+      ever_seen_kernel(false), insertion_cycle(cycle) {}
+
+//=============================================================================
+// GraphCorrelationMap implementation
+//=============================================================================
+
+GraphCorrelationMap::GraphCorrelationMap() : current_cycle_(0) {}
+
+void GraphCorrelationMap::insert(uint32_t correlation_id) {
+  std::lock_guard<std::mutex> lock(mutex_);
+  map_.emplace(correlation_id, GraphCorrelationEntry(current_cycle_));
+}
+
+void GraphCorrelationMap::cycle_start(uint32_t cycle) {
+  std::lock_guard<std::mutex> lock(mutex_);
+  current_cycle_ = cycle;
+  uint32_t slot = cycle % 2;
+  for (auto &pair : map_) {
+    pair.second.state[slot] = GRAPH_STATE_CYCLE_CLEARED;
+  }
+}
+
+bool GraphCorrelationMap::check_and_mark_seen(uint32_t correlation_id,
+                                               uint32_t cycle) {
+  std::lock_guard<std::mutex> lock(mutex_);
+  auto it = map_.find(correlation_id);
+  if (it != map_.end()) {
+    uint32_t slot = cycle % 2;
+    it->second.state[slot] = GRAPH_STATE_KERNEL_SEEN;
+    it->second.ever_seen_kernel = true;
+    return true;
+  }
+  return false;
+}
+
+void GraphCorrelationMap::cycle_end() {
+  std::lock_guard<std::mutex> lock(mutex_);
+  for (auto it = map_.begin(); it != map_.end();) {
+    bool should_remove = false;
+    if (it->second.state[0] == GRAPH_STATE_CYCLE_CLEARED &&
+        it->second.state[1] == GRAPH_STATE_CYCLE_CLEARED) {
+      if (it->second.ever_seen_kernel) {
+        // Graph completed normally
+        should_remove = true;
+      } else if ((current_cycle_ - it->second.insertion_cycle) > 100) {
+        // Fallback: never saw kernels and entry is very old
+        should_remove = true;
+      }
+    }
+    if (should_remove) {
+      it = map_.erase(it);
+    } else {
+      ++it;
+    }
+  }
+}
+
+size_t GraphCorrelationMap::size() const {
+  std::lock_guard<std::mutex> lock(mutex_);
+  return map_.size();
+}
+
+} // namespace parcagpu
diff --git a/src/correlation_filter.h b/src/correlation_filter.h
new file mode 100644
index 0000000..953733f
--- /dev/null
+++ b/src/correlation_filter.h
@@ -0,0 +1,60 @@
+#pragma once
+
+#include <cstdint>
+#include <mutex>
+#include <unordered_map>
+#include <unordered_set>
+
+namespace parcagpu {
+
+//=============================================================================
+// Correlation Filter - tracks which kernel launches we've sampled
+//=============================================================================
+
+// CorrelationFilter for non-graph kernel launches
+// Insert on API callback, check-and-remove on kernel activity
+class CorrelationFilter {
+public:
+  void insert(uint32_t correlation_id);
+  bool check_and_remove(uint32_t correlation_id);
+  size_t size() const;
+
+private:
+  std::unordered_set<uint32_t> set_;
+  mutable std::mutex mutex_;
+};
+
+// Graph correlation state values
+enum GraphCorrelationState {
+  GRAPH_STATE_UNINITIALIZED = 0, // Entry just created, slot not yet processed
+  GRAPH_STATE_CYCLE_CLEARED = 1, // Cycle started, no kernels seen yet
+  GRAPH_STATE_KERNEL_SEEN = 2    // At least one kernel seen this cycle
+};
+
+// GraphCorrelationMap for graph launches (multiple kernels per launch)
+// Uses 2-slot state machine to detect when all kernels from a graph have arrived
+struct GraphCorrelationEntry {
+  uint8_t state[2];         // State for alternating cycles
+  bool ever_seen_kernel;    // True once we've seen at least one kernel activity
+  uint32_t insertion_cycle; // Buffer cycle when entry was created
+
+  GraphCorrelationEntry(uint32_t cycle);
+};
+
+class GraphCorrelationMap {
+public:
+  GraphCorrelationMap();
+
+  void insert(uint32_t correlation_id);
+  void cycle_start(uint32_t cycle);
+  bool check_and_mark_seen(uint32_t correlation_id, uint32_t cycle);
+  void cycle_end();
+  size_t size() const;
+
+private:
+  std::unordered_map<uint32_t, GraphCorrelationEntry> map_;
+  uint32_t current_cycle_;
+  mutable std::mutex mutex_;
+};
+
+} // namespace parcagpu
diff --git a/src/cupti.cpp b/src/cupti.cpp
new file mode 100644
index 0000000..51bd1f5
--- /dev/null
+++ b/src/cupti.cpp
@@ -0,0 +1,586 @@
+#include <atomic>
+#include <cstdint>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <time.h>
+#include <unistd.h>
+
+// USDT probes — must come before any header that might include <sys/sdt.h>,
+// so that _SDT_HAS_SEMAPHORES is defined first.
+#include "probes.h"
+
+// Include proton headers
+#include "Driver/GPU/CuptiApi.h"
+#include "Profiler/Cupti/CuptiCallbacks.h"
+#include "Utility/Singleton.h"
+#include "correlation_filter.h"
+#include "env_config.h"
+#include "pc_sampling.h"
+#include "token_bucket.h"
+
+namespace parcagpu {
+
+// Debug logging control
+bool debug_enabled = false;
+bool limiter_disabled = false;
+
+// Global correlation tracking instances
+static CorrelationFilter g_correlationFilter;
+static GraphCorrelationMap g_graphCorrelationMap;
+static std::atomic<uint32_t> g_bufferCycle{0};
+
+// Thread-local tracking: store correlation ID from runtime ENTER
+// so we can skip driver EXIT probe when it matches (driver calls happen under
+// runtime calls)
+thread_local uint32_t runtimeEnterCorrelationId = 0;
+
+// Thread-local rate limiter for callback probes (default 100/sec,
+// configurable via PARCAGPU_RATE_LIMIT).
+thread_local TokenBucket callbackLimiter(100.0);
+
+// ---------------------------------------------------------------------------
+// PC sampling probabilistic control.
+// Sampling is gated by an interval + probability: at most once per interval,
+// roll the probability die; if it hits, sample all kernels until the interval
+// window closes.
+// ---------------------------------------------------------------------------
+
+// Config — read from env at startup. Stored in a struct so it can be made
+// runtime-adjustable later (e.g. via signal or control file).
+struct PCSamplingConfig {
+  double probability;  // PARCAGPU_PC_SAMPLING_PROBABILITY (default 1/100)
+  uint64_t intervalNs; // PARCAGPU_PC_SAMPLING_INTERVAL in nanoseconds
+};
+static PCSamplingConfig g_pcSamplingConfig = {0.01, 1000000000ULL};
+
+// Per-thread sampling state.
+struct PCSamplingState {
+  bool active = false;        // Currently sampling
+  uint64_t windowStartNs = 0; // When the current window opened
+  uint64_t lastCheckNs = 0;   // Last time we checked the probability
+  unsigned int rngSeed = 0;   // Thread-local RNG state
+};
+thread_local PCSamplingState g_pcSamplingState;
+
+static uint64_t nowNs() {
+  struct timespec ts;
+  clock_gettime(CLOCK_MONOTONIC, &ts);
+  return (uint64_t)ts.tv_sec * 1000000000ULL + ts.tv_nsec;
+}
+
+// Seed the per-thread RNG lazily.
+static void ensureRngSeeded(PCSamplingState &s) {
+  if (s.rngSeed == 0) {
+    uint64_t t = nowNs();
+    s.rngSeed = (unsigned int)(t ^ (uintptr_t)&s);
+    if (s.rngSeed == 0)
+      s.rngSeed = 1;
+  }
+}
+
+static double threadRandom(PCSamplingState &s) {
+  ensureRngSeeded(s);
+  return (double)rand_r(&s.rngSeed) / RAND_MAX;
+}
+
+void init_debug() {
+  static bool initialized = false;
+  if (!initialized) {
+    debug_enabled = getenv("PARCAGPU_DEBUG") != nullptr;
+    limiter_disabled = getenv("PARCAGPU_LIMITER_DISABLE") != nullptr;
+    const char *rateEnv = getenv("PARCAGPU_RATE_LIMIT");
+    if (rateEnv != nullptr) {
+      double rate = atof(rateEnv);
+      if (rate > 0) {
+        callbackLimiter.setRate(rate);
+      }
+    }
+
+    const char *probEnv = getenv("PARCAGPU_PC_SAMPLING_PROBABILITY");
+    if (probEnv) {
+      double p = atof(probEnv);
+      if (p > 0.0 && p <= 1.0)
+        g_pcSamplingConfig.probability = p;
+    }
+    const char *intervalEnv = getenv("PARCAGPU_PC_SAMPLING_INTERVAL");
+    if (intervalEnv) {
+      double s = atof(intervalEnv);
+      if (s > 0.0)
+        g_pcSamplingConfig.intervalNs = (uint64_t)(s * 1e9);
+    }
+
+    validateEnvVars();
+    initialized = true;
+  }
+}
+
+// Out-of-line USDT probe site for activity batches.
+// Single call site ensures one probe location in the ELF .note.stapsdt section.
+static constexpr int ACTIVITY_BATCH_SIZE = 128;
+
+} // namespace parcagpu
+
+__attribute__((noinline)) void parcagpuActivityBatch(const void **ptrs,
+                                                     uint32_t count) {
+  PARCAGPU_ACTIVITY_BATCH(ptrs, count);
+}
+
+namespace parcagpu {
+
+// Simplified profiler using Proton's patterns
+class CuptiProfiler : public proton::Singleton<CuptiProfiler> {
+public:
+  CuptiProfiler() {
+    DEBUG_PRINTF("[PARCAGPU] Initializing ParcaGPUProfiler\n");
+  }
+
+  ~CuptiProfiler() { cleanup(); }
+
+  bool initialize() {
+    if (initialized.exchange(true)) {
+      return true; // Already initialized
+    }
+
+    DEBUG_PRINTF("[PARCAGPU] Starting initialization\n");
+
+    // Check if PC sampling is supported
+    pcSamplingEnabled = parcagpu::PCSampling::isSupported();
+    if (pcSamplingEnabled) {
+      DEBUG_PRINTF("[PARCAGPU] PC sampling enabled (serialized mode)\n");
+    } else {
+      DEBUG_PRINTF(
+          "[PARCAGPU] PC sampling disabled, using kernel activity only\n");
+    }
+
+    // Subscribe to callbacks
+    auto result =
+        proton::cupti::subscribe<true>(&subscriber, callbackHandler, nullptr);
+    if (result != CUPTI_SUCCESS) {
+      DEBUG_PRINTF("[PARCAGPU] Failed to subscribe to callbacks: error %d\n",
+                   result);
+      return false;
+    }
+
+    // Enable runtime and driver API callbacks (using Proton's utilities)
+    proton::setRuntimeCallbacks(subscriber, /*enable=*/true);
+    proton::setLaunchCallbacks(subscriber, /*enable=*/true);
+
+    // Enable resource callbacks only if PC sampling is enabled
+    if (pcSamplingEnabled) {
+      proton::setResourceCallbacks(subscriber, /*enable=*/true);
+    }
+
+    // Register activity buffer callbacks (using Proton's pattern)
+    result = proton::cupti::activityRegisterCallbacks<true>(allocBuffer,
+                                                            completeBuffer);
+    if (result != CUPTI_SUCCESS) {
+      DEBUG_PRINTF(
+          "[PARCAGPU] Failed to register activity callbacks: error %d\n",
+          result);
+      return false;
+    }
+
+    // Enable kernel activity recording
+    result = proton::cupti::activityEnable<true>(
+        CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL);
+    if (result != CUPTI_SUCCESS) {
+      DEBUG_PRINTF(
+          "[PARCAGPU] Failed to enable concurrent kernel activity: error %d\n",
+          result);
+    } else {
+      DEBUG_PRINTF("[PARCAGPU] Enabled CONCURRENT_KERNEL activity\n");
+    }
+
+    DEBUG_PRINTF("[PARCAGPU] Successfully initialized CUPTI callbacks\n");
+    return true;
+  }
+
+  void cleanup() {
+    if (!initialized.exchange(false)) {
+      return; // Already cleaned up
+    }
+
+    DEBUG_PRINTF("[PARCAGPU] Cleanup started\n");
+
+    // PC sampling data is drained in finalize() during CONTEXT_DESTROY_STARTING
+    // when the CUDA context is still valid. By the time cleanup() runs, the
+    // context may already be dead, so we don't drain here.
+
+    // Disable all callbacks
+    if (subscriber) {
+      proton::setRuntimeCallbacks(subscriber, /*enable=*/false);
+      proton::setLaunchCallbacks(subscriber, /*enable=*/false);
+      if (pcSamplingEnabled) {
+        proton::setResourceCallbacks(subscriber, /*enable=*/false);
+      }
+    }
+
+    // Flush any remaining activity records
+    proton::cupti::activityFlushAll<true>(CUPTI_ACTIVITY_FLAG_FLUSH_FORCED);
+
+    // Disable activity recording
+    proton::cupti::activityDisable<true>(CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL);
+
+    // Unsubscribe
+    if (subscriber) {
+      proton::cupti::unsubscribe<true>(subscriber);
+      subscriber = nullptr;
+    }
+
+    DEBUG_PRINTF("[PARCAGPU] Cleanup completed\n");
+  }
+
+private:
+  std::atomic<bool> initialized{false};
+  bool pcSamplingEnabled = false;
+  CUpti_SubscriberHandle subscriber = nullptr;
+
+  // PC sampling state — owned by this profiler, destroyed with it.
+  parcagpu::PCSampling pcSampling;
+
+  // Outstanding event counter for flushing
+  size_t outstandingEvents = 0;
+
+  // Buffer management - using Proton's pattern (static methods)
+  // A kernel activity is around 224 bytes so a 128kb buffer
+  // will hold ~500 activities, we want to flush regularly since
+  // we are a continuous profiler so we don't need a huge buffer
+  // like most CUPTI profilers.  Also a small size avoids malloc
+  // just going to mmap every time so the allocator should cache
+  // and re-use these for us.
+  static constexpr size_t AlignSize = 8;
+  static constexpr size_t BufferSize = 128 * 1024;
+
+  static void allocBuffer(uint8_t **buffer, size_t *bufferSize,
+                          size_t *maxNumRecords) {
+    if (!PARCAGPU_CUDA_CORRELATION_ENABLED()) {
+      *buffer = nullptr;
+      return;
+    }
+    *buffer = static_cast<uint8_t *>(aligned_alloc(AlignSize, BufferSize));
+    if (*buffer == nullptr) {
+      DEBUG_PRINTF("[PARCAGPU] ERROR: aligned_alloc failed\n");
+      return;
+    }
+    *bufferSize = BufferSize;
+    *maxNumRecords = 0;
+    DEBUG_PRINTF("[PARCAGPU:allocBuffer] Allocated buffer at %p size %zu\n",
+                 *buffer, *bufferSize);
+  }
+
+  static void completeBuffer(CUcontext ctx, uint32_t streamId, uint8_t *buffer,
+                             size_t size, size_t validSize) {
+    CUpti_Activity *record = nullptr;
+    int recordCount = 0;
+    int filteredCount = 0;
+
+    // Batch probe: collect pointers to activity records and pass them to
+    // BPF/USDT every ACTIVITY_BATCH_SIZE records. Stack-allocated array
+    // of pointers — no heap allocation, no copying, version-independent.
+    const void *batchPtrs[ACTIVITY_BATCH_SIZE];
+    uint32_t batchCount = 0;
+
+    DEBUG_PRINTF(
+        "[PARCAGPU] completeBuffer called: ctx=%p buffer=%p validSize=%zu\n",
+        ctx, buffer, validSize);
+
+    // Start a new buffer cycle for graph correlation tracking
+    uint32_t cycle = g_bufferCycle.fetch_add(1);
+    g_graphCorrelationMap.cycle_start(cycle);
+
+    while (true) {
+      CUptiResult result = proton::cupti::activityGetNextRecord<false>(
+          buffer, validSize, &record);
+      if (result == CUPTI_ERROR_MAX_LIMIT_REACHED) {
+        break;
+      } else if (result != CUPTI_SUCCESS) {
+        DEBUG_PRINTF("[PARCAGPU] Error reading activity record: error %d\n",
+                     result);
+        break;
+      }
+
+      recordCount++;
+      switch (record->kind) {
+      case CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL:
+      case CUPTI_ACTIVITY_KIND_KERNEL: {
+        auto *k = reinterpret_cast<CUpti_ActivityKernel5 *>(record);
+
+        // Check correlation filter - only emit probe if this kernel was sampled
+        bool shouldEmit = false;
+        if (k->graphId != 0) {
+          // Graph kernel - check graph correlation map
+          shouldEmit = g_graphCorrelationMap.check_and_mark_seen(
+              k->correlationId, cycle);
+        } else {
+          // Regular kernel - check and remove from correlation filter
+          shouldEmit = g_correlationFilter.check_and_remove(k->correlationId);
+        }
+
+        if (!shouldEmit) {
+          filteredCount++;
+          DEBUG_PRINTF("[PARCAGPU] Filtered kernel activity: correlationId=%u "
+                       "graphId=%u (not in filter)\n",
+                       k->correlationId, k->graphId);
+          break;
+        }
+
+        DEBUG_PRINTF("[PARCAGPU] Kernel activity: graphId=%u graphNodeId=%lu "
+                     "name=%s, correlationId=%u, deviceId=%u, "
+                     "streamId=%u, start=%lu, end=%lu, duration=%lu ns\n",
+                     k->graphId, k->graphNodeId, k->name, k->correlationId,
+                     k->deviceId, k->streamId, k->start, k->end,
+                     k->end - k->start);
+
+        // Emit USDT probe for kernel execution
+        PARCAGPU_KERNEL_EXECUTED(k->start, k->end, k->correlationId,
+                                 k->deviceId, k->streamId, k->graphId,
+                                 k->graphNodeId, k->name);
+        break;
+      }
+      default:
+        DEBUG_PRINTF("[PARCAGPU] Activity record %d: kind=%d\n", recordCount,
+                     record->kind);
+        break;
+      }
+
+      // Collect pointer for batch probe (all activity kinds).
+      // BPF consumers inspect the kind field to filter types they care about.
+      batchPtrs[batchCount++] = record;
+      if (batchCount >= ACTIVITY_BATCH_SIZE) {
+        parcagpuActivityBatch(batchPtrs, batchCount);
+        batchCount = 0;
+      }
+    }
+
+    // Flush remaining batch
+    if (batchCount > 0) {
+      parcagpuActivityBatch(batchPtrs, batchCount);
+    }
+
+    // End cycle - cleanup completed graph entries
+    g_graphCorrelationMap.cycle_end();
+
+    DEBUG_PRINTF("[PARCAGPU] Processed %d activity records (%d filtered) from "
+                 "buffer %p\n",
+                 recordCount, filteredCount, buffer);
+
+    // Reset to 0 rather than decrement - one API callback can produce N
+    // activities so decrementing by recordCount can cause underflow
+    CuptiProfiler::instance().outstandingEvents = 0;
+
+    // Free the buffer (Proton's pattern)
+    std::free(buffer);
+  }
+
+  static void callbackHandler(void *userdata, CUpti_CallbackDomain domain,
+                              CUpti_CallbackId cbid, const void *cbdata_void) {
+    auto &profiler = CuptiProfiler::instance();
+
+    if (domain == CUPTI_CB_DOMAIN_RESOURCE) {
+      // Handle resource callbacks for PC sampling (only if enabled)
+      if (!profiler.pcSamplingEnabled) {
+        return;
+      }
+
+      const CUpti_ResourceData *resData =
+          static_cast<const CUpti_ResourceData *>(cbdata_void);
+
+      switch (cbid) {
+      case CUPTI_CBID_RESOURCE_MODULE_LOADED: {
+        const CUpti_ModuleResourceData *modData =
+            static_cast<const CUpti_ModuleResourceData *>(
+                resData->resourceDescriptor);
+        if (modData && modData->pCubin && modData->cubinSize > 0) {
+          DEBUG_PRINTF("[PARCAGPU] Module loaded: cubin=%p size=%zu\n",
+                       modData->pCubin, modData->cubinSize);
+          profiler.pcSampling.loadModule(modData->pCubin, modData->cubinSize);
+        }
+        break;
+      }
+      case CUPTI_CBID_RESOURCE_MODULE_UNLOAD_STARTING: {
+        const CUpti_ModuleResourceData *modData =
+            static_cast<const CUpti_ModuleResourceData *>(
+                resData->resourceDescriptor);
+        if (modData && modData->pCubin && modData->cubinSize > 0) {
+          DEBUG_PRINTF("[PARCAGPU] Module unloading: cubin=%p size=%zu\n",
+                       modData->pCubin, modData->cubinSize);
+          profiler.pcSampling.unloadModule(modData->pCubin, modData->cubinSize);
+        }
+        break;
+      }
+      case CUPTI_CBID_RESOURCE_CONTEXT_CREATED: {
+        CUcontext ctx = resData->context;
+        DEBUG_PRINTF("[PARCAGPU] Context created: %p\n", ctx);
+        profiler.pcSampling.initialize(ctx);
+        break;
+      }
+      case CUPTI_CBID_RESOURCE_CONTEXT_DESTROY_STARTING: {
+        CUcontext ctx = resData->context;
+        DEBUG_PRINTF("[PARCAGPU] Context destroying: %p\n", ctx);
+        profiler.pcSampling.finalize(ctx);
+        break;
+      }
+      default:
+        break;
+      }
+    } else {
+      // Handle both Runtime and Driver API callbacks
+      const CUpti_CallbackData *cbdata =
+          static_cast<const CUpti_CallbackData *>(cbdata_void);
+      uint32_t correlationId = cbdata->correlationId;
+
+      // ENTER: manage probabilistic sampling windows.
+      // start() begins CUPTI PC sampling (kernels serialized).
+      // stop() ends it and drains data (kernels concurrent again).
+      if (cbdata->callbackSite == CUPTI_API_ENTER) {
+        if (domain == CUPTI_CB_DOMAIN_RUNTIME_API)
+          runtimeEnterCorrelationId = correlationId;
+
+        if (profiler.pcSamplingEnabled) {
+          auto &st = g_pcSamplingState;
+          auto &cfg = g_pcSamplingConfig;
+          uint64_t now = nowNs();
+
+          // If sampling and the window has closed, stop + drain.
+          if (st.active && (now - st.windowStartNs >= cfg.intervalNs)) {
+            profiler.pcSampling.stop(cbdata->context);
+            st.active = false;
+          }
+
+          // If not sampling, check interval then probability.
+          if (!st.active && (now - st.lastCheckNs >= cfg.intervalNs)) {
+            st.lastCheckNs = now;
+            if (threadRandom(st) < cfg.probability) {
+              st.active = true;
+              st.windowStartNs = now;
+              profiler.pcSampling.start(cbdata->context);
+            }
+          }
+
+          profiler.pcSampling.emitMetadata();
+        }
+        return;
+      }
+
+      // Process on EXIT to avoid adding latency to GPU launch
+      if (cbdata->callbackSite != CUPTI_API_EXIT) {
+        return;
+      }
+
+      // EXIT: stop sampling if the window has closed.
+      if (profiler.pcSamplingEnabled) {
+        auto &st = g_pcSamplingState;
+        auto &cfg = g_pcSamplingConfig;
+        uint64_t now = nowNs();
+
+        if (st.active && (now - st.windowStartNs >= cfg.intervalNs)) {
+          profiler.pcSampling.stop(cbdata->context);
+          st.active = false;
+        }
+
+        profiler.pcSampling.emitMetadata();
+      }
+
+      // Skip correlation/rate-limiter work when no profiler is attached.
+      if (!PARCAGPU_CUDA_CORRELATION_ENABLED())
+        return;
+
+      const char *name =
+          cbdata->symbolName ? cbdata->symbolName : cbdata->functionName;
+      int signedCbid;
+
+      if (domain == CUPTI_CB_DOMAIN_DRIVER_API) {
+        // Skip if this driver call is under a runtime call (same correlation
+        // ID)
+        if (correlationId == runtimeEnterCorrelationId) {
+          DEBUG_PRINTF("[PARCAGPU] Skipping driver EXIT correlationId=%u - "
+                       "runtime will handle\n",
+                       correlationId);
+          return;
+        }
+        // Pure driver call (no runtime wrapper) - use negative cbid
+        signedCbid = -(int)cbid;
+        DEBUG_PRINTF("[PARCAGPU] Driver API callback: cbid=%d, "
+                     "correlationId=%u, func=%s\n",
+                     cbid, correlationId, name);
+      } else if (domain == CUPTI_CB_DOMAIN_RUNTIME_API) {
+        signedCbid = (int)cbid;
+        runtimeEnterCorrelationId = 0; // Clear after use
+        DEBUG_PRINTF("[PARCAGPU] Runtime API callback: cbid=%d, "
+                     "correlationId=%u, func=%s\n",
+                     cbid, correlationId, name);
+      } else {
+        return;
+      }
+
+      // Check if this is a graph launch (never rate limit these)
+      bool isGraphLaunch = false;
+      if (signedCbid < 0) {
+        // Driver API: cuGraphLaunch = 514, cuGraphLaunch_ptsz = 515
+        int driverCbid = -signedCbid;
+        isGraphLaunch =
+            (driverCbid == CUPTI_DRIVER_TRACE_CBID_cuGraphLaunch ||
+             driverCbid == CUPTI_DRIVER_TRACE_CBID_cuGraphLaunch_ptsz);
+      } else {
+        // Runtime API: cudaGraphLaunch = 311, cudaGraphLaunch_ptsz = 312
+        isGraphLaunch =
+            (signedCbid == CUPTI_RUNTIME_TRACE_CBID_cudaGraphLaunch_v10000 ||
+             signedCbid ==
+                 CUPTI_RUNTIME_TRACE_CBID_cudaGraphLaunch_ptsz_v10000);
+      }
+
+      // Rate limit probes using token bucket (skip for graph launches)
+      if (!limiter_disabled && !isGraphLaunch) {
+        if (!callbackLimiter.tryAcquire()) {
+          DEBUG_PRINTF(
+              "[PARCAGPU] Rate limited: skipping probe for correlationId=%u\n",
+              correlationId);
+          return;
+        }
+      }
+
+      profiler.outstandingEvents++;
+      // Emit USDT probe with signed cbid (negative for driver, positive for
+      // runtime)
+      PARCAGPU_CUDA_CORRELATION(correlationId, signedCbid, name);
+
+      // Insert into correlation filter so we can match kernel activities later
+      if (isGraphLaunch) {
+        g_graphCorrelationMap.insert(correlationId);
+        DEBUG_PRINTF("[PARCAGPU] Inserted correlationId=%u into graph map\n",
+                     correlationId);
+      } else {
+        g_correlationFilter.insert(correlationId);
+        DEBUG_PRINTF(
+            "[PARCAGPU] Inserted correlationId=%u into correlation filter\n",
+            correlationId);
+      }
+
+      // Flush if too many events pile up
+      if (profiler.outstandingEvents > 3000) {
+        DEBUG_PRINTF("[PARCAGPU] Flushing: outstandingEvents=%zu\n",
+                     profiler.outstandingEvents);
+        proton::cupti::activityFlushAll<true>(0);
+        profiler.outstandingEvents = 0;
+      }
+    }
+  }
+};
+
+} // namespace parcagpu
+
+// CUPTI initialization function required for CUDA_INJECTION64_PATH
+extern "C" int InitializeInjection(void) {
+  DEBUG_PRINTF("[PARCAGPU] InitializeInjection called\n");
+
+  auto &profiler = parcagpu::CuptiProfiler::instance();
+  if (!profiler.initialize()) {
+    return 0; // Return 0 on failure, but don't break injection
+  }
+
+  // Register cleanup at exit
+  atexit([]() { parcagpu::CuptiProfiler::instance().cleanup(); });
+
+  return 1; // Success
+}
diff --git a/src/env_config.cpp b/src/env_config.cpp
new file mode 100644
index 0000000..7d0bac7
--- /dev/null
+++ b/src/env_config.cpp
@@ -0,0 +1,103 @@
+#include "env_config.h"
+#include "pc_sampling.h" // DEBUG_PRINTF, fireError
+#include "probes.h"
+
+#include <cstdlib>
+#include <cstring>
+
+extern char **environ;
+
+namespace parcagpu {
+
+// Known PARCAGPU_* environment variables.
+static const char *knownVars[] = {
+    "PARCAGPU_DEBUG",
+    "PARCAGPU_LIMITER_DISABLE",
+    "PARCAGPU_RATE_LIMIT",
+    "PARCAGPU_SAMPLING_FACTOR",
+    "PARCAGPU_PC_SAMPLING_PROBABILITY",
+    "PARCAGPU_PC_SAMPLING_INTERVAL",
+};
+static constexpr size_t numKnownVars =
+    sizeof(knownVars) / sizeof(knownVars[0]);
+
+static bool isKnown(const char *name, size_t nameLen) {
+  for (size_t i = 0; i < numKnownVars; ++i) {
+    if (std::strlen(knownVars[i]) == nameLen &&
+        std::strncmp(knownVars[i], name, nameLen) == 0)
+      return true;
+  }
+  return false;
+}
+
+void validateEnvVars() {
+  // Scan environment for unrecognized PARCAGPU_* variables.
+  static constexpr const char prefix[] = "PARCAGPU_";
+  static constexpr size_t prefixLen = sizeof(prefix) - 1;
+
+  for (char **ep = environ; *ep; ++ep) {
+    if (std::strncmp(*ep, prefix, prefixLen) != 0)
+      continue;
+
+    // Extract variable name (everything before '=').
+    const char *eq = std::strchr(*ep, '=');
+    size_t nameLen = eq ? (size_t)(eq - *ep) : std::strlen(*ep);
+
+    if (!isKnown(*ep, nameLen)) {
+      // Null-terminate for printing.
+      char nameBuf[128] = {};
+      size_t copyLen = nameLen < sizeof(nameBuf) - 1 ? nameLen : sizeof(nameBuf) - 1;
+      std::memcpy(nameBuf, *ep, copyLen);
+
+      DEBUG_PRINTF("[PARCAGPU] Warning: unrecognized env var '%s'\n", nameBuf);
+      fireError(0, nameBuf, "env_config: unrecognized variable");
+    }
+  }
+
+  // Validate specific variables.
+  const char *val;
+
+  val = std::getenv("PARCAGPU_RATE_LIMIT");
+  if (val) {
+    double rate = std::atof(val);
+    if (rate <= 0) {
+      DEBUG_PRINTF("[PARCAGPU] Warning: PARCAGPU_RATE_LIMIT=%s invalid "
+                   "(must be > 0), using default\n", val);
+      fireError(0, val, "env_config: PARCAGPU_RATE_LIMIT invalid");
+    }
+  }
+
+  val = std::getenv("PARCAGPU_SAMPLING_FACTOR");
+  if (val) {
+    int factor = std::atoi(val);
+    if (factor != 0 && (factor < 5 || factor > 31)) {
+      DEBUG_PRINTF("[PARCAGPU] Warning: PARCAGPU_SAMPLING_FACTOR=%s out of "
+                   "range [0, 5-31], using default\n", val);
+      fireError(0, val, "env_config: PARCAGPU_SAMPLING_FACTOR out of range");
+    }
+  }
+
+  val = std::getenv("PARCAGPU_PC_SAMPLING_PROBABILITY");
+  if (val) {
+    double p = std::atof(val);
+    if (p <= 0.0 || p > 1.0) {
+      DEBUG_PRINTF("[PARCAGPU] Warning: PARCAGPU_PC_SAMPLING_PROBABILITY=%s "
+                   "invalid (must be in (0, 1]), using default\n", val);
+      fireError(0, val,
+                "env_config: PARCAGPU_PC_SAMPLING_PROBABILITY invalid");
+    }
+  }
+
+  val = std::getenv("PARCAGPU_PC_SAMPLING_INTERVAL");
+  if (val) {
+    double s = std::atof(val);
+    if (s <= 0.0) {
+      DEBUG_PRINTF("[PARCAGPU] Warning: PARCAGPU_PC_SAMPLING_INTERVAL=%s "
+                   "invalid (must be > 0), using default\n", val);
+      fireError(0, val,
+                "env_config: PARCAGPU_PC_SAMPLING_INTERVAL invalid");
+    }
+  }
+}
+
+} // namespace parcagpu
diff --git a/src/env_config.h b/src/env_config.h
new file mode 100644
index 0000000..2075c85
--- /dev/null
+++ b/src/env_config.h
@@ -0,0 +1,16 @@
+#ifndef PARCAGPU_ENV_CONFIG_H_
+#define PARCAGPU_ENV_CONFIG_H_
+
+namespace parcagpu {
+
+// Scan the process environment for all PARCAGPU_* variables.
+// Warns (via DEBUG_PRINTF and the error probe) about unrecognized names.
+// Validates types and ranges for known variables; on invalid values, fires
+// the error probe, prints a debug warning, and uses the default.
+//
+// Call once at startup (e.g. from init_debug).
+void validateEnvVars();
+
+} // namespace parcagpu
+
+#endif // PARCAGPU_ENV_CONFIG_H_
diff --git a/src/pc_sampling.cpp b/src/pc_sampling.cpp
new file mode 100644
index 0000000..230b494
--- /dev/null
+++ b/src/pc_sampling.cpp
@@ -0,0 +1,772 @@
+#include "Driver/GPU/CudaApi.h"
+#include "Driver/GPU/CuptiApi.h"
+#include "pc_sampling.h"
+#include "probes.h"
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <memory>
+#include <stdexcept>
+#include <tuple>
+
+namespace parcagpu {
+
+// CUDA driver version for 12.8.1 (minimum for PC sampling)
+// Version format: major * 1000 + minor * 10 + patch
+#define CUDA_VERSION_12_8_1 12081
+
+// CUPTI version that added correlationId to CUpti_PCSamplingPCData,
+// breaking ABI compatibility.
+#define CUPTI_CUDA12_4_VERSION 22
+
+// noinline wrappers so each USDT probe has exactly one call site in the
+// binary.  Multiple call sites produce multiple .note.stapsdt entries with
+// different argument encodings, which complicates BPF attachment.
+__attribute__((noinline)) void fireCubinLoaded(uint64_t crc, const char *cubin,
+                                               uint64_t size) {
+  PARCAGPU_CUBIN_LOADED(crc, cubin, size);
+}
+
+__attribute__((noinline)) void fireCubinUnloaded(uint64_t crc) {
+  PARCAGPU_CUBIN_UNLOADED(crc);
+}
+
+// Max records per pc_sample_batch probe invocation.
+static constexpr uint32_t PCSampleBatchSize = 128;
+
+__attribute__((noinline)) void firePCSampleBatch(
+    const void **ptrs, uint32_t count) {
+  PARCAGPU_PC_SAMPLE_BATCH(ptrs, count);
+}
+
+namespace {
+
+// CUPTI helper functions (adapted from Proton's CuptiPCSamplingUtils.h)
+// These wrap Proton's cupti API calls with PARCAGPU-specific setup
+
+uint64_t getCubinCrc(const char *cubin, size_t size) {
+  CUpti_GetCubinCrcParams cubinCrcParams = {
+      /*size=*/CUpti_GetCubinCrcParamsSize,
+      /*cubinSize=*/size,
+      /*cubin=*/cubin,
+      /*cubinCrc=*/0,
+  };
+  proton::cupti::getCubinCrc<true>(&cubinCrcParams);
+  return cubinCrcParams.cubinCrc;
+}
+
+void enablePCSampling(CUcontext context) {
+  CUpti_PCSamplingEnableParams params = {
+      /*size=*/CUpti_PCSamplingEnableParamsSize,
+      /*pPriv=*/NULL,
+      /*ctx=*/context,
+  };
+  proton::cupti::pcSamplingEnable<true>(&params);
+}
+
+bool startPCSampling(CUcontext context) {
+  // CUPTI requires the GPU to be idle before starting PC sampling.
+  proton::cuda::ctxSynchronize<false>();
+  CUpti_PCSamplingStartParams params = {
+      /*size=*/CUpti_PCSamplingStartParamsSize,
+      /*pPriv=*/NULL,
+      /*ctx=*/context,
+  };
+  auto ret = proton::cupti::pcSamplingStart<false>(&params);
+  if (ret != CUPTI_SUCCESS) {
+    DEBUG_PRINTF("cuptiPCSamplingStart failed: %d\n", ret);
+    return false;
+  }
+  return true;
+}
+
+bool stopPCSampling(CUcontext context) {
+  CUpti_PCSamplingStopParams params = {
+      /*size=*/CUpti_PCSamplingStopParamsSize,
+      /*pPriv=*/NULL,
+      /*ctx=*/context,
+  };
+  auto ret = proton::cupti::pcSamplingStop<false>(&params);
+  if (ret != CUPTI_SUCCESS) {
+    DEBUG_PRINTF("cuptiPCSamplingStop failed: %d\n", ret);
+    return false;
+  }
+  return true;
+}
+
+void disablePCSampling(CUcontext context) {
+  CUpti_PCSamplingDisableParams params = {
+      /*size=*/CUpti_PCSamplingDisableParamsSize,
+      /*pPriv=*/NULL,
+      /*ctx=*/context,
+  };
+  proton::cupti::pcSamplingDisable<true>(&params);
+}
+
+// Returns true if data was retrieved successfully, false on error.
+bool getPCSamplingData(CUcontext context,
+                       CUpti_PCSamplingData *pcSamplingData) {
+  CUpti_PCSamplingGetDataParams params = {
+      /*size=*/CUpti_PCSamplingGetDataParamsSize,
+      /*pPriv=*/NULL,
+      /*ctx=*/context,
+      /*pcSamplingData=*/pcSamplingData,
+  };
+  auto result = proton::cupti::pcSamplingGetData<false>(&params);
+  if (result != CUPTI_SUCCESS) {
+    DEBUG_PRINTF("cuptiPCSamplingGetData failed: error %d (ctx=%p)\n", result,
+                 context);
+    return false;
+  }
+  return true;
+}
+
+void setConfigurationAttribute(
+    CUcontext context,
+    std::vector<CUpti_PCSamplingConfigurationInfo> &configurationInfos) {
+  CUpti_PCSamplingConfigurationInfoParams infoParams = {
+      /*size=*/CUpti_PCSamplingConfigurationInfoParamsSize,
+      /*pPriv=*/NULL,
+      /*ctx=*/context,
+      /*numAttributes=*/configurationInfos.size(),
+      /*pPCSamplingConfigurationInfo=*/configurationInfos.data(),
+  };
+  proton::cupti::pcSamplingSetConfigurationAttribute<true>(&infoParams);
+}
+
+std::tuple<uint32_t, std::string, std::string>
+getSassToSourceCorrelation(const char *functionName, uint64_t pcOffset,
+                           const char *cubin, size_t cubinSize) {
+  CUpti_GetSassToSourceCorrelationParams sassToSourceParams = {
+      /*size=*/CUpti_GetSassToSourceCorrelationParamsSize,
+      /*cubin=*/cubin,
+      /*functionName=*/functionName,
+      /*cubinSize=*/cubinSize,
+      /*lineNumber=*/0,
+      /*pcOffset=*/pcOffset,
+      /*fileName=*/NULL,
+      /*dirName=*/NULL,
+  };
+  // Get source can fail if the line mapping is not available
+  proton::cupti::getSassToSourceCorrelation<false>(&sassToSourceParams);
+  auto fileNameStr = sassToSourceParams.fileName
+                         ? std::string(sassToSourceParams.fileName)
+                         : "";
+  auto dirNameStr =
+      sassToSourceParams.dirName ? std::string(sassToSourceParams.dirName) : "";
+  // Free the memory
+  if (sassToSourceParams.fileName)
+    std::free(sassToSourceParams.fileName);
+  if (sassToSourceParams.dirName)
+    std::free(sassToSourceParams.dirName);
+  return std::make_tuple(sassToSourceParams.lineNumber, fileNameStr,
+                         dirNameStr);
+}
+
+// Double-checked locking helper
+template <typename CheckFn, typename ActionFn>
+void doubleCheckedLock(CheckFn check, std::mutex &mutex, ActionFn action) {
+  if (check()) {
+    std::lock_guard<std::mutex> lock(mutex);
+    if (check()) {
+      action();
+    }
+  }
+}
+
+// Helper to get PARCAGPU's custom sampling frequency from environment
+uint32_t getGPUSamplingFrequency() {
+  // Default frequency for PARCAGPU is 18 (Proton uses 10)
+  constexpr uint32_t PARCAGPU_DEFAULT_FREQUENCY = 18;
+
+  uint32_t samplingPeriod = PARCAGPU_DEFAULT_FREQUENCY;
+  const char *sampling_factor_env = getenv("PARCAGPU_SAMPLING_FACTOR");
+  if (sampling_factor_env) {
+    int factor = atoi(sampling_factor_env);
+    if (factor >= 5 && factor <= 31) {
+      samplingPeriod = factor;
+      DEBUG_PRINTF("Using PARCAGPU_SAMPLING_FACTOR=%u\n", samplingPeriod);
+    } else if (factor != 0) { // 0 is handled in isSupported()
+      fprintf(stderr,
+              "[PARCAGPU] Warning: PARCAGPU_SAMPLING_FACTOR=%d out of range "
+              "[5,31], using default %u\n",
+              factor, PARCAGPU_DEFAULT_FREQUENCY);
+    }
+  } else {
+    return 0;
+  }
+  return samplingPeriod;
+}
+
+// Get number of stall reasons
+size_t getNumStallReasons(CUcontext context) {
+  size_t numStallReasons = 0;
+  CUpti_PCSamplingGetNumStallReasonsParams numStallReasonsParams = {
+      /*size=*/CUpti_PCSamplingGetNumStallReasonsParamsSize,
+      /*pPriv=*/NULL,
+      /*ctx=*/context,
+      /*numStallReasons=*/&numStallReasons};
+  proton::cupti::pcSamplingGetNumStallReasons<true>(&numStallReasonsParams);
+  return numStallReasons;
+}
+
+// Get stall reason names and indices
+std::pair<char **, uint32_t *>
+getStallReasonNamesAndIndices(CUcontext context, size_t numStallReasons) {
+  char **stallReasonNames =
+      static_cast<char **>(std::calloc(numStallReasons, sizeof(char *)));
+  for (size_t i = 0; i < numStallReasons; i++) {
+    stallReasonNames[i] = static_cast<char *>(
+        std::calloc(CUPTI_STALL_REASON_STRING_SIZE, sizeof(char)));
+  }
+  uint32_t *stallReasonIndices =
+      static_cast<uint32_t *>(std::calloc(numStallReasons, sizeof(uint32_t)));
+  CUpti_PCSamplingGetStallReasonsParams stallReasonsParams = {
+      /*size=*/CUpti_PCSamplingGetStallReasonsParamsSize,
+      /*pPriv=*/NULL,
+      /*ctx=*/context,
+      /*numStallReasons=*/numStallReasons,
+      /*stallReasonIndex=*/stallReasonIndices,
+      /*stallReasons=*/stallReasonNames,
+  };
+  proton::cupti::pcSamplingGetStallReasons<true>(&stallReasonsParams);
+  return std::make_pair(stallReasonNames, stallReasonIndices);
+}
+
+// Match stall reasons to indices (PARCAGPU emits all stall reasons)
+size_t matchStallReasonsToIndices(
+    size_t numStallReasons, char **stallReasonNames,
+    uint32_t *stallReasonIndices,
+    std::map<size_t, size_t> &stallReasonIndexToMetricIndex,
+    std::set<size_t> &notIssuedStallReasonIndices) {
+  // PARCAGPU emits all stall reasons
+  size_t numValidStalls = 0;
+  for (size_t i = 0; i < numStallReasons; i++) {
+    std::string cuptiStallName = std::string(stallReasonNames[i]);
+    bool notIssued = cuptiStallName.find("not_issued") != std::string::npos ||
+                     cuptiStallName.find("Not Issued") != std::string::npos;
+
+    if (notIssued)
+      notIssuedStallReasonIndices.insert(stallReasonIndices[i]);
+    stallReasonIndexToMetricIndex[stallReasonIndices[i]] = i;
+    numValidStalls++;
+  }
+  return numValidStalls;
+}
+
+// Allocate PC sampling data buffer
+CUpti_PCSamplingData allocPCSamplingData(size_t collectNumPCs,
+                                         size_t numValidStallReasons) {
+  CUpti_PCSamplingData pcSamplingData{
+      /*size=*/sizeof(CUpti_PCSamplingData),
+      /*collectNumPcs=*/collectNumPCs,
+      /*totalSamples=*/0,
+      /*droppedSamples=*/0,
+      /*totalNumPcs=*/0,
+      /*remainingNumPcs=*/0,
+      /*rangeId=*/0,
+      /*pPcData=*/
+      static_cast<CUpti_PCSamplingPCData *>(
+          std::calloc(collectNumPCs, sizeof(CUpti_PCSamplingPCData)))};
+  for (size_t i = 0; i < collectNumPCs; ++i) {
+    pcSamplingData.pPcData[i].stallReason =
+        static_cast<CUpti_PCSamplingStallReason *>(std::calloc(
+            numValidStallReasons, sizeof(CUpti_PCSamplingStallReason)));
+  }
+  return pcSamplingData;
+}
+
+} // namespace
+
+// ConfigureData implementation
+
+CUpti_PCSamplingConfigurationInfo ConfigureData::configureStallReasons() {
+  numStallReasons = getNumStallReasons(context);
+  std::tie(this->stallReasonNames, this->stallReasonIndices) =
+      getStallReasonNamesAndIndices(context, numStallReasons);
+  numValidStallReasons = matchStallReasonsToIndices(
+      numStallReasons, stallReasonNames, stallReasonIndices,
+      stallReasonIndexToMetricIndex, notIssuedStallReasonIndices);
+
+  CUpti_PCSamplingConfigurationInfo stallReasonInfo{};
+  stallReasonInfo.attributeType =
+      CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_STALL_REASON;
+  stallReasonInfo.attributeData.stallReasonData.stallReasonCount =
+      numValidStallReasons;
+  stallReasonInfo.attributeData.stallReasonData.pStallReasonIndex =
+      stallReasonIndices;
+  return stallReasonInfo;
+}
+
+CUpti_PCSamplingConfigurationInfo ConfigureData::configureSamplingPeriod() {
+  CUpti_PCSamplingConfigurationInfo samplingPeriodInfo{};
+  samplingPeriodInfo.attributeType =
+      CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_SAMPLING_PERIOD;
+
+  // Use PARCAGPU's custom sampling frequency
+  uint32_t frequency = getGPUSamplingFrequency();
+
+  samplingPeriodInfo.attributeData.samplingPeriodData.samplingPeriod =
+      frequency;
+  return samplingPeriodInfo;
+}
+
+CUpti_PCSamplingConfigurationInfo ConfigureData::configureSamplingBuffer() {
+  CUpti_PCSamplingConfigurationInfo samplingBufferInfo{};
+  samplingBufferInfo.attributeType =
+      CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_SAMPLING_DATA_BUFFER;
+  this->pcSamplingData =
+      allocPCSamplingData(DataBufferPCCount, numValidStallReasons);
+  samplingBufferInfo.attributeData.samplingDataBufferData.samplingDataBuffer =
+      &this->pcSamplingData;
+  return samplingBufferInfo;
+}
+
+CUpti_PCSamplingConfigurationInfo ConfigureData::configureScratchBuffer() {
+  CUpti_PCSamplingConfigurationInfo scratchBufferInfo{};
+  scratchBufferInfo.attributeType =
+      CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_SCRATCH_BUFFER_SIZE;
+  scratchBufferInfo.attributeData.scratchBufferSizeData.scratchBufferSize =
+      ScratchBufferSize;
+  return scratchBufferInfo;
+}
+
+CUpti_PCSamplingConfigurationInfo ConfigureData::configureHardwareBufferSize() {
+  CUpti_PCSamplingConfigurationInfo hardwareBufferInfo{};
+  hardwareBufferInfo.attributeType =
+      CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_HARDWARE_BUFFER_SIZE;
+  hardwareBufferInfo.attributeData.hardwareBufferSizeData.hardwareBufferSize =
+      HardwareBufferSize;
+  return hardwareBufferInfo;
+}
+
+CUpti_PCSamplingConfigurationInfo ConfigureData::configureCollectionMode() {
+  CUpti_PCSamplingConfigurationInfo collectionModeInfo{};
+  collectionModeInfo.attributeType =
+      CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_COLLECTION_MODE;
+  collectionModeInfo.attributeData.collectionModeData.collectionMode =
+      CUPTI_PC_SAMPLING_COLLECTION_MODE_KERNEL_SERIALIZED;
+  return collectionModeInfo;
+}
+
+CUpti_PCSamplingConfigurationInfo ConfigureData::configureStartStopControl() {
+  CUpti_PCSamplingConfigurationInfo startStopControlInfo{};
+  startStopControlInfo.attributeType =
+      CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_ENABLE_START_STOP_CONTROL;
+  startStopControlInfo.attributeData.enableStartStopControlData
+      .enableStartStopControl = true;
+  return startStopControlInfo;
+}
+
+void ConfigureData::initialize(CUcontext context) {
+  this->context = context;
+  proton::cupti::getContextId<true>(context, &contextId);
+
+  DEBUG_PRINTF("Initializing PC sampling for context %p (id %u)\n", context,
+               contextId);
+
+  configurationInfos.emplace_back(configureStallReasons());
+  configurationInfos.emplace_back(configureCollectionMode());
+  configurationInfos.emplace_back(configureStartStopControl());
+  configurationInfos.emplace_back(configureSamplingBuffer());
+  // Don't set sampling period — let CUPTI use its default.
+  // Explicit period values silently break sampling on some GPUs (e.g.
+  // Blackwell).
+
+  setConfigurationAttribute(context, configurationInfos);
+
+  // Allocate a separate output buffer for getPCSamplingData calls.
+  // The configured pcSamplingData buffer is owned by CUPTI internally;
+  // we must pass a different buffer to getPCSamplingData.
+  this->outputData =
+      allocPCSamplingData(DataBufferPCCount, numValidStallReasons);
+
+  DEBUG_PRINTF("PC sampling configured with %u stall reasons (%u valid)\n",
+               numStallReasons, numValidStallReasons);
+}
+
+// GPUPCSampling implementation
+
+bool PCSampling::isSupported() {
+  // PC sampling is enabled by default.
+  // Set PARCAGPU_SAMPLING_FACTOR=0 to disable.
+  const char *env = getenv("PARCAGPU_SAMPLING_FACTOR");
+  if (env && atoi(env) == 0) {
+    DEBUG_PRINTF("PC sampling disabled via PARCAGPU_SAMPLING_FACTOR=0\n");
+    return false;
+  }
+
+  // Check CUDA driver version >= 12.8.1
+  int driverVersion = 0;
+  proton::cuda::driverGetVersion<true>(&driverVersion);
+
+  if (driverVersion < CUDA_VERSION_12_8_1) {
+    int major = driverVersion / 1000;
+    int minor = (driverVersion % 1000) / 10;
+    int patch = driverVersion % 10;
+    DEBUG_PRINTF("PC sampling not supported: CUDA driver version %d.%d.%d < "
+                 "required 12.8.1\n",
+                 major, minor, patch);
+    fireError(driverVersion,
+              "CUDA driver version too low for PC sampling (need >= 12.8.1)",
+              "pc_sampling");
+    return false;
+  }
+
+  // Check CUPTI API/driver version compatibility.
+  // CUPTI 12.4 (v22) added correlationId to CUpti_PCSamplingPCData, breaking
+  // ABI. Mixing compile-time and runtime versions across this boundary crashes.
+  uint32_t cuptiVersion = 0;
+  proton::cupti::getVersion<true>(&cuptiVersion);
+
+  if ((cuptiVersion < CUPTI_CUDA12_4_VERSION &&
+       CUPTI_API_VERSION >= CUPTI_CUDA12_4_VERSION) ||
+      (cuptiVersion >= CUPTI_CUDA12_4_VERSION &&
+       CUPTI_API_VERSION < CUPTI_CUDA12_4_VERSION)) {
+    DEBUG_PRINTF(
+        "PC sampling disabled: CUPTI API version %d and driver version %d "
+        "are incompatible across the 12.4 (v22) ABI boundary\n",
+        CUPTI_API_VERSION, cuptiVersion);
+    fireError((int32_t)cuptiVersion,
+              "CUPTI API/driver version mismatch (12.4 ABI boundary)",
+              "pc_sampling");
+    return false;
+  }
+
+  // Attempt a lightweight permission probe. CUPTI PC sampling requires
+  // either root, CAP_SYS_ADMIN, or the NVIDIA module parameter
+  // NVreg_RestrictProfilingToAdminUsers=0.
+  // We cannot easily pre-check permissions without attempting CUPTI calls,
+  // so we defer the real check to initialize() where enablePCSampling()
+  // will fail with a CUPTI error if permissions are insufficient.
+  // TODO: Add explicit permission pre-check.
+  // Reference: https://developer.nvidia.com/nvidia-development-tools-solutions-err_nvgpuctrperm-permission-issue-performance-counters
+
+  int major = driverVersion / 1000;
+  int minor = (driverVersion % 1000) / 10;
+  int patch = driverVersion % 10;
+  DEBUG_PRINTF("PC sampling supported: CUDA %d.%d.%d, CUPTI v%u (API v%d)\n",
+               major, minor, patch, cuptiVersion, CUPTI_API_VERSION);
+  return true;
+}
+
+ConfigureData *PCSampling::getConfigureData(uint32_t contextId) {
+  return &contextIdToConfigureData[contextId];
+}
+
+CubinData *PCSampling::getCubinData(uint64_t cubinCrc) {
+  return &(cubinCrcToCubinData[cubinCrc].first);
+}
+
+void PCSampling::initialize(CUcontext context) {
+  uint32_t contextId = 0;
+  proton::cupti::getContextId<true>(context, &contextId);
+
+  doubleCheckedLock(
+      [&]() {
+        return !contextInitialized.contain(contextId) &&
+               !contextFailed.contain(contextId);
+      },
+      contextMutex,
+      [&]() {
+        // enablePCSampling can fail due to insufficient permissions
+        // (ERR_NVGPUCTRPERM). Catch and degrade gracefully.
+        CUpti_PCSamplingEnableParams enableParams = {
+            /*size=*/CUpti_PCSamplingEnableParamsSize,
+            /*pPriv=*/NULL,
+            /*ctx=*/context,
+        };
+        auto result =
+            proton::cupti::pcSamplingEnable<false>(&enableParams);
+        if (result != CUPTI_SUCCESS) {
+          DEBUG_PRINTF(
+              "Failed to enable PC sampling for context %u: CUPTI error %d\n"
+              "This may be a permission issue. See:\n"
+              "https://developer.nvidia.com/nvidia-development-tools-solutions-"
+              "err_nvgpuctrperm-permission-issue-performance-counters\n",
+              contextId, result);
+          fireError((int32_t)result,
+                    "Failed to enable PC sampling (possible permission issue)",
+                    "pc_sampling");
+          contextFailed.insert(contextId);
+          return;
+        }
+
+        auto *configData = getConfigureData(contextId);
+        configData->initialize(context);
+
+        // Build contiguous stall reason map for USDT probe emission.
+        stallReasonMap.build(configData->numStallReasons,
+                             configData->stallReasonIndices,
+                             configData->stallReasonNames);
+
+        contextInitialized.insert(contextId);
+        initializedContextIds.push_back(contextId);
+        DEBUG_PRINTF(
+            "PC sampling initialized (serialized mode) for context %u\n",
+            contextId);
+      });
+}
+
+void PCSampling::start(CUcontext context) {
+  std::lock_guard<std::mutex> lock(pcSamplingMutex);
+  if (samplingActive)
+    return;
+  if (startPCSampling(context)) {
+    samplingActive = true;
+    samplingContext = context;
+    DEBUG_PRINTF("PC sampling started (kernels serialized)\n");
+  }
+}
+
+void PCSampling::stop(CUcontext context) {
+  std::lock_guard<std::mutex> lock(pcSamplingMutex);
+  if (!samplingActive)
+    return;
+  stopPCSampling(context);
+  samplingActive = false;
+  DEBUG_PRINTF("PC sampling stopped (kernels concurrent)\n");
+  // Drain data collected during this window.
+  collectData(context);
+}
+
+__attribute__((noinline)) void fireError(int32_t code, const char *message,
+                                         const char *component) {
+  PARCAGPU_ERROR(code, message, component);
+}
+
+void PCSampling::processPCSamplingData(ConfigureData *configureData) {
+  auto *pcSamplingData = &configureData->outputData;
+
+  if (pcSamplingData->totalNumPcs == 0) {
+    return;
+  }
+
+  DEBUG_PRINTF("Processing %zu PCs (remaining: %zu)\n",
+               pcSamplingData->totalNumPcs, pcSamplingData->remainingNumPcs);
+
+  if (debug_enabled) {
+    for (size_t i = 0; i < pcSamplingData->totalNumPcs; ++i) {
+      auto *pcData = pcSamplingData->pPcData + i;
+
+      uint64_t totalSamples = 0;
+      uint64_t stalledSamples = 0;
+      for (size_t j = 0; j < pcData->stallReasonCount; ++j) {
+        auto *stallReason = &pcData->stallReason[j];
+        totalSamples += stallReason->samples;
+        bool isNotIssued = configureData->notIssuedStallReasonIndices.count(
+                               stallReason->pcSamplingStallReasonIndex) > 0;
+        if (!isNotIssued)
+          stalledSamples += stallReason->samples;
+      }
+
+      auto *cubinData = getCubinData(pcData->cubinCrc);
+      auto key =
+          CubinData::LineInfoKey{pcData->functionIndex, pcData->pcOffset};
+      if (cubinData->lineInfo.find(key) == cubinData->lineInfo.end()) {
+        auto [lineNumber, fileName, dirName] =
+            getSassToSourceCorrelation(pcData->functionName, pcData->pcOffset,
+                                       cubinData->cubin, cubinData->cubinSize);
+        cubinData->lineInfo.try_emplace(key, lineNumber,
+                                        std::string(pcData->functionName),
+                                        dirName, fileName);
+      }
+      auto &lineInfo = cubinData->lineInfo[key];
+      std::string fullPath = lineInfo.fileName.size()
+                                 ? lineInfo.dirName + "/" + lineInfo.fileName
+                                 : "";
+      DEBUG_PRINTF("  [%zu] func=%s pc=0x%lx total=%lu stalled=%lu %s:%u\n", i,
+                   lineInfo.functionName.c_str(), pcData->pcOffset,
+                   totalSamples, stalledSamples, fullPath.c_str(),
+                   lineInfo.lineNumber);
+    }
+  }
+
+  // Emit batched PC sample probes as a bag of pointers (like activity_batch).
+  // Using pointers avoids depending on the CUPTI struct stride, which can
+  // change across CUDA versions.
+  const void *batchPtrs[PCSampleBatchSize];
+  uint32_t batchCount = 0;
+
+  for (size_t i = 0; i < pcSamplingData->totalNumPcs; ++i) {
+    batchPtrs[batchCount++] = &pcSamplingData->pPcData[i];
+    if (batchCount == PCSampleBatchSize) {
+      firePCSampleBatch(batchPtrs, batchCount);
+      batchCount = 0;
+    }
+  }
+  if (batchCount > 0) {
+    firePCSampleBatch(batchPtrs, batchCount);
+  }
+}
+
+void PCSampling::emitMetadata() {
+  // Re-emit stall reason map when a tracer attaches (semaphore transitions
+  // to non-zero), using the same pattern as cubin replay.
+  if (stallReasonMap.data() && PARCAGPU_STALL_REASON_MAP_ENABLED()) {
+    if (!stallMapEmitted) {
+      PARCAGPU_STALL_REASON_MAP(stallReasonMap.data(),
+                                stallReasonMap.numEntries());
+      stallMapEmitted = true;
+    }
+  } else {
+    stallMapEmitted = false;
+  }
+
+  // Replay cubin_loaded probes for late-attaching tracers.
+  // When the semaphore transitions to non-zero, re-emit all known cubins.
+  if (PARCAGPU_CUBIN_LOADED_ENABLED()) {
+    if (!cubinsEmitted) {
+      DEBUG_PRINTF("Emitting cubins");
+      std::lock_guard<std::mutex> lock(contextMutex);
+      for (const auto &ref : loadedCubins) {
+        fireCubinLoaded(ref.crc, ref.data, ref.size);
+      }
+      cubinsEmitted = true;
+    }
+  } else {
+    // Tracer detached — reset so we replay again on next attach.
+    cubinsEmitted = false;
+  }
+}
+
+void PCSampling::collectData(CUcontext context) {
+  uint32_t contextId = 0;
+  proton::cupti::getContextId<true>(context, &contextId);
+
+  if (!contextInitialized.contain(contextId)) {
+    DEBUG_PRINTF("Context %u not initialized, skipping data collection\n",
+                 contextId);
+    return;
+  }
+
+  auto *configureData = getConfigureData(contextId);
+  DEBUG_PRINTF("Collecting PC sampling data for context %u (cfg total=%zu "
+               "remaining=%zu)\n",
+               contextId, configureData->pcSamplingData.totalNumPcs,
+               configureData->pcSamplingData.remainingNumPcs);
+
+  // Use the separate output buffer for getData — the configured
+  // pcSamplingData buffer is owned by CUPTI.
+  bool ok = getPCSamplingData(context, &configureData->outputData);
+  DEBUG_PRINTF("getData: ok=%d output total=%zu remaining=%zu "
+               "cfg total=%zu remaining=%zu\n",
+               ok, configureData->outputData.totalNumPcs,
+               configureData->outputData.remainingNumPcs,
+               configureData->pcSamplingData.totalNumPcs,
+               configureData->pcSamplingData.remainingNumPcs);
+  processPCSamplingData(configureData);
+}
+
+void PCSampling::collectAllData() {
+  std::lock_guard<std::mutex> lock(contextMutex);
+  for (auto contextId : initializedContextIds) {
+    auto result = contextIdToConfigureData.find(contextId);
+    if (!result) {
+      DEBUG_PRINTF("Context %u in initializedContextIds but not in map, "
+                   "skipping\n",
+                   contextId);
+      continue;
+    }
+    auto *configureData = &result->get();
+    DEBUG_PRINTF("Draining PC sampling data for context %u\n", contextId);
+    processPCSamplingData(configureData);
+  }
+}
+
+void PCSampling::finalize(CUcontext context) {
+  uint32_t contextId = 0;
+  proton::cupti::getContextId<true>(context, &contextId);
+
+  if (!contextInitialized.contain(contextId)) {
+    // Clean up failed context tracking if applicable.
+    contextFailed.erase(contextId);
+    return;
+  }
+
+  // Hold contextMutex for the entire finalize to prevent collectAllData
+  // from racing with us (it iterates initializedContextIds under this lock).
+  std::lock_guard<std::mutex> lock(contextMutex);
+
+  DEBUG_PRINTF("Finalizing PC sampling for context %p\n", context);
+
+  // Remove from iteration list first so collectAllData won't touch this context
+  initializedContextIds.erase(std::remove(initializedContextIds.begin(),
+                                          initializedContextIds.end(),
+                                          contextId),
+                              initializedContextIds.end());
+
+  // Stop sampling if it was started on this context.
+  {
+    std::lock_guard<std::mutex> lock2(pcSamplingMutex);
+    if (samplingActive) {
+      stopPCSampling(context);
+      samplingActive = false;
+    }
+  }
+
+  // Drain remaining data before disabling
+  auto *configureData = getConfigureData(contextId);
+  processPCSamplingData(configureData);
+
+  // After disable, CUPTI may fill remaining records — drain once more
+  if (configureData->pcSamplingData.totalNumPcs > 0) {
+    processPCSamplingData(configureData);
+  }
+
+  contextIdToConfigureData.erase(contextId);
+  contextInitialized.erase(contextId);
+}
+
+void PCSampling::loadModule(const char *cubin, size_t cubinSize) {
+  auto cubinCrc = getCubinCrc(cubin, cubinSize);
+
+  if (cubinCrcToCubinData.contain(cubinCrc)) {
+    // Increment reference count
+    cubinCrcToCubinData[cubinCrc].second++;
+    DEBUG_PRINTF("Module 0x%lx loaded (refcount=%zu)\n", cubinCrc,
+                 cubinCrcToCubinData[cubinCrc].second);
+  } else {
+    // New module — getCubinData after the contain() check so operator[]
+    // doesn't auto-insert before we test.
+    auto *cubinData = getCubinData(cubinCrc);
+    cubinData->cubinCrc = cubinCrc;
+    cubinData->cubinSize = cubinSize;
+    cubinData->cubin = cubin;
+    cubinCrcToCubinData[cubinCrc].second = 1;
+    DEBUG_PRINTF("Module 0x%lx loaded (new)\n", cubinCrc);
+    fireCubinLoaded(cubinCrc, cubin, cubinSize);
+    {
+      std::lock_guard<std::mutex> lock(contextMutex);
+      loadedCubins.push_back({cubinCrc, cubin, cubinSize});
+    }
+  }
+}
+
+void PCSampling::unloadModule(const char *cubin, size_t cubinSize) {
+  auto cubinCrc = getCubinCrc(cubin, cubinSize);
+
+  if (!cubinCrcToCubinData.contain(cubinCrc))
+    return;
+
+  auto count = cubinCrcToCubinData[cubinCrc].second;
+  if (count > 1) {
+    cubinCrcToCubinData[cubinCrc].second = count - 1;
+    DEBUG_PRINTF("Module 0x%lx unloaded (refcount=%zu)\n", cubinCrc, count - 1);
+  } else {
+    cubinCrcToCubinData.erase(cubinCrc);
+    DEBUG_PRINTF("Module 0x%lx unloaded (removed)\n", cubinCrc);
+    fireCubinUnloaded(cubinCrc);
+    {
+      std::lock_guard<std::mutex> lock(contextMutex);
+      loadedCubins.erase(std::remove_if(loadedCubins.begin(),
+                                        loadedCubins.end(),
+                                        [cubinCrc](const CubinRef &r) {
+                                          return r.crc == cubinCrc;
+                                        }),
+                         loadedCubins.end());
+    }
+  }
+}
+
+} // namespace parcagpu
diff --git a/src/pc_sampling.h b/src/pc_sampling.h
new file mode 100644
index 0000000..0bcd56d
--- /dev/null
+++ b/src/pc_sampling.h
@@ -0,0 +1,171 @@
+#ifndef PC_SAMPLING_H_
+#define PC_SAMPLING_H_
+
+#include <atomic>
+#include <map>
+#include <memory>
+#include <mutex>
+#include <set>
+#include <string>
+#include <vector>
+
+#include "stall_reason_map.h"
+
+#include <cuda.h>
+#include <cupti.h>
+
+#include "Driver/GPU/CuptiApi.h"
+#include "Profiler/Cupti/CuptiPCSampling.h"
+#include "Utility/Map.h"
+#include "Utility/Set.h"
+
+#define DEBUG_PRINTF(...)                                                      \
+  do {                                                                         \
+    parcagpu::init_debug();                                                    \
+    if (parcagpu::debug_enabled) {                                             \
+      struct timespec ts;                                                      \
+      clock_gettime(CLOCK_REALTIME, &ts);                                      \
+      fprintf(stderr, "[%ld.%09ld] ", ts.tv_sec, ts.tv_nsec);                  \
+      fprintf(stderr, __VA_ARGS__);                                            \
+    }                                                                          \
+  } while (0)
+
+namespace parcagpu {
+
+// Debug logging control — defined in cupti.cpp.
+extern bool debug_enabled;
+extern void init_debug();
+
+// Use Proton's CubinData directly
+using proton::CubinData;
+
+// ConfigureData for PARCAGPU (based on Proton's but standalone)
+// We don't inherit to avoid linking Proton's profiler dependencies
+struct ConfigureData {
+  ConfigureData() = default;
+
+  ~ConfigureData() {
+    if (stallReasonNames) {
+      for (size_t i = 0; i < numStallReasons; i++) {
+        if (stallReasonNames[i])
+          std::free(stallReasonNames[i]);
+      }
+      std::free(stallReasonNames);
+    }
+    if (stallReasonIndices)
+      std::free(stallReasonIndices);
+    if (pcSamplingData.pPcData) {
+      for (size_t i = 0; i < numValidStallReasons; ++i) {
+        std::free(pcSamplingData.pPcData[i].stallReason);
+      }
+      std::free(pcSamplingData.pPcData);
+    }
+  }
+
+  void initialize(CUcontext context);
+
+  CUpti_PCSamplingConfigurationInfo configureStallReasons();
+  CUpti_PCSamplingConfigurationInfo configureSamplingPeriod();
+  CUpti_PCSamplingConfigurationInfo configureSamplingBuffer();
+  CUpti_PCSamplingConfigurationInfo configureScratchBuffer();
+  CUpti_PCSamplingConfigurationInfo configureHardwareBufferSize();
+  CUpti_PCSamplingConfigurationInfo configureCollectionMode();
+  CUpti_PCSamplingConfigurationInfo configureStartStopControl();
+
+  // Buffer size constants (from Proton)
+  static constexpr size_t HardwareBufferSize = 128 * 1024 * 1024;
+  static constexpr size_t ScratchBufferSize = 16 * 1024 * 1024;
+  static constexpr size_t DataBufferPCCount = 1024;
+
+  CUcontext context{};
+  uint32_t contextId;
+  uint32_t numStallReasons{};
+  uint32_t numValidStallReasons{};
+  char **stallReasonNames{};
+  uint32_t *stallReasonIndices{};
+  std::map<size_t, size_t> stallReasonIndexToMetricIndex{};
+  std::set<size_t> notIssuedStallReasonIndices{};
+  CUpti_PCSamplingData pcSamplingData{};       // registered with CUPTI config
+  CUpti_PCSamplingData outputData{};           // separate buffer for getData calls
+  std::vector<CUpti_PCSamplingConfigurationInfo> configurationInfos;
+};
+
+// PC Sampling class (adapted from Proton's CuptiPCSampling)
+// Owned by CuptiProfiler — not a standalone singleton, so lifetime
+// is tied to the profiler and there are no static destruction order issues.
+class PCSampling {
+public:
+  PCSampling() = default;
+  ~PCSampling() = default;
+
+  // Check if PC sampling is supported (CUPTI >= 12.8.1).
+  // Enabled by default; set PARCAGPU_SAMPLING_FACTOR=0 to disable.
+  static bool isSupported();
+
+  void initialize(CUcontext context);
+
+  // Start PC sampling — kernels become serialized until stop().
+  // No-op if already started. Thread-safe.
+  void start(CUcontext context);
+
+  // Stop PC sampling, drain accumulated data, and emit probes.
+  // Kernels resume concurrent execution. No-op if not started.
+  void stop(CUcontext context);
+
+  // Emit stall reason map and replay cubin probes for late-attaching tracers.
+  // Call periodically regardless of sampling state.
+  void emitMetadata();
+
+  void collectData(CUcontext context);
+  void collectAllData();
+  void finalize(CUcontext context);
+  void loadModule(const char *cubin, size_t cubinSize);
+  void unloadModule(const char *cubin, size_t cubinSize);
+
+private:
+  ConfigureData *getConfigureData(uint32_t contextId);
+  CubinData *getCubinData(uint64_t cubinCrc);
+  void processPCSamplingData(ConfigureData *configureData);
+
+  proton::ThreadSafeMap<uint32_t, ConfigureData> contextIdToConfigureData;
+  proton::ThreadSafeMap<size_t, std::pair<CubinData, size_t>>
+      cubinCrcToCubinData;
+  proton::ThreadSafeSet<uint32_t> contextInitialized;
+  proton::ThreadSafeSet<uint32_t> contextFailed; // contexts where enable failed
+
+  // Plain vector of initialized context IDs for iteration in collectAllData.
+  // Protected by contextMutex.
+  std::vector<uint32_t> initializedContextIds;
+
+  // Tracks whether CUPTI PC sampling is currently active (start/stop).
+  // Only one context can be sampling at a time in KERNEL_SERIALIZED mode.
+  std::atomic<bool> samplingActive{false};
+  CUcontext samplingContext{};
+  std::mutex pcSamplingMutex{};
+  std::mutex contextMutex{};
+
+  // Contiguous stall reason map for USDT probe emission.
+  StallReasonMap stallReasonMap;
+
+  // Lightweight cubin metadata for replaying cubin_loaded probes to
+  // late-attaching tracers. Protected by contextMutex.
+  struct CubinRef {
+    uint64_t crc;
+    const char *data;
+    size_t size;
+  };
+  std::vector<CubinRef> loadedCubins;
+
+  // Tracks whether we've replayed cubin_loaded probes for a late-attaching
+  // tracer. Reset to false when the cubin_loaded semaphore transitions to
+  // non-zero.
+  bool cubinsEmitted = false;
+  bool stallMapEmitted = false;
+};
+
+// Fire the error USDT probe. Callable from any translation unit.
+void fireError(int32_t code, const char *message, const char *component);
+
+} // namespace parcagpu
+
+#endif // PARCAGPU_PC_SAMPLING_H_
diff --git a/src/probes.d b/src/probes.d
new file mode 100644
index 0000000..dba3db4
--- /dev/null
+++ b/src/probes.d
@@ -0,0 +1,14 @@
+provider parcagpu {
+  probe cuda_correlation(uint32_t correlationId, int signedCbid,
+                          const char *name);
+  probe kernel_executed(uint64_t start, uint64_t end,
+                        uint32_t correlationId, uint32_t deviceId,
+                        uint32_t streamId, uint32_t graphId,
+                        uint64_t graphNodeId, const char *name);
+  probe activity_batch(const void **ptrs, uint32_t count);
+  probe pc_sample_batch(const void **records, uint32_t count);
+  probe stall_reason_map(const char *names, uint32_t count);
+  probe cubin_loaded(uint64_t cubinCrc, const char *cubin, uint64_t cubinSize);
+  probe cubin_unloaded(uint64_t cubinCrc);
+  probe error(int32_t code, const char *message, const char *component);
+};
diff --git a/src/stall_reason_map.h b/src/stall_reason_map.h
new file mode 100644
index 0000000..ca64ed9
--- /dev/null
+++ b/src/stall_reason_map.h
@@ -0,0 +1,56 @@
+#ifndef PARCAGPU_STALL_REASON_MAP_H_
+#define PARCAGPU_STALL_REASON_MAP_H_
+
+#include <cstdint>
+#include <cstdlib>
+#include <cstring>
+
+namespace parcagpu {
+
+// Contiguous, BPF-friendly stall reason name table.
+// Array of fixed-width 64-byte name slots indexed directly by stall reason
+// index. BPF reads names[stallReasonIndex * 64] — no pointer chasing.
+static constexpr uint32_t STALL_REASON_NAME_LEN = 64;
+
+class StallReasonMap {
+public:
+  StallReasonMap() = default;
+  ~StallReasonMap() { std::free(buf); }
+
+  StallReasonMap(const StallReasonMap &) = delete;
+  StallReasonMap &operator=(const StallReasonMap &) = delete;
+
+  // Build from parallel arrays (as returned by CUPTI).
+  // Indices must be dense 0..N-1.
+  void build(uint32_t numReasons, const uint32_t *indices, char **names) {
+    // Find max index to size the array.
+    uint32_t maxIdx = 0;
+    for (uint32_t i = 0; i < numReasons; i++) {
+      if (indices[i] > maxIdx)
+        maxIdx = indices[i];
+    }
+    count = maxIdx + 1;
+
+    std::free(buf);
+    bufSize = count * STALL_REASON_NAME_LEN;
+    buf = static_cast<char *>(std::calloc(1, bufSize));
+
+    for (uint32_t i = 0; i < numReasons; i++) {
+      char *slot = buf + indices[i] * STALL_REASON_NAME_LEN;
+      strncpy(slot, names[i], STALL_REASON_NAME_LEN - 1);
+    }
+  }
+
+  const char *data() const { return buf; }
+  uint32_t size() const { return bufSize; }
+  uint32_t numEntries() const { return count; }
+
+private:
+  char *buf = nullptr;
+  uint32_t bufSize = 0;
+  uint32_t count = 0;
+};
+
+} // namespace parcagpu
+
+#endif // PARCAGPU_STALL_REASON_MAP_H_
diff --git a/src/token_bucket.h b/src/token_bucket.h
new file mode 100644
index 0000000..88d6d7c
--- /dev/null
+++ b/src/token_bucket.h
@@ -0,0 +1,59 @@
+#ifndef PARCAGPU_TOKEN_BUCKET_H_
+#define PARCAGPU_TOKEN_BUCKET_H_
+
+#include <cstdint>
+#include <time.h>
+
+namespace parcagpu {
+
+// Simple token bucket rate limiter. Not thread-safe — use one instance per
+// thread, or a thread_local instance when shared across call sites.
+class TokenBucket {
+public:
+  // startFull=true: first tryAcquire succeeds immediately.
+  // startFull=false: must wait for refill before first success.
+  explicit TokenBucket(double tokensPerSec, bool startFull = true)
+      : rate(tokensPerSec), tokens(startFull ? 1.0 : 0.0) {}
+
+  void setRate(double tokensPerSec) {
+    rate = tokensPerSec;
+    if (tokens > rate)
+      tokens = rate;
+  }
+
+  // Returns true if a token was available and consumed.
+  bool tryAcquire() {
+    refill();
+    if (tokens >= 1.0) {
+      tokens -= 1.0;
+      return true;
+    }
+    return false;
+  }
+
+private:
+  void refill() {
+    uint64_t now = nowNs();
+    if (lastRefillNs > 0) {
+      double elapsed = (now - lastRefillNs) / 1e9;
+      tokens += elapsed * rate;
+      if (tokens > rate)
+        tokens = rate;
+    }
+    lastRefillNs = now;
+  }
+
+  static uint64_t nowNs() {
+    struct timespec ts;
+    clock_gettime(CLOCK_MONOTONIC, &ts);
+    return (uint64_t)ts.tv_sec * 1000000000ULL + ts.tv_nsec;
+  }
+
+  double rate;
+  double tokens;
+  uint64_t lastRefillNs = 0;
+};
+
+} // namespace parcagpu
+
+#endif // PARCAGPU_TOKEN_BUCKET_H_
diff --git a/test.sh b/test.sh
index 56f4e5a..d486cf7 100755
--- a/test.sh
+++ b/test.sh
@@ -6,12 +6,7 @@ cd "$(dirname "$0")"
 
 # Parse arguments
 USE_BPFTRACE=0
-# Auto-detect architecture
-case "$(uname -m)" in
-    aarch64|arm64) DEFAULT_ARCH="arm64" ;;
-    *)             DEFAULT_ARCH="amd64" ;;
-esac
-ARCH="${ARCH:-$DEFAULT_ARCH}"
+ARCH="${ARCH:-amd64}"
 CUDA_MAJOR="${CUDA_MAJOR:-12}"
 for arg in "$@"; do
     case $arg in
@@ -27,8 +22,11 @@ done
 
 echo ""
 echo "=== Building test infrastructure with CMake ==="
-mkdir -p test/build
-cd test/build && cmake .. -DCUDA_MAJOR_VERSION="${CUDA_MAJOR}" && make && cd ../..
+mkdir -p build-local
+cd build-local
+cmake ..
+make -j$(nproc)
+cd ..
 
 # Start bpftrace if requested
 if [ "$USE_BPFTRACE" -eq 1 ]; then
@@ -53,12 +51,12 @@ fi
 
 echo ""
 echo "=== Running test program ==="
-# Set LD_LIBRARY_PATH so the test can find libcupti.so at runtime
-# Set PARCAGPU_DEBUG externally to enable debug output
-# Set PARCAGPU_RATE_LIMIT externally to override default (100/s)
-export LD_LIBRARY_PATH="$(pwd)/test/build:$LD_LIBRARY_PATH"
-# Use the CMake-built library with mock CUPTI
-test/build/test_cupti_prof "build/${CUDA_MAJOR}/${ARCH}/libparcagpucupti.so" --kernel-names=kernel_names.txt "$@"
+# Set LD_LIBRARY_PATH so the test can find libcupti.so and libparcagpucupti.so at runtime
+# Set PARCAGPU_DEBUG to enable debug output
+export LD_LIBRARY_PATH="$(pwd)/build-local/lib:$LD_LIBRARY_PATH"
+export PARCAGPU_DEBUG=1
+# Run the test program with path to library
+./build-local/bin/test_cupti_prof build-local/lib/libparcagpucupti.so "$@"
 
 # If bpftrace was started, stop it and show results
 if [ "$USE_BPFTRACE" -eq 1 ]; then
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index cf805ef..8a0a46b 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -20,6 +20,15 @@ set_target_properties(cupti PROPERTIES
     POSITION_INDEPENDENT_CODE ON
 )
 
+# Mock CUDA driver library
+add_library(cuda SHARED mock_cuda.c)
+target_include_directories(cuda PRIVATE ${CUDA_INCLUDE_DIRS})
+set_target_properties(cuda PROPERTIES
+    C_STANDARD 11
+    C_STANDARD_REQUIRED ON
+    POSITION_INDEPENDENT_CODE ON
+)
+
 # CUDA major version - prefer explicitly passed value, fall back to detection
 if(NOT DEFINED CUDA_MAJOR_VERSION)
     execute_process(
diff --git a/test/README.md b/test/README.md
index 4cd932a..ff74c58 100644
--- a/test/README.md
+++ b/test/README.md
@@ -1,19 +1,19 @@
 # CUPTI Profiler Test Infrastructure
 
-This directory contains test infrastructure for `libparcagpucupti.so` using Zig as the build system.
+This directory contains test infrastructure for `libparcagpucupti.so` using CMake as the build system.
 
 ## Components
 
-- **test/mock_cupti.c**: Mock CUPTI library that provides stub implementations of all CUPTI APIs used by cupti-prof.c
+- **test/mock_cupti.c**: Mock CUPTI library that provides stub implementations of all CUPTI APIs used by the profiler
 - **test/test_cupti_prof.c**: Test program that dynamically loads libparcagpucupti.so and simulates CUPTI callbacks
-- **build.zig**: Zig build configuration (at project root)
+- **CMakeLists.txt**: CMake build configuration (at project root)
 - **test.sh**: Test script (at project root)
 
 ## Building
 
 From the project root:
 ```bash
-zig build
+make
 ```
 
 This builds:
@@ -21,32 +21,31 @@ This builds:
 2. `libparcagpucupti.so` - The profiler library linked against the mock CUPTI
 3. `test_cupti_prof` - Test executable that loads and exercises the profiler
 
-All outputs go to `zig-out/lib/` and `zig-out/bin/`.
+All outputs go to `build/lib/` and `build/bin/`.
 
 ## Running
 
 Using the test script (recommended):
 ```bash
-cd /home/tpr/src/gpu/parcagpu
 ./test.sh
 ```
 
-Using Zig directly:
+Using Make directly:
 ```bash
-zig build run
+make test
 ```
 
 Or manually:
 ```bash
-zig build
-LD_LIBRARY_PATH=zig-out/lib zig-out/bin/test_cupti_prof zig-out/lib/libparcagpucupti.so
+make
+LD_LIBRARY_PATH=build/lib build/bin/test_cupti_prof build/lib/libparcagpucupti.so
 ```
 
 ### Running Continuously
 
 To run the test in continuous mode (useful for monitoring probes with bpftrace):
 ```bash
-LD_LIBRARY_PATH=zig-out/lib zig-out/bin/test_cupti_prof zig-out/lib/libparcagpucupti.so --forever
+LD_LIBRARY_PATH=build/lib build/bin/test_cupti_prof build/lib/libparcagpucupti.so --forever
 ```
 
 In this mode, the test will:
@@ -83,7 +82,7 @@ The test script automatically enables `PARCAGPU_DEBUG=1` to show detailed debug
 
 To run without debug output:
 ```bash
-LD_LIBRARY_PATH=zig-out/lib zig-out/bin/test_cupti_prof zig-out/lib/libparcagpucupti.so
+LD_LIBRARY_PATH=build/lib build/bin/test_cupti_prof build/lib/libparcagpucupti.so
 ```
 
 ## Verifying DTRACE Probes
@@ -92,13 +91,11 @@ To verify that the DTRACE/USDT probes are firing correctly, use the provided bpf
 
 **Terminal 1** - Run bpftrace to monitor probes:
 ```bash
-cd /home/tpr/src/gpu/parcagpu
 sudo bpftrace parcagpu.bt
 ```
 
 **Terminal 2** - Run the test:
 ```bash
-cd /home/tpr/src/gpu/parcagpu
 ./test.sh
 ```
 
diff --git a/test/bpf/activity_parser.bpf.c b/test/bpf/activity_parser.bpf.c
index 33aa754..cb8c495 100644
--- a/test/bpf/activity_parser.bpf.c
+++ b/test/bpf/activity_parser.bpf.c
@@ -22,23 +22,25 @@
 
 #include "usdt_args.h"
 
-#include "cupti_activity_bpf.h"
+#include "cupti_bpf.h"
 
 #define MAX_BATCH_SIZE 128
 #define MAX_KERNEL_NAME 128
+#define MAX_CUBIN_SIZE (64 * 1024 * 1024) // 64MB safety cap
 
 // USDT spec map — populated by Go loader before uprobe attachment.
 // Keyed by spec ID (uint32); value is struct bpf_usdt_spec.
-// Old-style SEC("maps") definition to match the extern in usdt_args.h.
-struct bpf_map_def __bpf_usdt_specs SEC("maps") = {
-    .type = BPF_MAP_TYPE_HASH,
-    .key_size = sizeof(u32),
-    .value_size = sizeof(struct bpf_usdt_spec),
-    .max_entries = 256,
-};
+struct usdt_specs_t {
+    __uint(type, BPF_MAP_TYPE_HASH);
+    __type(key, u32);
+    __type(value, struct bpf_usdt_spec);
+    __uint(max_entries, 256);
+} __bpf_usdt_specs SEC(".maps");
 
 // Event sent to user-space for each kernel activity found.
 struct kernel_event {
+  u32 event_type; // EVENT_TYPE_KERNEL
+  u32 _pad;
   u64 start;
   u64 end;
   u32 correlation_id;
@@ -49,20 +51,75 @@ struct kernel_event {
   char name[MAX_KERNEL_NAME];
 };
 
-// Ring buffer for sending events to user-space.
-struct bpf_map_def events SEC("maps") = {
-    .type = BPF_MAP_TYPE_RINGBUF,
-    .max_entries = 1 << 20, // 1 MB
+// Cubin load/unload events — Go reads actual bytes via /proc/pid/mem.
+struct cubin_event {
+  u32 event_type; // EVENT_TYPE_CUBIN_LOADED or EVENT_TYPE_CUBIN_UNLOADED
+  u32 _pad;
+  u64 cubin_crc;
+  u64 cubin_ptr; // user-space address (for /proc/pid/mem read)
+  u64 cubin_size;
 };
 
-// Stats counters.
-struct bpf_map_def stats SEC("maps") = {
-    .type = BPF_MAP_TYPE_ARRAY,
-    .key_size = sizeof(u32),
-    .value_size = sizeof(u64),
-    .max_entries = 4,
+// Event type tags for the ring buffer.
+#define EVENT_TYPE_KERNEL 1
+#define EVENT_TYPE_CUBIN_LOADED 2
+#define EVENT_TYPE_CUBIN_UNLOADED 3
+#define EVENT_TYPE_PC_SAMPLE 4
+#define EVENT_TYPE_ERROR 5
+
+// PC sample event sent to user-space.
+struct pc_sample_event {
+  u32 event_type; // EVENT_TYPE_PC_SAMPLE
+  u32 stall_reason_count;
+  u64 cubin_crc;
+  u64 pc_offset;
+  u32 function_index;
+  u32 correlation_id; // kernel correlation ID (CUDA 12.4+ / CUPTI v22+, else 0)
+  char function_name[MAX_FUNC_NAME];
+  struct cupti_stall_reason stall_reasons[MAX_STALL_REASONS];
 };
 
+// Error event sent to user-space.
+#define MAX_ERROR_MSG 256
+#define MAX_ERROR_COMPONENT 64
+struct error_event {
+  u32 event_type; // EVENT_TYPE_ERROR
+  s32 error_code;
+  char message[MAX_ERROR_MSG];
+  char component[MAX_ERROR_COMPONENT];
+};
+
+// Ring buffer for sending events to user-space.
+struct {
+    __uint(type, BPF_MAP_TYPE_RINGBUF);
+    __uint(max_entries, 1 << 20); // 1 MB
+} events SEC(".maps");
+
+// Stall reason name table — indexed by stall reason index.
+// Value is a 64-byte null-terminated string.
+struct {
+    __uint(type, BPF_MAP_TYPE_ARRAY);
+    __type(key, u32);
+    __type(value, char[STALL_REASON_NAME_LEN]);
+    __uint(max_entries, MAX_STALL_REASONS);
+} stall_reasons SEC(".maps");
+
+// Whether the stall reason map has been populated.
+struct {
+    __uint(type, BPF_MAP_TYPE_ARRAY);
+    __type(key, u32);
+    __type(value, u32);
+    __uint(max_entries, 1);
+} stall_map_loaded SEC(".maps");
+
+// Stats counters.
+struct {
+    __uint(type, BPF_MAP_TYPE_ARRAY);
+    __type(key, u32);
+    __type(value, u64);
+    __uint(max_entries, 4);
+} stats SEC(".maps");
+
 enum stat_key {
   STAT_BATCHES = 0,    // number of batch probe invocations
   STAT_ACTIVITIES = 1, // total activity records scanned
@@ -126,6 +183,7 @@ int BPF_USDT(handle_activity_batch, u64 ptrs_base, u32 num_activities) {
       continue;
     }
 
+    evt->event_type = EVENT_TYPE_KERNEL;
     evt->start = record.start;
     evt->end = record.end;
     evt->correlation_id = record.correlation_id;
@@ -148,4 +206,166 @@ int BPF_USDT(handle_activity_batch, u64 ptrs_base, u32 num_activities) {
   return 0;
 }
 
+SEC("usdt/parcagpu/stall_reason_map")
+int BPF_USDT(handle_stall_reason_map, u64 names_base, u32 count) {
+  // Only load the map once.
+  u32 zero = 0;
+  u32 *loaded = bpf_map_lookup_elem(&stall_map_loaded, &zero);
+  if (!loaded)
+    return 0;
+  if (*loaded)
+    return 0;
+
+  if (count > MAX_STALL_REASONS)
+    count = MAX_STALL_REASONS;
+
+  // Read each 64-byte name slot and store in the BPF map.
+  for (u32 i = 0; i < MAX_STALL_REASONS; i++) {
+    if (i >= count)
+      break;
+
+    char name[STALL_REASON_NAME_LEN] = {};
+    int ret = bpf_probe_read_user(
+        name, sizeof(name),
+        (void *)(names_base + (u64)i * STALL_REASON_NAME_LEN));
+    if (ret != 0)
+      continue;
+
+    bpf_map_update_elem(&stall_reasons, &i, name, BPF_ANY);
+  }
+
+  u32 one = 1;
+  bpf_map_update_elem(&stall_map_loaded, &zero, &one, BPF_ANY);
+  return 0;
+}
+
+SEC("usdt/parcagpu/cubin_loaded")
+int BPF_USDT(handle_cubin_loaded, u64 cubin_crc, u64 cubin_ptr,
+             u64 cubin_size) {
+  if (cubin_size == 0 || cubin_size > MAX_CUBIN_SIZE)
+    return 0;
+
+  struct cubin_event *evt = bpf_ringbuf_reserve(&events, sizeof(*evt), 0);
+  if (!evt) {
+    bump_stat(STAT_DROPS);
+    return 0;
+  }
+
+  evt->event_type = EVENT_TYPE_CUBIN_LOADED;
+  evt->cubin_crc = cubin_crc;
+  evt->cubin_ptr = cubin_ptr;
+  evt->cubin_size = cubin_size;
+  bpf_ringbuf_submit(evt, 0);
+  return 0;
+}
+
+SEC("usdt/parcagpu/cubin_unloaded")
+int BPF_USDT(handle_cubin_unloaded, u64 cubin_crc) {
+  struct cubin_event *evt = bpf_ringbuf_reserve(&events, sizeof(*evt), 0);
+  if (!evt) {
+    bump_stat(STAT_DROPS);
+    return 0;
+  }
+
+  evt->event_type = EVENT_TYPE_CUBIN_UNLOADED;
+  evt->cubin_crc = cubin_crc;
+  evt->cubin_ptr = 0;
+  evt->cubin_size = 0;
+  bpf_ringbuf_submit(evt, 0);
+  return 0;
+}
+
+SEC("usdt/parcagpu/pc_sample_batch")
+int BPF_USDT(handle_pc_sample_batch, u64 ptrs_base, u32 count) {
+  if (count > MAX_PC_BATCH_SIZE)
+    count = MAX_PC_BATCH_SIZE;
+
+  for (u32 i = 0; i < MAX_PC_BATCH_SIZE; i++) {
+    if (i >= count)
+      break;
+
+    // Read the i-th pointer from the array.
+    u64 rec_ptr = 0;
+    int ret = bpf_probe_read_user(&rec_ptr, sizeof(rec_ptr),
+                                  (void *)(ptrs_base + (u64)i * sizeof(u64)));
+    if (ret != 0 || rec_ptr == 0)
+      continue;
+
+    // Chase the pointer to read the CUPTI PC data record.
+    struct cupti_pc_data rec = {};
+    ret = bpf_probe_read_user(&rec, sizeof(rec), (void *)rec_ptr);
+    if (ret != 0)
+      continue;
+
+    // Reserve ring buffer space for the event.
+    struct pc_sample_event *evt =
+        bpf_ringbuf_reserve(&events, sizeof(*evt), 0);
+    if (!evt) {
+      bump_stat(STAT_DROPS);
+      continue;
+    }
+
+    evt->event_type = EVENT_TYPE_PC_SAMPLE;
+    evt->cubin_crc = rec.cubin_crc;
+    evt->pc_offset = rec.pc_offset;
+    evt->function_index = rec.function_index;
+
+    // Read correlationId if the struct is large enough (CUDA 12.4+).
+    // It sits right after the stallReason pointer at offset 56.
+    evt->correlation_id = 0;
+    if (rec.size > CUPTI_PC_DATA_BASE_SIZE) {
+      u32 corr = 0;
+      bpf_probe_read_user(&corr, sizeof(corr),
+                          (void *)(rec_ptr + CUPTI_PC_DATA_BASE_SIZE));
+      evt->correlation_id = corr;
+    }
+
+    // Chase the function name pointer.
+    if (rec.function_name_ptr) {
+      bpf_probe_read_user_str(evt->function_name, sizeof(evt->function_name),
+                              (void *)rec.function_name_ptr);
+    } else {
+      evt->function_name[0] = '\0';
+    }
+
+    // Chase the stall reason pointer.
+    u32 sr_count = rec.stall_reason_count;
+    if (sr_count > MAX_STALL_REASONS)
+      sr_count = MAX_STALL_REASONS;
+    evt->stall_reason_count = sr_count;
+
+    if (rec.stall_reason_ptr && sr_count > 0) {
+      bpf_probe_read_user(evt->stall_reasons,
+                          sr_count * sizeof(struct cupti_stall_reason),
+                          (void *)rec.stall_reason_ptr);
+    }
+
+    bpf_ringbuf_submit(evt, 0);
+  }
+
+  return 0;
+}
+
+SEC("usdt/parcagpu/error")
+int BPF_USDT(handle_error, s32 code, u64 message_ptr, u64 component_ptr) {
+  struct error_event *evt = bpf_ringbuf_reserve(&events, sizeof(*evt), 0);
+  if (!evt) {
+    bump_stat(STAT_DROPS);
+    return 0;
+  }
+
+  evt->event_type = EVENT_TYPE_ERROR;
+  evt->error_code = code;
+  evt->message[0] = '\0';
+  evt->component[0] = '\0';
+  if (message_ptr)
+    bpf_probe_read_user_str(evt->message, sizeof(evt->message),
+                            (void *)message_ptr);
+  if (component_ptr)
+    bpf_probe_read_user_str(evt->component, sizeof(evt->component),
+                            (void *)component_ptr);
+  bpf_ringbuf_submit(evt, 0);
+  return 0;
+}
+
 char LICENSE[] SEC("license") = "GPL";
diff --git a/test/bpf/cupti_activity_bpf.h b/test/bpf/cupti_activity_bpf.h
deleted file mode 100644
index 38086c3..0000000
--- a/test/bpf/cupti_activity_bpf.h
+++ /dev/null
@@ -1,51 +0,0 @@
-// Minimal CUPTI activity buffer definitions for BPF programs.
-// Stripped-down version of CUpti_ActivityKernel5 with exact layout
-// matching the CUDA 12.x CUPTI headers (sizeof = 160, packed aligned 8).
-//
-// Only the fields needed by BPF are named; the rest are padding.
-// Field offsets verified against the real struct with offsetof().
-
-#ifndef CUPTI_ACTIVITY_BPF_H
-#define CUPTI_ACTIVITY_BPF_H
-
-// CUpti_ActivityKind values we care about
-#define CUPTI_ACTIVITY_KIND_KERNEL 3
-#define CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL 10
-
-// Matches the layout of CUpti_ActivityKernel5 exactly.
-// PACKED_ALIGNMENT in CUPTI is: __attribute__((__packed__)) __attribute__((aligned(8)))
-struct cupti_activity_kernel5 {
-  u32 kind;            // offset 0   - CUpti_ActivityKind
-  u8 _pad1[12];        // offset 4   - cacheConfig, sharedMemConfig,
-                        //              registersPerThread,
-                        //              partitionedGlobalCache x2
-  u64 start;            // offset 16  - kernel start timestamp (ns)
-  u64 end;              // offset 24  - kernel end timestamp (ns)
-  u64 completed;        // offset 32  - completion timestamp
-  u32 device_id;        // offset 40
-  u32 context_id;       // offset 44
-  u32 stream_id;        // offset 48
-  u8 _pad2[40];         // offset 52  - gridX/Y/Z, blockX/Y/Z,
-                         //              staticSharedMemory,
-                         //              dynamicSharedMemory,
-                         //              localMemoryPerThread,
-                         //              localMemoryTotal
-  u32 correlation_id;   // offset 92
-  s64 grid_id;          // offset 96
-  u64 name_ptr;         // offset 104 - const char* (user-space pointer)
-  u64 _reserved0;       // offset 112
-  u64 queued;           // offset 120
-  u64 submitted;        // offset 128
-  u8 _pad3[8];          // offset 136 - launchType, isSharedMemoryCarveout,
-                         //              sharedMemoryCarveoutRequested,
-                         //              padding, sharedMemoryExecuted
-  u64 graph_node_id;   // offset 144
-  u32 shmem_limit_cfg; // offset 152 - CUpti_FuncShmemLimitConfig
-  u32 graph_id;         // offset 156
-} __attribute__((__packed__)) __attribute__((aligned(8)));
-
-// Verify expected size at compile time
-_Static_assert(sizeof(struct cupti_activity_kernel5) == 160,
-               "cupti_activity_kernel5 size mismatch");
-
-#endif // CUPTI_ACTIVITY_BPF_H
diff --git a/test/bpf/go.mod b/test/bpf/go.mod
index ae3ac5b..b3ab1bf 100644
--- a/test/bpf/go.mod
+++ b/test/bpf/go.mod
@@ -1,25 +1,12 @@
-module github.com/parca-dev/batch-kernel/test/bpf
+module parcagpu/test/bpf
 
 go 1.25.1
 
 require (
-	github.com/cilium/ebpf v0.20.0
-	go.opentelemetry.io/ebpf-profiler v0.0.0-00010101000000-000000000000
+	github.com/cilium/ebpf v0.21.0
+	github.com/gnurizen/sass-table v0.0.1
+	github.com/parca-dev/usdt v0.0.2
 	golang.org/x/sys v0.41.0
 )
 
-require (
-	github.com/cespare/xxhash/v2 v2.3.0 // indirect
-	github.com/go-logr/logr v1.4.3 // indirect
-	github.com/go-logr/stdr v1.2.2 // indirect
-	github.com/google/uuid v1.6.0 // indirect
-	github.com/klauspost/cpuid/v2 v2.2.8 // indirect
-	github.com/minio/sha256-simd v1.0.1 // indirect
-	github.com/sirupsen/logrus v1.9.3 // indirect
-	go.opentelemetry.io/auto/sdk v1.2.1 // indirect
-	go.opentelemetry.io/otel v1.39.0 // indirect
-	go.opentelemetry.io/otel/metric v1.39.0 // indirect
-	go.opentelemetry.io/otel/trace v1.39.0 // indirect
-)
-
-replace go.opentelemetry.io/ebpf-profiler => ../../vendor/opentelemetry-ebpf-profiler
+require github.com/rogpeppe/go-internal v1.14.1 // indirect
diff --git a/test/bpf/go.sum b/test/bpf/go.sum
index 6530420..f1dfcbd 100644
--- a/test/bpf/go.sum
+++ b/test/bpf/go.sum
@@ -1,66 +1,30 @@
-github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs=
-github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs=
-github.com/cilium/ebpf v0.20.0 h1:atwWj9d3NffHyPZzVlx3hmw1on5CLe9eljR8VuHTwhM=
-github.com/cilium/ebpf v0.20.0/go.mod h1:pzLjFymM+uZPLk/IXZUL63xdx5VXEo+enTzxkZXdycw=
-github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
-github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
-github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
-github.com/go-logr/logr v1.2.2/go.mod h1:jdQByPbusPIv2/zmleS9BjJVeZ6kBagPoEUsqbVz/1A=
-github.com/go-logr/logr v1.4.3 h1:CjnDlHq8ikf6E492q6eKboGOC0T8CDaOvkHCIg8idEI=
-github.com/go-logr/logr v1.4.3/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY=
-github.com/go-logr/stdr v1.2.2 h1:hSWxHoqTgW2S2qGc0LTAI563KZ5YKYRhT3MFKZMbjag=
-github.com/go-logr/stdr v1.2.2/go.mod h1:mMo/vtBO5dYbehREoey6XUKy/eSumjCCveDpRre4VKE=
+github.com/cilium/ebpf v0.21.0 h1:4dpx1J/B/1apeTmWBH5BkVLayHTkFrMovVPnHEk+l3k=
+github.com/cilium/ebpf v0.21.0/go.mod h1:1kHKv6Kvh5a6TePP5vvvoMa1bclRyzUXELSs272fmIQ=
+github.com/gnurizen/sass-table v0.0.1 h1:LvV2GIUAIv6ZXLzr6TbC51Domg/5iVihhc7JXVGZXe0=
+github.com/gnurizen/sass-table v0.0.1/go.mod h1:epXL4VyR6Yuec1rY5RV9awBdWm6ibGUKRB5zajgyUjo=
 github.com/go-quicktest/qt v1.101.1-0.20240301121107-c6c8733fa1e6 h1:teYtXy9B7y5lHTp8V9KPxpYRAVA7dozigQcMiBust1s=
 github.com/go-quicktest/qt v1.101.1-0.20240301121107-c6c8733fa1e6/go.mod h1:p4lGIVX+8Wa6ZPNDvqcxq36XpUDLh42FLetFU7odllI=
 github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8=
 github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU=
-github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0=
-github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
 github.com/josharian/native v1.1.0 h1:uuaP0hAbW7Y4l0ZRQ6C9zfb7Mg1mbFKry/xzDAfmtLA=
 github.com/josharian/native v1.1.0/go.mod h1:7X/raswPFr05uY3HiLlYeyQntB6OO7E/d2Cu7qoaN2w=
-github.com/jsimonetti/rtnetlink/v2 v2.0.3 h1:Jcp7GTnTPepoUAJ9+LhTa7ZiebvNS56T1GtlEUaPNFE=
-github.com/jsimonetti/rtnetlink/v2 v2.0.3/go.mod h1:atIkksp/9fqtf6rpAw45JnttnP2gtuH9X88WPfWfS9A=
-github.com/klauspost/cpuid/v2 v2.2.8 h1:+StwCXwm9PdpiEkPyzBXIy+M9KUb4ODm0Zarf1kS5BM=
-github.com/klauspost/cpuid/v2 v2.2.8/go.mod h1:Lcz8mBdAVJIBVzewtcLocK12l3Y+JytZYpaMropDUws=
+github.com/jsimonetti/rtnetlink/v2 v2.0.1 h1:xda7qaHDSVOsADNouv7ukSuicKZO7GgVUCXxpaIEIlM=
+github.com/jsimonetti/rtnetlink/v2 v2.0.1/go.mod h1:7MoNYNbb3UaDHtF8udiJo/RH6VsTKP1pqKLUTVCvToE=
 github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE=
 github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk=
 github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY=
 github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE=
 github.com/mdlayher/netlink v1.7.2 h1:/UtM3ofJap7Vl4QWCPDGXY8d3GIY2UGSDbK+QWmY8/g=
 github.com/mdlayher/netlink v1.7.2/go.mod h1:xraEF7uJbxLhc5fpHL4cPe221LI2bdttWlU+ZGLfQSw=
-github.com/mdlayher/socket v0.4.1 h1:eM9y2/jlbs1M615oshPQOHZzj6R6wMT7bX5NPiQvn2U=
-github.com/mdlayher/socket v0.4.1/go.mod h1:cAqeGjoufqdxWkD7DkpyS+wcefOtmu5OQ8KuoJGIReA=
-github.com/minio/sha256-simd v1.0.1 h1:6kaan5IFmwTNynnKKpDHe6FWHohJOHhCPchzK49dzMM=
-github.com/minio/sha256-simd v1.0.1/go.mod h1:Pz6AKMiUdngCLpeTL/RJY1M9rUuPMYujV5xJjtbRSN8=
-github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
-github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
+github.com/mdlayher/socket v0.5.1 h1:VZaqt6RkGkt2OE9l3GcC6nZkqD3xKeQLyfleW/uBcos=
+github.com/mdlayher/socket v0.5.1/go.mod h1:TjPLHI1UgwEv5J1B5q0zTZq12A/6H7nKmtTanQE37IQ=
+github.com/parca-dev/usdt v0.0.2 h1:bpKQycQ++zV8pwkMaJSxZS07XnEXqO3rkHcLYFJDTl4=
+github.com/parca-dev/usdt v0.0.2/go.mod h1:bjh3OTksk+pyP7WsHWlRKWaMSJTUr0gx0piZ/tAv6/w=
 github.com/rogpeppe/go-internal v1.14.1 h1:UQB4HGPB6osV0SQTLymcB4TgvyWu6ZyliaW0tI/otEQ=
 github.com/rogpeppe/go-internal v1.14.1/go.mod h1:MaRKkUm5W0goXpeCfT7UZI6fk/L7L7so1lCWt35ZSgc=
-github.com/sirupsen/logrus v1.9.3 h1:dueUQJ1C2q9oE3F7wvmSGAaVtTmUizReu6fjN8uqzbQ=
-github.com/sirupsen/logrus v1.9.3/go.mod h1:naHLuLoDiP4jHNo9R0sCBMtWGeIprob74mVsIT4qYEQ=
-github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
-github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
-github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U=
-github.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U=
-go.opentelemetry.io/auto/sdk v1.2.1 h1:jXsnJ4Lmnqd11kwkBV2LgLoFMZKizbCi5fNZ/ipaZ64=
-go.opentelemetry.io/auto/sdk v1.2.1/go.mod h1:KRTj+aOaElaLi+wW1kO/DZRXwkF4C5xPbEe3ZiIhN7Y=
-go.opentelemetry.io/otel v1.39.0 h1:8yPrr/S0ND9QEfTfdP9V+SiwT4E0G7Y5MO7p85nis48=
-go.opentelemetry.io/otel v1.39.0/go.mod h1:kLlFTywNWrFyEdH0oj2xK0bFYZtHRYUdv1NklR/tgc8=
-go.opentelemetry.io/otel/metric v1.39.0 h1:d1UzonvEZriVfpNKEVmHXbdf909uGTOQjA0HF0Ls5Q0=
-go.opentelemetry.io/otel/metric v1.39.0/go.mod h1:jrZSWL33sD7bBxg1xjrqyDjnuzTUB0x1nBERXd7Ftcs=
-go.opentelemetry.io/otel/trace v1.39.0 h1:2d2vfpEDmCJ5zVYz7ijaJdOF59xLomrvj7bjt6/qCJI=
-go.opentelemetry.io/otel/trace v1.39.0/go.mod h1:88w4/PnZSazkGzz/w84VHpQafiU4EtqqlVdxWy+rNOA=
-golang.org/x/arch v0.20.0 h1:dx1zTU0MAE98U+TQ8BLl7XsJbgze2WnNKF/8tGp/Q6c=
-golang.org/x/arch v0.20.0/go.mod h1:bdwinDaKcfZUGpH09BB7ZmOfhalA8lQdzl62l8gGWsk=
 golang.org/x/net v0.46.0 h1:giFlY12I07fugqwPuWJi68oOnpfqFnJIJzaIIm2JVV4=
 golang.org/x/net v0.46.0/go.mod h1:Q9BGdFy1y4nkUwiLvT5qtyhAnEHgnQ/zd8PfU6nc210=
 golang.org/x/sync v0.17.0 h1:l60nONMj9l5drqw6jlhIELNv9I0A4OFgRsG9k2oT9Ug=
 golang.org/x/sync v0.17.0/go.mod h1:9KTHXmSnoGruLpwFjVSX0lNNA75CykiMECbovNTZqGI=
-golang.org/x/sys v0.0.0-20220715151400-c0bba94af5f8/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
-golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
 golang.org/x/sys v0.41.0 h1:Ivj+2Cp/ylzLiEU89QhWblYnOE9zerudt9Ftecq2C6k=
 golang.org/x/sys v0.41.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks=
-gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
-gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
-gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
-gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
diff --git a/test/bpf/main.go b/test/bpf/main.go
index f6e9738..681e4a0 100644
--- a/test/bpf/main.go
+++ b/test/bpf/main.go
@@ -1,10 +1,7 @@
 // Test program that loads the activity_parser BPF program, attaches it
-// to the parcagpu:activity_batch USDT probe in the target shared library,
-// and logs kernel activities received through the ring buffer.
-//
-// The USDT probe location and argument specs are parsed from the ELF
-// .note.stapsdt section using pfelf, then populated into the BPF
-// __bpf_usdt_specs map so bpf_usdt_arg() reads the correct registers.
+// to parcagpu USDT probes in the target shared library, and logs kernel
+// activities received through the ring buffer. Also captures cubin modules
+// and resolves PC addresses to source lines using llvm-dwarfdump.
 //
 // Usage:
 //
@@ -14,30 +11,51 @@
 package main
 
 import (
+	"bufio"
 	"bytes"
+	"debug/elf"
 	"encoding/binary"
 	"errors"
 	"flag"
 	"fmt"
+	"io"
 	"log"
 	"os"
+	"os/exec"
 	"os/signal"
 	"path/filepath"
+	"sort"
+	"strconv"
+	"strings"
+	"sync"
 	"syscall"
 	"time"
 	"unsafe"
 
-	"github.com/cilium/ebpf"
+	ebpf2 "github.com/cilium/ebpf"
 	"github.com/cilium/ebpf/link"
 	"github.com/cilium/ebpf/ringbuf"
-	"go.opentelemetry.io/ebpf-profiler/libpf/pfelf"
+	"github.com/parca-dev/usdt"
 	"golang.org/x/sys/unix"
+
+	sasstable "github.com/gnurizen/sass-table"
 )
 
-//go:generate bpf2go -cc clang -cflags "-O2 -g -Wall -target bpf -D__TARGET_ARCH_x86 -D__x86_64__ -I../../vendor/opentelemetry-ebpf-profiler/support/ebpf" activityParser activity_parser.bpf.c
+//go:generate go run github.com/cilium/ebpf/cmd/bpf2go -target $GOARCH -cflags "-I../../ebpf -I$USDT_HEADERS" activityParser activity_parser.bpf.c
+
+// Event type tags — must match BPF #defines.
+const (
+	eventTypeKernel        = 1
+	eventTypeCubinLoaded   = 2
+	eventTypeCubinUnloaded = 3
+	eventTypePCSample      = 4
+	eventTypeError         = 5
+)
 
 // KernelEvent matches struct kernel_event in the BPF program.
 type KernelEvent struct {
+	EventType     uint32
+	_             uint32
 	Start         uint64
 	End           uint64
 	CorrelationID uint32
@@ -48,6 +66,41 @@ type KernelEvent struct {
 	Name          [128]byte
 }
 
+// CubinEvent matches struct cubin_event in the BPF program.
+type CubinEvent struct {
+	EventType uint32
+	_         uint32
+	CubinCRC  uint64
+	CubinPtr  uint64
+	CubinSize uint64
+}
+
+// StallReason matches struct cupti_stall_reason in the BPF program.
+type StallReason struct {
+	Index   uint32
+	Samples uint32
+}
+
+// ErrorEvent matches struct error_event in the BPF program.
+type ErrorEvent struct {
+	EventType uint32
+	ErrorCode int32
+	Message   [256]byte
+	Component [64]byte
+}
+
+// PCSampleEvent matches struct pc_sample_event in the BPF program.
+type PCSampleEvent struct {
+	EventType        uint32
+	StallReasonCount uint32
+	CubinCRC         uint64
+	PCOffset         uint64
+	FunctionIndex    uint32
+	CorrelationID    uint32
+	FunctionName     [128]byte
+	StallReasons     [64]StallReason
+}
+
 const (
 	statBatches    = 0
 	statActivities = 1
@@ -55,6 +108,277 @@ const (
 	statDrops      = 3
 )
 
+// lineEntry is a single address→source mapping from the DWARF line table.
+type lineEntry struct {
+	addr uint64
+	file string
+	line int
+}
+
+// cubinStore holds loaded cubins and their parsed line tables.
+type cubinStore struct {
+	mu     sync.RWMutex
+	cubins map[uint64]*cubinInfo // keyed by CRC
+	pid    int
+}
+
+// textSection holds one .text._Zfuncname section from the cubin ELF.
+type textSection struct {
+	name string
+	data []byte
+}
+
+type cubinInfo struct {
+	crc     uint64
+	size    uint64
+	lines   []lineEntry   // sorted by addr
+	files   []string      // file table from line table header
+	archSM  int           // SM version from ELF e_flags (e.g. 121)
+	texts   []textSection // .text sections for instruction decoding
+	tmpPath string        // temp file for llvm-dwarfdump
+}
+
+func newCubinStore(pid int) *cubinStore {
+	return &cubinStore{
+		cubins: make(map[uint64]*cubinInfo),
+		pid:    pid,
+	}
+}
+
+func (cs *cubinStore) load(crc, ptr, size uint64) {
+	cs.mu.Lock()
+	defer cs.mu.Unlock()
+
+	if _, ok := cs.cubins[crc]; ok {
+		return // already loaded
+	}
+
+	data, err := readProcessMemory(cs.pid, ptr, size)
+	if err != nil {
+		log.Printf("  [CUBIN] failed to read cubin 0x%x (%d bytes) from pid %d: %v",
+			crc, size, cs.pid, err)
+		return
+	}
+
+	info := &cubinInfo{crc: crc, size: size}
+
+	// Write to temp file for llvm-dwarfdump.
+	tmp, err := os.CreateTemp("", fmt.Sprintf("cubin_%x_*.elf", crc))
+	if err != nil {
+		log.Printf("  [CUBIN] 0x%x loaded (%d bytes), no temp file: %v", crc, size, err)
+		cs.cubins[crc] = info
+		return
+	}
+	tmp.Write(data)
+	tmp.Close()
+	info.tmpPath = tmp.Name()
+
+	// Parse line table with llvm-dwarfdump.
+	lines, files, err := parseLinesWithDwarfdump(info.tmpPath)
+	if err != nil {
+		log.Printf("  [CUBIN] 0x%x loaded (%d bytes), no line info: %v", crc, size, err)
+	} else {
+		info.lines = lines
+		info.files = files
+		log.Printf("  [CUBIN] 0x%x loaded (%d bytes), %d line entries, %d files",
+			crc, size, len(lines), len(files))
+	}
+
+	// Parse ELF to extract SM version and .text sections for SASS decoding.
+	archSM, texts := parseCubinELF(data)
+	info.archSM = archSM
+	info.texts = texts
+	if archSM > 0 {
+		log.Printf("  [CUBIN] 0x%x SM%d, %d text sections", crc, archSM, len(texts))
+	}
+
+	cs.cubins[crc] = info
+}
+
+func (cs *cubinStore) unload(crc uint64) {
+	cs.mu.Lock()
+	defer cs.mu.Unlock()
+	if info, ok := cs.cubins[crc]; ok {
+		if info.tmpPath != "" {
+			os.Remove(info.tmpPath)
+		}
+		delete(cs.cubins, crc)
+	}
+	log.Printf("  [CUBIN] 0x%x unloaded", crc)
+}
+
+func (cs *cubinStore) cleanup() {
+	cs.mu.Lock()
+	defer cs.mu.Unlock()
+	for _, info := range cs.cubins {
+		if info.tmpPath != "" {
+			os.Remove(info.tmpPath)
+		}
+	}
+}
+
+// resolveInstruction looks up the SASS mnemonic at a PC offset.
+// Uses the sass-table opcode decoder first, falls back to nvdisasm cache.
+func (cs *cubinStore) resolveInstruction(cubinCRC uint64, pcOffset uint64) string {
+	cs.mu.RLock()
+	info, ok := cs.cubins[cubinCRC]
+	cs.mu.RUnlock()
+	if !ok {
+		return ""
+	}
+
+	if info.archSM == 0 || len(info.texts) == 0 {
+		return ""
+	}
+
+	// pcOffset is function-relative. Try all text sections — the offset
+	// should only produce a valid decode in the correct one.
+	for _, ts := range info.texts {
+		if int(pcOffset)+16 <= len(ts.data) {
+			m := sasstable.DecodeMnemonicFromSlice(info.archSM, ts.data[pcOffset:])
+			if m != "" {
+				return m
+			}
+		}
+	}
+	return ""
+}
+
+// parseCubinELF extracts the SM version and .text section data from a cubin ELF.
+func parseCubinELF(data []byte) (archSM int, texts []textSection) {
+	f, err := elf.NewFile(bytes.NewReader(data))
+	if err != nil {
+		return 0, nil
+	}
+	defer f.Close()
+
+	// Go's debug/elf doesn't expose e_flags. Read it directly from the ELF header.
+	// For ELF64: e_flags is at offset 48, 4 bytes little-endian.
+	if len(data) >= 52 {
+		flags := binary.LittleEndian.Uint32(data[48:52])
+		archSM = int((flags >> 8) & 0xFF)
+	}
+
+	for _, s := range f.Sections {
+		if s.Type == elf.SHT_PROGBITS && s.Flags&elf.SHF_EXECINSTR != 0 &&
+			strings.HasPrefix(s.Name, ".text") {
+			d, err := s.Data()
+			if err != nil {
+				continue
+			}
+			texts = append(texts, textSection{name: s.Name, data: d})
+		}
+	}
+	return archSM, texts
+}
+
+// resolvePC looks up a PC offset in a cubin's line table.
+func (cs *cubinStore) resolvePC(cubinCRC uint64, pcOffset uint64) (file string, line int) {
+	cs.mu.RLock()
+	info, ok := cs.cubins[cubinCRC]
+	cs.mu.RUnlock()
+	if !ok || len(info.lines) == 0 {
+		return
+	}
+
+	// Binary search for the largest address <= pcOffset.
+	i := sort.Search(len(info.lines), func(i int) bool {
+		return info.lines[i].addr > pcOffset
+	})
+	if i == 0 {
+		return
+	}
+	e := info.lines[i-1]
+	return e.file, e.line
+}
+
+// parseLinesWithDwarfdump runs llvm-dwarfdump --debug-line on the cubin ELF
+// and parses the output into a sorted line table.
+func parseLinesWithDwarfdump(path string) ([]lineEntry, []string, error) {
+	cmd := exec.Command("llvm-dwarfdump", "--debug-line", path)
+	out, err := cmd.Output()
+	if err != nil {
+		return nil, nil, fmt.Errorf("llvm-dwarfdump: %w", err)
+	}
+
+	var entries []lineEntry
+	files := map[string]bool{}
+	fileList := []string{}
+
+	scanner := bufio.NewScanner(bytes.NewReader(out))
+	for scanner.Scan() {
+		line := scanner.Text()
+
+		// Parse file table entries: file_names[  N]:
+		//            name: "foo.cu"
+		if strings.HasPrefix(line, "           name: ") {
+			name := strings.Trim(strings.TrimPrefix(line, "           name: "), "\"")
+			if !files[name] {
+				files[name] = true
+				fileList = append(fileList, name)
+			}
+			continue
+		}
+
+		// Parse line table rows: 0xADDR  LINE  COL  FILE ...
+		if !strings.HasPrefix(line, "0x") {
+			continue
+		}
+
+		fields := strings.Fields(line)
+		if len(fields) < 4 {
+			continue
+		}
+
+		addr, err := strconv.ParseUint(strings.TrimPrefix(fields[0], "0x"), 16, 64)
+		if err != nil {
+			continue
+		}
+
+		lineNum, err := strconv.Atoi(fields[1])
+		if err != nil {
+			continue
+		}
+
+		// fields[3] is the file index (1-based)
+		fileIdx, err := strconv.Atoi(fields[3])
+		if err != nil || fileIdx < 1 || fileIdx > len(fileList) {
+			continue
+		}
+
+		entries = append(entries, lineEntry{
+			addr: addr,
+			file: fileList[fileIdx-1],
+			line: lineNum,
+		})
+	}
+
+	if len(entries) == 0 {
+		return nil, nil, fmt.Errorf("no line entries found")
+	}
+
+	sort.Slice(entries, func(i, j int) bool {
+		return entries[i].addr < entries[j].addr
+	})
+
+	return entries, fileList, nil
+}
+
+func readProcessMemory(pid int, addr, size uint64) ([]byte, error) {
+	f, err := os.Open(fmt.Sprintf("/proc/%d/mem", pid))
+	if err != nil {
+		return nil, err
+	}
+	defer f.Close()
+
+	data := make([]byte, size)
+	_, err = f.ReadAt(data, int64(addr))
+	if err != nil && !errors.Is(err, io.EOF) {
+		return nil, err
+	}
+	return data, nil
+}
+
 func main() {
 	pid := flag.Int("pid", 0, "PID of the target process")
 	libPath := flag.String("lib", "", "Path to the shared library containing the USDT probe")
@@ -83,7 +407,7 @@ func main() {
 	// Load pre-compiled BPF objects.
 	objs := activityParserObjects{}
 	if err := loadActivityParserObjects(&objs, nil); err != nil {
-		var ve *ebpf.VerifierError
+		var ve *ebpf2.VerifierError
 		if errors.As(err, &ve) {
 			log.Fatalf("Verifier error loading BPF objects:\n%+v", ve)
 		}
@@ -92,67 +416,69 @@ func main() {
 	defer objs.Close()
 
 	// Parse USDT probes from the shared library's .note.stapsdt section.
-	ef, err := pfelf.Open(realLib)
-	if err != nil {
-		log.Fatalf("Opening ELF %s: %v", realLib, err)
-	}
-	defer ef.Close()
-
-	if err := ef.LoadSections(); err != nil {
-		log.Fatalf("Loading ELF sections: %v", err)
-	}
-
-	probes, err := ef.ParseUSDTProbes()
+	probes, err := usdt.ParseProbesFromFile(realLib)
 	if err != nil {
-		log.Fatalf("Parsing USDT probes: %v", err)
+		log.Fatalf("Parsing USDT probes from %s: %v", realLib, err)
 	}
 
-	// Find parcagpu:activity_batch probe(s) and attach uprobe at each site.
+	// Find USDT probes and attach uprobes at each site.
 	ex, err := link.OpenExecutable(realLib)
 	if err != nil {
 		log.Fatalf("Opening executable %s: %v", realLib, err)
 	}
 
+	type probeTarget struct {
+		name    string
+		handler *ebpf2.Program
+	}
+	targets := []probeTarget{
+		{"activity_batch", objs.HandleActivityBatch},
+		{"stall_reason_map", objs.HandleStallReasonMap},
+		{"cubin_loaded", objs.HandleCubinLoaded},
+		{"cubin_unloaded", objs.HandleCubinUnloaded},
+		{"pc_sample_batch", objs.HandlePcSampleBatch},
+		{"error", objs.HandleError},
+	}
+
 	var links []link.Link
 	var specID uint32
-	for _, probe := range probes {
-		if probe.Provider != "parcagpu" || probe.Name != "activity_batch" {
-			continue
-		}
+	for _, t := range targets {
+		for _, probe := range probes {
+			if probe.Provider != "parcagpu" || probe.Name != t.name {
+				continue
+			}
 
-		// Parse the stapsdt argument spec into a bpf_usdt_spec.
-		spec, err := pfelf.ParseUSDTArguments(probe.Arguments)
-		if err != nil {
-			log.Fatalf("Parsing USDT args %q: %v", probe.Arguments, err)
-		}
+			spec, err := usdt.ParseUSDTArguments(probe.Arguments)
+			if err != nil {
+				log.Fatalf("Parsing USDT args %q: %v", probe.Arguments, err)
+			}
 
-		// Store spec in the BPF map so bpf_usdt_arg() can look it up.
-		specBytes := pfelf.USDTSpecToBytes(spec)
-		if err := objs.BpfUsdtSpecs.Put(specID, specBytes); err != nil {
-			log.Fatalf("Populating USDT spec map: %v", err)
-		}
+			specBytes := usdt.SpecToBytes(spec)
+			if err := objs.BpfUsdtSpecs.Put(specID, specBytes); err != nil {
+				log.Fatalf("Populating USDT spec map: %v", err)
+			}
 
-		// Cookie: spec_id in high 32 bits (bpf_usdt_arg reads it via bpf_get_attach_cookie).
-		cookie := uint64(specID) << 32
+			cookie := uint64(specID) << 32
 
-		log.Printf("USDT probe parcagpu:activity_batch at offset 0x%x, args=%q, spec_id=%d",
-			probe.Location, probe.Arguments, specID)
+			log.Printf("USDT probe parcagpu:%s at offset 0x%x, args=%q, spec_id=%d",
+				t.name, probe.Location, probe.Arguments, specID)
 
-		up, err := ex.Uprobe("activity_batch", objs.HandleActivityBatch, &link.UprobeOptions{
-			Address:      probe.Location,
-			PID:          *pid,
-			Cookie:       cookie,
-			RefCtrOffset: probe.SemaphoreOffset,
-		})
-		if err != nil {
-			log.Fatalf("Attaching uprobe at offset 0x%x: %v", probe.Location, err)
+			up, err := ex.Uprobe(t.name, t.handler, &link.UprobeOptions{
+				Address:      probe.Location,
+				PID:          *pid,
+				Cookie:       cookie,
+				RefCtrOffset: probe.SemaphoreOffset,
+			})
+			if err != nil {
+				log.Fatalf("Attaching uprobe for %s at offset 0x%x: %v", t.name, probe.Location, err)
+			}
+			links = append(links, up)
+			specID++
 		}
-		links = append(links, up)
-		specID++
 	}
 
 	if len(links) == 0 {
-		log.Fatalf("No parcagpu:activity_batch USDT probes found in %s", realLib)
+		log.Fatalf("No parcagpu USDT probes found in %s", realLib)
 	}
 	defer func() {
 		for _, l := range links {
@@ -160,6 +486,12 @@ func main() {
 		}
 	}()
 
+	cubins := newCubinStore(*pid)
+	defer cubins.cleanup()
+
+	// Stall reason index → name cache, populated lazily from BPF map.
+	stallReasonNames := map[uint32]string{}
+
 	// Open ring buffer reader.
 	rd, err := ringbuf.NewReader(objs.Events)
 	if err != nil {
@@ -171,7 +503,6 @@ func main() {
 	sig := make(chan os.Signal, 1)
 	signal.Notify(sig, syscall.SIGINT, syscall.SIGTERM)
 
-	// Also watch for the target process to exit.
 	done := make(chan struct{})
 	go func() {
 		for {
@@ -193,9 +524,10 @@ func main() {
 	}()
 
 	log.Printf("Attached %d USDT probe(s) in %s (PID %d)", len(links), realLib, *pid)
-	log.Printf("Waiting for kernel activity events...")
+	log.Printf("Waiting for events...")
 
 	var eventCount uint64
+	var pcSampleCount uint64
 	ticker := time.NewTicker(2 * time.Second)
 	defer ticker.Stop()
 
@@ -205,7 +537,6 @@ func main() {
 		}
 	}()
 
-	var event KernelEvent
 	for {
 		record, err := rd.Read()
 		if err != nil {
@@ -216,23 +547,113 @@ func main() {
 			continue
 		}
 
-		if err := binary.Read(bytes.NewBuffer(record.RawSample), binary.LittleEndian, &event); err != nil {
-			log.Printf("Parsing event: %v", err)
+		raw := record.RawSample
+		if len(raw) < 4 {
 			continue
 		}
 
-		eventCount++
-		if *verbose {
-			name := cString(event.Name[:])
-			duration := event.End - event.Start
-			fmt.Printf("kernel: name=%-40s corr=%-6d dev=%d stream=%d graph=%-3d duration=%dns\n",
-				name, event.CorrelationID, event.DeviceID, event.StreamID, event.GraphID, duration)
+		eventType := binary.LittleEndian.Uint32(raw[:4])
+
+		switch eventType {
+		case eventTypeKernel:
+			var event KernelEvent
+			if err := binary.Read(bytes.NewReader(raw), binary.LittleEndian, &event); err != nil {
+				log.Printf("Parsing kernel event: %v", err)
+				continue
+			}
+			eventCount++
+			if *verbose {
+				name := cString(event.Name[:])
+				duration := event.End - event.Start
+				fmt.Printf("kernel: name=%-40s corr=%-6d dev=%d stream=%d graph=%-3d duration=%dns\n",
+					name, event.CorrelationID, event.DeviceID, event.StreamID, event.GraphID, duration)
+			}
+
+		case eventTypeCubinLoaded:
+			var event CubinEvent
+			if err := binary.Read(bytes.NewReader(raw), binary.LittleEndian, &event); err != nil {
+				log.Printf("Parsing cubin event: %v", err)
+				continue
+			}
+			cubins.load(event.CubinCRC, event.CubinPtr, event.CubinSize)
+
+		case eventTypeCubinUnloaded:
+			var event CubinEvent
+			if err := binary.Read(bytes.NewReader(raw), binary.LittleEndian, &event); err != nil {
+				log.Printf("Parsing cubin event: %v", err)
+				continue
+			}
+			cubins.unload(event.CubinCRC)
+
+		case eventTypeError:
+			var event ErrorEvent
+			if err := binary.Read(bytes.NewReader(raw), binary.LittleEndian, &event); err != nil {
+				log.Printf("Parsing error event: %v", err)
+				continue
+			}
+			msg := cString(event.Message[:])
+			comp := cString(event.Component[:])
+			log.Printf("ERROR [%s] code=%d: %s", comp, event.ErrorCode, msg)
+
+		case eventTypePCSample:
+			var event PCSampleEvent
+			if err := binary.Read(bytes.NewReader(raw), binary.LittleEndian, &event); err != nil {
+				log.Printf("Parsing pc sample event: %v", err)
+				continue
+			}
+			pcSampleCount++
+
+			// Lazily populate stall reason name cache.
+			if len(stallReasonNames) == 0 {
+				for i := uint32(0); i < 64; i++ {
+					var name [64]byte
+					if err := objs.StallReasons.Lookup(&i, &name); err != nil {
+						continue
+					}
+					s := cString(name[:])
+					if s != "" {
+						stallReasonNames[i] = s
+					}
+				}
+			}
+
+			name := cString(event.FunctionName[:])
+			file, line := cubins.resolvePC(event.CubinCRC, event.PCOffset)
+			insn := cubins.resolveInstruction(event.CubinCRC, event.PCOffset)
+
+			src := ""
+			if file != "" {
+				src = fmt.Sprintf("  %s:%d", file, line)
+			}
+			insnStr := ""
+			if insn != "" {
+				insnStr = fmt.Sprintf("  [%s]", insn)
+			}
+			corrStr := ""
+			if event.CorrelationID != 0 {
+				corrStr = fmt.Sprintf("  corr=%d", event.CorrelationID)
+			}
+			fmt.Printf("pc_sample: %s  pc=0x%04x%s%s%s\n", name, event.PCOffset, src, insnStr, corrStr)
+			for i := uint32(0); i < event.StallReasonCount; i++ {
+				sr := event.StallReasons[i]
+				if sr.Samples == 0 {
+					continue
+				}
+				srName := stallReasonNames[sr.Index]
+				if srName == "" {
+					srName = fmt.Sprintf("reason[%d]", sr.Index)
+				}
+				fmt.Printf("    %s = %d\n", srName, sr.Samples)
+			}
 		}
 	}
 
 	fmt.Println()
 	log.Printf("Final stats:")
 	printStats(&objs, eventCount)
+	log.Printf("  pc_samples=%d", pcSampleCount)
+	printStallReasonMap(&objs)
+	printCubins(cubins)
 }
 
 func printStats(objs *activityParserObjects, eventCount uint64) {
@@ -251,6 +672,53 @@ func printStats(objs *activityParserObjects, eventCount uint64) {
 		batches, activities, kernels, eventCount, drops)
 }
 
+func printStallReasonMap(objs *activityParserObjects) {
+	var loaded uint32
+	loadedKey := uint32(0)
+	if err := objs.StallMapLoaded.Lookup(&loadedKey, &loaded); err != nil || loaded == 0 {
+		log.Printf("  stall reason map: not received")
+		return
+	}
+
+	log.Printf("  stall reason map:")
+	for i := uint32(0); i < 64; i++ {
+		var name [64]byte
+		if err := objs.StallReasons.Lookup(&i, &name); err != nil {
+			continue
+		}
+		s := cString(name[:])
+		if s == "" {
+			continue
+		}
+		log.Printf("    [%2d] %s", i, s)
+	}
+}
+
+func printCubins(cs *cubinStore) {
+	cs.mu.RLock()
+	defer cs.mu.RUnlock()
+
+	if len(cs.cubins) == 0 {
+		log.Printf("  cubins: none loaded")
+		return
+	}
+
+	log.Printf("  cubins loaded: %d", len(cs.cubins))
+	for crc, info := range cs.cubins {
+		log.Printf("    crc=0x%x size=%d lines=%d files=%v",
+			crc, info.size, len(info.lines), info.files)
+
+		// Print first 10 line entries as demo.
+		for i, e := range info.lines {
+			if i >= 10 {
+				log.Printf("      ... and %d more entries", len(info.lines)-10)
+				break
+			}
+			log.Printf("      0x%04x -> %s:%d", e.addr, e.file, e.line)
+		}
+	}
+}
+
 func raiseMemlock() error {
 	return unix.Setrlimit(unix.RLIMIT_MEMLOCK, &unix.Rlimit{
 		Cur: unix.RLIM_INFINITY,
@@ -265,5 +733,7 @@ func cString(b []byte) string {
 	return string(b)
 }
 
-// Ensure KernelEvent matches the BPF struct size.
-var _ = [1]struct{}{}[unsafe.Sizeof(KernelEvent{})-168]
+// Ensure struct sizes match BPF.
+var _ = [1]struct{}{}[unsafe.Sizeof(KernelEvent{})-176]
+var _ = [1]struct{}{}[unsafe.Sizeof(CubinEvent{})-32]
+var _ = [1]struct{}{}[unsafe.Sizeof(PCSampleEvent{})-672]
diff --git a/test/bpf/usdt_compat.h b/test/bpf/usdt_compat.h
index ca5a15e..fc3793e 100644
--- a/test/bpf/usdt_compat.h
+++ b/test/bpf/usdt_compat.h
@@ -1,29 +1,25 @@
-// Compatibility shim for including the otel-ebpf-profiler usdt.h and
-// usdt_args.h from our vmlinux.h-based BPF program.
+// Compatibility shim for including parca-dev/usdt's usdt_args.h from our
+// vmlinux.h-based BPF program.
 //
 // Must be included AFTER vmlinux.h + bpf_helpers.h but BEFORE usdt_args.h.
-// Defines the include guards for bpfdefs.h and types.h so those vendor
-// headers (and their deep dependency trees) are skipped entirely.
+// Provides macros that usdt_args.h expects (normally from kernel.h) and
+// bridges bpf2go architecture defines to standard compiler builtins.
 
 #ifndef USDT_COMPAT_H
 #define USDT_COMPAT_H
 
-// Skip vendor bpfdefs.h and types.h — we already have everything from vmlinux.h.
-#define OPTI_BPFDEFS_H
-#define OPTI_TYPES_H
-
-// Macros expected by usdt_args.h (normally from bpfdefs.h).
+// Macros expected by usdt_args.h (normally from kernel.h).
 #define EBPF_INLINE __always_inline
 #define UNUSED __attribute__((unused))
 
-// bpf_map_def — usdt_args.h uses "extern bpf_map_def __bpf_usdt_specs"
-// (C++ style, no struct keyword), so we need a typedef.
-typedef struct bpf_map_def {
-  unsigned int type;
-  unsigned int key_size;
-  unsigned int value_size;
-  unsigned int max_entries;
-  unsigned int map_flags;
-} bpf_map_def;
+// bpf2go passes -target bpfel (not the real platform triple), so the
+// usual compiler builtins (__x86_64__, __aarch64__) are never set.
+// Bridge from bpf2go's __TARGET_ARCH_* defines to the builtins that
+// usdt_args.h checks for pt_regs layout.
+#if defined(__TARGET_ARCH_x86)
+#define __x86_64__
+#elif defined(__TARGET_ARCH_arm64)
+#define __aarch64__
+#endif
 
 #endif // USDT_COMPAT_H
diff --git a/test/mock_cuda.c b/test/mock_cuda.c
new file mode 100644
index 0000000..a512637
--- /dev/null
+++ b/test/mock_cuda.c
@@ -0,0 +1,31 @@
+/*
+ * Mock CUDA Driver API for testing
+ * Provides minimal implementation of cuDriverGetVersion for test environment
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <cuda.h>
+
+// Mock implementation of cuDriverGetVersion
+// Returns CUDA 12.8.1 (12081) to enable PC sampling in tests
+CUresult cuDriverGetVersion(int *driverVersion) {
+    if (driverVersion == NULL) {
+        return CUDA_ERROR_INVALID_VALUE;
+    }
+
+    // Version format: major * 1000 + minor * 10 + patch
+    // 12.8.1 = 12 * 1000 + 8 * 10 + 1 = 12081
+    *driverVersion = 12081;
+
+    if (getenv("PARCAGPU_DEBUG") != NULL) {
+        fprintf(stderr, "[MOCK_CUDA] cuDriverGetVersion() -> 12.8.1 (12081)\n");
+    }
+
+    return CUDA_SUCCESS;
+}
+
+// Mock cuCtxSynchronize — no-op in test (no real GPU work to wait for)
+CUresult cuCtxSynchronize(void) {
+    return CUDA_SUCCESS;
+}
diff --git a/test/mock_cupti.c b/test/mock_cupti.c
index 3c04589..01092fd 100644
--- a/test/mock_cupti.c
+++ b/test/mock_cupti.c
@@ -1,7 +1,10 @@
 #include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
 #include <stdint.h>
 #include <cuda.h>
 #include <cupti.h>
+#include <cupti_pcsampling.h>
 
 // Define callback function types if not already defined by CUPTI headers
 #ifndef CUpti_BufferRequestFunc
@@ -134,8 +137,290 @@ CUptiResult cuptiActivityGetNumDroppedRecords(CUcontext context,
     return CUPTI_SUCCESS;
 }
 
+CUptiResult cuptiGetContextId(CUcontext context, uint32_t *contextId) {
+    (void)context;  // Mark as intentionally unused
+    // Return a fixed context ID for testing
+    *contextId = 1;
+    return CUPTI_SUCCESS;
+}
+
 CUptiResult cuptiUnsubscribe(CUpti_SubscriberHandle subscriber) {
     (void)subscriber;  // Mark as intentionally unused
     fprintf(stderr, "[MOCK_CUPTI] cuptiUnsubscribe()\n");
     return CUPTI_SUCCESS;
 }
+
+// =========================================================================
+// PC Sampling mock — uses a real cubin from pc_sample_toy for realistic
+// CRC, offsets, and source-line correlation.
+// =========================================================================
+
+// Real cubin loaded from file (set MOCK_CUBIN_PATH, or auto-detected).
+static char *__cubin_data = NULL;
+static size_t __cubin_size = 0;
+static uint64_t __cubin_crc = 0;
+
+// CRC function shared between cuptiGetCubinCrc and internal use.
+static uint64_t __compute_crc(const void *data, size_t size) {
+    const uint8_t *bytes = (const uint8_t *)data;
+    uint64_t crc = 0xDEADBEEFULL;
+    for (size_t i = 0; i < size; i++)
+        crc = crc * 31 + bytes[i];
+    return crc;
+}
+
+static void __load_cubin(void) {
+    if (__cubin_data)
+        return;
+    const char *path = getenv("MOCK_CUBIN_PATH");
+    if (!path) {
+        fprintf(stderr, "[MOCK_CUPTI] MOCK_CUBIN_PATH not set, no cubin loaded\n");
+        return;
+    }
+    FILE *f = fopen(path, "rb");
+    if (!f) {
+        fprintf(stderr, "[MOCK_CUPTI] Failed to open cubin: %s\n", path);
+        return;
+    }
+    fseek(f, 0, SEEK_END);
+    __cubin_size = (size_t)ftell(f);
+    fseek(f, 0, SEEK_SET);
+    __cubin_data = (char *)malloc(__cubin_size);
+    if (fread(__cubin_data, 1, __cubin_size, f) != __cubin_size) {
+        fprintf(stderr, "[MOCK_CUPTI] Short read on cubin: %s\n", path);
+        free(__cubin_data);
+        __cubin_data = NULL;
+        __cubin_size = 0;
+        fclose(f);
+        return;
+    }
+    fclose(f);
+    __cubin_crc = __compute_crc(__cubin_data, __cubin_size);
+    fprintf(stderr, "[MOCK_CUPTI] Loaded cubin: %s (%zu bytes, crc=0x%lx)\n",
+            path, __cubin_size, __cubin_crc);
+}
+
+CUptiResult cuptiGetVersion(uint32_t *version) {
+    *version = 24;
+    return CUPTI_SUCCESS;
+}
+
+static int __pc_sampling_started = 0;
+
+CUptiResult cuptiPCSamplingEnable(CUpti_PCSamplingEnableParams *params) {
+    (void)params;
+    fprintf(stderr, "[MOCK_CUPTI] cuptiPCSamplingEnable()\n");
+    return CUPTI_SUCCESS;
+}
+
+CUptiResult cuptiPCSamplingDisable(CUpti_PCSamplingDisableParams *params) {
+    (void)params;
+    fprintf(stderr, "[MOCK_CUPTI] cuptiPCSamplingDisable()\n");
+    return CUPTI_SUCCESS;
+}
+
+CUptiResult cuptiPCSamplingStart(CUpti_PCSamplingStartParams *params) {
+    (void)params;
+    __pc_sampling_started = 1;
+    return CUPTI_SUCCESS;
+}
+
+CUptiResult cuptiPCSamplingStop(CUpti_PCSamplingStopParams *params) {
+    (void)params;
+    __pc_sampling_started = 0;
+    return CUPTI_SUCCESS;
+}
+
+CUptiResult cuptiPCSamplingSetConfigurationAttribute(
+    CUpti_PCSamplingConfigurationInfoParams *params) {
+    (void)params;
+    fprintf(stderr, "[MOCK_CUPTI] cuptiPCSamplingSetConfigurationAttribute()\n");
+    return CUPTI_SUCCESS;
+}
+
+CUptiResult cuptiPCSamplingGetNumStallReasons(
+    CUpti_PCSamplingGetNumStallReasonsParams *params) {
+    *params->numStallReasons = 3;
+    return CUPTI_SUCCESS;
+}
+
+static const char *__mock_stall_names[] = {
+    "smsp__pcsamp_warps_issue_stalled_not_selected",
+    "smsp__pcsamp_warps_issue_stalled_math_pipe_throttle",
+    "smsp__pcsamp_warps_issue_stalled_barrier",
+};
+
+CUptiResult cuptiPCSamplingGetStallReasons(
+    CUpti_PCSamplingGetStallReasonsParams *params) {
+    size_t n = params->numStallReasons < 3 ? params->numStallReasons : 3;
+    for (size_t i = 0; i < n; i++) {
+        strncpy(params->stallReasons[i], __mock_stall_names[i],
+                CUPTI_STALL_REASON_STRING_SIZE - 1);
+        params->stallReasons[i][CUPTI_STALL_REASON_STRING_SIZE - 1] = '\0';
+        params->stallReasonIndex[i] = (uint32_t)i;
+    }
+    return CUPTI_SUCCESS;
+}
+
+// ---------------------------------------------------------------------------
+// Synthetic PC samples using real offsets from pc_sample_toy.cu kernels.
+// Offsets extracted from nvdisasm -g on the sm_121 cubin.
+// ---------------------------------------------------------------------------
+
+typedef struct {
+    const char *functionName;
+    uint32_t functionIndex;
+    uint64_t pcOffset;
+    // Expected source correlation for this offset:
+    uint32_t lineNumber;
+    const char *fileName;
+    const char *dirName;
+} MockPCSample;
+
+// Representative offsets covering all three kernels and distinct source lines.
+// Offsets and line numbers from: nvdisasm -g -c pc_sample_toy.sm_121.cubin
+static MockPCSample __mock_samples[] = {
+    // shmem_bounce — shared-memory bouncing kernel
+    {"_Z12shmem_bouncePfiy", 0, 0x00b0, 54,
+     "pc_sample_toy.cu", "/home/tpr/src/parcagpu-proton/microbenchmarks"},  // STS [R2], R3
+    {"_Z12shmem_bouncePfiy", 0, 0x00d0, 55,
+     "pc_sample_toy.cu", "/home/tpr/src/parcagpu-proton/microbenchmarks"},  // BAR.SYNC
+    {"_Z12shmem_bouncePfiy", 0, 0x01f0, 58,
+     "pc_sample_toy.cu", "/home/tpr/src/parcagpu-proton/microbenchmarks"},  // LDS (inner loop)
+    {"_Z12shmem_bouncePfiy", 0, 0x0230, 58,
+     "pc_sample_toy.cu", "/home/tpr/src/parcagpu-proton/microbenchmarks"},  // FFMA (inner loop)
+    {"_Z12shmem_bouncePfiy", 0, 0x0250, 59,
+     "pc_sample_toy.cu", "/home/tpr/src/parcagpu-proton/microbenchmarks"},  // BAR.SYNC (inner loop)
+
+    // hash_churn — integer bit-twiddling kernel
+    {"_Z10hash_churnPjiy", 1, 0x0050, 34,
+     "pc_sample_toy.cu", "/home/tpr/src/parcagpu-proton/microbenchmarks"},  // IMAD (idx calc)
+    {"_Z10hash_churnPjiy", 1, 0x0080, 39,
+     "pc_sample_toy.cu", "/home/tpr/src/parcagpu-proton/microbenchmarks"},  // LDCU (loop start)
+    {"_Z10hash_churnPjiy", 1, 0x0180, 40,
+     "pc_sample_toy.cu", "/home/tpr/src/parcagpu-proton/microbenchmarks"},  // IMAD.SHL (h ^= h << 13)
+
+    // trig_storm — FP math kernel
+    {"_Z10trig_stormPfiy", 2, 0x0050, 21,
+     "pc_sample_toy.cu", "/home/tpr/src/parcagpu-proton/microbenchmarks"},  // IMAD (idx calc)
+    {"_Z10trig_stormPfiy", 2, 0x00b0, 25,
+     "pc_sample_toy.cu", "/home/tpr/src/parcagpu-proton/microbenchmarks"},  // FMUL (x * 0.001f)
+    {"_Z10trig_stormPfiy", 2, 0x0f30, 27,
+     "pc_sample_toy.cu", "/home/tpr/src/parcagpu-proton/microbenchmarks"},  // FMUL (sinf*cosf inner)
+};
+#define NUM_MOCK_SAMPLES (sizeof(__mock_samples) / sizeof(__mock_samples[0]))
+
+static int __pc_get_data_calls = 0;
+
+CUptiResult cuptiPCSamplingGetData(CUpti_PCSamplingGetDataParams *params) {
+    CUpti_PCSamplingData *data = params->pcSamplingData;
+
+    __pc_get_data_calls++;
+    if (__pc_get_data_calls % 3 != 0 || data->collectNumPcs == 0) {
+        data->totalNumPcs = 0;
+        data->remainingNumPcs = 0;
+        return CUPTI_SUCCESS;
+    }
+
+    // Cycle through the sample table, emitting a batch each time.
+    static size_t sample_cursor = 0;
+    size_t count = data->collectNumPcs < 4 ? data->collectNumPcs : 4;
+    if (count > NUM_MOCK_SAMPLES)
+        count = NUM_MOCK_SAMPLES;
+
+    data->totalNumPcs = count;
+    data->remainingNumPcs = 0;
+    data->totalSamples = count * 9;
+
+    for (size_t i = 0; i < count; i++) {
+        MockPCSample *s = &__mock_samples[(sample_cursor + i) % NUM_MOCK_SAMPLES];
+        CUpti_PCSamplingPCData *pc = &data->pPcData[i];
+        pc->size = sizeof(CUpti_PCSamplingPCData);
+        pc->cubinCrc = __cubin_crc;
+        pc->pcOffset = s->pcOffset;
+        pc->functionIndex = s->functionIndex;
+        pc->functionName = (char *)s->functionName;
+        pc->stallReasonCount = 3;
+        for (size_t j = 0; j < 3; j++) {
+            pc->stallReason[j].pcSamplingStallReasonIndex = (uint32_t)j;
+            pc->stallReason[j].samples = (uint32_t)(5 - j * 2);
+        }
+    }
+    sample_cursor = (sample_cursor + count) % NUM_MOCK_SAMPLES;
+    return CUPTI_SUCCESS;
+}
+
+CUptiResult cuptiGetCubinCrc(CUpti_GetCubinCrcParams *params) {
+    params->cubinCrc = __compute_crc(params->cubin, params->cubinSize);
+    return CUPTI_SUCCESS;
+}
+
+// Source correlation: look up the offset in our known table.
+// Falls back to zeros if the offset isn't in the table (same as real CUPTI
+// when debug info is missing).
+CUptiResult cuptiGetSassToSourceCorrelation(
+    CUpti_GetSassToSourceCorrelationParams *params) {
+    for (size_t i = 0; i < NUM_MOCK_SAMPLES; i++) {
+        MockPCSample *s = &__mock_samples[i];
+        if (params->pcOffset == s->pcOffset &&
+            params->functionName && strcmp(params->functionName, s->functionName) == 0) {
+            params->lineNumber = s->lineNumber;
+            params->fileName = strdup(s->fileName);
+            params->dirName = strdup(s->dirName);
+            return CUPTI_SUCCESS;
+        }
+    }
+    // Unknown offset — no source info available.
+    params->lineNumber = 0;
+    params->fileName = NULL;
+    params->dirName = NULL;
+    return CUPTI_SUCCESS;
+}
+
+// =========================================================================
+// Resource callback helper — called from test harness after init.
+// Fires CONTEXT_CREATED and MODULE_LOADED with the real cubin.
+// =========================================================================
+
+void __mock_cupti_fire_resource_callbacks(void) {
+    if (!__cupti_runtime_api_callback)
+        return;
+
+    __load_cubin();
+
+    fprintf(stderr, "[MOCK_CUPTI] Firing resource callbacks\n");
+
+    // 1. CONTEXT_CREATED
+    CUpti_ResourceData resData;
+    memset(&resData, 0, sizeof(resData));
+    resData.context = (CUcontext)(uintptr_t)0x1;
+    __cupti_runtime_api_callback(
+        __cupti_runtime_api_userdata,
+        CUPTI_CB_DOMAIN_RESOURCE,
+        CUPTI_CBID_RESOURCE_CONTEXT_CREATED,
+        &resData);
+
+    // 2. MODULE_LOADED with the real cubin.
+    CUpti_ModuleResourceData modData;
+    memset(&modData, 0, sizeof(modData));
+    if (__cubin_data) {
+        modData.pCubin = __cubin_data;
+        modData.cubinSize = __cubin_size;
+    } else {
+        // Fallback: minimal fake cubin if no file was loaded.
+        static const char fake[] = {0x7f, 'E', 'L', 'F', 0,0,0,0, 0,0,0,0, 0,0,0,0};
+        modData.pCubin = fake;
+        modData.cubinSize = sizeof(fake);
+    }
+    resData.resourceDescriptor = &modData;
+    __cupti_runtime_api_callback(
+        __cupti_runtime_api_userdata,
+        CUPTI_CB_DOMAIN_RESOURCE,
+        CUPTI_CBID_RESOURCE_MODULE_LOADED,
+        &resData);
+}
+
+CUptiResult cuptiActivityDisable(CUpti_ActivityKind kind) {
+    (void)kind;
+    return CUPTI_SUCCESS;
+}
diff --git a/test/pc_sample_toy.cubin b/test/pc_sample_toy.cubin
new file mode 100644
index 0000000..849a335
Binary files /dev/null and b/test/pc_sample_toy.cubin differ
diff --git a/test/test-pc-mock.sh b/test/test-pc-mock.sh
new file mode 100755
index 0000000..2d8772a
--- /dev/null
+++ b/test/test-pc-mock.sh
@@ -0,0 +1,129 @@
+#!/bin/bash
+# Mock PC sampling test: runs test_cupti_prof (mock CUPTI) under parcagpu
+# with BPF activity parser. Verifies stall reason map, PC samples, and
+# cubin loading WITHOUT requiring a real GPU.
+#
+# Prerequisites:
+#   make local bpf-test
+#
+# Usage:
+#   sudo -E test/test-pc-mock.sh          # default
+#   sudo -E test/test-pc-mock.sh -v       # verbose
+
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
+ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
+
+LIB="$ROOT/build-local/lib/libparcagpucupti.so"
+TEST_BIN="$ROOT/build-local/bin/test_cupti_prof"
+BPF="$ROOT/test/bpf/activity_parser"
+CUBIN="$ROOT/test/pc_sample_toy.cubin"
+BPF_LOG="/tmp/parcagpu-pc-mock-bpf.log"
+TEST_LOG="/tmp/parcagpu-pc-mock-test.log"
+VERBOSE=""
+
+for arg in "$@"; do
+  case "$arg" in
+    -v) VERBOSE="-v" ;;
+  esac
+done
+
+# --- Preflight checks ---
+for f in "$LIB" "$TEST_BIN" "$BPF" "$CUBIN"; do
+  if [ ! -x "$f" ] && [ ! -f "$f" ]; then
+    echo "error: $f not found" >&2
+    exit 1
+  fi
+done
+
+cleanup() {
+  [ -n "${TEST_PID:-}" ] && kill "$TEST_PID" 2>/dev/null || true
+  [ -n "${BPF_PID:-}" ] && kill "$BPF_PID" 2>/dev/null || true
+  wait 2>/dev/null || true
+}
+trap cleanup EXIT
+
+# --- Launch mock workload ---
+# Mock CUPTI/CUDA libs from build-local so proton's dynamic loader finds them
+# instead of real libcupti.so / libcuda.so.
+# Set probability=1 so every interval check triggers sampling.
+echo "=== Starting test_cupti_prof (mock) ==="
+LD_LIBRARY_PATH="$ROOT/build-local/lib:${LD_LIBRARY_PATH:-}" \
+  PARCAGPU_DEBUG=1 \
+  PARCAGPU_PC_SAMPLING_PROBABILITY=1 \
+  PARCAGPU_PC_SAMPLING_INTERVAL=0.1 \
+  MOCK_CUBIN_PATH="$CUBIN" \
+  "$TEST_BIN" "$LIB" --duration=5 > "$TEST_LOG" 2>&1 &
+TEST_PID=$!
+echo "test_cupti_prof PID: $TEST_PID"
+
+# Wait for library to be loaded into the process.
+while kill -0 "$TEST_PID" 2>/dev/null &&
+      ! grep -q libparcagpucupti "/proc/$TEST_PID/maps" 2>/dev/null; do
+  sleep 0.1
+done
+
+if ! kill -0 "$TEST_PID" 2>/dev/null; then
+  echo "error: test_cupti_prof exited before library loaded" >&2
+  cat "$TEST_LOG" >&2
+  exit 1
+fi
+
+# --- Attach BPF parser ---
+echo "=== Starting BPF activity parser ==="
+"$BPF" -pid "$TEST_PID" -lib "$LIB" $VERBOSE > "$BPF_LOG" 2>&1 &
+BPF_PID=$!
+echo "activity_parser PID: $BPF_PID"
+
+# --- Wait for workload to finish ---
+wait "$TEST_PID" 2>/dev/null || true
+TEST_PID=""
+sleep 2
+
+# --- Stop BPF parser ---
+kill "$BPF_PID" 2>/dev/null || true
+wait "$BPF_PID" 2>/dev/null || true
+BPF_PID=""
+
+# --- Results ---
+echo
+echo "=== Mock test output (parcagpu debug) ==="
+cat "$TEST_LOG"
+echo
+echo "=== BPF parser output ==="
+cat "$BPF_LOG"
+echo
+
+# --- Checks ---
+PASS=true
+
+check() {
+  local label="$1" pattern="$2" file="$3"
+  if grep -q "$pattern" "$file"; then
+    echo "PASS: $label"
+  else
+    echo "FAIL: $label" >&2
+    PASS=false
+  fi
+}
+
+check "PC sampling initialized" "PC sampling initialized" "$TEST_LOG"
+check "real cubin loaded (mock)" "Loaded cubin.*pc_sample_toy" "$TEST_LOG"
+check "modules loaded (parcagpu)" "Module 0x.*loaded" "$TEST_LOG"
+check "source correlation: shmem_bounce" "func=_Z12shmem_bounce.*pc_sample_toy.cu" "$TEST_LOG"
+check "source correlation: hash_churn"  "func=_Z10hash_churn.*pc_sample_toy.cu"  "$TEST_LOG"
+check "source correlation: trig_storm"  "func=_Z10trig_storm.*pc_sample_toy.cu"  "$TEST_LOG"
+check "stall reason map received" "\[ 0\] smsp__pcsamp" "$BPF_LOG"
+check "PC samples contain stall reasons" "smsp__pcsamp" "$BPF_LOG"
+check "cubins loaded (bpf)" "\[CUBIN\].*loaded" "$BPF_LOG"
+check "PC sample events received" "pc_samples=[1-9]" "$BPF_LOG"
+
+if $PASS; then
+  echo
+  echo "=== ALL CHECKS PASSED ==="
+else
+  echo
+  echo "=== SOME CHECKS FAILED ===" >&2
+  exit 1
+fi
diff --git a/test/test-pc-real.sh b/test/test-pc-real.sh
new file mode 100755
index 0000000..5892957
--- /dev/null
+++ b/test/test-pc-real.sh
@@ -0,0 +1,117 @@
+#!/bin/bash
+# Smoke test: runs pc_sample_toy under parcagpu with BPF activity parser.
+# Verifies stall reason map, PC samples, and cubin loading.
+#
+# Prerequisites:
+#   make local bpf-test
+#   nvcc -g -lineinfo -o microbenchmarks/pc_sample_toy microbenchmarks/pc_sample_toy.cu
+#
+# Usage:
+#   sudo -E test/test-pc-real.sh          # default
+#   sudo -E test/test-pc-real.sh -v       # verbose (print every event)
+
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
+ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
+
+LIB="$ROOT/build-local/lib/libparcagpucupti.so"
+TOY="$ROOT/microbenchmarks/pc_sample_toy"
+BPF="$ROOT/test/bpf/activity_parser"
+BPF_LOG="/tmp/parcagpu-pc-test-bpf.log"
+TOY_LOG="/tmp/parcagpu-pc-test-toy.log"
+VERBOSE=""
+
+for arg in "$@"; do
+  case "$arg" in
+    -v) VERBOSE="-v" ;;
+  esac
+done
+
+# --- Preflight checks ---
+for f in "$LIB" "$TOY" "$BPF"; do
+  if [ ! -x "$f" ] && [ ! -f "$f" ]; then
+    echo "error: $f not found" >&2
+    exit 1
+  fi
+done
+
+cleanup() {
+  [ -n "${TOY_PID:-}" ] && kill "$TOY_PID" 2>/dev/null || true
+  [ -n "${BPF_PID:-}" ] && kill "$BPF_PID" 2>/dev/null || true
+  wait 2>/dev/null || true
+}
+trap cleanup EXIT
+
+# --- Launch the toy workload ---
+echo "=== Starting pc_sample_toy ==="
+PARCAGPU_DEBUG=1 PARCAGPU_SAMPLING_FACTOR=18 \
+  PARCAGPU_PC_SAMPLING_PROBABILITY=1 \
+  PARCAGPU_PC_SAMPLING_INTERVAL=0.5 \
+  CUDA_INJECTION64_PATH="$LIB" "$TOY" 8 > "$TOY_LOG" 2>&1 &
+TOY_PID=$!
+echo "pc_sample_toy PID: $TOY_PID"
+
+# Wait for library to be loaded into the process.
+while kill -0 "$TOY_PID" 2>/dev/null &&
+      ! grep -q libparcagpucupti "/proc/$TOY_PID/maps" 2>/dev/null; do
+  sleep 0.1
+done
+
+if ! kill -0 "$TOY_PID" 2>/dev/null; then
+  echo "error: pc_sample_toy exited before library loaded" >&2
+  exit 1
+fi
+
+# --- Attach BPF parser ---
+echo "=== Starting BPF activity parser ==="
+"$BPF" -pid "$TOY_PID" -lib "$LIB" $VERBOSE > "$BPF_LOG" 2>&1 &
+BPF_PID=$!
+echo "activity_parser PID: $BPF_PID"
+
+# --- Wait for workload to finish ---
+wait "$TOY_PID" 2>/dev/null || true
+TOY_PID=""
+sleep 2
+
+# --- Stop BPF parser ---
+kill "$BPF_PID" 2>/dev/null || true
+wait "$BPF_PID" 2>/dev/null || true
+BPF_PID=""
+
+# --- Results ---
+echo
+echo "=== Toy output (parcagpu debug) ==="
+cat "$TOY_LOG"
+echo
+echo "=== BPF parser output ==="
+cat "$BPF_LOG"
+echo
+
+# --- Checks ---
+PASS=true
+
+check() {
+  local label="$1" pattern="$2" file="$3"
+  if grep -q "$pattern" "$file"; then
+    echo "PASS: $label"
+  else
+    echo "FAIL: $label" >&2
+    PASS=false
+  fi
+}
+
+check "modules loaded (parcagpu)" "Module 0x.*loaded" "$TOY_LOG"
+check "stall reason map received" "\[ 0\] smsp__pcsamp" "$BPF_LOG"
+check "PC samples contain stall reasons" "smsp__pcsamp" "$BPF_LOG"
+check "cubins loaded (bpf)" "\[CUBIN\].*loaded" "$BPF_LOG"
+check "PC sample events received" "pc_samples=[1-9]" "$BPF_LOG"
+
+if $PASS; then
+  echo
+  echo "=== ALL CHECKS PASSED ==="
+else
+  echo
+  echo "=== SOME CHECKS FAILED ===" >&2
+  exit 1
+fi
diff --git a/test/test_cupti_prof.c b/test/test_cupti_prof.c
index 04810a0..c16649c 100644
--- a/test/test_cupti_prof.c
+++ b/test/test_cupti_prof.c
@@ -628,6 +628,13 @@ void *cupti_thread(void *arg) {
             size_t maxNumRecords;
             bufferRequestedCallback(&buffer, &bufferSize, &maxNumRecords);
 
+            // The profiler may return NULL when no tracer is attached
+            // (semaphore-gated short circuit). Skip this flush cycle.
+            if (buffer == NULL) {
+                usleep(100000);
+                continue;
+            }
+
             // Fill the buffer with activity records for launched kernels
             size_t offset = 0;
             size_t recordSize = sizeof(CUpti_ActivityKernel5);
@@ -894,6 +901,16 @@ int main(int argc, char **argv) {
         return 0;
     }
 
+    // Fire resource callbacks (CONTEXT_CREATED, MODULE_LOADED) so that
+    // PC sampling initialization runs in the profiler library.
+    typedef void (*FireResourceCallbacksFunc)(void);
+    FireResourceCallbacksFunc fireResourceCbs =
+        (FireResourceCallbacksFunc)dlsym(RTLD_DEFAULT,
+                                         "__mock_cupti_fire_resource_callbacks");
+    if (fireResourceCbs) {
+        fireResourceCbs();
+    }
+
     // Load kernel names if specified
     KernelNameList *kernel_names = NULL;
     if (config.kernel_names) {
diff --git a/vendor/opentelemetry-ebpf-profiler b/vendor/opentelemetry-ebpf-profiler
deleted file mode 160000
index d13351c..0000000
--- a/vendor/opentelemetry-ebpf-profiler
+++ /dev/null
@@ -1 +0,0 @@
-Subproject commit d13351cf54ac5cad54bbf313f321539f4ba9f70b