diff --git a/.clangd b/.clangd new file mode 100644 index 0000000..1e71e2d --- /dev/null +++ b/.clangd @@ -0,0 +1,2 @@ +CompileFlags: + CompilationDatabase: build-local diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000..fe8a30f --- /dev/null +++ b/.dockerignore @@ -0,0 +1,17 @@ +# Git metadata (especially important for proton submodule) +.git/ +.gitmodules +.github/ + +# Build artifacts (in case they exist in directories we copy) +build/ +build-*/ +*.o +*.so +*.so.* + +# CMake artifacts +CMakeCache.txt +CMakeFiles/ +cmake_install.cmake +compile_commands.json diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml new file mode 100644 index 0000000..b98edf8 --- /dev/null +++ b/.github/workflows/build.yml @@ -0,0 +1,118 @@ +name: Build + +on: + push: + branches: [main, proton] + tags: ['v*'] + pull_request: + branches: [main, proton] + +env: + REGISTRY: ghcr.io + IMAGE_NAME: ${{ github.repository }} + CUDA_HEADERS: ghcr.io/parca-dev/cuda-headers:12 + +jobs: + build: + runs-on: ubuntu-latest + permissions: + contents: read + packages: write + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + with: + submodules: recursive + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + + - name: Build library (amd64) + run: | + mkdir -p build/amd64 + docker buildx build -f Dockerfile \ + --build-arg CUDA_HEADERS=${{ env.CUDA_HEADERS }} \ + --target export \ + --output type=local,dest=build/amd64 \ + --platform linux/amd64 \ + . + + - name: Verify build artifacts + run: | + echo "=== Build artifacts ===" + ls -lh build/amd64/ + file build/amd64/libparcagpucupti.so + readelf -n build/amd64/libparcagpucupti.so | grep -A2 "stapsdt" | head -20 + + - name: Log in to Container Registry + if: github.event_name != 'pull_request' + uses: docker/login-action@v3 + with: + registry: ${{ env.REGISTRY }} + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + + - name: Extract metadata (tags, labels) + id: meta + uses: docker/metadata-action@v5 + with: + images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }} + tags: | + type=semver,pattern={{version}} + type=semver,pattern={{major}}.{{minor}} + type=raw,value={{date 'YYYYMMDD'}}-{{sha}},enable={{is_default_branch}} + type=raw,value=latest,enable={{is_default_branch}} + + - name: Build and push multi-arch image + if: github.event_name != 'pull_request' + uses: docker/build-push-action@v5 + with: + context: . + file: ./Dockerfile + build-args: | + CUDA_HEADERS=${{ env.CUDA_HEADERS }} + platforms: linux/amd64,linux/arm64 + target: runtime + push: true + tags: ${{ steps.meta.outputs.tags }} + labels: ${{ steps.meta.outputs.labels }} + cache-from: type=gha + cache-to: type=gha,mode=max + + release-binaries: + if: startsWith(github.ref, 'refs/tags/v') + needs: build + runs-on: ubuntu-latest + permissions: + contents: write + + strategy: + matrix: + arch: [amd64, arm64] + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + with: + submodules: recursive + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + + - name: Build and extract binary for ${{ matrix.arch }} + run: | + mkdir -p build/${{ matrix.arch }} + docker buildx build -f Dockerfile \ + --build-arg CUDA_HEADERS=${{ env.CUDA_HEADERS }} \ + --target export \ + --output type=local,dest=build/${{ matrix.arch }} \ + --platform linux/${{ matrix.arch }} \ + . + mv build/${{ matrix.arch }}/libparcagpucupti.so \ + build/${{ matrix.arch }}/libparcagpucupti-${{ matrix.arch }}.so + + - name: Upload to GitHub Release + uses: softprops/action-gh-release@v1 + with: + files: build/${{ matrix.arch }}/libparcagpucupti-${{ matrix.arch }}.so diff --git a/.github/workflows/container.yml b/.github/workflows/container.yml deleted file mode 100644 index 7e21e01..0000000 --- a/.github/workflows/container.yml +++ /dev/null @@ -1,135 +0,0 @@ -name: Container Image Build - -on: - push: - branches: - - main - tags: - - 'v*' - pull_request: - branches: - - main - -env: - REGISTRY: ghcr.io - IMAGE_NAME: ${{ github.repository }} - CUDA_HEADERS_REGISTRY: ghcr.io/parca-dev/cuda-headers - -jobs: - build-and-push: - runs-on: ubuntu-latest - permissions: - contents: read - packages: write - - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v3 - - - name: Log in to Container Registry - if: github.event_name != 'pull_request' - uses: docker/login-action@v3 - with: - registry: ${{ env.REGISTRY }} - username: ${{ github.actor }} - password: ${{ secrets.GITHUB_TOKEN }} - - - name: Extract metadata (tags, labels) - id: meta - uses: docker/metadata-action@v5 - with: - images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }} - tags: | - # For releases: use semantic version tags - type=semver,pattern={{version}} - type=semver,pattern={{major}}.{{minor}} - # For main branch: use date and commit SHA - type=raw,value={{date 'YYYYMMDD'}}-{{sha}},enable={{is_default_branch}} - # Tag as latest for main branch - type=raw,value=latest,enable={{is_default_branch}} - - - name: Build and push multi-arch image with both CUDA versions - uses: docker/build-push-action@v5 - with: - context: . - file: ./Dockerfile - build-args: | - CUDA_12_HEADERS=${{ env.CUDA_HEADERS_REGISTRY }}:12 - CUDA_13_HEADERS=${{ env.CUDA_HEADERS_REGISTRY }}:13 - platforms: linux/amd64,linux/arm64 - target: runtime - push: ${{ github.event_name != 'pull_request' }} - tags: ${{ steps.meta.outputs.tags }} - labels: ${{ steps.meta.outputs.labels }} - cache-from: type=gha - cache-to: type=gha,mode=max - - extract-release-binaries: - # Only run this job for tagged releases - if: startsWith(github.ref, 'refs/tags/v') - needs: build-and-push - runs-on: ubuntu-latest - permissions: - contents: write - - strategy: - matrix: - include: - - cuda_major: "12" - cuda_full: "12.9.1" - arch: amd64 - platform: linux/amd64 - - cuda_major: "12" - cuda_full: "12.9.1" - arch: arm64 - platform: linux/arm64 - - cuda_major: "13" - cuda_full: "13.0.2" - arch: amd64 - platform: linux/amd64 - - cuda_major: "13" - cuda_full: "13.0.2" - arch: arm64 - platform: linux/arm64 - - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v3 - - - name: Build and extract binary for ${{ matrix.arch }} (CUDA ${{ matrix.cuda_major }}) - env: - LIB_NAME: libparcagpucupti.so.${{ matrix.cuda_major }} - run: | - mkdir -p build/${{ matrix.arch }} - docker buildx build -f Dockerfile \ - --build-arg CUDA_12_HEADERS=${{ env.CUDA_HEADERS_REGISTRY }}:12 \ - --build-arg CUDA_13_HEADERS=${{ env.CUDA_HEADERS_REGISTRY }}:13 \ - --target export-cuda${{ matrix.cuda_major }} \ - --output type=local,dest=build/${{ matrix.arch }} \ - --platform ${{ matrix.platform }} \ - . - - - name: Rename binary with arch and CUDA version suffix - env: - LIB_NAME: libparcagpucupti.so.${{ matrix.cuda_major }} - OUTPUT_NAME: libparcagpucupti.so.${{ matrix.cuda_major }}-${{ matrix.arch }} - run: | - mv build/${{ matrix.arch }}/${LIB_NAME} \ - build/${{ matrix.arch }}/${OUTPUT_NAME} - - - name: Upload binary as artifact - uses: actions/upload-artifact@v4 - with: - name: libparcagpucupti.so.${{ matrix.cuda_major }}-${{ matrix.arch }} - path: build/${{ matrix.arch }}/libparcagpucupti.so.${{ matrix.cuda_major }}-${{ matrix.arch }} - - - name: Upload to GitHub Release - uses: softprops/action-gh-release@v1 - with: - files: build/${{ matrix.arch }}/libparcagpucupti.so.${{ matrix.cuda_major }}-${{ matrix.arch }} diff --git a/.gitignore b/.gitignore index 07d7b12..1fb4839 100644 --- a/.gitignore +++ b/.gitignore @@ -1,9 +1,9 @@ -cupti/build-* build/ -.zig-cache/ -zig-out/ test/build/ test/bpf/vmlinux.h test/bpf/activity_parser test/bpf/activityparser_*.go -test/bpf/activityparser_*.o \ No newline at end of file +test/bpf/activityparser_*.o +src/probes.h +microbenchmarks/rapid_launch +microbenchmarks/pc_sample_toy diff --git a/.gitmodules b/.gitmodules index e7d7dd2..73f0eb4 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,3 +1,6 @@ [submodule "vendor/opentelemetry-ebpf-profiler"] path = vendor/opentelemetry-ebpf-profiler url = https://github.com/parca-dev/opentelemetry-ebpf-profiler.git +[submodule "proton"] + path = proton + url = https://github.com/parca-dev/proton.git diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 0000000..be64c53 --- /dev/null +++ b/CMakeLists.txt @@ -0,0 +1,184 @@ +cmake_minimum_required(VERSION 3.18) +project(parcagpu LANGUAGES C CXX) + +set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_STANDARD_REQUIRED ON) +set(CMAKE_EXPORT_COMPILE_COMMANDS ON) + +# Default to Release with debug symbols +if(NOT CMAKE_BUILD_TYPE) + set(CMAKE_BUILD_TYPE RelWithDebInfo) +endif() + +# Compiler flags for different build types +set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -g -O0") +set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} -g -O0") +set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} -g -O2") +set(CMAKE_C_FLAGS_RELWITHDEBINFO "${CMAKE_C_FLAGS_RELWITHDEBINFO} -g -O2") + +# Set proton directory (git submodule or Docker build) +if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/proton") + set(PROTON_DIR "${CMAKE_CURRENT_SOURCE_DIR}/proton") +elseif(EXISTS "/build/proton") + set(PROTON_DIR "/build/proton") +else() + message(FATAL_ERROR "Could not find proton directory. Expected at ${CMAKE_CURRENT_SOURCE_DIR}/proton or /build/proton") +endif() +message(STATUS "Using Proton from: ${PROTON_DIR}") + +# Find CUDA headers (allow fallback to manual path for header-only builds) +if(NOT DEFINED CUDA_INCLUDE_DIR) + # Prefer /usr/local/cuda over system-installed CUDA packages, which may + # ship older headers (e.g. CUDA 12.0) missing symbols that Proton needs. + if(NOT DEFINED CUDAToolkit_ROOT AND EXISTS "/usr/local/cuda/include/cuda.h") + set(CUDAToolkit_ROOT "/usr/local/cuda") + endif() + find_package(CUDAToolkit QUIET) + if(CUDAToolkit_FOUND) + set(CUDA_INCLUDE_DIR ${CUDAToolkit_INCLUDE_DIRS}) + else() + # Fallback for header-only builds (e.g., Docker with ghcr.io/parca-dev/cuda-headers) + set(CUDA_INCLUDE_DIR "/usr/local/cuda/include") + if(NOT EXISTS "${CUDA_INCLUDE_DIR}/cuda.h") + message(FATAL_ERROR "Could not find CUDA headers. Set CUDA_INCLUDE_DIR or install CUDA Toolkit.") + endif() + message(STATUS "Using CUDA headers from: ${CUDA_INCLUDE_DIR}") + endif() +endif() + +# Include directories from proton +include_directories( + ${PROTON_DIR}/csrc/include + ${PROTON_DIR}/common/include + ${CUDA_INCLUDE_DIR} + ${CMAKE_CURRENT_SOURCE_DIR}/src +) + +# Generate USDT probe header and object from probes.d +find_program(DTRACE dtrace REQUIRED) + +add_custom_command( + OUTPUT ${CMAKE_CURRENT_SOURCE_DIR}/src/probes.h + COMMAND ${DTRACE} -h -s ${CMAKE_CURRENT_SOURCE_DIR}/src/probes.d + -o ${CMAKE_CURRENT_SOURCE_DIR}/src/probes.h + DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/src/probes.d + COMMENT "Generating USDT probe header (probes.h)" +) + +add_custom_command( + OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/probes.o + COMMAND ${DTRACE} -G -s ${CMAKE_CURRENT_SOURCE_DIR}/src/probes.d + -o ${CMAKE_CURRENT_BINARY_DIR}/probes.o + DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/src/probes.d + COMMENT "Generating USDT probe object (probes.o)" +) + +# Collect proton driver source files we need +# Note: We only link driver APIs and callback utilities (no profiler dependencies) +set(PROTON_SOURCES + ${PROTON_DIR}/csrc/lib/Driver/GPU/CuptiApi.cpp + ${PROTON_DIR}/csrc/lib/Driver/GPU/CudaApi.cpp + ${PROTON_DIR}/csrc/lib/Profiler/Cupti/CuptiCallbacks.cpp +) + +# Build the shared library +add_library(parcagpucupti SHARED + src/cupti.cpp + src/pc_sampling.cpp + src/correlation_filter.cpp + src/env_config.cpp + ${PROTON_SOURCES} +) + +# Wire USDT probe artifacts into the build +add_custom_target(probes_generated + DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/src/probes.h + ${CMAKE_CURRENT_BINARY_DIR}/probes.o +) +add_dependencies(parcagpucupti probes_generated) +target_link_libraries(parcagpucupti ${CMAKE_CURRENT_BINARY_DIR}/probes.o) + +# Link against system libraries only (NOT libcupti - loaded dynamically by Proton) +target_link_libraries(parcagpucupti + dl + pthread +) + +# Set output directory +set_target_properties(parcagpucupti PROPERTIES + LIBRARY_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib" + OUTPUT_NAME "parcagpucupti" +) + +# Install target +install(TARGETS parcagpucupti + LIBRARY DESTINATION lib +) + +# Build test infrastructure +option(BUILD_TESTS "Build test infrastructure" ON) + +if(BUILD_TESTS) + # Mock CUDA library for testing (minimal - just cuDriverGetVersion) + add_library(cuda_mock SHARED + test/mock_cuda.c + ) + target_include_directories(cuda_mock PRIVATE + ${CUDA_INCLUDE_DIR} + ) + set_target_properties(cuda_mock PROPERTIES + OUTPUT_NAME "cuda" + LIBRARY_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib" + ) + + # Mock CUPTI library for testing + add_library(cupti_mock SHARED + test/mock_cupti.c + ) + target_include_directories(cupti_mock PRIVATE + ${CUDA_INCLUDE_DIR} + ) + set_target_properties(cupti_mock PROPERTIES + OUTPUT_NAME "cupti" + LIBRARY_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib" + ) + + # Test executable + add_executable(test_cupti_prof + test/test_cupti_prof.c + ) + target_include_directories(test_cupti_prof PRIVATE + ${CUDA_INCLUDE_DIR} + ) + # Link against mock libraries so their symbols are globally visible. + # This lets Proton's dlopen(RTLD_NOLOAD) find them, and lets the test + # access __cupti_runtime_api_callback etc. via dlsym(RTLD_DEFAULT). + # --no-as-needed forces the linker to keep them even though the test + # only accesses their symbols via dlsym, not direct references. + target_link_libraries(test_cupti_prof + dl + -Wl,--no-as-needed + cuda_mock + cupti_mock + -Wl,--as-needed + ) + set_target_properties(test_cupti_prof PROPERTIES + RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/bin" + ) + + # Create libcuda.so.1 symlink for test (Proton expects libcuda.so.1) + add_custom_command(TARGET cuda_mock POST_BUILD + COMMAND ${CMAKE_COMMAND} -E create_symlink + libcuda.so + ${CMAKE_BINARY_DIR}/lib/libcuda.so.1 + COMMENT "Creating libcuda.so.1 symlink" + ) + + # Create libcupti.so.12 symlink for test + add_custom_command(TARGET cupti_mock POST_BUILD + COMMAND ${CMAKE_COMMAND} -E create_symlink + libcupti.so + ${CMAKE_BINARY_DIR}/lib/libcupti.so.12 + COMMENT "Creating libcupti.so.12 symlink" + ) +endif() diff --git a/Dockerfile b/Dockerfile index b691c6e..b8eddd2 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,83 +1,47 @@ # Slim multi-platform build for libparcagpucupti.so # Uses pre-built CUDA header images instead of full CUDA development images # This significantly reduces build time and disk space requirements +# +# Thanks to Proton's dynamic CUPTI loading, we only need to build once +# and the library works with any CUDA version at runtime. -# CUDA header images (can be overridden at build time) -ARG CUDA_12_HEADERS=ghcr.io/parca-dev/cuda-headers:12 -ARG CUDA_13_HEADERS=ghcr.io/parca-dev/cuda-headers:13 +# CUDA header image (can be overridden at build time) +ARG CUDA_HEADERS=ghcr.io/parca-dev/cuda-headers:12 -# Import CUDA 12 headers -FROM ${CUDA_12_HEADERS} AS cuda12-headers +# Import CUDA headers +FROM ${CUDA_HEADERS} AS cuda-headers -# Import CUDA 13 headers -FROM ${CUDA_13_HEADERS} AS cuda13-headers - -# Build stage for CUDA 12 -FROM ubuntu:22.04 AS builder-cuda12 +# Build stage +FROM ubuntu:24.04 AS builder # Install only build tools (no CUDA toolkit needed) RUN apt-get update && apt-get install -y \ cmake \ make \ - gcc \ g++ \ systemtap-sdt-dev \ && rm -rf /var/lib/apt/lists/* -WORKDIR /build/cupti +WORKDIR /build -# Copy CUDA headers and libraries from header image -COPY --from=cuda12-headers /usr/local/cuda /usr/local/cuda +# Copy CUDA headers from header image +COPY --from=cuda-headers /usr/local/cuda /usr/local/cuda -# Copy source code -COPY cupti/cupti-prof.c cupti/correlation_filter.cpp cupti/correlation_filter.h cupti/CMakeLists.txt ./ +# Copy parcagpu source files and proton submodule +COPY src /build/src +COPY proton /build/proton +COPY CMakeLists.txt /build/ -# Build the library for CUDA 12 -ENV CUDA_ROOT=/usr/local/cuda +# Build the library (disable tests for Docker build) RUN mkdir -p build && \ cd build && \ - cmake -DCUDA_ROOT=${CUDA_ROOT} -DCMAKE_BUILD_TYPE=RelWithDebInfo .. && \ - make VERBOSE=1 && \ - mv libparcagpucupti.so libparcagpucupti.so.12 - -# Build stage for CUDA 13 -FROM ubuntu:22.04 AS builder-cuda13 - -# Install only build tools (no CUDA toolkit needed) -RUN apt-get update && apt-get install -y \ - cmake \ - make \ - gcc \ - g++ \ - systemtap-sdt-dev \ - && rm -rf /var/lib/apt/lists/* - -WORKDIR /build/cupti - -# Copy CUDA headers and libraries from header image -COPY --from=cuda13-headers /usr/local/cuda /usr/local/cuda - -# Copy source code -COPY cupti/cupti-prof.c cupti/correlation_filter.cpp cupti/correlation_filter.h cupti/CMakeLists.txt ./ - -# Build the library for CUDA 13 -ENV CUDA_ROOT=/usr/local/cuda -RUN mkdir -p build && \ - cd build && \ - cmake -DCUDA_ROOT=${CUDA_ROOT} .. && \ - make VERBOSE=1 && \ - mv libparcagpucupti.so libparcagpucupti.so.13 - -# Export stages for extracting single libraries (used by Makefile and release binaries) -FROM scratch AS export-cuda12 -COPY --from=builder-cuda12 /build/cupti/build/libparcagpucupti.so.12 / + cmake -DCUDA_INCLUDE_DIR=/usr/local/cuda/include -DBUILD_TESTS=OFF .. && \ + make -j$(nproc) -FROM scratch AS export-cuda13 -COPY --from=builder-cuda13 /build/cupti/build/libparcagpucupti.so.13 / +# Export stage for extracting the library (used by Makefile and release binaries) +FROM scratch AS export +COPY --from=builder /build/build/lib/libparcagpucupti.so / -# Runtime image with both CUDA versions (for container registry) +# Runtime image (for container registry) FROM busybox:latest AS runtime -COPY --from=builder-cuda12 /build/cupti/build/libparcagpucupti.so.12 /usr/lib/ -COPY --from=builder-cuda13 /build/cupti/build/libparcagpucupti.so.13 /usr/lib/ -# Default symlink points to CUDA 12 -RUN ln -s /usr/lib/libparcagpucupti.so.12 /usr/lib/libparcagpucupti.so +COPY --from=builder /build/build/lib/libparcagpucupti.so /usr/lib/libparcagpucupti.so diff --git a/Dockerfile.test b/Dockerfile.test index 4137314..65bf375 100644 --- a/Dockerfile.test +++ b/Dockerfile.test @@ -9,20 +9,22 @@ RUN apt-get update && apt-get install -y \ WORKDIR /test # Create directory structure -RUN mkdir -p cupti/build-amd64 zig-out/bin zig-out/lib +RUN mkdir -p build/bin build/lib # Copy the built library -COPY cupti/build-amd64/libparcagpucupti.so ./cupti/build-amd64/ +COPY build/lib/libparcagpucupti.so ./build/lib/ -# Copy the zig-built test files -COPY zig-out/bin/test_cupti_prof ./zig-out/bin/ -COPY zig-out/lib/libcupti.so ./zig-out/lib/ -COPY zig-out/lib/libcupti.so.12 ./zig-out/lib/ +# Copy the test files +COPY build/bin/test_cupti_prof ./build/bin/ +COPY build/lib/libcuda.so ./build/lib/ +COPY build/lib/libcuda.so.1 ./build/lib/ +COPY build/lib/libcupti.so ./build/lib/ +COPY build/lib/libcupti.so.12 ./build/lib/ # Set environment -ENV LD_LIBRARY_PATH=/test/zig-out/lib +ENV LD_LIBRARY_PATH=/test/build/lib ENV PARCAGPU_DEBUG=1 # Run tests by default -ENTRYPOINT ["/test/zig-out/bin/test_cupti_prof", "/test/cupti/build-amd64/libparcagpucupti.so"] +ENTRYPOINT ["/test/build/bin/test_cupti_prof", "/test/build/lib/libparcagpucupti.so"] CMD [] diff --git a/Makefile b/Makefile index fedd0c8..f64bb34 100644 --- a/Makefile +++ b/Makefile @@ -1,153 +1,90 @@ -.PHONY: all clean test cupti-amd64 cupti-arm64 cupti-all cupti-all-versions cross test-infra docker-push push-cuda-headers docker-test-build docker-test-run format bpf-test test-multi +.PHONY: all clean test build-amd64 build-arm64 build-all cross docker-push docker-test-build docker-test-run format local debug bpf-test microbenchmarks test-multi test-pc-real test-pc-mock -# CUDA version configuration -CUDA_MAJOR ?= 12 -CUDA_FULL_VERSION ?= 12.9.1 -LIB_NAME = libparcagpucupti.so.$(CUDA_MAJOR) +LIB_NAME = libparcagpucupti.so -# Default target: build all CUDA versions (12 & 13) for both architectures and test infrastructure -all: cupti-all-versions test-infra - -local: - mkdir -p build && \ - cmake -DCMAKE_BUILD_TYPE=RelWithDebInfo -B build -S cupti && \ - cd build && make -j$(nproc) +# Default target: build for both architectures +all: build-all # Build libparcagpucupti.so for AMD64 using Docker -cupti-amd64: - @echo "=== Building $(LIB_NAME) for AMD64 with Docker (CUDA $(CUDA_MAJOR)) ===" +build-amd64: + @echo "=== Building $(LIB_NAME) for AMD64 with Docker ===" @mkdir -p /tmp/parcagpu-build-amd64 @docker buildx create --name parcagpu-builder --use --bootstrap 2>/dev/null || docker buildx use parcagpu-builder @docker buildx build -f Dockerfile \ - --build-arg CUDA_12_HEADERS=$(CUDA_12_HEADERS) \ - --build-arg CUDA_13_HEADERS=$(CUDA_13_HEADERS) \ - --target export-cuda$(CUDA_MAJOR) \ + --target export \ --output type=local,dest=/tmp/parcagpu-build-amd64 \ --platform linux/amd64 . - @mkdir -p build/$(CUDA_MAJOR)/amd64 - @cp /tmp/parcagpu-build-amd64/$(LIB_NAME) build/$(CUDA_MAJOR)/amd64/ - @ln -sf $(LIB_NAME) build/$(CUDA_MAJOR)/amd64/libparcagpucupti.so - @echo "AMD64 library built: build/$(CUDA_MAJOR)/amd64/$(LIB_NAME)" + @mkdir -p build/amd64 + @cp /tmp/parcagpu-build-amd64/$(LIB_NAME) build/amd64/ + @echo "AMD64 library built: build/amd64/$(LIB_NAME)" # Build libparcagpucupti.so for ARM64 using Docker -cupti-arm64: - @echo "=== Building $(LIB_NAME) for ARM64 with Docker (CUDA $(CUDA_MAJOR)) ===" +build-arm64: + @echo "=== Building $(LIB_NAME) for ARM64 with Docker ===" @mkdir -p /tmp/parcagpu-build-arm64 @docker buildx create --name parcagpu-builder --use --bootstrap 2>/dev/null || docker buildx use parcagpu-builder @docker buildx build -f Dockerfile \ - --build-arg CUDA_12_HEADERS=$(CUDA_12_HEADERS) \ - --build-arg CUDA_13_HEADERS=$(CUDA_13_HEADERS) \ - --target export-cuda$(CUDA_MAJOR) \ + --target export \ --output type=local,dest=/tmp/parcagpu-build-arm64 \ --platform linux/arm64 . - @mkdir -p build/$(CUDA_MAJOR)/arm64 - @cp /tmp/parcagpu-build-arm64/$(LIB_NAME) build/$(CUDA_MAJOR)/arm64/ - @ln -sf $(LIB_NAME) build/$(CUDA_MAJOR)/arm64/libparcagpucupti.so - @echo "ARM64 library built: build/$(CUDA_MAJOR)/arm64/$(LIB_NAME)" - -# Build both architectures for current CUDA version (controlled by CUDA_MAJOR variable) -# Example: make cupti-all CUDA_MAJOR=13 -cupti-all: cupti-amd64 cupti-arm64 - -# Build all local artifacts: CUDA 12 & 13 for both amd64 and arm64 -# This extracts the .so files to build/12/{amd64,arm64}/ and build/13/{amd64,arm64}/ -# Use this target when you want local build artifacts for testing -cupti-all-versions: - @echo "=== Building all CUDA versions (12 and 13) for both architectures ===" - @$(MAKE) cupti-amd64 CUDA_MAJOR=12 CUDA_FULL_VERSION=12.9.1 - @$(MAKE) cupti-arm64 CUDA_MAJOR=12 CUDA_FULL_VERSION=12.9.1 - @$(MAKE) cupti-amd64 CUDA_MAJOR=13 CUDA_FULL_VERSION=13.0.2 - @$(MAKE) cupti-arm64 CUDA_MAJOR=13 CUDA_FULL_VERSION=13.0.2 + @mkdir -p build/arm64 + @cp /tmp/parcagpu-build-arm64/$(LIB_NAME) build/arm64/ + @echo "ARM64 library built: build/arm64/$(LIB_NAME)" + +# Build both architectures +build-all: build-amd64 build-arm64 @echo "=== All artifacts built ===" - @echo "CUDA 12 amd64: build/12/amd64/libparcagpucupti.so.12" - @echo "CUDA 12 arm64: build/12/arm64/libparcagpucupti.so.12" - @echo "CUDA 13 amd64: build/13/amd64/libparcagpucupti.so.13" - @echo "CUDA 13 arm64: build/13/arm64/libparcagpucupti.so.13" + @echo "AMD64: build/amd64/$(LIB_NAME)" + @echo "ARM64: build/arm64/$(LIB_NAME)" -# Build runtime container image with both CUDA versions for both architectures -# Note: This builds a Docker image but doesn't extract local artifacts or load into Docker +# Build runtime container image for both architectures # Multi-platform images stay in buildx cache. Use docker-push to push to registry. cross: - @echo "=== Building runtime container for AMD64 and ARM64 (includes CUDA 12 and 13) ===" + @echo "=== Building runtime container for AMD64 and ARM64 ===" @docker buildx create --name parcagpu-builder --use --bootstrap 2>/dev/null || docker buildx use parcagpu-builder @docker buildx build -f Dockerfile \ - --build-arg CUDA_12_HEADERS=$(CUDA_12_HEADERS) \ - --build-arg CUDA_13_HEADERS=$(CUDA_13_HEADERS) \ --target runtime \ --platform linux/amd64,linux/arm64 \ . @echo "Runtime container built for both platforms (cached, not loaded into Docker)" -# CUDA header image configuration -# Can be overridden to use local images (e.g., make cupti-all CUDA_12_HEADERS=cuda-headers:12) -CUDA_HEADERS_REGISTRY ?= ghcr.io/parca-dev/cuda-headers -CUDA_12_HEADERS ?= $(CUDA_HEADERS_REGISTRY):12 -CUDA_13_HEADERS ?= $(CUDA_HEADERS_REGISTRY):13 - -# Build and push CUDA header images to registry -# These are lightweight images (~35MB each) containing only CUDA headers and libcupti -# Note: Only needs to be run manually when: -# - CUDA versions are updated (12.9.1 -> 12.x.x, 13.0.2 -> 13.x.x) -# - New CUDA major versions are added -# - CUPTI API changes require header updates -push-cuda-headers: - @echo "=== Building and pushing CUDA header images ===" - @docker buildx create --name parcagpu-builder --use --bootstrap 2>/dev/null || docker buildx use parcagpu-builder - @echo "Building CUDA 12 headers..." - @docker buildx build -f Dockerfile.cuda-headers \ - --build-arg CUDA_VERSION=12.9.1 \ - --platform linux/amd64,linux/arm64 \ - --tag $(CUDA_HEADERS_REGISTRY):12 \ - --push \ - . - @echo "Building CUDA 13 headers..." - @docker buildx build -f Dockerfile.cuda-headers \ - --build-arg CUDA_VERSION=13.0.2 \ - --platform linux/amd64,linux/arm64 \ - --tag $(CUDA_HEADERS_REGISTRY):13 \ - --push \ - . - @echo "CUDA header images pushed to $(CUDA_HEADERS_REGISTRY):12 and :13" - -# Build test infrastructure with CMake -test-infra: - @echo "=== Building test infrastructure with CMake ===" - @mkdir -p test/build - @cd test/build && cmake .. -DCUDA_MAJOR_VERSION=$(CUDA_MAJOR) && make - -# Run tests (using AMD64 library) -test: cupti-amd64 test-infra - @./test.sh - -# Run advanced test (8 GPUs @ 2500 launches/s, multi-threaded) -test-advanced: cupti-amd64 test-infra - @echo "=== Running advanced test (8 GPUs @ 2500 launches/s) ===" - @cd test/build && LD_LIBRARY_PATH=.:$$LD_LIBRARY_PATH \ - ./test_cupti_prof ../../build/$(CUDA_MAJOR)/amd64/libparcagpucupti.so \ - --threads=4 --num-gpus=8 --launch-rate=2500 --duration=10 +# Local build with CMake (for development/testing) - default is Release with symbols +local: + @echo "=== Building locally with CMake (RelWithDebInfo) ===" + @cmake -B build-local -S . -DCMAKE_BUILD_TYPE=RelWithDebInfo + @cmake --build build-local + @echo "Local build complete: build-local/lib/$(LIB_NAME)" + +# Debug build with CMake (full debug, no optimizations) +debug: + @echo "=== Building debug version with CMake ===" + @cmake -B build-local -S . -DCMAKE_BUILD_TYPE=Debug + @cmake --build build-local + @echo "Debug build complete: build-local/lib/$(LIB_NAME)" + +# Run local tests +test: local + @echo "=== Running tests ===" + @LD_LIBRARY_PATH="$(CURDIR)/build-local/lib:$$LD_LIBRARY_PATH" \ + ./build-local/bin/test_cupti_prof build-local/lib/libparcagpucupti.so --duration=5 # Clean build artifacts clean: @echo "=== Cleaning build artifacts ===" - @rm -rf cupti/build cupti/build-amd64 cupti/build-arm64 build - @rm -rf test/build - @rm -f test/bpf/activity_parser test/bpf/activityparser_*.go test/bpf/activityparser_*.o + @rm -rf build build-local bin lib zig-out .zig-cache + @rm -rf CMakeCache.txt CMakeFiles/ cmake_install.cmake compile_commands.json @echo "Clean complete" # Build and push multi-arch Docker images to ghcr.io # Set IMAGE_TAG to override the default tag (e.g., make docker-push IMAGE_TAG=v1.0.0) # Set IMAGE to override the image name (e.g., make docker-push IMAGE=ghcr.io/myuser/parcagpu) -# Set CUDA_12_HEADERS and CUDA_13_HEADERS to override header images (e.g., cuda-headers:12 for local) -# Note: Runtime image includes both CUDA 12 and 13 IMAGE ?= ghcr.io/parca-dev/parcagpu IMAGE_TAG ?= latest docker-push: @echo "=== Setting up buildx builder ===" @docker buildx create --name parcagpu-builder --use --bootstrap 2>/dev/null || docker buildx use parcagpu-builder - @echo "=== Building and pushing multi-arch Docker images to $(IMAGE):$(IMAGE_TAG) (includes CUDA 12 and 13) ===" + @echo "=== Building and pushing multi-arch Docker images to $(IMAGE):$(IMAGE_TAG) ===" @docker buildx build -f Dockerfile \ - --build-arg CUDA_12_HEADERS=$(CUDA_12_HEADERS) \ - --build-arg CUDA_13_HEADERS=$(CUDA_13_HEADERS) \ --target runtime \ --platform linux/amd64,linux/arm64 \ --tag $(IMAGE):$(IMAGE_TAG) \ @@ -156,7 +93,7 @@ docker-push: @echo "Images pushed successfully to $(IMAGE):$(IMAGE_TAG)" # Build test container image -docker-test-build: cupti-amd64 test-infra +docker-test-build: build-amd64 @echo "=== Building test container image ===" @docker build -f Dockerfile.test -t parcagpu-test:latest . @echo "Test container built: parcagpu-test:latest" @@ -167,6 +104,17 @@ docker-test-run: docker-test-build @echo "=== Running tests in container ===" @docker run --rm parcagpu-test:latest $(ARGS) +# Build microbenchmark CUDA toys (with DWARF debug info for cubin symbolization) +NVCC ?= nvcc +CUDA_ARCH ?= native +MICROBENCH_SRCS := $(wildcard microbenchmarks/*.cu) +MICROBENCH_BINS := $(MICROBENCH_SRCS:.cu=) + +microbenchmarks: $(MICROBENCH_BINS) + +microbenchmarks/%: microbenchmarks/%.cu + $(NVCC) -g -lineinfo -arch=$(CUDA_ARCH) -o $@ $< + # Build the BPF activity parser test program # Requires: clang, libbpf-dev, bpftool (for vmlinux.h), Go 1.21+ bpf-test: @@ -175,21 +123,20 @@ bpf-test: echo "Generating vmlinux.h from kernel BTF..."; \ bpftool btf dump file /sys/kernel/btf/vmlinux format c > test/bpf/vmlinux.h; \ fi - @cd test/bpf && go generate ./... && CGO_ENABLED=0 go build -o activity_parser . + @cd test/bpf && \ + export USDT_HEADERS=$$(go mod download github.com/parca-dev/usdt >/dev/null && \ + go list -m -f '{{.Dir}}' github.com/parca-dev/usdt)/ebpf && \ + go generate ./... && CGO_ENABLED=0 go build -o activity_parser . @echo "BPF test built: test/bpf/activity_parser" # Run test_cupti_prof and BPF activity parser in parallel. -# The BPF test attaches to the test program's PID and logs kernel activities. +# The BPF test attaches to the activity_batch USDT probe and logs kernel activities. # Requires root (sudo) for BPF. -test-multi: cupti-amd64 test-infra bpf-test +test-multi: local bpf-test @echo "=== Running test with BPF activity parser ===" - @case "$$(uname -m)" in \ - aarch64|arm64) ARCH=arm64 ;; \ - *) ARCH=amd64 ;; \ - esac; \ - LIB_PATH="build/$(CUDA_MAJOR)/$${ARCH}/libparcagpucupti.so"; \ - export LD_LIBRARY_PATH="$$(pwd)/test/build:$$LD_LIBRARY_PATH"; \ - test/build/test_cupti_prof "$${LIB_PATH}" --kernel-names=kernel_names.txt --duration=10 & \ + @LIB_PATH="build-local/lib/libparcagpucupti.so"; \ + export LD_LIBRARY_PATH="$(CURDIR)/build-local/lib:$$LD_LIBRARY_PATH"; \ + ./build-local/bin/test_cupti_prof "$${LIB_PATH}" --kernel-names=kernel_names.txt --duration=10 & \ TEST_PID=$$!; \ sleep 1; \ echo "test_cupti_prof PID: $${TEST_PID}"; \ @@ -202,5 +149,15 @@ test-multi: cupti-amd64 test-infra bpf-test sudo kill $${BPF_PID} 2>/dev/null; wait $${BPF_PID} 2>/dev/null; \ echo "=== test-multi completed (test exit: $${TEST_EXIT}) ===" +# Run pc_sample_toy with BPF activity parser and verify stall reason map is received. +# Requires: real GPU, root (sudo) for BPF, pc_sample_toy compiled separately. +test-pc-real: local bpf-test microbenchmarks + sudo -E test/test-pc-real.sh + +# Mock PC sampling test — no GPU required, uses mock CUPTI/CUDA. +test-pc-mock: local bpf-test + sudo -E test/test-pc-mock.sh + format: - clang-format -i -style=file cupti/*.[ch] + @echo "=== Formatting source files ===" + @clang-format -i -style=file src/*.cpp src/*.h test/*.c diff --git a/README_TEST.md b/README_TEST.md index 1c90651..f797556 100644 --- a/README_TEST.md +++ b/README_TEST.md @@ -13,9 +13,9 @@ make test ``` This builds: -- `cupti/build/libparcagpucupti.so` - Production library (CMake + real CUPTI) -- `zig-out/lib/libcupti.so` - Mock CUPTI for test infrastructure -- `zig-out/bin/test_cupti_prof` - Test program +- `build/lib/libparcagpucupti.so` - Production library (CMake) +- `build/lib/libcupti.so` - Mock CUPTI for test infrastructure +- `build/bin/test_cupti_prof` - Test program ## Quick Start @@ -39,7 +39,7 @@ This will: For extended testing or continuous probe monitoring: ```bash -LD_LIBRARY_PATH=zig-out/lib zig-out/bin/test_cupti_prof zig-out/lib/libparcagpucupti.so --forever +LD_LIBRARY_PATH=build/lib build/bin/test_cupti_prof build/lib/libparcagpucupti.so --forever ``` This runs indefinitely at 1000 events/second until interrupted (Ctrl-C). diff --git a/bpft_test.sh b/bpft_test.sh new file mode 100755 index 0000000..2c2a285 --- /dev/null +++ b/bpft_test.sh @@ -0,0 +1,22 @@ +#!/bin/bash +set -eu + +SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" +LIB="$SCRIPT_DIR/build-local/lib/libparcagpucupti.so" + +if [ ! -f "$LIB" ]; then + echo "error: $LIB not found" >&2 + exit 1 +fi + +export PARCAGPU_DEBUG=1 +PARCAGPU_SAMPLING_FACTOR=18 CUDA_INJECTION64_PATH="$LIB" "$SCRIPT_DIR/microbenchmarks/pc_sample_toy" 4 & +TOY_PID=$! +trap "kill $TOY_PID 2>/dev/null; wait $TOY_PID 2>/dev/null" EXIT + +# Wait for the injection library to be loaded (CUDA must initialize first) +while kill -0 "$TOY_PID" 2>/dev/null && ! grep -q libparcagpucupti "/proc/$TOY_PID/maps" 2>/dev/null; do + sleep 0.1 +done + +exec bpftrace -p "$TOY_PID" "$SCRIPT_DIR/parcagpu.bt" diff --git a/cupti/CMakeLists.txt b/cupti/CMakeLists.txt deleted file mode 100644 index ead5ead..0000000 --- a/cupti/CMakeLists.txt +++ /dev/null @@ -1,68 +0,0 @@ -cmake_minimum_required(VERSION 3.18) -project(parcagpucupti C CXX) - -# CUDA root directory (can be overridden via -DCUDA_ROOT=...) -if(NOT DEFINED CUDA_ROOT) - set(CUDA_ROOT "/usr/local/cuda" CACHE PATH "CUDA installation directory") -endif() - -# CUDA library directory -if(NOT DEFINED CUDA_LIBDIR) - set(CUDA_LIBDIR "${CUDA_ROOT}/lib64" CACHE PATH "CUDA library directory") -endif() - -# Set include directories for CUDA headers -set(CUDAToolkit_INCLUDE_DIRS - "${CUDA_ROOT}/include" - "${CUDA_ROOT}/extras/CUPTI/include" -) - -# Add CUDA library directory to link directories -link_directories(${CUDA_LIBDIR}) - -# Create shared library with both C and C++ sources -add_library(parcagpucupti SHARED cupti-prof.c correlation_filter.cpp) - -# Set properties -set_target_properties(parcagpucupti PROPERTIES - C_STANDARD 11 - C_STANDARD_REQUIRED ON - CXX_STANDARD 17 - CXX_STANDARD_REQUIRED ON - POSITION_INDEPENDENT_CODE ON -) - -# Add debug symbols and disable C++ exceptions -target_compile_options(parcagpucupti PRIVATE -g -fno-exceptions) - -# Include directories -target_include_directories(parcagpucupti PRIVATE - ${CUDAToolkit_INCLUDE_DIRS} -) - -# systemtap headers are now in /usr/include/sys/ (copied in Dockerfile for cross-compilation) - -# Find the CUPTI library explicitly -find_library(CUPTI_LIBRARY - NAMES cupti - PATHS ${CUDA_LIBDIR} - REQUIRED -) - -# Find the CUDA driver library -find_library(CUDA_LIBRARY - NAMES cuda - PATHS ${CUDA_LIBDIR} ${CUDA_LIBDIR}/stubs - REQUIRED -) - -# Link libraries -target_link_libraries(parcagpucupti PRIVATE - ${CUPTI_LIBRARY} - ${CUDA_LIBRARY} -) - -# Installation -install(TARGETS parcagpucupti - LIBRARY DESTINATION lib -) \ No newline at end of file diff --git a/cupti/correlation_filter.cpp b/cupti/correlation_filter.cpp deleted file mode 100644 index 8d48dfa..0000000 --- a/cupti/correlation_filter.cpp +++ /dev/null @@ -1,240 +0,0 @@ -#include "correlation_filter.h" -#include -#include -#include -#include - -// CorrelationFilter implementation using std::unordered_set with mutex protection -// This provides thread-safe access with minimal overhead for our use case -class CorrelationFilter { -public: - CorrelationFilter() = default; - - // Insert a correlation ID into the filter - // Thread-safe - void insert(uint32_t correlation_id) { - std::lock_guard lock(mutex_); - set_.insert(correlation_id); - } - - // Check if correlation ID exists and remove it atomically - // Returns true if found and removed, false if not found - // Thread-safe - bool check_and_remove(uint32_t correlation_id) { - std::lock_guard lock(mutex_); - auto it = set_.find(correlation_id); - if (it != set_.end()) { - set_.erase(it); - return true; - } - return false; - } - - // Get current size - // Thread-safe - size_t size() const { - std::lock_guard lock(mutex_); - return set_.size(); - } - -private: - std::unordered_set set_; - mutable std::mutex mutex_; -}; - -// GraphCorrelationMap implementation for tracking graph launches across buffer cycles -// Uses a 2-slot state machine per correlation ID to detect when graph launches are complete -struct GraphCorrelationEntry { - uint8_t state[2]; // State for alternating cycles - bool ever_seen_kernel; // True once we've seen at least one kernel activity - uint32_t insertion_cycle; // Buffer cycle when entry was created (for fallback cleanup) - - GraphCorrelationEntry(uint32_t cycle) - : state{GRAPH_STATE_UNINITIALIZED, GRAPH_STATE_UNINITIALIZED} - , ever_seen_kernel(false) - , insertion_cycle(cycle) {} -}; - -class GraphCorrelationMap { -public: - GraphCorrelationMap() : current_cycle_(0) {} - - // Insert a correlation ID (called when sampling a graph launch) - // Thread-safe - void insert(uint32_t correlation_id) { - std::lock_guard lock(mutex_); - map_.emplace(correlation_id, GraphCorrelationEntry(current_cycle_)); - } - - // Start a new processing cycle - clear the appropriate slot for all entries - // Thread-safe - void cycle_start(uint32_t cycle) { - std::lock_guard lock(mutex_); - current_cycle_ = cycle; - uint32_t slot = cycle % 2; - for (auto& pair : map_) { - pair.second.state[slot] = GRAPH_STATE_CYCLE_CLEARED; - } - } - - // Check if correlation ID is tracked and mark as seen for this cycle - // Returns true if tracked (should fire probe) - // Thread-safe - bool check_and_mark_seen(uint32_t correlation_id, uint32_t cycle) { - std::lock_guard lock(mutex_); - auto it = map_.find(correlation_id); - if (it != map_.end()) { - uint32_t slot = cycle % 2; - it->second.state[slot] = GRAPH_STATE_KERNEL_SEEN; - it->second.ever_seen_kernel = true; // Mark that we've seen at least one kernel - return true; - } - return false; - } - - // End processing cycle - remove entries based on two conditions: - // 1. Primary: Both slots CYCLE_CLEARED AND we've seen at least one kernel (graph completed) - // 2. Fallback: Both slots CYCLE_CLEARED AND never seen kernel AND age > 100 cycles - // (handles GPU reset, failed launches, etc.) - // Thread-safe - void cycle_end() { - std::lock_guard lock(mutex_); - size_t removed_normal = 0; - size_t removed_fallback = 0; - - for (auto it = map_.begin(); it != map_.end(); ) { - bool should_remove = false; - bool is_fallback = false; - - if (it->second.state[0] == GRAPH_STATE_CYCLE_CLEARED && - it->second.state[1] == GRAPH_STATE_CYCLE_CLEARED) { - - if (it->second.ever_seen_kernel) { - // Primary: Graph completed normally (saw kernels, then stopped) - should_remove = true; - removed_normal++; - } else if ((current_cycle_ - it->second.insertion_cycle) > 100) { - // Fallback: Never saw kernels and entry is very old (>100 cycles) - // Prevents leaking entries when GPU resets or launches fail - should_remove = true; - is_fallback = true; - removed_fallback++; - } - } - - if (should_remove) { - it = map_.erase(it); - } else { - ++it; - } - } - } - - // Get cleanup stats (for debugging) - void get_stats(size_t& size, size_t& oldest_age) const { - std::lock_guard lock(mutex_); - size = map_.size(); - oldest_age = 0; - for (const auto& pair : map_) { - uint32_t age = current_cycle_ - pair.second.insertion_cycle; - if (age > oldest_age) { - oldest_age = age; - } - } - } - - // Get current size - // Thread-safe - size_t size() const { - std::lock_guard lock(mutex_); - return map_.size(); - } - -private: - std::unordered_map map_; - uint32_t current_cycle_; - mutable std::mutex mutex_; -}; - -// C API implementation -extern "C" { - -CorrelationFilterHandle correlation_filter_create(void) { - return new CorrelationFilter(); -} - -void correlation_filter_destroy(CorrelationFilterHandle filter) { - if (filter) { - delete static_cast(filter); - } -} - -void correlation_filter_insert(CorrelationFilterHandle filter, uint32_t correlation_id) { - if (filter) { - static_cast(filter)->insert(correlation_id); - } -} - -bool correlation_filter_check_and_remove(CorrelationFilterHandle filter, uint32_t correlation_id) { - if (filter) { - return static_cast(filter)->check_and_remove(correlation_id); - } - return false; -} - -size_t correlation_filter_size(CorrelationFilterHandle filter) { - if (filter) { - return static_cast(filter)->size(); - } - return 0; -} - -GraphCorrelationMapHandle graph_correlation_map_create(void) { - return new GraphCorrelationMap(); -} - -void graph_correlation_map_destroy(GraphCorrelationMapHandle map) { - if (map) { - delete static_cast(map); - } -} - -void graph_correlation_map_insert(GraphCorrelationMapHandle map, uint32_t correlation_id) { - if (map) { - static_cast(map)->insert(correlation_id); - } -} - -void graph_correlation_map_cycle_start(GraphCorrelationMapHandle map, uint32_t cycle) { - if (map) { - static_cast(map)->cycle_start(cycle); - } -} - -bool graph_correlation_map_check_and_mark_seen(GraphCorrelationMapHandle map, uint32_t correlation_id, uint32_t cycle) { - if (map) { - return static_cast(map)->check_and_mark_seen(correlation_id, cycle); - } - return false; -} - -void graph_correlation_map_cycle_end(GraphCorrelationMapHandle map) { - if (map) { - static_cast(map)->cycle_end(); - } -} - -size_t graph_correlation_map_size(GraphCorrelationMapHandle map) { - if (map) { - return static_cast(map)->size(); - } - return 0; -} - -void graph_correlation_map_get_stats(GraphCorrelationMapHandle map, size_t* size, size_t* oldest_age) { - if (map && size && oldest_age) { - static_cast(map)->get_stats(*size, *oldest_age); - } -} - -} // extern "C" diff --git a/cupti/correlation_filter.h b/cupti/correlation_filter.h deleted file mode 100644 index 53a2c78..0000000 --- a/cupti/correlation_filter.h +++ /dev/null @@ -1,83 +0,0 @@ -#pragma once - -#include -#include -#include - -#ifdef __cplusplus -extern "C" { -#endif - -// Opaque handle to correlation filter -typedef void *CorrelationFilterHandle; - -// Create a new correlation filter -CorrelationFilterHandle correlation_filter_create(void); - -// Destroy the correlation filter -void correlation_filter_destroy(CorrelationFilterHandle filter); - -// Insert a correlation ID into the filter -// Thread-safe: can be called from multiple threads concurrently -void correlation_filter_insert(CorrelationFilterHandle filter, - uint32_t correlation_id); - -// Check if a correlation ID exists and remove it if found -// Returns true if the correlation ID was found and removed, false otherwise -// Thread-safe: safe to call concurrently with inserts -bool correlation_filter_check_and_remove(CorrelationFilterHandle filter, - uint32_t correlation_id); - -// Get the current size of the filter (number of tracked correlation IDs) -// Note: This is an approximate count in concurrent scenarios -size_t correlation_filter_size(CorrelationFilterHandle filter); - -// Graph correlation state values -enum GraphCorrelationState { - GRAPH_STATE_UNINITIALIZED = 0, // Entry just created, slot not yet processed - GRAPH_STATE_CYCLE_CLEARED = 1, // Cycle started, no kernels seen yet - GRAPH_STATE_KERNEL_SEEN = 2 // At least one kernel seen this cycle -}; - -// Opaque handle to graph correlation map -typedef void *GraphCorrelationMapHandle; - -// Create a new graph correlation map -GraphCorrelationMapHandle graph_correlation_map_create(void); - -// Destroy the graph correlation map -void graph_correlation_map_destroy(GraphCorrelationMapHandle map); - -// Insert a correlation ID into the map (called when sampling a graph launch) -// Thread-safe: can be called from multiple threads concurrently -void graph_correlation_map_insert(GraphCorrelationMapHandle map, - uint32_t correlation_id); - -// Start a new processing cycle - clears the appropriate slot for all entries -// Thread-safe -void graph_correlation_map_cycle_start(GraphCorrelationMapHandle map, - uint32_t cycle); - -// Check if correlation ID should fire probe and mark as seen for this cycle -// Returns true if the correlation ID is tracked (should fire probe) -// Thread-safe -bool graph_correlation_map_check_and_mark_seen(GraphCorrelationMapHandle map, - uint32_t correlation_id, - uint32_t cycle); - -// End processing cycle - removes entries that haven't seen kernels in 2 -// consecutive cycles Thread-safe -void graph_correlation_map_cycle_end(GraphCorrelationMapHandle map); - -// Get the current size of the map (number of tracked correlation IDs) -// Note: This is an approximate count in concurrent scenarios -size_t graph_correlation_map_size(GraphCorrelationMapHandle map); - -// Get statistics about the map (for debugging) -// Returns the current size and age of the oldest entry (in cycles) -void graph_correlation_map_get_stats(GraphCorrelationMapHandle map, - size_t *size, size_t *oldest_age); - -#ifdef __cplusplus -} -#endif diff --git a/cupti/cupti-prof.c b/cupti/cupti-prof.c deleted file mode 100644 index 11b61b0..0000000 --- a/cupti/cupti-prof.c +++ /dev/null @@ -1,575 +0,0 @@ -#define _POSIX_C_SOURCE 199309L -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -#include "correlation_filter.h" - -// Debug logging control -static bool debug_enabled = false; - -// Activity buffer management -// A kernel activity is around 224 bytes so a 128kb buffer -// will hold ~500 activities, we want to flush regularly since -// we are a continuous profiler so we don't need a huge buffer -// like most CUPTI profilers. Also a small size avoid malloc -// just going to mmap every time so the allocator should cache -// and re-use these for us. -static size_t activityBufferSize = 128 * 1024; - -// Global variables -static CUpti_SubscriberHandle subscriber = 0; - -static size_t outstandingEvents = 0; - -// Thread-local tracking: store correlation ID from runtime ENTER -// so we can skip driver EXIT probe when it matches (driver calls happen under -// runtime calls) -static __thread uint32_t runtimeEnterCorrelationId = 0; - -// Rate limiting - token bucket algorithm (configurable via PARCAGPU_RATE_LIMIT) -static double rateLimitPerSec = 100.0; - -// Thread-local token bucket state -static __thread uint64_t lastRefillNs = 0; -static __thread double tokens = 0; - -// Returns true if the sample should be emitted, false if rate limited -static bool rateLimiterTryAcquire(uint64_t nowNs) { - // Refill tokens based on elapsed time - if (lastRefillNs > 0) { - double elapsedSec = (nowNs - lastRefillNs) / 1e9; - tokens = tokens + elapsedSec * rateLimitPerSec; - if (tokens > rateLimitPerSec) { - tokens = rateLimitPerSec; - } - } else { - tokens = rateLimitPerSec; // Start with full bucket - } - lastRefillNs = nowNs; - - if (tokens >= 1.0) { - tokens -= 1.0; - return true; - } - return false; -} - -// Correlation ID filter (for regular kernel launches) -static CorrelationFilterHandle correlationFilter = NULL; - -// Graph correlation map (for graph launches with multiple kernels per -// correlation ID) -static GraphCorrelationMapHandle graphCorrelationMap = NULL; - -// Buffer processing cycle counter (for graph map state machine) -static uint32_t bufferCycle = 0; - -static void init_debug(void) { - static bool initialized = false; - if (!initialized) { - debug_enabled = getenv("PARCAGPU_DEBUG") != NULL; - const char *rateEnv = getenv("PARCAGPU_RATE_LIMIT"); - if (rateEnv != NULL) { - double rate = atof(rateEnv); - if (rate > 0) { - rateLimitPerSec = rate; - } - } - initialized = true; - } -} - -#define DEBUG_PRINTF(...) \ - do { \ - init_debug(); \ - if (debug_enabled) { \ - struct timespec ts; \ - clock_gettime(CLOCK_REALTIME, &ts); \ - printf("[%ld.%09ld] ", ts.tv_sec, ts.tv_nsec); \ - printf(__VA_ARGS__); \ - } \ - } while (0) - -// Forward declarations -static void parcagpuCuptiCallback(void *userdata, CUpti_CallbackDomain domain, - CUpti_CallbackId cbid, - const CUpti_CallbackData *cbdata); -static void parcagpuBufferRequested(uint8_t **buffer, size_t *size, - size_t *maxNumRecords); -static void parcagpuBufferCompleted(CUcontext ctx, uint32_t streamId, - uint8_t *buffer, size_t size, - size_t validSize); - -void cleanup(void); - -// CUPTI initialization function required for CUDA_INJECTION64_PATH -int InitializeInjection(void) { - DEBUG_PRINTF("[CUPTI] InitializeInjection called\n"); - CUptiResult result; - - // Set flush period BEFORE enabling activities (in milliseconds) - // Try a larger value like 1000ms (1 second) for better compatibility - result = cuptiActivityFlushPeriod(1000); - if (result != CUPTI_SUCCESS) { - const char *errstr; - cuptiGetResultString(result, &errstr); - fprintf(stderr, "[CUPTI] Failed to set activity flush period: %s\n", - errstr); - } else { - DEBUG_PRINTF("[CUPTI] Set activity flush period to 1000ms\n"); - } - - // Try to subscribe to callbacks - result = cuptiSubscribe(&subscriber, - (CUpti_CallbackFunc)parcagpuCuptiCallback, NULL); - if (result != CUPTI_SUCCESS) { - const char *errstr; - cuptiGetResultString(result, &errstr); - fprintf(stderr, "[CUPTI] Failed to subscribe to callbacks: %s\n", errstr); - return 1; // Still return success to not break the injection - } - - // Enable all runtime API kernel launch callbacks - CUpti_CallbackId runtimeCallbacks[] = { - CUPTI_RUNTIME_TRACE_CBID_cudaLaunch_v3020, - CUPTI_RUNTIME_TRACE_CBID_cudaLaunchKernel_v7000, - CUPTI_RUNTIME_TRACE_CBID_cudaLaunch_ptsz_v7000, - CUPTI_RUNTIME_TRACE_CBID_cudaLaunchKernel_ptsz_v7000, - CUPTI_RUNTIME_TRACE_CBID_cudaLaunchKernelExC_v11060, - CUPTI_RUNTIME_TRACE_CBID_cudaLaunchKernelExC_ptsz_v11060, - CUPTI_RUNTIME_TRACE_CBID_cudaLaunchCooperativeKernel_v9000, - CUPTI_RUNTIME_TRACE_CBID_cudaLaunchCooperativeKernel_ptsz_v9000, - CUPTI_RUNTIME_TRACE_CBID_cudaLaunchCooperativeKernelMultiDevice_v9000, - CUPTI_RUNTIME_TRACE_CBID_cudaGraphLaunch_v10000, - CUPTI_RUNTIME_TRACE_CBID_cudaGraphLaunch_ptsz_v10000, - }; - for (size_t i = 0; i < sizeof(runtimeCallbacks) / sizeof(runtimeCallbacks[0]); - i++) { - result = cuptiEnableCallback(1, subscriber, CUPTI_CB_DOMAIN_RUNTIME_API, - runtimeCallbacks[i]); - if (result != CUPTI_SUCCESS) { - const char *errstr; - cuptiGetResultString(result, &errstr); - fprintf(stderr, "[CUPTI] Failed to enable runtime callback %d: %s\n", - runtimeCallbacks[i], errstr); - } - } - - // Enable all driver API kernel launch callbacks - CUpti_CallbackId driverCallbacks[] = { - CUPTI_DRIVER_TRACE_CBID_cuLaunch, - CUPTI_DRIVER_TRACE_CBID_cuLaunchGrid, - CUPTI_DRIVER_TRACE_CBID_cuLaunchGridAsync, - CUPTI_DRIVER_TRACE_CBID_cuLaunchKernel, - CUPTI_DRIVER_TRACE_CBID_cuLaunchKernel_ptsz, - CUPTI_DRIVER_TRACE_CBID_cuLaunchKernelEx, - CUPTI_DRIVER_TRACE_CBID_cuLaunchKernelEx_ptsz, - CUPTI_DRIVER_TRACE_CBID_cuLaunchCooperativeKernel, - CUPTI_DRIVER_TRACE_CBID_cuLaunchCooperativeKernel_ptsz, - CUPTI_DRIVER_TRACE_CBID_cuLaunchCooperativeKernelMultiDevice, - CUPTI_DRIVER_TRACE_CBID_cuGraphLaunch, - CUPTI_DRIVER_TRACE_CBID_cuGraphLaunch_ptsz, - }; - for (size_t i = 0; i < sizeof(driverCallbacks) / sizeof(driverCallbacks[0]); - i++) { - result = cuptiEnableCallback(1, subscriber, CUPTI_CB_DOMAIN_DRIVER_API, - driverCallbacks[i]); - if (result != CUPTI_SUCCESS) { - const char *errstr; - cuptiGetResultString(result, &errstr); - fprintf(stderr, "[CUPTI] Failed to enable driver callback %d: %s\n", - driverCallbacks[i], errstr); - } - } - - // Register activity buffer callbacks - result = cuptiActivityRegisterCallbacks(parcagpuBufferRequested, - parcagpuBufferCompleted); - if (result != CUPTI_SUCCESS) { - const char *errstr; - cuptiGetResultString(result, &errstr); - fprintf(stderr, "[CUPTI] Failed to register activity callbacks: %s\n", - errstr); - return 1; // Still return success to not break the injection - } - - result = cuptiActivityEnable(CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL); - if (result != CUPTI_SUCCESS) { - const char *errstr; - cuptiGetResultString(result, &errstr); - fprintf(stderr, "[CUPTI] Failed to enable concurrent kernel activity: %s\n", - errstr); - } else { - DEBUG_PRINTF("[CUPTI] Enabled CONCURRENT_KERNEL activity\n"); - } - - // Create correlation filter - correlationFilter = correlation_filter_create(); - if (correlationFilter) { - DEBUG_PRINTF("[CUPTI] Correlation filter created and enabled\n"); - } else { - fprintf(stderr, "[CUPTI] Warning: Failed to create correlation filter\n"); - } - - // Create graph correlation map - graphCorrelationMap = graph_correlation_map_create(); - if (graphCorrelationMap) { - DEBUG_PRINTF("[CUPTI] Graph correlation map created and enabled\n"); - } else { - fprintf(stderr, - "[CUPTI] Warning: Failed to create graph correlation map\n"); - } - - atexit(cleanup); - - DEBUG_PRINTF("[CUPTI] Successfully initialized CUPTI callbacks with external " - "correlation and activity API\n"); - - return 1; -} - -// Helper function to print stack trace -static void print_backtrace(const char *prefix) { - void *array[20]; - size_t size; - char **strings; - - size = backtrace(array, 20); - strings = backtrace_symbols(array, size); - - if (strings != NULL) { - printf("%s Stack trace (%zu frames):\n", prefix, size); - for (size_t i = 0; i < size; i++) { - printf(" [%zu] %s\n", i, strings[i]); - } - free(strings); - } -} - -// Callback handler for driver and runtime API -static void parcagpuCuptiCallback(void *userdata, CUpti_CallbackDomain domain, - CUpti_CallbackId cbid, - const CUpti_CallbackData *cbdata) { - uint32_t correlationId = cbdata->correlationId; - - // Track runtime ENTER so we can skip driver EXIT when they match - if (domain == CUPTI_CB_DOMAIN_RUNTIME_API && - cbdata->callbackSite == CUPTI_API_ENTER) { - runtimeEnterCorrelationId = correlationId; - DEBUG_PRINTF("[CUPTI] Runtime API ENTER: correlationId=%u\n", - correlationId); - return; - } - - // We hook on EXIT because that makes our probe overhead not add to GPU - // launch latency and hopefully covers some of the overhead in the shadow of - // GPU async work. - if (cbdata->callbackSite != CUPTI_API_EXIT) { - if (cbdata->callbackSite == CUPTI_API_ENTER && - domain == CUPTI_CB_DOMAIN_DRIVER_API) { - DEBUG_PRINTF( - "[CUPTI] Driver API ENTER: correlationId=%u (will check on EXIT)\n", - correlationId); - } - return; - } - - const char *name = - cbdata->symbolName ? cbdata->symbolName : cbdata->functionName; - int signedCbid; - - if (domain == CUPTI_CB_DOMAIN_DRIVER_API) { - // Skip if this driver call is under a runtime call (same correlation ID) - if (correlationId == runtimeEnterCorrelationId) { - DEBUG_PRINTF("[CUPTI] Skipping driver EXIT correlationId=%u " - "(runtimeEnter=%u) - runtime " - "will handle\n", - correlationId, runtimeEnterCorrelationId); - return; - } - // Pure driver call (no runtime wrapper) - use negative cbid - signedCbid = -(int)cbid; - DEBUG_PRINTF("[CUPTI] Driver API EXIT callback: cbid=%d, correlationId=%u, " - "runtimeEnter=%u, func=%s\n", - cbid, correlationId, runtimeEnterCorrelationId, name); - } else if (domain == CUPTI_CB_DOMAIN_RUNTIME_API) { - signedCbid = (int)cbid; - DEBUG_PRINTF("[CUPTI] Runtime API EXIT callback: cbid=%d, " - "correlationId=%u, runtimeEnter=%u, func=%s\n", - cbid, correlationId, runtimeEnterCorrelationId, name); - runtimeEnterCorrelationId = 0; // Clear after use - } else { - return; - } - - // Rate limit probes - struct timespec ts; - clock_gettime(CLOCK_MONOTONIC, &ts); - uint64_t nowNs = (uint64_t)ts.tv_sec * 1000000000ULL + ts.tv_nsec; - if (!rateLimiterTryAcquire(nowNs)) { - DEBUG_PRINTF("[CUPTI] Rate limited: skipping probe for correlationId=%u\n", - correlationId); - return; - } - - outstandingEvents++; - DTRACE_PROBE3(parcagpu, cuda_correlation, correlationId, signedCbid, name); - - // Detect graph launches by callback ID - bool is_graph_launch = - (cbid == CUPTI_RUNTIME_TRACE_CBID_cudaGraphLaunch_v10000) || - (cbid == CUPTI_RUNTIME_TRACE_CBID_cudaGraphLaunch_ptsz_v10000) || - (cbid == CUPTI_DRIVER_TRACE_CBID_cuGraphLaunch) || - (cbid == CUPTI_DRIVER_TRACE_CBID_cuGraphLaunch_ptsz); - - // Insert into appropriate map based on launch type - if (is_graph_launch) { - // Graph launch - will generate multiple kernels with same correlation ID - if (graphCorrelationMap) { - graph_correlation_map_insert(graphCorrelationMap, correlationId); - DEBUG_PRINTF( - "[CUPTI] Inserted correlationId=%u into graph map (size=%zu)\n", - correlationId, graph_correlation_map_size(graphCorrelationMap)); - } - } else { - // Regular kernel launch - single kernel per correlation ID - if (correlationFilter) { - correlation_filter_insert(correlationFilter, correlationId); - DEBUG_PRINTF("[CUPTI] Inserted correlationId=%u into filter (size=%zu)\n", - correlationId, correlation_filter_size(correlationFilter)); - } - } - - // If we let too many events pile up it overwhelms the perf_event buffers, - // just another reason to explore just passing the activity buffer through to - // eBPF. - if (outstandingEvents > 3000) { - DEBUG_PRINTF("[CUPTI] Flushing: outstandingEvents=%zu\n", - outstandingEvents); - cuptiActivityFlushAll(0); - outstandingEvents = 0; - } -} - -// Out-of-line USDT probe site for activity batches. -// Single call site ensures one probe location in the ELF .note.stapsdt section. -__attribute__((noinline)) void parcagpuActivityBatch(const void **ptrs, - uint32_t count) { - DTRACE_PROBE2(parcagpu, activity_batch, ptrs, count); -} - -// Buffer request callback -static void parcagpuBufferRequested(uint8_t **buffer, size_t *size, - size_t *maxNumRecords) { - *buffer = (uint8_t *)aligned_alloc(8, activityBufferSize); - *size = activityBufferSize; - *maxNumRecords = 0; // Let CUPTI decide - - DEBUG_PRINTF("[CUPTI:bufferRequested] Allocated buffer %p, size=%zu\n", - *buffer, *size); -} - -// Buffer completion callback -static void parcagpuBufferCompleted(CUcontext ctx, uint32_t streamId, - uint8_t *buffer, size_t size, - size_t validSize) { - CUptiResult result; - CUpti_Activity *record = NULL; - int recordCount = 0; - static int calls = 0; - - // Batch probe: collect pointers to activity records and pass them to - // BPF/USDT every ACTIVITY_BATCH_SIZE records. Stack-allocated array - // of pointers — no heap allocation, no copying, version-independent. - // BPF consumers filter by activity kind (kernel, memcpy, etc.). -#define ACTIVITY_BATCH_SIZE 128 - const void *batchPtrs[ACTIVITY_BATCH_SIZE]; - uint32_t batchCount = 0; - - DEBUG_PRINTF("[CUPTI] bufferCompleted called: buffer=%p validSize=%zu (%d)\n", - buffer, validSize, calls++); - - // Start new cycle for graph correlation map - uint32_t currentCycle = bufferCycle++; - if (graphCorrelationMap) { - graph_correlation_map_cycle_start(graphCorrelationMap, currentCycle); - DEBUG_PRINTF("[CUPTI] Started graph correlation map cycle %u\n", - currentCycle); - } - - while (1) { - result = cuptiActivityGetNextRecord(buffer, validSize, &record); - if (result == CUPTI_ERROR_MAX_LIMIT_REACHED) { - break; - } else if (result != CUPTI_SUCCESS) { - const char *errstr; - cuptiGetResultString(result, &errstr); - fprintf(stderr, "[CUPTI] Error reading activity record: %s\n", errstr); - break; - } - - recordCount++; - if (record->kind == CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL || - record->kind == CUPTI_ACTIVITY_KIND_KERNEL) { - CUpti_ActivityKernel5 *k = (CUpti_ActivityKernel5 *)record; - - DEBUG_PRINTF("[CUPTI] Kernel activity: graphId=%u graphNodeId=%lu " - "name=%s, correlationId=%u, deviceId=%u, " - "streamId=%u, start=%lu, end=%lu, duration=%lu ns\n", - k->graphId, k->graphNodeId, k->name, k->correlationId, - k->deviceId, k->streamId, k->start, k->end, - k->end - k->start); - - // Route to appropriate map based on whether this is a graph kernel - bool should_fire = true; - if (k->graphId != 0) { - // Graph kernel - check graph correlation map - if (graphCorrelationMap) { - should_fire = graph_correlation_map_check_and_mark_seen( - graphCorrelationMap, k->correlationId, currentCycle); - if (!should_fire) { - DEBUG_PRINTF( - "[CUPTI] Filtered out graph correlationId=%u (not tracked)\n", - k->correlationId); - } else { - DEBUG_PRINTF("[CUPTI] Matched graph correlationId=%u - firing " - "kernel_executed (map size=%zu)\n", - k->correlationId, - graph_correlation_map_size(graphCorrelationMap)); - } - } - } else { - // Regular kernel - check regular correlation filter - if (correlationFilter) { - should_fire = correlation_filter_check_and_remove(correlationFilter, - k->correlationId); - if (!should_fire) { - DEBUG_PRINTF( - "[CUPTI] Filtered out correlationId=%u (not tracked)\n", - k->correlationId); - } else { - DEBUG_PRINTF("[CUPTI] Matched correlationId=%u - firing " - "kernel_executed (filter size=%zu)\n", - k->correlationId, - correlation_filter_size(correlationFilter)); - } - } - } - - // Only fire probe if correlation ID was tracked (or filters disabled) - if (should_fire) { - DTRACE_PROBE8(parcagpu, kernel_executed, k->start, k->end, - k->correlationId, k->deviceId, k->streamId, k->graphId, - k->graphNodeId, k->name); - } - } - - // Collect pointer for batch probe (all activity kinds). - // BPF consumers inspect the kind field to filter types they care about. - batchPtrs[batchCount++] = record; - if (batchCount < ACTIVITY_BATCH_SIZE) - continue; - - parcagpuActivityBatch(batchPtrs, batchCount); - batchCount = 0; - } - - if (batchCount > 0) - parcagpuActivityBatch(batchPtrs, batchCount); - - DEBUG_PRINTF("[CUPTI] Processed %d activity records from buffer %p\n", - recordCount, buffer); - - // End cycle for graph correlation map - clean up completed graph launches - if (graphCorrelationMap) { - graph_correlation_map_cycle_end(graphCorrelationMap); - - size_t map_size = 0; - size_t oldest_age = 0; - graph_correlation_map_get_stats(graphCorrelationMap, &map_size, - &oldest_age); - - DEBUG_PRINTF("[CUPTI] Ended graph correlation map cycle %u (map size=%zu, " - "oldest_age=%zu cycles)\n", - currentCycle, map_size, oldest_age); - - // Log warning if we have old entries (potential leaked graph launches) - if (oldest_age > 50 && currentCycle % 10 == 0) { - DEBUG_PRINTF("[CUPTI] WARNING: Graph map has entries aged %zu cycles " - "(may be dropped launches)\n", - oldest_age); - } - } - - // Reset to 0 rather than decrement - one API callback can produce N - // activities so decrementing by recordCount can cause underflow (size_t wraps - // to huge value) - outstandingEvents = 0; - - // Free the buffer - DEBUG_PRINTF("[CUPTI] Freeing buffer %p\n", buffer); - free(buffer); - - // Report any records dropped due to buffer overflow - size_t dropped; - result = cuptiActivityGetNumDroppedRecords(ctx, streamId, &dropped); - if (result == CUPTI_SUCCESS && dropped > 0) { - fprintf(stderr, "[CUPTI] Warning: %zu activity records dropped\n", dropped); - } -} - -// Cleanup function (destructor disabled to prevent early cleanup) -void cleanup(void) { - static bool cleanup_done = false; - - // Make cleanup idempotent - safe to call multiple times - if (cleanup_done) { - return; - } - cleanup_done = true; - - DEBUG_PRINTF("[CUPTI] Cleanup started\n"); - // Flush any remaining activity records - cuptiActivityFlushAll(CUPTI_ACTIVITY_FLAG_FLUSH_FORCED); - - // Unsubscribe from callbacks - if (subscriber) { - cuptiUnsubscribe(subscriber); - subscriber = 0; - } - - // Destroy correlation filter - if (correlationFilter) { - size_t remaining = correlation_filter_size(correlationFilter); - if (remaining > 0) { - DEBUG_PRINTF( - "[CUPTI] Warning: %zu correlation IDs still in filter at cleanup\n", - remaining); - } - correlation_filter_destroy(correlationFilter); - correlationFilter = NULL; - } - - // Destroy graph correlation map - if (graphCorrelationMap) { - size_t remaining = graph_correlation_map_size(graphCorrelationMap); - if (remaining > 0) { - DEBUG_PRINTF("[CUPTI] Warning: %zu correlation IDs still in graph map at " - "cleanup\n", - remaining); - } - graph_correlation_map_destroy(graphCorrelationMap); - graphCorrelationMap = NULL; - } - - DEBUG_PRINTF("[CUPTI] Cleanup completed\n"); -} diff --git a/cupti/toolchain-arm64.cmake b/cupti/toolchain-arm64.cmake deleted file mode 100644 index 5542249..0000000 --- a/cupti/toolchain-arm64.cmake +++ /dev/null @@ -1,19 +0,0 @@ -# CMake toolchain file for cross-compiling to ARM64 -set(CMAKE_SYSTEM_NAME Linux) -set(CMAKE_SYSTEM_PROCESSOR aarch64) - -# Specify the cross compiler -set(CMAKE_C_COMPILER aarch64-linux-gnu-gcc) -set(CMAKE_CXX_COMPILER aarch64-linux-gnu-g++) - -# Where to look for the target environment -set(CMAKE_FIND_ROOT_PATH /usr/aarch64-linux-gnu) - -# Adjust the default behavior of the FIND_XXX() commands: -# search headers and libraries in the target environment -set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER) -set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY) -set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY) - -# Force position-independent code for shared libraries -set(CMAKE_POSITION_INDEPENDENT_CODE ON) diff --git a/ebpf/cupti_bpf.h b/ebpf/cupti_bpf.h new file mode 100644 index 0000000..edc8df6 --- /dev/null +++ b/ebpf/cupti_bpf.h @@ -0,0 +1,87 @@ +// CUPTI BPF definitions — shared between test/bpf/ and production cuda.ebpf.c. +// +// Contains BPF-side layouts for: +// - CUpti_ActivityKernel5 (kernel activity records) +// - CUpti_PCSamplingPCData (PC sampling records) +// - CUpti_PCSamplingStallReason (stall reason entries) + +#ifndef CUPTI_BPF_H +#define CUPTI_BPF_H + +// --------------------------------------------------------------------------- +// Kernel activity records +// --------------------------------------------------------------------------- + +// CUpti_ActivityKind values we care about +#define CUPTI_ACTIVITY_KIND_KERNEL 3 +#define CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL 10 + +// Matches the layout of CUpti_ActivityKernel5 exactly. +// Explicit padding replaces __packed__ to avoid unnecessary unaligned-access +// handling in BPF. The struct uses aligned(8) and a static size assert. +struct cupti_activity_kernel5 { + u32 kind; // offset 0 - CUpti_ActivityKind + u8 _pad1[12]; // offset 4 - cacheConfig, sharedMemConfig, + // registersPerThread, + // partitionedGlobalCache x2 + u64 start; // offset 16 - kernel start timestamp (ns) + u64 end; // offset 24 - kernel end timestamp (ns) + u64 completed; // offset 32 - completion timestamp + u32 device_id; // offset 40 + u32 context_id; // offset 44 + u32 stream_id; // offset 48 + u8 _pad2[40]; // offset 52 - gridX/Y/Z, blockX/Y/Z, + // staticSharedMemory, + // dynamicSharedMemory, + // localMemoryPerThread, + // localMemoryTotal + u32 correlation_id; // offset 92 + s64 grid_id; // offset 96 + u64 name_ptr; // offset 104 - const char* (user-space pointer) + u64 _reserved0; // offset 112 + u64 queued; // offset 120 + u64 submitted; // offset 128 + u8 _pad3[8]; // offset 136 - launchType, isSharedMemoryCarveout, + // sharedMemoryCarveoutRequested, + // padding, sharedMemoryExecuted + u64 graph_node_id; // offset 144 + u32 shmem_limit_cfg; // offset 152 - CUpti_FuncShmemLimitConfig + u32 graph_id; // offset 156 +} __attribute__((aligned(8))); + +_Static_assert( + sizeof(struct cupti_activity_kernel5) == 160, "cupti_activity_kernel5 size mismatch"); + +// --------------------------------------------------------------------------- +// PC sampling records +// --------------------------------------------------------------------------- + +#define STALL_REASON_NAME_LEN 64 +#define MAX_STALL_REASONS 64 +#define MAX_PC_BATCH_SIZE 512 +#define MAX_FUNC_NAME 128 + +// Matches CUpti_PCSamplingStallReason (packed, aligned 8). +struct cupti_stall_reason { + u32 stall_reason_index; + u32 samples; +}; + +// Matches CUpti_PCSamplingPCData (packed, aligned 8). +// Contains user-space pointers that BPF chases with bpf_probe_read_user. +// We read the base 56-byte struct, then conditionally read correlationId +// if the size field indicates CUPTI 12.4+ / v22+ (size > 56). +struct cupti_pc_data { + u64 size; // struct size (56 = pre-12.4, 60+ = CUDA 12.4+) + u64 cubin_crc; + u64 pc_offset; + u32 function_index; + u32 _pc_pad; + u64 function_name_ptr; // const char* in user-space + u64 stall_reason_count; + u64 stall_reason_ptr; // CUpti_PCSamplingStallReason* in user-space +} __attribute__((__packed__)) __attribute__((aligned(8))); + +#define CUPTI_PC_DATA_BASE_SIZE 56 + +#endif // CUPTI_BPF_H diff --git a/go.mod b/go.mod new file mode 100644 index 0000000..5114572 --- /dev/null +++ b/go.mod @@ -0,0 +1,10 @@ +// parcagpu — host-side CUDA profiler sources and shared BPF headers. +// +// This module exists primarily so external projects (e.g. +// opentelemetry-ebpf-profiler) can pull in the canonical BPF-side +// CUPTI definitions at ebpf/cupti_bpf.h via `go mod download`, the same +// way they consume github.com/parca-dev/usdt. The Go module has no Go +// source of its own — ebpf/ is a plain header directory. +module github.com/parca-dev/parcagpu + +go 1.25.0 diff --git a/microbenchmarks/pc_sample_toy.cu b/microbenchmarks/pc_sample_toy.cu new file mode 100644 index 0000000..beef66b --- /dev/null +++ b/microbenchmarks/pc_sample_toy.cu @@ -0,0 +1,110 @@ +// pc_sample_toy.cu — a simple GPU busy-loop for testing PC sampling +// Compile: make microbenchmarks (or: nvcc -g -lineinfo -arch=native -o pc_sample_toy pc_sample_toy.cu) +// Run: ./pc_sample_toy + +#include +#include +#include + +#define CHECK(call) \ + do { \ + cudaError_t err = (call); \ + if (err != cudaSuccess) { \ + fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__, __LINE__, \ + cudaGetErrorString(err)); \ + exit(1); \ + } \ + } while (0) + +// Kernel A: heavy FP math (sin/cos chain) +__global__ void trig_storm(float *out, int n, unsigned long long iters) { + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= n) + return; + + float x = (float)idx * 0.001f; + for (unsigned long long i = 0; i < iters; i++) { + x = sinf(x) * cosf(x) + 0.1f; + } + out[idx] = x; +} + +// Kernel B: integer bit-twiddling +__global__ void hash_churn(unsigned int *out, int n, unsigned long long iters) { + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= n) + return; + + unsigned int h = idx ^ 0xdeadbeef; + for (unsigned long long i = 0; i < iters; i++) { + h ^= h << 13; + h ^= h >> 17; + h ^= h << 5; + h += (unsigned int)i; + } + out[idx] = h; +} + +// Kernel C: shared-memory bouncing +__global__ void shmem_bounce(float *out, int n, unsigned long long iters) { + __shared__ float tile[256]; + int idx = blockIdx.x * blockDim.x + threadIdx.x; + int tid = threadIdx.x; + + tile[tid] = (float)idx; + __syncthreads(); + + for (unsigned long long i = 0; i < iters; i++) { + tile[tid] += tile[(tid + 1) % blockDim.x] * 0.01f; + __syncthreads(); + } + + if (idx < n) + out[idx] = tile[tid]; +} + +void go() { + const int N = 1 << 18; // 256K elements + const int threads = 256; + const int blocks = (N + threads - 1) / threads; + + float *d_float; + unsigned int *d_uint; + + CHECK(cudaMalloc(&d_float, N * sizeof(float))); + CHECK(cudaMalloc(&d_uint, N * sizeof(unsigned int))); + + printf("Launching GPU kernels — attach your profiler now.\n"); + printf("PID: %d\n\n", getpid()); + + sleep(1); + // Each kernel runs for roughly 0.5–1 second depending on GPU. + // Tune the iteration count up/down as needed. + + printf(" [1/3] trig_storm ...\n"); + trig_storm<<>>(d_float, N, 500000ULL); + CHECK(cudaDeviceSynchronize()); + + printf(" [2/3] hash_churn ...\n"); + hash_churn<<>>(d_uint, N, 2000000ULL); + CHECK(cudaDeviceSynchronize()); + + printf(" [3/3] shmem_bounce ...\n"); + shmem_bounce<<>>(d_float, N, 50000ULL); + CHECK(cudaDeviceSynchronize()); + + printf("\nDone.\n"); + + CHECK(cudaFree(d_float)); + CHECK(cudaFree(d_uint)); +} + +int main(int argc, char **argv) { + int loops = 1; + if (argc > 1) { + loops = atoi(argv[1]); + } + while (loops-- > 0) { + go(); + } +} diff --git a/microbenchmarks/rapid_launch.cu b/microbenchmarks/rapid_launch.cu new file mode 100644 index 0000000..aa02679 --- /dev/null +++ b/microbenchmarks/rapid_launch.cu @@ -0,0 +1,44 @@ +// rapid_launch.cu — measures per-kernel-launch overhead from CUPTI injection. +// Launches many tiny kernels to stress the callback path. +// +// Compile: nvcc -o rapid_launch rapid_launch.cu +// Run: ./rapid_launch [num_launches] +// +// Compare: +// ./rapid_launch 50000 # baseline +// CUDA_INJECTION64_PATH=.../libparcagpucupti.so ./rapid_launch 50000 # injected + +#include +#include +#include + +__global__ void empty_kernel() {} + +static double now_sec() { + struct timespec ts; + clock_gettime(CLOCK_MONOTONIC, &ts); + return ts.tv_sec + ts.tv_nsec * 1e-9; +} + +int main(int argc, char **argv) { + int n = 50000; + if (argc > 1) + n = atoi(argv[1]); + + // Warm up the CUDA context and any injection library init. + empty_kernel<<<1, 1>>>(); + cudaDeviceSynchronize(); + + // Synchronous launches — each one round-trips through CUPTI callbacks. + double t0 = now_sec(); + for (int i = 0; i < n; i++) { + empty_kernel<<<1, 1>>>(); + cudaDeviceSynchronize(); + } + double t1 = now_sec(); + + double elapsed = t1 - t0; + printf("%d launches in %.3f s (%.1f us/launch)\n", n, elapsed, + elapsed / n * 1e6); + return 0; +} diff --git a/parcagpu.bt b/parcagpu.bt index c35ca43..4132c47 100755 --- a/parcagpu.bt +++ b/parcagpu.bt @@ -1,50 +1,85 @@ #!/usr/bin/env bpftrace -/* - * Simple CUPTI USDT Test - * Tests the three parcagpu USDT probes - */ +BEGIN { + printf("Monitoring CUDA activity via parcagpu USDT probes...\n"); + printf("%-20s %-12s %-10s %-10s %-10s %-10s %-10s %-10s %s\n", + "TIME", "DURATION_NS", "CORR_ID", "DEVICE", "STREAM", "GRAPH", "NODE", "START", "KERNEL"); +} -BEGIN -{ - printf("Simple CUPTI USDT Test Started\n"); - printf("Monitoring parcagpu USDT probes...\n\n"); +usdt:*:parcagpu:kernel_executed { + $start = arg0; + $end = arg1; + $duration = $end - $start; + $correlation_id = arg2; + $device_id = arg3; + $stream_id = arg4; + $graph_id = arg5; + $graph_node_id = arg6; + $kernel_name = str(arg7); + + printf("%-12s.%-6u %-12lu %-10u %-10u %-10u %-10u %-10lu %-10lu ", + strftime("%H:%M:%S", nsecs), + (nsecs % 1000000000) / 1000, + $duration, + $correlation_id, + $device_id, + $stream_id, + $graph_id, + $graph_node_id, + $start); + printf("%s\n", $kernel_name); } +usdt:*:parcagpu:cuda_correlation { + $correlation_id = arg0; + $cbid = arg1; + $name = str(arg2); + printf("%-12s.%-6u [CORR] %u: cbid=%u %s\n", strftime("%H:%M:%S", nsecs), (nsecs % 1000000000) / 1000, $correlation_id, $cbid, $name); +} -// Track CUDA API correlation events -// Arguments: correlationId (uint32), signedCbid (int32), name (char*) -usdt:*:parcagpu:cuda_correlation -{ - $corId = (uint32)arg0; - $cbid = (int32)arg1; - printf("[%d] cuda_correlation: correlationId=%u, cbid=%d, name=%s\n", - pid, $corId, $cbid, str(arg2)); - @cuda_correlations = count(); +usdt:*:parcagpu:pc_sample_batch { + $records = arg0; + $count = arg1; + + printf("%-12s.%-6u [PC_BATCH] count=%u records=%p\n", + strftime("%H:%M:%S", nsecs), + (nsecs % 1000000000) / 1000, + $count, + $records); } -// Track kernel executions -// Arguments: start, end, correlationId, deviceId, streamId, name (pointer) -usdt:*:parcagpu:kernel_executed -{ - $start = arg0; - $end = arg1; - printf("[%d] Kernel executed:\n", pid); - printf(" start=%lu, end=%lu, duration=%lu ns\n", $start, $end, $end - $start); - $devId = arg2 >>32; - $corId = (uint64)arg2 & 0xFFFFFFFF; - printf(" correlationId=%u, deviceId=%u, streamId=%u\n", - $corId, $devId, arg3); - printf(" name=%s\n", str(arg4)); - @kernel_executions = count(); -} - -END -{ - printf("\n=== Summary ===\n"); - printf("CUDA correlations: "); - print(@cuda_correlations); - printf("Kernel executions: "); - print(@kernel_executions); +usdt:*:parcagpu:stall_reason_map { + $names = arg0; + $count = arg1; + + printf("%-12s.%-6u [STALL_MAP] count=%u base=%p\n", + strftime("%H:%M:%S", nsecs), + (nsecs % 1000000000) / 1000, + $count, + $names); +} + +usdt:*:parcagpu:cubin_loaded { + $cubin_crc = arg0; + $cubin_size = arg2; + + printf("%-12s.%-6u [CUBIN_LOAD] crc=0x%lx size=%lu\n", + strftime("%H:%M:%S", nsecs), + (nsecs % 1000000000) / 1000, + $cubin_crc, + $cubin_size); +} + +usdt:*:parcagpu:cubin_unloaded { + $cubin_crc = arg0; + + printf("%-12s.%-6u [CUBIN_UNLOAD] crc=0x%lx\n", + strftime("%H:%M:%S", nsecs), + (nsecs % 1000000000) / 1000, + $cubin_crc); +} + +END { + printf("\nMonitoring complete.\n"); } diff --git a/proton b/proton new file mode 160000 index 0000000..bfc06a7 --- /dev/null +++ b/proton @@ -0,0 +1 @@ +Subproject commit bfc06a7e18f129da8832a433f993700036121daa diff --git a/src/correlation_filter.cpp b/src/correlation_filter.cpp new file mode 100644 index 0000000..c246c77 --- /dev/null +++ b/src/correlation_filter.cpp @@ -0,0 +1,97 @@ +#include "correlation_filter.h" + +namespace parcagpu { + +//============================================================================= +// CorrelationFilter implementation +//============================================================================= + +void CorrelationFilter::insert(uint32_t correlation_id) { + std::lock_guard lock(mutex_); + set_.insert(correlation_id); +} + +bool CorrelationFilter::check_and_remove(uint32_t correlation_id) { + std::lock_guard lock(mutex_); + auto it = set_.find(correlation_id); + if (it != set_.end()) { + set_.erase(it); + return true; + } + return false; +} + +size_t CorrelationFilter::size() const { + std::lock_guard lock(mutex_); + return set_.size(); +} + +//============================================================================= +// GraphCorrelationEntry implementation +//============================================================================= + +GraphCorrelationEntry::GraphCorrelationEntry(uint32_t cycle) + : state{GRAPH_STATE_UNINITIALIZED, GRAPH_STATE_UNINITIALIZED}, + ever_seen_kernel(false), insertion_cycle(cycle) {} + +//============================================================================= +// GraphCorrelationMap implementation +//============================================================================= + +GraphCorrelationMap::GraphCorrelationMap() : current_cycle_(0) {} + +void GraphCorrelationMap::insert(uint32_t correlation_id) { + std::lock_guard lock(mutex_); + map_.emplace(correlation_id, GraphCorrelationEntry(current_cycle_)); +} + +void GraphCorrelationMap::cycle_start(uint32_t cycle) { + std::lock_guard lock(mutex_); + current_cycle_ = cycle; + uint32_t slot = cycle % 2; + for (auto &pair : map_) { + pair.second.state[slot] = GRAPH_STATE_CYCLE_CLEARED; + } +} + +bool GraphCorrelationMap::check_and_mark_seen(uint32_t correlation_id, + uint32_t cycle) { + std::lock_guard lock(mutex_); + auto it = map_.find(correlation_id); + if (it != map_.end()) { + uint32_t slot = cycle % 2; + it->second.state[slot] = GRAPH_STATE_KERNEL_SEEN; + it->second.ever_seen_kernel = true; + return true; + } + return false; +} + +void GraphCorrelationMap::cycle_end() { + std::lock_guard lock(mutex_); + for (auto it = map_.begin(); it != map_.end();) { + bool should_remove = false; + if (it->second.state[0] == GRAPH_STATE_CYCLE_CLEARED && + it->second.state[1] == GRAPH_STATE_CYCLE_CLEARED) { + if (it->second.ever_seen_kernel) { + // Graph completed normally + should_remove = true; + } else if ((current_cycle_ - it->second.insertion_cycle) > 100) { + // Fallback: never saw kernels and entry is very old + should_remove = true; + } + } + if (should_remove) { + it = map_.erase(it); + } else { + ++it; + } + } +} + +size_t GraphCorrelationMap::size() const { + std::lock_guard lock(mutex_); + return map_.size(); +} + +} // namespace parcagpu diff --git a/src/correlation_filter.h b/src/correlation_filter.h new file mode 100644 index 0000000..953733f --- /dev/null +++ b/src/correlation_filter.h @@ -0,0 +1,60 @@ +#pragma once + +#include +#include +#include +#include + +namespace parcagpu { + +//============================================================================= +// Correlation Filter - tracks which kernel launches we've sampled +//============================================================================= + +// CorrelationFilter for non-graph kernel launches +// Insert on API callback, check-and-remove on kernel activity +class CorrelationFilter { +public: + void insert(uint32_t correlation_id); + bool check_and_remove(uint32_t correlation_id); + size_t size() const; + +private: + std::unordered_set set_; + mutable std::mutex mutex_; +}; + +// Graph correlation state values +enum GraphCorrelationState { + GRAPH_STATE_UNINITIALIZED = 0, // Entry just created, slot not yet processed + GRAPH_STATE_CYCLE_CLEARED = 1, // Cycle started, no kernels seen yet + GRAPH_STATE_KERNEL_SEEN = 2 // At least one kernel seen this cycle +}; + +// GraphCorrelationMap for graph launches (multiple kernels per launch) +// Uses 2-slot state machine to detect when all kernels from a graph have arrived +struct GraphCorrelationEntry { + uint8_t state[2]; // State for alternating cycles + bool ever_seen_kernel; // True once we've seen at least one kernel activity + uint32_t insertion_cycle; // Buffer cycle when entry was created + + GraphCorrelationEntry(uint32_t cycle); +}; + +class GraphCorrelationMap { +public: + GraphCorrelationMap(); + + void insert(uint32_t correlation_id); + void cycle_start(uint32_t cycle); + bool check_and_mark_seen(uint32_t correlation_id, uint32_t cycle); + void cycle_end(); + size_t size() const; + +private: + std::unordered_map map_; + uint32_t current_cycle_; + mutable std::mutex mutex_; +}; + +} // namespace parcagpu diff --git a/src/cupti.cpp b/src/cupti.cpp new file mode 100644 index 0000000..51bd1f5 --- /dev/null +++ b/src/cupti.cpp @@ -0,0 +1,586 @@ +#include +#include +#include +#include +#include +#include +#include + +// USDT probes — must come before any header that might include , +// so that _SDT_HAS_SEMAPHORES is defined first. +#include "probes.h" + +// Include proton headers +#include "Driver/GPU/CuptiApi.h" +#include "Profiler/Cupti/CuptiCallbacks.h" +#include "Utility/Singleton.h" +#include "correlation_filter.h" +#include "env_config.h" +#include "pc_sampling.h" +#include "token_bucket.h" + +namespace parcagpu { + +// Debug logging control +bool debug_enabled = false; +bool limiter_disabled = false; + +// Global correlation tracking instances +static CorrelationFilter g_correlationFilter; +static GraphCorrelationMap g_graphCorrelationMap; +static std::atomic g_bufferCycle{0}; + +// Thread-local tracking: store correlation ID from runtime ENTER +// so we can skip driver EXIT probe when it matches (driver calls happen under +// runtime calls) +thread_local uint32_t runtimeEnterCorrelationId = 0; + +// Thread-local rate limiter for callback probes (default 100/sec, +// configurable via PARCAGPU_RATE_LIMIT). +thread_local TokenBucket callbackLimiter(100.0); + +// --------------------------------------------------------------------------- +// PC sampling probabilistic control. +// Sampling is gated by an interval + probability: at most once per interval, +// roll the probability die; if it hits, sample all kernels until the interval +// window closes. +// --------------------------------------------------------------------------- + +// Config — read from env at startup. Stored in a struct so it can be made +// runtime-adjustable later (e.g. via signal or control file). +struct PCSamplingConfig { + double probability; // PARCAGPU_PC_SAMPLING_PROBABILITY (default 1/100) + uint64_t intervalNs; // PARCAGPU_PC_SAMPLING_INTERVAL in nanoseconds +}; +static PCSamplingConfig g_pcSamplingConfig = {0.01, 1000000000ULL}; + +// Per-thread sampling state. +struct PCSamplingState { + bool active = false; // Currently sampling + uint64_t windowStartNs = 0; // When the current window opened + uint64_t lastCheckNs = 0; // Last time we checked the probability + unsigned int rngSeed = 0; // Thread-local RNG state +}; +thread_local PCSamplingState g_pcSamplingState; + +static uint64_t nowNs() { + struct timespec ts; + clock_gettime(CLOCK_MONOTONIC, &ts); + return (uint64_t)ts.tv_sec * 1000000000ULL + ts.tv_nsec; +} + +// Seed the per-thread RNG lazily. +static void ensureRngSeeded(PCSamplingState &s) { + if (s.rngSeed == 0) { + uint64_t t = nowNs(); + s.rngSeed = (unsigned int)(t ^ (uintptr_t)&s); + if (s.rngSeed == 0) + s.rngSeed = 1; + } +} + +static double threadRandom(PCSamplingState &s) { + ensureRngSeeded(s); + return (double)rand_r(&s.rngSeed) / RAND_MAX; +} + +void init_debug() { + static bool initialized = false; + if (!initialized) { + debug_enabled = getenv("PARCAGPU_DEBUG") != nullptr; + limiter_disabled = getenv("PARCAGPU_LIMITER_DISABLE") != nullptr; + const char *rateEnv = getenv("PARCAGPU_RATE_LIMIT"); + if (rateEnv != nullptr) { + double rate = atof(rateEnv); + if (rate > 0) { + callbackLimiter.setRate(rate); + } + } + + const char *probEnv = getenv("PARCAGPU_PC_SAMPLING_PROBABILITY"); + if (probEnv) { + double p = atof(probEnv); + if (p > 0.0 && p <= 1.0) + g_pcSamplingConfig.probability = p; + } + const char *intervalEnv = getenv("PARCAGPU_PC_SAMPLING_INTERVAL"); + if (intervalEnv) { + double s = atof(intervalEnv); + if (s > 0.0) + g_pcSamplingConfig.intervalNs = (uint64_t)(s * 1e9); + } + + validateEnvVars(); + initialized = true; + } +} + +// Out-of-line USDT probe site for activity batches. +// Single call site ensures one probe location in the ELF .note.stapsdt section. +static constexpr int ACTIVITY_BATCH_SIZE = 128; + +} // namespace parcagpu + +__attribute__((noinline)) void parcagpuActivityBatch(const void **ptrs, + uint32_t count) { + PARCAGPU_ACTIVITY_BATCH(ptrs, count); +} + +namespace parcagpu { + +// Simplified profiler using Proton's patterns +class CuptiProfiler : public proton::Singleton { +public: + CuptiProfiler() { + DEBUG_PRINTF("[PARCAGPU] Initializing ParcaGPUProfiler\n"); + } + + ~CuptiProfiler() { cleanup(); } + + bool initialize() { + if (initialized.exchange(true)) { + return true; // Already initialized + } + + DEBUG_PRINTF("[PARCAGPU] Starting initialization\n"); + + // Check if PC sampling is supported + pcSamplingEnabled = parcagpu::PCSampling::isSupported(); + if (pcSamplingEnabled) { + DEBUG_PRINTF("[PARCAGPU] PC sampling enabled (serialized mode)\n"); + } else { + DEBUG_PRINTF( + "[PARCAGPU] PC sampling disabled, using kernel activity only\n"); + } + + // Subscribe to callbacks + auto result = + proton::cupti::subscribe(&subscriber, callbackHandler, nullptr); + if (result != CUPTI_SUCCESS) { + DEBUG_PRINTF("[PARCAGPU] Failed to subscribe to callbacks: error %d\n", + result); + return false; + } + + // Enable runtime and driver API callbacks (using Proton's utilities) + proton::setRuntimeCallbacks(subscriber, /*enable=*/true); + proton::setLaunchCallbacks(subscriber, /*enable=*/true); + + // Enable resource callbacks only if PC sampling is enabled + if (pcSamplingEnabled) { + proton::setResourceCallbacks(subscriber, /*enable=*/true); + } + + // Register activity buffer callbacks (using Proton's pattern) + result = proton::cupti::activityRegisterCallbacks(allocBuffer, + completeBuffer); + if (result != CUPTI_SUCCESS) { + DEBUG_PRINTF( + "[PARCAGPU] Failed to register activity callbacks: error %d\n", + result); + return false; + } + + // Enable kernel activity recording + result = proton::cupti::activityEnable( + CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL); + if (result != CUPTI_SUCCESS) { + DEBUG_PRINTF( + "[PARCAGPU] Failed to enable concurrent kernel activity: error %d\n", + result); + } else { + DEBUG_PRINTF("[PARCAGPU] Enabled CONCURRENT_KERNEL activity\n"); + } + + DEBUG_PRINTF("[PARCAGPU] Successfully initialized CUPTI callbacks\n"); + return true; + } + + void cleanup() { + if (!initialized.exchange(false)) { + return; // Already cleaned up + } + + DEBUG_PRINTF("[PARCAGPU] Cleanup started\n"); + + // PC sampling data is drained in finalize() during CONTEXT_DESTROY_STARTING + // when the CUDA context is still valid. By the time cleanup() runs, the + // context may already be dead, so we don't drain here. + + // Disable all callbacks + if (subscriber) { + proton::setRuntimeCallbacks(subscriber, /*enable=*/false); + proton::setLaunchCallbacks(subscriber, /*enable=*/false); + if (pcSamplingEnabled) { + proton::setResourceCallbacks(subscriber, /*enable=*/false); + } + } + + // Flush any remaining activity records + proton::cupti::activityFlushAll(CUPTI_ACTIVITY_FLAG_FLUSH_FORCED); + + // Disable activity recording + proton::cupti::activityDisable(CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL); + + // Unsubscribe + if (subscriber) { + proton::cupti::unsubscribe(subscriber); + subscriber = nullptr; + } + + DEBUG_PRINTF("[PARCAGPU] Cleanup completed\n"); + } + +private: + std::atomic initialized{false}; + bool pcSamplingEnabled = false; + CUpti_SubscriberHandle subscriber = nullptr; + + // PC sampling state — owned by this profiler, destroyed with it. + parcagpu::PCSampling pcSampling; + + // Outstanding event counter for flushing + size_t outstandingEvents = 0; + + // Buffer management - using Proton's pattern (static methods) + // A kernel activity is around 224 bytes so a 128kb buffer + // will hold ~500 activities, we want to flush regularly since + // we are a continuous profiler so we don't need a huge buffer + // like most CUPTI profilers. Also a small size avoids malloc + // just going to mmap every time so the allocator should cache + // and re-use these for us. + static constexpr size_t AlignSize = 8; + static constexpr size_t BufferSize = 128 * 1024; + + static void allocBuffer(uint8_t **buffer, size_t *bufferSize, + size_t *maxNumRecords) { + if (!PARCAGPU_CUDA_CORRELATION_ENABLED()) { + *buffer = nullptr; + return; + } + *buffer = static_cast(aligned_alloc(AlignSize, BufferSize)); + if (*buffer == nullptr) { + DEBUG_PRINTF("[PARCAGPU] ERROR: aligned_alloc failed\n"); + return; + } + *bufferSize = BufferSize; + *maxNumRecords = 0; + DEBUG_PRINTF("[PARCAGPU:allocBuffer] Allocated buffer at %p size %zu\n", + *buffer, *bufferSize); + } + + static void completeBuffer(CUcontext ctx, uint32_t streamId, uint8_t *buffer, + size_t size, size_t validSize) { + CUpti_Activity *record = nullptr; + int recordCount = 0; + int filteredCount = 0; + + // Batch probe: collect pointers to activity records and pass them to + // BPF/USDT every ACTIVITY_BATCH_SIZE records. Stack-allocated array + // of pointers — no heap allocation, no copying, version-independent. + const void *batchPtrs[ACTIVITY_BATCH_SIZE]; + uint32_t batchCount = 0; + + DEBUG_PRINTF( + "[PARCAGPU] completeBuffer called: ctx=%p buffer=%p validSize=%zu\n", + ctx, buffer, validSize); + + // Start a new buffer cycle for graph correlation tracking + uint32_t cycle = g_bufferCycle.fetch_add(1); + g_graphCorrelationMap.cycle_start(cycle); + + while (true) { + CUptiResult result = proton::cupti::activityGetNextRecord( + buffer, validSize, &record); + if (result == CUPTI_ERROR_MAX_LIMIT_REACHED) { + break; + } else if (result != CUPTI_SUCCESS) { + DEBUG_PRINTF("[PARCAGPU] Error reading activity record: error %d\n", + result); + break; + } + + recordCount++; + switch (record->kind) { + case CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL: + case CUPTI_ACTIVITY_KIND_KERNEL: { + auto *k = reinterpret_cast(record); + + // Check correlation filter - only emit probe if this kernel was sampled + bool shouldEmit = false; + if (k->graphId != 0) { + // Graph kernel - check graph correlation map + shouldEmit = g_graphCorrelationMap.check_and_mark_seen( + k->correlationId, cycle); + } else { + // Regular kernel - check and remove from correlation filter + shouldEmit = g_correlationFilter.check_and_remove(k->correlationId); + } + + if (!shouldEmit) { + filteredCount++; + DEBUG_PRINTF("[PARCAGPU] Filtered kernel activity: correlationId=%u " + "graphId=%u (not in filter)\n", + k->correlationId, k->graphId); + break; + } + + DEBUG_PRINTF("[PARCAGPU] Kernel activity: graphId=%u graphNodeId=%lu " + "name=%s, correlationId=%u, deviceId=%u, " + "streamId=%u, start=%lu, end=%lu, duration=%lu ns\n", + k->graphId, k->graphNodeId, k->name, k->correlationId, + k->deviceId, k->streamId, k->start, k->end, + k->end - k->start); + + // Emit USDT probe for kernel execution + PARCAGPU_KERNEL_EXECUTED(k->start, k->end, k->correlationId, + k->deviceId, k->streamId, k->graphId, + k->graphNodeId, k->name); + break; + } + default: + DEBUG_PRINTF("[PARCAGPU] Activity record %d: kind=%d\n", recordCount, + record->kind); + break; + } + + // Collect pointer for batch probe (all activity kinds). + // BPF consumers inspect the kind field to filter types they care about. + batchPtrs[batchCount++] = record; + if (batchCount >= ACTIVITY_BATCH_SIZE) { + parcagpuActivityBatch(batchPtrs, batchCount); + batchCount = 0; + } + } + + // Flush remaining batch + if (batchCount > 0) { + parcagpuActivityBatch(batchPtrs, batchCount); + } + + // End cycle - cleanup completed graph entries + g_graphCorrelationMap.cycle_end(); + + DEBUG_PRINTF("[PARCAGPU] Processed %d activity records (%d filtered) from " + "buffer %p\n", + recordCount, filteredCount, buffer); + + // Reset to 0 rather than decrement - one API callback can produce N + // activities so decrementing by recordCount can cause underflow + CuptiProfiler::instance().outstandingEvents = 0; + + // Free the buffer (Proton's pattern) + std::free(buffer); + } + + static void callbackHandler(void *userdata, CUpti_CallbackDomain domain, + CUpti_CallbackId cbid, const void *cbdata_void) { + auto &profiler = CuptiProfiler::instance(); + + if (domain == CUPTI_CB_DOMAIN_RESOURCE) { + // Handle resource callbacks for PC sampling (only if enabled) + if (!profiler.pcSamplingEnabled) { + return; + } + + const CUpti_ResourceData *resData = + static_cast(cbdata_void); + + switch (cbid) { + case CUPTI_CBID_RESOURCE_MODULE_LOADED: { + const CUpti_ModuleResourceData *modData = + static_cast( + resData->resourceDescriptor); + if (modData && modData->pCubin && modData->cubinSize > 0) { + DEBUG_PRINTF("[PARCAGPU] Module loaded: cubin=%p size=%zu\n", + modData->pCubin, modData->cubinSize); + profiler.pcSampling.loadModule(modData->pCubin, modData->cubinSize); + } + break; + } + case CUPTI_CBID_RESOURCE_MODULE_UNLOAD_STARTING: { + const CUpti_ModuleResourceData *modData = + static_cast( + resData->resourceDescriptor); + if (modData && modData->pCubin && modData->cubinSize > 0) { + DEBUG_PRINTF("[PARCAGPU] Module unloading: cubin=%p size=%zu\n", + modData->pCubin, modData->cubinSize); + profiler.pcSampling.unloadModule(modData->pCubin, modData->cubinSize); + } + break; + } + case CUPTI_CBID_RESOURCE_CONTEXT_CREATED: { + CUcontext ctx = resData->context; + DEBUG_PRINTF("[PARCAGPU] Context created: %p\n", ctx); + profiler.pcSampling.initialize(ctx); + break; + } + case CUPTI_CBID_RESOURCE_CONTEXT_DESTROY_STARTING: { + CUcontext ctx = resData->context; + DEBUG_PRINTF("[PARCAGPU] Context destroying: %p\n", ctx); + profiler.pcSampling.finalize(ctx); + break; + } + default: + break; + } + } else { + // Handle both Runtime and Driver API callbacks + const CUpti_CallbackData *cbdata = + static_cast(cbdata_void); + uint32_t correlationId = cbdata->correlationId; + + // ENTER: manage probabilistic sampling windows. + // start() begins CUPTI PC sampling (kernels serialized). + // stop() ends it and drains data (kernels concurrent again). + if (cbdata->callbackSite == CUPTI_API_ENTER) { + if (domain == CUPTI_CB_DOMAIN_RUNTIME_API) + runtimeEnterCorrelationId = correlationId; + + if (profiler.pcSamplingEnabled) { + auto &st = g_pcSamplingState; + auto &cfg = g_pcSamplingConfig; + uint64_t now = nowNs(); + + // If sampling and the window has closed, stop + drain. + if (st.active && (now - st.windowStartNs >= cfg.intervalNs)) { + profiler.pcSampling.stop(cbdata->context); + st.active = false; + } + + // If not sampling, check interval then probability. + if (!st.active && (now - st.lastCheckNs >= cfg.intervalNs)) { + st.lastCheckNs = now; + if (threadRandom(st) < cfg.probability) { + st.active = true; + st.windowStartNs = now; + profiler.pcSampling.start(cbdata->context); + } + } + + profiler.pcSampling.emitMetadata(); + } + return; + } + + // Process on EXIT to avoid adding latency to GPU launch + if (cbdata->callbackSite != CUPTI_API_EXIT) { + return; + } + + // EXIT: stop sampling if the window has closed. + if (profiler.pcSamplingEnabled) { + auto &st = g_pcSamplingState; + auto &cfg = g_pcSamplingConfig; + uint64_t now = nowNs(); + + if (st.active && (now - st.windowStartNs >= cfg.intervalNs)) { + profiler.pcSampling.stop(cbdata->context); + st.active = false; + } + + profiler.pcSampling.emitMetadata(); + } + + // Skip correlation/rate-limiter work when no profiler is attached. + if (!PARCAGPU_CUDA_CORRELATION_ENABLED()) + return; + + const char *name = + cbdata->symbolName ? cbdata->symbolName : cbdata->functionName; + int signedCbid; + + if (domain == CUPTI_CB_DOMAIN_DRIVER_API) { + // Skip if this driver call is under a runtime call (same correlation + // ID) + if (correlationId == runtimeEnterCorrelationId) { + DEBUG_PRINTF("[PARCAGPU] Skipping driver EXIT correlationId=%u - " + "runtime will handle\n", + correlationId); + return; + } + // Pure driver call (no runtime wrapper) - use negative cbid + signedCbid = -(int)cbid; + DEBUG_PRINTF("[PARCAGPU] Driver API callback: cbid=%d, " + "correlationId=%u, func=%s\n", + cbid, correlationId, name); + } else if (domain == CUPTI_CB_DOMAIN_RUNTIME_API) { + signedCbid = (int)cbid; + runtimeEnterCorrelationId = 0; // Clear after use + DEBUG_PRINTF("[PARCAGPU] Runtime API callback: cbid=%d, " + "correlationId=%u, func=%s\n", + cbid, correlationId, name); + } else { + return; + } + + // Check if this is a graph launch (never rate limit these) + bool isGraphLaunch = false; + if (signedCbid < 0) { + // Driver API: cuGraphLaunch = 514, cuGraphLaunch_ptsz = 515 + int driverCbid = -signedCbid; + isGraphLaunch = + (driverCbid == CUPTI_DRIVER_TRACE_CBID_cuGraphLaunch || + driverCbid == CUPTI_DRIVER_TRACE_CBID_cuGraphLaunch_ptsz); + } else { + // Runtime API: cudaGraphLaunch = 311, cudaGraphLaunch_ptsz = 312 + isGraphLaunch = + (signedCbid == CUPTI_RUNTIME_TRACE_CBID_cudaGraphLaunch_v10000 || + signedCbid == + CUPTI_RUNTIME_TRACE_CBID_cudaGraphLaunch_ptsz_v10000); + } + + // Rate limit probes using token bucket (skip for graph launches) + if (!limiter_disabled && !isGraphLaunch) { + if (!callbackLimiter.tryAcquire()) { + DEBUG_PRINTF( + "[PARCAGPU] Rate limited: skipping probe for correlationId=%u\n", + correlationId); + return; + } + } + + profiler.outstandingEvents++; + // Emit USDT probe with signed cbid (negative for driver, positive for + // runtime) + PARCAGPU_CUDA_CORRELATION(correlationId, signedCbid, name); + + // Insert into correlation filter so we can match kernel activities later + if (isGraphLaunch) { + g_graphCorrelationMap.insert(correlationId); + DEBUG_PRINTF("[PARCAGPU] Inserted correlationId=%u into graph map\n", + correlationId); + } else { + g_correlationFilter.insert(correlationId); + DEBUG_PRINTF( + "[PARCAGPU] Inserted correlationId=%u into correlation filter\n", + correlationId); + } + + // Flush if too many events pile up + if (profiler.outstandingEvents > 3000) { + DEBUG_PRINTF("[PARCAGPU] Flushing: outstandingEvents=%zu\n", + profiler.outstandingEvents); + proton::cupti::activityFlushAll(0); + profiler.outstandingEvents = 0; + } + } + } +}; + +} // namespace parcagpu + +// CUPTI initialization function required for CUDA_INJECTION64_PATH +extern "C" int InitializeInjection(void) { + DEBUG_PRINTF("[PARCAGPU] InitializeInjection called\n"); + + auto &profiler = parcagpu::CuptiProfiler::instance(); + if (!profiler.initialize()) { + return 0; // Return 0 on failure, but don't break injection + } + + // Register cleanup at exit + atexit([]() { parcagpu::CuptiProfiler::instance().cleanup(); }); + + return 1; // Success +} diff --git a/src/env_config.cpp b/src/env_config.cpp new file mode 100644 index 0000000..7d0bac7 --- /dev/null +++ b/src/env_config.cpp @@ -0,0 +1,103 @@ +#include "env_config.h" +#include "pc_sampling.h" // DEBUG_PRINTF, fireError +#include "probes.h" + +#include +#include + +extern char **environ; + +namespace parcagpu { + +// Known PARCAGPU_* environment variables. +static const char *knownVars[] = { + "PARCAGPU_DEBUG", + "PARCAGPU_LIMITER_DISABLE", + "PARCAGPU_RATE_LIMIT", + "PARCAGPU_SAMPLING_FACTOR", + "PARCAGPU_PC_SAMPLING_PROBABILITY", + "PARCAGPU_PC_SAMPLING_INTERVAL", +}; +static constexpr size_t numKnownVars = + sizeof(knownVars) / sizeof(knownVars[0]); + +static bool isKnown(const char *name, size_t nameLen) { + for (size_t i = 0; i < numKnownVars; ++i) { + if (std::strlen(knownVars[i]) == nameLen && + std::strncmp(knownVars[i], name, nameLen) == 0) + return true; + } + return false; +} + +void validateEnvVars() { + // Scan environment for unrecognized PARCAGPU_* variables. + static constexpr const char prefix[] = "PARCAGPU_"; + static constexpr size_t prefixLen = sizeof(prefix) - 1; + + for (char **ep = environ; *ep; ++ep) { + if (std::strncmp(*ep, prefix, prefixLen) != 0) + continue; + + // Extract variable name (everything before '='). + const char *eq = std::strchr(*ep, '='); + size_t nameLen = eq ? (size_t)(eq - *ep) : std::strlen(*ep); + + if (!isKnown(*ep, nameLen)) { + // Null-terminate for printing. + char nameBuf[128] = {}; + size_t copyLen = nameLen < sizeof(nameBuf) - 1 ? nameLen : sizeof(nameBuf) - 1; + std::memcpy(nameBuf, *ep, copyLen); + + DEBUG_PRINTF("[PARCAGPU] Warning: unrecognized env var '%s'\n", nameBuf); + fireError(0, nameBuf, "env_config: unrecognized variable"); + } + } + + // Validate specific variables. + const char *val; + + val = std::getenv("PARCAGPU_RATE_LIMIT"); + if (val) { + double rate = std::atof(val); + if (rate <= 0) { + DEBUG_PRINTF("[PARCAGPU] Warning: PARCAGPU_RATE_LIMIT=%s invalid " + "(must be > 0), using default\n", val); + fireError(0, val, "env_config: PARCAGPU_RATE_LIMIT invalid"); + } + } + + val = std::getenv("PARCAGPU_SAMPLING_FACTOR"); + if (val) { + int factor = std::atoi(val); + if (factor != 0 && (factor < 5 || factor > 31)) { + DEBUG_PRINTF("[PARCAGPU] Warning: PARCAGPU_SAMPLING_FACTOR=%s out of " + "range [0, 5-31], using default\n", val); + fireError(0, val, "env_config: PARCAGPU_SAMPLING_FACTOR out of range"); + } + } + + val = std::getenv("PARCAGPU_PC_SAMPLING_PROBABILITY"); + if (val) { + double p = std::atof(val); + if (p <= 0.0 || p > 1.0) { + DEBUG_PRINTF("[PARCAGPU] Warning: PARCAGPU_PC_SAMPLING_PROBABILITY=%s " + "invalid (must be in (0, 1]), using default\n", val); + fireError(0, val, + "env_config: PARCAGPU_PC_SAMPLING_PROBABILITY invalid"); + } + } + + val = std::getenv("PARCAGPU_PC_SAMPLING_INTERVAL"); + if (val) { + double s = std::atof(val); + if (s <= 0.0) { + DEBUG_PRINTF("[PARCAGPU] Warning: PARCAGPU_PC_SAMPLING_INTERVAL=%s " + "invalid (must be > 0), using default\n", val); + fireError(0, val, + "env_config: PARCAGPU_PC_SAMPLING_INTERVAL invalid"); + } + } +} + +} // namespace parcagpu diff --git a/src/env_config.h b/src/env_config.h new file mode 100644 index 0000000..2075c85 --- /dev/null +++ b/src/env_config.h @@ -0,0 +1,16 @@ +#ifndef PARCAGPU_ENV_CONFIG_H_ +#define PARCAGPU_ENV_CONFIG_H_ + +namespace parcagpu { + +// Scan the process environment for all PARCAGPU_* variables. +// Warns (via DEBUG_PRINTF and the error probe) about unrecognized names. +// Validates types and ranges for known variables; on invalid values, fires +// the error probe, prints a debug warning, and uses the default. +// +// Call once at startup (e.g. from init_debug). +void validateEnvVars(); + +} // namespace parcagpu + +#endif // PARCAGPU_ENV_CONFIG_H_ diff --git a/src/pc_sampling.cpp b/src/pc_sampling.cpp new file mode 100644 index 0000000..230b494 --- /dev/null +++ b/src/pc_sampling.cpp @@ -0,0 +1,772 @@ +#include "Driver/GPU/CudaApi.h" +#include "Driver/GPU/CuptiApi.h" +#include "pc_sampling.h" +#include "probes.h" +#include +#include +#include +#include +#include +#include + +namespace parcagpu { + +// CUDA driver version for 12.8.1 (minimum for PC sampling) +// Version format: major * 1000 + minor * 10 + patch +#define CUDA_VERSION_12_8_1 12081 + +// CUPTI version that added correlationId to CUpti_PCSamplingPCData, +// breaking ABI compatibility. +#define CUPTI_CUDA12_4_VERSION 22 + +// noinline wrappers so each USDT probe has exactly one call site in the +// binary. Multiple call sites produce multiple .note.stapsdt entries with +// different argument encodings, which complicates BPF attachment. +__attribute__((noinline)) void fireCubinLoaded(uint64_t crc, const char *cubin, + uint64_t size) { + PARCAGPU_CUBIN_LOADED(crc, cubin, size); +} + +__attribute__((noinline)) void fireCubinUnloaded(uint64_t crc) { + PARCAGPU_CUBIN_UNLOADED(crc); +} + +// Max records per pc_sample_batch probe invocation. +static constexpr uint32_t PCSampleBatchSize = 128; + +__attribute__((noinline)) void firePCSampleBatch( + const void **ptrs, uint32_t count) { + PARCAGPU_PC_SAMPLE_BATCH(ptrs, count); +} + +namespace { + +// CUPTI helper functions (adapted from Proton's CuptiPCSamplingUtils.h) +// These wrap Proton's cupti API calls with PARCAGPU-specific setup + +uint64_t getCubinCrc(const char *cubin, size_t size) { + CUpti_GetCubinCrcParams cubinCrcParams = { + /*size=*/CUpti_GetCubinCrcParamsSize, + /*cubinSize=*/size, + /*cubin=*/cubin, + /*cubinCrc=*/0, + }; + proton::cupti::getCubinCrc(&cubinCrcParams); + return cubinCrcParams.cubinCrc; +} + +void enablePCSampling(CUcontext context) { + CUpti_PCSamplingEnableParams params = { + /*size=*/CUpti_PCSamplingEnableParamsSize, + /*pPriv=*/NULL, + /*ctx=*/context, + }; + proton::cupti::pcSamplingEnable(¶ms); +} + +bool startPCSampling(CUcontext context) { + // CUPTI requires the GPU to be idle before starting PC sampling. + proton::cuda::ctxSynchronize(); + CUpti_PCSamplingStartParams params = { + /*size=*/CUpti_PCSamplingStartParamsSize, + /*pPriv=*/NULL, + /*ctx=*/context, + }; + auto ret = proton::cupti::pcSamplingStart(¶ms); + if (ret != CUPTI_SUCCESS) { + DEBUG_PRINTF("cuptiPCSamplingStart failed: %d\n", ret); + return false; + } + return true; +} + +bool stopPCSampling(CUcontext context) { + CUpti_PCSamplingStopParams params = { + /*size=*/CUpti_PCSamplingStopParamsSize, + /*pPriv=*/NULL, + /*ctx=*/context, + }; + auto ret = proton::cupti::pcSamplingStop(¶ms); + if (ret != CUPTI_SUCCESS) { + DEBUG_PRINTF("cuptiPCSamplingStop failed: %d\n", ret); + return false; + } + return true; +} + +void disablePCSampling(CUcontext context) { + CUpti_PCSamplingDisableParams params = { + /*size=*/CUpti_PCSamplingDisableParamsSize, + /*pPriv=*/NULL, + /*ctx=*/context, + }; + proton::cupti::pcSamplingDisable(¶ms); +} + +// Returns true if data was retrieved successfully, false on error. +bool getPCSamplingData(CUcontext context, + CUpti_PCSamplingData *pcSamplingData) { + CUpti_PCSamplingGetDataParams params = { + /*size=*/CUpti_PCSamplingGetDataParamsSize, + /*pPriv=*/NULL, + /*ctx=*/context, + /*pcSamplingData=*/pcSamplingData, + }; + auto result = proton::cupti::pcSamplingGetData(¶ms); + if (result != CUPTI_SUCCESS) { + DEBUG_PRINTF("cuptiPCSamplingGetData failed: error %d (ctx=%p)\n", result, + context); + return false; + } + return true; +} + +void setConfigurationAttribute( + CUcontext context, + std::vector &configurationInfos) { + CUpti_PCSamplingConfigurationInfoParams infoParams = { + /*size=*/CUpti_PCSamplingConfigurationInfoParamsSize, + /*pPriv=*/NULL, + /*ctx=*/context, + /*numAttributes=*/configurationInfos.size(), + /*pPCSamplingConfigurationInfo=*/configurationInfos.data(), + }; + proton::cupti::pcSamplingSetConfigurationAttribute(&infoParams); +} + +std::tuple +getSassToSourceCorrelation(const char *functionName, uint64_t pcOffset, + const char *cubin, size_t cubinSize) { + CUpti_GetSassToSourceCorrelationParams sassToSourceParams = { + /*size=*/CUpti_GetSassToSourceCorrelationParamsSize, + /*cubin=*/cubin, + /*functionName=*/functionName, + /*cubinSize=*/cubinSize, + /*lineNumber=*/0, + /*pcOffset=*/pcOffset, + /*fileName=*/NULL, + /*dirName=*/NULL, + }; + // Get source can fail if the line mapping is not available + proton::cupti::getSassToSourceCorrelation(&sassToSourceParams); + auto fileNameStr = sassToSourceParams.fileName + ? std::string(sassToSourceParams.fileName) + : ""; + auto dirNameStr = + sassToSourceParams.dirName ? std::string(sassToSourceParams.dirName) : ""; + // Free the memory + if (sassToSourceParams.fileName) + std::free(sassToSourceParams.fileName); + if (sassToSourceParams.dirName) + std::free(sassToSourceParams.dirName); + return std::make_tuple(sassToSourceParams.lineNumber, fileNameStr, + dirNameStr); +} + +// Double-checked locking helper +template +void doubleCheckedLock(CheckFn check, std::mutex &mutex, ActionFn action) { + if (check()) { + std::lock_guard lock(mutex); + if (check()) { + action(); + } + } +} + +// Helper to get PARCAGPU's custom sampling frequency from environment +uint32_t getGPUSamplingFrequency() { + // Default frequency for PARCAGPU is 18 (Proton uses 10) + constexpr uint32_t PARCAGPU_DEFAULT_FREQUENCY = 18; + + uint32_t samplingPeriod = PARCAGPU_DEFAULT_FREQUENCY; + const char *sampling_factor_env = getenv("PARCAGPU_SAMPLING_FACTOR"); + if (sampling_factor_env) { + int factor = atoi(sampling_factor_env); + if (factor >= 5 && factor <= 31) { + samplingPeriod = factor; + DEBUG_PRINTF("Using PARCAGPU_SAMPLING_FACTOR=%u\n", samplingPeriod); + } else if (factor != 0) { // 0 is handled in isSupported() + fprintf(stderr, + "[PARCAGPU] Warning: PARCAGPU_SAMPLING_FACTOR=%d out of range " + "[5,31], using default %u\n", + factor, PARCAGPU_DEFAULT_FREQUENCY); + } + } else { + return 0; + } + return samplingPeriod; +} + +// Get number of stall reasons +size_t getNumStallReasons(CUcontext context) { + size_t numStallReasons = 0; + CUpti_PCSamplingGetNumStallReasonsParams numStallReasonsParams = { + /*size=*/CUpti_PCSamplingGetNumStallReasonsParamsSize, + /*pPriv=*/NULL, + /*ctx=*/context, + /*numStallReasons=*/&numStallReasons}; + proton::cupti::pcSamplingGetNumStallReasons(&numStallReasonsParams); + return numStallReasons; +} + +// Get stall reason names and indices +std::pair +getStallReasonNamesAndIndices(CUcontext context, size_t numStallReasons) { + char **stallReasonNames = + static_cast(std::calloc(numStallReasons, sizeof(char *))); + for (size_t i = 0; i < numStallReasons; i++) { + stallReasonNames[i] = static_cast( + std::calloc(CUPTI_STALL_REASON_STRING_SIZE, sizeof(char))); + } + uint32_t *stallReasonIndices = + static_cast(std::calloc(numStallReasons, sizeof(uint32_t))); + CUpti_PCSamplingGetStallReasonsParams stallReasonsParams = { + /*size=*/CUpti_PCSamplingGetStallReasonsParamsSize, + /*pPriv=*/NULL, + /*ctx=*/context, + /*numStallReasons=*/numStallReasons, + /*stallReasonIndex=*/stallReasonIndices, + /*stallReasons=*/stallReasonNames, + }; + proton::cupti::pcSamplingGetStallReasons(&stallReasonsParams); + return std::make_pair(stallReasonNames, stallReasonIndices); +} + +// Match stall reasons to indices (PARCAGPU emits all stall reasons) +size_t matchStallReasonsToIndices( + size_t numStallReasons, char **stallReasonNames, + uint32_t *stallReasonIndices, + std::map &stallReasonIndexToMetricIndex, + std::set ¬IssuedStallReasonIndices) { + // PARCAGPU emits all stall reasons + size_t numValidStalls = 0; + for (size_t i = 0; i < numStallReasons; i++) { + std::string cuptiStallName = std::string(stallReasonNames[i]); + bool notIssued = cuptiStallName.find("not_issued") != std::string::npos || + cuptiStallName.find("Not Issued") != std::string::npos; + + if (notIssued) + notIssuedStallReasonIndices.insert(stallReasonIndices[i]); + stallReasonIndexToMetricIndex[stallReasonIndices[i]] = i; + numValidStalls++; + } + return numValidStalls; +} + +// Allocate PC sampling data buffer +CUpti_PCSamplingData allocPCSamplingData(size_t collectNumPCs, + size_t numValidStallReasons) { + CUpti_PCSamplingData pcSamplingData{ + /*size=*/sizeof(CUpti_PCSamplingData), + /*collectNumPcs=*/collectNumPCs, + /*totalSamples=*/0, + /*droppedSamples=*/0, + /*totalNumPcs=*/0, + /*remainingNumPcs=*/0, + /*rangeId=*/0, + /*pPcData=*/ + static_cast( + std::calloc(collectNumPCs, sizeof(CUpti_PCSamplingPCData)))}; + for (size_t i = 0; i < collectNumPCs; ++i) { + pcSamplingData.pPcData[i].stallReason = + static_cast(std::calloc( + numValidStallReasons, sizeof(CUpti_PCSamplingStallReason))); + } + return pcSamplingData; +} + +} // namespace + +// ConfigureData implementation + +CUpti_PCSamplingConfigurationInfo ConfigureData::configureStallReasons() { + numStallReasons = getNumStallReasons(context); + std::tie(this->stallReasonNames, this->stallReasonIndices) = + getStallReasonNamesAndIndices(context, numStallReasons); + numValidStallReasons = matchStallReasonsToIndices( + numStallReasons, stallReasonNames, stallReasonIndices, + stallReasonIndexToMetricIndex, notIssuedStallReasonIndices); + + CUpti_PCSamplingConfigurationInfo stallReasonInfo{}; + stallReasonInfo.attributeType = + CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_STALL_REASON; + stallReasonInfo.attributeData.stallReasonData.stallReasonCount = + numValidStallReasons; + stallReasonInfo.attributeData.stallReasonData.pStallReasonIndex = + stallReasonIndices; + return stallReasonInfo; +} + +CUpti_PCSamplingConfigurationInfo ConfigureData::configureSamplingPeriod() { + CUpti_PCSamplingConfigurationInfo samplingPeriodInfo{}; + samplingPeriodInfo.attributeType = + CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_SAMPLING_PERIOD; + + // Use PARCAGPU's custom sampling frequency + uint32_t frequency = getGPUSamplingFrequency(); + + samplingPeriodInfo.attributeData.samplingPeriodData.samplingPeriod = + frequency; + return samplingPeriodInfo; +} + +CUpti_PCSamplingConfigurationInfo ConfigureData::configureSamplingBuffer() { + CUpti_PCSamplingConfigurationInfo samplingBufferInfo{}; + samplingBufferInfo.attributeType = + CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_SAMPLING_DATA_BUFFER; + this->pcSamplingData = + allocPCSamplingData(DataBufferPCCount, numValidStallReasons); + samplingBufferInfo.attributeData.samplingDataBufferData.samplingDataBuffer = + &this->pcSamplingData; + return samplingBufferInfo; +} + +CUpti_PCSamplingConfigurationInfo ConfigureData::configureScratchBuffer() { + CUpti_PCSamplingConfigurationInfo scratchBufferInfo{}; + scratchBufferInfo.attributeType = + CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_SCRATCH_BUFFER_SIZE; + scratchBufferInfo.attributeData.scratchBufferSizeData.scratchBufferSize = + ScratchBufferSize; + return scratchBufferInfo; +} + +CUpti_PCSamplingConfigurationInfo ConfigureData::configureHardwareBufferSize() { + CUpti_PCSamplingConfigurationInfo hardwareBufferInfo{}; + hardwareBufferInfo.attributeType = + CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_HARDWARE_BUFFER_SIZE; + hardwareBufferInfo.attributeData.hardwareBufferSizeData.hardwareBufferSize = + HardwareBufferSize; + return hardwareBufferInfo; +} + +CUpti_PCSamplingConfigurationInfo ConfigureData::configureCollectionMode() { + CUpti_PCSamplingConfigurationInfo collectionModeInfo{}; + collectionModeInfo.attributeType = + CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_COLLECTION_MODE; + collectionModeInfo.attributeData.collectionModeData.collectionMode = + CUPTI_PC_SAMPLING_COLLECTION_MODE_KERNEL_SERIALIZED; + return collectionModeInfo; +} + +CUpti_PCSamplingConfigurationInfo ConfigureData::configureStartStopControl() { + CUpti_PCSamplingConfigurationInfo startStopControlInfo{}; + startStopControlInfo.attributeType = + CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_ENABLE_START_STOP_CONTROL; + startStopControlInfo.attributeData.enableStartStopControlData + .enableStartStopControl = true; + return startStopControlInfo; +} + +void ConfigureData::initialize(CUcontext context) { + this->context = context; + proton::cupti::getContextId(context, &contextId); + + DEBUG_PRINTF("Initializing PC sampling for context %p (id %u)\n", context, + contextId); + + configurationInfos.emplace_back(configureStallReasons()); + configurationInfos.emplace_back(configureCollectionMode()); + configurationInfos.emplace_back(configureStartStopControl()); + configurationInfos.emplace_back(configureSamplingBuffer()); + // Don't set sampling period — let CUPTI use its default. + // Explicit period values silently break sampling on some GPUs (e.g. + // Blackwell). + + setConfigurationAttribute(context, configurationInfos); + + // Allocate a separate output buffer for getPCSamplingData calls. + // The configured pcSamplingData buffer is owned by CUPTI internally; + // we must pass a different buffer to getPCSamplingData. + this->outputData = + allocPCSamplingData(DataBufferPCCount, numValidStallReasons); + + DEBUG_PRINTF("PC sampling configured with %u stall reasons (%u valid)\n", + numStallReasons, numValidStallReasons); +} + +// GPUPCSampling implementation + +bool PCSampling::isSupported() { + // PC sampling is enabled by default. + // Set PARCAGPU_SAMPLING_FACTOR=0 to disable. + const char *env = getenv("PARCAGPU_SAMPLING_FACTOR"); + if (env && atoi(env) == 0) { + DEBUG_PRINTF("PC sampling disabled via PARCAGPU_SAMPLING_FACTOR=0\n"); + return false; + } + + // Check CUDA driver version >= 12.8.1 + int driverVersion = 0; + proton::cuda::driverGetVersion(&driverVersion); + + if (driverVersion < CUDA_VERSION_12_8_1) { + int major = driverVersion / 1000; + int minor = (driverVersion % 1000) / 10; + int patch = driverVersion % 10; + DEBUG_PRINTF("PC sampling not supported: CUDA driver version %d.%d.%d < " + "required 12.8.1\n", + major, minor, patch); + fireError(driverVersion, + "CUDA driver version too low for PC sampling (need >= 12.8.1)", + "pc_sampling"); + return false; + } + + // Check CUPTI API/driver version compatibility. + // CUPTI 12.4 (v22) added correlationId to CUpti_PCSamplingPCData, breaking + // ABI. Mixing compile-time and runtime versions across this boundary crashes. + uint32_t cuptiVersion = 0; + proton::cupti::getVersion(&cuptiVersion); + + if ((cuptiVersion < CUPTI_CUDA12_4_VERSION && + CUPTI_API_VERSION >= CUPTI_CUDA12_4_VERSION) || + (cuptiVersion >= CUPTI_CUDA12_4_VERSION && + CUPTI_API_VERSION < CUPTI_CUDA12_4_VERSION)) { + DEBUG_PRINTF( + "PC sampling disabled: CUPTI API version %d and driver version %d " + "are incompatible across the 12.4 (v22) ABI boundary\n", + CUPTI_API_VERSION, cuptiVersion); + fireError((int32_t)cuptiVersion, + "CUPTI API/driver version mismatch (12.4 ABI boundary)", + "pc_sampling"); + return false; + } + + // Attempt a lightweight permission probe. CUPTI PC sampling requires + // either root, CAP_SYS_ADMIN, or the NVIDIA module parameter + // NVreg_RestrictProfilingToAdminUsers=0. + // We cannot easily pre-check permissions without attempting CUPTI calls, + // so we defer the real check to initialize() where enablePCSampling() + // will fail with a CUPTI error if permissions are insufficient. + // TODO: Add explicit permission pre-check. + // Reference: https://developer.nvidia.com/nvidia-development-tools-solutions-err_nvgpuctrperm-permission-issue-performance-counters + + int major = driverVersion / 1000; + int minor = (driverVersion % 1000) / 10; + int patch = driverVersion % 10; + DEBUG_PRINTF("PC sampling supported: CUDA %d.%d.%d, CUPTI v%u (API v%d)\n", + major, minor, patch, cuptiVersion, CUPTI_API_VERSION); + return true; +} + +ConfigureData *PCSampling::getConfigureData(uint32_t contextId) { + return &contextIdToConfigureData[contextId]; +} + +CubinData *PCSampling::getCubinData(uint64_t cubinCrc) { + return &(cubinCrcToCubinData[cubinCrc].first); +} + +void PCSampling::initialize(CUcontext context) { + uint32_t contextId = 0; + proton::cupti::getContextId(context, &contextId); + + doubleCheckedLock( + [&]() { + return !contextInitialized.contain(contextId) && + !contextFailed.contain(contextId); + }, + contextMutex, + [&]() { + // enablePCSampling can fail due to insufficient permissions + // (ERR_NVGPUCTRPERM). Catch and degrade gracefully. + CUpti_PCSamplingEnableParams enableParams = { + /*size=*/CUpti_PCSamplingEnableParamsSize, + /*pPriv=*/NULL, + /*ctx=*/context, + }; + auto result = + proton::cupti::pcSamplingEnable(&enableParams); + if (result != CUPTI_SUCCESS) { + DEBUG_PRINTF( + "Failed to enable PC sampling for context %u: CUPTI error %d\n" + "This may be a permission issue. See:\n" + "https://developer.nvidia.com/nvidia-development-tools-solutions-" + "err_nvgpuctrperm-permission-issue-performance-counters\n", + contextId, result); + fireError((int32_t)result, + "Failed to enable PC sampling (possible permission issue)", + "pc_sampling"); + contextFailed.insert(contextId); + return; + } + + auto *configData = getConfigureData(contextId); + configData->initialize(context); + + // Build contiguous stall reason map for USDT probe emission. + stallReasonMap.build(configData->numStallReasons, + configData->stallReasonIndices, + configData->stallReasonNames); + + contextInitialized.insert(contextId); + initializedContextIds.push_back(contextId); + DEBUG_PRINTF( + "PC sampling initialized (serialized mode) for context %u\n", + contextId); + }); +} + +void PCSampling::start(CUcontext context) { + std::lock_guard lock(pcSamplingMutex); + if (samplingActive) + return; + if (startPCSampling(context)) { + samplingActive = true; + samplingContext = context; + DEBUG_PRINTF("PC sampling started (kernels serialized)\n"); + } +} + +void PCSampling::stop(CUcontext context) { + std::lock_guard lock(pcSamplingMutex); + if (!samplingActive) + return; + stopPCSampling(context); + samplingActive = false; + DEBUG_PRINTF("PC sampling stopped (kernels concurrent)\n"); + // Drain data collected during this window. + collectData(context); +} + +__attribute__((noinline)) void fireError(int32_t code, const char *message, + const char *component) { + PARCAGPU_ERROR(code, message, component); +} + +void PCSampling::processPCSamplingData(ConfigureData *configureData) { + auto *pcSamplingData = &configureData->outputData; + + if (pcSamplingData->totalNumPcs == 0) { + return; + } + + DEBUG_PRINTF("Processing %zu PCs (remaining: %zu)\n", + pcSamplingData->totalNumPcs, pcSamplingData->remainingNumPcs); + + if (debug_enabled) { + for (size_t i = 0; i < pcSamplingData->totalNumPcs; ++i) { + auto *pcData = pcSamplingData->pPcData + i; + + uint64_t totalSamples = 0; + uint64_t stalledSamples = 0; + for (size_t j = 0; j < pcData->stallReasonCount; ++j) { + auto *stallReason = &pcData->stallReason[j]; + totalSamples += stallReason->samples; + bool isNotIssued = configureData->notIssuedStallReasonIndices.count( + stallReason->pcSamplingStallReasonIndex) > 0; + if (!isNotIssued) + stalledSamples += stallReason->samples; + } + + auto *cubinData = getCubinData(pcData->cubinCrc); + auto key = + CubinData::LineInfoKey{pcData->functionIndex, pcData->pcOffset}; + if (cubinData->lineInfo.find(key) == cubinData->lineInfo.end()) { + auto [lineNumber, fileName, dirName] = + getSassToSourceCorrelation(pcData->functionName, pcData->pcOffset, + cubinData->cubin, cubinData->cubinSize); + cubinData->lineInfo.try_emplace(key, lineNumber, + std::string(pcData->functionName), + dirName, fileName); + } + auto &lineInfo = cubinData->lineInfo[key]; + std::string fullPath = lineInfo.fileName.size() + ? lineInfo.dirName + "/" + lineInfo.fileName + : ""; + DEBUG_PRINTF(" [%zu] func=%s pc=0x%lx total=%lu stalled=%lu %s:%u\n", i, + lineInfo.functionName.c_str(), pcData->pcOffset, + totalSamples, stalledSamples, fullPath.c_str(), + lineInfo.lineNumber); + } + } + + // Emit batched PC sample probes as a bag of pointers (like activity_batch). + // Using pointers avoids depending on the CUPTI struct stride, which can + // change across CUDA versions. + const void *batchPtrs[PCSampleBatchSize]; + uint32_t batchCount = 0; + + for (size_t i = 0; i < pcSamplingData->totalNumPcs; ++i) { + batchPtrs[batchCount++] = &pcSamplingData->pPcData[i]; + if (batchCount == PCSampleBatchSize) { + firePCSampleBatch(batchPtrs, batchCount); + batchCount = 0; + } + } + if (batchCount > 0) { + firePCSampleBatch(batchPtrs, batchCount); + } +} + +void PCSampling::emitMetadata() { + // Re-emit stall reason map when a tracer attaches (semaphore transitions + // to non-zero), using the same pattern as cubin replay. + if (stallReasonMap.data() && PARCAGPU_STALL_REASON_MAP_ENABLED()) { + if (!stallMapEmitted) { + PARCAGPU_STALL_REASON_MAP(stallReasonMap.data(), + stallReasonMap.numEntries()); + stallMapEmitted = true; + } + } else { + stallMapEmitted = false; + } + + // Replay cubin_loaded probes for late-attaching tracers. + // When the semaphore transitions to non-zero, re-emit all known cubins. + if (PARCAGPU_CUBIN_LOADED_ENABLED()) { + if (!cubinsEmitted) { + DEBUG_PRINTF("Emitting cubins"); + std::lock_guard lock(contextMutex); + for (const auto &ref : loadedCubins) { + fireCubinLoaded(ref.crc, ref.data, ref.size); + } + cubinsEmitted = true; + } + } else { + // Tracer detached — reset so we replay again on next attach. + cubinsEmitted = false; + } +} + +void PCSampling::collectData(CUcontext context) { + uint32_t contextId = 0; + proton::cupti::getContextId(context, &contextId); + + if (!contextInitialized.contain(contextId)) { + DEBUG_PRINTF("Context %u not initialized, skipping data collection\n", + contextId); + return; + } + + auto *configureData = getConfigureData(contextId); + DEBUG_PRINTF("Collecting PC sampling data for context %u (cfg total=%zu " + "remaining=%zu)\n", + contextId, configureData->pcSamplingData.totalNumPcs, + configureData->pcSamplingData.remainingNumPcs); + + // Use the separate output buffer for getData — the configured + // pcSamplingData buffer is owned by CUPTI. + bool ok = getPCSamplingData(context, &configureData->outputData); + DEBUG_PRINTF("getData: ok=%d output total=%zu remaining=%zu " + "cfg total=%zu remaining=%zu\n", + ok, configureData->outputData.totalNumPcs, + configureData->outputData.remainingNumPcs, + configureData->pcSamplingData.totalNumPcs, + configureData->pcSamplingData.remainingNumPcs); + processPCSamplingData(configureData); +} + +void PCSampling::collectAllData() { + std::lock_guard lock(contextMutex); + for (auto contextId : initializedContextIds) { + auto result = contextIdToConfigureData.find(contextId); + if (!result) { + DEBUG_PRINTF("Context %u in initializedContextIds but not in map, " + "skipping\n", + contextId); + continue; + } + auto *configureData = &result->get(); + DEBUG_PRINTF("Draining PC sampling data for context %u\n", contextId); + processPCSamplingData(configureData); + } +} + +void PCSampling::finalize(CUcontext context) { + uint32_t contextId = 0; + proton::cupti::getContextId(context, &contextId); + + if (!contextInitialized.contain(contextId)) { + // Clean up failed context tracking if applicable. + contextFailed.erase(contextId); + return; + } + + // Hold contextMutex for the entire finalize to prevent collectAllData + // from racing with us (it iterates initializedContextIds under this lock). + std::lock_guard lock(contextMutex); + + DEBUG_PRINTF("Finalizing PC sampling for context %p\n", context); + + // Remove from iteration list first so collectAllData won't touch this context + initializedContextIds.erase(std::remove(initializedContextIds.begin(), + initializedContextIds.end(), + contextId), + initializedContextIds.end()); + + // Stop sampling if it was started on this context. + { + std::lock_guard lock2(pcSamplingMutex); + if (samplingActive) { + stopPCSampling(context); + samplingActive = false; + } + } + + // Drain remaining data before disabling + auto *configureData = getConfigureData(contextId); + processPCSamplingData(configureData); + + // After disable, CUPTI may fill remaining records — drain once more + if (configureData->pcSamplingData.totalNumPcs > 0) { + processPCSamplingData(configureData); + } + + contextIdToConfigureData.erase(contextId); + contextInitialized.erase(contextId); +} + +void PCSampling::loadModule(const char *cubin, size_t cubinSize) { + auto cubinCrc = getCubinCrc(cubin, cubinSize); + + if (cubinCrcToCubinData.contain(cubinCrc)) { + // Increment reference count + cubinCrcToCubinData[cubinCrc].second++; + DEBUG_PRINTF("Module 0x%lx loaded (refcount=%zu)\n", cubinCrc, + cubinCrcToCubinData[cubinCrc].second); + } else { + // New module — getCubinData after the contain() check so operator[] + // doesn't auto-insert before we test. + auto *cubinData = getCubinData(cubinCrc); + cubinData->cubinCrc = cubinCrc; + cubinData->cubinSize = cubinSize; + cubinData->cubin = cubin; + cubinCrcToCubinData[cubinCrc].second = 1; + DEBUG_PRINTF("Module 0x%lx loaded (new)\n", cubinCrc); + fireCubinLoaded(cubinCrc, cubin, cubinSize); + { + std::lock_guard lock(contextMutex); + loadedCubins.push_back({cubinCrc, cubin, cubinSize}); + } + } +} + +void PCSampling::unloadModule(const char *cubin, size_t cubinSize) { + auto cubinCrc = getCubinCrc(cubin, cubinSize); + + if (!cubinCrcToCubinData.contain(cubinCrc)) + return; + + auto count = cubinCrcToCubinData[cubinCrc].second; + if (count > 1) { + cubinCrcToCubinData[cubinCrc].second = count - 1; + DEBUG_PRINTF("Module 0x%lx unloaded (refcount=%zu)\n", cubinCrc, count - 1); + } else { + cubinCrcToCubinData.erase(cubinCrc); + DEBUG_PRINTF("Module 0x%lx unloaded (removed)\n", cubinCrc); + fireCubinUnloaded(cubinCrc); + { + std::lock_guard lock(contextMutex); + loadedCubins.erase(std::remove_if(loadedCubins.begin(), + loadedCubins.end(), + [cubinCrc](const CubinRef &r) { + return r.crc == cubinCrc; + }), + loadedCubins.end()); + } + } +} + +} // namespace parcagpu diff --git a/src/pc_sampling.h b/src/pc_sampling.h new file mode 100644 index 0000000..0bcd56d --- /dev/null +++ b/src/pc_sampling.h @@ -0,0 +1,171 @@ +#ifndef PC_SAMPLING_H_ +#define PC_SAMPLING_H_ + +#include +#include +#include +#include +#include +#include +#include + +#include "stall_reason_map.h" + +#include +#include + +#include "Driver/GPU/CuptiApi.h" +#include "Profiler/Cupti/CuptiPCSampling.h" +#include "Utility/Map.h" +#include "Utility/Set.h" + +#define DEBUG_PRINTF(...) \ + do { \ + parcagpu::init_debug(); \ + if (parcagpu::debug_enabled) { \ + struct timespec ts; \ + clock_gettime(CLOCK_REALTIME, &ts); \ + fprintf(stderr, "[%ld.%09ld] ", ts.tv_sec, ts.tv_nsec); \ + fprintf(stderr, __VA_ARGS__); \ + } \ + } while (0) + +namespace parcagpu { + +// Debug logging control — defined in cupti.cpp. +extern bool debug_enabled; +extern void init_debug(); + +// Use Proton's CubinData directly +using proton::CubinData; + +// ConfigureData for PARCAGPU (based on Proton's but standalone) +// We don't inherit to avoid linking Proton's profiler dependencies +struct ConfigureData { + ConfigureData() = default; + + ~ConfigureData() { + if (stallReasonNames) { + for (size_t i = 0; i < numStallReasons; i++) { + if (stallReasonNames[i]) + std::free(stallReasonNames[i]); + } + std::free(stallReasonNames); + } + if (stallReasonIndices) + std::free(stallReasonIndices); + if (pcSamplingData.pPcData) { + for (size_t i = 0; i < numValidStallReasons; ++i) { + std::free(pcSamplingData.pPcData[i].stallReason); + } + std::free(pcSamplingData.pPcData); + } + } + + void initialize(CUcontext context); + + CUpti_PCSamplingConfigurationInfo configureStallReasons(); + CUpti_PCSamplingConfigurationInfo configureSamplingPeriod(); + CUpti_PCSamplingConfigurationInfo configureSamplingBuffer(); + CUpti_PCSamplingConfigurationInfo configureScratchBuffer(); + CUpti_PCSamplingConfigurationInfo configureHardwareBufferSize(); + CUpti_PCSamplingConfigurationInfo configureCollectionMode(); + CUpti_PCSamplingConfigurationInfo configureStartStopControl(); + + // Buffer size constants (from Proton) + static constexpr size_t HardwareBufferSize = 128 * 1024 * 1024; + static constexpr size_t ScratchBufferSize = 16 * 1024 * 1024; + static constexpr size_t DataBufferPCCount = 1024; + + CUcontext context{}; + uint32_t contextId; + uint32_t numStallReasons{}; + uint32_t numValidStallReasons{}; + char **stallReasonNames{}; + uint32_t *stallReasonIndices{}; + std::map stallReasonIndexToMetricIndex{}; + std::set notIssuedStallReasonIndices{}; + CUpti_PCSamplingData pcSamplingData{}; // registered with CUPTI config + CUpti_PCSamplingData outputData{}; // separate buffer for getData calls + std::vector configurationInfos; +}; + +// PC Sampling class (adapted from Proton's CuptiPCSampling) +// Owned by CuptiProfiler — not a standalone singleton, so lifetime +// is tied to the profiler and there are no static destruction order issues. +class PCSampling { +public: + PCSampling() = default; + ~PCSampling() = default; + + // Check if PC sampling is supported (CUPTI >= 12.8.1). + // Enabled by default; set PARCAGPU_SAMPLING_FACTOR=0 to disable. + static bool isSupported(); + + void initialize(CUcontext context); + + // Start PC sampling — kernels become serialized until stop(). + // No-op if already started. Thread-safe. + void start(CUcontext context); + + // Stop PC sampling, drain accumulated data, and emit probes. + // Kernels resume concurrent execution. No-op if not started. + void stop(CUcontext context); + + // Emit stall reason map and replay cubin probes for late-attaching tracers. + // Call periodically regardless of sampling state. + void emitMetadata(); + + void collectData(CUcontext context); + void collectAllData(); + void finalize(CUcontext context); + void loadModule(const char *cubin, size_t cubinSize); + void unloadModule(const char *cubin, size_t cubinSize); + +private: + ConfigureData *getConfigureData(uint32_t contextId); + CubinData *getCubinData(uint64_t cubinCrc); + void processPCSamplingData(ConfigureData *configureData); + + proton::ThreadSafeMap contextIdToConfigureData; + proton::ThreadSafeMap> + cubinCrcToCubinData; + proton::ThreadSafeSet contextInitialized; + proton::ThreadSafeSet contextFailed; // contexts where enable failed + + // Plain vector of initialized context IDs for iteration in collectAllData. + // Protected by contextMutex. + std::vector initializedContextIds; + + // Tracks whether CUPTI PC sampling is currently active (start/stop). + // Only one context can be sampling at a time in KERNEL_SERIALIZED mode. + std::atomic samplingActive{false}; + CUcontext samplingContext{}; + std::mutex pcSamplingMutex{}; + std::mutex contextMutex{}; + + // Contiguous stall reason map for USDT probe emission. + StallReasonMap stallReasonMap; + + // Lightweight cubin metadata for replaying cubin_loaded probes to + // late-attaching tracers. Protected by contextMutex. + struct CubinRef { + uint64_t crc; + const char *data; + size_t size; + }; + std::vector loadedCubins; + + // Tracks whether we've replayed cubin_loaded probes for a late-attaching + // tracer. Reset to false when the cubin_loaded semaphore transitions to + // non-zero. + bool cubinsEmitted = false; + bool stallMapEmitted = false; +}; + +// Fire the error USDT probe. Callable from any translation unit. +void fireError(int32_t code, const char *message, const char *component); + +} // namespace parcagpu + +#endif // PARCAGPU_PC_SAMPLING_H_ diff --git a/src/probes.d b/src/probes.d new file mode 100644 index 0000000..dba3db4 --- /dev/null +++ b/src/probes.d @@ -0,0 +1,14 @@ +provider parcagpu { + probe cuda_correlation(uint32_t correlationId, int signedCbid, + const char *name); + probe kernel_executed(uint64_t start, uint64_t end, + uint32_t correlationId, uint32_t deviceId, + uint32_t streamId, uint32_t graphId, + uint64_t graphNodeId, const char *name); + probe activity_batch(const void **ptrs, uint32_t count); + probe pc_sample_batch(const void **records, uint32_t count); + probe stall_reason_map(const char *names, uint32_t count); + probe cubin_loaded(uint64_t cubinCrc, const char *cubin, uint64_t cubinSize); + probe cubin_unloaded(uint64_t cubinCrc); + probe error(int32_t code, const char *message, const char *component); +}; diff --git a/src/stall_reason_map.h b/src/stall_reason_map.h new file mode 100644 index 0000000..ca64ed9 --- /dev/null +++ b/src/stall_reason_map.h @@ -0,0 +1,56 @@ +#ifndef PARCAGPU_STALL_REASON_MAP_H_ +#define PARCAGPU_STALL_REASON_MAP_H_ + +#include +#include +#include + +namespace parcagpu { + +// Contiguous, BPF-friendly stall reason name table. +// Array of fixed-width 64-byte name slots indexed directly by stall reason +// index. BPF reads names[stallReasonIndex * 64] — no pointer chasing. +static constexpr uint32_t STALL_REASON_NAME_LEN = 64; + +class StallReasonMap { +public: + StallReasonMap() = default; + ~StallReasonMap() { std::free(buf); } + + StallReasonMap(const StallReasonMap &) = delete; + StallReasonMap &operator=(const StallReasonMap &) = delete; + + // Build from parallel arrays (as returned by CUPTI). + // Indices must be dense 0..N-1. + void build(uint32_t numReasons, const uint32_t *indices, char **names) { + // Find max index to size the array. + uint32_t maxIdx = 0; + for (uint32_t i = 0; i < numReasons; i++) { + if (indices[i] > maxIdx) + maxIdx = indices[i]; + } + count = maxIdx + 1; + + std::free(buf); + bufSize = count * STALL_REASON_NAME_LEN; + buf = static_cast(std::calloc(1, bufSize)); + + for (uint32_t i = 0; i < numReasons; i++) { + char *slot = buf + indices[i] * STALL_REASON_NAME_LEN; + strncpy(slot, names[i], STALL_REASON_NAME_LEN - 1); + } + } + + const char *data() const { return buf; } + uint32_t size() const { return bufSize; } + uint32_t numEntries() const { return count; } + +private: + char *buf = nullptr; + uint32_t bufSize = 0; + uint32_t count = 0; +}; + +} // namespace parcagpu + +#endif // PARCAGPU_STALL_REASON_MAP_H_ diff --git a/src/token_bucket.h b/src/token_bucket.h new file mode 100644 index 0000000..88d6d7c --- /dev/null +++ b/src/token_bucket.h @@ -0,0 +1,59 @@ +#ifndef PARCAGPU_TOKEN_BUCKET_H_ +#define PARCAGPU_TOKEN_BUCKET_H_ + +#include +#include + +namespace parcagpu { + +// Simple token bucket rate limiter. Not thread-safe — use one instance per +// thread, or a thread_local instance when shared across call sites. +class TokenBucket { +public: + // startFull=true: first tryAcquire succeeds immediately. + // startFull=false: must wait for refill before first success. + explicit TokenBucket(double tokensPerSec, bool startFull = true) + : rate(tokensPerSec), tokens(startFull ? 1.0 : 0.0) {} + + void setRate(double tokensPerSec) { + rate = tokensPerSec; + if (tokens > rate) + tokens = rate; + } + + // Returns true if a token was available and consumed. + bool tryAcquire() { + refill(); + if (tokens >= 1.0) { + tokens -= 1.0; + return true; + } + return false; + } + +private: + void refill() { + uint64_t now = nowNs(); + if (lastRefillNs > 0) { + double elapsed = (now - lastRefillNs) / 1e9; + tokens += elapsed * rate; + if (tokens > rate) + tokens = rate; + } + lastRefillNs = now; + } + + static uint64_t nowNs() { + struct timespec ts; + clock_gettime(CLOCK_MONOTONIC, &ts); + return (uint64_t)ts.tv_sec * 1000000000ULL + ts.tv_nsec; + } + + double rate; + double tokens; + uint64_t lastRefillNs = 0; +}; + +} // namespace parcagpu + +#endif // PARCAGPU_TOKEN_BUCKET_H_ diff --git a/test.sh b/test.sh index 56f4e5a..d486cf7 100755 --- a/test.sh +++ b/test.sh @@ -6,12 +6,7 @@ cd "$(dirname "$0")" # Parse arguments USE_BPFTRACE=0 -# Auto-detect architecture -case "$(uname -m)" in - aarch64|arm64) DEFAULT_ARCH="arm64" ;; - *) DEFAULT_ARCH="amd64" ;; -esac -ARCH="${ARCH:-$DEFAULT_ARCH}" +ARCH="${ARCH:-amd64}" CUDA_MAJOR="${CUDA_MAJOR:-12}" for arg in "$@"; do case $arg in @@ -27,8 +22,11 @@ done echo "" echo "=== Building test infrastructure with CMake ===" -mkdir -p test/build -cd test/build && cmake .. -DCUDA_MAJOR_VERSION="${CUDA_MAJOR}" && make && cd ../.. +mkdir -p build-local +cd build-local +cmake .. +make -j$(nproc) +cd .. # Start bpftrace if requested if [ "$USE_BPFTRACE" -eq 1 ]; then @@ -53,12 +51,12 @@ fi echo "" echo "=== Running test program ===" -# Set LD_LIBRARY_PATH so the test can find libcupti.so at runtime -# Set PARCAGPU_DEBUG externally to enable debug output -# Set PARCAGPU_RATE_LIMIT externally to override default (100/s) -export LD_LIBRARY_PATH="$(pwd)/test/build:$LD_LIBRARY_PATH" -# Use the CMake-built library with mock CUPTI -test/build/test_cupti_prof "build/${CUDA_MAJOR}/${ARCH}/libparcagpucupti.so" --kernel-names=kernel_names.txt "$@" +# Set LD_LIBRARY_PATH so the test can find libcupti.so and libparcagpucupti.so at runtime +# Set PARCAGPU_DEBUG to enable debug output +export LD_LIBRARY_PATH="$(pwd)/build-local/lib:$LD_LIBRARY_PATH" +export PARCAGPU_DEBUG=1 +# Run the test program with path to library +./build-local/bin/test_cupti_prof build-local/lib/libparcagpucupti.so "$@" # If bpftrace was started, stop it and show results if [ "$USE_BPFTRACE" -eq 1 ]; then diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index cf805ef..8a0a46b 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -20,6 +20,15 @@ set_target_properties(cupti PROPERTIES POSITION_INDEPENDENT_CODE ON ) +# Mock CUDA driver library +add_library(cuda SHARED mock_cuda.c) +target_include_directories(cuda PRIVATE ${CUDA_INCLUDE_DIRS}) +set_target_properties(cuda PROPERTIES + C_STANDARD 11 + C_STANDARD_REQUIRED ON + POSITION_INDEPENDENT_CODE ON +) + # CUDA major version - prefer explicitly passed value, fall back to detection if(NOT DEFINED CUDA_MAJOR_VERSION) execute_process( diff --git a/test/README.md b/test/README.md index 4cd932a..ff74c58 100644 --- a/test/README.md +++ b/test/README.md @@ -1,19 +1,19 @@ # CUPTI Profiler Test Infrastructure -This directory contains test infrastructure for `libparcagpucupti.so` using Zig as the build system. +This directory contains test infrastructure for `libparcagpucupti.so` using CMake as the build system. ## Components -- **test/mock_cupti.c**: Mock CUPTI library that provides stub implementations of all CUPTI APIs used by cupti-prof.c +- **test/mock_cupti.c**: Mock CUPTI library that provides stub implementations of all CUPTI APIs used by the profiler - **test/test_cupti_prof.c**: Test program that dynamically loads libparcagpucupti.so and simulates CUPTI callbacks -- **build.zig**: Zig build configuration (at project root) +- **CMakeLists.txt**: CMake build configuration (at project root) - **test.sh**: Test script (at project root) ## Building From the project root: ```bash -zig build +make ``` This builds: @@ -21,32 +21,31 @@ This builds: 2. `libparcagpucupti.so` - The profiler library linked against the mock CUPTI 3. `test_cupti_prof` - Test executable that loads and exercises the profiler -All outputs go to `zig-out/lib/` and `zig-out/bin/`. +All outputs go to `build/lib/` and `build/bin/`. ## Running Using the test script (recommended): ```bash -cd /home/tpr/src/gpu/parcagpu ./test.sh ``` -Using Zig directly: +Using Make directly: ```bash -zig build run +make test ``` Or manually: ```bash -zig build -LD_LIBRARY_PATH=zig-out/lib zig-out/bin/test_cupti_prof zig-out/lib/libparcagpucupti.so +make +LD_LIBRARY_PATH=build/lib build/bin/test_cupti_prof build/lib/libparcagpucupti.so ``` ### Running Continuously To run the test in continuous mode (useful for monitoring probes with bpftrace): ```bash -LD_LIBRARY_PATH=zig-out/lib zig-out/bin/test_cupti_prof zig-out/lib/libparcagpucupti.so --forever +LD_LIBRARY_PATH=build/lib build/bin/test_cupti_prof build/lib/libparcagpucupti.so --forever ``` In this mode, the test will: @@ -83,7 +82,7 @@ The test script automatically enables `PARCAGPU_DEBUG=1` to show detailed debug To run without debug output: ```bash -LD_LIBRARY_PATH=zig-out/lib zig-out/bin/test_cupti_prof zig-out/lib/libparcagpucupti.so +LD_LIBRARY_PATH=build/lib build/bin/test_cupti_prof build/lib/libparcagpucupti.so ``` ## Verifying DTRACE Probes @@ -92,13 +91,11 @@ To verify that the DTRACE/USDT probes are firing correctly, use the provided bpf **Terminal 1** - Run bpftrace to monitor probes: ```bash -cd /home/tpr/src/gpu/parcagpu sudo bpftrace parcagpu.bt ``` **Terminal 2** - Run the test: ```bash -cd /home/tpr/src/gpu/parcagpu ./test.sh ``` diff --git a/test/bpf/activity_parser.bpf.c b/test/bpf/activity_parser.bpf.c index 33aa754..cb8c495 100644 --- a/test/bpf/activity_parser.bpf.c +++ b/test/bpf/activity_parser.bpf.c @@ -22,23 +22,25 @@ #include "usdt_args.h" -#include "cupti_activity_bpf.h" +#include "cupti_bpf.h" #define MAX_BATCH_SIZE 128 #define MAX_KERNEL_NAME 128 +#define MAX_CUBIN_SIZE (64 * 1024 * 1024) // 64MB safety cap // USDT spec map — populated by Go loader before uprobe attachment. // Keyed by spec ID (uint32); value is struct bpf_usdt_spec. -// Old-style SEC("maps") definition to match the extern in usdt_args.h. -struct bpf_map_def __bpf_usdt_specs SEC("maps") = { - .type = BPF_MAP_TYPE_HASH, - .key_size = sizeof(u32), - .value_size = sizeof(struct bpf_usdt_spec), - .max_entries = 256, -}; +struct usdt_specs_t { + __uint(type, BPF_MAP_TYPE_HASH); + __type(key, u32); + __type(value, struct bpf_usdt_spec); + __uint(max_entries, 256); +} __bpf_usdt_specs SEC(".maps"); // Event sent to user-space for each kernel activity found. struct kernel_event { + u32 event_type; // EVENT_TYPE_KERNEL + u32 _pad; u64 start; u64 end; u32 correlation_id; @@ -49,20 +51,75 @@ struct kernel_event { char name[MAX_KERNEL_NAME]; }; -// Ring buffer for sending events to user-space. -struct bpf_map_def events SEC("maps") = { - .type = BPF_MAP_TYPE_RINGBUF, - .max_entries = 1 << 20, // 1 MB +// Cubin load/unload events — Go reads actual bytes via /proc/pid/mem. +struct cubin_event { + u32 event_type; // EVENT_TYPE_CUBIN_LOADED or EVENT_TYPE_CUBIN_UNLOADED + u32 _pad; + u64 cubin_crc; + u64 cubin_ptr; // user-space address (for /proc/pid/mem read) + u64 cubin_size; }; -// Stats counters. -struct bpf_map_def stats SEC("maps") = { - .type = BPF_MAP_TYPE_ARRAY, - .key_size = sizeof(u32), - .value_size = sizeof(u64), - .max_entries = 4, +// Event type tags for the ring buffer. +#define EVENT_TYPE_KERNEL 1 +#define EVENT_TYPE_CUBIN_LOADED 2 +#define EVENT_TYPE_CUBIN_UNLOADED 3 +#define EVENT_TYPE_PC_SAMPLE 4 +#define EVENT_TYPE_ERROR 5 + +// PC sample event sent to user-space. +struct pc_sample_event { + u32 event_type; // EVENT_TYPE_PC_SAMPLE + u32 stall_reason_count; + u64 cubin_crc; + u64 pc_offset; + u32 function_index; + u32 correlation_id; // kernel correlation ID (CUDA 12.4+ / CUPTI v22+, else 0) + char function_name[MAX_FUNC_NAME]; + struct cupti_stall_reason stall_reasons[MAX_STALL_REASONS]; }; +// Error event sent to user-space. +#define MAX_ERROR_MSG 256 +#define MAX_ERROR_COMPONENT 64 +struct error_event { + u32 event_type; // EVENT_TYPE_ERROR + s32 error_code; + char message[MAX_ERROR_MSG]; + char component[MAX_ERROR_COMPONENT]; +}; + +// Ring buffer for sending events to user-space. +struct { + __uint(type, BPF_MAP_TYPE_RINGBUF); + __uint(max_entries, 1 << 20); // 1 MB +} events SEC(".maps"); + +// Stall reason name table — indexed by stall reason index. +// Value is a 64-byte null-terminated string. +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __type(key, u32); + __type(value, char[STALL_REASON_NAME_LEN]); + __uint(max_entries, MAX_STALL_REASONS); +} stall_reasons SEC(".maps"); + +// Whether the stall reason map has been populated. +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __type(key, u32); + __type(value, u32); + __uint(max_entries, 1); +} stall_map_loaded SEC(".maps"); + +// Stats counters. +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __type(key, u32); + __type(value, u64); + __uint(max_entries, 4); +} stats SEC(".maps"); + enum stat_key { STAT_BATCHES = 0, // number of batch probe invocations STAT_ACTIVITIES = 1, // total activity records scanned @@ -126,6 +183,7 @@ int BPF_USDT(handle_activity_batch, u64 ptrs_base, u32 num_activities) { continue; } + evt->event_type = EVENT_TYPE_KERNEL; evt->start = record.start; evt->end = record.end; evt->correlation_id = record.correlation_id; @@ -148,4 +206,166 @@ int BPF_USDT(handle_activity_batch, u64 ptrs_base, u32 num_activities) { return 0; } +SEC("usdt/parcagpu/stall_reason_map") +int BPF_USDT(handle_stall_reason_map, u64 names_base, u32 count) { + // Only load the map once. + u32 zero = 0; + u32 *loaded = bpf_map_lookup_elem(&stall_map_loaded, &zero); + if (!loaded) + return 0; + if (*loaded) + return 0; + + if (count > MAX_STALL_REASONS) + count = MAX_STALL_REASONS; + + // Read each 64-byte name slot and store in the BPF map. + for (u32 i = 0; i < MAX_STALL_REASONS; i++) { + if (i >= count) + break; + + char name[STALL_REASON_NAME_LEN] = {}; + int ret = bpf_probe_read_user( + name, sizeof(name), + (void *)(names_base + (u64)i * STALL_REASON_NAME_LEN)); + if (ret != 0) + continue; + + bpf_map_update_elem(&stall_reasons, &i, name, BPF_ANY); + } + + u32 one = 1; + bpf_map_update_elem(&stall_map_loaded, &zero, &one, BPF_ANY); + return 0; +} + +SEC("usdt/parcagpu/cubin_loaded") +int BPF_USDT(handle_cubin_loaded, u64 cubin_crc, u64 cubin_ptr, + u64 cubin_size) { + if (cubin_size == 0 || cubin_size > MAX_CUBIN_SIZE) + return 0; + + struct cubin_event *evt = bpf_ringbuf_reserve(&events, sizeof(*evt), 0); + if (!evt) { + bump_stat(STAT_DROPS); + return 0; + } + + evt->event_type = EVENT_TYPE_CUBIN_LOADED; + evt->cubin_crc = cubin_crc; + evt->cubin_ptr = cubin_ptr; + evt->cubin_size = cubin_size; + bpf_ringbuf_submit(evt, 0); + return 0; +} + +SEC("usdt/parcagpu/cubin_unloaded") +int BPF_USDT(handle_cubin_unloaded, u64 cubin_crc) { + struct cubin_event *evt = bpf_ringbuf_reserve(&events, sizeof(*evt), 0); + if (!evt) { + bump_stat(STAT_DROPS); + return 0; + } + + evt->event_type = EVENT_TYPE_CUBIN_UNLOADED; + evt->cubin_crc = cubin_crc; + evt->cubin_ptr = 0; + evt->cubin_size = 0; + bpf_ringbuf_submit(evt, 0); + return 0; +} + +SEC("usdt/parcagpu/pc_sample_batch") +int BPF_USDT(handle_pc_sample_batch, u64 ptrs_base, u32 count) { + if (count > MAX_PC_BATCH_SIZE) + count = MAX_PC_BATCH_SIZE; + + for (u32 i = 0; i < MAX_PC_BATCH_SIZE; i++) { + if (i >= count) + break; + + // Read the i-th pointer from the array. + u64 rec_ptr = 0; + int ret = bpf_probe_read_user(&rec_ptr, sizeof(rec_ptr), + (void *)(ptrs_base + (u64)i * sizeof(u64))); + if (ret != 0 || rec_ptr == 0) + continue; + + // Chase the pointer to read the CUPTI PC data record. + struct cupti_pc_data rec = {}; + ret = bpf_probe_read_user(&rec, sizeof(rec), (void *)rec_ptr); + if (ret != 0) + continue; + + // Reserve ring buffer space for the event. + struct pc_sample_event *evt = + bpf_ringbuf_reserve(&events, sizeof(*evt), 0); + if (!evt) { + bump_stat(STAT_DROPS); + continue; + } + + evt->event_type = EVENT_TYPE_PC_SAMPLE; + evt->cubin_crc = rec.cubin_crc; + evt->pc_offset = rec.pc_offset; + evt->function_index = rec.function_index; + + // Read correlationId if the struct is large enough (CUDA 12.4+). + // It sits right after the stallReason pointer at offset 56. + evt->correlation_id = 0; + if (rec.size > CUPTI_PC_DATA_BASE_SIZE) { + u32 corr = 0; + bpf_probe_read_user(&corr, sizeof(corr), + (void *)(rec_ptr + CUPTI_PC_DATA_BASE_SIZE)); + evt->correlation_id = corr; + } + + // Chase the function name pointer. + if (rec.function_name_ptr) { + bpf_probe_read_user_str(evt->function_name, sizeof(evt->function_name), + (void *)rec.function_name_ptr); + } else { + evt->function_name[0] = '\0'; + } + + // Chase the stall reason pointer. + u32 sr_count = rec.stall_reason_count; + if (sr_count > MAX_STALL_REASONS) + sr_count = MAX_STALL_REASONS; + evt->stall_reason_count = sr_count; + + if (rec.stall_reason_ptr && sr_count > 0) { + bpf_probe_read_user(evt->stall_reasons, + sr_count * sizeof(struct cupti_stall_reason), + (void *)rec.stall_reason_ptr); + } + + bpf_ringbuf_submit(evt, 0); + } + + return 0; +} + +SEC("usdt/parcagpu/error") +int BPF_USDT(handle_error, s32 code, u64 message_ptr, u64 component_ptr) { + struct error_event *evt = bpf_ringbuf_reserve(&events, sizeof(*evt), 0); + if (!evt) { + bump_stat(STAT_DROPS); + return 0; + } + + evt->event_type = EVENT_TYPE_ERROR; + evt->error_code = code; + evt->message[0] = '\0'; + evt->component[0] = '\0'; + if (message_ptr) + bpf_probe_read_user_str(evt->message, sizeof(evt->message), + (void *)message_ptr); + if (component_ptr) + bpf_probe_read_user_str(evt->component, sizeof(evt->component), + (void *)component_ptr); + bpf_ringbuf_submit(evt, 0); + return 0; +} + char LICENSE[] SEC("license") = "GPL"; diff --git a/test/bpf/cupti_activity_bpf.h b/test/bpf/cupti_activity_bpf.h deleted file mode 100644 index 38086c3..0000000 --- a/test/bpf/cupti_activity_bpf.h +++ /dev/null @@ -1,51 +0,0 @@ -// Minimal CUPTI activity buffer definitions for BPF programs. -// Stripped-down version of CUpti_ActivityKernel5 with exact layout -// matching the CUDA 12.x CUPTI headers (sizeof = 160, packed aligned 8). -// -// Only the fields needed by BPF are named; the rest are padding. -// Field offsets verified against the real struct with offsetof(). - -#ifndef CUPTI_ACTIVITY_BPF_H -#define CUPTI_ACTIVITY_BPF_H - -// CUpti_ActivityKind values we care about -#define CUPTI_ACTIVITY_KIND_KERNEL 3 -#define CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL 10 - -// Matches the layout of CUpti_ActivityKernel5 exactly. -// PACKED_ALIGNMENT in CUPTI is: __attribute__((__packed__)) __attribute__((aligned(8))) -struct cupti_activity_kernel5 { - u32 kind; // offset 0 - CUpti_ActivityKind - u8 _pad1[12]; // offset 4 - cacheConfig, sharedMemConfig, - // registersPerThread, - // partitionedGlobalCache x2 - u64 start; // offset 16 - kernel start timestamp (ns) - u64 end; // offset 24 - kernel end timestamp (ns) - u64 completed; // offset 32 - completion timestamp - u32 device_id; // offset 40 - u32 context_id; // offset 44 - u32 stream_id; // offset 48 - u8 _pad2[40]; // offset 52 - gridX/Y/Z, blockX/Y/Z, - // staticSharedMemory, - // dynamicSharedMemory, - // localMemoryPerThread, - // localMemoryTotal - u32 correlation_id; // offset 92 - s64 grid_id; // offset 96 - u64 name_ptr; // offset 104 - const char* (user-space pointer) - u64 _reserved0; // offset 112 - u64 queued; // offset 120 - u64 submitted; // offset 128 - u8 _pad3[8]; // offset 136 - launchType, isSharedMemoryCarveout, - // sharedMemoryCarveoutRequested, - // padding, sharedMemoryExecuted - u64 graph_node_id; // offset 144 - u32 shmem_limit_cfg; // offset 152 - CUpti_FuncShmemLimitConfig - u32 graph_id; // offset 156 -} __attribute__((__packed__)) __attribute__((aligned(8))); - -// Verify expected size at compile time -_Static_assert(sizeof(struct cupti_activity_kernel5) == 160, - "cupti_activity_kernel5 size mismatch"); - -#endif // CUPTI_ACTIVITY_BPF_H diff --git a/test/bpf/go.mod b/test/bpf/go.mod index ae3ac5b..b3ab1bf 100644 --- a/test/bpf/go.mod +++ b/test/bpf/go.mod @@ -1,25 +1,12 @@ -module github.com/parca-dev/batch-kernel/test/bpf +module parcagpu/test/bpf go 1.25.1 require ( - github.com/cilium/ebpf v0.20.0 - go.opentelemetry.io/ebpf-profiler v0.0.0-00010101000000-000000000000 + github.com/cilium/ebpf v0.21.0 + github.com/gnurizen/sass-table v0.0.1 + github.com/parca-dev/usdt v0.0.2 golang.org/x/sys v0.41.0 ) -require ( - github.com/cespare/xxhash/v2 v2.3.0 // indirect - github.com/go-logr/logr v1.4.3 // indirect - github.com/go-logr/stdr v1.2.2 // indirect - github.com/google/uuid v1.6.0 // indirect - github.com/klauspost/cpuid/v2 v2.2.8 // indirect - github.com/minio/sha256-simd v1.0.1 // indirect - github.com/sirupsen/logrus v1.9.3 // indirect - go.opentelemetry.io/auto/sdk v1.2.1 // indirect - go.opentelemetry.io/otel v1.39.0 // indirect - go.opentelemetry.io/otel/metric v1.39.0 // indirect - go.opentelemetry.io/otel/trace v1.39.0 // indirect -) - -replace go.opentelemetry.io/ebpf-profiler => ../../vendor/opentelemetry-ebpf-profiler +require github.com/rogpeppe/go-internal v1.14.1 // indirect diff --git a/test/bpf/go.sum b/test/bpf/go.sum index 6530420..f1dfcbd 100644 --- a/test/bpf/go.sum +++ b/test/bpf/go.sum @@ -1,66 +1,30 @@ -github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs= -github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= -github.com/cilium/ebpf v0.20.0 h1:atwWj9d3NffHyPZzVlx3hmw1on5CLe9eljR8VuHTwhM= -github.com/cilium/ebpf v0.20.0/go.mod h1:pzLjFymM+uZPLk/IXZUL63xdx5VXEo+enTzxkZXdycw= -github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= -github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= -github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= -github.com/go-logr/logr v1.2.2/go.mod h1:jdQByPbusPIv2/zmleS9BjJVeZ6kBagPoEUsqbVz/1A= -github.com/go-logr/logr v1.4.3 h1:CjnDlHq8ikf6E492q6eKboGOC0T8CDaOvkHCIg8idEI= -github.com/go-logr/logr v1.4.3/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY= -github.com/go-logr/stdr v1.2.2 h1:hSWxHoqTgW2S2qGc0LTAI563KZ5YKYRhT3MFKZMbjag= -github.com/go-logr/stdr v1.2.2/go.mod h1:mMo/vtBO5dYbehREoey6XUKy/eSumjCCveDpRre4VKE= +github.com/cilium/ebpf v0.21.0 h1:4dpx1J/B/1apeTmWBH5BkVLayHTkFrMovVPnHEk+l3k= +github.com/cilium/ebpf v0.21.0/go.mod h1:1kHKv6Kvh5a6TePP5vvvoMa1bclRyzUXELSs272fmIQ= +github.com/gnurizen/sass-table v0.0.1 h1:LvV2GIUAIv6ZXLzr6TbC51Domg/5iVihhc7JXVGZXe0= +github.com/gnurizen/sass-table v0.0.1/go.mod h1:epXL4VyR6Yuec1rY5RV9awBdWm6ibGUKRB5zajgyUjo= github.com/go-quicktest/qt v1.101.1-0.20240301121107-c6c8733fa1e6 h1:teYtXy9B7y5lHTp8V9KPxpYRAVA7dozigQcMiBust1s= github.com/go-quicktest/qt v1.101.1-0.20240301121107-c6c8733fa1e6/go.mod h1:p4lGIVX+8Wa6ZPNDvqcxq36XpUDLh42FLetFU7odllI= github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8= github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU= -github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= -github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= github.com/josharian/native v1.1.0 h1:uuaP0hAbW7Y4l0ZRQ6C9zfb7Mg1mbFKry/xzDAfmtLA= github.com/josharian/native v1.1.0/go.mod h1:7X/raswPFr05uY3HiLlYeyQntB6OO7E/d2Cu7qoaN2w= -github.com/jsimonetti/rtnetlink/v2 v2.0.3 h1:Jcp7GTnTPepoUAJ9+LhTa7ZiebvNS56T1GtlEUaPNFE= -github.com/jsimonetti/rtnetlink/v2 v2.0.3/go.mod h1:atIkksp/9fqtf6rpAw45JnttnP2gtuH9X88WPfWfS9A= -github.com/klauspost/cpuid/v2 v2.2.8 h1:+StwCXwm9PdpiEkPyzBXIy+M9KUb4ODm0Zarf1kS5BM= -github.com/klauspost/cpuid/v2 v2.2.8/go.mod h1:Lcz8mBdAVJIBVzewtcLocK12l3Y+JytZYpaMropDUws= +github.com/jsimonetti/rtnetlink/v2 v2.0.1 h1:xda7qaHDSVOsADNouv7ukSuicKZO7GgVUCXxpaIEIlM= +github.com/jsimonetti/rtnetlink/v2 v2.0.1/go.mod h1:7MoNYNbb3UaDHtF8udiJo/RH6VsTKP1pqKLUTVCvToE= github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE= github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk= github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE= github.com/mdlayher/netlink v1.7.2 h1:/UtM3ofJap7Vl4QWCPDGXY8d3GIY2UGSDbK+QWmY8/g= github.com/mdlayher/netlink v1.7.2/go.mod h1:xraEF7uJbxLhc5fpHL4cPe221LI2bdttWlU+ZGLfQSw= -github.com/mdlayher/socket v0.4.1 h1:eM9y2/jlbs1M615oshPQOHZzj6R6wMT7bX5NPiQvn2U= -github.com/mdlayher/socket v0.4.1/go.mod h1:cAqeGjoufqdxWkD7DkpyS+wcefOtmu5OQ8KuoJGIReA= -github.com/minio/sha256-simd v1.0.1 h1:6kaan5IFmwTNynnKKpDHe6FWHohJOHhCPchzK49dzMM= -github.com/minio/sha256-simd v1.0.1/go.mod h1:Pz6AKMiUdngCLpeTL/RJY1M9rUuPMYujV5xJjtbRSN8= -github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= -github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/mdlayher/socket v0.5.1 h1:VZaqt6RkGkt2OE9l3GcC6nZkqD3xKeQLyfleW/uBcos= +github.com/mdlayher/socket v0.5.1/go.mod h1:TjPLHI1UgwEv5J1B5q0zTZq12A/6H7nKmtTanQE37IQ= +github.com/parca-dev/usdt v0.0.2 h1:bpKQycQ++zV8pwkMaJSxZS07XnEXqO3rkHcLYFJDTl4= +github.com/parca-dev/usdt v0.0.2/go.mod h1:bjh3OTksk+pyP7WsHWlRKWaMSJTUr0gx0piZ/tAv6/w= github.com/rogpeppe/go-internal v1.14.1 h1:UQB4HGPB6osV0SQTLymcB4TgvyWu6ZyliaW0tI/otEQ= github.com/rogpeppe/go-internal v1.14.1/go.mod h1:MaRKkUm5W0goXpeCfT7UZI6fk/L7L7so1lCWt35ZSgc= -github.com/sirupsen/logrus v1.9.3 h1:dueUQJ1C2q9oE3F7wvmSGAaVtTmUizReu6fjN8uqzbQ= -github.com/sirupsen/logrus v1.9.3/go.mod h1:naHLuLoDiP4jHNo9R0sCBMtWGeIprob74mVsIT4qYEQ= -github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= -github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= -github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U= -github.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U= -go.opentelemetry.io/auto/sdk v1.2.1 h1:jXsnJ4Lmnqd11kwkBV2LgLoFMZKizbCi5fNZ/ipaZ64= -go.opentelemetry.io/auto/sdk v1.2.1/go.mod h1:KRTj+aOaElaLi+wW1kO/DZRXwkF4C5xPbEe3ZiIhN7Y= -go.opentelemetry.io/otel v1.39.0 h1:8yPrr/S0ND9QEfTfdP9V+SiwT4E0G7Y5MO7p85nis48= -go.opentelemetry.io/otel v1.39.0/go.mod h1:kLlFTywNWrFyEdH0oj2xK0bFYZtHRYUdv1NklR/tgc8= -go.opentelemetry.io/otel/metric v1.39.0 h1:d1UzonvEZriVfpNKEVmHXbdf909uGTOQjA0HF0Ls5Q0= -go.opentelemetry.io/otel/metric v1.39.0/go.mod h1:jrZSWL33sD7bBxg1xjrqyDjnuzTUB0x1nBERXd7Ftcs= -go.opentelemetry.io/otel/trace v1.39.0 h1:2d2vfpEDmCJ5zVYz7ijaJdOF59xLomrvj7bjt6/qCJI= -go.opentelemetry.io/otel/trace v1.39.0/go.mod h1:88w4/PnZSazkGzz/w84VHpQafiU4EtqqlVdxWy+rNOA= -golang.org/x/arch v0.20.0 h1:dx1zTU0MAE98U+TQ8BLl7XsJbgze2WnNKF/8tGp/Q6c= -golang.org/x/arch v0.20.0/go.mod h1:bdwinDaKcfZUGpH09BB7ZmOfhalA8lQdzl62l8gGWsk= golang.org/x/net v0.46.0 h1:giFlY12I07fugqwPuWJi68oOnpfqFnJIJzaIIm2JVV4= golang.org/x/net v0.46.0/go.mod h1:Q9BGdFy1y4nkUwiLvT5qtyhAnEHgnQ/zd8PfU6nc210= golang.org/x/sync v0.17.0 h1:l60nONMj9l5drqw6jlhIELNv9I0A4OFgRsG9k2oT9Ug= golang.org/x/sync v0.17.0/go.mod h1:9KTHXmSnoGruLpwFjVSX0lNNA75CykiMECbovNTZqGI= -golang.org/x/sys v0.0.0-20220715151400-c0bba94af5f8/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.41.0 h1:Ivj+2Cp/ylzLiEU89QhWblYnOE9zerudt9Ftecq2C6k= golang.org/x/sys v0.41.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks= -gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= -gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= -gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= -gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= diff --git a/test/bpf/main.go b/test/bpf/main.go index f6e9738..681e4a0 100644 --- a/test/bpf/main.go +++ b/test/bpf/main.go @@ -1,10 +1,7 @@ // Test program that loads the activity_parser BPF program, attaches it -// to the parcagpu:activity_batch USDT probe in the target shared library, -// and logs kernel activities received through the ring buffer. -// -// The USDT probe location and argument specs are parsed from the ELF -// .note.stapsdt section using pfelf, then populated into the BPF -// __bpf_usdt_specs map so bpf_usdt_arg() reads the correct registers. +// to parcagpu USDT probes in the target shared library, and logs kernel +// activities received through the ring buffer. Also captures cubin modules +// and resolves PC addresses to source lines using llvm-dwarfdump. // // Usage: // @@ -14,30 +11,51 @@ package main import ( + "bufio" "bytes" + "debug/elf" "encoding/binary" "errors" "flag" "fmt" + "io" "log" "os" + "os/exec" "os/signal" "path/filepath" + "sort" + "strconv" + "strings" + "sync" "syscall" "time" "unsafe" - "github.com/cilium/ebpf" + ebpf2 "github.com/cilium/ebpf" "github.com/cilium/ebpf/link" "github.com/cilium/ebpf/ringbuf" - "go.opentelemetry.io/ebpf-profiler/libpf/pfelf" + "github.com/parca-dev/usdt" "golang.org/x/sys/unix" + + sasstable "github.com/gnurizen/sass-table" ) -//go:generate bpf2go -cc clang -cflags "-O2 -g -Wall -target bpf -D__TARGET_ARCH_x86 -D__x86_64__ -I../../vendor/opentelemetry-ebpf-profiler/support/ebpf" activityParser activity_parser.bpf.c +//go:generate go run github.com/cilium/ebpf/cmd/bpf2go -target $GOARCH -cflags "-I../../ebpf -I$USDT_HEADERS" activityParser activity_parser.bpf.c + +// Event type tags — must match BPF #defines. +const ( + eventTypeKernel = 1 + eventTypeCubinLoaded = 2 + eventTypeCubinUnloaded = 3 + eventTypePCSample = 4 + eventTypeError = 5 +) // KernelEvent matches struct kernel_event in the BPF program. type KernelEvent struct { + EventType uint32 + _ uint32 Start uint64 End uint64 CorrelationID uint32 @@ -48,6 +66,41 @@ type KernelEvent struct { Name [128]byte } +// CubinEvent matches struct cubin_event in the BPF program. +type CubinEvent struct { + EventType uint32 + _ uint32 + CubinCRC uint64 + CubinPtr uint64 + CubinSize uint64 +} + +// StallReason matches struct cupti_stall_reason in the BPF program. +type StallReason struct { + Index uint32 + Samples uint32 +} + +// ErrorEvent matches struct error_event in the BPF program. +type ErrorEvent struct { + EventType uint32 + ErrorCode int32 + Message [256]byte + Component [64]byte +} + +// PCSampleEvent matches struct pc_sample_event in the BPF program. +type PCSampleEvent struct { + EventType uint32 + StallReasonCount uint32 + CubinCRC uint64 + PCOffset uint64 + FunctionIndex uint32 + CorrelationID uint32 + FunctionName [128]byte + StallReasons [64]StallReason +} + const ( statBatches = 0 statActivities = 1 @@ -55,6 +108,277 @@ const ( statDrops = 3 ) +// lineEntry is a single address→source mapping from the DWARF line table. +type lineEntry struct { + addr uint64 + file string + line int +} + +// cubinStore holds loaded cubins and their parsed line tables. +type cubinStore struct { + mu sync.RWMutex + cubins map[uint64]*cubinInfo // keyed by CRC + pid int +} + +// textSection holds one .text._Zfuncname section from the cubin ELF. +type textSection struct { + name string + data []byte +} + +type cubinInfo struct { + crc uint64 + size uint64 + lines []lineEntry // sorted by addr + files []string // file table from line table header + archSM int // SM version from ELF e_flags (e.g. 121) + texts []textSection // .text sections for instruction decoding + tmpPath string // temp file for llvm-dwarfdump +} + +func newCubinStore(pid int) *cubinStore { + return &cubinStore{ + cubins: make(map[uint64]*cubinInfo), + pid: pid, + } +} + +func (cs *cubinStore) load(crc, ptr, size uint64) { + cs.mu.Lock() + defer cs.mu.Unlock() + + if _, ok := cs.cubins[crc]; ok { + return // already loaded + } + + data, err := readProcessMemory(cs.pid, ptr, size) + if err != nil { + log.Printf(" [CUBIN] failed to read cubin 0x%x (%d bytes) from pid %d: %v", + crc, size, cs.pid, err) + return + } + + info := &cubinInfo{crc: crc, size: size} + + // Write to temp file for llvm-dwarfdump. + tmp, err := os.CreateTemp("", fmt.Sprintf("cubin_%x_*.elf", crc)) + if err != nil { + log.Printf(" [CUBIN] 0x%x loaded (%d bytes), no temp file: %v", crc, size, err) + cs.cubins[crc] = info + return + } + tmp.Write(data) + tmp.Close() + info.tmpPath = tmp.Name() + + // Parse line table with llvm-dwarfdump. + lines, files, err := parseLinesWithDwarfdump(info.tmpPath) + if err != nil { + log.Printf(" [CUBIN] 0x%x loaded (%d bytes), no line info: %v", crc, size, err) + } else { + info.lines = lines + info.files = files + log.Printf(" [CUBIN] 0x%x loaded (%d bytes), %d line entries, %d files", + crc, size, len(lines), len(files)) + } + + // Parse ELF to extract SM version and .text sections for SASS decoding. + archSM, texts := parseCubinELF(data) + info.archSM = archSM + info.texts = texts + if archSM > 0 { + log.Printf(" [CUBIN] 0x%x SM%d, %d text sections", crc, archSM, len(texts)) + } + + cs.cubins[crc] = info +} + +func (cs *cubinStore) unload(crc uint64) { + cs.mu.Lock() + defer cs.mu.Unlock() + if info, ok := cs.cubins[crc]; ok { + if info.tmpPath != "" { + os.Remove(info.tmpPath) + } + delete(cs.cubins, crc) + } + log.Printf(" [CUBIN] 0x%x unloaded", crc) +} + +func (cs *cubinStore) cleanup() { + cs.mu.Lock() + defer cs.mu.Unlock() + for _, info := range cs.cubins { + if info.tmpPath != "" { + os.Remove(info.tmpPath) + } + } +} + +// resolveInstruction looks up the SASS mnemonic at a PC offset. +// Uses the sass-table opcode decoder first, falls back to nvdisasm cache. +func (cs *cubinStore) resolveInstruction(cubinCRC uint64, pcOffset uint64) string { + cs.mu.RLock() + info, ok := cs.cubins[cubinCRC] + cs.mu.RUnlock() + if !ok { + return "" + } + + if info.archSM == 0 || len(info.texts) == 0 { + return "" + } + + // pcOffset is function-relative. Try all text sections — the offset + // should only produce a valid decode in the correct one. + for _, ts := range info.texts { + if int(pcOffset)+16 <= len(ts.data) { + m := sasstable.DecodeMnemonicFromSlice(info.archSM, ts.data[pcOffset:]) + if m != "" { + return m + } + } + } + return "" +} + +// parseCubinELF extracts the SM version and .text section data from a cubin ELF. +func parseCubinELF(data []byte) (archSM int, texts []textSection) { + f, err := elf.NewFile(bytes.NewReader(data)) + if err != nil { + return 0, nil + } + defer f.Close() + + // Go's debug/elf doesn't expose e_flags. Read it directly from the ELF header. + // For ELF64: e_flags is at offset 48, 4 bytes little-endian. + if len(data) >= 52 { + flags := binary.LittleEndian.Uint32(data[48:52]) + archSM = int((flags >> 8) & 0xFF) + } + + for _, s := range f.Sections { + if s.Type == elf.SHT_PROGBITS && s.Flags&elf.SHF_EXECINSTR != 0 && + strings.HasPrefix(s.Name, ".text") { + d, err := s.Data() + if err != nil { + continue + } + texts = append(texts, textSection{name: s.Name, data: d}) + } + } + return archSM, texts +} + +// resolvePC looks up a PC offset in a cubin's line table. +func (cs *cubinStore) resolvePC(cubinCRC uint64, pcOffset uint64) (file string, line int) { + cs.mu.RLock() + info, ok := cs.cubins[cubinCRC] + cs.mu.RUnlock() + if !ok || len(info.lines) == 0 { + return + } + + // Binary search for the largest address <= pcOffset. + i := sort.Search(len(info.lines), func(i int) bool { + return info.lines[i].addr > pcOffset + }) + if i == 0 { + return + } + e := info.lines[i-1] + return e.file, e.line +} + +// parseLinesWithDwarfdump runs llvm-dwarfdump --debug-line on the cubin ELF +// and parses the output into a sorted line table. +func parseLinesWithDwarfdump(path string) ([]lineEntry, []string, error) { + cmd := exec.Command("llvm-dwarfdump", "--debug-line", path) + out, err := cmd.Output() + if err != nil { + return nil, nil, fmt.Errorf("llvm-dwarfdump: %w", err) + } + + var entries []lineEntry + files := map[string]bool{} + fileList := []string{} + + scanner := bufio.NewScanner(bytes.NewReader(out)) + for scanner.Scan() { + line := scanner.Text() + + // Parse file table entries: file_names[ N]: + // name: "foo.cu" + if strings.HasPrefix(line, " name: ") { + name := strings.Trim(strings.TrimPrefix(line, " name: "), "\"") + if !files[name] { + files[name] = true + fileList = append(fileList, name) + } + continue + } + + // Parse line table rows: 0xADDR LINE COL FILE ... + if !strings.HasPrefix(line, "0x") { + continue + } + + fields := strings.Fields(line) + if len(fields) < 4 { + continue + } + + addr, err := strconv.ParseUint(strings.TrimPrefix(fields[0], "0x"), 16, 64) + if err != nil { + continue + } + + lineNum, err := strconv.Atoi(fields[1]) + if err != nil { + continue + } + + // fields[3] is the file index (1-based) + fileIdx, err := strconv.Atoi(fields[3]) + if err != nil || fileIdx < 1 || fileIdx > len(fileList) { + continue + } + + entries = append(entries, lineEntry{ + addr: addr, + file: fileList[fileIdx-1], + line: lineNum, + }) + } + + if len(entries) == 0 { + return nil, nil, fmt.Errorf("no line entries found") + } + + sort.Slice(entries, func(i, j int) bool { + return entries[i].addr < entries[j].addr + }) + + return entries, fileList, nil +} + +func readProcessMemory(pid int, addr, size uint64) ([]byte, error) { + f, err := os.Open(fmt.Sprintf("/proc/%d/mem", pid)) + if err != nil { + return nil, err + } + defer f.Close() + + data := make([]byte, size) + _, err = f.ReadAt(data, int64(addr)) + if err != nil && !errors.Is(err, io.EOF) { + return nil, err + } + return data, nil +} + func main() { pid := flag.Int("pid", 0, "PID of the target process") libPath := flag.String("lib", "", "Path to the shared library containing the USDT probe") @@ -83,7 +407,7 @@ func main() { // Load pre-compiled BPF objects. objs := activityParserObjects{} if err := loadActivityParserObjects(&objs, nil); err != nil { - var ve *ebpf.VerifierError + var ve *ebpf2.VerifierError if errors.As(err, &ve) { log.Fatalf("Verifier error loading BPF objects:\n%+v", ve) } @@ -92,67 +416,69 @@ func main() { defer objs.Close() // Parse USDT probes from the shared library's .note.stapsdt section. - ef, err := pfelf.Open(realLib) - if err != nil { - log.Fatalf("Opening ELF %s: %v", realLib, err) - } - defer ef.Close() - - if err := ef.LoadSections(); err != nil { - log.Fatalf("Loading ELF sections: %v", err) - } - - probes, err := ef.ParseUSDTProbes() + probes, err := usdt.ParseProbesFromFile(realLib) if err != nil { - log.Fatalf("Parsing USDT probes: %v", err) + log.Fatalf("Parsing USDT probes from %s: %v", realLib, err) } - // Find parcagpu:activity_batch probe(s) and attach uprobe at each site. + // Find USDT probes and attach uprobes at each site. ex, err := link.OpenExecutable(realLib) if err != nil { log.Fatalf("Opening executable %s: %v", realLib, err) } + type probeTarget struct { + name string + handler *ebpf2.Program + } + targets := []probeTarget{ + {"activity_batch", objs.HandleActivityBatch}, + {"stall_reason_map", objs.HandleStallReasonMap}, + {"cubin_loaded", objs.HandleCubinLoaded}, + {"cubin_unloaded", objs.HandleCubinUnloaded}, + {"pc_sample_batch", objs.HandlePcSampleBatch}, + {"error", objs.HandleError}, + } + var links []link.Link var specID uint32 - for _, probe := range probes { - if probe.Provider != "parcagpu" || probe.Name != "activity_batch" { - continue - } + for _, t := range targets { + for _, probe := range probes { + if probe.Provider != "parcagpu" || probe.Name != t.name { + continue + } - // Parse the stapsdt argument spec into a bpf_usdt_spec. - spec, err := pfelf.ParseUSDTArguments(probe.Arguments) - if err != nil { - log.Fatalf("Parsing USDT args %q: %v", probe.Arguments, err) - } + spec, err := usdt.ParseUSDTArguments(probe.Arguments) + if err != nil { + log.Fatalf("Parsing USDT args %q: %v", probe.Arguments, err) + } - // Store spec in the BPF map so bpf_usdt_arg() can look it up. - specBytes := pfelf.USDTSpecToBytes(spec) - if err := objs.BpfUsdtSpecs.Put(specID, specBytes); err != nil { - log.Fatalf("Populating USDT spec map: %v", err) - } + specBytes := usdt.SpecToBytes(spec) + if err := objs.BpfUsdtSpecs.Put(specID, specBytes); err != nil { + log.Fatalf("Populating USDT spec map: %v", err) + } - // Cookie: spec_id in high 32 bits (bpf_usdt_arg reads it via bpf_get_attach_cookie). - cookie := uint64(specID) << 32 + cookie := uint64(specID) << 32 - log.Printf("USDT probe parcagpu:activity_batch at offset 0x%x, args=%q, spec_id=%d", - probe.Location, probe.Arguments, specID) + log.Printf("USDT probe parcagpu:%s at offset 0x%x, args=%q, spec_id=%d", + t.name, probe.Location, probe.Arguments, specID) - up, err := ex.Uprobe("activity_batch", objs.HandleActivityBatch, &link.UprobeOptions{ - Address: probe.Location, - PID: *pid, - Cookie: cookie, - RefCtrOffset: probe.SemaphoreOffset, - }) - if err != nil { - log.Fatalf("Attaching uprobe at offset 0x%x: %v", probe.Location, err) + up, err := ex.Uprobe(t.name, t.handler, &link.UprobeOptions{ + Address: probe.Location, + PID: *pid, + Cookie: cookie, + RefCtrOffset: probe.SemaphoreOffset, + }) + if err != nil { + log.Fatalf("Attaching uprobe for %s at offset 0x%x: %v", t.name, probe.Location, err) + } + links = append(links, up) + specID++ } - links = append(links, up) - specID++ } if len(links) == 0 { - log.Fatalf("No parcagpu:activity_batch USDT probes found in %s", realLib) + log.Fatalf("No parcagpu USDT probes found in %s", realLib) } defer func() { for _, l := range links { @@ -160,6 +486,12 @@ func main() { } }() + cubins := newCubinStore(*pid) + defer cubins.cleanup() + + // Stall reason index → name cache, populated lazily from BPF map. + stallReasonNames := map[uint32]string{} + // Open ring buffer reader. rd, err := ringbuf.NewReader(objs.Events) if err != nil { @@ -171,7 +503,6 @@ func main() { sig := make(chan os.Signal, 1) signal.Notify(sig, syscall.SIGINT, syscall.SIGTERM) - // Also watch for the target process to exit. done := make(chan struct{}) go func() { for { @@ -193,9 +524,10 @@ func main() { }() log.Printf("Attached %d USDT probe(s) in %s (PID %d)", len(links), realLib, *pid) - log.Printf("Waiting for kernel activity events...") + log.Printf("Waiting for events...") var eventCount uint64 + var pcSampleCount uint64 ticker := time.NewTicker(2 * time.Second) defer ticker.Stop() @@ -205,7 +537,6 @@ func main() { } }() - var event KernelEvent for { record, err := rd.Read() if err != nil { @@ -216,23 +547,113 @@ func main() { continue } - if err := binary.Read(bytes.NewBuffer(record.RawSample), binary.LittleEndian, &event); err != nil { - log.Printf("Parsing event: %v", err) + raw := record.RawSample + if len(raw) < 4 { continue } - eventCount++ - if *verbose { - name := cString(event.Name[:]) - duration := event.End - event.Start - fmt.Printf("kernel: name=%-40s corr=%-6d dev=%d stream=%d graph=%-3d duration=%dns\n", - name, event.CorrelationID, event.DeviceID, event.StreamID, event.GraphID, duration) + eventType := binary.LittleEndian.Uint32(raw[:4]) + + switch eventType { + case eventTypeKernel: + var event KernelEvent + if err := binary.Read(bytes.NewReader(raw), binary.LittleEndian, &event); err != nil { + log.Printf("Parsing kernel event: %v", err) + continue + } + eventCount++ + if *verbose { + name := cString(event.Name[:]) + duration := event.End - event.Start + fmt.Printf("kernel: name=%-40s corr=%-6d dev=%d stream=%d graph=%-3d duration=%dns\n", + name, event.CorrelationID, event.DeviceID, event.StreamID, event.GraphID, duration) + } + + case eventTypeCubinLoaded: + var event CubinEvent + if err := binary.Read(bytes.NewReader(raw), binary.LittleEndian, &event); err != nil { + log.Printf("Parsing cubin event: %v", err) + continue + } + cubins.load(event.CubinCRC, event.CubinPtr, event.CubinSize) + + case eventTypeCubinUnloaded: + var event CubinEvent + if err := binary.Read(bytes.NewReader(raw), binary.LittleEndian, &event); err != nil { + log.Printf("Parsing cubin event: %v", err) + continue + } + cubins.unload(event.CubinCRC) + + case eventTypeError: + var event ErrorEvent + if err := binary.Read(bytes.NewReader(raw), binary.LittleEndian, &event); err != nil { + log.Printf("Parsing error event: %v", err) + continue + } + msg := cString(event.Message[:]) + comp := cString(event.Component[:]) + log.Printf("ERROR [%s] code=%d: %s", comp, event.ErrorCode, msg) + + case eventTypePCSample: + var event PCSampleEvent + if err := binary.Read(bytes.NewReader(raw), binary.LittleEndian, &event); err != nil { + log.Printf("Parsing pc sample event: %v", err) + continue + } + pcSampleCount++ + + // Lazily populate stall reason name cache. + if len(stallReasonNames) == 0 { + for i := uint32(0); i < 64; i++ { + var name [64]byte + if err := objs.StallReasons.Lookup(&i, &name); err != nil { + continue + } + s := cString(name[:]) + if s != "" { + stallReasonNames[i] = s + } + } + } + + name := cString(event.FunctionName[:]) + file, line := cubins.resolvePC(event.CubinCRC, event.PCOffset) + insn := cubins.resolveInstruction(event.CubinCRC, event.PCOffset) + + src := "" + if file != "" { + src = fmt.Sprintf(" %s:%d", file, line) + } + insnStr := "" + if insn != "" { + insnStr = fmt.Sprintf(" [%s]", insn) + } + corrStr := "" + if event.CorrelationID != 0 { + corrStr = fmt.Sprintf(" corr=%d", event.CorrelationID) + } + fmt.Printf("pc_sample: %s pc=0x%04x%s%s%s\n", name, event.PCOffset, src, insnStr, corrStr) + for i := uint32(0); i < event.StallReasonCount; i++ { + sr := event.StallReasons[i] + if sr.Samples == 0 { + continue + } + srName := stallReasonNames[sr.Index] + if srName == "" { + srName = fmt.Sprintf("reason[%d]", sr.Index) + } + fmt.Printf(" %s = %d\n", srName, sr.Samples) + } } } fmt.Println() log.Printf("Final stats:") printStats(&objs, eventCount) + log.Printf(" pc_samples=%d", pcSampleCount) + printStallReasonMap(&objs) + printCubins(cubins) } func printStats(objs *activityParserObjects, eventCount uint64) { @@ -251,6 +672,53 @@ func printStats(objs *activityParserObjects, eventCount uint64) { batches, activities, kernels, eventCount, drops) } +func printStallReasonMap(objs *activityParserObjects) { + var loaded uint32 + loadedKey := uint32(0) + if err := objs.StallMapLoaded.Lookup(&loadedKey, &loaded); err != nil || loaded == 0 { + log.Printf(" stall reason map: not received") + return + } + + log.Printf(" stall reason map:") + for i := uint32(0); i < 64; i++ { + var name [64]byte + if err := objs.StallReasons.Lookup(&i, &name); err != nil { + continue + } + s := cString(name[:]) + if s == "" { + continue + } + log.Printf(" [%2d] %s", i, s) + } +} + +func printCubins(cs *cubinStore) { + cs.mu.RLock() + defer cs.mu.RUnlock() + + if len(cs.cubins) == 0 { + log.Printf(" cubins: none loaded") + return + } + + log.Printf(" cubins loaded: %d", len(cs.cubins)) + for crc, info := range cs.cubins { + log.Printf(" crc=0x%x size=%d lines=%d files=%v", + crc, info.size, len(info.lines), info.files) + + // Print first 10 line entries as demo. + for i, e := range info.lines { + if i >= 10 { + log.Printf(" ... and %d more entries", len(info.lines)-10) + break + } + log.Printf(" 0x%04x -> %s:%d", e.addr, e.file, e.line) + } + } +} + func raiseMemlock() error { return unix.Setrlimit(unix.RLIMIT_MEMLOCK, &unix.Rlimit{ Cur: unix.RLIM_INFINITY, @@ -265,5 +733,7 @@ func cString(b []byte) string { return string(b) } -// Ensure KernelEvent matches the BPF struct size. -var _ = [1]struct{}{}[unsafe.Sizeof(KernelEvent{})-168] +// Ensure struct sizes match BPF. +var _ = [1]struct{}{}[unsafe.Sizeof(KernelEvent{})-176] +var _ = [1]struct{}{}[unsafe.Sizeof(CubinEvent{})-32] +var _ = [1]struct{}{}[unsafe.Sizeof(PCSampleEvent{})-672] diff --git a/test/bpf/usdt_compat.h b/test/bpf/usdt_compat.h index ca5a15e..fc3793e 100644 --- a/test/bpf/usdt_compat.h +++ b/test/bpf/usdt_compat.h @@ -1,29 +1,25 @@ -// Compatibility shim for including the otel-ebpf-profiler usdt.h and -// usdt_args.h from our vmlinux.h-based BPF program. +// Compatibility shim for including parca-dev/usdt's usdt_args.h from our +// vmlinux.h-based BPF program. // // Must be included AFTER vmlinux.h + bpf_helpers.h but BEFORE usdt_args.h. -// Defines the include guards for bpfdefs.h and types.h so those vendor -// headers (and their deep dependency trees) are skipped entirely. +// Provides macros that usdt_args.h expects (normally from kernel.h) and +// bridges bpf2go architecture defines to standard compiler builtins. #ifndef USDT_COMPAT_H #define USDT_COMPAT_H -// Skip vendor bpfdefs.h and types.h — we already have everything from vmlinux.h. -#define OPTI_BPFDEFS_H -#define OPTI_TYPES_H - -// Macros expected by usdt_args.h (normally from bpfdefs.h). +// Macros expected by usdt_args.h (normally from kernel.h). #define EBPF_INLINE __always_inline #define UNUSED __attribute__((unused)) -// bpf_map_def — usdt_args.h uses "extern bpf_map_def __bpf_usdt_specs" -// (C++ style, no struct keyword), so we need a typedef. -typedef struct bpf_map_def { - unsigned int type; - unsigned int key_size; - unsigned int value_size; - unsigned int max_entries; - unsigned int map_flags; -} bpf_map_def; +// bpf2go passes -target bpfel (not the real platform triple), so the +// usual compiler builtins (__x86_64__, __aarch64__) are never set. +// Bridge from bpf2go's __TARGET_ARCH_* defines to the builtins that +// usdt_args.h checks for pt_regs layout. +#if defined(__TARGET_ARCH_x86) +#define __x86_64__ +#elif defined(__TARGET_ARCH_arm64) +#define __aarch64__ +#endif #endif // USDT_COMPAT_H diff --git a/test/mock_cuda.c b/test/mock_cuda.c new file mode 100644 index 0000000..a512637 --- /dev/null +++ b/test/mock_cuda.c @@ -0,0 +1,31 @@ +/* + * Mock CUDA Driver API for testing + * Provides minimal implementation of cuDriverGetVersion for test environment + */ + +#include +#include +#include + +// Mock implementation of cuDriverGetVersion +// Returns CUDA 12.8.1 (12081) to enable PC sampling in tests +CUresult cuDriverGetVersion(int *driverVersion) { + if (driverVersion == NULL) { + return CUDA_ERROR_INVALID_VALUE; + } + + // Version format: major * 1000 + minor * 10 + patch + // 12.8.1 = 12 * 1000 + 8 * 10 + 1 = 12081 + *driverVersion = 12081; + + if (getenv("PARCAGPU_DEBUG") != NULL) { + fprintf(stderr, "[MOCK_CUDA] cuDriverGetVersion() -> 12.8.1 (12081)\n"); + } + + return CUDA_SUCCESS; +} + +// Mock cuCtxSynchronize — no-op in test (no real GPU work to wait for) +CUresult cuCtxSynchronize(void) { + return CUDA_SUCCESS; +} diff --git a/test/mock_cupti.c b/test/mock_cupti.c index 3c04589..01092fd 100644 --- a/test/mock_cupti.c +++ b/test/mock_cupti.c @@ -1,7 +1,10 @@ #include +#include +#include #include #include #include +#include // Define callback function types if not already defined by CUPTI headers #ifndef CUpti_BufferRequestFunc @@ -134,8 +137,290 @@ CUptiResult cuptiActivityGetNumDroppedRecords(CUcontext context, return CUPTI_SUCCESS; } +CUptiResult cuptiGetContextId(CUcontext context, uint32_t *contextId) { + (void)context; // Mark as intentionally unused + // Return a fixed context ID for testing + *contextId = 1; + return CUPTI_SUCCESS; +} + CUptiResult cuptiUnsubscribe(CUpti_SubscriberHandle subscriber) { (void)subscriber; // Mark as intentionally unused fprintf(stderr, "[MOCK_CUPTI] cuptiUnsubscribe()\n"); return CUPTI_SUCCESS; } + +// ========================================================================= +// PC Sampling mock — uses a real cubin from pc_sample_toy for realistic +// CRC, offsets, and source-line correlation. +// ========================================================================= + +// Real cubin loaded from file (set MOCK_CUBIN_PATH, or auto-detected). +static char *__cubin_data = NULL; +static size_t __cubin_size = 0; +static uint64_t __cubin_crc = 0; + +// CRC function shared between cuptiGetCubinCrc and internal use. +static uint64_t __compute_crc(const void *data, size_t size) { + const uint8_t *bytes = (const uint8_t *)data; + uint64_t crc = 0xDEADBEEFULL; + for (size_t i = 0; i < size; i++) + crc = crc * 31 + bytes[i]; + return crc; +} + +static void __load_cubin(void) { + if (__cubin_data) + return; + const char *path = getenv("MOCK_CUBIN_PATH"); + if (!path) { + fprintf(stderr, "[MOCK_CUPTI] MOCK_CUBIN_PATH not set, no cubin loaded\n"); + return; + } + FILE *f = fopen(path, "rb"); + if (!f) { + fprintf(stderr, "[MOCK_CUPTI] Failed to open cubin: %s\n", path); + return; + } + fseek(f, 0, SEEK_END); + __cubin_size = (size_t)ftell(f); + fseek(f, 0, SEEK_SET); + __cubin_data = (char *)malloc(__cubin_size); + if (fread(__cubin_data, 1, __cubin_size, f) != __cubin_size) { + fprintf(stderr, "[MOCK_CUPTI] Short read on cubin: %s\n", path); + free(__cubin_data); + __cubin_data = NULL; + __cubin_size = 0; + fclose(f); + return; + } + fclose(f); + __cubin_crc = __compute_crc(__cubin_data, __cubin_size); + fprintf(stderr, "[MOCK_CUPTI] Loaded cubin: %s (%zu bytes, crc=0x%lx)\n", + path, __cubin_size, __cubin_crc); +} + +CUptiResult cuptiGetVersion(uint32_t *version) { + *version = 24; + return CUPTI_SUCCESS; +} + +static int __pc_sampling_started = 0; + +CUptiResult cuptiPCSamplingEnable(CUpti_PCSamplingEnableParams *params) { + (void)params; + fprintf(stderr, "[MOCK_CUPTI] cuptiPCSamplingEnable()\n"); + return CUPTI_SUCCESS; +} + +CUptiResult cuptiPCSamplingDisable(CUpti_PCSamplingDisableParams *params) { + (void)params; + fprintf(stderr, "[MOCK_CUPTI] cuptiPCSamplingDisable()\n"); + return CUPTI_SUCCESS; +} + +CUptiResult cuptiPCSamplingStart(CUpti_PCSamplingStartParams *params) { + (void)params; + __pc_sampling_started = 1; + return CUPTI_SUCCESS; +} + +CUptiResult cuptiPCSamplingStop(CUpti_PCSamplingStopParams *params) { + (void)params; + __pc_sampling_started = 0; + return CUPTI_SUCCESS; +} + +CUptiResult cuptiPCSamplingSetConfigurationAttribute( + CUpti_PCSamplingConfigurationInfoParams *params) { + (void)params; + fprintf(stderr, "[MOCK_CUPTI] cuptiPCSamplingSetConfigurationAttribute()\n"); + return CUPTI_SUCCESS; +} + +CUptiResult cuptiPCSamplingGetNumStallReasons( + CUpti_PCSamplingGetNumStallReasonsParams *params) { + *params->numStallReasons = 3; + return CUPTI_SUCCESS; +} + +static const char *__mock_stall_names[] = { + "smsp__pcsamp_warps_issue_stalled_not_selected", + "smsp__pcsamp_warps_issue_stalled_math_pipe_throttle", + "smsp__pcsamp_warps_issue_stalled_barrier", +}; + +CUptiResult cuptiPCSamplingGetStallReasons( + CUpti_PCSamplingGetStallReasonsParams *params) { + size_t n = params->numStallReasons < 3 ? params->numStallReasons : 3; + for (size_t i = 0; i < n; i++) { + strncpy(params->stallReasons[i], __mock_stall_names[i], + CUPTI_STALL_REASON_STRING_SIZE - 1); + params->stallReasons[i][CUPTI_STALL_REASON_STRING_SIZE - 1] = '\0'; + params->stallReasonIndex[i] = (uint32_t)i; + } + return CUPTI_SUCCESS; +} + +// --------------------------------------------------------------------------- +// Synthetic PC samples using real offsets from pc_sample_toy.cu kernels. +// Offsets extracted from nvdisasm -g on the sm_121 cubin. +// --------------------------------------------------------------------------- + +typedef struct { + const char *functionName; + uint32_t functionIndex; + uint64_t pcOffset; + // Expected source correlation for this offset: + uint32_t lineNumber; + const char *fileName; + const char *dirName; +} MockPCSample; + +// Representative offsets covering all three kernels and distinct source lines. +// Offsets and line numbers from: nvdisasm -g -c pc_sample_toy.sm_121.cubin +static MockPCSample __mock_samples[] = { + // shmem_bounce — shared-memory bouncing kernel + {"_Z12shmem_bouncePfiy", 0, 0x00b0, 54, + "pc_sample_toy.cu", "/home/tpr/src/parcagpu-proton/microbenchmarks"}, // STS [R2], R3 + {"_Z12shmem_bouncePfiy", 0, 0x00d0, 55, + "pc_sample_toy.cu", "/home/tpr/src/parcagpu-proton/microbenchmarks"}, // BAR.SYNC + {"_Z12shmem_bouncePfiy", 0, 0x01f0, 58, + "pc_sample_toy.cu", "/home/tpr/src/parcagpu-proton/microbenchmarks"}, // LDS (inner loop) + {"_Z12shmem_bouncePfiy", 0, 0x0230, 58, + "pc_sample_toy.cu", "/home/tpr/src/parcagpu-proton/microbenchmarks"}, // FFMA (inner loop) + {"_Z12shmem_bouncePfiy", 0, 0x0250, 59, + "pc_sample_toy.cu", "/home/tpr/src/parcagpu-proton/microbenchmarks"}, // BAR.SYNC (inner loop) + + // hash_churn — integer bit-twiddling kernel + {"_Z10hash_churnPjiy", 1, 0x0050, 34, + "pc_sample_toy.cu", "/home/tpr/src/parcagpu-proton/microbenchmarks"}, // IMAD (idx calc) + {"_Z10hash_churnPjiy", 1, 0x0080, 39, + "pc_sample_toy.cu", "/home/tpr/src/parcagpu-proton/microbenchmarks"}, // LDCU (loop start) + {"_Z10hash_churnPjiy", 1, 0x0180, 40, + "pc_sample_toy.cu", "/home/tpr/src/parcagpu-proton/microbenchmarks"}, // IMAD.SHL (h ^= h << 13) + + // trig_storm — FP math kernel + {"_Z10trig_stormPfiy", 2, 0x0050, 21, + "pc_sample_toy.cu", "/home/tpr/src/parcagpu-proton/microbenchmarks"}, // IMAD (idx calc) + {"_Z10trig_stormPfiy", 2, 0x00b0, 25, + "pc_sample_toy.cu", "/home/tpr/src/parcagpu-proton/microbenchmarks"}, // FMUL (x * 0.001f) + {"_Z10trig_stormPfiy", 2, 0x0f30, 27, + "pc_sample_toy.cu", "/home/tpr/src/parcagpu-proton/microbenchmarks"}, // FMUL (sinf*cosf inner) +}; +#define NUM_MOCK_SAMPLES (sizeof(__mock_samples) / sizeof(__mock_samples[0])) + +static int __pc_get_data_calls = 0; + +CUptiResult cuptiPCSamplingGetData(CUpti_PCSamplingGetDataParams *params) { + CUpti_PCSamplingData *data = params->pcSamplingData; + + __pc_get_data_calls++; + if (__pc_get_data_calls % 3 != 0 || data->collectNumPcs == 0) { + data->totalNumPcs = 0; + data->remainingNumPcs = 0; + return CUPTI_SUCCESS; + } + + // Cycle through the sample table, emitting a batch each time. + static size_t sample_cursor = 0; + size_t count = data->collectNumPcs < 4 ? data->collectNumPcs : 4; + if (count > NUM_MOCK_SAMPLES) + count = NUM_MOCK_SAMPLES; + + data->totalNumPcs = count; + data->remainingNumPcs = 0; + data->totalSamples = count * 9; + + for (size_t i = 0; i < count; i++) { + MockPCSample *s = &__mock_samples[(sample_cursor + i) % NUM_MOCK_SAMPLES]; + CUpti_PCSamplingPCData *pc = &data->pPcData[i]; + pc->size = sizeof(CUpti_PCSamplingPCData); + pc->cubinCrc = __cubin_crc; + pc->pcOffset = s->pcOffset; + pc->functionIndex = s->functionIndex; + pc->functionName = (char *)s->functionName; + pc->stallReasonCount = 3; + for (size_t j = 0; j < 3; j++) { + pc->stallReason[j].pcSamplingStallReasonIndex = (uint32_t)j; + pc->stallReason[j].samples = (uint32_t)(5 - j * 2); + } + } + sample_cursor = (sample_cursor + count) % NUM_MOCK_SAMPLES; + return CUPTI_SUCCESS; +} + +CUptiResult cuptiGetCubinCrc(CUpti_GetCubinCrcParams *params) { + params->cubinCrc = __compute_crc(params->cubin, params->cubinSize); + return CUPTI_SUCCESS; +} + +// Source correlation: look up the offset in our known table. +// Falls back to zeros if the offset isn't in the table (same as real CUPTI +// when debug info is missing). +CUptiResult cuptiGetSassToSourceCorrelation( + CUpti_GetSassToSourceCorrelationParams *params) { + for (size_t i = 0; i < NUM_MOCK_SAMPLES; i++) { + MockPCSample *s = &__mock_samples[i]; + if (params->pcOffset == s->pcOffset && + params->functionName && strcmp(params->functionName, s->functionName) == 0) { + params->lineNumber = s->lineNumber; + params->fileName = strdup(s->fileName); + params->dirName = strdup(s->dirName); + return CUPTI_SUCCESS; + } + } + // Unknown offset — no source info available. + params->lineNumber = 0; + params->fileName = NULL; + params->dirName = NULL; + return CUPTI_SUCCESS; +} + +// ========================================================================= +// Resource callback helper — called from test harness after init. +// Fires CONTEXT_CREATED and MODULE_LOADED with the real cubin. +// ========================================================================= + +void __mock_cupti_fire_resource_callbacks(void) { + if (!__cupti_runtime_api_callback) + return; + + __load_cubin(); + + fprintf(stderr, "[MOCK_CUPTI] Firing resource callbacks\n"); + + // 1. CONTEXT_CREATED + CUpti_ResourceData resData; + memset(&resData, 0, sizeof(resData)); + resData.context = (CUcontext)(uintptr_t)0x1; + __cupti_runtime_api_callback( + __cupti_runtime_api_userdata, + CUPTI_CB_DOMAIN_RESOURCE, + CUPTI_CBID_RESOURCE_CONTEXT_CREATED, + &resData); + + // 2. MODULE_LOADED with the real cubin. + CUpti_ModuleResourceData modData; + memset(&modData, 0, sizeof(modData)); + if (__cubin_data) { + modData.pCubin = __cubin_data; + modData.cubinSize = __cubin_size; + } else { + // Fallback: minimal fake cubin if no file was loaded. + static const char fake[] = {0x7f, 'E', 'L', 'F', 0,0,0,0, 0,0,0,0, 0,0,0,0}; + modData.pCubin = fake; + modData.cubinSize = sizeof(fake); + } + resData.resourceDescriptor = &modData; + __cupti_runtime_api_callback( + __cupti_runtime_api_userdata, + CUPTI_CB_DOMAIN_RESOURCE, + CUPTI_CBID_RESOURCE_MODULE_LOADED, + &resData); +} + +CUptiResult cuptiActivityDisable(CUpti_ActivityKind kind) { + (void)kind; + return CUPTI_SUCCESS; +} diff --git a/test/pc_sample_toy.cubin b/test/pc_sample_toy.cubin new file mode 100644 index 0000000..849a335 Binary files /dev/null and b/test/pc_sample_toy.cubin differ diff --git a/test/test-pc-mock.sh b/test/test-pc-mock.sh new file mode 100755 index 0000000..2d8772a --- /dev/null +++ b/test/test-pc-mock.sh @@ -0,0 +1,129 @@ +#!/bin/bash +# Mock PC sampling test: runs test_cupti_prof (mock CUPTI) under parcagpu +# with BPF activity parser. Verifies stall reason map, PC samples, and +# cubin loading WITHOUT requiring a real GPU. +# +# Prerequisites: +# make local bpf-test +# +# Usage: +# sudo -E test/test-pc-mock.sh # default +# sudo -E test/test-pc-mock.sh -v # verbose + +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" +ROOT="$(cd "$SCRIPT_DIR/.." && pwd)" + +LIB="$ROOT/build-local/lib/libparcagpucupti.so" +TEST_BIN="$ROOT/build-local/bin/test_cupti_prof" +BPF="$ROOT/test/bpf/activity_parser" +CUBIN="$ROOT/test/pc_sample_toy.cubin" +BPF_LOG="/tmp/parcagpu-pc-mock-bpf.log" +TEST_LOG="/tmp/parcagpu-pc-mock-test.log" +VERBOSE="" + +for arg in "$@"; do + case "$arg" in + -v) VERBOSE="-v" ;; + esac +done + +# --- Preflight checks --- +for f in "$LIB" "$TEST_BIN" "$BPF" "$CUBIN"; do + if [ ! -x "$f" ] && [ ! -f "$f" ]; then + echo "error: $f not found" >&2 + exit 1 + fi +done + +cleanup() { + [ -n "${TEST_PID:-}" ] && kill "$TEST_PID" 2>/dev/null || true + [ -n "${BPF_PID:-}" ] && kill "$BPF_PID" 2>/dev/null || true + wait 2>/dev/null || true +} +trap cleanup EXIT + +# --- Launch mock workload --- +# Mock CUPTI/CUDA libs from build-local so proton's dynamic loader finds them +# instead of real libcupti.so / libcuda.so. +# Set probability=1 so every interval check triggers sampling. +echo "=== Starting test_cupti_prof (mock) ===" +LD_LIBRARY_PATH="$ROOT/build-local/lib:${LD_LIBRARY_PATH:-}" \ + PARCAGPU_DEBUG=1 \ + PARCAGPU_PC_SAMPLING_PROBABILITY=1 \ + PARCAGPU_PC_SAMPLING_INTERVAL=0.1 \ + MOCK_CUBIN_PATH="$CUBIN" \ + "$TEST_BIN" "$LIB" --duration=5 > "$TEST_LOG" 2>&1 & +TEST_PID=$! +echo "test_cupti_prof PID: $TEST_PID" + +# Wait for library to be loaded into the process. +while kill -0 "$TEST_PID" 2>/dev/null && + ! grep -q libparcagpucupti "/proc/$TEST_PID/maps" 2>/dev/null; do + sleep 0.1 +done + +if ! kill -0 "$TEST_PID" 2>/dev/null; then + echo "error: test_cupti_prof exited before library loaded" >&2 + cat "$TEST_LOG" >&2 + exit 1 +fi + +# --- Attach BPF parser --- +echo "=== Starting BPF activity parser ===" +"$BPF" -pid "$TEST_PID" -lib "$LIB" $VERBOSE > "$BPF_LOG" 2>&1 & +BPF_PID=$! +echo "activity_parser PID: $BPF_PID" + +# --- Wait for workload to finish --- +wait "$TEST_PID" 2>/dev/null || true +TEST_PID="" +sleep 2 + +# --- Stop BPF parser --- +kill "$BPF_PID" 2>/dev/null || true +wait "$BPF_PID" 2>/dev/null || true +BPF_PID="" + +# --- Results --- +echo +echo "=== Mock test output (parcagpu debug) ===" +cat "$TEST_LOG" +echo +echo "=== BPF parser output ===" +cat "$BPF_LOG" +echo + +# --- Checks --- +PASS=true + +check() { + local label="$1" pattern="$2" file="$3" + if grep -q "$pattern" "$file"; then + echo "PASS: $label" + else + echo "FAIL: $label" >&2 + PASS=false + fi +} + +check "PC sampling initialized" "PC sampling initialized" "$TEST_LOG" +check "real cubin loaded (mock)" "Loaded cubin.*pc_sample_toy" "$TEST_LOG" +check "modules loaded (parcagpu)" "Module 0x.*loaded" "$TEST_LOG" +check "source correlation: shmem_bounce" "func=_Z12shmem_bounce.*pc_sample_toy.cu" "$TEST_LOG" +check "source correlation: hash_churn" "func=_Z10hash_churn.*pc_sample_toy.cu" "$TEST_LOG" +check "source correlation: trig_storm" "func=_Z10trig_storm.*pc_sample_toy.cu" "$TEST_LOG" +check "stall reason map received" "\[ 0\] smsp__pcsamp" "$BPF_LOG" +check "PC samples contain stall reasons" "smsp__pcsamp" "$BPF_LOG" +check "cubins loaded (bpf)" "\[CUBIN\].*loaded" "$BPF_LOG" +check "PC sample events received" "pc_samples=[1-9]" "$BPF_LOG" + +if $PASS; then + echo + echo "=== ALL CHECKS PASSED ===" +else + echo + echo "=== SOME CHECKS FAILED ===" >&2 + exit 1 +fi diff --git a/test/test-pc-real.sh b/test/test-pc-real.sh new file mode 100755 index 0000000..5892957 --- /dev/null +++ b/test/test-pc-real.sh @@ -0,0 +1,117 @@ +#!/bin/bash +# Smoke test: runs pc_sample_toy under parcagpu with BPF activity parser. +# Verifies stall reason map, PC samples, and cubin loading. +# +# Prerequisites: +# make local bpf-test +# nvcc -g -lineinfo -o microbenchmarks/pc_sample_toy microbenchmarks/pc_sample_toy.cu +# +# Usage: +# sudo -E test/test-pc-real.sh # default +# sudo -E test/test-pc-real.sh -v # verbose (print every event) + +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" +ROOT="$(cd "$SCRIPT_DIR/.." && pwd)" + +LIB="$ROOT/build-local/lib/libparcagpucupti.so" +TOY="$ROOT/microbenchmarks/pc_sample_toy" +BPF="$ROOT/test/bpf/activity_parser" +BPF_LOG="/tmp/parcagpu-pc-test-bpf.log" +TOY_LOG="/tmp/parcagpu-pc-test-toy.log" +VERBOSE="" + +for arg in "$@"; do + case "$arg" in + -v) VERBOSE="-v" ;; + esac +done + +# --- Preflight checks --- +for f in "$LIB" "$TOY" "$BPF"; do + if [ ! -x "$f" ] && [ ! -f "$f" ]; then + echo "error: $f not found" >&2 + exit 1 + fi +done + +cleanup() { + [ -n "${TOY_PID:-}" ] && kill "$TOY_PID" 2>/dev/null || true + [ -n "${BPF_PID:-}" ] && kill "$BPF_PID" 2>/dev/null || true + wait 2>/dev/null || true +} +trap cleanup EXIT + +# --- Launch the toy workload --- +echo "=== Starting pc_sample_toy ===" +PARCAGPU_DEBUG=1 PARCAGPU_SAMPLING_FACTOR=18 \ + PARCAGPU_PC_SAMPLING_PROBABILITY=1 \ + PARCAGPU_PC_SAMPLING_INTERVAL=0.5 \ + CUDA_INJECTION64_PATH="$LIB" "$TOY" 8 > "$TOY_LOG" 2>&1 & +TOY_PID=$! +echo "pc_sample_toy PID: $TOY_PID" + +# Wait for library to be loaded into the process. +while kill -0 "$TOY_PID" 2>/dev/null && + ! grep -q libparcagpucupti "/proc/$TOY_PID/maps" 2>/dev/null; do + sleep 0.1 +done + +if ! kill -0 "$TOY_PID" 2>/dev/null; then + echo "error: pc_sample_toy exited before library loaded" >&2 + exit 1 +fi + +# --- Attach BPF parser --- +echo "=== Starting BPF activity parser ===" +"$BPF" -pid "$TOY_PID" -lib "$LIB" $VERBOSE > "$BPF_LOG" 2>&1 & +BPF_PID=$! +echo "activity_parser PID: $BPF_PID" + +# --- Wait for workload to finish --- +wait "$TOY_PID" 2>/dev/null || true +TOY_PID="" +sleep 2 + +# --- Stop BPF parser --- +kill "$BPF_PID" 2>/dev/null || true +wait "$BPF_PID" 2>/dev/null || true +BPF_PID="" + +# --- Results --- +echo +echo "=== Toy output (parcagpu debug) ===" +cat "$TOY_LOG" +echo +echo "=== BPF parser output ===" +cat "$BPF_LOG" +echo + +# --- Checks --- +PASS=true + +check() { + local label="$1" pattern="$2" file="$3" + if grep -q "$pattern" "$file"; then + echo "PASS: $label" + else + echo "FAIL: $label" >&2 + PASS=false + fi +} + +check "modules loaded (parcagpu)" "Module 0x.*loaded" "$TOY_LOG" +check "stall reason map received" "\[ 0\] smsp__pcsamp" "$BPF_LOG" +check "PC samples contain stall reasons" "smsp__pcsamp" "$BPF_LOG" +check "cubins loaded (bpf)" "\[CUBIN\].*loaded" "$BPF_LOG" +check "PC sample events received" "pc_samples=[1-9]" "$BPF_LOG" + +if $PASS; then + echo + echo "=== ALL CHECKS PASSED ===" +else + echo + echo "=== SOME CHECKS FAILED ===" >&2 + exit 1 +fi diff --git a/test/test_cupti_prof.c b/test/test_cupti_prof.c index 04810a0..c16649c 100644 --- a/test/test_cupti_prof.c +++ b/test/test_cupti_prof.c @@ -628,6 +628,13 @@ void *cupti_thread(void *arg) { size_t maxNumRecords; bufferRequestedCallback(&buffer, &bufferSize, &maxNumRecords); + // The profiler may return NULL when no tracer is attached + // (semaphore-gated short circuit). Skip this flush cycle. + if (buffer == NULL) { + usleep(100000); + continue; + } + // Fill the buffer with activity records for launched kernels size_t offset = 0; size_t recordSize = sizeof(CUpti_ActivityKernel5); @@ -894,6 +901,16 @@ int main(int argc, char **argv) { return 0; } + // Fire resource callbacks (CONTEXT_CREATED, MODULE_LOADED) so that + // PC sampling initialization runs in the profiler library. + typedef void (*FireResourceCallbacksFunc)(void); + FireResourceCallbacksFunc fireResourceCbs = + (FireResourceCallbacksFunc)dlsym(RTLD_DEFAULT, + "__mock_cupti_fire_resource_callbacks"); + if (fireResourceCbs) { + fireResourceCbs(); + } + // Load kernel names if specified KernelNameList *kernel_names = NULL; if (config.kernel_names) { diff --git a/vendor/opentelemetry-ebpf-profiler b/vendor/opentelemetry-ebpf-profiler deleted file mode 160000 index d13351c..0000000 --- a/vendor/opentelemetry-ebpf-profiler +++ /dev/null @@ -1 +0,0 @@ -Subproject commit d13351cf54ac5cad54bbf313f321539f4ba9f70b