Skip to content

Commit 3abfe7d

Browse files
committed
Fix CUDA QEMU tests for new libparcagpucupti.so
The arm64 libparcagpucupti.so (Triton Proton) dropped libcupti.so from DT_NEEDED and now loads it via runtime dlopen/dlsym. The old empty stub had no CUPTI symbols, so dlsym("cuptiGetVersion") threw "Failed to load libcupti.so". Build mock libcupti.so and libcuda.so from the parcagpu repo's test sources (test/mock_cupti.c, test/mock_cuda.c) which provide all CUPTI/CUDA API functions that libparcagpucupti.so resolves at runtime. Minimal type-definition headers are included so the mocks compile without the CUDA SDK. Set TRITON_CUPTI_LIB_PATH in the QEMU init script so the library finds the mocks via the explicit path it checks first. Read the registered CUPTI callbacks from the mock library's globals via dlsym after InitializeInjection, since the mock has its own copy of these symbols separate from the test binary.
1 parent 29c7088 commit 3abfe7d

7 files changed

Lines changed: 253 additions & 30 deletions

File tree

.github/workflows/unit-test-on-pull-request.yml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -251,7 +251,8 @@ jobs:
251251
- name: Get parcagpu image digest
252252
id: parcagpu-digest
253253
run: |
254-
digest=$(docker buildx imagetools inspect ghcr.io/parca-dev/parcagpu:latest --format '{{.Digest}}' 2>/dev/null || echo "unknown")
254+
# pipefail inside the substitution: if the inspect fails, leave digest empty instead of hashing empty input (sha256sum would still print a valid-looking hash).
digest=$(set -o pipefail; docker buildx imagetools inspect ghcr.io/parca-dev/parcagpu:latest --raw 2>/dev/null | sha256sum | awk '{print $1}') || digest=""
255+
digest=${digest:-unknown}
255256
echo "digest=${digest}" >> "$GITHUB_OUTPUT"
256257
- name: Cache parcagpu library
257258
uses: actions/cache@0400d5f644dc74513175e3cd8d07132dd4860809 # v4.2.4

test/cudaverify/mock_cupti.c

Lines changed: 34 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -198,6 +198,18 @@ static void *soHandle = NULL;
198198
// init_parcagpu loads the .so, calls InitializeInjection, and extracts
199199
// the callback pointers that were registered via the mock CUPTI APIs.
200200
int init_parcagpu(const char *so_path) {
201+
// Pre-load mock libcupti.so with RTLD_NODELETE so its state persists
202+
// across test runs. InitializeInjection uses a singleton pattern and
203+
// won't re-register CUPTI callbacks on subsequent calls, so the mock's
204+
// globals from the first run must survive cleanup/reinit cycles.
205+
const char *cupti_dir = getenv("TRITON_CUPTI_LIB_PATH");
206+
char cupti_path[512];
207+
if (cupti_dir)
208+
snprintf(cupti_path, sizeof(cupti_path), "%s/libcupti.so", cupti_dir);
209+
else
210+
snprintf(cupti_path, sizeof(cupti_path), "libcupti.so");
211+
dlopen(cupti_path, RTLD_LAZY | RTLD_NODELETE);
212+
201213
soHandle = dlopen(so_path, RTLD_NOW | RTLD_GLOBAL);
202214
if (!soHandle) {
203215
fprintf(stderr, "mock_cupti: dlopen failed: %s\n", dlerror());
@@ -217,10 +229,28 @@ int init_parcagpu(const char *so_path) {
217229
int rc = initFunc();
218230
fprintf(stderr, "mock_cupti: InitializeInjection returned %d\n", rc);
219231

220-
// Extract callbacks set by InitializeInjection via our mock CUPTI.
221-
parcagpuCallback = __cupti_runtime_api_callback;
222-
bufferReqCallback = __cupti_buffer_requested_callback;
223-
bufferCompCallback = __cupti_buffer_completed_callback;
232+
// Extract callbacks registered by InitializeInjection via cuptiSubscribe.
233+
//
234+
// Two cases depending on the libparcagpucupti.so build:
235+
// (a) New builds: no DT_NEEDED for libcupti — the library loads the mock
236+
// libcupti.so via dlopen/dlsym, so callbacks live in the mock's globals.
237+
// (b) Old builds: DT_NEEDED for libcupti.so.N — symbols resolve from the
238+
// test binary's --export-dynamic, so callbacks live in our own globals.
239+
//
240+
// Check the mock library first, then fall back to our own copy.
241+
void *cupti_lib = dlopen(cupti_path, RTLD_LAZY);
242+
if (cupti_lib) {
243+
CUpti_CallbackFunc *p1 = dlsym(cupti_lib, "__cupti_runtime_api_callback");
244+
CUpti_BufferRequestFunc *p2 = dlsym(cupti_lib, "__cupti_buffer_requested_callback");
245+
CUpti_BufferCompletedFunc *p3 = dlsym(cupti_lib, "__cupti_buffer_completed_callback");
246+
if (p1 && *p1) parcagpuCallback = *p1;
247+
if (p2 && *p2) bufferReqCallback = *p2;
248+
if (p3 && *p3) bufferCompCallback = *p3;
249+
dlclose(cupti_lib);
250+
}
251+
if (!parcagpuCallback) parcagpuCallback = __cupti_runtime_api_callback;
252+
if (!bufferReqCallback) bufferReqCallback = __cupti_buffer_requested_callback;
253+
if (!bufferCompCallback) bufferCompCallback = __cupti_buffer_completed_callback;
224254

225255
if (!parcagpuCallback) {
226256
fprintf(stderr, "mock_cupti: parcagpuCuptiCallback is NULL\n");

test/distro-qemu/build-initramfs.sh

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -100,10 +100,10 @@ copy_lib_deps() {
100100
cp "${BUILD_DIR}"/*.test "$ROOTFS_DIR/"
101101
cp "${PARCAGPU_DIR}/libparcagpucupti.so" "$ROOTFS_DIR/"
102102

103-
# Copy stub libcupti .so and libstdc++ into the RUNPATH so dlopen of
104-
# libparcagpucupti.so can resolve its DT_NEEDED entries.
105-
for stub in "${PARCAGPU_DIR}"/libcupti.so*; do
106-
[ -f "$stub" ] && cp "$stub" "$ROOTFS_DIR/usr/local/cuda/lib64/"
103+
# Copy mock CUPTI/CUDA libraries into the RUNPATH so dlopen of
104+
# libparcagpucupti.so can resolve them at runtime.
105+
for lib in "${PARCAGPU_DIR}"/libcupti.so* "${PARCAGPU_DIR}"/libcuda.so*; do
106+
[ -f "$lib" ] && cp -a "$lib" "$ROOTFS_DIR/usr/local/cuda/lib64/"
107107
done
108108
LIBSTDCXX=$(find /lib* /usr/lib* -name 'libstdc++.so.6' 2>/dev/null | head -1)
109109
if [ -n "$LIBSTDCXX" ]; then
@@ -155,6 +155,8 @@ mount -t debugfs debugfs /sys/kernel/debug 2>/dev/null || true
155155
export DEBUG_TEST=1
156156
# Help the dynamic linker find libs in the CUDA RUNPATH.
157157
export LD_LIBRARY_PATH=/usr/local/cuda/lib64
158+
# Tell libparcagpucupti.so where to find the mock libcupti.so.
159+
export TRITON_CUPTI_LIB_PATH=/usr/local/cuda/lib64
158160
159161
# Run the tests
160162
echo ""

test/distro-qemu/download-parcagpu.sh

Lines changed: 41 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -58,25 +58,45 @@ if [[ ! -f "${PARCAGPU_DIR}/libparcagpucupti.so" ]]; then
5858
ls -la "${PARCAGPU_DIR}/libparcagpucupti.so"
5959
fi
6060

61-
# Build a stub libcupti .so so that dlopen of libparcagpucupti.so succeeds
62-
# without a real CUDA installation. The stub only needs to satisfy the
63-
# DT_NEEDED file lookup — the actual CUPTI symbols are provided by
64-
# mock_cupti.c in the test binary (exported via --export-dynamic).
65-
CUPTI_SONAME=$(readelf -d "${PARCAGPU_DIR}/libparcagpucupti.so" \
66-
| sed -n 's/.*NEEDED.*\[\(libcupti\.so[^]]*\)\].*/\1/p')
67-
68-
if [[ -n "${CUPTI_SONAME}" && ! -f "${PARCAGPU_DIR}/${CUPTI_SONAME}" ]]; then
69-
STUB_C=$(mktemp --suffix=.c)
70-
echo "void __cupti_stub(void){}" > "${STUB_C}"
71-
72-
# Determine cross-compiler for the target arch.
73-
case "$QEMU_ARCH" in
74-
aarch64) STUB_CC="${CC:-aarch64-linux-gnu-gcc}" ;;
75-
*) STUB_CC="${CC:-cc}" ;;
76-
esac
77-
78-
${STUB_CC} -shared -o "${PARCAGPU_DIR}/${CUPTI_SONAME}" \
79-
-Wl,-soname,"${CUPTI_SONAME}" "${STUB_C}"
80-
rm -f "${STUB_C}"
81-
echo "✅ Built stub ${CUPTI_SONAME}"
61+
# Build mock libcupti.so and libcuda.so from the parcagpu repo's test sources.
62+
# These provide real mock implementations of all CUPTI/CUDA APIs that
63+
# libparcagpucupti.so resolves via dlsym at runtime.
64+
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
65+
MOCK_HEADERS="${SCRIPT_DIR}/mock-cupti-headers"
66+
PARCAGPU_REPO="parca-dev/parcagpu"
67+
68+
# Determine compiler for the target arch.
69+
if [ "$QEMU_ARCH" = "aarch64" ] && [ "$(uname -m)" != "aarch64" ]; then
70+
STUB_CC="${CC:-aarch64-linux-gnu-gcc}"
71+
else
72+
STUB_CC="${CC:-cc}"
73+
fi
74+
75+
if [[ ! -f "${PARCAGPU_DIR}/libcupti.so" ]]; then
76+
echo "Building mock libcupti.so from ${PARCAGPU_REPO}..."
77+
MOCK_SRC=$(mktemp -d)
78+
79+
# Download mock sources from the parcagpu repo.
80+
for f in test/mock_cupti.c test/mock_cuda.c; do
81+
# -f: fail on HTTP errors rather than saving the error page as a .c file; -S: still print the error under -s.
curl -fsSL "https://raw.githubusercontent.com/${PARCAGPU_REPO}/main/${f}" -o "${MOCK_SRC}/$(basename "$f")"
83+
done
84+
85+
# Build mock libcupti.so with our minimal type-definition headers.
86+
${STUB_CC} -shared -fPIC -o "${PARCAGPU_DIR}/libcupti.so" \
87+
-Wl,-soname,"libcupti.so" \
88+
-I"${MOCK_HEADERS}" \
89+
"${MOCK_SRC}/mock_cupti.c"
90+
echo "✅ Built mock libcupti.so"
91+
92+
# Build mock libcuda.so.
93+
${STUB_CC} -shared -fPIC -o "${PARCAGPU_DIR}/libcuda.so" \
94+
-Wl,-soname,"libcuda.so.1" \
95+
-I"${MOCK_HEADERS}" \
96+
"${MOCK_SRC}/mock_cuda.c"
97+
# Triton's Proton looks for the versioned soname.
98+
ln -sf libcuda.so "${PARCAGPU_DIR}/libcuda.so.1"
99+
echo "✅ Built mock libcuda.so"
100+
101+
rm -rf "${MOCK_SRC}"
82102
fi
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
// Minimal CUDA Driver API type definitions for building mock libraries.
2+
// Only the types referenced by parcagpu's test/mock_cuda.c are needed.
3+
#ifndef MOCK_CUDA_H
4+
#define MOCK_CUDA_H
5+
6+
typedef int CUresult;
7+
#define CUDA_SUCCESS 0
8+
#define CUDA_ERROR_INVALID_VALUE 1
9+
10+
typedef void *CUcontext;
11+
12+
#endif // MOCK_CUDA_H
Lines changed: 108 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,108 @@
1+
// Minimal CUPTI type definitions for building mock libraries.
2+
// Only the types referenced by parcagpu's test/mock_cupti.c are needed.
3+
// Struct layouts must be ABI-compatible with the real CUPTI headers.
4+
#ifndef MOCK_CUPTI_H
5+
#define MOCK_CUPTI_H
6+
7+
#include <stdint.h>
8+
#include <stddef.h>
9+
#include "cuda.h"
10+
11+
typedef int CUptiResult;
12+
#define CUPTI_SUCCESS 0
13+
#define CUPTI_ERROR_MAX_LIMIT_REACHED 21
14+
#define CUPTI_ERROR_INVALID_KIND 46
15+
16+
typedef void *CUpti_SubscriberHandle;
17+
18+
typedef enum {
19+
CUPTI_CB_DOMAIN_RUNTIME_API = 2,
20+
CUPTI_CB_DOMAIN_DRIVER_API = 3,
21+
CUPTI_CB_DOMAIN_RESOURCE = 4,
22+
} CUpti_CallbackDomain;
23+
24+
typedef uint32_t CUpti_CallbackId;
25+
26+
#define CUPTI_CBID_RESOURCE_CONTEXT_CREATED 1
27+
#define CUPTI_CBID_RESOURCE_MODULE_LOADED 4
28+
29+
typedef void (*CUpti_CallbackFunc)(void *userdata,
30+
CUpti_CallbackDomain domain,
31+
CUpti_CallbackId cbid,
32+
const void *cbdata);
33+
34+
typedef void (*CUpti_BufferRequestFunc)(uint8_t **buffer, size_t *size,
35+
size_t *maxNumRecords);
36+
typedef void (*CUpti_BufferCompletedFunc)(CUcontext ctx, uint32_t streamId,
37+
uint8_t *buffer, size_t size,
38+
size_t validSize);
39+
40+
typedef uint32_t CUpti_ActivityKind;
41+
#define CUPTI_ACTIVITY_KIND_KERNEL 3
42+
#define CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL 10
43+
#define CUPTI_ACTIVITY_KIND_GRAPH_TRACE 34
44+
45+
typedef struct {
46+
uint32_t kind;
47+
} CUpti_Activity;
48+
49+
// CUpti_ActivityKernel5 — 160 bytes, matching cupti_activity.h.
50+
typedef struct {
51+
uint32_t kind; // offset 0
52+
uint8_t _pad1[12]; // offset 4
53+
uint64_t start; // offset 16
54+
uint64_t end; // offset 24
55+
uint64_t completed; // offset 32
56+
uint32_t deviceId; // offset 40
57+
uint32_t contextId; // offset 44
58+
uint32_t streamId; // offset 48
59+
uint8_t _pad2[40]; // offset 52
60+
uint32_t correlationId; // offset 92
61+
int64_t gridId; // offset 96
62+
const char *name; // offset 104
63+
uint64_t _reserved0; // offset 112
64+
uint64_t queued; // offset 120
65+
uint64_t submitted; // offset 128
66+
uint8_t _pad3[8]; // offset 136
67+
uint64_t graphNodeId; // offset 144
68+
uint32_t shmemLimitCfg; // offset 152
69+
uint32_t graphId; // offset 156
70+
} __attribute__((aligned(8))) CUpti_ActivityKernel5;
71+
72+
// CUpti_ActivityGraphTrace — 64 bytes.
73+
typedef struct {
74+
uint32_t kind;
75+
uint8_t _pad[60];
76+
} __attribute__((aligned(8))) CUpti_ActivityGraphTrace;
77+
78+
// Resource callback data.
79+
typedef struct {
80+
CUcontext context;
81+
void *resourceDescriptor;
82+
} CUpti_ResourceData;
83+
84+
typedef struct {
85+
const char *pCubin;
86+
size_t cubinSize;
87+
} CUpti_ModuleResourceData;
88+
89+
// cuptiGetCubinCrc params.
90+
typedef struct {
91+
const char *cubin;
92+
size_t cubinSize;
93+
uint64_t cubinCrc;
94+
} CUpti_GetCubinCrcParams;
95+
96+
// cuptiGetSassToSourceCorrelation params.
97+
typedef struct {
98+
uint64_t pcOffset;
99+
const char *functionName;
100+
uint32_t lineNumber;
101+
char *fileName;
102+
char *dirName;
103+
} CUpti_GetSassToSourceCorrelationParams;
104+
105+
// cuptiActivitySetAttribute placeholder.
106+
typedef uint32_t CUpti_ActivityAttribute;
107+
108+
#endif // MOCK_CUPTI_H
Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
// Minimal CUPTI PC-sampling type definitions for building mock libraries.
2+
#ifndef MOCK_CUPTI_PCSAMPLING_H
3+
#define MOCK_CUPTI_PCSAMPLING_H
4+
5+
#include <stdint.h>
6+
#include <stddef.h>
7+
#include "cupti.h"
8+
9+
#define CUPTI_STALL_REASON_STRING_SIZE 128
10+
#define CUPTI_PC_SAMPLING_MAX_STALL_REASONS 32
11+
12+
typedef struct {
13+
uint32_t pcSamplingStallReasonIndex;
14+
uint32_t samples;
15+
} CUpti_PCSamplingStallReason;
16+
17+
typedef struct {
18+
size_t size;
19+
uint64_t cubinCrc;
20+
uint64_t pcOffset;
21+
uint32_t functionIndex;
22+
char *functionName;
23+
uint32_t stallReasonCount;
24+
CUpti_PCSamplingStallReason stallReason[CUPTI_PC_SAMPLING_MAX_STALL_REASONS];
25+
} CUpti_PCSamplingPCData;
26+
27+
typedef struct {
28+
uint32_t collectNumPcs;
29+
uint32_t totalNumPcs;
30+
uint32_t remainingNumPcs;
31+
uint64_t totalSamples;
32+
CUpti_PCSamplingPCData *pPcData;
33+
} CUpti_PCSamplingData;
34+
35+
typedef struct { CUpti_PCSamplingData *pcSamplingData; } CUpti_PCSamplingGetDataParams;
36+
typedef struct { void *dummy; } CUpti_PCSamplingEnableParams;
37+
typedef struct { void *dummy; } CUpti_PCSamplingDisableParams;
38+
typedef struct { void *dummy; } CUpti_PCSamplingStartParams;
39+
typedef struct { void *dummy; } CUpti_PCSamplingStopParams;
40+
typedef struct { void *dummy; } CUpti_PCSamplingConfigurationInfoParams;
41+
typedef struct {
42+
uint32_t *numStallReasons;
43+
} CUpti_PCSamplingGetNumStallReasonsParams;
44+
typedef struct {
45+
size_t numStallReasons;
46+
char (*stallReasons)[CUPTI_STALL_REASON_STRING_SIZE];
47+
uint32_t *stallReasonIndex;
48+
} CUpti_PCSamplingGetStallReasonsParams;
49+
50+
#endif // MOCK_CUPTI_PCSAMPLING_H

0 commit comments

Comments
 (0)