Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
19 commits
Select commit Hold shift + click to select a range
b106967
Implement docker device request in CLI
akshaver Feb 2, 2026
9aca3e9
Fix issues with duplicate symlink processing
akshaver Feb 2, 2026
c3b3f19
fix: resolve ptrace monitor bugs causing hangs with multiprocess apps
akshaver Feb 23, 2026
224efa9
fix null ref crash when ptrace disabled
akshaver Feb 17, 2026
da8eb2c
fix: prevent namespace package directories from being excluded by IsS…
akshaver Feb 23, 2026
a7ed39a
example use case with nvidia runtime
akshaver Dec 15, 2025
77da28d
Fix HasSuccessfulAccess for open-type syscalls where success is fd >=…
akshaver Feb 24, 2026
0895e01
Key deduplicateFileMap by (dev, inode) instead of inode alone to avoi…
akshaver Feb 24, 2026
d90297b
Fail fast on invalid --cro-device-request JSON instead of silently dr…
akshaver Feb 24, 2026
e47d9a2
Remove unused SIGNAL_PIPE named pipe from test_vllm.sh
akshaver Feb 24, 2026
f10d5a8
Update pkg/monitor/ptrace/ptrace.go
akshaver Feb 28, 2026
3f8dcb4
Update pkg/monitor/ptrace/ptrace.go
akshaver Feb 28, 2026
755b6d5
patch up kilo-code-bot fix
akshaver Mar 2, 2026
27c344b
correct indentation
akshaver Mar 2, 2026
a2ad80e
Revert HasSuccessfulAccess to use retVal==0 check instead of OKReturn…
akshaver Mar 2, 2026
2d09c8a
chore: moved to vLLM v0.17.1-cu130 for testing
akshaver Mar 19, 2026
b33fb91
lint: fixed indenting in ptrace.go with gofmt
akshaver Mar 19, 2026
a245d14
add unit tests for bugs related to ghost paths
akshaver Apr 2, 2026
b45ab6f
Revert OKReturnStatus to success-only and remove HasSuccessfulAccess
akshaver Apr 2, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions examples/nvidia_runtime/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
host-config.json
vllm_test_results.json
vllm_test_results_slim.json
original_log.txt
slim_log.txt
6 changes: 6 additions & 0 deletions examples/nvidia_runtime/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
FROM nvcr.io/nvidia/pytorch:25.04-py3

# Add the test invocation to the entrypoint set. Docker Slim only
# traces/monitors the processes started by the entrypoint, so running pytest
# from entrypoint.d ensures the CUDA test is observed during minification.
# Single RUN layer: write the hook script and mark it executable together.
RUN echo "pytest /opt/pytorch/pytorch/test/test_cuda.py::TestCuda::test_graph_cudnn_dropout" > /opt/nvidia/entrypoint.d/99-trace.sh \
    && chmod +x /opt/nvidia/entrypoint.d/99-trace.sh

19 changes: 19 additions & 0 deletions examples/nvidia_runtime/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@

# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
As a prerequisite, install the NVIDIA Container Toolkit, including adding the nvidia runtime. Then you should be able to translate runtime and capabilities from an OCI/Docker string like `--runtime=nvidia --gpus all` to `--cro-device-request '{"Count":-1, "Capabilities":[["gpu"]]}' --cro-runtime nvidia`

See the example `test_nvidia_smi.sh`, which slims ubuntu to just the files necessary to run the runtime-mounted nvidia-smi. Similarly, see `test_nvidia_pytorch.sh`, which minimizes nvidia-pytorch to run a subset of the CUDA tests.

97 changes: 97 additions & 0 deletions examples/nvidia_runtime/test_nvidia_pytorch.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
#!/usr/bin/env bash
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Fail on unset variables and propagate pipeline failures. Deliberately no
# 'set -e': the script inspects failing docker runs via their log files below.
set -uo pipefail

# Create host config file with ulimit settings and capabilities:
#  - IpcMode=host and CAP_SYS_ADMIN are required for fanotify support
#    (filesystem monitoring during the mint build).
#  - memlock/stack/nofile limits: presumably sized for the CUDA test
#    workload — confirm against the pytorch container's requirements.
# Quoted 'EOF' delimiter: the JSON is written literally, with no expansion.
cat > host-config.json <<'EOF'
{
"IpcMode": "host",
"CapAdd": ["SYS_ADMIN"],
"Ulimits": [
{
"Name": "memlock",
"Soft": -1,
"Hard": -1
},
{
"Name": "stack",
"Soft": 67108864,
"Hard": 67108864
},
{
"Name": "nofile",
"Soft": 1048576,
"Hard": 1048576
}
]
}
EOF

# Build the custom test image first: the pytest invocation is baked into the
# entrypoint set (mint only traces processes started by the entrypoint).
# CAP_SYS_ADMIN is added via host-config.json for fanotify support
# (required for filesystem monitoring).
echo "Building custom test image with pytest in entrypoint..."
docker build -t nvcr.io/nvidia/pytorch:25.04-py3-test -f Dockerfile . \
|| { echo "Error: docker build failed" >&2; exit 1; }

echo "Running mint on the test image..."
# --preserve-path keeps the dynamic-linker config files — presumably needed at
# runtime even though the traced process may not open them; confirm if removed.
mint build \
--target nvcr.io/nvidia/pytorch:25.04-py3-test \
--tag nvcr.io/nvidia/pytorch:25.04-py3-slim \
--cro-host-config-file host-config.json \
--cro-shm-size 1200 \
--cro-device-request '{"Count":-1, "Capabilities":[["gpu"]]}' \
--cro-runtime nvidia \
--http-probe=false \
--continue-after 10 \
--preserve-path /etc/ld.so.conf \
--preserve-path /etc/ld.so.conf.d \
. \
|| { echo "Error: mint build failed" >&2; exit 1; }

# Run both the original and the slimmed image, capturing all output to log
# files. Exit status is intentionally ignored ('|| true'): success is decided
# below by inspecting the pytest summary in the logs, so a failing run still
# gets its log tail printed instead of aborting silently.
echo "Running original image..."
docker run --rm --runtime nvidia --gpus all nvcr.io/nvidia/pytorch:25.04-py3-test > original_log.txt 2>&1 || true
echo "Running slim image..."
docker run --rm --runtime nvidia --gpus all nvcr.io/nvidia/pytorch:25.04-py3-slim > slim_log.txt 2>&1 || true

# Verify that both logs contain the pytest success message (ignoring timing)
echo "Checking test results..."

# Extract the first "<N> passed" token from each log; an empty result means
# the test run failed. '|| true' keeps the assignments safe under pipefail
# when grep matches nothing.
original_passed=$(grep -oE "[0-9]+ passed" original_log.txt | head -1 || true)
slim_passed=$(grep -oE "[0-9]+ passed" slim_log.txt | head -1 || true)

if [ -z "$original_passed" ]; then
  echo "Error: Original image test did not pass" >&2
  echo "Original log tail:" >&2
  tail -20 original_log.txt >&2
  exit 1
fi

if [ -z "$slim_passed" ]; then
  echo "Error: Slim image test did not pass" >&2
  echo "Slim log tail:" >&2
  tail -20 slim_log.txt >&2
  exit 1
fi

echo "Original image: $original_passed"
echo "Slim image: $slim_passed"

# A differing pass count is a warning, not a failure: the slim image is only
# expected to run the traced subset.
if [ "$original_passed" = "$slim_passed" ]; then
  echo "SUCCESS: Both images passed the same number of tests!"
else
  echo "Warning: Different number of tests passed (original: $original_passed, slim: $slim_passed)" >&2
fi

echo "Successfully minimized nvidia-pytorch to run a subset of the CUDA tests"
34 changes: 34 additions & 0 deletions examples/nvidia_runtime/test_nvidia_smi.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
#!/usr/bin/env bash
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Fail on unset variables and propagate pipeline failures; no 'set -e' so the
# assertions at the bottom of the script report missing output themselves.
set -uo pipefail

# Build the slim image; exit early with a message if mint itself fails.
mint build --target ubuntu:24.04 --tag ubuntu:24.04-slim --cro-shm-size 1200 --cro-device-request '{"Count":-1, "Capabilities":[["gpu"]]}' --cro-runtime nvidia --http-probe=false --exec "/usr/bin/nvidia-smi" . \
|| { echo "Error: mint build failed" >&2; exit 1; }

# Get output of original and slim images stored in a log file
docker run --rm --runtime nvidia --gpus all ubuntu:24.04 nvidia-smi > original_log.txt
docker run --rm --runtime nvidia --gpus all ubuntu:24.04-slim nvidia-smi > slim_log.txt
# assert_contains <needle> <file>
# Exit the script with an error (to stderr) unless the literal string
# <needle> appears in <file>. '-F --' makes the match fixed-string and safe
# for needles that start with a dash or contain regex metacharacters.
assert_contains() {
  local needle=$1
  local file=$2
  if ! grep -qF -- "$needle" "$file"; then
    echo "Error: '$needle' not found in $file" >&2
    exit 1
  fi
}

# Both the original and the slimmed image must emit the standard nvidia-smi
# banner; the helper aborts the script if either log is missing it.
assert_contains "NVIDIA-SMI" original_log.txt
assert_contains "NVIDIA-SMI" slim_log.txt

Loading