Skip to content

Commit f93ff24

Browse files
committed
Map /dev/aperture_devices for TCPXO
1 parent ad08291 commit f93ff24

File tree

1 file changed

+9
-48
lines changed

1 file changed

+9
-48
lines changed

runner/internal/shim/docker.go

Lines changed: 9 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -995,58 +995,19 @@ func configureGpus(config *container.Config, hostConfig *container.HostConfig, v
995995
// AMD: ids are DRI render node paths, e.g., /dev/dri/renderD128
996996
switch vendor {
997997
case host.GpuVendorNvidia:
998-
// hostConfig.Resources.DeviceRequests = append(
999-
// hostConfig.Resources.DeviceRequests,
1000-
// container.DeviceRequest{
1001-
// // Request all capabilities to maximize compatibility with all sorts of GPU workloads.
1002-
// // Default capabilities: utility, compute.
1003-
// // https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/1.16.0/docker-specialized.html
1004-
// Capabilities: [][]string{{"gpu", "utility", "compute", "graphics", "video", "display", "compat32"}},
1005-
// DeviceIDs: ids,
1006-
// },
1007-
// )
1008-
for i := range 8 {
1009-
devPath := fmt.Sprintf("/dev/nvidia%d", i)
1010-
hostConfig.Resources.Devices = append(
1011-
hostConfig.Resources.Devices,
1012-
container.DeviceMapping{
1013-
PathOnHost: devPath,
1014-
PathInContainer: devPath,
1015-
CgroupPermissions: "rwm",
1016-
},
1017-
)
1018-
}
1019-
hostConfig.Resources.Devices = append(
1020-
hostConfig.Resources.Devices,
1021-
container.DeviceMapping{
1022-
PathOnHost: "/dev/nvidia-uvm",
1023-
PathInContainer: "/dev/nvidia-uvm",
1024-
CgroupPermissions: "rwm",
1025-
},
1026-
)
1027-
hostConfig.Resources.Devices = append(
1028-
hostConfig.Resources.Devices,
1029-
container.DeviceMapping{
1030-
PathOnHost: "/dev/nvidiactl",
1031-
PathInContainer: "/dev/nvidiactl",
1032-
CgroupPermissions: "rwm",
998+
hostConfig.Resources.DeviceRequests = append(
999+
hostConfig.Resources.DeviceRequests,
1000+
container.DeviceRequest{
1001+
// Request all capabilities to maximize compatibility with all sorts of GPU workloads.
1002+
// Default capabilities: utility, compute.
1003+
// https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/1.16.0/docker-specialized.html
1004+
Capabilities: [][]string{{"gpu", "utility", "compute", "graphics", "video", "display", "compat32"}},
1005+
DeviceIDs: ids,
10331006
},
10341007
)
10351008
hostConfig.Mounts = append(
10361009
hostConfig.Mounts,
1037-
mount.Mount{Type: mount.TypeBind, Source: "/var/lib/nvidia/lib64", Target: "/usr/local/nvidia/lib64"},
1038-
)
1039-
hostConfig.Mounts = append(
1040-
hostConfig.Mounts,
1041-
mount.Mount{Type: mount.TypeBind, Source: "/var/lib/nvidia/bin", Target: "/usr/local/nvidia/bin"},
1042-
)
1043-
hostConfig.Mounts = append(
1044-
hostConfig.Mounts,
1045-
mount.Mount{Type: mount.TypeBind, Source: "/var/lib/tcpx/lib64", Target: "/usr/local/tcpx/lib64"},
1046-
)
1047-
hostConfig.Mounts = append(
1048-
hostConfig.Mounts,
1049-
mount.Mount{Type: mount.TypeBind, Source: "/run/tcpx", Target: "/run/tcpx"},
1010+
mount.Mount{Type: mount.TypeBind, Source: "/dev/aperture_devices", Target: "/dev/aperture_devices"},
10501011
)
10511012
case host.GpuVendorAmd:
10521013
// All options are listed here: https://hub.docker.com/r/rocm/pytorch

0 commit comments

Comments (0)