Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions cmd/compute-domain-kubelet-plugin/computedomain.go
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ const (
informerResyncPeriod = 10 * time.Minute
cleanupInterval = 10 * time.Minute

ComputeDomainDaemonSettingsRoot = DriverPluginPath + "/domains"
ComputeDomainDaemonConfigFilesDirName = "domains"
ComputeDomainDaemonConfigTemplatePath = "/templates/compute-domain-daemon-config.tmpl.cfg"
)

Expand All @@ -67,9 +67,10 @@ type ComputeDomainDaemonSettings struct {
nodesConfigPath string
}

func NewComputeDomainManager(config *Config, configFilesRoot, cliqueID string) *ComputeDomainManager {
func NewComputeDomainManager(config *Config, cliqueID string) *ComputeDomainManager {
factory := nvinformers.NewSharedInformerFactory(config.clientsets.Nvidia, informerResyncPeriod)
informer := factory.Resource().V1beta1().ComputeDomains().Informer()
configFilesRoot := filepath.Join(config.DriverPluginPath(), ComputeDomainDaemonConfigFilesDirName)

m := &ComputeDomainManager{
config: config,
Expand Down
4 changes: 2 additions & 2 deletions cmd/compute-domain-kubelet-plugin/device_state.go
Original file line number Diff line number Diff line change
Expand Up @@ -90,13 +90,13 @@ func NewDeviceState(ctx context.Context, config *Config) (*DeviceState, error) {
return nil, fmt.Errorf("error getting cliqueID: %w", err)
}

computeDomainManager := NewComputeDomainManager(config, ComputeDomainDaemonSettingsRoot, cliqueID)
computeDomainManager := NewComputeDomainManager(config, cliqueID)

if err := cdi.CreateStandardDeviceSpecFile(allocatable); err != nil {
return nil, fmt.Errorf("unable to create base CDI spec file: %v", err)
}

checkpointManager, err := checkpointmanager.NewCheckpointManager(DriverPluginPath)
checkpointManager, err := checkpointmanager.NewCheckpointManager(config.DriverPluginPath())
if err != nil {
return nil, fmt.Errorf("unable to create checkpoint manager: %v", err)
}
Expand Down
9 changes: 7 additions & 2 deletions cmd/compute-domain-kubelet-plugin/driver.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ import (
"context"
"errors"
"fmt"
"path/filepath"
"sync"
"time"

Expand All @@ -45,7 +46,7 @@ const (
// DriverPrepUprepFlockFileName is the name of the lock file, located in the
// driver plugin directory, used to make sure that calls to
// nodePrepareResource() / nodeUnprepareResource() never interleave,
// node-globally.
DriverPrepUprepFlockPath = DriverPluginPath + "/pu.lock"
DriverPrepUprepFlockFileName = "pu.lock"
)

// permanentError defines an error indicating that it is permanent.
Expand All @@ -70,10 +71,12 @@ func NewDriver(ctx context.Context, config *Config) (*driver, error) {
return nil, err
}

puLockPath := filepath.Join(config.DriverPluginPath(), DriverPrepUprepFlockFileName)

driver := &driver{
client: config.clientsets.Core,
state: state,
pulock: flock.NewFlock(DriverPrepUprepFlockPath),
pulock: flock.NewFlock(puLockPath),
}

helper, err := kubeletplugin.Start(
Expand All @@ -89,6 +92,8 @@ func NewDriver(ctx context.Context, config *Config) (*driver, error) {
// prepare() must be incoming). Concurrency management for incoming
// requests is done with this driver's work queue abstraction.
kubeletplugin.Serialize(false),
kubeletplugin.RegistrarDirectoryPath(config.flags.kubeletRegistrarDirectoryPath),
kubeletplugin.PluginDataDirectoryPath(config.DriverPluginPath()),
)
if err != nil {
return nil, err
Expand Down
46 changes: 33 additions & 13 deletions cmd/compute-domain-kubelet-plugin/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ import (

"github.com/urfave/cli/v2"

"k8s.io/dynamic-resource-allocation/kubeletplugin"
"k8s.io/klog/v2"

"github.com/NVIDIA/k8s-dra-driver-gpu/internal/info"
Expand All @@ -34,7 +35,6 @@ import (

const (
DriverName = "compute-domain.nvidia.com"
DriverPluginPath = "/var/lib/kubelet/plugins/" + DriverName
DriverPluginCheckpointFileBasename = "checkpoint.json"
)

Expand All @@ -43,19 +43,25 @@ type Flags struct {
loggingConfig *flags.LoggingConfig
featureGateConfig *flags.FeatureGateConfig

nodeName string
namespace string
cdiRoot string
containerDriverRoot string
hostDriverRoot string
nvidiaCDIHookPath string
nodeName string
namespace string
cdiRoot string
containerDriverRoot string
hostDriverRoot string
nvidiaCDIHookPath string
kubeletRegistrarDirectoryPath string
kubeletPluginsDirectoryPath string
}

// Config bundles the plugin's parsed flags with the API clientsets it uses.
type Config struct {
// flags holds the values populated from CLI flags / environment variables.
flags *Flags
// clientsets provides the API clients (e.g. clientsets.Core, clientsets.Nvidia).
clientsets flags.ClientSets
}

// DriverPluginPath returns this driver's plugin data directory: the kubelet
// plugins directory configured via the flags, joined with DriverName.
func (c Config) DriverPluginPath() string {
	pluginsDir := c.flags.kubeletPluginsDirectoryPath
	return filepath.Join(pluginsDir, DriverName)
}

func main() {
if err := newApp().Run(os.Args); err != nil {
fmt.Fprintf(os.Stderr, "Error: %v\n", err)
Expand Down Expand Up @@ -111,6 +117,20 @@ func newApp() *cli.App {
Destination: &flags.nvidiaCDIHookPath,
EnvVars: []string{"NVIDIA_CDI_HOOK_PATH"},
},
&cli.StringFlag{
Name: "kubelet-registrar-directory-path",
Usage: "Absolute path to the directory where kubelet stores plugin registrations.",
Copy link
Copy Markdown
Contributor

@jgehrcke jgehrcke Aug 20, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oh. I noted that in another PR: this is not where the kubelet stores plugin registrations. This is the directory where the kubelet looks for unix domain sockets to discover plugins. A plugin registration then is performed through such socket.

I'd rather have no help text than a misleading one. Feel free to act on this opinion as you wish (I am OK with a merge).

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes. This is how upstream describes it: https://github.com/kubernetes/dynamic-resource-allocation/blob/master/kubeletplugin/draplugin.go#L226

go doc --all k8s.io/dynamic-resource-allocation/kubeletplugin.RegistrarDirectoryPath
package kubeletplugin // import "k8s.io/dynamic-resource-allocation/kubeletplugin"

func RegistrarDirectoryPath(path string) Option
    RegistrarDirectoryPath sets the path to the directory where the kubelet
    expects to find registration sockets of plugins. Typically this is
    /var/lib/kubelet/plugins_registry with /var/lib/kubelet being the kubelet's
    data directory.

    This is also the default. Some Kubernetes clusters may use a different data
    directory. This path must be the same inside and outside of the driver's
    container. The directory must exist.

I am okay either way. I would prefer not to deviate from upstream, as a different description may cause more confusion.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Let's leave it as is for now. Feel free to merge.

Value: kubeletplugin.KubeletRegistryDir,
Destination: &flags.kubeletRegistrarDirectoryPath,
EnvVars: []string{"KUBELET_REGISTRAR_DIRECTORY_PATH"},
},
&cli.StringFlag{
Name: "kubelet-plugins-directory-path",
Usage: "Absolute path to the directory where kubelet stores plugin data.",
Value: kubeletplugin.KubeletPluginsDir,
Destination: &flags.kubeletPluginsDirectoryPath,
EnvVars: []string{"KUBELET_PLUGINS_DIRECTORY_PATH"},
},
}
cliFlags = append(cliFlags, flags.kubeClientConfig.Flags()...)
cliFlags = append(cliFlags, flags.featureGateConfig.Flags()...)
Expand Down Expand Up @@ -158,13 +178,13 @@ func newApp() *cli.App {
// StartPlugin initializes and runs the compute domain kubelet plugin.
func StartPlugin(ctx context.Context, config *Config) error {
// Create the plugin directory
err := os.MkdirAll(DriverPluginPath, 0750)
err := os.MkdirAll(config.DriverPluginPath(), 0750)
if err != nil {
return err
}

// Setup nvidia-cdi-hook binary
if err := config.flags.setNvidiaCDIHookPath(); err != nil {
if err := config.setNvidiaCDIHookPath(); err != nil {
return fmt.Errorf("error setting up nvidia-cdi-hook: %w", err)
}

Expand Down Expand Up @@ -215,13 +235,13 @@ func StartPlugin(ctx context.Context, config *Config) error {
// to this path. The /usr/bin/nvidia-cdi-hook is present in the current
// container image because it is copied from the toolkit image into this
// container at build time.
func (f *Flags) setNvidiaCDIHookPath() error {
if f.nvidiaCDIHookPath != "" {
func (c Config) setNvidiaCDIHookPath() error {
Comment thread
guptaNswati marked this conversation as resolved.
if c.flags.nvidiaCDIHookPath != "" {
return nil
}

sourcePath := "/usr/bin/nvidia-cdi-hook"
targetPath := filepath.Join(DriverPluginPath, "nvidia-cdi-hook")
targetPath := filepath.Join(c.DriverPluginPath(), "nvidia-cdi-hook")

input, err := os.ReadFile(sourcePath)
if err != nil {
Expand All @@ -232,7 +252,7 @@ func (f *Flags) setNvidiaCDIHookPath() error {
return fmt.Errorf("error copying nvidia-cdi-hook: %w", err)
}

f.nvidiaCDIHookPath = targetPath
c.flags.nvidiaCDIHookPath = targetPath

return nil
}
4 changes: 2 additions & 2 deletions cmd/gpu-kubelet-plugin/device_state.go
Original file line number Diff line number Diff line change
Expand Up @@ -85,13 +85,13 @@ func NewDeviceState(ctx context.Context, config *Config) (*DeviceState, error) {
}

tsManager := NewTimeSlicingManager(nvdevlib)
mpsManager := NewMpsManager(config, nvdevlib, MpsRoot, hostDriverRoot, MpsControlDaemonTemplatePath)
mpsManager := NewMpsManager(config, nvdevlib, hostDriverRoot, MpsControlDaemonTemplatePath)

if err := cdi.CreateStandardDeviceSpecFile(allocatable); err != nil {
return nil, fmt.Errorf("unable to create base CDI spec file: %v", err)
}

checkpointManager, err := checkpointmanager.NewCheckpointManager(DriverPluginPath)
checkpointManager, err := checkpointmanager.NewCheckpointManager(config.DriverPluginPath())
if err != nil {
return nil, fmt.Errorf("unable to create checkpoint manager: %v", err)
}
Expand Down
10 changes: 8 additions & 2 deletions cmd/gpu-kubelet-plugin/driver.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ package main
import (
"context"
"fmt"
"path/filepath"
"time"

resourceapi "k8s.io/api/resource/v1"
Expand All @@ -35,7 +36,7 @@ import (
// DriverPrepUprepFlockFileName is the name of the lock file, located in the
// driver plugin directory, used to make sure that calls to
// nodePrepareResource() / nodeUnprepareResource() never interleave,
// node-globally.
const DriverPrepUprepFlockPath = DriverPluginPath + "/pu.lock"
const DriverPrepUprepFlockFileName = "pu.lock"

type driver struct {
client coreclientset.Interface
Expand All @@ -49,10 +50,13 @@ func NewDriver(ctx context.Context, config *Config) (*driver, error) {
if err != nil {
return nil, err
}

puLockPath := filepath.Join(config.DriverPluginPath(), DriverPrepUprepFlockFileName)

driver := &driver{
client: config.clientsets.Core,
state: state,
pulock: flock.NewFlock(DriverPrepUprepFlockPath),
pulock: flock.NewFlock(puLockPath),
}

helper, err := kubeletplugin.Start(
Expand All @@ -62,6 +66,8 @@ func NewDriver(ctx context.Context, config *Config) (*driver, error) {
kubeletplugin.NodeName(config.flags.nodeName),
kubeletplugin.DriverName(DriverName),
kubeletplugin.Serialize(false),
kubeletplugin.RegistrarDirectoryPath(config.flags.kubeletRegistrarDirectoryPath),
kubeletplugin.PluginDataDirectoryPath(config.DriverPluginPath()),
)
if err != nil {
return nil, err
Expand Down
49 changes: 35 additions & 14 deletions cmd/gpu-kubelet-plugin/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ import (

"github.com/urfave/cli/v2"

"k8s.io/dynamic-resource-allocation/kubeletplugin"
"k8s.io/klog/v2"

"github.com/NVIDIA/k8s-dra-driver-gpu/internal/info"
Expand All @@ -34,7 +35,6 @@ import (

const (
DriverName = "gpu.nvidia.com"
DriverPluginPath = "/var/lib/kubelet/plugins/" + DriverName
DriverPluginCheckpointFileBasename = "checkpoint.json"
)

Expand All @@ -43,20 +43,26 @@ type Flags struct {
loggingConfig *flags.LoggingConfig
featureGateConfig *flags.FeatureGateConfig

nodeName string
namespace string
cdiRoot string
containerDriverRoot string
hostDriverRoot string
nvidiaCDIHookPath string
imageName string
nodeName string
namespace string
cdiRoot string
containerDriverRoot string
hostDriverRoot string
nvidiaCDIHookPath string
imageName string
kubeletRegistrarDirectoryPath string
kubeletPluginsDirectoryPath string
}

// Config bundles the plugin's parsed flags with the API clientsets it uses.
type Config struct {
// flags holds the values populated from CLI flags / environment variables.
flags *Flags
// clientsets provides the API clients (e.g. clientsets.Core).
clientsets flags.ClientSets
}

// DriverPluginPath returns this driver's plugin data directory: the kubelet
// plugins directory configured via the flags, joined with DriverName.
func (c Config) DriverPluginPath() string {
	pluginsDir := c.flags.kubeletPluginsDirectoryPath
	return filepath.Join(pluginsDir, DriverName)
}

func main() {
if err := newApp().Run(os.Args); err != nil {
fmt.Fprintf(os.Stderr, "Error: %v\n", err)
Expand Down Expand Up @@ -119,6 +125,20 @@ func newApp() *cli.App {
Destination: &flags.imageName,
EnvVars: []string{"IMAGE_NAME"},
},
&cli.StringFlag{
Name: "kubelet-registrar-directory-path",
Usage: "Absolute path to the directory where kubelet stores plugin registrations.",
Value: kubeletplugin.KubeletRegistryDir,
Destination: &flags.kubeletRegistrarDirectoryPath,
EnvVars: []string{"KUBELET_REGISTRAR_DIRECTORY_PATH"},
},
&cli.StringFlag{
Name: "kubelet-plugins-directory-path",
Usage: "Absolute path to the directory where kubelet stores plugin data.",
Value: kubeletplugin.KubeletPluginsDir,
Destination: &flags.kubeletPluginsDirectoryPath,
EnvVars: []string{"KUBELET_PLUGINS_DIRECTORY_PATH"},
},
}
cliFlags = append(cliFlags, flags.kubeClientConfig.Flags()...)
cliFlags = append(cliFlags, flags.featureGateConfig.Flags()...)
Expand Down Expand Up @@ -166,13 +186,13 @@ func newApp() *cli.App {
// StartPlugin initializes and runs the GPU kubelet plugin.
func StartPlugin(ctx context.Context, config *Config) error {
// Create the plugin directory
err := os.MkdirAll(DriverPluginPath, 0750)
err := os.MkdirAll(config.DriverPluginPath(), 0750)
if err != nil {
return err
}

// Setup nvidia-cdi-hook binary
if err := config.flags.setNvidiaCDIHookPath(); err != nil {
if err := config.setNvidiaCDIHookPath(); err != nil {
return fmt.Errorf("error setting up nvidia-cdi-hook: %w", err)
}

Expand Down Expand Up @@ -216,19 +236,20 @@ func StartPlugin(ctx context.Context, config *Config) error {
return nil
}

// setNvidiaCDIHookPath ensures 'c.flags.nvidiaCDIHookPath' is populated.
// If it is already set (from the command line), do nothing. If it is empty,
// the nvidia-cdi-hook binary is copied from /usr/bin/nvidia-cdi-hook into the
// directory returned by c.DriverPluginPath() and 'c.flags.nvidiaCDIHookPath'
// is set to this path. The /usr/bin/nvidia-cdi-hook is present in the current
// container image because it is copied from the toolkit image into this
// container at build time.
func (f *Flags) setNvidiaCDIHookPath() error {
if f.nvidiaCDIHookPath != "" {
func (c Config) setNvidiaCDIHookPath() error {
if c.flags.nvidiaCDIHookPath != "" {
return nil
}

sourcePath := "/usr/bin/nvidia-cdi-hook"
targetPath := filepath.Join(DriverPluginPath, "nvidia-cdi-hook")
targetPath := filepath.Join(c.DriverPluginPath(), "nvidia-cdi-hook")

input, err := os.ReadFile(sourcePath)
if err != nil {
Expand All @@ -239,7 +260,7 @@ func (f *Flags) setNvidiaCDIHookPath() error {
return fmt.Errorf("error copying nvidia-cdi-hook: %w", err)
}

f.nvidiaCDIHookPath = targetPath
c.flags.nvidiaCDIHookPath = targetPath

return nil
}
7 changes: 5 additions & 2 deletions cmd/gpu-kubelet-plugin/sharing.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ import (
"fmt"
"os"
"os/exec"
"path/filepath"
"slices"
"strconv"
"strings"
Expand All @@ -51,7 +52,7 @@ import (
)

const (
MpsRoot = DriverPluginPath + "/mps"
Comment thread
guptaNswati marked this conversation as resolved.
MpsControlFilesDirName = "mps"
MpsControlDaemonTemplatePath = "/templates/mps-control-daemon.tmpl.yaml"
MpsControlDaemonNameFmt = "mps-control-daemon-%v" // Fill with ClaimUID
)
Expand Down Expand Up @@ -124,7 +125,9 @@ func (t *TimeSlicingManager) SetTimeSlice(devices UUIDProvider, config *configap
return nil
}

func NewMpsManager(config *Config, deviceLib *deviceLib, controlFilesRoot, hostDriverRoot, templatePath string) *MpsManager {
func NewMpsManager(config *Config, deviceLib *deviceLib, hostDriverRoot, templatePath string) *MpsManager {
controlFilesRoot := filepath.Join(config.DriverPluginPath(), MpsControlFilesDirName)

return &MpsManager{
controlFilesRoot: controlFilesRoot,
hostDriverRoot: hostDriverRoot,
Expand Down
Loading