Skip to content

Commit 9032228

Browse files
committed
implement NRI plugin server to inject management CDI devices
Signed-off-by: Tariq Ibrahim <[email protected]>
1 parent f845983 commit 9032228

File tree

589 files changed

+165223
-23
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

589 files changed

+165223
-23
lines changed

cmd/nvidia-ctk-installer/container/container.go

Lines changed: 13 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -49,12 +49,15 @@ type Options struct {
4949
// mount.
5050
ExecutablePath string
5151
// EnableCDI indicates whether CDI should be enabled.
52-
EnableCDI bool
53-
RuntimeName string
54-
RuntimeDir string
55-
SetAsDefault bool
56-
RestartMode string
57-
HostRootMount string
52+
EnableCDI bool
53+
EnableNRI bool
54+
RuntimeName string
55+
RuntimeDir string
56+
SetAsDefault bool
57+
RestartMode string
58+
HostRootMount string
59+
NRIPluginIndex string
60+
NRISocket string
5861

5962
ConfigSources []string
6063
}
@@ -128,6 +131,10 @@ func (o Options) UpdateConfig(cfg engine.Interface) error {
128131
cfg.EnableCDI()
129132
}
130133

134+
if o.EnableNRI {
135+
cfg.EnableNRI()
136+
}
137+
131138
return nil
132139
}
133140

Lines changed: 101 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,101 @@
1+
package nri
2+
3+
import (
4+
"context"
5+
"fmt"
6+
7+
"github.com/containerd/nri/pkg/api"
8+
"github.com/containerd/nri/pkg/stub"
9+
"sigs.k8s.io/yaml"
10+
11+
"github.com/NVIDIA/nvidia-container-toolkit/internal/logger"
12+
)
13+
14+
const (
15+
// nodeResourceCDIDeviceKey is the prefix of the key used for CDI device
// annotations. parseCDIDevices passes it to getAnnotation as the main key,
// so it is consulted before nriCDIDeviceKey at each lookup level.
16+
nodeResourceCDIDeviceKey = "cdi-devices.noderesource.dev"
17+
// nriCDIDeviceKey is a second prefix of the key used for CDI device
// annotations. parseCDIDevices passes it to getAnnotation as the old key,
// so it is only consulted after nodeResourceCDIDeviceKey at each lookup level.
18+
nriCDIDeviceKey = "cdi-devices.nri.io"
19+
)
20+
21+
// Plugin is the NRI plugin whose CreateContainer handler injects the CDI
// devices requested through pod annotations into the container adjustment.
type Plugin struct {
22+
// logger receives the debug and info messages emitted while injecting
// CDI devices. It is unexported, so code outside this package cannot set it.
logger logger.Interface
23+
24+
// Stub is the NRI stub associated with this plugin.
Stub stub.Stub
25+
}
26+
27+
// CreateContainer handles container creation requests.
28+
func (p *Plugin) CreateContainer(_ context.Context, pod *api.PodSandbox, ctr *api.Container) (*api.ContainerAdjustment, []*api.ContainerUpdate, error) {
29+
adjust := &api.ContainerAdjustment{}
30+
31+
if err := p.injectCDIDevices(pod, ctr, adjust); err != nil {
32+
return nil, nil, err
33+
}
34+
35+
return adjust, nil, nil
36+
}
37+
38+
func (p *Plugin) injectCDIDevices(pod *api.PodSandbox, ctr *api.Container, a *api.ContainerAdjustment) error {
39+
devices, err := parseCDIDevices(ctr.Name, pod.Annotations)
40+
if err != nil {
41+
return err
42+
}
43+
44+
if len(devices) == 0 {
45+
p.logger.Debugf("%s: no CDI devices annotated...", containerName(pod, ctr))
46+
return nil
47+
}
48+
49+
for _, name := range devices {
50+
a.AddCDIDevice(
51+
&api.CDIDevice{
52+
Name: name,
53+
},
54+
)
55+
p.logger.Infof("%s: injected CDI device %q...", containerName(pod, ctr), name)
56+
}
57+
58+
return nil
59+
}
60+
61+
func parseCDIDevices(ctr string, annotations map[string]string) ([]string, error) {
62+
var (
63+
cdiDevices []string
64+
)
65+
66+
annotation := getAnnotation(annotations, nodeResourceCDIDeviceKey, nriCDIDeviceKey, ctr)
67+
if len(annotation) == 0 {
68+
return nil, nil
69+
}
70+
71+
if err := yaml.Unmarshal(annotation, &cdiDevices); err != nil {
72+
return nil, fmt.Errorf("invalid CDI device annotation %q: %w", string(annotation), err)
73+
}
74+
75+
return cdiDevices, nil
76+
}
77+
78+
// getAnnotation returns the value of the first annotation key that is present
// for the container named ctr, or nil when none is.
//
// Keys are probed from most to least specific: "<key>/container.<ctr>", then
// "<key>/pod", then the bare key. At each level mainKey is tried before
// oldKey. A key that is present with an empty value still ends the search and
// yields an empty, non-nil slice.
func getAnnotation(annotations map[string]string, mainKey, oldKey, ctr string) []byte {
	containerSuffix := "/container." + ctr
	candidates := [...]string{
		mainKey + containerSuffix,
		oldKey + containerSuffix,
		mainKey + "/pod",
		oldKey + "/pod",
		mainKey,
		oldKey,
	}

	for i := range candidates {
		value, found := annotations[candidates[i]]
		if !found {
			continue
		}
		return []byte(value)
	}

	return nil
}
94+
95+
// Construct a container name for log messages.
96+
func containerName(pod *api.PodSandbox, container *api.Container) string {
97+
if pod != nil {
98+
return pod.Name + "/" + container.Name
99+
}
100+
return container.Name
101+
}

cmd/nvidia-ctk-installer/container/runtime/runtime.go

Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,14 +17,19 @@
1717
package runtime
1818

1919
import (
20+
"context"
21+
"errors"
2022
"fmt"
23+
"strings"
2124

25+
"github.com/containerd/nri/pkg/stub"
2226
"github.com/urfave/cli/v3"
2327

2428
"github.com/NVIDIA/nvidia-container-toolkit/cmd/nvidia-ctk-installer/container"
2529
"github.com/NVIDIA/nvidia-container-toolkit/cmd/nvidia-ctk-installer/container/runtime/containerd"
2630
"github.com/NVIDIA/nvidia-container-toolkit/cmd/nvidia-ctk-installer/container/runtime/crio"
2731
"github.com/NVIDIA/nvidia-container-toolkit/cmd/nvidia-ctk-installer/container/runtime/docker"
32+
"github.com/NVIDIA/nvidia-container-toolkit/cmd/nvidia-ctk-installer/container/runtime/nri"
2833
"github.com/NVIDIA/nvidia-container-toolkit/cmd/nvidia-ctk-installer/toolkit"
2934
"github.com/NVIDIA/nvidia-container-toolkit/internal/logger"
3035
)
@@ -34,6 +39,8 @@ const (
3439
// defaultRuntimeName specifies the NVIDIA runtime to be used as the default runtime if setting the default runtime is enabled
3540
defaultRuntimeName = "nvidia"
3641
defaultHostRootMount = "/host"
42+
defaultNRIPluginIdx = "10"
43+
defaultNRISocket = "/var/run/nri/nri.sock"
3744

3845
runtimeSpecificDefault = "RUNTIME_SPECIFIC_DEFAULT"
3946
)
@@ -94,6 +101,27 @@ func Flags(opts *Options) []cli.Flag {
94101
Destination: &opts.EnableCDI,
95102
Sources: cli.EnvVars("RUNTIME_ENABLE_CDI"),
96103
},
104+
&cli.BoolFlag{
105+
Name: "enable-nri-in-runtime",
106+
Usage: "Enable NRI in the configured runtime",
107+
Destination: &opts.EnableNRI,
108+
Value: true,
109+
Sources: cli.EnvVars("RUNTIME_ENABLE_NRI"),
110+
},
111+
&cli.StringFlag{
112+
Name: "nri-plugin-index",
113+
Usage: "Specify the plugin index to register to NRI",
114+
Value: defaultNRIPluginIdx,
115+
Destination: &opts.NRIPluginIndex,
116+
Sources: cli.EnvVars("RUNTIME_NRI_PLUGIN_INDEX"),
117+
},
118+
&cli.StringFlag{
119+
Name: "nri-socket",
120+
Usage: "Specify the path to the NRI socket file to register the NRI plugin server",
121+
Value: defaultNRISocket,
122+
Destination: &opts.NRISocket,
123+
Sources: cli.EnvVars("RUNTIME_NRI_SOCKET"),
124+
},
97125
&cli.StringFlag{
98126
Name: "host-root",
99127
Usage: "Specify the path to the host root to be used when restarting the runtime using systemd",
@@ -250,3 +278,56 @@ func GetLowlevelRuntimePaths(opts *Options, runtime string) ([]string, error) {
250278
return nil, fmt.Errorf("undefined runtime %v", runtime)
251279
}
252280
}
281+
282+
func StartNRIPlugin(ctx context.Context, opts *Options) (*nri.Plugin, error) {
283+
var pluginOpts []stub.Option
284+
pluginOpts = append(pluginOpts, stub.WithPluginIdx(opts.NRIPluginIndex))
285+
286+
socketPaths := getNRISocketPaths(opts)
287+
p := &nri.Plugin{}
288+
var errs []error
289+
var nriSocketConnSuccess bool
290+
for _, socketPath := range socketPaths {
291+
fmt.Printf("Attempting to connect to %s", opts.HostRootMount+"/"+socketPath)
292+
pluginOpts = append(pluginOpts, stub.WithSocketPath(opts.HostRootMount+"/"+socketPath))
293+
var err error
294+
p.Stub, err = stub.New(p, pluginOpts...)
295+
if err != nil {
296+
errs = append(errs, fmt.Errorf("failed to start plugin at %s: %w", socketPath, err))
297+
} else {
298+
nriSocketConnSuccess = true
299+
break
300+
}
301+
}
302+
303+
if !nriSocketConnSuccess {
304+
return nil, errors.Join(errs...)
305+
}
306+
307+
err := p.Stub.Run(ctx)
308+
if err != nil {
309+
return nil, fmt.Errorf("plugin exited with error %w", err)
310+
}
311+
return p, nil
312+
}
313+
314+
func getNRISocketPaths(opts *Options) []string {
315+
var socketPaths []string
316+
var origSocketPath string
317+
318+
if len(opts.NRISocket) == 0 {
319+
origSocketPath = defaultNRISocket
320+
}
321+
322+
socketPaths = append(socketPaths, origSocketPath)
323+
socketPathSuffix, found := strings.CutPrefix(origSocketPath, "/var/run/")
324+
if found {
325+
fallbackSocketPath := fmt.Sprintf("%s/%s", "/run", socketPathSuffix)
326+
socketPaths = append(socketPaths, fallbackSocketPath)
327+
}
328+
return socketPaths
329+
}
330+
331+
func StopNRIPlugin(plugin *nri.Plugin) {
332+
plugin.Stub.Stop()
333+
}

cmd/nvidia-ctk-installer/main.go

Lines changed: 27 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ import (
88
"path/filepath"
99
"syscall"
1010

11+
"github.com/containerd/nri/pkg/stub"
1112
"github.com/urfave/cli/v3"
1213
"golang.org/x/sys/unix"
1314

@@ -70,7 +71,8 @@ func main() {
7071
type app struct {
7172
logger logger.Interface
7273

73-
toolkit *toolkit.Installer
74+
pluginStub stub.Stub
75+
toolkit *toolkit.Installer
7476
}
7577

7678
// NewApp creates the CLI app for the specified options.
@@ -93,8 +95,8 @@ func (a app) build() *cli.Command {
9395
Before: func(ctx context.Context, cmd *cli.Command) (context.Context, error) {
9496
return ctx, a.Before(cmd, &options)
9597
},
96-
Action: func(_ context.Context, cmd *cli.Command) error {
97-
return a.Run(cmd, &options)
98+
Action: func(ctx context.Context, cmd *cli.Command) error {
99+
return a.Run(ctx, cmd, &options)
98100
},
99101
Flags: []cli.Flag{
100102
&cli.BoolFlag{
@@ -194,7 +196,7 @@ func (a *app) validateFlags(c *cli.Command, o *options) error {
194196
// Run installs the NVIDIA Container Toolkit and updates the requested runtime.
195197
// If the application is run as a daemon, the application waits and unconfigures
196198
// the runtime on termination.
197-
func (a *app) Run(c *cli.Command, o *options) error {
199+
func (a *app) Run(ctx context.Context, c *cli.Command, o *options) error {
198200
err := a.initialize(o.pidFile)
199201
if err != nil {
200202
return fmt.Errorf("unable to initialize: %v", err)
@@ -221,6 +223,11 @@ func (a *app) Run(c *cli.Command, o *options) error {
221223
return fmt.Errorf("unable to setup runtime: %v", err)
222224
}
223225

226+
err = a.startNRIPluginServer(ctx, &o.runtimeOptions)
227+
if err != nil {
228+
return fmt.Errorf("unable to start runtime plugin server: %w", err)
229+
}
230+
224231
if !o.noDaemon {
225232
err = a.waitForSignal()
226233
if err != nil {
@@ -290,6 +297,11 @@ func (a *app) waitForSignal() error {
290297
func (a *app) shutdown(pidFile string) {
291298
a.logger.Infof("Shutting Down")
292299

300+
if a.pluginStub != nil {
301+
a.logger.Infof("Stopping NRI plugin server...")
302+
a.pluginStub.Stop()
303+
}
304+
293305
err := os.Remove(pidFile)
294306
if err != nil {
295307
a.logger.Warningf("Unable to remove pidfile: %v", err)
@@ -327,3 +339,14 @@ func (a *app) resolvePackageType(hostRoot string, packageType string) (rPackageT
327339

328340
return "deb", nil
329341
}
342+
343+
func (a *app) startNRIPluginServer(ctx context.Context, opts *runtime.Options) error {
344+
a.logger.Info("Starting NRI Plugin server...")
345+
plugin, err := runtime.StartNRIPlugin(ctx, opts)
346+
if plugin == nil || err != nil {
347+
a.logger.Errorf("Failed to start NRI plugin server: %v", err)
348+
return fmt.Errorf("unable to setup NRI plugin server: %w", err)
349+
}
350+
a.pluginStub = plugin.Stub
351+
return nil
352+
}

cmd/nvidia-ctk-installer/main_test.go

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -327,6 +327,9 @@ version = 2
327327
328328
[plugins."io.containerd.grpc.v1.cri".containerd.runtimes.nvidia-legacy.options]
329329
BinaryName = "{{ .toolkitRoot }}/toolkit/nvidia-container-runtime.legacy"
330+
331+
[plugins."io.containerd.nri.v1.nri"]
332+
disable = false
330333
`,
331334
},
332335
{
@@ -415,6 +418,9 @@ version = 2
415418
416419
[plugins."io.containerd.grpc.v1.cri".containerd.runtimes.nvidia-legacy.options]
417420
BinaryName = "{{ .toolkitRoot }}/toolkit/nvidia-container-runtime.legacy"
421+
422+
[plugins."io.containerd.nri.v1.nri"]
423+
disable = false
418424
`,
419425
},
420426
}

go.mod

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ go 1.25.0
55
require (
66
github.com/NVIDIA/go-nvlib v0.8.1
77
github.com/NVIDIA/go-nvml v0.13.0-1
8+
github.com/containerd/nri v0.10.1-0.20251120153915-7d8611f87ad7
89
github.com/google/uuid v1.6.0
910
github.com/moby/sys/mountinfo v0.7.2
1011
github.com/moby/sys/reexec v0.1.0
@@ -19,24 +20,31 @@ require (
1920
github.com/urfave/cli/v3 v3.6.1
2021
golang.org/x/mod v0.30.0
2122
golang.org/x/sys v0.38.0
23+
sigs.k8s.io/yaml v1.4.0
2224
tags.cncf.io/container-device-interface v1.0.2-0.20251114135136-1b24d969689f
2325
tags.cncf.io/container-device-interface/specs-go v1.0.0
2426
)
2527

2628
require (
2729
cyphar.com/go-pathrs v0.2.1 // indirect
30+
github.com/containerd/log v0.1.0 // indirect
31+
github.com/containerd/ttrpc v1.2.7 // indirect
2832
github.com/cyphar/filepath-securejoin v0.6.0 // indirect
2933
github.com/davecgh/go-spew v1.1.1 // indirect
3034
github.com/fsnotify/fsnotify v1.7.0 // indirect
35+
github.com/golang/protobuf v1.5.3 // indirect
3136
github.com/hashicorp/errwrap v1.1.0 // indirect
32-
github.com/kr/pretty v0.3.1 // indirect
37+
github.com/knqyf263/go-plugin v0.9.0 // indirect
38+
github.com/kr/text v0.2.0 // indirect
3339
github.com/moby/sys/capability v0.4.0 // indirect
3440
github.com/opencontainers/cgroups v0.0.4 // indirect
3541
github.com/opencontainers/runtime-tools v0.9.1-0.20251114084447-edf4cb3d2116 // indirect
3642
github.com/pmezard/go-difflib v1.0.0 // indirect
3743
github.com/rogpeppe/go-internal v1.11.0 // indirect
44+
github.com/tetratelabs/wazero v1.9.0 // indirect
3845
github.com/xeipuuv/gojsonpointer v0.0.0-20190905194746-02993c407bfb // indirect
39-
gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c // indirect
46+
google.golang.org/genproto/googleapis/rpc v0.0.0-20230731190214-cbb8c96f2d6d // indirect
47+
google.golang.org/grpc v1.57.1 // indirect
48+
google.golang.org/protobuf v1.36.5 // indirect
4049
gopkg.in/yaml.v3 v3.0.1 // indirect
41-
sigs.k8s.io/yaml v1.4.0 // indirect
4250
)

0 commit comments

Comments
 (0)