From 5cf797405ba193ebf482d5daf42342a1a0110585 Mon Sep 17 00:00:00 2001
From: Fang Han
Date: Tue, 11 Nov 2025 22:55:03 -0800
Subject: [PATCH] [Feat] Add per-model runtimeClassName configuration support

Signed-off-by: Fang Han
---
 helm/templates/deployment-vllm-multi.yaml |   4 +-
 helm/templates/ray-cluster.yaml           |   8 +-
 helm/tests/runtimeClassName_test.yaml     | 136 ++++++++++++++++++++++
 helm/values-example.yaml                  |   5 +
 helm/values.schema.json                   |   3 +
 helm/values.yaml                          |   1 +
 6 files changed, 151 insertions(+), 6 deletions(-)
 create mode 100644 helm/tests/runtimeClassName_test.yaml

diff --git a/helm/templates/deployment-vllm-multi.yaml b/helm/templates/deployment-vllm-multi.yaml
index 207be3297..a4b770e58 100644
--- a/helm/templates/deployment-vllm-multi.yaml
+++ b/helm/templates/deployment-vllm-multi.yaml
@@ -446,8 +446,8 @@ spec:
         {{- end }}
       {{- end }}

-      {{- if .Values.servingEngineSpec.runtimeClassName }}
-      runtimeClassName: {{ .Values.servingEngineSpec.runtimeClassName }}
+      {{- with (ternary $modelSpec.runtimeClassName .Values.servingEngineSpec.runtimeClassName (hasKey $modelSpec "runtimeClassName")) }}
+      runtimeClassName: {{ . }}
       {{- end }}
       {{- if .Values.servingEngineSpec.schedulerName }}
       schedulerName: {{ .Values.servingEngineSpec.schedulerName }}
diff --git a/helm/templates/ray-cluster.yaml b/helm/templates/ray-cluster.yaml
index 41f45bf38..8319b5892 100644
--- a/helm/templates/ray-cluster.yaml
+++ b/helm/templates/ray-cluster.yaml
@@ -231,8 +231,8 @@ spec:
           {{- toYaml . | nindent 10 }}
           {{- end }}
         {{- end }}
-        {{- if .Values.servingEngineSpec.runtimeClassName }}
-        runtimeClassName: {{ .Values.servingEngineSpec.runtimeClassName }}
+        {{- with (ternary $modelSpec.runtimeClassName .Values.servingEngineSpec.runtimeClassName (hasKey $modelSpec "runtimeClassName")) }}
+        runtimeClassName: {{ . }}
         {{- end }}
         {{- if .Values.servingEngineSpec.schedulerName }}
         schedulerName: {{ .Values.servingEngineSpec.schedulerName }}
@@ -441,8 +441,8 @@ spec:
           {{- end }}
         {{- end }}

-        {{- if .Values.servingEngineSpec.runtimeClassName }}
-        runtimeClassName: {{ .Values.servingEngineSpec.runtimeClassName }}
+        {{- with (ternary $modelSpec.runtimeClassName .Values.servingEngineSpec.runtimeClassName (hasKey $modelSpec "runtimeClassName")) }}
+        runtimeClassName: {{ . }}
         {{- end }}
         {{- if .Values.servingEngineSpec.schedulerName }}
         schedulerName: {{ .Values.servingEngineSpec.schedulerName }}
diff --git a/helm/tests/runtimeClassName_test.yaml b/helm/tests/runtimeClassName_test.yaml
new file mode 100644
index 000000000..77a83a61a
--- /dev/null
+++ b/helm/tests/runtimeClassName_test.yaml
@@ -0,0 +1,136 @@
+suite: test runtimeClassName configuration
+templates:
+  - deployment-vllm-multi.yaml
+  - ray-cluster.yaml
+tests:
+  - it: should use global runtimeClassName when no model override is set
+    set:
+      servingEngineSpec:
+        enableEngine: true
+        runtimeClassName: "nvidia"
+        modelSpec:
+          - name: "test-model"
+            repository: "vllm/vllm-openai"
+            tag: "latest"
+            modelURL: "facebook/opt-125m"
+            replicaCount: 1
+            requestCPU: 1
+            requestMemory: "1Gi"
+            requestGPU: 1
+    asserts:
+      - template: deployment-vllm-multi.yaml
+        equal:
+          path: spec.template.spec.runtimeClassName
+          value: "nvidia"
+
+  - it: should use model-specific runtimeClassName when set
+    set:
+      servingEngineSpec:
+        enableEngine: true
+        runtimeClassName: "nvidia"
+        modelSpec:
+          - name: "test-model-custom"
+            repository: "vllm/vllm-openai"
+            tag: "latest"
+            modelURL: "facebook/opt-125m"
+            runtimeClassName: "custom-runtime"
+            replicaCount: 1
+            requestCPU: 1
+            requestMemory: "1Gi"
+            requestGPU: 1
+    asserts:
+      - template: deployment-vllm-multi.yaml
+        equal:
+          path: spec.template.spec.runtimeClassName
+          value: "custom-runtime"
+
+  - it: should use model-specific runtimeClassName in Ray cluster head and worker nodes
+    set:
+      servingEngineSpec:
+        enableEngine: true
+        runtimeClassName: "nvidia"
+        modelSpec:
+          - name: "ray-model-custom"
+            repository: "vllm/vllm-openai"
+            tag: "latest"
+            modelURL: "facebook/opt-125m"
+            runtimeClassName: "custom-ray"
+            replicaCount: 2
+            requestCPU: 1
+            requestMemory: "1Gi"
+            requestGPU: 1
+            raySpec:
+              enabled: true
+              headNode:
+                requestCPU: 1
+                requestMemory: "1Gi"
+                requestGPU: 1
+    asserts:
+      - template: ray-cluster.yaml
+        documentIndex: 0
+        equal:
+          path: spec.headGroupSpec.template.spec.runtimeClassName
+          value: "custom-ray"
+      - template: ray-cluster.yaml
+        documentIndex: 0
+        equal:
+          path: spec.workerGroupSpecs[0].template.spec.runtimeClassName
+          value: "custom-ray"
+
+  - it: should default to nvidia if runtimeClassName is not provided
+    set:
+      servingEngineSpec:
+        enableEngine: true
+        modelSpec:
+          - name: "test-model-no-runtime"
+            repository: "vllm/vllm-openai"
+            tag: "latest"
+            modelURL: "facebook/opt-125m"
+            replicaCount: 1
+            requestCPU: 1
+            requestMemory: "1Gi"
+            requestGPU: 1
+    asserts:
+      - template: deployment-vllm-multi.yaml
+        equal:
+          path: spec.template.spec.runtimeClassName
+          value: "nvidia"
+
+  - it: should use model-specific runtimeClassName when no global is set
+    set:
+      servingEngineSpec:
+        enableEngine: true
+        modelSpec:
+          - name: "test-model-only-model-runtime"
+            repository: "vllm/vllm-openai"
+            tag: "latest"
+            modelURL: "facebook/opt-125m"
+            runtimeClassName: "model-only-runtime"
+            replicaCount: 1
+            requestCPU: 1
+            requestMemory: "1Gi"
+            requestGPU: 1
+    asserts:
+      - template: deployment-vllm-multi.yaml
+        equal:
+          path: spec.template.spec.runtimeClassName
+          value: "model-only-runtime"
+
+  - it: should not set runtimeClassName when global is explicitly set to empty string
+    set:
+      servingEngineSpec:
+        enableEngine: true
+        runtimeClassName: ""
+        modelSpec:
+          - name: "test-model-empty-runtime"
+            repository: "vllm/vllm-openai"
+            tag: "latest"
+            modelURL: "facebook/opt-125m"
+            replicaCount: 1
+            requestCPU: 1
+            requestMemory: "1Gi"
+            requestGPU: 1
+    asserts:
+      - template: deployment-vllm-multi.yaml
+        notExists:
+          path: spec.template.spec.runtimeClassName
diff --git a/helm/values-example.yaml b/helm/values-example.yaml
index 004880fbe..3809276ae 100644
--- a/helm/values-example.yaml
+++ b/helm/values-example.yaml
@@ -1,9 +1,14 @@
 servingEngineSpec:
+  # Global runtime class for all models (can be overridden per model)
+  runtimeClassName: "nvidia"
+
   modelSpec:
   - name: "opt125m"
     repository: "lmcache/vllm-openai"
     tag: "latest"
     modelURL: "facebook/opt-125m"
+    # Override global runtimeClassName for this specific model
+    runtimeClassName: "custom-runtime"

     replicaCount: 1

diff --git a/helm/values.schema.json b/helm/values.schema.json
index a7fb2a332..cead7f133 100644
--- a/helm/values.schema.json
+++ b/helm/values.schema.json
@@ -109,6 +109,9 @@
         "priorityClassName": {
           "type": "string"
         },
+        "runtimeClassName": {
+          "type": "string"
+        },
         "pvcStorage": {
           "type": "string"
         },
diff --git a/helm/values.yaml b/helm/values.yaml
index f1680db68..6414ed8cc 100644
--- a/helm/values.yaml
+++ b/helm/values.yaml
@@ -21,6 +21,7 @@ servingEngineSpec:
   # - annotations: (Optional, map) The annotations to add to the deployment, e.g., {model: "opt125m"}
   # - serviceAccountName: (Optional, string) The name of the service account to use for the deployment, e.g., "vllm-service-account"
   # - priorityClassName: (Optional, string) The name of the priority class name for the deployment, e.g., "high-priority"
+  # - runtimeClassName: (Optional, string) Runtime class for the pod, e.g., "nvidia". If not specified, falls back to servingEngineSpec.runtimeClassName
   # - podAnnotations: (Optional, map) The annotations to add to the pod, e.g., {model: "opt125m"}
   # - name: (string) The name of the model, e.g., "example-model"
   # - repository: (string) The repository of the model, e.g., "vllm/vllm-openai"
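
Note on the fallback semantics: the ternary/hasKey expression used in both templates picks the per-model value whenever the runtimeClassName key is present on the model entry and the global servingEngineSpec.runtimeClassName otherwise; because the result is wrapped in a "with" block, an empty string at either level causes the field to be omitted from the pod spec entirely. Below is a minimal values sketch of the three cases; the model names and the "kata" runtime class are illustrative placeholders rather than chart defaults, and the other required modelSpec fields (repository, modelURL, resource requests, and so on) are omitted for brevity:

    servingEngineSpec:
      runtimeClassName: "nvidia"    # global value, inherited by any model that sets nothing
      modelSpec:
      - name: "model-a"             # no per-model key -> pod gets runtimeClassName: nvidia
      - name: "model-b"
        runtimeClassName: "kata"    # per-model override -> pod gets runtimeClassName: kata
      - name: "model-c"
        runtimeClassName: ""        # key present but empty -> runtimeClassName omitted from the pod spec

The suite added in helm/tests/runtimeClassName_test.yaml can be run with the helm-unittest plugin, assuming it is installed, for example by running "helm unittest helm" from the repository root.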