Commit 7998394

replace .github/values-10-disagg-prefill.yaml with values-16-disagg-prefill.yaml
Signed-off-by: Kobe Chen <[email protected]>
1 parent 1db19aa commit 7998394

1 file changed: 48 additions & 41 deletions
@@ -1,88 +1,90 @@
 # Unified configuration for disaggregated prefill setup
 servingEngineSpec:
-  strategy:
-    type: Recreate
   enableEngine: true
-  runtimeClassName: ""
+  runtimeClassName: "nvidia"
   containerPort: 8000
   modelSpec:
     # Prefill node configuration
-    - name: "opt125m-prefill"
+    - name: "llama-prefill"
       repository: "lmcache/vllm-openai"
-      tag: "2025-05-27-v1"
-      modelURL: "facebook/opt-125m"
+      tag: "nightly-2025-09-04"
+      modelURL: "meta-llama/Llama-3.1-8B-Instruct"
       replicaCount: 1
       requestCPU: 8
       requestMemory: "30Gi"
       # requestGPU: 1
       pvcStorage: "50Gi"
       vllmConfig:
-        enablePrefixCaching: true
-        maxModelLen: 1024
-        v1: 1
-        gpuMemoryUtilization: 0.6
+        enablePrefixCaching: false
+        # maxModelLen: 2048
+        extraArgs:
+          - "--enforce-eager"
+          - "--disable-log-requests"
       lmcacheConfig:
         cudaVisibleDevices: "0"
         enabled: true
         kvRole: "kv_producer"
+        localCpu: true
+        maxLocalCpuSize: 5
+        maxLocalDiskSize: 0
         enableNixl: true
+        enableXpyd: true
         nixlRole: "sender"
-        nixlPeerHost: "vllm-opt125m-decode-engine-service"
-        nixlPeerPort: "55555"
-        nixlBufferSize: "1073741824" # 1GB
+        nixlProxyHost: "vllm-router-service"
+        nixlProxyPort: 7500
+        nixlBufferSize: "1073741824"
         nixlBufferDevice: "cuda"
-        nixlEnableGc: true
         enablePD: true
-        cpuOffloadingBufferSize: 0
+        rpcPort: "producer1"
       labels:
-        model: "opt125m-prefill"
-      chatTemplate: "chat.jinja2"
-      chatTemplateConfigMap: |-
-        {% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content']}}{% if (loop.last and add_generation_prompt) or not loop.last %}{{ '<|im_end|>' + '\n'}}{% endif %}{% endfor %}
-        {% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{ '<|im_start|>assistant\n' }}{% endif %}
+        model: "llama-prefill"
+      hf_token: <hf-token>
     # Decode node configuration
-    - name: "opt125m-decode"
+    - name: "llama-decode"
       repository: "lmcache/vllm-openai"
-      tag: "2025-05-27-v1"
-      modelURL: "facebook/opt-125m"
+      tag: "nightly-2025-09-04"
+      modelURL: "meta-llama/Llama-3.1-8B-Instruct"
       replicaCount: 1
       requestCPU: 8
       requestMemory: "30Gi"
       # requestGPU: 1
       pvcStorage: "50Gi"
       vllmConfig:
-        enablePrefixCaching: true
-        maxModelLen: 1024
-        v1: 1
+        enablePrefixCaching: false
+        # maxModelLen: 2048
+        extraArgs:
+          - "--enforce-eager"
+          - "--disable-log-requests"
       lmcacheConfig:
         cudaVisibleDevices: "1"
         enabled: true
         kvRole: "kv_consumer" # Set decode node as consumer
+        localCpu: false
+        maxLocalCpuSize: 0
         enableNixl: true
+        enableXpyd: true
         nixlRole: "receiver"
         nixlPeerHost: "0.0.0.0"
-        nixlPeerPort: "55555"
-        nixlBufferSize: "1073741824" # 1GB
+        nixlPeerInitPort: 7300
+        nixlPeerAllocPort: 7400
+        nixlBufferSize: "2147483648"
         nixlBufferDevice: "cuda"
-        nixlEnableGc: true
+        # nixlBackends: ["UCX"]
         enablePD: true
+        rpcPort: "consumer1"
+        skipLastNTokens: 1
+      hf_token: <hf-token>
       labels:
-        model: "opt125m-decode"
-      chatTemplate: "chat.jinja2"
-      chatTemplateConfigMap: |-
-        {% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content']}}{% if (loop.last and add_generation_prompt) or not loop.last %}{{ '<|im_end|>' + '\n'}}{% endif %}{% endfor %}
-        {% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{ '<|im_start|>assistant\n' }}{% endif %}
+        model: "llama-decode"
       containerSecurityContext:
         capabilities:
           add:
             - SYS_PTRACE
-
 routerSpec:
   enableRouter: true
-  repository: "git-act-router"
-  imagePullPolicy: "IfNotPresent"
-  strategy:
-    type: Recreate
+  repository: "xiaokunchen/vllm-router"
+  tag: "08-27-v8"
+  imagePullPolicy: "Always"
   replicaCount: 1
   containerPort: 8000
   servicePort: 80
102104
release: "router"
103105
extraArgs:
104106
- "--prefill-model-labels"
105-
- "opt125m-prefill"
107+
- "llama-prefill"
106108
- "--decode-model-labels"
107-
- "opt125m-decode"
109+
- "llama-decode"
110+
nixlPeerHost: "vllm-llama-decode-engine-service"
111+
nixlPeerInitPort: 7300
112+
nixlPeerAllocPort: 7400
113+
nixlProxyHost: "0.0.0.0"
114+
nixlProxyPort: 7500
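
A condensed view of the NIXL/XpYd wiring introduced by this change may help when reading the diff. The values and service names below are copied from the new side of the diff; the prefill/decode/router grouping keys are illustrative only and are not chart values:

# Prefill engine (llama-prefill, kvRole: kv_producer) registers with the router's proxy.
prefill:
  nixlRole: "sender"
  nixlProxyHost: "vllm-router-service"
  nixlProxyPort: 7500
  rpcPort: "producer1"
# Decode engine (llama-decode, kvRole: kv_consumer) listens for KV transfers.
decode:
  nixlRole: "receiver"
  nixlPeerHost: "0.0.0.0"
  nixlPeerInitPort: 7300
  nixlPeerAllocPort: 7400
  rpcPort: "consumer1"
# Router runs the proxy on 7500 and reaches the decode engine Service on 7300/7400.
router:
  nixlProxyHost: "0.0.0.0"
  nixlProxyPort: 7500
  nixlPeerHost: "vllm-llama-decode-engine-service"
  nixlPeerInitPort: 7300
  nixlPeerAllocPort: 7400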
