Commit 7998394

replace .github/values-10-disagg-prefill.yaml with values-16-disagg-prefill.yaml
Signed-off-by: Kobe Chen <[email protected]>
1 parent 1db19aa commit 7998394

1 file changed: 48 additions & 41 deletions
@@ -1,88 +1,90 @@
 # Unified configuration for disaggregated prefill setup
 servingEngineSpec:
-  strategy:
-    type: Recreate
   enableEngine: true
-  runtimeClassName: ""
+  runtimeClassName: "nvidia"
   containerPort: 8000
   modelSpec:
     # Prefill node configuration
-    - name: "opt125m-prefill"
+    - name: "llama-prefill"
       repository: "lmcache/vllm-openai"
-      tag: "2025-05-27-v1"
-      modelURL: "facebook/opt-125m"
+      tag: "nightly-2025-09-04"
+      modelURL: "meta-llama/Llama-3.1-8B-Instruct"
       replicaCount: 1
       requestCPU: 8
       requestMemory: "30Gi"
       # requestGPU: 1
       pvcStorage: "50Gi"
       vllmConfig:
-        enablePrefixCaching: true
-        maxModelLen: 1024
-        v1: 1
-        gpuMemoryUtilization: 0.6
+        enablePrefixCaching: false
+        # maxModelLen: 2048
+        extraArgs:
+          - "--enforce-eager"
+          - "--disable-log-requests"
       lmcacheConfig:
         cudaVisibleDevices: "0"
         enabled: true
         kvRole: "kv_producer"
+        localCpu: true
+        maxLocalCpuSize: 5
+        maxLocalDiskSize: 0
         enableNixl: true
+        enableXpyd: true
         nixlRole: "sender"
-        nixlPeerHost: "vllm-opt125m-decode-engine-service"
-        nixlPeerPort: "55555"
-        nixlBufferSize: "1073741824" # 1GB
+        nixlProxyHost: "vllm-router-service"
+        nixlProxyPort: 7500
+        nixlBufferSize: "1073741824"
         nixlBufferDevice: "cuda"
-        nixlEnableGc: true
         enablePD: true
-        cpuOffloadingBufferSize: 0
+        rpcPort: "producer1"
       labels:
-        model: "opt125m-prefill"
-      chatTemplate: "chat.jinja2"
-      chatTemplateConfigMap: |-
-        {% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content']}}{% if (loop.last and add_generation_prompt) or not loop.last %}{{ '<|im_end|>' + '\n'}}{% endif %}{% endfor %}
-        {% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{ '<|im_start|>assistant\n' }}{% endif %}
+        model: "llama-prefill"
+      hf_token: <hf-token>
     # Decode node configuration
-    - name: "opt125m-decode"
+    - name: "llama-decode"
       repository: "lmcache/vllm-openai"
-      tag: "2025-05-27-v1"
-      modelURL: "facebook/opt-125m"
+      tag: "nightly-2025-09-04"
+      modelURL: "meta-llama/Llama-3.1-8B-Instruct"
       replicaCount: 1
       requestCPU: 8
       requestMemory: "30Gi"
       # requestGPU: 1
       pvcStorage: "50Gi"
       vllmConfig:
-        enablePrefixCaching: true
-        maxModelLen: 1024
-        v1: 1
+        enablePrefixCaching: false
+        # maxModelLen: 2048
+        extraArgs:
+          - "--enforce-eager"
+          - "--disable-log-requests"
       lmcacheConfig:
         cudaVisibleDevices: "1"
         enabled: true
         kvRole: "kv_consumer" # Set decode node as consumer
+        localCpu: false
+        maxLocalCpuSize: 0
         enableNixl: true
+        enableXpyd: true
         nixlRole: "receiver"
         nixlPeerHost: "0.0.0.0"
-        nixlPeerPort: "55555"
-        nixlBufferSize: "1073741824" # 1GB
+        nixlPeerInitPort: 7300
+        nixlPeerAllocPort: 7400
+        nixlBufferSize: "2147483648"
         nixlBufferDevice: "cuda"
-        nixlEnableGc: true
+        # nixlBackends: ["UCX"]
         enablePD: true
+        rpcPort: "consumer1"
+        skipLastNTokens: 1
+      hf_token: <hf-token>
       labels:
-        model: "opt125m-decode"
-      chatTemplate: "chat.jinja2"
-      chatTemplateConfigMap: |-
-        {% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content']}}{% if (loop.last and add_generation_prompt) or not loop.last %}{{ '<|im_end|>' + '\n'}}{% endif %}{% endfor %}
-        {% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{ '<|im_start|>assistant\n' }}{% endif %}
+        model: "llama-decode"
       containerSecurityContext:
         capabilities:
           add:
             - SYS_PTRACE
-
 routerSpec:
   enableRouter: true
-  repository: "git-act-router"
-  imagePullPolicy: "IfNotPresent"
-  strategy:
-    type: Recreate
+  repository: "xiaokunchen/vllm-router"
+  tag: "08-27-v8"
+  imagePullPolicy: "Always"
   replicaCount: 1
   containerPort: 8000
   servicePort: 80
102104
release: "router"
103105
extraArgs:
104106
- "--prefill-model-labels"
105-
- "opt125m-prefill"
107+
- "llama-prefill"
106108
- "--decode-model-labels"
107-
- "opt125m-decode"
109+
- "llama-decode"
110+
nixlPeerHost: "vllm-llama-decode-engine-service"
111+
nixlPeerInitPort: 7300
112+
nixlPeerAllocPort: 7400
113+
nixlProxyHost: "0.0.0.0"
114+
nixlProxyPort: 7500
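
A condensed view of the NIXL/XpYd wiring introduced by this change may help when reading the diff. The values and service names below are copied from the new side of the diff; the prefill/decode/router grouping keys are illustrative only and are not chart values:

# Prefill engine (llama-prefill, kvRole: kv_producer) registers with the router's proxy.
prefill:
  nixlRole: "sender"
  nixlProxyHost: "vllm-router-service"
  nixlProxyPort: 7500
  rpcPort: "producer1"
# Decode engine (llama-decode, kvRole: kv_consumer) listens for KV transfers.
decode:
  nixlRole: "receiver"
  nixlPeerHost: "0.0.0.0"
  nixlPeerInitPort: 7300
  nixlPeerAllocPort: 7400
  rpcPort: "consumer1"
# Router runs the proxy on 7500 and reaches the decode engine Service on 7300/7400.
router:
  nixlProxyHost: "0.0.0.0"
  nixlProxyPort: 7500
  nixlPeerHost: "vllm-llama-decode-engine-service"
  nixlPeerInitPort: 7300
  nixlPeerAllocPort: 7400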
