@@ -102,239 +102,6 @@ make install

We can run the following scripts to launch servers on the prefiller and decoder nodes, respectively. Note that each P/D node occupies ports ranging from kv_port to kv_port + num_chips to set up its socket listeners, so give each instance a non-overlapping port range. Also make sure each P/D instance's engine_id is unique to avoid conflicts.

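For instance, taking the stated range literally, a prefiller configured with kv_port 30000 on an 8-NPU node claims listeners on ports 30000 through 30008, which is why the launch scripts below give every instance a well-separated kv_port (30000, 30100, 30200). A quick pre-flight check, sketched with illustrative values (assumes `ss` is available on the node):

```shell
# Illustrative pre-flight check: confirm the port range this node's
# KV connector will claim is not already taken.
KV_PORT=30000   # kv_port from this node's --kv-transfer-config
NUM_CHIPS=8     # NPUs driven by this node
for port in $(seq "$KV_PORT" $(( KV_PORT + NUM_CHIPS ))); do
  ss -ltn | grep -q ":${port} " && echo "port ${port} already in use"
done
```
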
- ### Layerwise
-
- :::::{tab-set}
-
- ::::{tab-item} Prefiller node 1
-
- ```shell
- unset ftp_proxy
- unset https_proxy
- unset http_proxy
- export HCCL_IF_IP=192.0.0.1
- export GLOO_SOCKET_IFNAME="eth0" # network card name
- export TP_SOCKET_IFNAME="eth0"
- export HCCL_SOCKET_IFNAME="eth0"
- export VLLM_USE_V1=1
- export HCCL_BUFFSIZE=1024
- export OMP_PROC_BIND=false
- export OMP_NUM_THREADS=10
- export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/python/site-packages:$LD_LIBRARY_PATH
-
- vllm serve /model/Qwen3-235B-A22B-W8A8 \
- --host 0.0.0.0 \
- --port 8004 \
- --api-server-count 1 \
- --data-parallel-size 2 \
- --data-parallel-size-local 2 \
- --data-parallel-address 192.0.0.1 \
- --data-parallel-rpc-port 13389 \
- --tensor-parallel-size 8 \
- --enable-expert-parallel \
- --seed 1024 \
- --enforce-eager \
- --distributed-executor-backend mp \
- --served-model-name qwen3-moe \
- --max-model-len 32768 \
- --max-num-batched-tokens 32768 \
- --trust-remote-code \
- --gpu-memory-utilization 0.9 \
- --kv-transfer-config \
- '{"kv_connector": "MooncakeLayerwiseConnector",
- "kv_role": "kv_producer",
- "kv_port": "30000",
- "engine_id": "0",
- "kv_connector_module_path": "vllm_ascend.distributed.mooncake_layerwise_connector",
- "kv_connector_extra_config": {
- "prefill": {
- "dp_size": 2,
- "tp_size": 8
- },
- "decode": {
- "dp_size": 32,
- "tp_size": 1
- }
- }
- }'
- ```
-
- ::::
-
- ::::{tab-item} Prefiller node 2
-
- ```shell
- unset ftp_proxy
- unset https_proxy
- unset http_proxy
- export HCCL_IF_IP=192.0.0.2
- export GLOO_SOCKET_IFNAME="eth0" # network card name
- export TP_SOCKET_IFNAME="eth0"
- export HCCL_SOCKET_IFNAME="eth0"
- export VLLM_USE_V1=1
- export HCCL_BUFFSIZE=1024
- export OMP_PROC_BIND=false
- export OMP_NUM_THREADS=10
- export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/python/site-packages:$LD_LIBRARY_PATH
-
- vllm serve /model/Qwen3-235B-A22B-W8A8 \
- --host 0.0.0.0 \
- --port 8004 \
- --api-server-count 1 \
- --data-parallel-size 2 \
- --data-parallel-size-local 2 \
- --data-parallel-address 192.0.0.2 \
- --data-parallel-rpc-port 13389 \
- --tensor-parallel-size 8 \
- --enable-expert-parallel \
- --seed 1024 \
- --enforce-eager \
- --distributed-executor-backend mp \
- --served-model-name qwen3-moe \
- --max-model-len 32768 \
- --max-num-batched-tokens 32768 \
- --trust-remote-code \
- --gpu-memory-utilization 0.9 \
- --kv-transfer-config \
- '{"kv_connector": "MooncakeLayerwiseConnector",
- "kv_role": "kv_producer",
- "kv_port": "30100",
- "engine_id": "1",
- "kv_connector_module_path": "vllm_ascend.distributed.mooncake_layerwise_connector",
- "kv_connector_extra_config": {
- "prefill": {
- "dp_size": 2,
- "tp_size": 8
- },
- "decode": {
- "dp_size": 32,
- "tp_size": 1
- }
- }
- }'
- ```
-
- ::::
-
- ::::{tab-item} Decoder node 1 (master node)
-
- ```shell
- unset ftp_proxy
- unset https_proxy
- unset http_proxy
- export HCCL_IF_IP=192.0.0.3
- export GLOO_SOCKET_IFNAME="eth0" # network card name
- export TP_SOCKET_IFNAME="eth0"
- export HCCL_SOCKET_IFNAME="eth0"
- export VLLM_USE_V1=1
- export HCCL_BUFFSIZE=2048
- export OMP_PROC_BIND=false
- export OMP_NUM_THREADS=10
- export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/python/site-packages:$LD_LIBRARY_PATH
-
- vllm serve /model/Qwen3-235B-A22B-W8A8 \
- --host 0.0.0.0 \
- --port 8004 \
- --api-server-count 1 \
- --data-parallel-size 32 \
- --data-parallel-size-local 16 \
- --data-parallel-address 192.0.0.3 \
- --data-parallel-rpc-port 5964 \
- --tensor-parallel-size 1 \
- --enable-expert-parallel \
- --seed 1024 \
- --distributed-executor-backend mp \
- --served-model-name qwen3-moe \
- --max-model-len 32768 \
- --max-num-batched-tokens 512 \
- --max-num-seqs 16 \
- --trust-remote-code \
- --no-enable-prefix-caching \
- --gpu-memory-utilization 0.9 \
- --compilation-config '{"cudagraph_capture_sizes":[16]}' \
- --kv-transfer-config \
- '{"kv_connector": "MooncakeLayerwiseConnector",
- "kv_role": "kv_consumer",
- "kv_port": "30200",
- "engine_id": "2",
- "kv_connector_module_path": "vllm_ascend.distributed.mooncake_layerwise_connector",
- "kv_connector_extra_config": {
- "prefill": {
- "dp_size": 2,
- "tp_size": 8
- },
- "decode": {
- "dp_size": 32,
- "tp_size": 1
- }
- }
- }'
- ```
-
- ::::
-
- ::::{tab-item} Decoder node 2 (worker node)
-
- ```shell
- unset ftp_proxy
- unset https_proxy
- unset http_proxy
- export HCCL_IF_IP=192.0.0.4
- export GLOO_SOCKET_IFNAME="eth0" # network card name
- export TP_SOCKET_IFNAME="eth0"
- export HCCL_SOCKET_IFNAME="eth0"
- export VLLM_USE_V1=1
- export HCCL_BUFFSIZE=2048
- export OMP_PROC_BIND=false
- export OMP_NUM_THREADS=10
- export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/python/site-packages:$LD_LIBRARY_PATH
-
- vllm serve /model/Qwen3-235B-A22B-W8A8 \
- --host 0.0.0.0 \
- --port 8004 \
- --headless \
- --data-parallel-size 32 \
- --data-parallel-size-local 16 \
- --data-parallel-start-rank 16 \
- --data-parallel-address 192.0.0.3 \
- --data-parallel-rpc-port 5964 \
- --tensor-parallel-size 1 \
- --enable-expert-parallel \
- --seed 1024 \
- --distributed-executor-backend mp \
- --served-model-name qwen3-moe \
- --max-model-len 32768 \
- --max-num-batched-tokens 512 \
- --max-num-seqs 16 \
- --trust-remote-code \
- --no-enable-prefix-caching \
- --gpu-memory-utilization 0.9 \
- --compilation-config '{"cudagraph_capture_sizes":[16]}' \
- --kv-transfer-config \
- '{"kv_connector": "MooncakeLayerwiseConnector",
- "kv_role": "kv_consumer",
- "kv_port": "30200",
- "engine_id": "2",
- "kv_connector_module_path": "vllm_ascend.distributed.mooncake_layerwise_connector",
- "kv_connector_extra_config": {
- "prefill": {
- "dp_size": 2,
- "tp_size": 8
- },
- "decode": {
- "dp_size": 32,
- "tp_size": 1
- }
- }
- }'
- ```
-
- ::::
-
- :::::
-
- ### Non-layerwise
-
:::::{tab-set}

::::{tab-item} Prefiller node 1
@@ -566,25 +333,11 @@ vllm serve /model/Qwen3-235B-A22B-W8A8 \

## Example Proxy for Deployment

- Run a proxy server on the same node as the prefiller service instance. You can get the proxy program from the repository's examples: [load\_balance\_proxy\_layerwise\_server\_example.py](https://github.com/vllm-project/vllm-ascend/blob/main/examples/disaggregated_prefill_v1/load_balance_proxy_layerwise_server_example.py) or [load\_balance\_proxy\_server\_example.py](https://github.com/vllm-project/vllm-ascend/blob/main/examples/disaggregated_prefill_v1/load_balance_proxy_server_example.py)
+ Run a proxy server on the same node as the prefiller service instance. You can get the proxy program from the repository's examples: [load\_balance\_proxy\_server\_example.py](https://github.com/vllm-project/vllm-ascend/blob/main/examples/disaggregated_prefill_v1/load_balance_proxy_server_example.py)

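Once a proxy from the tabs below is running, clients send requests to the proxy endpoint rather than to any individual prefiller or decoder. A minimal smoke test, sketched with the illustrative values used on this page (proxy at 192.0.0.1:8080, served model name qwen3-moe) and assuming the example proxy forwards the standard OpenAI-compatible completions route:

```shell
# Illustrative smoke test; substitute the host/port passed to the proxy
# and the --served-model-name used when launching vllm serve.
curl -s http://192.0.0.1:8080/v1/completions \
  -H "Content-Type: application/json" \
  -d '{"model": "qwen3-moe", "prompt": "Hello, my name is", "max_tokens": 32}'
```
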
:::::{tab-set}

- ::::{tab-item} Layerwise
-
- ```shell
- python load_balance_proxy_layerwise_server_example.py \
- --host 192.0.0.1 \
- --port 8080 \
- --prefiller-hosts 192.0.0.1 192.0.0.2 \
- --prefiller-ports 8004 8004 \
- --decoder-hosts 192.0.0.3 \
- --decoder-ports 8004
- ```
-
- ::::
-
- ::::{tab-item} Non-layerwise
+ ::::{tab-item}

```shell
python load_balance_proxy_server_example.py \