Skip to content

Commit c2c1db7

Browse files
nwpu-zxr and liziyu179 authored
[Bugfix] fix ZeroDivisionError when prefill_tp_size > num_kv_head and fix tp_resharding README (#3437)
### What this PR does / why we need it? Fix a ZeroDivisionError that occurs when prefill_tp_size > num_kv_head. In this situation, num_head_replica (computed as prefill_tp_size // num_kv_head) can be 0 and is then used as a divisor; this PR restricts the minimum value of num_head_replica to 1. This PR also fixes the tp_resharding README. ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? By CI. - vLLM version: v0.11.0rc3 - vLLM main: https://github.com/vllm-project/vllm/commit/v0.11.0 --------- Signed-off-by: liziyu <[email protected]> Signed-off-by: nwpu-zxr <[email protected]> Co-authored-by: liziyu <[email protected]>
1 parent 02c26dc commit c2c1db7

File tree

4 files changed

+26
-14
lines changed

4 files changed

+26
-14
lines changed

docs/source/tutorials/multi_node_pd_disaggregation_mooncake.md

Lines changed: 20 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -114,10 +114,10 @@ export VLLM_USE_V1=1
114114
export HCCL_BUFFSIZE=1024
115115
export OMP_PROC_BIND=false
116116
export OMP_NUM_THREADS=10
117-
export ASCEND_AGGREGATE_ENABLE=1
118-
export ASCEND_TRANSPORT_PRINT=0
119-
export ACL_OP_INIT_MODE=1
120-
export ASCEND_A3_ENABLE=1
117+
export ASCEND_AGGREGATE_ENABLE=1 # enable aggregated transmission
118+
export ASCEND_TRANSPORT_PRINT=0 # toggle for printing ascend transport logs
119+
export ACL_OP_INIT_MODE=1 # acl op initialization mode to prevent device id acquisition failure
120+
export ASCEND_A3_ENABLE=1 # enable hccs transmission for A3; set to 0 for A2
121121
export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/python/site-packages:$LD_LIBRARY_PATH
122122

123123
vllm serve /model/Qwen3-235B-A22B-W8A8 \
@@ -137,7 +137,6 @@ vllm serve /model/Qwen3-235B-A22B-W8A8 \
137137
--max-model-len 32768 \
138138
--max-num-batched-tokens 32768 \
139139
--trust-remote-code \
140-
--no-enable-prefix-caching \
141140
--gpu-memory-utilization 0.9 \
142141
--kv-transfer-config \
143142
'{"kv_connector": "MooncakeLayerwiseConnector",
@@ -197,7 +196,6 @@ vllm serve /model/Qwen3-235B-A22B-W8A8 \
197196
--max-model-len 32768 \
198197
--max-num-batched-tokens 32768 \
199198
--trust-remote-code \
200-
--no-enable-prefix-caching \
201199
--gpu-memory-utilization 0.9 \
202200
--kv-transfer-config \
203201
'{"kv_connector": "MooncakeLayerwiseConnector",
@@ -363,6 +361,10 @@ export VLLM_USE_V1=1
363361
export HCCL_BUFFSIZE=1024
364362
export OMP_PROC_BIND=false
365363
export OMP_NUM_THREADS=10
364+
export ASCEND_AGGREGATE_ENABLE=1
365+
export ASCEND_TRANSPORT_PRINT=0
366+
export ACL_OP_INIT_MODE=1
367+
export ASCEND_A3_ENABLE=1
366368
export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/python/site-packages:$LD_LIBRARY_PATH
367369

368370
vllm serve /model/Qwen3-235B-A22B-W8A8 \
@@ -382,7 +384,6 @@ vllm serve /model/Qwen3-235B-A22B-W8A8 \
382384
--max-model-len 32768 \
383385
--max-num-batched-tokens 32768 \
384386
--trust-remote-code \
385-
--no-enable-prefix-caching \
386387
--gpu-memory-utilization 0.9 \
387388
--kv-transfer-config \
388389
'{"kv_connector": "MooncakeConnector",
@@ -419,6 +420,10 @@ export VLLM_USE_V1=1
419420
export HCCL_BUFFSIZE=1024
420421
export OMP_PROC_BIND=false
421422
export OMP_NUM_THREADS=10
423+
export ASCEND_AGGREGATE_ENABLE=1
424+
export ASCEND_TRANSPORT_PRINT=0
425+
export ACL_OP_INIT_MODE=1
426+
export ASCEND_A3_ENABLE=1
422427
export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/python/site-packages:$LD_LIBRARY_PATH
423428

424429
vllm serve /model/Qwen3-235B-A22B-W8A8 \
@@ -438,7 +443,6 @@ vllm serve /model/Qwen3-235B-A22B-W8A8 \
438443
--max-model-len 32768 \
439444
--max-num-batched-tokens 32768 \
440445
--trust-remote-code \
441-
--no-enable-prefix-caching \
442446
--gpu-memory-utilization 0.9 \
443447
--kv-transfer-config \
444448
'{"kv_connector": "MooncakeConnector",
@@ -475,6 +479,10 @@ export VLLM_USE_V1=1
475479
export HCCL_BUFFSIZE=2048
476480
export OMP_PROC_BIND=false
477481
export OMP_NUM_THREADS=10
482+
export ASCEND_AGGREGATE_ENABLE=1
483+
export ASCEND_TRANSPORT_PRINT=0
484+
export ACL_OP_INIT_MODE=1
485+
export ASCEND_A3_ENABLE=1
478486
export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/python/site-packages:$LD_LIBRARY_PATH
479487

480488
vllm serve /model/Qwen3-235B-A22B-W8A8 \
@@ -532,6 +540,10 @@ export VLLM_USE_V1=1
532540
export HCCL_BUFFSIZE=2048
533541
export OMP_PROC_BIND=false
534542
export OMP_NUM_THREADS=10
543+
export ASCEND_AGGREGATE_ENABLE=1
544+
export ASCEND_TRANSPORT_PRINT=0
545+
export ACL_OP_INIT_MODE=1
546+
export ASCEND_A3_ENABLE=1
535547
export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/python/site-packages:$LD_LIBRARY_PATH
536548

537549
vllm serve /model/Qwen3-235B-A22B-W8A8 \

tests/ut/kv_connector/test_mooncake_layerwise_connector.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -79,7 +79,7 @@ def setUp(self):
7979
self.p1 = patch(
8080
'vllm_ascend.distributed.mooncake_layerwise_connector.get_ascend_config',
8181
new=MagicMock(return_value=SimpleNamespace(
82-
pd_tp_ratio=1, num_head_replica=0, pd_head_ratio=1)))
82+
pd_tp_ratio=1, num_head_replica=1, pd_head_ratio=1)))
8383
self.p2 = patch(
8484
'vllm_ascend.distributed.mooncake_layerwise_connector.get_current_vllm_config',
8585
new=MagicMock(return_value=SimpleNamespace(
@@ -244,7 +244,7 @@ def setUp(self):
244244
self.p1 = patch(
245245
'vllm_ascend.distributed.mooncake_layerwise_connector.get_ascend_config',
246246
new=MagicMock(return_value=SimpleNamespace(
247-
pd_tp_ratio=1, num_head_replica=0, pd_head_ratio=1)))
247+
pd_tp_ratio=1, num_head_replica=1, pd_head_ratio=1)))
248248
self.p2 = patch(
249249
'vllm_ascend.distributed.mooncake_layerwise_connector.get_current_vllm_config',
250250
new=MagicMock(return_value=SimpleNamespace(
@@ -903,7 +903,7 @@ def setUp(self):
903903
patch(
904904
'vllm_ascend.distributed.mooncake_layerwise_connector.get_ascend_config',
905905
return_value=SimpleNamespace(pd_tp_ratio=1,
906-
num_head_replica=0,
906+
num_head_replica=1,
907907
pd_head_ratio=1),
908908
),
909909
patch(

vllm_ascend/ascend_config.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -102,7 +102,7 @@ def __init__(self, vllm_config):
102102
)
103103
self.pd_tp_ratio = 1
104104
self.pd_head_ratio = 1
105-
self.num_head_replica = 0
105+
self.num_head_replica = 1
106106
if vllm_config.kv_transfer_config is not None and not vllm_config.model_config.is_deepseek_mla:
107107
prefill_tp_size = vllm_config.kv_transfer_config.get_from_extra_config(
108108
"prefill", {"tp_size": 1})["tp_size"]
@@ -115,7 +115,7 @@ def __init__(self, vllm_config):
115115
# only support Qwen model now
116116
# TODO: use a more robust method to get kv_head_num
117117
num_kv_head = vllm_config.model_config.hf_config.num_key_value_heads
118-
self.num_head_replica = prefill_tp_size // num_kv_head
118+
self.num_head_replica = prefill_tp_size // num_kv_head if prefill_tp_size >= num_kv_head else 1
119119
prefill_tp_size = min(prefill_tp_size, num_kv_head)
120120
decode_tp_size = min(decode_tp_size, num_kv_head)
121121
self.pd_head_ratio = prefill_tp_size // decode_tp_size

vllm_ascend/distributed/mooncake_layerwise_connector.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -360,7 +360,7 @@ def _transfer_kv_cache(self, req_meta: DecodeMooncakeAgentMetadata,
360360
remote_kv_base_addrs = req_meta.kv_caches_base_addr
361361

362362
remote_block_ids = req_meta.block_ids
363-
if self.num_head_replica >= 1 and self.tp_rank % self.num_head_replica != 0:
363+
if self.tp_rank % self.num_head_replica != 0:
364364
pass
365365
elif self.pd_head_ratio == 1:
366366
layer_local_kv_base_addr = [

0 commit comments

Comments
 (0)