diff --git a/vllm_ascend/distributed/mooncake_layerwise_connector.py b/vllm_ascend/distributed/mooncake_layerwise_connector.py
index 457c7378cb7..c3fb6e1e8ef 100644
--- a/vllm_ascend/distributed/mooncake_layerwise_connector.py
+++ b/vllm_ascend/distributed/mooncake_layerwise_connector.py
@@ -1145,7 +1145,8 @@ def save_kv_layer(self, layer_name: str, kv_layer: Tuple[torch.Tensor,
                       connector_metadata: MooncakeLayerwiseConnectorMetadata,
                       **kwargs) -> None:
         """MooncakeLayerwiseConnector does not save explicitly."""
-        if self.kv_role == 'kv_producer':
+        if self.kv_role == 'kv_producer' and connector_metadata.requests.keys(
+        ):
             if self.pd_head_ratio != 1:
                 if self.current_layer != 0:
                     self.completion_event.wait()
diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py
index cd98c6058d4..20309d5e326 100644
--- a/vllm_ascend/worker/model_runner_v1.py
+++ b/vllm_ascend/worker/model_runner_v1.py
@@ -2370,6 +2370,10 @@ def _dummy_run(
             tp_size = self.vllm_config.parallel_config.tensor_parallel_size
             num_tokens = math.ceil(num_tokens / tp_size) * tp_size

+        # Force dummy run on prefill stage when this node is deemed as kv producer.
+        if self.is_kv_producer and not self.is_kv_consumer:
+            with_prefill = True
+
         # Padding for DP
         (num_tokens, num_tokens_across_dp, with_prefill,
          _) = self._sync_metadata_across_dp(num_tokens, with_prefill, False)
@@ -2417,10 +2421,6 @@ def _dummy_run(
         num_scheduled_tokens = np.array(num_scheduled_tokens_list,
                                         dtype=np.int32)

-        # Force dummy run on prefill stage when this node is deemed as kv producer.
-        if self.is_kv_producer and not self.is_kv_consumer:
-            with_prefill = True
-
         if not self.in_profile_run and self.dynamic_eplb:
             self.eplb_updator.forward_before()