
Commit 490ddf5

[perf][dsv3.2][async_scheduling] improve dsv3.2 performance by eliminating HD synchronization (#4805)
### What this PR does / why we need it?
This PR eliminates the implicit HD (host-device) synchronization in the sfa backend, in `_build_dummy_attn_metadata`, and in `dummy_run` in mtp_proposer, significantly improving dsv3.2 performance in low-latency scenarios.

### Does this PR introduce _any_ user-facing change?
No.

### How was this patch tested?
Performance improvements were observed in E2E serving (P: DP4TP8EP32, D: DP8TP4EP32) with `num_speculative_tokens=3`.

DSV3.2-W8A8-EXP: TPOT: 41.67ms -> 23.36ms; ITL: 85.93ms -> 55.96ms
DSV3.2-W8A8 (released in December): TPOT: 18.11ms; ITL: 56.13ms

- vLLM version: v0.12.0
- vLLM main: vllm-project/vllm@ad32e3e

Signed-off-by: linfeng-yuan <[email protected]>
1 parent dd622aa commit 490ddf5

3 files changed: +16 −7 lines

vllm_ascend/attention/sfa_v1.py

Lines changed: 3 additions & 3 deletions

@@ -170,9 +170,9 @@ def build(
         input_positions = common_attn_metadata.positions[:
                                                          num_input_tokens].long(
                                                          )
-        query_start_loc = common_attn_metadata.query_start_loc
-        query_lens = query_start_loc[1:] - query_start_loc[:-1]
-        has_prefill = any(query_lens > self.decode_threshold)
+        query_start_loc_cpu = common_attn_metadata.query_start_loc_cpu
+        query_lens_cpu = query_start_loc_cpu[1:] - query_start_loc_cpu[:-1]
+        has_prefill = any(query_lens_cpu > self.decode_threshold)

         if self.cos_cache is None:
             self.cos_cache = model.model.layers[
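The device-to-host cost this hunk removes is easy to reproduce: branching in Python on a comparison over a device tensor forces the result back to the host and stalls until prior device work finishes. Below is a minimal sketch contrasting the two patterns — standalone functions, not the vLLM-Ascend builder, with CUDA standing in for the NPU backend:

```python
import torch

def has_prefill_device(query_start_loc: torch.Tensor, threshold: int) -> bool:
    # query_start_loc lives on the accelerator: converting the comparison
    # result to a Python bool copies it back to the host, blocking until
    # all previously enqueued device work has completed.
    query_lens = query_start_loc[1:] - query_start_loc[:-1]
    return bool((query_lens > threshold).any())  # implicit D2H sync here

def has_prefill_host(query_start_loc_cpu: torch.Tensor, threshold: int) -> bool:
    # Same arithmetic on the CPU copy the scheduler already keeps around:
    # no device kernel is launched and nothing has to synchronize.
    query_lens_cpu = query_start_loc_cpu[1:] - query_start_loc_cpu[:-1]
    return bool((query_lens_cpu > threshold).any())
```

Because the prefill check now runs entirely on a host tensor, building the attention metadata no longer waits on the device stream.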

vllm_ascend/spec_decode/mtp_proposer.py

Lines changed: 11 additions & 3 deletions

@@ -233,7 +233,13 @@ def dummy_run(self,
             num_tokens_across_dp,
             with_prefill,
         ) = self.runner._sync_metadata_across_dp(num_tokens, with_prefill)
-
+        if self.use_async_scheduling:
+            # there is synchronization between mtp steps when enabling aclgraph,
+            # disable aclgraph when use async scheduling to avoid the
+            # synchronization overhead.
+            # NOTE: we need to set aclgraph_runtime_mode to None in both dummy_run
+            # and _propose.
+            aclgraph_runtime_mode = CUDAGraphMode.NONE
         moe_comm_type = self.runner._select_moe_comm_method(num_tokens)
         # TODO: remove this after moe_comm_type selection logic is finalized
         moe_comm_type = (MoECommType.ALLTOALL if moe_comm_type

@@ -742,9 +748,11 @@ def _propose(
         aclgraph_runtime_mode, batch_descriptor = \
             self.runner.aclgraph_dispatcher.dispatch(num_tokens=num_input_tokens, uniform_decode=uniform_decode, has_lora=has_lora)
         if self.use_async_scheduling:
-            # there is synchronize between mtp steps when enable aclgraph,
+            # there is synchronization between mtp steps when enabling aclgraph,
             # disable aclgraph when use async scheduling to avoid the
-            # synchronize overhead.
+            # synchronization overhead.
+            # NOTE: we need to set aclgraph_runtime_mode to None in both dummy_run
+            # and _propose.
             aclgraph_runtime_mode = CUDAGraphMode.NONE

         if self.vllm_config.compilation_config.cudagraph_mode.has_full_cudagraphs(
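The guard itself is a one-liner; the subtlety the NOTE calls out is that it must be applied in both `dummy_run` and `_propose`, so that graph capture and replay agree on the runtime mode. A hedged sketch of that decision follows — the stand-in enum and free function are illustrative, not the actual vLLM `CUDAGraphMode` or proposer API:

```python
from enum import Enum

class CUDAGraphMode(Enum):  # stand-in for the real vLLM enum
    NONE = 0
    PIECEWISE = 1
    FULL = 2

def resolve_aclgraph_mode(dispatched_mode: CUDAGraphMode,
                          use_async_scheduling: bool) -> CUDAGraphMode:
    # Replaying a captured graph between MTP steps introduces a
    # synchronization point that defeats async scheduling, so the
    # proposer falls back to eager execution whenever async
    # scheduling is enabled; otherwise the dispatcher's choice stands.
    if use_async_scheduling:
        return CUDAGraphMode.NONE
    return dispatched_mode
```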

vllm_ascend/worker/model_runner_v1.py

Lines changed: 2 additions & 1 deletion

@@ -2923,9 +2923,10 @@ def _build_dummy_attn_metadata(
         cu_num_tokens, arange = self._get_cumsum_and_arange(
             num_scheduled_tokens)

-        self.query_start_loc[1:num_reqs + 1] = torch.Tensor(cu_num_tokens)
         self.query_start_loc_cpu[1:num_reqs +
                                  1] = torch.Tensor(cu_num_tokens)
+        self.query_start_loc = self.query_start_loc_cpu.pin_memory().to(
+            self.device, non_blocking=True)
         self.query_lens = torch.from_numpy(num_scheduled_tokens)
         self.attn_mask = self.attn_mask_builder.get_splitfuse_attn_mask()
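The deleted line wrote `cu_num_tokens` straight into the device-resident `query_start_loc`, a synchronizing host-to-device assignment; the new code stages everything on the CPU first and then issues a single non-blocking copy from pinned memory. A self-contained sketch of that pattern with toy values — the buffer size, CUDA device, and CPU fallback are assumptions, not runner code:

```python
import torch

cu_num_tokens = torch.tensor([3, 7, 12, 20], dtype=torch.int64)  # toy data
num_reqs = cu_num_tokens.numel()

# Host-side staging buffer; index 0 stays 0 by convention.
query_start_loc_cpu = torch.zeros(num_reqs + 1, dtype=torch.int64)
query_start_loc_cpu[1:num_reqs + 1] = cu_num_tokens  # pure CPU work

if torch.cuda.is_available():  # vllm-ascend targets NPUs; CUDA shown here
    # non_blocking=True is only truly asynchronous from pinned host memory,
    # so pin first, then enqueue one H2D copy without blocking the host.
    query_start_loc = query_start_loc_cpu.pin_memory().to(
        "cuda", non_blocking=True)
else:
    query_start_loc = query_start_loc_cpu.clone()  # CPU fallback for the sketch
```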
