Skip to content

Commit a47aa4d

Browse files
authored
[feat] apply flashcomm1 on bailing (#4868)
### What this PR does / why we need it?

This PR adjusts the layer-prefix matching rules for tensor parallelism (column/row parallel ops) to fit the Bailing model's naming conventions (adding "query_key_value" for column parallel and "attention.dense" for row parallel), enabling flashcomm1 to work properly on the Bailing model.

### Does this PR introduce _any_ user-facing change?

No

- vLLM version: v0.12.0
- vLLM main: vllm-project/vllm@ad32e3e

Signed-off-by: hwhaokun <[email protected]>
1 parent 2f965d8 commit a47aa4d

File tree

1 file changed

+19
-8
lines changed

1 file changed

+19
-8
lines changed

vllm_ascend/ops/linear_op.py

Lines changed: 19 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -610,12 +610,16 @@ def _get_column_parallel_op(
610610
if enable_sp():
611611
if "shared_expert" in prefix:
612612
return None
613-
if "gate_up_proj" in prefix:
614-
return SequenceColumnParallelOp(layer)
615-
if "in_proj" in prefix:
616-
return SequenceColumnParallelOp(layer)
617-
if "qkv_proj" in prefix or "conv1d" in prefix:
618-
return SequenceColumnParallelOp(layer)
613+
sp_column_prefix = [
614+
"gate_up_proj", # first MLP of most LLMs
615+
"in_proj", # gated deltanet of Qwen3 Next
616+
"qkv_proj", # qkv linear of most LLMs
617+
"conv1d", # gated deltanet of Qwen3 Next
618+
"query_key_value", # qkv linear of Bailing
619+
]
620+
for a_prefix in sp_column_prefix:
621+
if a_prefix in prefix:
622+
return SequenceColumnParallelOp(layer)
619623

620624
return None
621625

@@ -637,8 +641,15 @@ def _get_row_parallel_op(
637641
if enable_sp():
638642
if "shared_expert" in prefix:
639643
return None
640-
if "o_proj" in prefix or "out_proj" in prefix or "down_proj" in prefix:
641-
return SequenceRowParallelOp(layer)
644+
sp_row_prefixes = [
645+
"o_proj", # attn output linear of most LLMs
646+
"out_proj", # attn output linear of Qwen3 Next
647+
"down_proj", # second MLP of most LLMs
648+
"attention.dense", # attn output linear of Bailing
649+
]
650+
for a_prefix in sp_row_prefixes:
651+
if a_prefix in prefix:
652+
return SequenceRowParallelOp(layer)
642653

643654
return None
644655

0 commit comments

Comments
 (0)