
Commit dd092f7

Support pooling models
Signed-off-by: lianyibo <[email protected]>
1 parent 1705501 commit dd092f7

4 files changed: +109 additions, -24 deletions


vllm_ascend/attention/attention_mask.py

Lines changed: 8 additions & 4 deletions

@@ -15,7 +15,9 @@
 import torch
 
 
-def _generate_attn_mask(max_seq_len, dtype):
+def _generate_attn_mask(max_seq_len, dtype, tril):
+    if not tril:
+        return torch.zeros(size=(max_seq_len, max_seq_len)).to(dtype)
     # Construct lower triangle matrix.
     mask_flag = torch.tril(
         torch.ones((max_seq_len, max_seq_len),

@@ -40,12 +42,13 @@ def __init__(
         max_seq_len: int,
         dtype: torch.dtype,
         device: torch.device = None,
+        tril: bool = True,
     ):
         # NOTE: The device argument specifies the target NPU
         # to be used for the newly added FIA operator.
         # Only pass this parameter when using the new FIA operator.
-
-        attn_mask = _generate_attn_mask(max_seq_len, dtype)
+        self.tril = tril
+        attn_mask = _generate_attn_mask(max_seq_len, dtype, self.tril)
 
         self._seq_len_cached = attn_mask.shape[0]
         self.attn_mask_cache = attn_mask

@@ -103,6 +106,7 @@ def get_splitfuse_attn_mask(
     def _update_attn_cache(self, seqlen: int, dtype: torch.dtype):
         if seqlen > self._seq_len_cached:
             self._seq_len_cached = seqlen
-            self.attn_mask_cache = _generate_attn_mask(seqlen, dtype)
+            self.attn_mask_cache = _generate_attn_mask(seqlen, dtype,
+                                                       self.tril)
         if self.attn_mask_cache.dtype != dtype:
            self.attn_mask_cache = self.attn_mask_cache.to(dtype)
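For context, a minimal usage sketch of the new flag, assuming the builder is importable from vllm_ascend.attention.attention_mask and using illustrative sizes and dtypes; only the tril argument and the attn_mask_cache attribute come from the hunk above.

import torch

from vllm_ascend.attention.attention_mask import AttentionMaskBuilder

# Decoder (generative) models keep the default causal, lower-triangular mask.
causal_builder = AttentionMaskBuilder(max_seq_len=1024, dtype=torch.float16)

# Pooling / encoder-only models pass tril=False and get an all-zeros
# (bidirectional) mask, i.e. no position is masked out.
bidir_builder = AttentionMaskBuilder(max_seq_len=1024,
                                     dtype=torch.float16,
                                     tril=False)
assert torch.count_nonzero(bidir_builder.attn_mask_cache) == 0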

vllm_ascend/attention/attention_v1.py

Lines changed: 31 additions & 5 deletions

@@ -287,6 +287,27 @@ def __init__(
         self.key_cache = None
         self.value_cache = None
 
+    def _forward_encoder(
+        self,
+        query: torch.Tensor,
+        key: torch.Tensor,
+        value: torch.Tensor,
+        attn_metadata: AscendMetadata,
+        output: Optional[torch.Tensor] = None,
+        num_tokens=0,
+    ) -> torch.Tensor:
+        torch_npu._npu_flash_attention(query=query,
+                                       key=key,
+                                       value=value,
+                                       mask=attn_metadata.attn_mask,
+                                       seq_len=attn_metadata.seq_lens,
+                                       scale_value=self.scale,
+                                       num_heads=self.num_heads,
+                                       num_kv_heads=self.num_kv_heads,
+                                       out=output)
+        assert output is not None
+        return output[:num_tokens, :, :]
+
     def _forward_prefill_no_cache(
         self,
         query: torch.Tensor,

@@ -570,10 +591,11 @@ def forward(
         num_actual_tokens = attn_metadata.num_actual_tokens
         assert layer._k_scale_float == 1.0 and layer._v_scale_float == 1.0
         attn_type = self.attn_type
-        if attn_type != AttentionType.DECODER:
-            raise NotImplementedError("Encoder self-attention and "
-                                      "encoder/decoder cross-attention "
-                                      "are not implemented for "
+        if attn_type not in [
+                AttentionType.DECODER, AttentionType.ENCODER_ONLY
+        ]:
+            raise NotImplementedError("Encoder/Decoder cross-attention "
+                                      "is not implemented for "
                                       "PallasAttentionBackendImpl")
         # View q k v to BSH.
         query = query.view(-1, self.num_heads, self.head_size)

@@ -594,7 +616,11 @@ def forward(
                                            slot_indices=slots)
 
         # V0-Style scheduler situation.
-        if attn_metadata.attn_state == AscendAttentionState.PrefillNoCache:
+        if attn_type == AttentionType.ENCODER_ONLY:
+            output = self._forward_encoder(query, key, value,
+                                           attn_metadata, output,
+                                           num_tokens)
+        elif attn_metadata.attn_state == AscendAttentionState.PrefillNoCache:
             output = self._forward_prefill_no_cache(
                 query, key, value, attn_metadata, output, num_tokens)
         elif attn_metadata.attn_state == \
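For intuition, _forward_encoder computes full bidirectional attention over the scheduled prompt tokens (the all-zeros mask built above masks nothing out). A rough pure-PyTorch reference of that computation, written as a simplification that ignores the NPU kernel, the variable-length batching driven by seq_lens, and grouped KV heads:

import torch

def reference_encoder_attention(query: torch.Tensor, key: torch.Tensor,
                                value: torch.Tensor, scale: float) -> torch.Tensor:
    # query/key/value: [num_tokens, num_heads, head_size] for one sequence.
    q = query.transpose(0, 1)                      # [heads, tokens, head_size]
    k = key.transpose(0, 1)
    v = value.transpose(0, 1)
    scores = torch.matmul(q, k.transpose(-1, -2)) * scale
    probs = scores.softmax(dim=-1)                 # no causal mask applied
    return torch.matmul(probs, v).transpose(0, 1)  # [tokens, heads, head_size]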

vllm_ascend/platform.py

Lines changed: 2 additions & 1 deletion
@@ -145,7 +145,8 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
                 structured_outputs_config.backend == "auto" and \
                 not getattr(scheduler_config, "scheduler_delay_factor", 0) > 0 and \
                 not scheduler_config.send_delta_data and \
-                scheduler_config.policy == "fcfs":
+                scheduler_config.policy == "fcfs" and \
+                model_config.runner_type == "generate":
             ascend_scheduler_config.enabled = True
         chunked_prefill_enabled_in_ascend_scheduler = getattr(
             ascend_scheduler_config, "enable_chunked_prefill", False)

vllm_ascend/worker/model_runner_v1.py

Lines changed: 68 additions & 14 deletions
@@ -76,9 +76,11 @@
 from vllm.v1.cudagraph_dispatcher import CudagraphDispatcher
 # yapf conflicts with isort for this block
 # yapf: disable
-from vllm.v1.kv_cache_interface import (AttentionSpec, FullAttentionSpec,
-                                        KVCacheConfig, KVCacheGroupSpec,
-                                        KVCacheSpec, MambaSpec)
+from vllm.v1.kv_cache_interface import (AttentionSpec,
+                                        EncoderOnlyAttentionSpec,
+                                        FullAttentionSpec, KVCacheConfig,
+                                        KVCacheGroupSpec, KVCacheSpec,
+                                        MambaSpec)
 # yapf: enable
 from vllm.v1.outputs import (EMPTY_MODEL_RUNNER_OUTPUT, AsyncModelRunnerOutput,
                              DraftTokenIds, LogprobsTensors, ModelRunnerOutput)
@@ -324,13 +326,17 @@ def __init__(self, vllm_config: VllmConfig, device: torch.device):
             self.block_size,
             use_mla=self.model_config.use_mla,
         )
+        pooler_config = self.model_config.pooler_config
+        tril = self.model_config.runner_type == "generate" or (
+            pooler_config is not None
+            and pooler_config.pooling_type.lower() == "last")
         if torch.version.cann.startswith("8.3"):
             self.attn_mask_builder = AttentionMaskBuilder(
                 self.scheduler_config.max_num_batched_tokens, self.dtype,
-                self.device)
+                self.device, tril)
         else:
             self.attn_mask_builder = AttentionMaskBuilder(
-                self.model_config.max_model_len, self.dtype)
+                self.model_config.max_model_len, self.dtype, tril=tril)
 
         # Set up speculative decoding.
         self.spec_attn_mask = None
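The mask choice implied by this hunk: generative models, and pooling models whose pooler uses last-token pooling, keep the causal mask; other poolers (for example CLS or mean pooling, named here only as typical encoder-style cases) get the bidirectional all-zeros mask. A condensed sketch with an invented helper name:

def wants_causal_mask(runner_type: str, pooling_type: str | None) -> bool:
    # Mirrors the tril expression above: causal for generation and for
    # "last"-token pooling, which runs on decoder-style models.
    return runner_type == "generate" or (pooling_type is not None
                                         and pooling_type.lower() == "last")

assert wants_causal_mask("generate", None)
assert wants_causal_mask("pooling", "LAST")
assert not wants_causal_mask("pooling", "CLS")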
@@ -1487,14 +1493,29 @@ def _prepare_inputs(
         # in the same group share the same metadata.
         for kv_cache_group_id, kv_cache_group_spec in enumerate(
                 self.kv_cache_config.kv_cache_groups):
-            blk_table = self.input_batch.block_table[kv_cache_group_id]
-            blk_table_tensor = blk_table.get_device_tensor()
-            slot_mapping = blk_table.slot_mapping_cpu[:
-                                                      total_num_scheduled_tokens]
-            self.slot_mapping[:total_num_scheduled_tokens].copy_(
-                slot_mapping[:total_num_scheduled_tokens],
-                non_blocking=True,
-            )
+            if isinstance(kv_cache_group_spec.kv_cache_spec,
+                          EncoderOnlyAttentionSpec):
+                # Encoder-only layers do not have KV cache, so we need to
+                # create a dummy block table and slot mapping for them.
+                blk_table_tensor = torch.zeros(
+                    (num_reqs, 1),
+                    dtype=torch.int32,
+                    device=self.device,
+                )
+                slot_mapping = torch.zeros(
+                    (total_num_scheduled_tokens, ),
+                    dtype=torch.int64,
+                    device=self.device,
+                )
+            else:
+                blk_table = self.input_batch.block_table[kv_cache_group_id]
+                blk_table_tensor = blk_table.get_device_tensor()
+                slot_mapping = blk_table.slot_mapping_cpu[:
+                                                          total_num_scheduled_tokens]
+                self.slot_mapping[:total_num_scheduled_tokens].copy_(
+                    slot_mapping[:total_num_scheduled_tokens],
+                    non_blocking=True,
+                )
 
         # Make AscendCommonAttentionMetadata
         common_attn_metadata = AscendCommonAttentionMetadata(
@@ -1543,6 +1564,11 @@ def _prepare_inputs(
                     common_prefix_len=common_prefix_len,
                     common_attn_metadata=common_attn_metadata,
                     **extra_attn_metadata_args)
+            elif self.model_config.runner_type == "pooling":
+                attn_metadata_i = builder.build(
+                    common_prefix_len=common_prefix_len,
+                    common_attn_metadata=common_attn_metadata,
+                    **extra_attn_metadata_args)
             else:
                 attn_metadata_i = builder.build(
                     common_prefix_len=common_prefix_len,
@@ -2672,6 +2698,33 @@ def _convert_torch_format(self, tensor):
         tensor = torch_npu.npu_format_cast(tensor, ACL_FORMAT)
         return tensor
 
+    def may_add_encoder_only_layers_to_kv_cache_config(self) -> None:
+        """
+        Add encoder-only layers to the KV cache config.
+        """
+        block_size = self.vllm_config.cache_config.block_size
+        use_mla = self.vllm_config.model_config.use_mla
+        encoder_only_attn_specs: dict[AttentionSpec,
+                                      list[str]] = defaultdict(list)
+        attn_layers = get_layers_from_vllm_config(self.vllm_config, Attention)
+        for layer_name, attn_module in attn_layers.items():
+            if attn_module.attn_type == AttentionType.ENCODER_ONLY:
+                attn_spec: AttentionSpec = EncoderOnlyAttentionSpec(
+                    block_size=block_size,
+                    num_kv_heads=attn_module.num_kv_heads,
+                    head_size=attn_module.head_size,
+                    dtype=self.kv_cache_dtype,
+                    use_mla=use_mla)
+                encoder_only_attn_specs[attn_spec].append(layer_name)
+                self.runner_only_attn_layers.add(layer_name)
+        if len(encoder_only_attn_specs) > 0:
+            assert len(
+                encoder_only_attn_specs
+            ) == 1, "Only support one encoder-only attention spec now"
+            spec, layer_names = encoder_only_attn_specs.popitem()
+            self.kv_cache_config.kv_cache_groups.append(
+                KVCacheGroupSpec(layer_names=layer_names, kv_cache_spec=spec))
+
     def initialize_kv_cache(self, kv_cache_config: KVCacheConfig) -> None:
         """
         Initialize KV cache based on `kv_cache_config`.
@@ -2681,9 +2734,10 @@ def initialize_kv_cache(self, kv_cache_config: KVCacheConfig) -> None:
         """
         kv_cache_config = deepcopy(kv_cache_config)
         self.kv_cache_config = kv_cache_config
+        self.may_reinitialize_input_batch(kv_cache_config)
+        self.may_add_encoder_only_layers_to_kv_cache_config()
         self.initialize_attn_backend(kv_cache_config)
         self.use_hybrid_blocks = (len(self.attn_groups) > 1)
-        self.may_reinitialize_input_batch(kv_cache_config)
 
         if self.model_config.is_deepseek_mla:
             kv_caches = self.initialize_kv_cache_tensors_deepseek(
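Taken together with the previous hunk, KV-cache initialization now reinitializes the input batch first and registers encoder-only layers as a runner-only KV-cache group before the attention backends are set up. A condensed view of that ordering, with method bodies elided and names as in the hunks above:

def initialize_kv_cache(self, kv_cache_config):
    kv_cache_config = deepcopy(kv_cache_config)
    self.kv_cache_config = kv_cache_config
    self.may_reinitialize_input_batch(kv_cache_config)       # moved earlier
    self.may_add_encoder_only_layers_to_kv_cache_config()    # new step
    self.initialize_attn_backend(kv_cache_config)
    self.use_hybrid_blocks = (len(self.attn_groups) > 1)
    # ... remainder unchanged ...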
