Commit b99524b

fix softmax_value (#179)
1 parent 0d77476 commit b99524b

2 files changed: +9 additions, -7 deletions

dlinfer/graph/dicp/vendor/AtbGraph/conversion.py

Lines changed: 6 additions & 1 deletion
@@ -443,6 +443,7 @@ def prefill_attention(
         max_kv_seq_len,
         block_size,
         mask,
+        softmax_scale,
         is_unpaged_prefill,
         kv_scales,
         kv_zeros,
@@ -454,7 +455,11 @@ def prefill_attention(
         # inplace1 = self.get_proxy(atb_op.Inplace, (fill_kv_cache, k_cache, 0))
         # inplace2 = self.get_proxy(atb_op.Inplace, (fill_kv_cache, v_cache, 1))
         mask = mask[0]
-        scale = 1.0 / math.sqrt(query.node.meta["val"].shape[-1])
+        scale = (
+            softmax_scale
+            if softmax_scale
+            else 1.0 / math.sqrt(query.node.meta["val"].shape[-1])
+        )
         if query.node.meta["val"].dtype != mask.node.meta["val"].dtype:
             mask = self.get_proxy(atb_op.Cast, (mask, query.node.meta["val"].dtype))
         if is_unpaged_prefill:
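As a standalone illustration of the fallback this hunk adds (not part of the dlinfer API; resolve_softmax_scale is a hypothetical helper name): the caller-supplied softmax_scale is used when provided, otherwise the conventional 1/sqrt(head_dim) attention scale is computed.

import math
from typing import Optional

def resolve_softmax_scale(head_dim: int, softmax_scale: Optional[float] = None) -> float:
    # Hypothetical helper mirroring the pattern in the diff above:
    # a falsy softmax_scale (e.g. None) falls back to 1/sqrt(head_dim).
    return softmax_scale if softmax_scale else 1.0 / math.sqrt(head_dim)

print(resolve_softmax_scale(128))        # ~0.0884 (default scale)
print(resolve_softmax_scale(128, 0.05))  # 0.05 (caller override)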

dlinfer/vendor/ascend/torch_npu_ops.py

Lines changed: 3 additions & 6 deletions
@@ -195,7 +195,7 @@ def paged_decode_attention(
     query = query.contiguous()
     attn_output = attn_output.contiguous()
     query = query.view(bs, 1, num_q_heads * dim)
-    scale_value = 1.0 / math.sqrt(dim)
+    scale_value = softmax_scale if softmax_scale else 1.0 / math.sqrt(dim)
 
     torch.ops.npu_ext.npu_incre_flash_attention_v4_out(
         query,
@@ -252,15 +252,12 @@ def paged_prefill_attention(
         raise RuntimeError(
             "paged_decode_attention does not " "support alibi_slopes yet"
         )
-    if softmax_scale is not None:
-        raise RuntimeError(
-            "paged_decode_attention does not " "support softmax_scale yet"
-        )
+
     if block_table.dtype != torch.int32:
         block_table = block_table.to(torch.int32)
 
     kv_seq_len_list = kv_seq_len.tolist()
-    scale_value = 1.0 / math.sqrt(query.shape[-1])
+    scale_value = softmax_scale if softmax_scale else 1.0 / math.sqrt(query.shape[-1])
     query = query.contiguous().view(query.shape[0], 1, -1)
     torch.ops.npu_ext.npu_incre_flash_attention_v4_out(
         query,
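For context on what scale_value feeds into, here is a minimal reference sketch of scaled-dot-product attention in plain PyTorch, assuming the same fallback rule; it only illustrates the math, while the actual kernels above go through torch.ops.npu_ext.npu_incre_flash_attention_v4_out on Ascend.

import math
import torch

def ref_attention(q, k, v, softmax_scale=None):
    # Same fallback rule as the diff: an explicit softmax_scale wins,
    # otherwise use 1/sqrt(head_dim).
    scale_value = softmax_scale if softmax_scale else 1.0 / math.sqrt(q.shape[-1])
    scores = torch.matmul(q, k.transpose(-1, -2)) * scale_value
    return torch.matmul(torch.softmax(scores, dim=-1), v)

q = torch.randn(1, 8, 4, 64)    # (batch, heads, q_len, head_dim)
k = torch.randn(1, 8, 16, 64)   # (batch, heads, kv_len, head_dim)
v = torch.randn(1, 8, 16, 64)
out = ref_attention(q, k, v, softmax_scale=0.1)  # overrides the 1/sqrt(64) default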
