
Commit eccbdde

minor: canonicalize TFLOPS calculation (#2069)
## 📌 Description

## 🔍 Related Issues

## 🚀 Pull Request Checklist

Thank you for contributing to FlashInfer! Before we review your pull request, please make sure the following items are complete.

### ✅ Pre-commit Checks

- [ ] I have installed `pre-commit` by running `pip install pre-commit` (or used your preferred method).
- [ ] I have installed the hooks with `pre-commit install`.
- [ ] I have run the hooks manually with `pre-commit run --all-files` and fixed any reported issues.

> If you are unsure about how to set up `pre-commit`, see [the pre-commit documentation](https://pre-commit.com/).

## 🧪 Tests

- [ ] Tests have been added or updated as needed.
- [ ] All tests are passing (`unittest`, etc.).

## Reviewer Notes

## Summary by CodeRabbit

* **Refactor**
  * TFLOPS computation standardized across attention benchmarks so reported performance metrics consistently account for actual sequence and batch lengths.
* **Bug Fixes**
  * Added checks to prevent invalid mixed-length causal inputs, avoiding misleading benchmark results.
* **Chores**
  * Renamed timing parameter in the benchmark utility for clearer intent.
1 parent 11177e8 commit eccbdde

File tree

5 files changed: +87 −39 lines changed

- benchmarks/bench_blackwell_attention.py
- benchmarks/bench_block_sparse_attention.py
- benchmarks/bench_hopper_attention.py
- benchmarks/bench_hopper_fp8_attention.py
- flashinfer/testing/utils.py
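
Background for the diffs below: each benchmark previously computed throughput with an inline formula such as `batch_size * seq_len * seq_len * num_heads * head_dim * 4 / ms / 1e9`, and this commit routes them all through `attention_tflops_per_sec_with_actual_seq_lens` instead. Both follow the usual two-GEMM accounting for attention (one GEMM for QKᵀ, one for PV, two FLOPs per multiply-accumulate). The expression below is a rough sketch of that accounting, not the helper's exact implementation, with $B$ the batch size, $H$ the number of query heads, $q_i$ and $k_i$ the per-sequence query and key/value lengths, $d_{qk}$ and $d_{vo}$ the head dimensions, and $\text{ms}$ the measured time in milliseconds:

$$
\text{TFLOPs/s} \;\approx\; \frac{2\,H\sum_{i=1}^{B} q_i\,k_i\,(d_{qk}+d_{vo})}{\text{ms}\times 10^{9}},
$$

with the numerator roughly halved when a causal mask is applied. For a uniform batch (every $q_i = k_i = L$ and $d_{qk} = d_{vo} = d$) this reduces to the old inline expression, so for these benchmarks the change canonicalizes how the number is computed rather than what it measures.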

benchmarks/bench_blackwell_attention.py

Lines changed: 14 additions & 8 deletions
@@ -18,7 +18,10 @@
 import torch
 
 import flashinfer
-from flashinfer.testing.utils import bench_gpu_time
+from flashinfer.testing.utils import (
+    bench_gpu_time,
+    attention_tflops_per_sec_with_actual_seq_lens,
+)
 
 
 def bench_fmha_blackwell(
@@ -69,14 +72,17 @@ def bench_fmha_blackwell(
     )
     ms = np.median(measurements)
 
-    def flops(ms):
-        if causal:
-            return batch_size * qkv_len * qkv_len * num_heads * head_dim * 2 / ms / 1e9
-        else:
-            return batch_size * qkv_len * qkv_len * num_heads * head_dim * 4 / ms / 1e9
-
+    TFLOPS = attention_tflops_per_sec_with_actual_seq_lens(
+        torch.full((batch_size,), qkv_len),
+        torch.full((batch_size,), qkv_len),
+        head_dim,
+        head_dim,
+        num_heads,
+        causal,
+        ms,
+    )
     print(
-        f"bench_fmha_blackwell (batch_size={batch_size}, qkv_len={qkv_len}, num_heads={num_heads}, head_dim={head_dim}, causal={causal}), flops: {flops(ms):.3f} TFLOPs/s"
+        f"bench_fmha_blackwell (batch_size={batch_size}, qkv_len={qkv_len}, num_heads={num_heads}, head_dim={head_dim}, causal={causal}), flops: {TFLOPS:.3f} TFLOPs/s"
     )
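
As a sanity check on this substitution, the old inline formula and the canonical helper should report the same number whenever every sequence in the batch has the same length. A minimal sketch, using made-up benchmark parameters and the positional argument order shown in the diff above:

```python
import torch

from flashinfer.testing.utils import attention_tflops_per_sec_with_actual_seq_lens

# Hypothetical benchmark parameters, chosen only for illustration.
batch_size, qkv_len, num_heads, head_dim = 16, 4096, 32, 128
ms = 1.0  # measured kernel time in milliseconds

# Old inline accounting used by this benchmark (non-causal case).
old_tflops = batch_size * qkv_len * qkv_len * num_heads * head_dim * 4 / ms / 1e9

# Canonical helper, called the same way as in the diff: uniform per-sequence
# lengths for q and kv, identical qk/vo head dims, non-causal.
new_tflops = attention_tflops_per_sec_with_actual_seq_lens(
    torch.full((batch_size,), qkv_len),  # actual query lengths
    torch.full((batch_size,), qkv_len),  # actual key/value lengths
    head_dim,                            # qk head dim
    head_dim,                            # vo head dim
    num_heads,                           # number of query heads
    False,                               # causal
    ms,                                  # execution time in milliseconds
)

# If the helper follows the two-GEMM accounting sketched earlier, the two
# values agree for uniform lengths; with causal=True the helper applies the
# masked (roughly halved) FLOP count instead.
print(f"old: {old_tflops:.3f} TFLOPs/s, canonical: {new_tflops:.3f} TFLOPs/s")
```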

benchmarks/bench_block_sparse_attention.py

Lines changed: 13 additions & 2 deletions
@@ -18,7 +18,10 @@
 import torch
 
 import flashinfer
-from flashinfer.testing.utils import bench_gpu_time
+from flashinfer.testing.utils import (
+    bench_gpu_time,
+    attention_tflops_per_sec_with_actual_seq_lens,
+)
 
 
 def bench_variable_block_sparse_attention(
@@ -120,7 +123,15 @@ def bench_variable_block_sparse_attention(
     )
 
     def flops(ms):
-        return seq_len * seq_len * num_qo_heads * head_dim * 4 / ms / 1e9
+        return attention_tflops_per_sec_with_actual_seq_lens(
+            torch.tensor([seq_len]),
+            torch.tensor([seq_len]),
+            head_dim,
+            head_dim,
+            num_qo_heads,
+            False,
+            ms,
+        )
 
     print(
         f"bench_variable_block_sparse_attention (num_qo_heads={num_qo_heads}, num_kv_heads={num_kv_heads}, head_dim={head_dim}, seq_len={seq_len}, num_blocks_row={num_blocks_row}, num_blocks_col={num_blocks_col}, block_density={block_density}), sparse fa2-template: {flops(sparse_ms_fa2):.3f} TFLOPs/s, sparse fa3-template: {flops(sparse_ms_fa3):.3f} TFLOPs/s, dense fa2-template: {flops(dense_sm80_ms):.3f} TFLOPs/s, dense fa3-template: {flops(dense_sm90_ms):.3f} TFLOPs/s"

benchmarks/bench_hopper_attention.py

Lines changed: 31 additions & 21 deletions
@@ -18,7 +18,10 @@
 import torch
 
 import flashinfer
-from flashinfer.testing.utils import bench_gpu_time
+from flashinfer.testing.utils import (
+    bench_gpu_time,
+    attention_tflops_per_sec_with_actual_seq_lens,
+)
 
 
 def bench_single_prefill(seq_len, num_heads, causal, head_dim):
@@ -41,10 +44,15 @@ def bench_single_prefill(seq_len, num_heads, causal, head_dim):
     )
 
     def flops(ms):
-        if causal:
-            return seq_len * seq_len * num_qo_heads * head_dim * 2 / ms / 1e9
-        else:
-            return seq_len * seq_len * num_qo_heads * head_dim * 4 / ms / 1e9
+        return attention_tflops_per_sec_with_actual_seq_lens(
+            torch.tensor([seq_len]),
+            torch.tensor([seq_len]),
+            head_dim,
+            head_dim,
+            num_qo_heads,
+            causal,
+            ms,
+        )
 
     print(
         f"bench_single_prefill (seq_len={seq_len}, num_heads={num_heads}, causal={causal}, head_dim={head_dim}), fa2-template: {flops(sm80_ms):.3f} TFLOPs/s, fa3-template: {flops(sm90_ms):.3f} TFLOPs/s"
@@ -97,14 +105,15 @@ def bench_batch_ragged_prefill(batch_size, num_heads, seq_len, causal, head_dim)
     )
 
     def flops(ms):
-        if causal:
-            return (
-                batch_size * seq_len * seq_len * num_qo_heads * head_dim * 2 / ms / 1e9
-            )
-        else:
-            return (
-                batch_size * seq_len * seq_len * num_qo_heads * head_dim * 4 / ms / 1e9
-            )
+        return attention_tflops_per_sec_with_actual_seq_lens(
+            torch.full((batch_size,), seq_len),
+            torch.full((batch_size,), seq_len),
+            head_dim,
+            head_dim,
+            num_qo_heads,
+            causal,
+            ms,
+        )
 
     print(
         f"bench_batch_ragged_prefill (batch_size={batch_size}, num_heads={num_heads}, seq_len={seq_len}, causal={causal}, head_dim={head_dim}), fa2-template: {flops(sm80_ms):.3f} TFLOPs/s, fa3-template: {flops(sm90_ms):.3f} TFLOPs/s"
@@ -176,14 +185,15 @@ def bench_batch_paged_prefill(
     )
 
     def flops(ms):
-        if causal:
-            return (
-                batch_size * seq_len * seq_len * num_qo_heads * head_dim * 2 / ms / 1e9
-            )
-        else:
-            return (
-                batch_size * seq_len * seq_len * num_qo_heads * head_dim * 4 / ms / 1e9
-            )
+        return attention_tflops_per_sec_with_actual_seq_lens(
+            torch.full((batch_size,), seq_len),
+            torch.full((batch_size,), seq_len),
+            head_dim,
+            head_dim,
+            num_qo_heads,
+            causal,
+            ms,
+        )
 
     print(
         f"bench_batch_paged_prefill (page_size={page_size} batch_size={batch_size}, num_heads={num_heads}, seq_len={seq_len}, causal={causal}, head_dim={head_dim}), fa2-template: {flops(sm80_ms):.3f} TFLOPs/s, fa3-template: {flops(sm90_ms):.3f} TFLOPs/s"

benchmarks/bench_hopper_fp8_attention.py

Lines changed: 13 additions & 5 deletions
@@ -2,7 +2,10 @@
 import torch
 
 import flashinfer
-from flashinfer.testing.utils import bench_gpu_time
+from flashinfer.testing.utils import (
+    bench_gpu_time,
+    attention_tflops_per_sec_with_actual_seq_lens,
+)
 
 
 def bench_single_prefill(seq_len, num_heads, causal, head_dim):
@@ -45,10 +48,15 @@ def bench_single_prefill(seq_len, num_heads, causal, head_dim):
     )
 
     def flops(ms):
-        if causal:
-            return seq_len * seq_len * num_qo_heads * head_dim * 2 / ms / 1e9
-        else:
-            return seq_len * seq_len * num_qo_heads * head_dim * 4 / ms / 1e9
+        return attention_tflops_per_sec_with_actual_seq_lens(
+            torch.tensor([seq_len]),
+            torch.tensor([seq_len]),
+            head_dim,
+            head_dim,
+            num_qo_heads,
+            causal,
+            ms,
+        )
 
     print(
         f"bench_single_prefill (seq_len={seq_len}, num_heads={num_heads}, causal={causal}, head_dim={head_dim}), fa2-template: {flops(sm80_ms):.3f} TFLOPs/s, fa3-template: {flops(sm90_ms):.3f} TFLOPs/s, fa3-fp8: {flops(fp8_sm90_ms):.3f} TFLOPs/s"

flashinfer/testing/utils.py

Lines changed: 16 additions & 3 deletions
@@ -277,6 +277,12 @@ def attention_flops(
     Returns:
         total_flops (int): Total FLOPs for the layer.
     """
+    # Causal attention requires kv_len >= q_len
+    if qo_seqlen > kv_seqlen:
+        raise ValueError(
+            "qo_seqlen must be less than or equal to kv_seqlen for causal attention"
+        )
+
     if causal:
         bmm1_flops = (
             batch_size
@@ -323,6 +329,13 @@ def attention_flops_with_actual_seq_lens(
     Returns:
         total_flops (int): Total FLOPs for the layer.
     """
+    # Causal attention requires kv_len >= q_len
+    # Otherwise right align if kv_len > q_len
+    if causal and (actual_seq_lens_q > actual_seq_lens_kv).any():
+        raise ValueError(
+            "actual_seq_lens_q must be less than or equal to actual_seq_lens_kv for causal attention"
+        )
+
     if causal:
         bmm1_flops = (
             torch.dot(
@@ -412,7 +425,7 @@ def attention_tflops_per_sec_with_actual_seq_lens(
     head_dim_vo,
     num_qo_heads,
     causal,
-    time,
+    ms,
 ):
     """
     Calculate TFLOPS per second for a given attention layer with actual sequence lengths.
@@ -425,7 +438,7 @@ def attention_tflops_per_sec_with_actual_seq_lens(
         head_dim_vo (int): Head dimension of the value.
         num_qo_heads (int): Number of query heads.
         causal (bool): Whether to use causal masking.
-        time (float): Execution time in milliseconds.
+        ms (float): Execution time in milliseconds.
 
     Returns:
         tflops_per_sec (float): TFLOPS per second for the layer.
@@ -438,7 +451,7 @@ def attention_tflops_per_sec_with_actual_seq_lens(
         num_qo_heads,
         causal,
     )
-    return f.item() / time / 1e9 if not math.isnan(time) else 0.0
+    return f.item() / ms / 1e9 if not math.isnan(ms) else 0.0
 
 
 def attention_tb_per_sec(
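
The new validation in `attention_flops_with_actual_seq_lens` rejects causal inputs where a query sequence is longer than its key/value sequence, and the TFLOPS wrapper computes its FLOP count through that function. A sketch of what that looks like in use, with hypothetical lengths and assuming the check propagates through the wrapper as the diff suggests:

```python
import torch

from flashinfer.testing.utils import attention_tflops_per_sec_with_actual_seq_lens

# Hypothetical mixed-length batch: the second query sequence (1024) is longer
# than its kv sequence (768), which is invalid under causal masking.
actual_seq_lens_q = torch.tensor([512, 1024])
actual_seq_lens_kv = torch.tensor([512, 768])

try:
    attention_tflops_per_sec_with_actual_seq_lens(
        actual_seq_lens_q,
        actual_seq_lens_kv,
        128,   # qk head dim
        128,   # vo head dim
        32,    # number of query heads
        True,  # causal
        1.0,   # execution time in milliseconds
    )
except ValueError as err:
    # Expected: the q lengths must not exceed the kv lengths under causal masking,
    # so the call raises instead of reporting a misleading TFLOPS number.
    print(err)
```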
