Fix the rest of the issues

vshampor · vshampor · commit 9fa3c12f93de · 2025-10-16T18:03:26.000+02:00
diff --git a/src/core/reference/include/openvino/reference/xattention.hpp b/src/core/reference/include/openvino/reference/xattention.hpp
@@ -144,6 +144,11 @@ class XAttentionBlockSelector {
         }
     }
 
+    /** Applies the softmax causal mask in-place: sets the non-causal elements along the last two dimensions of the
+     * rank-3 input tensor to -INFINITY.
+     * @param in_out_data Pointer to the softmax input values (logits).
+     * @param in_out_shape Input tensor shape, [num_heads, num_query_tokens / stride, num_key_tokens / stride].
+     */
     void apply_causal_mask_(T* in_out_data, const Shape& in_out_shape) {
         OPENVINO_ASSERT(in_out_shape.size() == 3);
         OPENVINO_ASSERT(in_out_shape[1] <= in_out_shape[2]);
@@ -153,7 +158,8 @@ class XAttentionBlockSelector {
             size_t head_offset = head_idx * in_out_shape[1] * in_out_shape[2];
             for (size_t query_dim_idx = 0; query_dim_idx < in_out_shape[1]; query_dim_idx++) {
                 size_t query_dim_offset = query_dim_idx * in_out_shape[2];
-                for (size_t key_dim_idx = key_dim - query_dim + query_dim_idx + 1; key_dim_idx < key_dim; key_dim_idx++) {
+                for (size_t key_dim_idx = key_dim - query_dim + query_dim_idx + 1; key_dim_idx < key_dim;
+                     key_dim_idx++) {
                     in_out_data[head_offset + query_dim_offset + key_dim_idx] = -INFINITY;
                 }
             }
@@ -222,11 +228,11 @@ class XAttentionBlockSelector {
     }
 
     /** Selects the elements of the input tensor along the last dimension, independently along the first two dimensions,
-     * so that the elements constitute a smallest subset amounting to a sum portion no less than `threshold` of the
-     * element sum. The last two dimensions are treated as the query-block and key-block dimensions in the context
-     * of attention matrix scores, and the first-in-row, the "diagonal" and "non-causal" elements are
-     * disregarded when calculating the sum. "Non-causal" elements are never preserved, while "diagonal" and
-     * first-in-row elements are always preserved.
+     * so that the selected elements constitute a smallest subset amounting to a sum portion no less than `threshold`
+     * of the total "causal" element sum. "Causal" is understood in the sense of the last two dimensions being
+     * treated as the query-block and key-block dimensions in the context of attention matrix scores. Only the
+     * "non-causal" elements are disregarded when calculating the sum; the first-in-row and "diagonal" elements count
+     * towards it. "Non-causal" elements are never preserved, while "diagonal" and first-in-row elements always are.
      * @param blocked_scores_data Pointer to the blocked score input.
      * @param blocked_attention_scores_shape Shape of the blocked score input tensor. Expected shape is [num_heads,
      * num_query_tokens / block_size, num_key_tokens / block_size]
@@ -256,27 +262,30 @@ class XAttentionBlockSelector {
             for (size_t q_block_idx = 0; q_block_idx < blocked_attention_scores_shape[1]; q_block_idx++) {
                 std::priority_queue<IndexAndScore> indices_and_scores_queue;
                 double total_sum = 0.0;
+                double cumsum = 0.0;
                 for (size_t k_block_idx = 0; k_block_idx < blocked_attention_scores_shape[2]; k_block_idx++) {
                     if (k_block_idx >
                         (blocked_attention_scores_shape[2] - blocked_attention_scores_shape[1] + q_block_idx)) {
                         // Disregard non-causal blocks entirely
                         continue;
                     }
+                    size_t target_offset = head_offset + blocked_attention_scores_shape[2] * q_block_idx + k_block_idx;
+                    T current_score = *(blocked_attention_scores_data + target_offset);
+                    total_sum += current_score;
+
                     if ((k_block_idx ==
                          (blocked_attention_scores_shape[2] - blocked_attention_scores_shape[1] + q_block_idx)) ||
                         k_block_idx == 0) {
-                        // We preserve first-in-row and diagonal blocks always, and do not include their score in the
-                        // cumulative sum, i.e. we only preserve the fraction of the non-diagonal blocks' attention mass
+                        // We always preserve the first-in-row and diagonal blocks, and include their scores in the
+                        // cumulative sum. The target for the rest of the blocks in the row is to fill up the
+                        // rest of the attention mass fraction so that, together with the diagonal and first blocks,
+                        // they comprise the `threshold` portion of the entire causal attention mass in this row.
                         retval[head_idx].insert({q_block_idx, k_block_idx});
+                        cumsum += current_score;
                     } else {
-                        size_t target_offset =
-                            head_offset + blocked_attention_scores_shape[2] * q_block_idx + k_block_idx;
-                        T current_score = *(blocked_attention_scores_data + target_offset);
-                        total_sum += current_score;
                         indices_and_scores_queue.push({{q_block_idx, k_block_idx}, current_score});
                     }
                 }
-                double cumsum = 0.0;
                 double required_sum = m_threshold * total_sum;
                 while (cumsum < required_sum && !indices_and_scores_queue.empty()) {
                     auto index_and_largest_score = indices_and_scores_queue.top();
diff --git a/src/core/tests/reference/xattention.cpp b/src/core/tests/reference/xattention.cpp
@@ -639,10 +639,10 @@ std::vector<double> E2E_Q_DATA_8 = {
 ov::Shape E2E_K_SHAPE_8 = {2, 8, 2};
 std::vector<double> E2E_K_DATA_8 = {
     // clang-format off
-    -1.2870, -1.2179,  0.0316,  0.0080, -0.6171,  1.0622,  0.3085, -0.7751,
-    -1.3612,  0.9485, -0.0803,  0.5752,  0.1925, -0.1113,  1.4693,  0.0673,
-     0.7422,  0.7149, -1.7684, -0.0651, -0.1925, -1.4169,  1.0030, -0.8091,
-    -0.7934,  0.5160, -0.2543,  0.1729, -0.0687, -1.4245,  0.0758,  1.1613
+     0.2980,  0.4959, -0.0834,  0.7015,  1.2516,  0.6656, -2.7873,  1.9731,
+    -0.4817,  1.1117, -0.8096, -0.5397, -1.0528,  0.2869, -1.1274,  1.4849,
+    -0.2468, -1.0449, -1.0085, -0.3389,  0.6750,  0.9095,  0.4674,  2.2321,
+     1.3183, -0.3513, -0.3717,  0.0176, -0.2545, -0.6729, -1.1547,  0.0279
     // clang-format on
 };
 
@@ -746,8 +746,89 @@ std::vector<E2EBlockSelectTestData> E2E_BLOCK_SELECT_TEST_CASES = {
             {{0, 0}, {0, 2}, {0, 4}, {1, 0}, {1, 1}, {1, 3}, {1, 5}, {2, 0}, {2, 1}, {2, 2}, {2, 3}, {2, 4}, {2, 6}, {3, 0}, {3, 1}, {3, 4}, {3, 5}, {3, 6}, {3, 7}}
         }
         // clang-format on
+    },
+    {
+        E2E_Q_SHAPE_8,
+        E2E_Q_DATA_8,
+        E2E_K_SHAPE_16,
+        E2E_K_DATA_16,
+        /* threshold = */ 0.45,
+        /* block_size = */ 2,
+        /* stride = */ 2,
+
+        // clang-format off
+        {
+            {{0, 0}, {0, 4}, {1, 0}, {1, 5}, {2, 0}, {2, 1}, {2, 3}, {2, 6}, {3, 0}, {3, 2}, {3, 5}, {3, 7}},
+            {{0, 0}, {0, 2}, {0, 4}, {1, 0}, {1, 5}, {2, 0}, {2, 4}, {2, 6}, {3, 0}, {3, 5}, {3, 7}}
+        }
+        // clang-format on
+    },
+    {
+        E2E_Q_SHAPE_8,
+        E2E_Q_DATA_8,
+        E2E_K_SHAPE_16,
+        E2E_K_DATA_16,
+        /* threshold = */ 0.45,
+        /* block_size = */ 4,
+        /* stride = */ 2,
+
+        // clang-format off
+        {
+            {{0, 0}, {0, 2}, {1, 0}, {1, 1}, {1, 3}},
+            {{0, 0}, {0, 2}, {1, 0}, {1, 3}}
+        }
+        // clang-format on
+    },
+    {
+        E2E_Q_SHAPE_8,
+        E2E_Q_DATA_8,
+        E2E_K_SHAPE_16,
+        E2E_K_DATA_16,
+        /* threshold = */ 0.45,
+        /* block_size = */ 4,
+        /* stride = */ 4,
+
+        // clang-format off
+        {
+            {{0, 0}, {0, 2}, {1, 0}, {1, 3}},
+            {{0, 0}, {0, 2}, {1, 0}, {1, 3}}
+        }
+        // clang-format on
+    },
+    {
+        E2E_Q_SHAPE_8,
+        E2E_Q_DATA_8,
+        E2E_K_SHAPE_8,
+        E2E_K_DATA_8,
+        /* threshold = */ 0.5,
+        /* block_size = */ 2,
+        /* stride = */ 2,
+
+        // clang-format off
+        {
+            {{0, 0}, {1, 0}, {1, 1}, {2, 0}, {2, 1}, {2, 2}, {3, 0}, {3, 1}, {3, 3}},
+            {{0, 0}, {1, 0}, {1, 1}, {2, 0}, {2, 2}, {3, 0}, {3, 3}}
+        }
+        // clang-format on
+    },
+    {
+        E2E_Q_SHAPE_8,
+        E2E_Q_DATA_8,
+        E2E_K_SHAPE_8,
+        E2E_K_DATA_8,
+        /* threshold = */ 0.2,
+        /* block_size = */ 2,
+        /* stride = */ 2,
+
+        // clang-format off
+        {
+            {{0, 0}, {1, 0}, {1, 1}, {2, 0}, {2, 2}, {3, 0}, {3, 3}},
+            {{0, 0}, {1, 0}, {1, 1}, {2, 0}, {2, 2}, {3, 0}, {3, 3}}
+        }
+        // clang-format on
     }};
 
+
 TEST_P(XAttentionE2EBlockSelectTest, SelectsBlocksCorrectlyFromQKData) {
     auto test_struct = GetParam();
     ov::reference::XAttentionBlockSelector<double> selector(test_struct.threshold,
@@ -762,8 +843,8 @@ TEST_P(XAttentionE2EBlockSelectTest, SelectsBlocksCorrectlyFromQKData) {
     ASSERT_EQ(test_result.size(), test_struct.ref_retained_block_indices.size());
     EXPECT_EQ(test_result, test_struct.ref_retained_block_indices);
     for (size_t head_idx = 0; head_idx < test_result.size(); head_idx++) {
-        std::cout << "Head " << head_idx << std::endl;
         if (test_result != test_struct.ref_retained_block_indices) {
+            std::cout << "Head " << head_idx << std::endl;
             const auto& ref_set = test_struct.ref_retained_block_indices[head_idx];
             const auto& test_set = test_result[head_idx];
             std::cout << "ref has " << ref_set.size() << " elements, test has " << test_set.size() << std::endl;