[CPU] Support RoPE for GLM4

zhangYiIntel · zhangYiIntel · commit 0e8211add714 · 2025-11-05T09:26:27.000+08:00
Signed-off-by: Zhang Yi <yi3.zhang@intel.com>
diff --git a/src/common/transformations/src/transformations/common_optimizations/fuse_rotary_positional_embeddings.cpp b/src/common/transformations/src/transformations/common_optimizations/fuse_rotary_positional_embeddings.cpp
@@ -783,7 +783,11 @@ ov::pass::RoPEFusionChatGLMHF::RoPEFusionChatGLMHF() {
     auto reshape = pattern::wrap_type<v1::Reshape>({qk_linear, pattern::any_input()},
                                                    pattern::shape_matches("[?, head_cnt, 1, head_size]"),
                                                    {{"special_zero", false}});
-    auto slice_1 = NewGenSlice(reshape, 0, "ndims", 1, 3);
+
+    auto qkv_proj =
+        pattern::wrap_type<v1::VariadicSplit>({reshape, 3, {"ndims", "ndims"}});
+    qkv_proj->set_output_size(2);
+    auto slice_1 = NewGenSlice(reshape, 0, "ndims", 1, 3) | qkv_proj->output(0);
 
     auto const_idx =
         pattern::wrap_type<ov::opset1::Constant>(pattern::type_matches(ov::element::i32) && const_idx_predicate);
@@ -807,7 +811,7 @@ ov::pass::RoPEFusionChatGLMHF::RoPEFusionChatGLMHF() {
     auto multiply_1 = pattern::wrap_type<v1::Multiply>({flatten, repeat_interleave_sin}, {{"auto_broadcast", "numpy"}});
     auto add = pattern::wrap_type<v1::Add>({multiply, multiply_1}, {{"auto_broadcast", "numpy"}});
 
-    auto slice_5 = NewGenSlice(reshape, "ndims", INT_MAX, 1, 3);
+    auto slice_5 = NewGenSlice(reshape, "ndims", INT_MAX, 1, 3) | qkv_proj->output(1);
     auto result = pattern::wrap_type<v0::Concat>({add, slice_5}, {{"axis", -1}});
 
     matcher_pass_callback callback = [=](ov::pass::pattern::Matcher& m) {
diff --git a/src/plugins/intel_cpu/src/nodes/rope.cpp b/src/plugins/intel_cpu/src/nodes/rope.cpp
@@ -252,15 +252,26 @@ struct RoPE::RoPEExecutorChatGLM : public RoPE::Executor {
         jcp.dst_prc = precision_of<T>::value;
         jcp.rotary_ndims = config.rotary_ndims;
         jcp.interleave = true;
-        jcp.mix_cos_sin = true;
+        // With a precomputed RoPE cache, cos and sin are mixed (interleaved) in a single input;
+        // otherwise the node receives separate cos and sin inputs.
+        jcp.mix_cos_sin = config.use_rope_cache;
         m_rotaryKernel = createJitKernel(jcp, true);
     }
 
     void execute([[maybe_unused]] const dnnl::stream& strm,
                  const std::vector<MemoryPtr>& inputs,
                  const std::vector<MemoryPtr>& outputs) override {
         ov::intel_cpu::PlainTensor t_src(inputs[0]);
-        ov::intel_cpu::PlainTensor t_cos_sin(inputs[1]);
+        ov::intel_cpu::PlainTensor t_cos;
+        ov::intel_cpu::PlainTensor t_sin;
+        ov::intel_cpu::PlainTensor t_cos_sin;
+        if (!m_config.use_rope_cache) {
+            t_cos.reset(inputs[1]);
+            t_sin.reset(inputs[2]);
+        } else {
+            t_cos_sin.reset(inputs[1]);
+        }
+
         ov::intel_cpu::PlainTensor t_dst(outputs[0]);
 
         // [seq_len, batch_size, (hidden_states_q + hidden_states_k + hidden_states_v)]
@@ -277,27 +288,45 @@ struct RoPE::RoPEExecutorChatGLM : public RoPE::Executor {
 
             auto rotary_dims = m_config.rotary_ndims;
 
-            parallel_for3d(batch_size, head_cnt, seq_len, [&](size_t b, size_t h, size_t p) {
-                // src [batch, length, H x S]
-                auto* src = t_src.ptr<T>(b, p, h * head_size);
-                // [batch_size, length, ndims//2, 2]
-                auto* cos_sin = &t_cos_sin.at<float>({b, p, 0, 0}, true);
-                auto* dst = t_dst.ptr<T>(b, h, p, 0);
-
-                if (m_rotaryKernel) {
-                    execJitKernel(m_rotaryKernel, src, dst, cos_sin, nullptr);
-                } else {
-                    size_t i = 0;
-                    for (; i < rotary_dims; i += 2) {
-                        auto cosv = cos_sin[i];
-                        auto sinv = cos_sin[i + 1];
-                        dst[i] = cosv * src[i] - sinv * src[i + 1];
-                        dst[i + 1] = sinv * src[i] + cosv * src[i + 1];
+            if (m_config.use_rope_cache) {
+                parallel_for3d(batch_size, head_cnt, seq_len, [&](size_t b, size_t h, size_t p) {
+                    // src [batch, length, H x S]
+                    auto* src = t_src.ptr<T>(b, p, h * head_size);
+                    // [batch_size, length, ndims//2, 2]
+                    auto* cos_sin = &t_cos_sin.at<float>({b, p, 0, 0}, true);
+                    auto* dst = t_dst.ptr<T>(b, h, p, 0);
+
+                    if (m_rotaryKernel) {
+                        execJitKernel(m_rotaryKernel, src, dst, cos_sin, nullptr);
+                    } else {
+                        size_t i = 0;
+                        for (; i < rotary_dims; i += 2) {
+                            auto cosv = cos_sin[i];
+                            auto sinv = cos_sin[i + 1];
+                            dst[i] = cosv * src[i] - sinv * src[i + 1];
+                            dst[i + 1] = sinv * src[i] + cosv * src[i + 1];
+                        }
                     }
-                }
 
-                memcpy(dst + rotary_dims, src + rotary_dims, (head_size - rotary_dims) * sizeof(T));
-            });
+                    memcpy(dst + rotary_dims, src + rotary_dims, (head_size - rotary_dims) * sizeof(T));
+                });
+            } else {
+                parallel_for3d(batch_size, head_cnt, seq_len, [&](size_t b, size_t h, size_t p) {
+                    auto* src = t_src.ptr<T>(b, p, h * head_size);
+                    auto* dst = t_dst.ptr<T>(b, h, p);
+                    const auto* cos = t_cos.ptr<float>(b, 0, 0);
+                    const auto* sin = t_sin.ptr<float>(b, 0, 0);
+                    if (m_rotaryKernel) {
+                        execJitKernel(m_rotaryKernel, src, dst, cos, sin);
+                    } else {
+                        for (size_t i = 0; i < rotary_dims; i += 2) {
+                            dst[i] = cos[i / 2] * src[i] - sin[i / 2] * src[i + 1];
+                            dst[i + 1] = sin[i / 2] * src[i] + cos[i / 2] * src[i + 1];
+                        }
+                    }
+                    memcpy(dst + rotary_dims, src + rotary_dims, (head_size - rotary_dims) * sizeof(T));
+                });
+            }
         } else {
             auto seq_len = t_src.size(0);
             auto batch_size = t_src.size(1);
diff --git a/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp b/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp
@@ -1094,7 +1094,6 @@ void Transformations::PostLpt() {
     CPU_REGISTER_PASS_X64(postLPTPassManager, ov::pass::RoPEFusion, true);
     CPU_REGISTER_PASS_ARM64(postLPTPassManager, ov::pass::RoPEFusion, true);
     CPU_DISABLE_PASS_COMMON(postLPTPassManager, ov::pass::RoPEFusionFlux);
-    CPU_DISABLE_PASS_COMMON(postLPTPassManager, ov::pass::RoPEFusionChatGLMHF);
     CPU_REGISTER_PASS_X64(postLPTPassManager, CausalMaskPreprocessFusion);
 
 #if defined(OPENVINO_ARCH_X86_64)
diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/subgraph_tests/rotary_pos_emb.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/subgraph_tests/rotary_pos_emb.cpp
@@ -79,5 +79,12 @@ INSTANTIATE_TEST_SUITE_P(smoke_RoPETestQwenVL,
                             ::testing::ValuesIn(vit_param)),
                          RoPETestQwenVL::getTestCaseName);
 
+INSTANTIATE_TEST_SUITE_P(smoke_RoPETestChatGLM,
+                         RoPETestChatGLMHF,
+                         ::testing::Combine(
+                            ::testing::Values(ov::element::f32),
+                            ::testing::Values(ov::test::utils::DEVICE_CPU)),
+                         RoPETestChatGLMHF::getTestCaseName);
+
 }  // namespace test
 }  // namespace ov