src/cpp/include/openvino/genai/continuous_batching_pipeline.hpp
@@ -65,13 +65,18 @@ class OPENVINO_GENAI_EXPORTS ContinuousBatchingPipeline {
class ContinuousBatchingImpl;

class ContinuousBatchingForSpeculativeDecodingImpl;
class ContinuousBatchingForEagle3DecodingImpl;
class ContinuousBatchingForPromptLookupImpl;
class SpeculativeDecodingImpl;
class Eagle3DecodingImpl;
class PromptLookupImpl;

friend class ContinuousBatchingForSpeculativeDecodingImpl;

friend class ContinuousBatchingForPromptLookupImpl;
friend class ContinuousBatchingForEagle3DecodingImpl;
friend class SpeculativeDecodingImpl;
friend class Eagle3DecodingImpl;
friend class PromptLookupImpl;

std::shared_ptr<IContinuousBatchingPipeline> m_impl;
256 changes: 255 additions & 1 deletion src/cpp/src/continuous_batching/model_runner.hpp

Large diffs are not rendered by default.

64 changes: 63 additions & 1 deletion src/cpp/src/continuous_batching/pipeline.cpp
@@ -12,13 +12,58 @@
#include "continuous_batching/pipeline_impl.hpp"
#include "speculative_decoding/speculative_decoding_impl.hpp"
#include "prompt_lookup/prompt_lookup_impl.hpp"
#include "speculative_decoding/speculative_decoding_eagle3_impl.hpp"
#include "continuous_batching/timer.hpp"
#include "utils.hpp"
#include "visual_language/inputs_embedder.hpp"
#include "json_utils.hpp"

using namespace ov::genai;

namespace {
struct Eagle3RTInfo {
bool eagle3_mode = false;
std::vector<int> hidden_layers_list;
std::filesystem::path dt_mapping_table;
};

Eagle3RTInfo
extract_eagle_mode_from_config(ov::AnyMap& config, const std::filesystem::path& models_path) {
Eagle3RTInfo eagle_rt_info;
if (config.find("eagle3_mode") != config.end()) {
eagle_rt_info.eagle3_mode = config.at("eagle3_mode").as<bool>();
config.erase("eagle3_mode");
if (config.find("hidden_layers_list") != config.end()) {
try {
eagle_rt_info.hidden_layers_list = config.at("hidden_layers_list").as<std::vector<int>>();
config.erase("hidden_layers_list");
} catch (const std::exception&) {
OPENVINO_THROW("please check the hidden layers input");
}
} else {
// compute the layers from number of hidden layers
auto config_file_path = models_path / "config.json";
if (!std::filesystem::exists(config_file_path))
OPENVINO_THROW("cannot deduce layers for hidden layer extraction");
std::ifstream file(config_file_path);

nlohmann::json data = nlohmann::json::parse(file);
using ov::genai::utils::read_json_param;
int num_decoder_layers = 0;
read_json_param(data, "num_hidden_layers", num_decoder_layers);
OPENVINO_ASSERT(num_decoder_layers > 3, "num_decoder_layers is too small to deduce hidden layers for extraction");
// The following default hidden layer selection corresponds to the EAGLE reference implementation:
// https://github.com/SafeAILab/EAGLE/blob/0ea94696/eagle/model/modeling_llama_kv.py#L1138
// These layers (2, num_decoder_layers / 2, num_decoder_layers - 3) are chosen to capture features from
// early, middle, and late stages of the decoder, as recommended by the EAGLE authors.
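// For example, a 32-layer decoder (num_decoder_layers = 32) yields {2, 16, 29}.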
// If you wish to use different layers, provide the "hidden_layers_list" parameter in the config.
eagle_rt_info.hidden_layers_list = { 2, num_decoder_layers / 2, num_decoder_layers - 3 };
}
OPENVINO_ASSERT(eagle_rt_info.hidden_layers_list.size() == 3, "only exact 3 layer extraction are expected in eagle3");
Copilot AI commented on Nov 26, 2025:

Corrected spelling of 'exact' to 'exactly' in the assertion message.

Suggested change:
- OPENVINO_ASSERT(eagle_rt_info.hidden_layers_list.size() == 3, "only exact 3 layer extraction are expected in eagle3");
+ OPENVINO_ASSERT(eagle_rt_info.hidden_layers_list.size() == 3, "only exactly 3 layer extraction are expected in eagle3");
}
return eagle_rt_info;
}

bool
extract_prompt_lookup_from_config(ov::AnyMap& config) {
bool res = false;
@@ -45,6 +90,7 @@ ContinuousBatchingPipeline::ContinuousBatchingPipeline( const std::filesystem::p
auto properties_without_draft_model = properties;
auto draft_model_desr = utils::extract_draft_model_from_config(properties_without_draft_model);
auto is_prompt_lookup_enabled = extract_prompt_lookup_from_config(properties_without_draft_model);
auto eagle_rt_info = extract_eagle_mode_from_config(draft_model_desr.properties, models_path);

auto model = utils::read_model(models_path, properties);
auto [properties_without_draft_model_without_gguf, enable_save_ov_model] = utils::extract_gguf_properties(properties_without_draft_model);
@@ -63,6 +109,10 @@ ContinuousBatchingPipeline::ContinuousBatchingPipeline( const std::filesystem::p
OPENVINO_ASSERT(draft_model_desr.model == nullptr, "Speculative decoding and prompt lookup decoding are mutually exclusive");
OPENVINO_ASSERT(embedder == nullptr, "Prompt lookup decoding is not supported for models with embeddings");
m_impl = std::make_shared<PromptLookupImpl>(model, tokenizer, scheduler_config, device, properties_without_draft_model_without_gguf, generation_config);
} else if (draft_model_desr.model != nullptr && eagle_rt_info.eagle3_mode) {
OPENVINO_ASSERT(embedder == nullptr, "Eagle speculative decoding is not supported for models with embeddings");
auto main_model_descr = ov::genai::ModelDesc(model, tokenizer, device, properties_without_draft_model_without_gguf, scheduler_config, generation_config);
m_impl = std::make_shared<Eagle3DecodingImpl>(main_model_descr, draft_model_desr, eagle_rt_info.hidden_layers_list);
} else if (draft_model_desr.model != nullptr) {
OPENVINO_ASSERT(embedder == nullptr, "Speculative decoding is not supported for models with embeddings");
auto main_model_descr = ov::genai::ModelDesc(model, tokenizer, device, properties_without_draft_model_without_gguf, scheduler_config, generation_config);
@@ -87,7 +137,7 @@ ContinuousBatchingPipeline::ContinuousBatchingPipeline(
auto properties_without_draft_model = properties;
auto draft_model_desr = utils::extract_draft_model_from_config(properties_without_draft_model);
auto is_prompt_lookup_enabled = extract_prompt_lookup_from_config(properties_without_draft_model);

auto eagle_rt_info = extract_eagle_mode_from_config(draft_model_desr.properties, models_path);
auto model = utils::read_model(models_path, properties_without_draft_model);
auto [properties_without_draft_model_without_gguf, enable_save_ov_model] = utils::extract_gguf_properties(properties_without_draft_model);
properties_without_draft_model_without_gguf[ov::cache_model_path.name()] = models_path;
@@ -105,6 +155,13 @@ ContinuousBatchingPipeline::ContinuousBatchingPipeline(
OPENVINO_ASSERT(draft_model_desr.model == nullptr, "Speculative decoding and prompt lookup decoding are mutually exclusive");
OPENVINO_ASSERT(embedder == nullptr, "Prompt lookup decoding is not supported for models with embeddings");
m_impl = std::make_shared<PromptLookupImpl>(model, tokenizer, scheduler_config, device, properties_without_draft_model_without_gguf, generation_config);
} else if (draft_model_desr.model != nullptr && eagle_rt_info.eagle3_mode) {
OPENVINO_ASSERT(embedder == nullptr, "Eagle speculative decoding is not supported for models with embeddings");
// Eagle speculative decoding does not support dynamic_split_fuse mode,
// because it requires hidden-state interaction from the main model to the
// draft model, which is yet to be implemented.
auto main_model_descr = ov::genai::ModelDesc(model, tokenizer, device, properties_without_draft_model_without_gguf, scheduler_config, generation_config);
m_impl = std::make_shared<Eagle3DecodingImpl>(main_model_descr, draft_model_desr, eagle_rt_info.hidden_layers_list);
} else if (draft_model_desr.model != nullptr) {
OPENVINO_ASSERT(embedder == nullptr, "Speculative decoding is not supported for models with embeddings");
auto main_model_descr = ov::genai::ModelDesc(model, tokenizer, device, properties_without_draft_model_without_gguf, scheduler_config, generation_config);
@@ -131,6 +188,7 @@ ContinuousBatchingPipeline::ContinuousBatchingPipeline(
auto properties_without_draft_model = properties;
auto draft_model_desr = utils::extract_draft_model_from_config(properties_without_draft_model);
auto is_prompt_lookup_enabled = extract_prompt_lookup_from_config(properties_without_draft_model);
auto eagle_rt_info = extract_eagle_mode_from_config(draft_model_desr.properties, std::filesystem::path(model_str));
auto model = utils::singleton_core().read_model(model_str, weights_tensor);

auto rt_info = model->get_rt_info();
@@ -150,6 +208,10 @@ ContinuousBatchingPipeline::ContinuousBatchingPipeline(
OPENVINO_ASSERT(draft_model_desr.model == nullptr, "Speculative decoding and prompt lookup decoding are mutually exclusive");
OPENVINO_ASSERT(embedder == nullptr, "Prompt lookup decoding is not supported for models with embeddings");
m_impl = std::make_shared<PromptLookupImpl>(model, tokenizer, scheduler_config, device, properties_without_draft_model, generation_config);
} else if (draft_model_desr.model != nullptr && eagle_rt_info.eagle3_mode) {
OPENVINO_ASSERT(embedder == nullptr, "Eagle speculative decoding is not supported for models with embeddings");
auto main_model_descr = ov::genai::ModelDesc(model, tokenizer, device, properties_without_draft_model, scheduler_config, generation_config);
m_impl = std::make_shared<Eagle3DecodingImpl>(main_model_descr, draft_model_desr, eagle_rt_info.hidden_layers_list);
} else if (draft_model_desr.model != nullptr) {
OPENVINO_ASSERT(embedder == nullptr, "Speculative decoding is not supported for models with embeddings");
auto main_model_descr = ov::genai::ModelDesc(model, tokenizer, device, properties_without_draft_model, scheduler_config, generation_config);
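A minimal usage sketch of the new EAGLE3 path (not part of the diff): it assumes the public ContinuousBatchingPipeline constructor and the ov::genai::draft_model() property helper keep their current signatures, and the model directories and layer indices are illustrative placeholders. "eagle3_mode" and "hidden_layers_list" are the keys extract_eagle_mode_from_config() consumes; if hidden_layers_list is omitted, it is deduced from the main model's config.json as shown above.

#include <vector>
#include "openvino/genai/continuous_batching_pipeline.hpp"

int main() {
    ov::genai::SchedulerConfig scheduler_config;  // default scheduling settings
    // Attach an EAGLE3 draft model; both keys below are read (and erased) by
    // extract_eagle_mode_from_config() before the pipeline impl is selected.
    ov::AnyMap properties{
        ov::genai::draft_model("/models/eagle3_draft", "CPU",
                               ov::AnyMap{{"eagle3_mode", true},
                                          {"hidden_layers_list", std::vector<int>{2, 16, 29}}})
    };
    ov::genai::ContinuousBatchingPipeline pipe("/models/llama3_main",
                                               scheduler_config, "CPU", properties);
    auto results = pipe.generate({"What is OpenVINO?"}, {ov::genai::greedy()});
    return 0;
}

With this in place the dispatch above selects Eagle3DecodingImpl instead of the plain SpeculativeDecodingImpl.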
16 changes: 16 additions & 0 deletions src/cpp/src/llm/pipeline.cpp
@@ -88,6 +88,20 @@ std::pair<std::string, Any> generation_config(const GenerationConfig& config) {
return {utils::CONFIG_ARG_NAME, Any::make<GenerationConfig>(config)};
}

inline void apply_eagle_rt_info(std::shared_ptr<ov::Model>& model, ov::AnyMap& properties, const std::filesystem::path& mapping_path) {
if (model->has_rt_info("eagle3_mode") && model->get_rt_info<bool>("eagle3_mode")) {
properties["eagle3_mode"] = true;
if (model->has_rt_info("hidden_layers_list"))
properties["hidden_layers_list"] = model->get_rt_info<std::vector<int>>("hidden_layers_list");
}
}

inline void apply_eagle_rt_info(std::shared_ptr<ov::Model>& model,
ov::AnyMap& properties,
const std::string& mapping_path) {
apply_eagle_rt_info(model, properties, std::filesystem::path(mapping_path));
}

std::pair<std::string, Any> draft_model(
const std::filesystem::path& models_path,
const std::string& device,
@@ -96,6 +110,7 @@ std::pair<std::string, Any> draft_model(

std::filesystem::path openvino_model_name = "openvino_model.xml";
auto model = utils::singleton_core().read_model(models_path / openvino_model_name, {}, plugin_config);
apply_eagle_rt_info(model, plugin_config, models_path);
auto generation_config = utils::from_config_json_if_exists(models_path);
auto tokenizer = ov::genai::Tokenizer(models_path);
return { utils::DRAFT_MODEL_ARG_NAME, Any::make<ModelDesc>(model, tokenizer, device, plugin_config, scheduler_config, generation_config) };
@@ -111,6 +126,7 @@ std::pair<std::string, Any> draft_model(
auto [plugin_config, scheduler_config] = utils::extract_scheduler_config(properties);

auto model = utils::singleton_core().read_model(model_str, weights_tensor);
apply_eagle_rt_info(model, plugin_config, model_str);
return { utils::DRAFT_MODEL_ARG_NAME, Any::make<ModelDesc>(model, tokenizer, device, plugin_config, scheduler_config, generation_config) };
}

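Because apply_eagle_rt_info() reads the flags from the draft model's rt_info, a converted EAGLE3 draft model can carry its own configuration and callers need not pass the properties manually. A sketch of that contract, assuming a conversion step embeds the same keys this hook reads (path and values are illustrative):

#include <vector>
#include "openvino/openvino.hpp"

int main() {
    ov::Core core;
    auto model = core.read_model("/models/eagle3_draft/openvino_model.xml");
    // Embed the runtime hints that apply_eagle_rt_info() later copies into the
    // draft model's plugin config as "eagle3_mode" / "hidden_layers_list":
    model->set_rt_info(true, "eagle3_mode");
    model->set_rt_info(std::vector<int>{2, 16, 29}, "hidden_layers_list");
    ov::serialize(model, "/models/eagle3_draft/openvino_model.xml");
    return 0;
}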
5 changes: 5 additions & 0 deletions src/cpp/src/sampling/sampler.cpp
@@ -853,6 +853,11 @@ SequenceGroupSamplingInfo Sampler::sample_from_sequence_group(SequenceGroup::Ptr
}
}
}
if (!is_validation_mode_enabled && m_draft2target_mapping) { // compute token offset for draft model in speculative sampling
ov::Tensor d2t_tensor = m_draft2target_mapping->get_tensor_view();
auto d2t = d2t_tensor.data<int64_t>();
sampled_token.m_index = sampled_token.m_index + (d2t? d2t[sampled_token.m_index] : 0);
Comment on lines +858 to +859

Copilot AI commented on Nov 26, 2025:

[nitpick] The variable name d2t is unclear. Consider renaming to draft_to_target_mapping or adding a comment explaining that it maps draft token indices to target token indices.

Suggested change:
- auto d2t = d2t_tensor.data<int64_t>();
- sampled_token.m_index = sampled_token.m_index + (d2t? d2t[sampled_token.m_index] : 0);
+ // Map from draft token indices to target token indices
+ auto draft_to_target_mapping = d2t_tensor.data<int64_t>();
+ sampled_token.m_index = sampled_token.m_index + (draft_to_target_mapping ? draft_to_target_mapping[sampled_token.m_index] : 0);
}
// flag to add sampled token to generated sequence or extend logit processors only
bool is_extend_sequence = logit_token_offset == 0 || is_generate_n_tokens || !is_validation_passed;
if (is_validation_mode_enabled && !is_extend_sequence) {
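The d2t offset applied in the hunk above exists because an EAGLE3 draft model samples from a reduced draft vocabulary: the mapping stores, per draft-vocabulary index, the offset back to the main model's token id. A standalone sketch of that arithmetic (table contents are illustrative):

#include <cstdint>
#include <vector>

// d2t[i] is the offset that turns draft-vocabulary index i into the target
// (main model) token id; an empty table means the vocabularies already match.
int64_t to_target_id(int64_t draft_id, const std::vector<int64_t>& d2t) {
    return d2t.empty() ? draft_id : draft_id + d2t[draft_id];
}
// Example: with d2t = {0, 5, 5}, draft index 1 maps to target token id 1 + 5 = 6.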
5 changes: 5 additions & 0 deletions src/cpp/src/sampling/sampler.hpp
@@ -99,6 +99,7 @@ class Sampler {
Tokenizer m_tokenizer;

ThreadPool m_thread_pool;
std::shared_ptr<ov::op::v0::Constant> m_draft2target_mapping; // Constant holding the draft-to-target (d2t) token mapping for the eagle model
public:
Sampler(const Sampler& rhs) = delete;
Sampler(Sampler&& rhs) = delete;
@@ -125,6 +126,10 @@ class Sampler {
// pair with map with backend name and corresponding compiler init time, and vector of compile times for each concrete grammar
std::pair<std::map<std::string, float>, std::vector<float>> get_structured_output_times();
void clear_structured_output_compile_times();

void set_d2t_for_decoding(std::shared_ptr<ov::op::v0::Constant>& d2t) {
m_draft2target_mapping = d2t;
}
};

class Sampler::GroupBeamSearcher {
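A wiring sketch for the new setter (an assumption based on this diff: the speculative-decoding impl is expected to fetch the d2t constant from the converted draft model; here an identity mapping is built by hand, and the vocabulary size is illustrative):

#include <cstdint>
#include <memory>
#include <vector>
#include "openvino/op/constant.hpp"
#include "sampling/sampler.hpp"  // internal header; include path assumed

void attach_identity_d2t(ov::genai::Sampler& sampler) {
    // All-zero offsets: sampled draft indices pass through unchanged.
    std::vector<int64_t> offsets(32000, 0);
    auto d2t = std::make_shared<ov::op::v0::Constant>(
        ov::element::i64, ov::Shape{offsets.size()}, offsets);
    sampler.set_d2t_for_decoding(d2t);
}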
12 changes: 11 additions & 1 deletion src/cpp/src/sequence_group.hpp
@@ -44,6 +44,7 @@ class Sequence {
LogProbs m_generated_log_probs;
uint64_t m_grouped_id;
uint64_t m_id = _get_next_global_sequence_id();
ov::Tensor m_hidden_state = ov::Tensor();
SequenceStatus m_status = SequenceStatus::RUNNING;
GenerationFinishReason m_finish_reason = GenerationFinishReason::NONE;
float m_cumulative_log_prob = 0.0f;
@@ -70,6 +71,7 @@
m_generated_ids(seq.m_generated_ids),
m_generated_log_probs(seq.m_generated_log_probs),
m_grouped_id(id),
m_hidden_state(seq.m_hidden_state),
m_status(seq.m_status),
m_cumulative_log_prob(seq.m_cumulative_log_prob),
m_sequence_group(seq.m_sequence_group),
@@ -142,6 +144,14 @@ class Sequence {
m_generated_ids.push_back(token_id);
}

void update_hidden_state(const ov::Tensor& tensor) {
m_hidden_state = tensor;
}

ov::Tensor get_hidden_state() const {
return m_hidden_state;
}

// removes n last tokens and updates cumulative log prob
// used to remove stop_string from the output
void remove_last_tokens(int n) {
Expand Down Expand Up @@ -644,7 +654,7 @@ class SequenceGroup : public std::enable_shared_from_this<SequenceGroup> {
m_num_validation_tokens = k;
}

size_t get_num_tokens_to_validate() {
size_t get_num_tokens_to_validate() const {
return m_num_validation_tokens;
}

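Together with the model_runner.hpp changes (whose large diff is not rendered above), the intended flow appears to be: the model runner caches the main model's extracted hidden states on each sequence, and the draft model reads them back on the next speculative step. A minimal sketch using the new accessors (shape and hidden size are assumptions):

#include "openvino/runtime/tensor.hpp"
#include "sequence_group.hpp"  // internal header; include path assumed

void stash_hidden_state(const ov::genai::Sequence::Ptr& sequence) {
    // Concatenated features from the 3 selected decoder layers (hidden size assumed).
    ov::Tensor hidden(ov::element::f32, ov::Shape{1, 3 * 4096});
    sequence->update_hidden_state(hidden);
    ov::Tensor draft_input = sequence->get_hidden_state();  // shares the same data
}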