src/cpp/include/openvino/genai/continuous_batching_pipeline.hpp
@@ -65,13 +65,18 @@ class OPENVINO_GENAI_EXPORTS ContinuousBatchingPipeline {
class ContinuousBatchingImpl;

class ContinuousBatchingForSpeculativeDecodingImpl;
class ContinuousBatchingForEagle3DecodingImpl;
class ContinuousBatchingForPromptLookupImpl;
class SpeculativeDecodingImpl;
class Eagle3DecodingImpl;
class PromptLookupImpl;

friend class ContinuousBatchingForSpeculativeDecodingImpl;

friend class ContinuousBatchingForPromptLookupImpl;
friend class ContinuousBatchingForEagle3DecodingImpl;
friend class SpeculativeDecodingImpl;
friend class Eagle3DecodingImpl;
friend class PromptLookupImpl;

std::shared_ptr<IContinuousBatchingPipeline> m_impl;
256 changes: 255 additions & 1 deletion src/cpp/src/continuous_batching/model_runner.hpp

Large diffs are not rendered by default.

64 changes: 63 additions & 1 deletion src/cpp/src/continuous_batching/pipeline.cpp
@@ -12,13 +12,58 @@
#include "continuous_batching/pipeline_impl.hpp"
#include "speculative_decoding/speculative_decoding_impl.hpp"
#include "prompt_lookup/prompt_lookup_impl.hpp"
#include "speculative_decoding/speculative_decoding_eagle3_impl.hpp"
#include "continuous_batching/timer.hpp"
#include "utils.hpp"
#include "visual_language/inputs_embedder.hpp"
#include "json_utils.hpp"

using namespace ov::genai;

namespace {
struct Eagle3RTInfo {
bool eagle3_mode = false;
std::vector<int> hidden_layers_list;
std::filesystem::path dt_mapping_table;
};

Eagle3RTInfo
extract_eagle_mode_from_config(ov::AnyMap& config, const std::filesystem::path& models_path) {
Eagle3RTInfo eagle_rt_info;
if (config.find("eagle3_mode") != config.end()) {
eagle_rt_info.eagle3_mode = config.at("eagle3_mode").as<bool>();
config.erase("eagle3_mode");
if (config.find("hidden_layers_list") != config.end()) {
try {
eagle_rt_info.hidden_layers_list = config.at("hidden_layers_list").as<std::vector<int>>();
config.erase("hidden_layers_list");
} catch (const std::exception&) {
OPENVINO_THROW("please check the hidden layers input");
}
} else {
// compute the layers from number of hidden layers
auto config_file_path = models_path / "config.json";
if (!std::filesystem::exists(config_file_path))
OPENVINO_THROW("cannot deduce layers for hidden layer extraction");
std::ifstream file(config_file_path);

nlohmann::json data = nlohmann::json::parse(file);
using ov::genai::utils::read_json_param;
int num_decoder_layers = 0;
read_json_param(data, "num_hidden_layers", num_decoder_layers);
OPENVINO_ASSERT(num_decoder_layers > 3, "num_decoder_layers is too small to deduce hidden layers for extraction");
// The following default hidden layer selection corresponds to the EAGLE reference implementation:
// https://github.com/SafeAILab/EAGLE/blob/0ea94696/eagle/model/modeling_llama_kv.py#L1138
// These layers (2, num_decoder_layers / 2, num_decoder_layers - 3) are chosen to capture features from
// early, middle, and late stages of the decoder, as recommended by the EAGLE authors.
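// For example, a 32-layer decoder (num_decoder_layers = 32) yields {2, 16, 29}.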
// If you wish to use different layers, provide the "hidden_layers_list" parameter in the config.
eagle_rt_info.hidden_layers_list = { 2, num_decoder_layers / 2, num_decoder_layers - 3 };
}
OPENVINO_ASSERT(eagle_rt_info.hidden_layers_list.size() == 3, "only exact 3 layer extraction are expected in eagle3");
Copilot AI commented on Nov 26, 2025:

Corrected spelling of 'exact' to 'exactly' in the assertion message.

Suggested change:
- OPENVINO_ASSERT(eagle_rt_info.hidden_layers_list.size() == 3, "only exact 3 layer extraction are expected in eagle3");
+ OPENVINO_ASSERT(eagle_rt_info.hidden_layers_list.size() == 3, "only exactly 3 layer extraction are expected in eagle3");
}
return eagle_rt_info;
}

bool
extract_prompt_lookup_from_config(ov::AnyMap& config) {
bool res = false;
@@ -45,6 +90,7 @@ ContinuousBatchingPipeline::ContinuousBatchingPipeline( const std::filesystem::p
auto properties_without_draft_model = properties;
auto draft_model_desr = utils::extract_draft_model_from_config(properties_without_draft_model);
auto is_prompt_lookup_enabled = extract_prompt_lookup_from_config(properties_without_draft_model);
auto eagle_rt_info = extract_eagle_mode_from_config(draft_model_desr.properties, models_path);

auto model = utils::read_model(models_path, properties);
auto [properties_without_draft_model_without_gguf, enable_save_ov_model] = utils::extract_gguf_properties(properties_without_draft_model);
@@ -63,6 +109,10 @@ ContinuousBatchingPipeline::ContinuousBatchingPipeline( const std::filesystem::p
OPENVINO_ASSERT(draft_model_desr.model == nullptr, "Speculative decoding and prompt lookup decoding are mutually exclusive");
OPENVINO_ASSERT(embedder == nullptr, "Prompt lookup decoding is not supported for models with embeddings");
m_impl = std::make_shared<PromptLookupImpl>(model, tokenizer, scheduler_config, device, properties_without_draft_model_without_gguf, generation_config);
} else if (draft_model_desr.model != nullptr && eagle_rt_info.eagle3_mode) {
OPENVINO_ASSERT(embedder == nullptr, "Eagle speculative decoding is not supported for models with embeddings");
auto main_model_descr = ov::genai::ModelDesc(model, tokenizer, device, properties_without_draft_model_without_gguf, scheduler_config, generation_config);
m_impl = std::make_shared<Eagle3DecodingImpl>(main_model_descr, draft_model_desr, eagle_rt_info.hidden_layers_list);
} else if (draft_model_desr.model != nullptr) {
OPENVINO_ASSERT(embedder == nullptr, "Speculative decoding is not supported for models with embeddings");
auto main_model_descr = ov::genai::ModelDesc(model, tokenizer, device, properties_without_draft_model_without_gguf, scheduler_config, generation_config);
@@ -87,7 +137,7 @@ ContinuousBatchingPipeline::ContinuousBatchingPipeline(
auto properties_without_draft_model = properties;
auto draft_model_desr = utils::extract_draft_model_from_config(properties_without_draft_model);
auto is_prompt_lookup_enabled = extract_prompt_lookup_from_config(properties_without_draft_model);

auto eagle_rt_info = extract_eagle_mode_from_config(draft_model_desr.properties, models_path);
auto model = utils::read_model(models_path, properties_without_draft_model);
auto [properties_without_draft_model_without_gguf, enable_save_ov_model] = utils::extract_gguf_properties(properties_without_draft_model);
properties_without_draft_model_without_gguf[ov::cache_model_path.name()] = models_path;
@@ -105,6 +155,13 @@ ContinuousBatchingPipeline::ContinuousBatchingPipeline(
OPENVINO_ASSERT(draft_model_desr.model == nullptr, "Speculative decoding and prompt lookup decoding are mutually exclusive");
OPENVINO_ASSERT(embedder == nullptr, "Prompt lookup decoding is not supported for models with embeddings");
m_impl = std::make_shared<PromptLookupImpl>(model, tokenizer, scheduler_config, device, properties_without_draft_model_without_gguf, generation_config);
} else if (draft_model_desr.model != nullptr && eagle_rt_info.eagle3_mode) {
OPENVINO_ASSERT(embedder == nullptr, "Eagle speculative decoding is not supported for models with embeddings");
// Eagle speculative decoding does not support dynamic_split_fuse mode,
// because it requires hidden-state interaction from the main model to the
// draft model, which is yet to be implemented.
auto main_model_descr = ov::genai::ModelDesc(model, tokenizer, device, properties_without_draft_model_without_gguf, scheduler_config, generation_config);
m_impl = std::make_shared<Eagle3DecodingImpl>(main_model_descr, draft_model_desr, eagle_rt_info.hidden_layers_list);
} else if (draft_model_desr.model != nullptr) {
OPENVINO_ASSERT(embedder == nullptr, "Speculative decoding is not supported for models with embeddings");
auto main_model_descr = ov::genai::ModelDesc(model, tokenizer, device, properties_without_draft_model_without_gguf, scheduler_config, generation_config);
@@ -131,6 +188,7 @@ ContinuousBatchingPipeline::ContinuousBatchingPipeline(
auto properties_without_draft_model = properties;
auto draft_model_desr = utils::extract_draft_model_from_config(properties_without_draft_model);
auto is_prompt_lookup_enabled = extract_prompt_lookup_from_config(properties_without_draft_model);
auto eagle_rt_info = extract_eagle_mode_from_config(draft_model_desr.properties, std::filesystem::path(model_str));
auto model = utils::singleton_core().read_model(model_str, weights_tensor);

auto rt_info = model->get_rt_info();
@@ -150,6 +208,10 @@ ContinuousBatchingPipeline::ContinuousBatchingPipeline(
OPENVINO_ASSERT(draft_model_desr.model == nullptr, "Speculative decoding and prompt lookup decoding are mutually exclusive");
OPENVINO_ASSERT(embedder == nullptr, "Prompt lookup decoding is not supported for models with embeddings");
m_impl = std::make_shared<PromptLookupImpl>(model, tokenizer, scheduler_config, device, properties_without_draft_model, generation_config);
} else if (draft_model_desr.model != nullptr && eagle_rt_info.eagle3_mode) {
OPENVINO_ASSERT(embedder == nullptr, "Eagle speculative decoding is not supported for models with embeddings");
auto main_model_descr = ov::genai::ModelDesc(model, tokenizer, device, properties_without_draft_model, scheduler_config, generation_config);
m_impl = std::make_shared<Eagle3DecodingImpl>(main_model_descr, draft_model_desr, eagle_rt_info.hidden_layers_list);
} else if (draft_model_desr.model != nullptr) {
OPENVINO_ASSERT(embedder == nullptr, "Speculative decoding is not supported for models with embeddings");
auto main_model_descr = ov::genai::ModelDesc(model, tokenizer, device, properties_without_draft_model, scheduler_config, generation_config);
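A minimal usage sketch of the new EAGLE3 path (not part of the diff): it assumes the public ContinuousBatchingPipeline constructor and the ov::genai::draft_model() property helper keep their current signatures, and the model directories and layer indices are illustrative placeholders. "eagle3_mode" and "hidden_layers_list" are the keys extract_eagle_mode_from_config() consumes; if hidden_layers_list is omitted, it is deduced from the main model's config.json as shown above.

#include <vector>
#include "openvino/genai/continuous_batching_pipeline.hpp"

int main() {
    ov::genai::SchedulerConfig scheduler_config;  // default scheduling settings
    // Attach an EAGLE3 draft model; both keys below are read (and erased) by
    // extract_eagle_mode_from_config() before the pipeline impl is selected.
    ov::AnyMap properties{
        ov::genai::draft_model("/models/eagle3_draft", "CPU",
                               ov::AnyMap{{"eagle3_mode", true},
                                          {"hidden_layers_list", std::vector<int>{2, 16, 29}}})
    };
    ov::genai::ContinuousBatchingPipeline pipe("/models/llama3_main",
                                               scheduler_config, "CPU", properties);
    auto results = pipe.generate({"What is OpenVINO?"}, {ov::genai::greedy()});
    return 0;
}

With this in place the dispatch above selects Eagle3DecodingImpl instead of the plain SpeculativeDecodingImpl.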
16 changes: 16 additions & 0 deletions src/cpp/src/llm/pipeline.cpp
@@ -88,6 +88,20 @@ std::pair<std::string, Any> generation_config(const GenerationConfig& config) {
return {utils::CONFIG_ARG_NAME, Any::make<GenerationConfig>(config)};
}

inline void apply_eagle_rt_info(std::shared_ptr<ov::Model>& model, ov::AnyMap& properties, const std::filesystem::path& mapping_path) {
if (model->has_rt_info("eagle3_mode") && model->get_rt_info<bool>("eagle3_mode")) {
properties["eagle3_mode"] = true;
if (model->has_rt_info("hidden_layers_list"))
properties["hidden_layers_list"] = model->get_rt_info<std::vector<int>>("hidden_layers_list");
}
}

inline void apply_eagle_rt_info(std::shared_ptr<ov::Model>& model,
ov::AnyMap& properties,
const std::string& mapping_path) {
apply_eagle_rt_info(model, properties, std::filesystem::path(mapping_path));
}

std::pair<std::string, Any> draft_model(
const std::filesystem::path& models_path,
const std::string& device,
@@ -96,6 +110,7 @@ std::pair<std::string, Any> draft_model(

std::filesystem::path openvino_model_name = "openvino_model.xml";
auto model = utils::singleton_core().read_model(models_path / openvino_model_name, {}, plugin_config);
apply_eagle_rt_info(model, plugin_config, models_path);
auto generation_config = utils::from_config_json_if_exists(models_path);
auto tokenizer = ov::genai::Tokenizer(models_path);
return { utils::DRAFT_MODEL_ARG_NAME, Any::make<ModelDesc>(model, tokenizer, device, plugin_config, scheduler_config, generation_config) };
@@ -111,6 +126,7 @@ std::pair<std::string, Any> draft_model(
auto [plugin_config, scheduler_config] = utils::extract_scheduler_config(properties);

auto model = utils::singleton_core().read_model(model_str, weights_tensor);
apply_eagle_rt_info(model, plugin_config, model_str);
return { utils::DRAFT_MODEL_ARG_NAME, Any::make<ModelDesc>(model, tokenizer, device, plugin_config, scheduler_config, generation_config) };
}

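Because apply_eagle_rt_info() reads the flags from the draft model's rt_info, a converted EAGLE3 draft model can carry its own configuration and callers need not pass the properties manually. A sketch of that contract, assuming a conversion step embeds the same keys this hook reads (path and values are illustrative):

#include <vector>
#include "openvino/openvino.hpp"

int main() {
    ov::Core core;
    auto model = core.read_model("/models/eagle3_draft/openvino_model.xml");
    // Embed the runtime hints that apply_eagle_rt_info() later copies into the
    // draft model's plugin config as "eagle3_mode" / "hidden_layers_list":
    model->set_rt_info(true, "eagle3_mode");
    model->set_rt_info(std::vector<int>{2, 16, 29}, "hidden_layers_list");
    ov::serialize(model, "/models/eagle3_draft/openvino_model.xml");
    return 0;
}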
5 changes: 5 additions & 0 deletions src/cpp/src/sampling/sampler.cpp
@@ -853,6 +853,11 @@ SequenceGroupSamplingInfo Sampler::sample_from_sequence_group(SequenceGroup::Ptr
}
}
}
if (!is_validation_mode_enabled && m_draft2target_mapping) { // compute token offset for draft model in speculative sampling
ov::Tensor d2t_tensor = m_draft2target_mapping->get_tensor_view();
auto d2t = d2t_tensor.data<int64_t>();
sampled_token.m_index = sampled_token.m_index + (d2t? d2t[sampled_token.m_index] : 0);
Comment on lines +858 to +859

Copilot AI commented on Nov 26, 2025:

[nitpick] The variable name d2t is unclear. Consider renaming to draft_to_target_mapping or adding a comment explaining that it maps draft token indices to target token indices.

Suggested change:
- auto d2t = d2t_tensor.data<int64_t>();
- sampled_token.m_index = sampled_token.m_index + (d2t? d2t[sampled_token.m_index] : 0);
+ // Map from draft token indices to target token indices
+ auto draft_to_target_mapping = d2t_tensor.data<int64_t>();
+ sampled_token.m_index = sampled_token.m_index + (draft_to_target_mapping ? draft_to_target_mapping[sampled_token.m_index] : 0);
}
// flag to add sampled token to generated sequence or extend logit processors only
bool is_extend_sequence = logit_token_offset == 0 || is_generate_n_tokens || !is_validation_passed;
if (is_validation_mode_enabled && !is_extend_sequence) {
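The d2t offset applied in the hunk above exists because an EAGLE3 draft model samples from a reduced draft vocabulary: the mapping stores, per draft-vocabulary index, the offset back to the main model's token id. A standalone sketch of that arithmetic (table contents are illustrative):

#include <cstdint>
#include <vector>

// d2t[i] is the offset that turns draft-vocabulary index i into the target
// (main model) token id; an empty table means the vocabularies already match.
int64_t to_target_id(int64_t draft_id, const std::vector<int64_t>& d2t) {
    return d2t.empty() ? draft_id : draft_id + d2t[draft_id];
}
// Example: with d2t = {0, 5, 5}, draft index 1 maps to target token id 1 + 5 = 6.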
5 changes: 5 additions & 0 deletions src/cpp/src/sampling/sampler.hpp
@@ -99,6 +99,7 @@ class Sampler {
Tokenizer m_tokenizer;

ThreadPool m_thread_pool;
std::shared_ptr<ov::op::v0::Constant> m_draft2target_mapping; // Constant holding the draft-to-target (d2t) token mapping for the eagle model
public:
Sampler(const Sampler& rhs) = delete;
Sampler(Sampler&& rhs) = delete;
@@ -125,6 +126,10 @@ class Sampler {
// pair with map with backend name and corresponding compiler init time, and vector of compile times for each concrete grammar
std::pair<std::map<std::string, float>, std::vector<float>> get_structured_output_times();
void clear_structured_output_compile_times();

void set_d2t_for_decoding(std::shared_ptr<ov::op::v0::Constant>& d2t) {
m_draft2target_mapping = d2t;
}
};

class Sampler::GroupBeamSearcher {
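A wiring sketch for the new setter (an assumption based on this diff: the speculative-decoding impl is expected to fetch the d2t constant from the converted draft model; here an identity mapping is built by hand, and the vocabulary size is illustrative):

#include <cstdint>
#include <memory>
#include <vector>
#include "openvino/op/constant.hpp"
#include "sampling/sampler.hpp"  // internal header; include path assumed

void attach_identity_d2t(ov::genai::Sampler& sampler) {
    // All-zero offsets: sampled draft indices pass through unchanged.
    std::vector<int64_t> offsets(32000, 0);
    auto d2t = std::make_shared<ov::op::v0::Constant>(
        ov::element::i64, ov::Shape{offsets.size()}, offsets);
    sampler.set_d2t_for_decoding(d2t);
}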
12 changes: 11 additions & 1 deletion src/cpp/src/sequence_group.hpp
@@ -44,6 +44,7 @@ class Sequence {
LogProbs m_generated_log_probs;
uint64_t m_grouped_id;
uint64_t m_id = _get_next_global_sequence_id();
ov::Tensor m_hidden_state = ov::Tensor();
SequenceStatus m_status = SequenceStatus::RUNNING;
GenerationFinishReason m_finish_reason = GenerationFinishReason::NONE;
float m_cumulative_log_prob = 0.0f;
@@ -70,6 +71,7 @@
m_generated_ids(seq.m_generated_ids),
m_generated_log_probs(seq.m_generated_log_probs),
m_grouped_id(id),
m_hidden_state(seq.m_hidden_state),
m_status(seq.m_status),
m_cumulative_log_prob(seq.m_cumulative_log_prob),
m_sequence_group(seq.m_sequence_group),
@@ -142,6 +144,14 @@ class Sequence {
m_generated_ids.push_back(token_id);
}

void update_hidden_state(const ov::Tensor& tensor) {
m_hidden_state = tensor;
}

ov::Tensor get_hidden_state() const {
return m_hidden_state;
}

// removes n last tokens and updates cumulative log prob
// used to remove stop_string from the output
void remove_last_tokens(int n) {
Expand Down Expand Up @@ -644,7 +654,7 @@ class SequenceGroup : public std::enable_shared_from_this<SequenceGroup> {
m_num_validation_tokens = k;
}

size_t get_num_tokens_to_validate() {
size_t get_num_tokens_to_validate() const {
return m_num_validation_tokens;
}

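Together with the model_runner.hpp changes (whose large diff is not rendered above), the intended flow appears to be: the model runner caches the main model's extracted hidden states on each sequence, and the draft model reads them back on the next speculative step. A minimal sketch using the new accessors (shape and hidden size are assumptions):

#include "openvino/runtime/tensor.hpp"
#include "sequence_group.hpp"  // internal header; include path assumed

void stash_hidden_state(const ov::genai::Sequence::Ptr& sequence) {
    // Concatenated features from the 3 selected decoder layers (hidden size assumed).
    ov::Tensor hidden(ov::element::f32, ov::Shape{1, 3 * 4096});
    sequence->update_hidden_state(hidden);
    ov::Tensor draft_input = sequence->get_hidden_state();  // shares the same data
}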