diff --git a/src/cpp/src/visual_language/phi3_vision/classes.cpp b/src/cpp/src/visual_language/phi3_vision/classes.cpp
index 60f9730aec..bcba4da6c0 100644
--- a/src/cpp/src/visual_language/phi3_vision/classes.cpp
+++ b/src/cpp/src/visual_language/phi3_vision/classes.cpp
@@ -646,17 +646,18 @@ ov::Tensor insert_image_placeholders(
     return merged;
 }
 
-std::vector<std::variant<ov::Tensor, size_t>> drop_image_placeholders(const ov::Tensor& tokens) {
+std::vector<std::variant<ov::Tensor, size_t>> drop_image_placeholders(int64_t* tokens, const ov::Shape& shape) {
     std::vector<std::variant<ov::Tensor, size_t>> chunks;
-    int64_t last_token = tokens.data<int64_t>()[0];
+    size_t full_length = shape.at(1);
+    int64_t last_token = tokens[0];
     size_t text_start = 0;
-    for (size_t offset = 1; offset < tokens.get_shape().at(1); ++offset) {
+    for (size_t offset = 1; offset < full_length; ++offset) {
         // If last_token and next_token are not negative, it's continuation of the current chunk text - skip
         // If last_token is negative and next_token is not negative, it's a start of text - save the offset, add image placeholder
         // If last token is not negative and next_token is negative, it's an end of text - push_back a chunk
         // If last_token and next_token are negative, it's continuation of an image placeholder - skip
         // if last_token and next_token are negative but different, it's a start of a new image placeholder - save the previous image placeholder
-        int64_t next_token = tokens.data<int64_t>()[offset];
+        int64_t next_token = tokens[offset];
         if (last_token < 0 && next_token >= 0) {
             text_start = offset;
             chunks.push_back(size_t(-(last_token + 1)));
@@ -665,7 +666,7 @@ std::vector<std::variant<ov::Tensor, size_t>> drop_image_placeholders(const ov::
                 std::in_place_type<ov::Tensor>,
                 ov::element::i64,
                 ov::Shape{1, offset - text_start},
-                tokens.data<int64_t>() + text_start
+                tokens + text_start
             );
         } else if (last_token < 0 && next_token < 0 && last_token != next_token) {
             chunks.push_back(size_t(-(last_token + 1)));
@@ -673,13 +674,12 @@ std::vector<std::variant<ov::Tensor, size_t>> drop_image_placeholders(const ov::
         last_token = next_token;
     }
     // Add the last chunk
-    size_t full_length = tokens.get_shape().at(1);
     if (last_token >= 0) {
         chunks.emplace_back(
             std::in_place_type<ov::Tensor>,
             ov::element::i64,
             ov::Shape{1, full_length - text_start},
-            tokens.data<int64_t>() + text_start
+            tokens + text_start
         );
     } else {
         chunks.push_back(size_t(-(last_token + 1)));
@@ -806,7 +806,7 @@ ov::Tensor InputsEmbedderPhi3V::get_inputs_embeds(const std::string& image_promp
     m_prev_hist_length = m_kv_cache_state.get_state().size();
     m_kv_cache_state.add_inputs(new_tokens);
 
-    std::vector<std::variant<ov::Tensor, size_t>> tokens = phi_utils::drop_image_placeholders(new_tokens);
+    std::vector<std::variant<ov::Tensor, size_t>> tokens = phi_utils::drop_image_placeholders(new_tokens.data<int64_t>(), new_tokens.get_shape());
     ov::Tensor inputs_embeds{ov::element::f32, {1, new_tokens.get_shape().at(1), m_vlm_config.hidden_size}};
     size_t offset = 0;
     CircularBufferQueueElementGuard<EmbeddingsRequest> embeddings_request_guard(m_embedding->get_request_queue().get());
diff --git a/src/cpp/src/visual_language/phi3_vision/classes.hpp b/src/cpp/src/visual_language/phi3_vision/classes.hpp
index 3383fbde41..0115aac32f 100644
--- a/src/cpp/src/visual_language/phi3_vision/classes.hpp
+++ b/src/cpp/src/visual_language/phi3_vision/classes.hpp
@@ -19,7 +19,7 @@ std::string normalize_prompt(
 );
 std::vector<std::variant<ov::Tensor, size_t>> split_tokenize(const std::string& text, ov::genai::Tokenizer& tokenizer, const std::regex& native_pattern);
 ov::Tensor insert_image_placeholders(const std::vector<std::variant<ov::Tensor, size_t>>& chunks, const std::vector<size_t>& tokens_per_images);
-std::vector<std::variant<ov::Tensor, size_t>> drop_image_placeholders(const ov::Tensor& tokens);
+std::vector<std::variant<ov::Tensor, size_t>> drop_image_placeholders(int64_t* tokens, const ov::Shape& shape);
 
 }
 
diff --git a/src/cpp/src/visual_language/phi4mm/classes.cpp b/src/cpp/src/visual_language/phi4mm/classes.cpp
index c11d3e2bd1..3e435da811 100644
--- a/src/cpp/src/visual_language/phi4mm/classes.cpp
+++ b/src/cpp/src/visual_language/phi4mm/classes.cpp
@@ -820,7 +820,7 @@ ov::Tensor InputsEmbedderPhi4MM::get_inputs_embeds(
     m_prev_hist_length = m_kv_cache_state.get_state().size();
     m_kv_cache_state.add_inputs(new_tokens);
 
-    std::vector<std::variant<ov::Tensor, size_t>> tokens = phi_utils::drop_image_placeholders(new_tokens);
+    std::vector<std::variant<ov::Tensor, size_t>> tokens = phi_utils::drop_image_placeholders(new_tokens.data<int64_t>(), new_tokens.get_shape());
     ov::Tensor inputs_embeds{ov::element::f32, {1, new_tokens.get_shape().at(1), m_vlm_config.hidden_size}};
     size_t offset = 0;
     CircularBufferQueueElementGuard<EmbeddingsRequest> embeddings_request_guard(m_embedding->get_request_queue().get());