Skip to content

Commit af2e7c4

Browse files
committed
Optimize Image preprocessing by GPU with ov::Model
1 parent fc503c8 commit af2e7c4

File tree

2 files changed

+254
-11
lines changed

2 files changed

+254
-11
lines changed

src/cpp/src/visual_language/phi3_vision/classes.cpp

Lines changed: 250 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -528,6 +528,195 @@ ov::Tensor hd_feature_transform(const EncodedImage& image_features, InferRequest
528528
return res;
529529
}
530530

531+
// Builds a subgraph that bicubic-resizes an NHWC image tensor to `target_size`
// ([height, width]). The input is converted to f32 first because cubic
// interpolation requires a floating-point element type.
std::shared_ptr<ov::Node> create_bicubic_resize(std::shared_ptr<ov::Node> input, std::shared_ptr<ov::Node> target_size) {
    using namespace ov::op;

    auto as_float = std::make_shared<v0::Convert>(input, ov::element::f32);

    v11::Interpolate::InterpolateAttrs interp_attrs;
    interp_attrs.mode = v11::Interpolate::InterpolateMode::CUBIC;
    interp_attrs.shape_calculation_mode = v11::Interpolate::ShapeCalcMode::SIZES;
    interp_attrs.coordinate_transformation_mode = v11::Interpolate::CoordinateTransformMode::ASYMMETRIC;
    interp_attrs.cube_coeff = -0.5f;  // standard bicubic coefficient
    interp_attrs.nearest_mode = v11::Interpolate::NearestMode::FLOOR;
    interp_attrs.pads_begin = {0, 0};
    interp_attrs.pads_end = {0, 0};
    interp_attrs.antialias = false;

    // NHWC layout: the spatial dimensions (height, width) live on axes 1 and 2.
    auto spatial_axes = v0::Constant::create(ov::element::i64, ov::Shape{2}, std::vector<int64_t>{1, 2});

    return std::make_shared<v11::Interpolate>(as_float, target_size, spatial_axes, interp_attrs);
}
552+
553+
// Builds a normalization subgraph replicating the CPU mean_scale() routine:
//   (x / 255.0f - config.image_mean[c]) / config.image_std[c]
// Accepts either a u8 input (converted to f32 first) or an f32 input, laid
// out as NHWC.
std::shared_ptr<ov::Node> create_mean_scale(std::shared_ptr<ov::Node> input_u8_or_f32, const ProcessorConfig& config) {
    using namespace ov::op;

    std::shared_ptr<ov::Node> pixels_f32 = input_u8_or_f32;
    if (input_u8_or_f32->get_element_type() == ov::element::u8) {
        pixels_f32 = std::make_shared<v0::Convert>(input_u8_or_f32, ov::element::f32);
    }

    // x / 255.0
    auto max_pixel = v0::Constant::create(ov::element::f32, ov::Shape{}, std::vector<float>{255.0f});
    auto normalized = std::make_shared<v1::Divide>(pixels_f32, max_pixel);

    // Per-channel mean/std constants shaped {1, 1, 1, 3} so that they
    // broadcast along the channel (last) dimension of an NHWC tensor.
    auto mean = v0::Constant::create(
        ov::element::f32, ov::Shape{1, 1, 1, 3},
        std::vector<float>{config.image_mean[0], config.image_mean[1], config.image_mean[2]});
    auto std_dev = v0::Constant::create(
        ov::element::f32, ov::Shape{1, 1, 1, 3},
        std::vector<float>{config.image_std[0], config.image_std[1], config.image_std[2]});

    // (x/255 - mean) / std
    auto centered = std::make_shared<v1::Subtract>(normalized, mean);
    return std::make_shared<v1::Divide>(centered, std_dev);
}
586+
587+
// Builds a Transpose that converts an NHWC tensor to NCHW layout.
std::shared_ptr<ov::Node> create_channels_first(std::shared_ptr<ov::Node> input_nhwc) {
    using namespace ov::op;

    // NHWC (0,1,2,3) -> NCHW (0,3,1,2)
    auto nchw_order = v0::Constant::create(ov::element::i64, ov::Shape{4}, std::vector<int64_t>{0, 3, 1, 2});
    return std::make_shared<v1::Transpose>(input_nhwc, nchw_order);
}
594+
595+
// Builds a subgraph that cuts an NCHW image into a grid of square patches of
// side INPUT_IMAGE_SIZE (336) stacked along the batch axis:
//   (N, C, H, W) -> (N * (H/S) * (W/S), C, S, S)   with S = INPUT_IMAGE_SIZE.
// Assumes H and W are exact multiples of S (guaranteed by HD_transform).
std::shared_ptr<ov::Node> create_slice_image(std::shared_ptr<ov::Node> input_nchw) {
    using namespace ov::op;

    auto runtime_shape = std::make_shared<v3::ShapeOf>(input_nchw);
    auto gather_axis = v0::Constant::create(ov::element::i64, ov::Shape{}, std::vector<int64_t>{0});

    // Extracts one dimension of the runtime shape as a 1-element i64 tensor.
    auto dim_at = [&](int64_t index) -> std::shared_ptr<ov::Node> {
        auto index_const = v0::Constant::create(ov::element::i64, ov::Shape{1}, std::vector<int64_t>{index});
        return std::make_shared<v8::Gather>(runtime_shape, index_const, gather_axis);
    };
    auto N = dim_at(0);
    auto C = dim_at(1);
    auto H = dim_at(2);
    auto W = dim_at(3);

    // Patch side length (336)
    auto S = v0::Constant::create(ov::element::i64, ov::Shape{1}, std::vector<int64_t>{INPUT_IMAGE_SIZE});

    // Grid dimensions: num_h = H / S, num_w = W / S
    auto num_h = std::make_shared<v1::Divide>(H, S);
    auto num_w = std::make_shared<v1::Divide>(W, S);

    // [N, C, H, W] -> [N, C, num_h, S, num_w, S]
    auto shape_6d = std::make_shared<v0::Concat>(ov::NodeVector{N, C, num_h, S, num_w, S}, 0);
    auto split_spatial = std::make_shared<v1::Reshape>(input_nchw, shape_6d, false);

    // Move the grid axes in front of the channel axis:
    // [N, C, num_h, S, num_w, S] -> [N, num_h, num_w, C, S, S]
    auto grid_first_order = v0::Constant::create(ov::element::i64, ov::Shape{6}, std::vector<int64_t>{0, 2, 4, 1, 3, 5});
    auto grouped = std::make_shared<v1::Transpose>(split_spatial, grid_first_order);

    // Collapse [N, num_h, num_w] into a single batch axis: [-1, C, S, S]
    auto flatten_batch = v0::Constant::create(ov::element::i64, ov::Shape{1}, std::vector<int64_t>{-1});
    auto shape_4d = std::make_shared<v0::Concat>(ov::NodeVector{flatten_batch, C, S, S}, 0);
    return std::make_shared<v1::Reshape>(grouped, shape_4d, false);
}
636+
637+
// Stacks the processed global image in front of the HD slices along the
// batch axis:
//   global_processed (1, C, H, W) + hd_sliced (num_slices, C, H, W)
//     -> (1 + num_slices, C, H, W)
std::shared_ptr<ov::Node> create_concatenate_batch(std::shared_ptr<ov::Node> global_processed, std::shared_ptr<ov::Node> hd_sliced) {
    using namespace ov::op;

    constexpr int64_t batch_axis = 0;
    return std::make_shared<v0::Concat>(ov::NodeVector{global_processed, hd_sliced}, batch_axis);
}
646+
647+
// Builds a subgraph that zero-pads the batch (crops) axis of an NCHW tensor
// up to `max_crops_param`. When the batch already holds max_crops or more
// entries, the pad amount clamps to zero and the tensor passes through
// unchanged.
std::shared_ptr<ov::Node> create_pad_to_max_crops(std::shared_ptr<ov::Node> input_nchw, std::shared_ptr<ov::Node> max_crops_param) {
    using namespace ov::op;

    // Read the current batch size (number of crops) from the runtime shape.
    auto runtime_shape = std::make_shared<v3::ShapeOf>(input_nchw);
    auto batch_index = v0::Constant::create(ov::element::i64, ov::Shape{1}, std::vector<int64_t>{0});
    auto scalar_axis = v0::Constant::create(ov::element::i64, ov::Shape{}, std::vector<int64_t>{0});
    auto num_crops = std::make_shared<v8::Gather>(runtime_shape, batch_index, scalar_axis);

    // padding_needed = max(0, max_crops - num_crops)
    auto shortfall = std::make_shared<v1::Subtract>(max_crops_param, num_crops);
    auto lower_bound = v0::Constant::create(ov::element::i64, ov::Shape{1}, std::vector<int64_t>{0});
    auto padding_needed = std::make_shared<v1::Maximum>(shortfall, lower_bound);

    // Pad only at the end of the batch axis; C, H, W stay untouched.
    auto no_pad_begin = v0::Constant::create(ov::element::i64, ov::Shape{4}, std::vector<int64_t>{0, 0, 0, 0});
    auto chw_zeros = v0::Constant::create(ov::element::i64, ov::Shape{3}, std::vector<int64_t>{0, 0, 0});
    auto pads_end = std::make_shared<v0::Concat>(ov::OutputVector{padding_needed, chw_zeros}, 0);

    // Constant-mode pad filled with 0.0f.
    auto fill_value = v0::Constant::create(ov::element::f32, ov::Shape{}, std::vector<float>{0.0f});
    return std::make_shared<v1::Pad>(input_nchw, no_pad_begin, pads_end, fill_value, ov::op::PadMode::CONSTANT);
}
678+
679+
// Prepends the phi3-v image preprocessing pipeline (bicubic global resize,
// mean/std normalization, NHWC->NCHW transpose, HD slicing, batch concat,
// pad-to-max-crops) onto the vision encoder model, so the whole chain runs
// inside one compiled ov::Model instead of on the CPU host code.
// Returns a new model whose inputs are:
//   0: hd_image          — u8 NHWC {1, H, W, 3}, the HD-transformed image
//   1: global_target_size — i64 {2}, [height, width] for the global resize
//   2: max_crops         — i64 scalar, target batch size for padding
std::shared_ptr<ov::Model> patch_image_preprocess_into_vision_encoder_model(
    const std::shared_ptr<ov::Model>& vision_encoder_model,
    const ProcessorConfig& config) {

    using namespace ov;
    using namespace ov::op;

    // Input: HD transformed image in NHWC format (uint8)
    auto hd_image = std::make_shared<v0::Parameter>(element::u8, PartialShape{1, -1, -1, 3});
    // Target size for global image resize [height, width]
    auto global_target_size = std::make_shared<v0::Parameter>(element::i64, PartialShape{2});
    // Max crops parameter for dynamic padding
    auto max_crops = std::make_shared<v0::Parameter>(element::i64, PartialShape{});

    // Process global image (resize + normalize + channels_first)
    auto global_resized = create_bicubic_resize(hd_image, global_target_size);
    auto global_normalized = create_mean_scale(global_resized, config);
    auto global_processed = create_channels_first(global_normalized);

    // Process HD image (normalize + channels_first + slice)
    auto hd_normalized = create_mean_scale(hd_image, config);
    auto hd_processed = create_channels_first(hd_normalized);
    auto hd_sliced = create_slice_image(hd_processed);

    // Concatenate global and HD results on GPU
    auto concatenated = create_concatenate_batch(global_processed, hd_sliced);

    // Pad to max crops on GPU
    auto padded_result = create_pad_to_max_crops(concatenated, max_crops);

    auto vision_params = vision_encoder_model->get_parameters();
    auto vision_results = vision_encoder_model->get_results();

    // Rewire every consumer of the encoder's first parameter to read from the
    // preprocessing subgraph instead.
    // NOTE(review): assumes parameter 0 of the vision encoder is the
    // pixel_values input — confirm against the exported model's signature.
    vision_params[0]->output(0).replace(padded_result);

    // Rebuild the model with the original results but the three new
    // preprocessing parameters; the old pixel_values parameter is dropped.
    return std::make_shared<Model>(
        vision_results,
        ParameterVector{hd_image, global_target_size, max_crops}
    );
}
719+
531720
} // namespace
532721

533722
namespace phi_utils {
@@ -694,8 +883,27 @@ EncodedImage VisionEncoderPhi3V::encode(const ov::Tensor& image, const ov::AnyMa
694883
ov::InferRequest& encoder = infer_request_guard.get();
695884
ProcessorConfig config = utils::from_any_map(config_map, m_processor_config);
696885

697-
const auto& [pixel_values, image_size] = get_pixel_values_phi3_v(image, config);
698-
encoder.set_input_tensor(pixel_values);
886+
ImageSize image_size;
887+
888+
if (use_ov_image_preprocess) {
889+
ov::Tensor hd_image = HD_transform(image, config.phi3_v.num_crops);
890+
image_size = ImageSize{hd_image.get_shape().at(2), hd_image.get_shape().at(1)};
891+
892+
uint64_t global_size[2] = {INPUT_IMAGE_SIZE, INPUT_IMAGE_SIZE};
893+
ov::Tensor global_target_size(ov::element::i64, ov::Shape{2}, global_size);
894+
895+
int64_t max_crops_value = static_cast<int64_t>(config.phi3_v.num_crops);
896+
ov::Tensor max_crops_tensor(ov::element::i64, ov::Shape{}, &max_crops_value);
897+
898+
encoder.set_input_tensor(0, hd_image);
899+
encoder.set_input_tensor(1, global_target_size);
900+
encoder.set_input_tensor(2, max_crops_tensor);
901+
} else {
902+
const auto& [pixel_values, is] = get_pixel_values_phi3_v(image, config);
903+
image_size = is;
904+
encoder.set_input_tensor(pixel_values);
905+
}
906+
699907
ov::Tensor res{ov::element::f32, encoder.get_output_tensor().get_shape()};
700908
encoder.set_output_tensor(res);
701909
encoder.infer();
@@ -710,10 +918,27 @@ EncodedImage VisionEncoderPhi3V::encode(const ov::Tensor& image, const ov::AnyMa
710918
return encoded_image;
711919
}
712920

713-
VisionEncoderPhi3V::VisionEncoderPhi3V(
714-
const std::filesystem::path& model_dir,
715-
const std::string& device,
716-
const ov::AnyMap properties) : VisionEncoder(model_dir, device, properties) {
921+
// Returns true (use the OV-graph preprocessing path) unless the
// IMAGE_PREPROCESS environment variable is set to exactly "CPP", which
// selects the legacy C++ host-side preprocessing.
inline bool check_image_preprocess_env() {
    const char* value = std::getenv("IMAGE_PREPROCESS");
    if (value == nullptr) {
        return true;
    }
    return std::string(value) != "CPP";
}
925+
926+
VisionEncoderPhi3V::VisionEncoderPhi3V(const std::filesystem::path& model_dir,
927+
const std::string& device,
928+
const ov::AnyMap properties)
929+
: VisionEncoder(model_dir, device, properties),
930+
use_ov_image_preprocess(check_image_preprocess_env()) {
931+
if (use_ov_image_preprocess) {
932+
auto vision_encoder_model = utils::singleton_core().read_model(model_dir / "openvino_vision_embeddings_model.xml");
933+
auto model = patch_image_preprocess_into_vision_encoder_model(vision_encoder_model, m_processor_config);
934+
auto compiled_model = utils::singleton_core().compile_model(model, device, properties);
935+
m_ireq_queue_vision_encoder = std::make_unique<CircularBufferQueue<ov::InferRequest>>(
936+
compiled_model.get_property(ov::optimal_number_of_infer_requests),
937+
[&compiled_model]() -> ov::InferRequest {
938+
return compiled_model.create_infer_request();
939+
});
940+
}
941+
717942
auto compiled_model = create_hd_feature_transformer();
718943
m_ireq_queue_hd_feature_transformer = std::make_unique<CircularBufferQueue<ov::InferRequest>>(
719944
compiled_model.get_property(ov::optimal_number_of_infer_requests),
@@ -730,11 +955,25 @@ VisionEncoderPhi3V::VisionEncoderPhi3V(
730955
m_vlm_config = utils::from_config_json_if_exists<VLMConfig>(model_dir, "config.json");
731956
}
732957

733-
VisionEncoderPhi3V::VisionEncoderPhi3V(
734-
const ModelsMap& models_map,
735-
const std::filesystem::path& config_dir_path,
736-
const std::string& device,
737-
const ov::AnyMap properties) : VisionEncoder(models_map, config_dir_path, device, properties) {
958+
VisionEncoderPhi3V::VisionEncoderPhi3V(const ModelsMap& models_map,
959+
const std::filesystem::path& config_dir_path,
960+
const std::string& device,
961+
const ov::AnyMap properties)
962+
: VisionEncoder(models_map, config_dir_path, device, properties),
963+
use_ov_image_preprocess(check_image_preprocess_env()) {
964+
if (use_ov_image_preprocess) {
965+
const auto& [vision_encoder_model, vision_encoder_weights] = utils::get_model_weights_pair(models_map, "vision_embeddings");
966+
auto model_org = utils::singleton_core().read_model(vision_encoder_model, vision_encoder_weights);
967+
auto model = patch_image_preprocess_into_vision_encoder_model(model_org, m_processor_config);
968+
auto compiled_model = utils::singleton_core().compile_model(model, device, properties);
969+
970+
m_ireq_queue_vision_encoder = std::make_unique<CircularBufferQueue<ov::InferRequest>>(
971+
compiled_model.get_property(ov::optimal_number_of_infer_requests),
972+
[&compiled_model]() -> ov::InferRequest {
973+
return compiled_model.create_infer_request();
974+
});
975+
}
976+
738977
auto compiled_model = create_hd_feature_transformer();
739978
m_ireq_queue_hd_feature_transformer = std::make_unique<CircularBufferQueue<ov::InferRequest>>(
740979
compiled_model.get_property(ov::optimal_number_of_infer_requests),

src/cpp/src/visual_language/phi3_vision/classes.hpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,10 @@ class VisionEncoderPhi3V : public VisionEncoder {
4040
const ov::AnyMap properties);
4141

4242
EncodedImage encode(const ov::Tensor& image, const ov::AnyMap& config_map) override;
43+
44+
private:
45+
bool use_ov_image_preprocess = true; // Defaults to the OV-graph image preprocessing; set env IMAGE_PREPROCESS=CPP to fall back to the C++ host-side preprocessing.
46+
4347
};
4448

4549
class InputsEmbedderPhi3V : public InputsEmbedder::IInputsEmbedder {

0 commit comments

Comments
 (0)