@@ -528,6 +528,195 @@ ov::Tensor hd_feature_transform(const EncodedImage& image_features, InferRequest
528528 return res;
529529}
530530
531+ std::shared_ptr<ov::Node> create_bicubic_resize (std::shared_ptr<ov::Node> input, std::shared_ptr<ov::Node> target_size) {
532+ using namespace ov ::op;
533+
534+ // Convert to float32 before interpolation (required for bicubic)
535+ auto input_f32 = std::make_shared<v0::Convert>(input, ov::element::f32 );
536+
537+ // For NHWC format, resize axes are [1, 2] (height, width dimensions)
538+ auto axes = v0::Constant::create (ov::element::i64 , ov::Shape{2 }, std::vector<int64_t >{1 , 2 });
539+
540+ v11::Interpolate::InterpolateAttrs attrs;
541+ attrs.mode = v11::Interpolate::InterpolateMode::CUBIC;
542+ attrs.shape_calculation_mode = v11::Interpolate::ShapeCalcMode::SIZES;
543+ attrs.coordinate_transformation_mode = v11::Interpolate::CoordinateTransformMode::ASYMMETRIC;
544+ attrs.cube_coeff = -0 .5f ; // Standard bicubic coefficient
545+ attrs.nearest_mode = v11::Interpolate::NearestMode::FLOOR;
546+ attrs.pads_begin = {0 , 0 };
547+ attrs.pads_end = {0 , 0 };
548+ attrs.antialias = false ;
549+
550+ return std::make_shared<v11::Interpolate>(input_f32, target_size, axes, attrs);
551+ }
552+
553+ std::shared_ptr<ov::Node> create_mean_scale (std::shared_ptr<ov::Node> input_u8_or_f32, const ProcessorConfig& config) {
554+ using namespace ov ::op;
555+
556+ std::shared_ptr<ov::Node> input_f32;
557+
558+ // Convert to float32 if input is uint8, otherwise use as-is
559+ if (input_u8_or_f32->get_element_type () == ov::element::u8 ) {
560+ input_f32 = std::make_shared<v0::Convert>(input_u8_or_f32, ov::element::f32 );
561+ } else {
562+ input_f32 = input_u8_or_f32;
563+ }
564+
565+ // Follow the original mean_scale() function logic exactly:
566+ // (float(uint_8_data[idx]) / 255.0f - config.image_mean[c]) / config.image_std[c]
567+ // Step 1: x / 255.0
568+ auto scale_255 = v0::Constant::create (ov::element::f32 , ov::Shape{}, std::vector<float >{255 .0f });
569+ auto divided_by_255 = std::make_shared<v1::Divide>(input_f32, scale_255);
570+
571+ // Step 2: Create mean and std constants [R, G, B] - broadcasted along channel dimension
572+ // For NHWC format, we need shape [1, 1, 1, 3] to broadcast correctly
573+ auto mean_const = v0::Constant::create (ov::element::f32 , ov::Shape{1 , 1 , 1 , 3 },
574+ std::vector<float >{config.image_mean [0 ], config.image_mean [1 ], config.image_mean [2 ]});
575+ auto std_const = v0::Constant::create (ov::element::f32 , ov::Shape{1 , 1 , 1 , 3 },
576+ std::vector<float >{config.image_std [0 ], config.image_std [1 ], config.image_std [2 ]});
577+
578+ // Step 3: (x/255.0 - mean)
579+ auto mean_subtracted = std::make_shared<v1::Subtract>(divided_by_255, mean_const);
580+
581+ // Step 4: (x/255.0 - mean) / std
582+ auto result = std::make_shared<v1::Divide>(mean_subtracted, std_const);
583+
584+ return result;
585+ }
586+
587+ std::shared_ptr<ov::Node> create_channels_first (std::shared_ptr<ov::Node> input_nhwc) {
588+ using namespace ov ::op;
589+
590+ // Transpose from NHWC (0,1,2,3) to NCHW (0,3,1,2)
591+ auto transpose_order = v0::Constant::create (ov::element::i64 , ov::Shape{4 }, std::vector<int64_t >{0 , 3 , 1 , 2 });
592+ return std::make_shared<v1::Transpose>(input_nhwc, transpose_order);
593+ }
594+
595+ std::shared_ptr<ov::Node> create_slice_image (std::shared_ptr<ov::Node> input_nchw) {
596+ using namespace ov ::op;
597+
598+ // Input: (N, C, H, W) -> Output: (N*num_h_slices*num_w_slices, C, 336, 336)
599+ auto shape_node = std::make_shared<v3::ShapeOf>(input_nchw);
600+ // Index constants for gathering shape dimensions
601+ auto axis_0 = v0::Constant::create (ov::element::i64 , ov::Shape{1 }, std::vector<int64_t >{0 }); // N
602+ auto axis_1 = v0::Constant::create (ov::element::i64 , ov::Shape{1 }, std::vector<int64_t >{1 }); // C
603+ auto axis_2 = v0::Constant::create (ov::element::i64 , ov::Shape{1 }, std::vector<int64_t >{2 }); // H
604+ auto axis_3 = v0::Constant::create (ov::element::i64 , ov::Shape{1 }, std::vector<int64_t >{3 }); // W
605+ auto axis_0_node = v0::Constant::create (ov::element::i64 , ov::Shape{}, std::vector<int64_t >{0 }); // Gather axis
606+
607+ auto N = std::make_shared<v8::Gather>(shape_node, axis_0, axis_0_node);
608+ auto C = std::make_shared<v8::Gather>(shape_node, axis_1, axis_0_node);
609+ auto H = std::make_shared<v8::Gather>(shape_node, axis_2, axis_0_node);
610+ auto W = std::make_shared<v8::Gather>(shape_node, axis_3, axis_0_node);
611+
612+ // Patch size constant (336)
613+ auto S = v0::Constant::create (ov::element::i64 , ov::Shape{1 }, std::vector<int64_t >{INPUT_IMAGE_SIZE});
614+
615+ // Calculate number of slices (num_h = H / S, num_w = W / S)
616+ auto num_h = std::make_shared<v1::Divide>(H, S);
617+ auto num_w = std::make_shared<v1::Divide>(W, S);
618+
619+ // Reshape to 6D [N, C, num_h, S, num_w, S]
620+ auto target_shape_6d = std::make_shared<v0::Concat>(ov::NodeVector{N, C, num_h, S, num_w, S}, 0 );
621+ auto reshape_6d = std::make_shared<v1::Reshape>(input_nchw, target_shape_6d, false );
622+
623+ // Transpose (Permute)
624+ // Current: 0:N, 1:C, 2:num_h, 3:S, 4:num_w, 5:S
625+ // Target: 0:N, 2:num_h, 4:num_w, 1:C, 3:S, 5:S
626+ auto permute_order = v0::Constant::create (ov::element::i64 , ov::Shape{6 }, std::vector<int64_t >{0 , 2 , 4 , 1 , 3 , 5 });
627+ auto permuted = std::make_shared<v1::Transpose>(reshape_6d, permute_order);
628+
629+ // Flatten to 4D [N * num_h * num_w, C, S, S]
630+ auto minus_one = v0::Constant::create (ov::element::i64 , ov::Shape{1 }, std::vector<int64_t >{-1 });
631+ auto target_shape_4d = std::make_shared<v0::Concat>(ov::NodeVector{minus_one, C, S, S}, 0 );
632+ auto final_reshape = std::make_shared<v1::Reshape>(permuted, target_shape_4d, false );
633+
634+ return final_reshape;
635+ }
636+
637+ std::shared_ptr<ov::Node> create_concatenate_batch (std::shared_ptr<ov::Node> global_processed, std::shared_ptr<ov::Node> hd_sliced) {
638+ using namespace ov ::op;
639+
640+ // Concatenate along batch dimension (axis 0)
641+ // global_processed: (1, C, H, W)
642+ // hd_sliced: (num_slices, C, H, W)
643+ // Output: (1 + num_slices, C, H, W)
644+ return std::make_shared<v0::Concat>(ov::NodeVector{global_processed, hd_sliced}, 0 );
645+ }
646+
647+ std::shared_ptr<ov::Node> create_pad_to_max_crops (std::shared_ptr<ov::Node> input_nchw, std::shared_ptr<ov::Node> max_crops_param) {
648+ using namespace ov ::op;
649+
650+ // Get current input batch size (num_crops)
651+ auto shape_of = std::make_shared<v3::ShapeOf>(input_nchw);
652+ auto axis_0 = v0::Constant::create (ov::element::i64 , ov::Shape{1 }, std::vector<int64_t >{0 });
653+ auto axis_0_scalar = v0::Constant::create (ov::element::i64 , ov::Shape{}, std::vector<int64_t >{0 }); // Axis for Gather
654+ auto num_crops = std::make_shared<v8::Gather>(shape_of, axis_0, axis_0_scalar);
655+
656+ // Calculate required padding amount: padding_needed = max(0, max_crops - num_crops)
657+ auto diff = std::make_shared<v1::Subtract>(max_crops_param, num_crops);
658+ auto zero = v0::Constant::create (ov::element::i64 , ov::Shape{1 }, std::vector<int64_t >{0 });
659+ auto padding_needed = std::make_shared<v1::Maximum>(diff, zero);
660+
661+ // Configure Pad operation arguments (pads_end)
662+ auto zero_3 = v0::Constant::create (ov::element::i64 , ov::Shape{3 }, std::vector<int64_t >{0 , 0 , 0 }); // Zeros for C, H, W dimensions
663+ auto zero_4 = v0::Constant::create (ov::element::i64 , ov::Shape{4 }, std::vector<int64_t >{0 , 0 , 0 , 0 }); // pads_begin
664+ auto pads_end = std::make_shared<v0::Concat>(ov::OutputVector{padding_needed, zero_3}, 0 );
665+
666+ // Execute Pad operation (Constant mode, fill with 0)
667+ auto pad_value = v0::Constant::create (ov::element::f32 , ov::Shape{}, std::vector<float >{0 .0f });
668+ auto padded = std::make_shared<v1::Pad>(
669+ input_nchw,
670+ zero_4, // pads_begin
671+ pads_end, // pads_end
672+ pad_value, // pad_value
673+ ov::op::PadMode::CONSTANT
674+ );
675+
676+ return padded;
677+ }
678+
679+ std::shared_ptr<ov::Model> patch_image_preprocess_into_vision_encoder_model (
680+ const std::shared_ptr<ov::Model>& vision_encoder_model,
681+ const ProcessorConfig& config) {
682+
683+ using namespace ov ;
684+ using namespace ov ::op;
685+
686+ // Input: HD transformed image in NHWC format (uint8)
687+ auto hd_image = std::make_shared<v0::Parameter>(element::u8 , PartialShape{1 , -1 , -1 , 3 });
688+ // Target size for global image resize [height, width]
689+ auto global_target_size = std::make_shared<v0::Parameter>(element::i64 , PartialShape{2 });
690+ // Max crops parameter for dynamic padding
691+ auto max_crops = std::make_shared<v0::Parameter>(element::i64 , PartialShape{});
692+
693+ // Process global image (resize + normalize + channels_first)
694+ auto global_resized = create_bicubic_resize (hd_image, global_target_size);
695+ auto global_normalized = create_mean_scale (global_resized, config);
696+ auto global_processed = create_channels_first (global_normalized);
697+
698+ // Process HD image (normalize + channels_first + slice)
699+ auto hd_normalized = create_mean_scale (hd_image, config);
700+ auto hd_processed = create_channels_first (hd_normalized);
701+ auto hd_sliced = create_slice_image (hd_processed);
702+
703+ // Concatenate global and HD results on GPU
704+ auto concatenated = create_concatenate_batch (global_processed, hd_sliced);
705+
706+ // Pad to max crops on GPU
707+ auto padded_result = create_pad_to_max_crops (concatenated, max_crops);
708+
709+ auto vision_params = vision_encoder_model->get_parameters ();
710+ auto vision_results = vision_encoder_model->get_results ();
711+
712+ vision_params[0 ]->output (0 ).replace (padded_result);
713+
714+ return std::make_shared<Model>(
715+ vision_results,
716+ ParameterVector{hd_image, global_target_size, max_crops}
717+ );
718+ }
719+
531720} // namespace
532721
533722namespace phi_utils {
@@ -694,8 +883,27 @@ EncodedImage VisionEncoderPhi3V::encode(const ov::Tensor& image, const ov::AnyMa
694883 ov::InferRequest& encoder = infer_request_guard.get ();
695884 ProcessorConfig config = utils::from_any_map (config_map, m_processor_config);
696885
697- const auto & [pixel_values, image_size] = get_pixel_values_phi3_v (image, config);
698- encoder.set_input_tensor (pixel_values);
886+ ImageSize image_size;
887+
888+ if (use_ov_image_preprocess) {
889+ ov::Tensor hd_image = HD_transform (image, config.phi3_v .num_crops );
890+ image_size = ImageSize{hd_image.get_shape ().at (2 ), hd_image.get_shape ().at (1 )};
891+
892+ uint64_t global_size[2 ] = {INPUT_IMAGE_SIZE, INPUT_IMAGE_SIZE};
893+ ov::Tensor global_target_size (ov::element::i64 , ov::Shape{2 }, global_size);
894+
895+ int64_t max_crops_value = static_cast <int64_t >(config.phi3_v .num_crops );
896+ ov::Tensor max_crops_tensor (ov::element::i64 , ov::Shape{}, &max_crops_value);
897+
898+ encoder.set_input_tensor (0 , hd_image);
899+ encoder.set_input_tensor (1 , global_target_size);
900+ encoder.set_input_tensor (2 , max_crops_tensor);
901+ } else {
902+ const auto & [pixel_values, is] = get_pixel_values_phi3_v (image, config);
903+ image_size = is;
904+ encoder.set_input_tensor (pixel_values);
905+ }
906+
699907 ov::Tensor res{ov::element::f32 , encoder.get_output_tensor ().get_shape ()};
700908 encoder.set_output_tensor (res);
701909 encoder.infer ();
@@ -710,10 +918,27 @@ EncodedImage VisionEncoderPhi3V::encode(const ov::Tensor& image, const ov::AnyMa
710918 return encoded_image;
711919}
712920
713- VisionEncoderPhi3V::VisionEncoderPhi3V (
714- const std::filesystem::path& model_dir,
715- const std::string& device,
716- const ov::AnyMap properties) : VisionEncoder(model_dir, device, properties) {
// Returns true (use the OV-graph image preprocessing) unless the
// IMAGE_PREPROCESS environment variable is set to "CPP", which opts back
// into the original CPU-side C++ preprocessing path.
inline bool check_image_preprocess_env() {
    const char* value = std::getenv("IMAGE_PREPROCESS");
    if (value == nullptr) {
        return true;
    }
    return std::string(value) != "CPP";
}
925+
926+ VisionEncoderPhi3V::VisionEncoderPhi3V (const std::filesystem::path& model_dir,
927+ const std::string& device,
928+ const ov::AnyMap properties)
929+ : VisionEncoder(model_dir, device, properties),
930+ use_ov_image_preprocess (check_image_preprocess_env()) {
931+ if (use_ov_image_preprocess) {
932+ auto vision_encoder_model = utils::singleton_core ().read_model (model_dir / " openvino_vision_embeddings_model.xml" );
933+ auto model = patch_image_preprocess_into_vision_encoder_model (vision_encoder_model, m_processor_config);
934+ auto compiled_model = utils::singleton_core ().compile_model (model, device, properties);
935+ m_ireq_queue_vision_encoder = std::make_unique<CircularBufferQueue<ov::InferRequest>>(
936+ compiled_model.get_property (ov::optimal_number_of_infer_requests),
937+ [&compiled_model]() -> ov::InferRequest {
938+ return compiled_model.create_infer_request ();
939+ });
940+ }
941+
717942 auto compiled_model = create_hd_feature_transformer ();
718943 m_ireq_queue_hd_feature_transformer = std::make_unique<CircularBufferQueue<ov::InferRequest>>(
719944 compiled_model.get_property (ov::optimal_number_of_infer_requests),
@@ -730,11 +955,25 @@ VisionEncoderPhi3V::VisionEncoderPhi3V(
730955 m_vlm_config = utils::from_config_json_if_exists<VLMConfig>(model_dir, " config.json" );
731956}
732957
733- VisionEncoderPhi3V::VisionEncoderPhi3V (
734- const ModelsMap& models_map,
735- const std::filesystem::path& config_dir_path,
736- const std::string& device,
737- const ov::AnyMap properties) : VisionEncoder(models_map, config_dir_path, device, properties) {
958+ VisionEncoderPhi3V::VisionEncoderPhi3V (const ModelsMap& models_map,
959+ const std::filesystem::path& config_dir_path,
960+ const std::string& device,
961+ const ov::AnyMap properties)
962+ : VisionEncoder(models_map, config_dir_path, device, properties),
963+ use_ov_image_preprocess(check_image_preprocess_env()) {
964+ if (use_ov_image_preprocess) {
965+ const auto & [vision_encoder_model, vision_encoder_weights] = utils::get_model_weights_pair (models_map, " vision_embeddings" );
966+ auto model_org = utils::singleton_core ().read_model (vision_encoder_model, vision_encoder_weights);
967+ auto model = patch_image_preprocess_into_vision_encoder_model (model_org, m_processor_config);
968+ auto compiled_model = utils::singleton_core ().compile_model (model, device, properties);
969+
970+ m_ireq_queue_vision_encoder = std::make_unique<CircularBufferQueue<ov::InferRequest>>(
971+ compiled_model.get_property (ov::optimal_number_of_infer_requests),
972+ [&compiled_model]() -> ov::InferRequest {
973+ return compiled_model.create_infer_request ();
974+ });
975+ }
976+
738977 auto compiled_model = create_hd_feature_transformer ();
739978 m_ireq_queue_hd_feature_transformer = std::make_unique<CircularBufferQueue<ov::InferRequest>>(
740979 compiled_model.get_property (ov::optimal_number_of_infer_requests),
0 commit comments