@@ -528,6 +528,29 @@ ov::Tensor hd_feature_transform(const EncodedImage& image_features, InferRequest
528528 return res;
529529}
530530
// Decide whether image preprocessing should be fused into the GPU graph.
//
// Returns true only when `device` names a GPU ("GPU", "GPU.0", "GPU.1", ...)
// and the IMAGE_PREPROCESS environment variable does not force CPU
// preprocessing ("cpu" / "CPU"). All other devices always preprocess on CPU.
bool should_use_gpu_preprocessing(const std::string& device) {
    // A single prefix test covers both the exact "GPU" match and the
    // "GPU.N" variants (the original `device == "GPU" || find(...) == 0`
    // duplicated the exact-match case).
    const bool is_gpu_device = device.rfind("GPU", 0) == 0;
    if (!is_gpu_device) {
        // Always use CPU preprocessing for non-GPU devices.
        return false;
    }

    // For GPU devices, an environment override can force CPU preprocessing.
    if (const char* env_var = std::getenv("IMAGE_PREPROCESS")) {
        const std::string env_value{env_var};
        if (env_value == "cpu" || env_value == "CPU") {
            return false;
        }
    }

    // GPU device and not explicitly disabled -> use GPU preprocessing.
    return true;
}
553+
531554} // namespace
532555
533556namespace phi_utils {
@@ -694,8 +717,27 @@ EncodedImage VisionEncoderPhi3V::encode(const ov::Tensor& image, const ov::AnyMa
694717 ov::InferRequest& encoder = infer_request_guard.get ();
695718 ProcessorConfig config = utils::from_any_map (config_map, m_processor_config);
696719
697- const auto & [pixel_values, image_size] = get_pixel_values_phi3_v (image, config);
698- encoder.set_input_tensor (pixel_values);
720+ ImageSize image_size;
721+
722+ if (use_ov_image_preprocess) {
723+ ov::Tensor hd_image = HD_transform (image, config.phi3_v .num_crops );
724+ image_size = ImageSize{hd_image.get_shape ().at (2 ), hd_image.get_shape ().at (1 )};
725+
726+ uint64_t global_size[2 ] = {INPUT_IMAGE_SIZE, INPUT_IMAGE_SIZE};
727+ ov::Tensor global_target_size (ov::element::i64 , ov::Shape{2 }, global_size);
728+
729+ int64_t max_crops_value = static_cast <int64_t >(config.phi3_v .num_crops );
730+ ov::Tensor max_crops_tensor (ov::element::i64 , ov::Shape{}, &max_crops_value);
731+
732+ encoder.set_input_tensor (0 , hd_image);
733+ encoder.set_input_tensor (1 , global_target_size);
734+ encoder.set_input_tensor (2 , max_crops_tensor);
735+ } else {
736+ const auto & [pixel_values, is] = get_pixel_values_phi3_v (image, config);
737+ image_size = is;
738+ encoder.set_input_tensor (pixel_values);
739+ }
740+
699741 ov::Tensor res{ov::element::f32 , encoder.get_output_tensor ().get_shape ()};
700742 encoder.set_output_tensor (res);
701743 encoder.infer ();
@@ -714,6 +756,20 @@ VisionEncoderPhi3V::VisionEncoderPhi3V(
714756 const std::filesystem::path& model_dir,
715757 const std::string& device,
716758 const ov::AnyMap properties) : VisionEncoder(model_dir, device, properties) {
759+ use_ov_image_preprocess = should_use_gpu_preprocessing (device);
760+ if (use_ov_image_preprocess) {
761+ auto vision_encoder_model = utils::singleton_core ().read_model (model_dir / " openvino_vision_embeddings_model.xml" );
762+
763+ auto combined_model = create_combined_preprocessing_vision_model (vision_encoder_model, m_processor_config);
764+ auto compiled_combined = utils::singleton_core ().compile_model (combined_model, device, properties);
765+
766+ m_ireq_queue_vision_encoder = std::make_unique<CircularBufferQueue<ov::InferRequest>>(
767+ compiled_combined.get_property (ov::optimal_number_of_infer_requests),
768+ [&compiled_combined]() -> ov::InferRequest {
769+ return compiled_combined.create_infer_request ();
770+ });
771+ }
772+
717773 auto compiled_model = create_hd_feature_transformer ();
718774 m_ireq_queue_hd_feature_transformer = std::make_unique<CircularBufferQueue<ov::InferRequest>>(
719775 compiled_model.get_property (ov::optimal_number_of_infer_requests),
@@ -735,6 +791,21 @@ VisionEncoderPhi3V::VisionEncoderPhi3V(
735791 const std::filesystem::path& config_dir_path,
736792 const std::string& device,
737793 const ov::AnyMap properties) : VisionEncoder(models_map, config_dir_path, device, properties) {
794+ use_ov_image_preprocess = should_use_gpu_preprocessing (device);
795+ if (use_ov_image_preprocess) {
796+ const auto & [model_path, weights_path] = utils::get_model_weights_pair (models_map, " vision_embeddings" );
797+ auto vision_encoder_model = utils::singleton_core ().read_model (model_path, weights_path);
798+
799+ auto combined_model = create_combined_preprocessing_vision_model (vision_encoder_model, m_processor_config);
800+ auto compiled_combined = utils::singleton_core ().compile_model (combined_model, device, properties);
801+
802+ m_ireq_queue_vision_encoder = std::make_unique<CircularBufferQueue<ov::InferRequest>>(
803+ compiled_combined.get_property (ov::optimal_number_of_infer_requests),
804+ [&compiled_combined]() -> ov::InferRequest {
805+ return compiled_combined.create_infer_request ();
806+ });
807+ }
808+
738809 auto compiled_model = create_hd_feature_transformer ();
739810 m_ireq_queue_hd_feature_transformer = std::make_unique<CircularBufferQueue<ov::InferRequest>>(
740811 compiled_model.get_property (ov::optimal_number_of_infer_requests),
@@ -753,6 +824,196 @@ VisionEncoderPhi3V::VisionEncoderPhi3V(
753824 m_vlm_config = utils::from_config_json_if_exists<VLMConfig>(config_dir_path, " config.json" );
754825}
755826
827+ std::shared_ptr<ov::Model> VisionEncoderPhi3V::create_combined_preprocessing_vision_model (
828+ const std::shared_ptr<ov::Model>& vision_encoder_model,
829+ const ProcessorConfig& config) {
830+
831+ using namespace ov ;
832+ using namespace ov ::op;
833+
834+ // Input: HD transformed image in NHWC format (uint8)
835+ auto hd_image = std::make_shared<v0::Parameter>(element::u8 , PartialShape{1 , -1 , -1 , 3 });
836+ // Target size for global image resize [height, width]
837+ auto global_target_size = std::make_shared<v0::Parameter>(element::i64 , PartialShape{2 });
838+ // Max crops parameter for dynamic padding
839+ auto max_crops = std::make_shared<v0::Parameter>(element::i64 , PartialShape{});
840+
841+ auto create_constant = [](auto element_type, const Shape& shape, const auto & data) {
842+ return v0::Constant::create (element_type, shape, data);
843+ };
844+
845+ auto create_bicubic_resize = [&](std::shared_ptr<Node> input, std::shared_ptr<Node> target_size) {
846+ // Convert to float32 before interpolation (required for bicubic)
847+ auto input_f32 = std::make_shared<v0::Convert>(input, element::f32 );
848+
849+ // For NHWC format, resize axes are [1, 2] (height, width dimensions)
850+ auto axes = create_constant (element::i64 , Shape{2 }, std::vector<int64_t >{1 , 2 });
851+
852+ v11::Interpolate::InterpolateAttrs attrs;
853+ attrs.mode = v11::Interpolate::InterpolateMode::CUBIC;
854+ attrs.shape_calculation_mode = v11::Interpolate::ShapeCalcMode::SIZES;
855+ attrs.coordinate_transformation_mode = v11::Interpolate::CoordinateTransformMode::ASYMMETRIC;
856+ attrs.cube_coeff = -0 .5f ; // Standard bicubic coefficient
857+ attrs.nearest_mode = v11::Interpolate::NearestMode::FLOOR;
858+ attrs.pads_begin = {0 , 0 };
859+ attrs.pads_end = {0 , 0 };
860+ attrs.antialias = false ;
861+
862+ return std::make_shared<v11::Interpolate>(input_f32, target_size, axes, attrs);
863+ };
864+
865+ // GPU implementation of mean_scale operation
866+ auto create_mean_scale = [&](std::shared_ptr<Node> input_u8_or_f32) {
867+ std::shared_ptr<Node> input_f32;
868+
869+ // Convert to float32 if input is uint8, otherwise use as-is
870+ if (input_u8_or_f32->get_element_type () == element::u8 ) {
871+ input_f32 = std::make_shared<v0::Convert>(input_u8_or_f32, element::f32 );
872+ } else {
873+ input_f32 = input_u8_or_f32;
874+ }
875+
876+ // Follow the original mean_scale() function logic exactly:
877+ // (float(uint_8_data[idx]) / 255.0f - config.image_mean[c]) / config.image_std[c]
878+ // Step 1: x / 255.0
879+ auto scale_255 = create_constant (element::f32 , Shape{}, std::vector<float >{255 .0f });
880+ auto divided_by_255 = std::make_shared<v1::Divide>(input_f32, scale_255);
881+
882+ // Step 2: Create mean and std constants [R, G, B] - broadcasted along channel dimension
883+ // For NHWC format, we need shape [1, 1, 1, 3] to broadcast correctly
884+ auto mean_const = create_constant (element::f32 , Shape{1 , 1 , 1 , 3 },
885+ std::vector<float >{config.image_mean [0 ], config.image_mean [1 ], config.image_mean [2 ]});
886+ auto std_const = create_constant (element::f32 , Shape{1 , 1 , 1 , 3 },
887+ std::vector<float >{config.image_std [0 ], config.image_std [1 ], config.image_std [2 ]});
888+
889+ // Step 3: (x/255.0 - mean)
890+ auto mean_subtracted = std::make_shared<v1::Subtract>(divided_by_255, mean_const);
891+
892+ // Step 4: (x/255.0 - mean) / std
893+ auto result = std::make_shared<v1::Divide>(mean_subtracted, std_const);
894+
895+ return result;
896+ };
897+
898+ auto create_channels_first = [&](std::shared_ptr<Node> input_nhwc) {
899+ // Transpose from NHWC (0,1,2,3) to NCHW (0,3,1,2)
900+ auto transpose_order = create_constant (element::i64 , Shape{4 }, std::vector<int64_t >{0 , 3 , 1 , 2 });
901+ return std::make_shared<v1::Transpose>(input_nhwc, transpose_order);
902+ };
903+
904+ auto create_slice_image = [&](std::shared_ptr<Node> input_nchw) {
905+ // Input: (N, C, H, W) -> Output: (N*num_h_slices*num_w_slices, C, 336, 336)
906+ auto shape_node = std::make_shared<v3::ShapeOf>(input_nchw);
907+ // Index constants for gathering shape dimensions
908+ auto axis_0 = create_constant (element::i64 , Shape{1 }, std::vector<int64_t >{0 }); // N
909+ auto axis_1 = create_constant (element::i64 , Shape{1 }, std::vector<int64_t >{1 }); // C
910+ auto axis_2 = create_constant (element::i64 , Shape{1 }, std::vector<int64_t >{2 }); // H
911+ auto axis_3 = create_constant (element::i64 , Shape{1 }, std::vector<int64_t >{3 }); // W
912+ auto axis_0_node = create_constant (element::i64 , Shape{}, std::vector<int64_t >{0 }); // Gather axis
913+
914+ auto N = std::make_shared<v8::Gather>(shape_node, axis_0, axis_0_node);
915+ auto C = std::make_shared<v8::Gather>(shape_node, axis_1, axis_0_node);
916+ auto H = std::make_shared<v8::Gather>(shape_node, axis_2, axis_0_node);
917+ auto W = std::make_shared<v8::Gather>(shape_node, axis_3, axis_0_node);
918+
919+ // Patch size constant (336)
920+ auto S = create_constant (element::i64 , Shape{1 }, std::vector<int64_t >{INPUT_IMAGE_SIZE});
921+
922+ // Calculate number of slices (num_h = H / S, num_w = W / S)
923+ auto num_h = std::make_shared<v1::Divide>(H, S);
924+ auto num_w = std::make_shared<v1::Divide>(W, S);
925+
926+ // Reshape to 6D [N, C, num_h, S, num_w, S]
927+ auto target_shape_6d = std::make_shared<v0::Concat>(NodeVector{N, C, num_h, S, num_w, S}, 0 );
928+ auto reshape_6d = std::make_shared<v1::Reshape>(input_nchw, target_shape_6d, false );
929+
930+ // Transpose (Permute)
931+ // Current: 0:N, 1:C, 2:num_h, 3:S, 4:num_w, 5:S
932+ // Target: 0:N, 2:num_h, 4:num_w, 1:C, 3:S, 5:S
933+ auto permute_order = create_constant (element::i64 , Shape{6 }, std::vector<int64_t >{0 , 2 , 4 , 1 , 3 , 5 });
934+ auto permuted = std::make_shared<v1::Transpose>(reshape_6d, permute_order);
935+
936+ // Flatten to 4D [N * num_h * num_w, C, S, S]
937+ auto minus_one = create_constant (element::i64 , Shape{1 }, std::vector<int64_t >{-1 });
938+ auto target_shape_4d = std::make_shared<v0::Concat>(NodeVector{minus_one, C, S, S}, 0 );
939+ auto final_reshape = std::make_shared<v1::Reshape>(permuted, target_shape_4d, false );
940+
941+ return final_reshape;
942+ };
943+
944+ auto create_concatenate_batch = [&](std::shared_ptr<Node> global_processed, std::shared_ptr<Node> hd_sliced) {
945+ // Concatenate along batch dimension (axis 0)
946+ // global_processed: (1, C, H, W)
947+ // hd_sliced: (num_slices, C, H, W)
948+ // Output: (1 + num_slices, C, H, W)
949+ return std::make_shared<v0::Concat>(NodeVector{global_processed, hd_sliced}, 0 );
950+ };
951+
952+ auto create_pad_to_max_crops = [&](std::shared_ptr<Node> input_nchw, std::shared_ptr<Node> max_crops_param) {
953+ auto create_constant_i64 = [](const std::vector<int64_t >& val) {
954+ return v0::Constant::create (element::i64 , Shape{val.size ()}, val);
955+ };
956+
957+ // Get current input batch size (num_crops)
958+ auto shape_of = std::make_shared<v3::ShapeOf>(input_nchw);
959+ auto axis_0 = create_constant_i64 ({0 });
960+ auto axis_0_scalar = v0::Constant::create (element::i64 , {}, {0 }); // Axis for Gather
961+ auto num_crops = std::make_shared<v8::Gather>(shape_of, axis_0, axis_0_scalar);
962+
963+ // Calculate required padding amount: padding_needed = max(0, max_crops - num_crops)
964+ // If num_crops >= max_crops, the result will be 0.
965+ auto diff = std::make_shared<v1::Subtract>(max_crops_param, num_crops);
966+ auto zero = create_constant_i64 ({0 });
967+ auto padding_needed = std::make_shared<v1::Maximum>(diff, zero);
968+
969+ // Configure Pad operation arguments (pads_end)
970+ // pads_begin: [0, 0, 0, 0]
971+ // pads_end: [padding_needed, 0, 0, 0]
972+ auto zero_3 = create_constant_i64 ({0 , 0 , 0 }); // Zeros for C, H, W dimensions
973+ auto zero_4 = create_constant_i64 ({0 , 0 , 0 , 0 }); // pads_begin
974+ auto pads_end = std::make_shared<v0::Concat>(OutputVector{padding_needed, zero_3}, 0 );
975+
976+ // Execute Pad operation (Constant mode, fill with 0)
977+ auto pad_value = v0::Constant::create (element::f32 , Shape{}, {0 .0f });
978+
979+ auto padded = std::make_shared<v1::Pad>(
980+ input_nchw,
981+ zero_4, // pads_begin
982+ pads_end, // pads_end
983+ pad_value, // pad_value
984+ op::PadMode::CONSTANT
985+ );
986+
987+ return padded;
988+ };
989+
990+ // Process global image (resize + normalize + channels_first)
991+ auto global_resized = create_bicubic_resize (hd_image, global_target_size);
992+ auto global_normalized = create_mean_scale (global_resized);
993+ auto global_processed = create_channels_first (global_normalized);
994+
995+ // Process HD image (normalize + channels_first + slice)
996+ auto hd_normalized = create_mean_scale (hd_image);
997+ auto hd_processed = create_channels_first (hd_normalized);
998+ auto hd_sliced = create_slice_image (hd_processed);
999+
1000+ // Concatenate global and HD results on GPU
1001+ auto concatenated = create_concatenate_batch (global_processed, hd_sliced);
1002+
1003+ // Pad to max crops on GPU
1004+ auto padded_result = create_pad_to_max_crops (concatenated, max_crops);
1005+
1006+ auto vision_params = vision_encoder_model->get_parameters ();
1007+ auto vision_results = vision_encoder_model->get_results ();
1008+
1009+ vision_params[0 ]->output (0 ).replace (padded_result);
1010+
1011+ return std::make_shared<Model>(
1012+ vision_results,
1013+ ParameterVector{hd_image, global_target_size, max_crops}
1014+ );
1015+ }
1016+
7561017InputsEmbedderPhi3V::InputsEmbedderPhi3V (
7571018 const VLMConfig& vlm_config,
7581019 const std::filesystem::path& model_dir,
0 commit comments