Skip to content

Commit 95fcbcf

Browse files
committed
Optimize Image preprocessing by GPU
1 parent fc503c8 commit 95fcbcf

File tree

2 files changed

+271
-2
lines changed

2 files changed

+271
-2
lines changed

src/cpp/src/visual_language/phi3_vision/classes.cpp

Lines changed: 263 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -528,6 +528,29 @@ ov::Tensor hd_feature_transform(const EncodedImage& image_features, InferRequest
528528
return res;
529529
}
530530

531+
// Decides whether image preprocessing should run inside the compiled OpenVINO
// graph (GPU path) instead of on the host CPU.
//
// Returns true only for GPU devices ("GPU", "GPU.0", "GPU.1", ...). Setting the
// environment variable IMAGE_PREPROCESS to "cpu" or "CPU" forces the host-side
// CPU preprocessing path even on GPU devices.
bool should_use_gpu_preprocessing(const std::string& device) {
    // Prefix check covers both the exact "GPU" name and enumerated devices
    // like "GPU.0" (the original `device == "GPU"` test was redundant with it).
    if (device.rfind("GPU", 0) != 0) {
        // Always use CPU preprocessing for non-GPU devices.
        return false;
    }

    // For GPU devices, honor an explicit opt-out via environment variable.
    if (const char* env_var = std::getenv("IMAGE_PREPROCESS")) {
        const std::string env_value(env_var);
        // Force CPU preprocessing if set to "cpu" or "CPU".
        if (env_value == "cpu" || env_value == "CPU") {
            return false;
        }
    }

    // Use GPU preprocessing for GPU devices when not explicitly disabled.
    return true;
}
553+
531554
} // namespace
532555

533556
namespace phi_utils {
@@ -694,8 +717,27 @@ EncodedImage VisionEncoderPhi3V::encode(const ov::Tensor& image, const ov::AnyMa
694717
ov::InferRequest& encoder = infer_request_guard.get();
695718
ProcessorConfig config = utils::from_any_map(config_map, m_processor_config);
696719

697-
const auto& [pixel_values, image_size] = get_pixel_values_phi3_v(image, config);
698-
encoder.set_input_tensor(pixel_values);
720+
ImageSize image_size;
721+
722+
if (use_ov_image_preprocess) {
723+
ov::Tensor hd_image = HD_transform(image, config.phi3_v.num_crops);
724+
image_size = ImageSize{hd_image.get_shape().at(2), hd_image.get_shape().at(1)};
725+
726+
uint64_t global_size[2] = {INPUT_IMAGE_SIZE, INPUT_IMAGE_SIZE};
727+
ov::Tensor global_target_size(ov::element::i64, ov::Shape{2}, global_size);
728+
729+
int64_t max_crops_value = static_cast<int64_t>(config.phi3_v.num_crops);
730+
ov::Tensor max_crops_tensor(ov::element::i64, ov::Shape{}, &max_crops_value);
731+
732+
encoder.set_input_tensor(0, hd_image);
733+
encoder.set_input_tensor(1, global_target_size);
734+
encoder.set_input_tensor(2, max_crops_tensor);
735+
} else {
736+
const auto& [pixel_values, is] = get_pixel_values_phi3_v(image, config);
737+
image_size = is;
738+
encoder.set_input_tensor(pixel_values);
739+
}
740+
699741
ov::Tensor res{ov::element::f32, encoder.get_output_tensor().get_shape()};
700742
encoder.set_output_tensor(res);
701743
encoder.infer();
@@ -714,6 +756,20 @@ VisionEncoderPhi3V::VisionEncoderPhi3V(
714756
const std::filesystem::path& model_dir,
715757
const std::string& device,
716758
const ov::AnyMap properties) : VisionEncoder(model_dir, device, properties) {
759+
use_ov_image_preprocess = should_use_gpu_preprocessing(device);
760+
if (use_ov_image_preprocess) {
761+
auto vision_encoder_model = utils::singleton_core().read_model(model_dir / "openvino_vision_embeddings_model.xml");
762+
763+
auto combined_model = create_combined_preprocessing_vision_model(vision_encoder_model, m_processor_config);
764+
auto compiled_combined = utils::singleton_core().compile_model(combined_model, device, properties);
765+
766+
m_ireq_queue_vision_encoder = std::make_unique<CircularBufferQueue<ov::InferRequest>>(
767+
compiled_combined.get_property(ov::optimal_number_of_infer_requests),
768+
[&compiled_combined]() -> ov::InferRequest {
769+
return compiled_combined.create_infer_request();
770+
});
771+
}
772+
717773
auto compiled_model = create_hd_feature_transformer();
718774
m_ireq_queue_hd_feature_transformer = std::make_unique<CircularBufferQueue<ov::InferRequest>>(
719775
compiled_model.get_property(ov::optimal_number_of_infer_requests),
@@ -735,6 +791,21 @@ VisionEncoderPhi3V::VisionEncoderPhi3V(
735791
const std::filesystem::path& config_dir_path,
736792
const std::string& device,
737793
const ov::AnyMap properties) : VisionEncoder(models_map, config_dir_path, device, properties) {
794+
use_ov_image_preprocess = should_use_gpu_preprocessing(device);
795+
if (use_ov_image_preprocess) {
796+
const auto& [model_path, weights_path] = utils::get_model_weights_pair(models_map, "vision_embeddings");
797+
auto vision_encoder_model = utils::singleton_core().read_model(model_path, weights_path);
798+
799+
auto combined_model = create_combined_preprocessing_vision_model(vision_encoder_model, m_processor_config);
800+
auto compiled_combined = utils::singleton_core().compile_model(combined_model, device, properties);
801+
802+
m_ireq_queue_vision_encoder = std::make_unique<CircularBufferQueue<ov::InferRequest>>(
803+
compiled_combined.get_property(ov::optimal_number_of_infer_requests),
804+
[&compiled_combined]() -> ov::InferRequest {
805+
return compiled_combined.create_infer_request();
806+
});
807+
}
808+
738809
auto compiled_model = create_hd_feature_transformer();
739810
m_ireq_queue_hd_feature_transformer = std::make_unique<CircularBufferQueue<ov::InferRequest>>(
740811
compiled_model.get_property(ov::optimal_number_of_infer_requests),
@@ -753,6 +824,196 @@ VisionEncoderPhi3V::VisionEncoderPhi3V(
753824
m_vlm_config = utils::from_config_json_if_exists<VLMConfig>(config_dir_path, "config.json");
754825
}
755826

827+
/// @brief Builds one ov::Model that fuses the Phi-3-vision image preprocessing
///        (global bicubic resize, mean/std normalization, NHWC->NCHW transpose,
///        336x336 slicing, batch concat, pad-to-max-crops) with the vision
///        embeddings encoder, so the whole pipeline compiles for a single device.
///
/// Inputs of the combined model, in parameter order:
///   0: hd_image           - u8 NHWC tensor {1, H, W, 3}: HD-transformed image
///   1: global_target_size - i64 tensor {2}: [height, width] for the global resize
///   2: max_crops          - i64 scalar: batch size the output is padded up to
///
/// @param vision_encoder_model Original vision embeddings model; consumers of its
///        first parameter are rerouted to the preprocessing subgraph's output.
/// @param config Processor config supplying image_mean / image_std (RGB order).
/// @return New ov::Model exposing the three preprocessing parameters above and
///         the original encoder's results.
std::shared_ptr<ov::Model> VisionEncoderPhi3V::create_combined_preprocessing_vision_model(
    const std::shared_ptr<ov::Model>& vision_encoder_model,
    const ProcessorConfig& config) {

    using namespace ov;
    using namespace ov::op;

    // Input: HD transformed image in NHWC format (uint8)
    auto hd_image = std::make_shared<v0::Parameter>(element::u8, PartialShape{1, -1, -1, 3});
    // Target size for global image resize [height, width]
    auto global_target_size = std::make_shared<v0::Parameter>(element::i64, PartialShape{2});
    // Max crops parameter for dynamic padding
    auto max_crops = std::make_shared<v0::Parameter>(element::i64, PartialShape{});

    // Small helper so the constant creations below stay one-liners.
    auto create_constant = [](auto element_type, const Shape& shape, const auto& data) {
        return v0::Constant::create(element_type, shape, data);
    };

    // Bicubic resize of an NHWC tensor to `target_size` ([H, W]).
    auto create_bicubic_resize = [&](std::shared_ptr<Node> input, std::shared_ptr<Node> target_size) {
        // Convert to float32 before interpolation (required for bicubic)
        auto input_f32 = std::make_shared<v0::Convert>(input, element::f32);

        // For NHWC format, resize axes are [1, 2] (height, width dimensions)
        auto axes = create_constant(element::i64, Shape{2}, std::vector<int64_t>{1, 2});

        v11::Interpolate::InterpolateAttrs attrs;
        attrs.mode = v11::Interpolate::InterpolateMode::CUBIC;
        attrs.shape_calculation_mode = v11::Interpolate::ShapeCalcMode::SIZES;
        // NOTE(review): ASYMMETRIC + cube_coeff -0.5 is one particular bicubic
        // convention (PIL-like); confirm it matches the CPU reference path
        // (get_pixel_values_phi3_v) closely enough for numerical parity.
        attrs.coordinate_transformation_mode = v11::Interpolate::CoordinateTransformMode::ASYMMETRIC;
        attrs.cube_coeff = -0.5f; // Standard bicubic coefficient
        attrs.nearest_mode = v11::Interpolate::NearestMode::FLOOR;
        attrs.pads_begin = {0, 0};
        attrs.pads_end = {0, 0};
        attrs.antialias = false;

        return std::make_shared<v11::Interpolate>(input_f32, target_size, axes, attrs);
    };

    // GPU implementation of mean_scale operation
    auto create_mean_scale = [&](std::shared_ptr<Node> input_u8_or_f32) {
        std::shared_ptr<Node> input_f32;

        // Convert to float32 if input is uint8, otherwise use as-is
        if (input_u8_or_f32->get_element_type() == element::u8) {
            input_f32 = std::make_shared<v0::Convert>(input_u8_or_f32, element::f32);
        } else {
            input_f32 = input_u8_or_f32;
        }

        // Follow the original mean_scale() function logic exactly:
        // (float(uint_8_data[idx]) / 255.0f - config.image_mean[c]) / config.image_std[c]
        // Step 1: x / 255.0
        auto scale_255 = create_constant(element::f32, Shape{}, std::vector<float>{255.0f});
        auto divided_by_255 = std::make_shared<v1::Divide>(input_f32, scale_255);

        // Step 2: Create mean and std constants [R, G, B] - broadcasted along channel dimension
        // For NHWC format, we need shape [1, 1, 1, 3] to broadcast correctly
        auto mean_const = create_constant(element::f32, Shape{1, 1, 1, 3},
            std::vector<float>{config.image_mean[0], config.image_mean[1], config.image_mean[2]});
        auto std_const = create_constant(element::f32, Shape{1, 1, 1, 3},
            std::vector<float>{config.image_std[0], config.image_std[1], config.image_std[2]});

        // Step 3: (x/255.0 - mean)
        auto mean_subtracted = std::make_shared<v1::Subtract>(divided_by_255, mean_const);

        // Step 4: (x/255.0 - mean) / std
        auto result = std::make_shared<v1::Divide>(mean_subtracted, std_const);

        return result;
    };

    // Layout change NHWC -> NCHW.
    auto create_channels_first = [&](std::shared_ptr<Node> input_nhwc) {
        // Transpose from NHWC (0,1,2,3) to NCHW (0,3,1,2)
        auto transpose_order = create_constant(element::i64, Shape{4}, std::vector<int64_t>{0, 3, 1, 2});
        return std::make_shared<v1::Transpose>(input_nhwc, transpose_order);
    };

    // Cuts an NCHW image into non-overlapping SxS tiles, stacked on the batch axis.
    // Assumes H and W are exact multiples of INPUT_IMAGE_SIZE (guaranteed by
    // HD_transform upstream) -- TODO confirm, since integer Divide truncates.
    auto create_slice_image = [&](std::shared_ptr<Node> input_nchw) {
        // Input: (N, C, H, W) -> Output: (N*num_h_slices*num_w_slices, C, 336, 336)
        auto shape_node = std::make_shared<v3::ShapeOf>(input_nchw);
        // Index constants for gathering shape dimensions
        auto axis_0 = create_constant(element::i64, Shape{1}, std::vector<int64_t>{0}); // N
        auto axis_1 = create_constant(element::i64, Shape{1}, std::vector<int64_t>{1}); // C
        auto axis_2 = create_constant(element::i64, Shape{1}, std::vector<int64_t>{2}); // H
        auto axis_3 = create_constant(element::i64, Shape{1}, std::vector<int64_t>{3}); // W
        auto axis_0_node = create_constant(element::i64, Shape{}, std::vector<int64_t>{0}); // Gather axis

        auto N = std::make_shared<v8::Gather>(shape_node, axis_0, axis_0_node);
        auto C = std::make_shared<v8::Gather>(shape_node, axis_1, axis_0_node);
        auto H = std::make_shared<v8::Gather>(shape_node, axis_2, axis_0_node);
        auto W = std::make_shared<v8::Gather>(shape_node, axis_3, axis_0_node);

        // Patch size constant (336)
        auto S = create_constant(element::i64, Shape{1}, std::vector<int64_t>{INPUT_IMAGE_SIZE});

        // Calculate number of slices (num_h = H / S, num_w = W / S)
        auto num_h = std::make_shared<v1::Divide>(H, S);
        auto num_w = std::make_shared<v1::Divide>(W, S);

        // Reshape to 6D [N, C, num_h, S, num_w, S]
        auto target_shape_6d = std::make_shared<v0::Concat>(NodeVector{N, C, num_h, S, num_w, S}, 0);
        auto reshape_6d = std::make_shared<v1::Reshape>(input_nchw, target_shape_6d, false);

        // Transpose (Permute)
        // Current: 0:N, 1:C, 2:num_h, 3:S, 4:num_w, 5:S
        // Target: 0:N, 2:num_h, 4:num_w, 1:C, 3:S, 5:S
        auto permute_order = create_constant(element::i64, Shape{6}, std::vector<int64_t>{0, 2, 4, 1, 3, 5});
        auto permuted = std::make_shared<v1::Transpose>(reshape_6d, permute_order);

        // Flatten to 4D [N * num_h * num_w, C, S, S]
        auto minus_one = create_constant(element::i64, Shape{1}, std::vector<int64_t>{-1});
        auto target_shape_4d = std::make_shared<v0::Concat>(NodeVector{minus_one, C, S, S}, 0);
        auto final_reshape = std::make_shared<v1::Reshape>(permuted, target_shape_4d, false);

        return final_reshape;
    };

    auto create_concatenate_batch = [&](std::shared_ptr<Node> global_processed, std::shared_ptr<Node> hd_sliced) {
        // Concatenate along batch dimension (axis 0)
        // global_processed: (1, C, H, W)
        // hd_sliced: (num_slices, C, H, W)
        // Output: (1 + num_slices, C, H, W)
        return std::make_shared<v0::Concat>(NodeVector{global_processed, hd_sliced}, 0);
    };

    // Zero-pads the batch dimension up to max_crops so the encoder always sees a
    // fixed batch size. If the batch already meets/exceeds max_crops, no padding
    // is added (the tensor is NOT truncated -- presumably upstream guarantees
    // num_crops <= max_crops; verify against HD_transform).
    auto create_pad_to_max_crops = [&](std::shared_ptr<Node> input_nchw, std::shared_ptr<Node> max_crops_param) {
        auto create_constant_i64 = [](const std::vector<int64_t>& val) {
            return v0::Constant::create(element::i64, Shape{val.size()}, val);
        };

        // Get current input batch size (num_crops)
        auto shape_of = std::make_shared<v3::ShapeOf>(input_nchw);
        auto axis_0 = create_constant_i64({0});
        auto axis_0_scalar = v0::Constant::create(element::i64, {}, {0}); // Axis for Gather
        auto num_crops = std::make_shared<v8::Gather>(shape_of, axis_0, axis_0_scalar);

        // Calculate required padding amount: padding_needed = max(0, max_crops - num_crops)
        // If num_crops >= max_crops, the result will be 0.
        auto diff = std::make_shared<v1::Subtract>(max_crops_param, num_crops);
        auto zero = create_constant_i64({0});
        auto padding_needed = std::make_shared<v1::Maximum>(diff, zero);

        // Configure Pad operation arguments (pads_end)
        // pads_begin: [0, 0, 0, 0]
        // pads_end: [padding_needed, 0, 0, 0]
        auto zero_3 = create_constant_i64({0, 0, 0}); // Zeros for C, H, W dimensions
        auto zero_4 = create_constant_i64({0, 0, 0, 0}); // pads_begin
        auto pads_end = std::make_shared<v0::Concat>(OutputVector{padding_needed, zero_3}, 0);

        // Execute Pad operation (Constant mode, fill with 0)
        auto pad_value = v0::Constant::create(element::f32, Shape{}, {0.0f});

        auto padded = std::make_shared<v1::Pad>(
            input_nchw,
            zero_4, // pads_begin
            pads_end, // pads_end
            pad_value, // pad_value
            op::PadMode::CONSTANT
        );

        return padded;
    };

    // Process global image (resize + normalize + channels_first)
    auto global_resized = create_bicubic_resize(hd_image, global_target_size);
    auto global_normalized = create_mean_scale(global_resized);
    auto global_processed = create_channels_first(global_normalized);

    // Process HD image (normalize + channels_first + slice)
    auto hd_normalized = create_mean_scale(hd_image);
    auto hd_processed = create_channels_first(hd_normalized);
    auto hd_sliced = create_slice_image(hd_processed);

    // Concatenate global and HD results on GPU
    auto concatenated = create_concatenate_batch(global_processed, hd_sliced);

    // Pad to max crops on GPU
    auto padded_result = create_pad_to_max_crops(concatenated, max_crops);

    auto vision_params = vision_encoder_model->get_parameters();
    auto vision_results = vision_encoder_model->get_results();

    // Reroute every consumer of the encoder's first parameter to the
    // preprocessing output. NOTE(review): assumes parameter 0 of the vision
    // embeddings model is the pixel_values input (f32 NCHW) -- confirm against
    // openvino_vision_embeddings_model.xml.
    vision_params[0]->output(0).replace(padded_result);

    // The new model keeps the encoder's results but exposes only the three
    // preprocessing parameters; the original pixel_values parameter is dropped.
    return std::make_shared<Model>(
        vision_results,
        ParameterVector{hd_image, global_target_size, max_crops}
    );
}
1016+
7561017
InputsEmbedderPhi3V::InputsEmbedderPhi3V(
7571018
const VLMConfig& vlm_config,
7581019
const std::filesystem::path& model_dir,

src/cpp/src/visual_language/phi3_vision/classes.hpp

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,14 @@ class VisionEncoderPhi3V : public VisionEncoder {
4040
const ov::AnyMap properties);
4141

4242
EncodedImage encode(const ov::Tensor& image, const ov::AnyMap& config_map) override;
43+
44+
private:
45+
bool use_ov_image_preprocess = false; // Default to false, will be set based on device and environment
46+
47+
// GPU preprocessing model creation function
48+
std::shared_ptr<ov::Model> create_combined_preprocessing_vision_model(
49+
const std::shared_ptr<ov::Model>& vision_encoder_model,
50+
const ProcessorConfig& config);
4351
};
4452

4553
class InputsEmbedderPhi3V : public InputsEmbedder::IInputsEmbedder {

0 commit comments

Comments
 (0)