@@ -528,6 +528,29 @@ ov::Tensor hd_feature_transform(const EncodedImage& image_features, InferRequest
528528 return res;
529529}
530530
// Decide whether image preprocessing should be fused into the GPU graph.
//
// Returns true only when `device` names a GPU ("GPU", "GPU.0", "GPU.1", ...)
// and the IMAGE_PREPROCESS environment variable does not force CPU
// preprocessing ("cpu" / "CPU"). All other devices always preprocess on CPU.
bool should_use_gpu_preprocessing(const std::string& device) {
    // A single prefix test covers both the exact "GPU" match and the
    // "GPU.N" variants (the original `device == "GPU" || find(...) == 0`
    // duplicated the exact-match case).
    const bool is_gpu_device = device.rfind("GPU", 0) == 0;
    if (!is_gpu_device) {
        // Always use CPU preprocessing for non-GPU devices.
        return false;
    }

    // For GPU devices, an environment override can force CPU preprocessing.
    if (const char* env_var = std::getenv("IMAGE_PREPROCESS")) {
        const std::string env_value{env_var};
        if (env_value == "cpu" || env_value == "CPU") {
            return false;
        }
    }

    // GPU device and not explicitly disabled -> use GPU preprocessing.
    return true;
}
553+
531554} // namespace
532555
533556namespace phi_utils {
@@ -694,8 +717,27 @@ EncodedImage VisionEncoderPhi3V::encode(const ov::Tensor& image, const ov::AnyMa
694717 ov::InferRequest& encoder = infer_request_guard.get ();
695718 ProcessorConfig config = utils::from_any_map (config_map, m_processor_config);
696719
697- const auto & [pixel_values, image_size] = get_pixel_values_phi3_v (image, config);
698- encoder.set_input_tensor (pixel_values);
720+ ImageSize image_size;
721+
722+ if (use_ov_image_preprocess) {
723+ ov::Tensor hd_image = HD_transform (image, config.phi3_v .num_crops );
724+ image_size = ImageSize{hd_image.get_shape ().at (2 ), hd_image.get_shape ().at (1 )};
725+
726+ uint64_t global_size[2 ] = {INPUT_IMAGE_SIZE, INPUT_IMAGE_SIZE};
727+ ov::Tensor global_target_size (ov::element::i64 , ov::Shape{2 }, global_size);
728+
729+ int64_t max_crops_value = static_cast <int64_t >(config.phi3_v .num_crops );
730+ ov::Tensor max_crops_tensor (ov::element::i64 , ov::Shape{}, &max_crops_value);
731+
732+ encoder.set_input_tensor (0 , hd_image);
733+ encoder.set_input_tensor (1 , global_target_size);
734+ encoder.set_input_tensor (2 , max_crops_tensor);
735+ } else {
736+ const auto & [pixel_values, is] = get_pixel_values_phi3_v (image, config);
737+ image_size = is;
738+ encoder.set_input_tensor (pixel_values);
739+ }
740+
699741 ov::Tensor res{ov::element::f32 , encoder.get_output_tensor ().get_shape ()};
700742 encoder.set_output_tensor (res);
701743 encoder.infer ();
@@ -714,6 +756,20 @@ VisionEncoderPhi3V::VisionEncoderPhi3V(
714756 const std::filesystem::path& model_dir,
715757 const std::string& device,
716758 const ov::AnyMap properties) : VisionEncoder(model_dir, device, properties) {
759+ use_ov_image_preprocess = should_use_gpu_preprocessing (device);
760+ if (use_ov_image_preprocess) {
761+ auto vision_encoder_model = utils::singleton_core ().read_model (model_dir / " openvino_vision_embeddings_model.xml" );
762+
763+ auto combined_model = create_combined_preprocessing_vision_model (vision_encoder_model, m_processor_config);
764+ auto compiled_combined = utils::singleton_core ().compile_model (combined_model, device, properties);
765+
766+ m_ireq_queue_vision_encoder = std::make_unique<CircularBufferQueue<ov::InferRequest>>(
767+ compiled_combined.get_property (ov::optimal_number_of_infer_requests),
768+ [&compiled_combined]() -> ov::InferRequest {
769+ return compiled_combined.create_infer_request ();
770+ });
771+ }
772+
717773 auto compiled_model = create_hd_feature_transformer ();
718774 m_ireq_queue_hd_feature_transformer = std::make_unique<CircularBufferQueue<ov::InferRequest>>(
719775 compiled_model.get_property (ov::optimal_number_of_infer_requests),
@@ -735,6 +791,21 @@ VisionEncoderPhi3V::VisionEncoderPhi3V(
735791 const std::filesystem::path& config_dir_path,
736792 const std::string& device,
737793 const ov::AnyMap properties) : VisionEncoder(models_map, config_dir_path, device, properties) {
794+ use_ov_image_preprocess = should_use_gpu_preprocessing (device);
795+ if (use_ov_image_preprocess) {
796+ const auto & [model_path, weights_path] = utils::get_model_weights_pair (models_map, " vision_embeddings" );
797+ auto vision_encoder_model = utils::singleton_core ().read_model (model_path, weights_path);
798+
799+ auto combined_model = create_combined_preprocessing_vision_model (vision_encoder_model, m_processor_config);
800+ auto compiled_combined = utils::singleton_core ().compile_model (combined_model, device, properties);
801+
802+ m_ireq_queue_vision_encoder = std::make_unique<CircularBufferQueue<ov::InferRequest>>(
803+ compiled_combined.get_property (ov::optimal_number_of_infer_requests),
804+ [&compiled_combined]() -> ov::InferRequest {
805+ return compiled_combined.create_infer_request ();
806+ });
807+ }
808+
738809 auto compiled_model = create_hd_feature_transformer ();
739810 m_ireq_queue_hd_feature_transformer = std::make_unique<CircularBufferQueue<ov::InferRequest>>(
740811 compiled_model.get_property (ov::optimal_number_of_infer_requests),
@@ -753,6 +824,196 @@ VisionEncoderPhi3V::VisionEncoderPhi3V(
753824 m_vlm_config = utils::from_config_json_if_exists<VLMConfig>(config_dir_path, " config.json" );
754825}
755826
827+ std::shared_ptr<ov::Model> VisionEncoderPhi3V::create_combined_preprocessing_vision_model (
828+ const std::shared_ptr<ov::Model>& vision_encoder_model,
829+ const ProcessorConfig& config) {
830+
831+ using namespace ov ;
832+ using namespace ov ::op;
833+
834+ // Input: HD transformed image in NHWC format (uint8)
835+ auto hd_image = std::make_shared<v0::Parameter>(element::u8 , PartialShape{1 , -1 , -1 , 3 });
836+ // Target size for global image resize [height, width]
837+ auto global_target_size = std::make_shared<v0::Parameter>(element::i64 , PartialShape{2 });
838+ // Max crops parameter for dynamic padding
839+ auto max_crops = std::make_shared<v0::Parameter>(element::i64 , PartialShape{});
840+
841+ auto create_constant = [](auto element_type, const Shape& shape, const auto & data) {
842+ return v0::Constant::create (element_type, shape, data);
843+ };
844+
845+ auto create_bicubic_resize = [&](std::shared_ptr<Node> input, std::shared_ptr<Node> target_size) {
846+ // Convert to float32 before interpolation (required for bicubic)
847+ auto input_f32 = std::make_shared<v0::Convert>(input, element::f32 );
848+
849+ // For NHWC format, resize axes are [1, 2] (height, width dimensions)
850+ auto axes = create_constant (element::i64 , Shape{2 }, std::vector<int64_t >{1 , 2 });
851+
852+ v11::Interpolate::InterpolateAttrs attrs;
853+ attrs.mode = v11::Interpolate::InterpolateMode::CUBIC;
854+ attrs.shape_calculation_mode = v11::Interpolate::ShapeCalcMode::SIZES;
855+ attrs.coordinate_transformation_mode = v11::Interpolate::CoordinateTransformMode::ASYMMETRIC;
856+ attrs.cube_coeff = -0 .5f ; // Standard bicubic coefficient
857+ attrs.nearest_mode = v11::Interpolate::NearestMode::FLOOR;
858+ attrs.pads_begin = {0 , 0 };
859+ attrs.pads_end = {0 , 0 };
860+ attrs.antialias = false ;
861+
862+ return std::make_shared<v11::Interpolate>(input_f32, target_size, axes, attrs);
863+ };
864+
865+ // GPU implementation of mean_scale operation
866+ auto create_mean_scale = [&](std::shared_ptr<Node> input_u8_or_f32) {
867+ std::shared_ptr<Node> input_f32;
868+
869+ // Convert to float32 if input is uint8, otherwise use as-is
870+ if (input_u8_or_f32->get_element_type () == element::u8 ) {
871+ input_f32 = std::make_shared<v0::Convert>(input_u8_or_f32, element::f32 );
872+ } else {
873+ input_f32 = input_u8_or_f32;
874+ }
875+
876+ // Follow the original mean_scale() function logic exactly:
877+ // (float(uint_8_data[idx]) / 255.0f - config.image_mean[c]) / config.image_std[c]
878+ // Step 1: x / 255.0
879+ auto scale_255 = create_constant (element::f32 , Shape{}, std::vector<float >{255 .0f });
880+ auto divided_by_255 = std::make_shared<v1::Divide>(input_f32, scale_255);
881+
882+ // Step 2: Create mean and std constants [R, G, B] - broadcasted along channel dimension
883+ // For NHWC format, we need shape [1, 1, 1, 3] to broadcast correctly
884+ auto mean_const = create_constant (element::f32 , Shape{1 , 1 , 1 , 3 },
885+ std::vector<float >{config.image_mean [0 ], config.image_mean [1 ], config.image_mean [2 ]});
886+ auto std_const = create_constant (element::f32 , Shape{1 , 1 , 1 , 3 },
887+ std::vector<float >{config.image_std [0 ], config.image_std [1 ], config.image_std [2 ]});
888+
889+ // Step 3: (x/255.0 - mean)
890+ auto mean_subtracted = std::make_shared<v1::Subtract>(divided_by_255, mean_const);
891+
892+ // Step 4: (x/255.0 - mean) / std
893+ auto result = std::make_shared<v1::Divide>(mean_subtracted, std_const);
894+
895+ return result;
896+ };
897+
898+ auto create_channels_first = [&](std::shared_ptr<Node> input_nhwc) {
899+ // Transpose from NHWC (0,1,2,3) to NCHW (0,3,1,2)
900+ auto transpose_order = create_constant (element::i64 , Shape{4 }, std::vector<int64_t >{0 , 3 , 1 , 2 });
901+ return std::make_shared<v1::Transpose>(input_nhwc, transpose_order);
902+ };
903+
904+ auto create_slice_image = [&](std::shared_ptr<Node> input_nchw) {
905+ // Input: (N, C, H, W) -> Output: (N*num_h_slices*num_w_slices, C, 336, 336)
906+ auto shape_node = std::make_shared<v3::ShapeOf>(input_nchw);
907+ // Index constants for gathering shape dimensions
908+ auto axis_0 = create_constant (element::i64 , Shape{1 }, std::vector<int64_t >{0 }); // N
909+ auto axis_1 = create_constant (element::i64 , Shape{1 }, std::vector<int64_t >{1 }); // C
910+ auto axis_2 = create_constant (element::i64 , Shape{1 }, std::vector<int64_t >{2 }); // H
911+ auto axis_3 = create_constant (element::i64 , Shape{1 }, std::vector<int64_t >{3 }); // W
912+ auto axis_0_node = create_constant (element::i64 , Shape{}, std::vector<int64_t >{0 }); // Gather axis
913+
914+ auto N = std::make_shared<v8::Gather>(shape_node, axis_0, axis_0_node);
915+ auto C = std::make_shared<v8::Gather>(shape_node, axis_1, axis_0_node);
916+ auto H = std::make_shared<v8::Gather>(shape_node, axis_2, axis_0_node);
917+ auto W = std::make_shared<v8::Gather>(shape_node, axis_3, axis_0_node);
918+
919+ // Patch size constant (336)
920+ auto S = create_constant (element::i64 , Shape{1 }, std::vector<int64_t >{INPUT_IMAGE_SIZE});
921+
922+ // Calculate number of slices (num_h = H / S, num_w = W / S)
923+ auto num_h = std::make_shared<v1::Divide>(H, S);
924+ auto num_w = std::make_shared<v1::Divide>(W, S);
925+
926+ // Reshape to 6D [N, C, num_h, S, num_w, S]
927+ auto target_shape_6d = std::make_shared<v0::Concat>(NodeVector{N, C, num_h, S, num_w, S}, 0 );
928+ auto reshape_6d = std::make_shared<v1::Reshape>(input_nchw, target_shape_6d, false );
929+
930+ // Transpose (Permute)
931+ // Current: 0:N, 1:C, 2:num_h, 3:S, 4:num_w, 5:S
932+ // Target: 0:N, 2:num_h, 4:num_w, 1:C, 3:S, 5:S
933+ auto permute_order = create_constant (element::i64 , Shape{6 }, std::vector<int64_t >{0 , 2 , 4 , 1 , 3 , 5 });
934+ auto permuted = std::make_shared<v1::Transpose>(reshape_6d, permute_order);
935+
936+ // Flatten to 4D [N * num_h * num_w, C, S, S]
937+ auto minus_one = create_constant (element::i64 , Shape{1 }, std::vector<int64_t >{-1 });
938+ auto target_shape_4d = std::make_shared<v0::Concat>(NodeVector{minus_one, C, S, S}, 0 );
939+ auto final_reshape = std::make_shared<v1::Reshape>(permuted, target_shape_4d, false );
940+
941+ return final_reshape;
942+ };
943+
944+ auto create_concatenate_batch = [&](std::shared_ptr<Node> global_processed, std::shared_ptr<Node> hd_sliced) {
945+ // Concatenate along batch dimension (axis 0)
946+ // global_processed: (1, C, H, W)
947+ // hd_sliced: (num_slices, C, H, W)
948+ // Output: (1 + num_slices, C, H, W)
949+ return std::make_shared<v0::Concat>(NodeVector{global_processed, hd_sliced}, 0 );
950+ };
951+
952+ auto create_pad_to_max_crops = [&](std::shared_ptr<Node> input_nchw, std::shared_ptr<Node> max_crops_param) {
953+ auto create_constant_i64 = [](const std::vector<int64_t >& val) {
954+ return v0::Constant::create (element::i64 , Shape{val.size ()}, val);
955+ };
956+
957+ // Get current input batch size (num_crops)
958+ auto shape_of = std::make_shared<v3::ShapeOf>(input_nchw);
959+ auto axis_0 = create_constant_i64 ({0 });
960+ auto axis_0_scalar = v0::Constant::create (element::i64 , {}, {0 }); // Axis for Gather
961+ auto num_crops = std::make_shared<v8::Gather>(shape_of, axis_0, axis_0_scalar);
962+
963+ // Calculate required padding amount: padding_needed = max(0, max_crops - num_crops)
964+ // If num_crops >= max_crops, the result will be 0.
965+ auto diff = std::make_shared<v1::Subtract>(max_crops_param, num_crops);
966+ auto zero = create_constant_i64 ({0 });
967+ auto padding_needed = std::make_shared<v1::Maximum>(diff, zero);
968+
969+ // Configure Pad operation arguments (pads_end)
970+ // pads_begin: [0, 0, 0, 0]
971+ // pads_end: [padding_needed, 0, 0, 0]
972+ auto zero_3 = create_constant_i64 ({0 , 0 , 0 }); // Zeros for C, H, W dimensions
973+ auto zero_4 = create_constant_i64 ({0 , 0 , 0 , 0 }); // pads_begin
974+ auto pads_end = std::make_shared<v0::Concat>(OutputVector{padding_needed, zero_3}, 0 );
975+
976+ // Execute Pad operation (Constant mode, fill with 0)
977+ auto pad_value = v0::Constant::create (element::f32 , Shape{}, {0 .0f });
978+
979+ auto padded = std::make_shared<v1::Pad>(
980+ input_nchw,
981+ zero_4, // pads_begin
982+ pads_end, // pads_end
983+ pad_value, // pad_value
984+ op::PadMode::CONSTANT
985+ );
986+
987+ return padded;
988+ };
989+
990+ // Process global image (resize + normalize + channels_first)
991+ auto global_resized = create_bicubic_resize (hd_image, global_target_size);
992+ auto global_normalized = create_mean_scale (global_resized);
993+ auto global_processed = create_channels_first (global_normalized);
994+
995+ // Process HD image (normalize + channels_first + slice)
996+ auto hd_normalized = create_mean_scale (hd_image);
997+ auto hd_processed = create_channels_first (hd_normalized);
998+ auto hd_sliced = create_slice_image (hd_processed);
999+
1000+ // Concatenate global and HD results on GPU
1001+ auto concatenated = create_concatenate_batch (global_processed, hd_sliced);
1002+
1003+ // Pad to max crops on GPU
1004+ auto padded_result = create_pad_to_max_crops (concatenated, max_crops);
1005+
1006+ auto vision_params = vision_encoder_model->get_parameters ();
1007+ auto vision_results = vision_encoder_model->get_results ();
1008+
1009+ vision_params[0 ]->output (0 ).replace (padded_result);
1010+
1011+ return std::make_shared<Model>(
1012+ vision_results,
1013+ ParameterVector{hd_image, global_target_size, max_crops}
1014+ );
1015+ }
1016+
7561017InputsEmbedderPhi3V::InputsEmbedderPhi3V (
7571018 const VLMConfig& vlm_config,
7581019 const std::filesystem::path& model_dir,
0 commit comments