address comments

qjia7 · qjia7 · commit 42d5e6455e55 · 2025-11-17T13:43:47.000+08:00
diff --git a/include/onnxruntime/core/session/environment.h b/include/onnxruntime/core/session/environment.h
@@ -154,12 +154,6 @@ class Environment {
   const DataTransferManager& GetDataTransferManager() const {
     return data_transfer_mgr_;
   }
-
-  // Register a data transfer for an execution provider with the environment's data transfer manager
-  // This is needed for EPs like WebGPU where CopyTensors C API needs access to the data transfer
-  Status RegisterDataTransferForEP(std::unique_ptr<IDataTransfer> data_transfer) {
-    return data_transfer_mgr_.RegisterDataTransfer(std::move(data_transfer));
-  }
 #endif  // !defined(ORT_MINIMAL_BUILD)
 
   // return a shared allocator from a plugin EP or custom allocator added with RegisterAllocator
diff --git a/onnxruntime/core/providers/webgpu/webgpu_provider_factory.cc b/onnxruntime/core/providers/webgpu/webgpu_provider_factory.cc
@@ -12,9 +12,48 @@
 #include "core/session/ort_apis.h"
 
 #include "core/providers/webgpu/webgpu_provider_options.h"
+#include "core/providers/webgpu/data_transfer.h"
 using namespace onnxruntime::webgpu::options;
 
 namespace onnxruntime {
+// Inputs for creating and initializing a WebGPU context: both configs, the backend type, and the PIX capture flag.
+struct WebGpuContextParams {
+  webgpu::WebGpuContextConfig context_config;
+  webgpu::WebGpuBufferCacheConfig buffer_cache_config;
+  int backend_type{0};  // defined value even if a caller forgets to assign it
+  bool enable_pix_capture{false};
+};
+
+// Produces the parameters for a WebGPU context that is created without user-supplied options.
+static WebGpuContextParams GetDefaultWebGpuContextParams(int context_id) {
+  WebGpuContextParams defaults;
+  auto& context = defaults.context_config;
+  context.context_id = context_id;
+  context.instance = nullptr;
+  context.device = nullptr;
+  context.dawn_proc_table = nullptr;
+  context.validation_mode = webgpu::ValidationMode::Basic;
+  context.preserve_device = false;
+  context.max_storage_buffer_binding_size = 0;
+  context.power_preference = static_cast<int>(WGPUPowerPreference_HighPerformance);
+
+  auto& cache = defaults.buffer_cache_config;
+  cache.storage.mode = webgpu::BufferCacheMode::Bucket;
+  cache.uniform.mode = webgpu::BufferCacheMode::Simple;
+  cache.query_resolve.mode = webgpu::BufferCacheMode::Disabled;
+  cache.default_entry.mode = webgpu::BufferCacheMode::Disabled;
+
+  // On Windows, Vulkan is chosen only when it is enabled and D3D12 is not; every other Windows build gets D3D12.
+#if defined(_WIN32) && !defined(DAWN_ENABLE_D3D12) && defined(DAWN_ENABLE_VULKAN)
+  defaults.backend_type = static_cast<int>(WGPUBackendType_Vulkan);
+#elif defined(_WIN32)
+  defaults.backend_type = static_cast<int>(WGPUBackendType_D3D12);
+#else
+  defaults.backend_type = 0;
+#endif
+  defaults.enable_pix_capture = false;
+  return defaults;
+}
 
 struct WebGpuProviderFactory : IExecutionProviderFactory {
   WebGpuProviderFactory(int context_id, webgpu::WebGpuContext& context, WebGpuExecutionProviderConfig&& webgpu_ep_config)
@@ -291,4 +330,73 @@ std::shared_ptr<IExecutionProviderFactory> WebGpuProviderFactoryCreator::Create(
   return std::make_shared<WebGpuProviderFactory>(context_id, context, std::move(webgpu_ep_config));
 }
 
+// Exposes webgpu::DataTransfer through the OrtDataTransferImpl callbacks; instances are freed via Release.
+struct WebGpuDataTransferImpl : OrtDataTransferImpl {
+  WebGpuDataTransferImpl(const OrtApi& ort_api_in, webgpu::BufferManager& buffer_manager)
+      : ort_api{ort_api_in},
+        ep_api{*ort_api_in.GetEpApi()},
+        data_transfer_{buffer_manager} {
+    ort_version_supported = ORT_API_VERSION;
+    CanCopy = CanCopyImpl;
+    CopyTensors = CopyTensorsImpl;
+    Release = ReleaseImpl;
+  }
+
+  // Accepts GPU<->GPU, GPU->CPU and CPU->GPU device-type pairs; any other pair (e.g. CPU->CPU) is rejected.
+  static bool CanCopyImpl(const OrtDataTransferImpl* this_ptr, const OrtMemoryDevice* src_memory_device,
+                          const OrtMemoryDevice* dst_memory_device) noexcept {
+    const auto& impl = *static_cast<const WebGpuDataTransferImpl*>(this_ptr);
+    OrtMemoryInfoDeviceType src_type = impl.ep_api.MemoryDevice_GetDeviceType(src_memory_device);
+    OrtMemoryInfoDeviceType dst_type = impl.ep_api.MemoryDevice_GetDeviceType(dst_memory_device);
+    return (src_type == OrtMemoryInfoDeviceType_GPU && dst_type == OrtMemoryInfoDeviceType_GPU) ||
+           (src_type == OrtMemoryInfoDeviceType_GPU && dst_type == OrtMemoryInfoDeviceType_CPU) ||
+           (src_type == OrtMemoryInfoDeviceType_CPU && dst_type == OrtMemoryInfoDeviceType_GPU);
+  }
+
+  // Copies src_tensors[i] into dst_tensors[i] in order and stops at the first failure; streams are ignored.
+  // The callback is noexcept, so anything thrown while copying is turned into an OrtStatus, not std::terminate.
+  static OrtStatus* CopyTensorsImpl(OrtDataTransferImpl* this_ptr,
+                                    const OrtValue** src_tensors, OrtValue** dst_tensors,
+                                    OrtSyncStream** /*streams*/, size_t num_tensors) noexcept {
+    auto& impl = *static_cast<WebGpuDataTransferImpl*>(this_ptr);
+    try {
+      for (size_t idx = 0; idx < num_tensors; ++idx) {
+        auto status = impl.data_transfer_.CopyTensor(src_tensors[idx]->Get<Tensor>(),
+                                                     *dst_tensors[idx]->GetMutable<Tensor>());
+        if (!status.IsOK()) {
+          return OrtApis::CreateStatus(ORT_RUNTIME_EXCEPTION, status.ErrorMessage().c_str());
+        }
+      }
+    } catch (...) {
+      return OrtApis::CreateStatus(ORT_RUNTIME_EXCEPTION, "WebGPU CopyTensors failed with an exception");
+    }
+    return nullptr;
+  }
+
+  static void ReleaseImpl(OrtDataTransferImpl* this_ptr) noexcept {
+    delete static_cast<WebGpuDataTransferImpl*>(this_ptr);
+  }
+
+  const OrtApi& ort_api;
+  const OrtEpApi& ep_api;
+  webgpu::DataTransfer data_transfer_;
+};
+
+// Returns a data transfer for context `context_id` (a default context is created if absent), or nullptr on failure.
+OrtDataTransferImpl* OrtWebGpuCreateDataTransfer(int context_id) {
+  try {
+    webgpu::WebGpuContext* context_ptr = nullptr;
+    try {
+      context_ptr = &webgpu::WebGpuContextFactory::GetContext(context_id);
+    } catch (...) {  // Context doesn't exist, create a default one using shared helper
+      WebGpuContextParams params = GetDefaultWebGpuContextParams(context_id);
+      context_ptr = &webgpu::WebGpuContextFactory::CreateContext(params.context_config);
+      context_ptr->Initialize(params.buffer_cache_config, params.backend_type, params.enable_pix_capture);
+    }
+    return new WebGpuDataTransferImpl(*OrtApis::GetApi(ORT_API_VERSION), context_ptr->BufferManager());
+  } catch (...) {
+    return nullptr;  // the visible caller is a noexcept callback, so failure is reported as null, not thrown
+  }
+}
+
 }  // namespace onnxruntime
diff --git a/onnxruntime/core/providers/webgpu/webgpu_provider_factory_creator.h b/onnxruntime/core/providers/webgpu/webgpu_provider_factory_creator.h
@@ -10,11 +10,18 @@
 
 #include "core/providers/webgpu/webgpu_provider_options.h"
 
+struct OrtDataTransferImpl;
+
 namespace onnxruntime {
 struct ConfigOptions;
 
 struct WebGpuProviderFactoryCreator {
   static std::shared_ptr<IExecutionProviderFactory> Create(const ConfigOptions& config_options);
 };
 
+// Creates the OrtDataTransferImpl used by the C API for the WebGPU context `context_id`.
+// If that context does not exist yet, a default one is created and initialized first.
+// Caller takes ownership of the returned OrtDataTransferImpl* (freed through its Release callback).
+OrtDataTransferImpl* OrtWebGpuCreateDataTransfer(int context_id);
+
 }  // namespace onnxruntime
diff --git a/onnxruntime/core/session/inference_session.cc b/onnxruntime/core/session/inference_session.cc
@@ -854,25 +854,10 @@ common::Status InferenceSession::RegisterExecutionProvider(const std::shared_ptr
   VLOGS(*session_logger_, 1) << "Adding execution provider of type: " << provider_type;
   auto p_data_xfr = p_exec_provider->GetDataTransfer();
   if (p_data_xfr) {
-    // Register with session's data transfer manager
     auto st = data_transfer_mgr_.RegisterDataTransfer(std::move(p_data_xfr));
     if (!st.IsOK()) {
       return st;
     }
-
-#if !defined(ORT_MINIMAL_BUILD)
-    // For WebGPU EP, also register with environment's data transfer manager
-    // so that CopyTensors C API can work (it only checks environment's DTM)
-    if (provider_type == kWebGpuExecutionProvider) {
-      auto p_data_xfr_env = p_exec_provider->GetDataTransfer();
-      if (p_data_xfr_env) {
-        auto st_env = const_cast<Environment&>(environment_).RegisterDataTransferForEP(std::move(p_data_xfr_env));
-        if (!st_env.IsOK()) {
-          LOGS(*session_logger_, WARNING) << "Failed to register WebGPU data transfer with environment: " << st_env.ErrorMessage();
-        }
-      }
-    }
-#endif
   }
 
   auto p_external_data_loader = p_exec_provider->GetExternalDataLoader();
diff --git a/onnxruntime/core/session/plugin_ep/ep_factory_webgpu.cc b/onnxruntime/core/session/plugin_ep/ep_factory_webgpu.cc
@@ -57,20 +57,13 @@ OrtStatus* WebGpuEpFactory::CreateIExecutionProvider(const OrtHardwareDevice* co
   return nullptr;
 }
 
-/* TODO: Implement CreateAllocator and CreateDataTransfer to support shared allocators and data transfer outside of
-         an InferenceSession.
-OrtStatus* WebGpuEpFactory::CreateAllocator(const OrtMemoryInfo* memory_info,
-                           const OrtKeyValuePairs* allocator_options,
-                           OrtAllocator** allocator) noexcept override {
-  *allocator = device_allocators[memory_info->device.Id()].get();
-}
-
-OrtStatus* WebGpuEpFactory::CreateDataTransfer(_Outptr_result_maybenull_ OrtDataTransferImpl** data_transfer) override {
-  // TODO: Wrap the IDataTransfer implementation so we can copy to device using OrtApi CopyTensors.
-  *data_transfer = nullptr;
+OrtStatus* WebGpuEpFactory::CreateDataTransfer(_Outptr_result_maybenull_ OrtDataTransferImpl** data_transfer) noexcept {
+  // Delegate to the WebGPU provider backend, which has access to the WebGPU headers needed to build the object.
+  // A null result is still reported as success: the out-parameter is annotated as possibly null.
+  *data_transfer = OrtWebGpuCreateDataTransfer(0);  // Use default context (context_id=0)
   return nullptr;
 }
-*/
+
 }  // namespace onnxruntime
 
 #endif  // USE_WEBGPU
diff --git a/onnxruntime/core/session/plugin_ep/ep_factory_webgpu.h b/onnxruntime/core/session/plugin_ep/ep_factory_webgpu.h
@@ -29,6 +29,8 @@ class WebGpuEpFactory : public EpFactoryInternalImpl {
                                       const OrtSessionOptions* session_options,
                                       const OrtLogger* session_logger,
                                       std::unique_ptr<IExecutionProvider>* ep) noexcept override;
+
+  OrtStatus* CreateDataTransfer(_Outptr_result_maybenull_ OrtDataTransferImpl** data_transfer) noexcept override;
 };
 }  // namespace onnxruntime