Skip to content

Commit ab442e1

Browse files
committed
Address code review comments
1 parent 27b7626 commit ab442e1

File tree

10 files changed

+112
-58
lines changed

10 files changed

+112
-58
lines changed

onnxruntime/core/framework/allocator_utils.cc

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -52,14 +52,14 @@ AllocatorPtr CreateAllocator(const AllocatorCreationInfo& info) {
5252
if (info.use_stream_aware_arena) {
5353
#ifdef ORT_ENABLE_STREAM
5454
return AllocatorPtr(
55-
std::make_unique<StreamAwareArena>(std::move(device_allocator),
56-
max_mem,
57-
arena_extend_str,
58-
initial_chunk_size_bytes,
59-
max_dead_bytes_per_chunk,
60-
initial_growth_chunk_size_bytes));
55+
std::make_unique<StreamAwareBFCArena>(std::move(device_allocator),
56+
max_mem,
57+
arena_extend_str,
58+
initial_chunk_size_bytes,
59+
max_dead_bytes_per_chunk,
60+
initial_growth_chunk_size_bytes));
6161
#else
62-
ORT_THROW("StreamAwareArena should be transparent to minimal build.");
62+
ORT_THROW("StreamAwareBFCArena should be transparent to minimal build.");
6363
#endif
6464
} else {
6565
return AllocatorPtr(

onnxruntime/core/framework/bfc_arena.cc

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -826,13 +826,13 @@ void BFCArena::ResetChunkOnTargetStream(Stream* target_stream, bool coalesce_fla
826826
}
827827
}
828828

829-
StreamAwareArena::StreamAwareArena(std::unique_ptr<IAllocator> resource_allocator,
830-
size_t total_memory,
831-
ArenaExtendStrategy arena_extend_strategy,
832-
int initial_chunk_size_bytes,
833-
int max_dead_bytes_per_chunk,
834-
int initial_growth_chunk_size_bytes,
835-
int64_t max_power_of_two_extend_bytes)
829+
StreamAwareBFCArena::StreamAwareBFCArena(std::unique_ptr<IAllocator> resource_allocator,
830+
size_t total_memory,
831+
ArenaExtendStrategy arena_extend_strategy,
832+
int initial_chunk_size_bytes,
833+
int max_dead_bytes_per_chunk,
834+
int initial_growth_chunk_size_bytes,
835+
int64_t max_power_of_two_extend_bytes)
836836
: BFCArena(std::move(resource_allocator),
837837
total_memory,
838838
arena_extend_strategy,
@@ -842,11 +842,11 @@ StreamAwareArena::StreamAwareArena(std::unique_ptr<IAllocator> resource_allocato
842842
max_power_of_two_extend_bytes) {
843843
}
844844

845-
void* StreamAwareArena::AllocOnStream(size_t size, Stream* current_stream) {
845+
void* StreamAwareBFCArena::AllocOnStream(size_t size, Stream* current_stream) {
846846
return AllocateRawInternal(size, false, current_stream);
847847
}
848848

849-
void StreamAwareArena::ReleaseStreamBuffers(Stream* stream) {
849+
void StreamAwareBFCArena::ReleaseStreamBuffers(Stream* stream) {
850850
// since chunks on target stream will be reset to nullptr, trigger coalesce to see whether we can get bigger chunk.
851851
ResetChunkOnTargetStream(stream, true);
852852
}

onnxruntime/core/framework/bfc_arena.h

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@ namespace onnxruntime {
4343
#endif
4444
#endif
4545

46-
class StreamAwareArena;
46+
class StreamAwareBFCArena;
4747
// A memory allocator that implements a 'best-fit with coalescing'
4848
// algorithm. This is essentially a very simple version of Doug Lea's
4949
// malloc (dlmalloc).
@@ -502,15 +502,15 @@ class BFCArena : public IArena {
502502
};
503503

504504
#ifdef ORT_ENABLE_STREAM
505-
class StreamAwareArena : public BFCArena {
505+
class StreamAwareBFCArena : public BFCArena {
506506
public:
507-
StreamAwareArena(std::unique_ptr<IAllocator> resource_allocator,
508-
size_t total_memory,
509-
ArenaExtendStrategy arena_extend_strategy = DEFAULT_ARENA_EXTEND_STRATEGY,
510-
int initial_chunk_size_bytes = DEFAULT_INITIAL_CHUNK_SIZE_BYTES,
511-
int max_dead_bytes_per_chunk = DEFAULT_MAX_DEAD_BYTES_PER_CHUNK,
512-
int initial_growth_chunk_size_bytes = DEFAULT_INITIAL_GROWTH_CHUNK_SIZE_BYTES,
513-
int64_t max_power_of_two_extend_bytes = DEFAULT_MAX_POWER_OF_TWO_EXTEND_BYTES);
507+
StreamAwareBFCArena(std::unique_ptr<IAllocator> resource_allocator,
508+
size_t total_memory,
509+
ArenaExtendStrategy arena_extend_strategy = DEFAULT_ARENA_EXTEND_STRATEGY,
510+
int initial_chunk_size_bytes = DEFAULT_INITIAL_CHUNK_SIZE_BYTES,
511+
int max_dead_bytes_per_chunk = DEFAULT_MAX_DEAD_BYTES_PER_CHUNK,
512+
int initial_growth_chunk_size_bytes = DEFAULT_INITIAL_GROWTH_CHUNK_SIZE_BYTES,
513+
int64_t max_power_of_two_extend_bytes = DEFAULT_MAX_POWER_OF_TWO_EXTEND_BYTES);
514514

515515
bool IsStreamAware() const override { return true; }
516516

onnxruntime/core/framework/device_stream_collection.cc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@ class DeviceStreamCollectionImpl {
3737
if (it.second->Info().device == stream->GetDevice() &&
3838
it.second->Info().alloc_type == OrtArenaAllocator) {
3939
if (it.second->IsStreamAware()) {
40-
// Previously we only had one StreamAwareArena. We need to guard
40+
// Previously we only had one StreamAwareBFCArena. We need to guard
4141
// against multiple allocators now.
4242
auto* arena_alloc = IArena::SafeArenaCast(it.second.get());
4343
if (arena_alloc) {

onnxruntime/core/providers/cuda/cuda_execution_provider.cc

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -154,9 +154,15 @@ AllocatorPtr CUDAExecutionProvider::CreateCudaAllocator(OrtDevice::DeviceId devi
154154

155155
return CreateAllocator(default_memory_info);
156156
} else {
157-
const bool use_cuda_mempool =
157+
const bool cuda_mempool_requested =
158158
default_memory_arena_cfg != nullptr && default_memory_arena_cfg->use_cuda_mempool == 1;
159159

160+
const bool use_cuda_mempool = cuda_mempool_requested && cuda::CudaMempoolArena::IsCudaVersionSupported();
161+
162+
if (cuda_mempool_requested && !use_cuda_mempool) {
163+
LOGS_DEFAULT(WARNING) << "CUDA memory pool requested but not supported on this device/driver. Falling back to default BFCArena with CUDA allocator.";
164+
}
165+
160166
if (use_cuda_mempool) {
161167
auto device = OrtDevice(OrtDevice::GPU, OrtDevice::MemType::DEFAULT, OrtDevice::VendorIds::NVIDIA, device_id);
162168
auto mem_info = OrtMemoryInfo("CUDAMemPoolArena", OrtAllocatorType::OrtArenaAllocator, device, OrtMemTypeDefault);

onnxruntime/core/providers/cuda/cuda_mempool_arena.cc

Lines changed: 42 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -57,23 +57,23 @@ CudaMempoolArena::~CudaMempoolArena() {
5757
for (auto& kv : alloc_map_) {
5858
void* p = kv.first;
5959
const cudaStream_t s = kv.second.stream;
60-
(void)cudaFreeAsync(p, s); // ignore errors in destructor
60+
ORT_IGNORE_RETURN_VALUE(cudaFreeAsync(p, s)); // ignore errors in destructor
6161
}
6262

63+
// 2) Synchronize all streams we know about (those that ever held allocations).
64+
SyncAllKnownStreams_NoThrow();
65+
6366
// Now it is safe to drop our bookkeeping.
6467
alloc_map_.clear();
6568
stream_map_.clear();
6669

67-
// 2) Synchronize all streams we know about (those that ever held allocations).
68-
SyncAllKnownStreams_NoThrow();
69-
7070
// 3) Safety barrier: ensure any frees enqueued on destroyed/unknown streams are completed.
71-
(void)cudaDeviceSynchronize(); // ignore errors in destructor
71+
ORT_IGNORE_RETURN_VALUE(cudaDeviceSynchronize()); // ignore errors in destructor
7272

7373
// 4) Trim to zero and destroy the pool.
7474
if (pool_) {
75-
(void)cudaMemPoolTrimTo(pool_, 0); // best-effort
76-
(void)cudaMemPoolDestroy(pool_);
75+
ORT_IGNORE_RETURN_VALUE(cudaMemPoolTrimTo(pool_, 0)); // best-effort
76+
ORT_IGNORE_RETURN_VALUE(cudaMemPoolDestroy(pool_));
7777
pool_ = nullptr;
7878
}
7979
}
@@ -93,7 +93,7 @@ void* CudaMempoolArena::Alloc(size_t size) {
9393
<< size << " bytes at " << p << " on default stream.";
9494

9595
// In case the default stream is busy.
96-
::cudaStreamSynchronize(kDefaultStream);
96+
ORT_IGNORE_RETURN_VALUE(cudaStreamSynchronize(kDefaultStream));
9797

9898
{
9999
std::lock_guard<std::mutex> lock(mutex_);
@@ -231,8 +231,41 @@ void CudaMempoolArena::MaybeRehashLocked() {
231231
void CudaMempoolArena::SyncAllKnownStreams_NoThrow() {
232232
for (const auto& kv : stream_map_) {
233233
const cudaStream_t s = kv.first;
234-
(void)cudaStreamSynchronize(s); // ignore errors; device-wide sync follows
234+
ORT_IGNORE_RETURN_VALUE(cudaStreamSynchronize(s)); // ignore errors; device-wide sync follows
235+
}
236+
}
237+
238+
bool CudaMempoolArena::IsCudaVersionSupported() noexcept {
239+
int ort_cuda_rt_version = 0;
240+
cudaError_t cuda_status = cudaRuntimeGetVersion(&ort_cuda_rt_version);
241+
if (cuda_status != cudaSuccess) {
242+
return false;
243+
}
244+
245+
if (ort_cuda_rt_version < 11020) {
246+
return false;
247+
}
248+
249+
int ort_cuda_driver_version = 0;
250+
cuda_status = cudaDriverGetVersion(&ort_cuda_driver_version);
251+
if (cuda_status != cudaSuccess) {
252+
return false;
253+
}
254+
255+
if (ort_cuda_driver_version < 11020) {
256+
return false;
235257
}
258+
259+
// Check if the driver version supports the runtime version
260+
if (ort_cuda_rt_version >= 12000 && ort_cuda_driver_version < 12000) {
261+
return false;
262+
}
263+
264+
if (ort_cuda_rt_version >= 13000 && ort_cuda_driver_version < 13000) {
265+
return false;
266+
}
267+
268+
return true;
236269
}
237270

238271
} // namespace cuda

onnxruntime/core/providers/cuda/cuda_mempool_arena.h

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -29,15 +29,18 @@ namespace cuda {
2929
* - Creates a **process-local** CUDA mempool for a specific device (from `OrtMemoryInfo`).
3030
* - All allocations use **`cudaMallocFromPoolAsync()`** on either the legacy default stream (0) or a
3131
* caller-provided stream. The allocation stream is recorded for ordered free.
32-
* - `Free()` and `ReleaseStreamBuffers()` enqueue **`cudaFreeAsync()`** on the recorded stream to
32+
* - `Free()` enqueues **`cudaFreeAsync()`** on the recorded stream to
3333
* respect CUDA's stream-ordered semantics.
3434
* - `Shrink()` trims the pool with **`cudaMemPoolTrimTo(bytes_to_keep)`** and right-sizes the book-keeping maps
3535
* under lock.
3636
*
3737
* ### Tuning
38-
* - `pool_release_threshold`: if non-zero, sets `cudaMemPoolAttrReleaseThreshold`. **Recommended: 1 MB.**
39-
* - `initial_pool_size_bytes`: if > 0, pre‑reserve pool capacity by setting
38+
* - `pool_release_threshold`: if non-zero, sets `cudaMemPoolAttrReleaseThreshold`. **Recommended: 1 MB**, but
39+
* must be experimentally determined based on workload for optimal memory consumption vs performance.
4040
* `cudaMemPoolAttrReservedMemCurrent`. **Recommended: 10 MB.**
41+
* - `bytes_to_keep_on_shrink`: target size for `cudaMemPoolTrimTo()` on `Shrink()`. This is only relevant
42+
* if Shrink() is enabled. It usually costs performance, and strictly speaking is not necessary for CUDA mempools
43+
* since they release memory at synchronization points according to `pool_release_threshold`.
4144
*
4245
* ### Thread-safety
4346
* - All updates to internal maps and statistics are guarded by an internal `std::mutex`.
@@ -122,11 +125,16 @@ class CudaMempoolArena final : public IArena {
122125
// void ReleaseStreamBuffers(Stream* stream) override;
123126

124127
/**
125-
* @brief Trim the pool to `bytes_to_keep` (configured at construction) using `cudaMemPoolTrimTo()`.
128+
* @brief Trim the pool to `bytes_to_keep_on_shrink_` (configured at construction) using `cudaMemPoolTrimTo()`.
129+
* Memory still allocated is not affected. Shrink() may affect your performance and, contrary to BFCArena,
130+
* this allocator does not need Shrink. The CUDA mempool is capable of releasing memory automatically
131+
* according to pool_release_threshold_ set at construction.
126132
* Also rehashes internal maps under lock to keep them reasonably sized.
127133
*/
128134
Status Shrink() override;
129135

136+
static bool IsCudaVersionSupported() noexcept;
137+
130138
ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(CudaMempoolArena);
131139

132140
private:

onnxruntime/test/framework/bfc_arena_test.cc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -339,7 +339,7 @@ struct StreamMock : public Stream {
339339

340340
#ifdef ORT_ENABLE_STREAM
341341
TEST(StreamAwareArenaTest, TwoStreamAllocation) {
342-
StreamAwareArena a(std::unique_ptr<IAllocator>(new CPUAllocator()), 1 << 30);
342+
StreamAwareBFCArena a(std::unique_ptr<IAllocator>(new CPUAllocator()), 1 << 30);
343343
CheckStats(&a, 0, 0, 0, 0);
344344

345345
OrtDevice tmp;

onnxruntime/test/framework/session_state_test.cc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -405,7 +405,7 @@ TEST(SessionStateTest, TestInitializerMemoryAllocatedUsingNonArenaMemory) {
405405
// One reserve call should have been made (for allocating memory for the sole initializer in the model)
406406
ASSERT_EQ(1, alloc_stats.num_reserves);
407407

408-
// This counter comes from Reserve(). The actual call for arena based allocator went to StreamAwareArena instance
408+
// This counter comes from Reserve(). The actual call for arena based allocator went to StreamAwareBFCArena instance
409409
ASSERT_EQ(1, alloc_stats.num_allocs);
410410
}
411411
}

onnxruntime/test/providers/cuda/cuda_mempool_arena_test.cc

Lines changed: 23 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -25,26 +25,33 @@ namespace test {
2525
static bool IsCudaMemPoolSupported() {
2626
int ort_cuda_rt_version = 0;
2727
cudaError_t cuda_status = cudaRuntimeGetVersion(&ort_cuda_rt_version);
28-
bool version_supported = (cuda_status == cudaSuccess && ort_cuda_rt_version >= 11020);
29-
if (!version_supported) {
28+
if (cuda_status != cudaSuccess) {
3029
return false;
3130
}
32-
// Creating a cuda mempool in some pipelines fails with
33-
// CUDA failure 801: operation not supported ; GPU=0 ; hostname=af14bbb1c000000 ;
34-
// Even though CUDA version may be 12.8 possibly due to the driver.
35-
cudaMemPoolProps props{};
36-
// Pinned is not the same as pinned allocator, cudaMemLocationTypeDevice actually does not exist
37-
// even though is present in some internet docs.
38-
props.allocType = cudaMemAllocationTypePinned;
39-
props.handleTypes = cudaMemHandleTypeNone; // local to process
40-
props.location.type = cudaMemLocationTypeDevice; // Device memory
41-
props.location.id = 0; // test device 0
42-
cudaMemPool_t pool;
43-
auto cuda_error = cudaMemPoolCreate(&pool, &props);
44-
if (cuda_error != cudaSuccess) {
31+
32+
if (ort_cuda_rt_version < 11020) {
33+
return false;
34+
}
35+
36+
int ort_cuda_driver_version = 0;
37+
cuda_status = cudaDriverGetVersion(&ort_cuda_driver_version);
38+
if (cuda_status != cudaSuccess) {
39+
return false;
40+
}
41+
42+
if (ort_cuda_driver_version < 11020) {
4543
return false;
4644
}
47-
cuda_error = cudaMemPoolDestroy(pool);
45+
46+
// Check if the driver version supports the runtime version
47+
if (ort_cuda_rt_version >= 12000 && ort_cuda_driver_version < 12000) {
48+
return false;
49+
}
50+
51+
if (ort_cuda_rt_version >= 13000 && ort_cuda_driver_version < 13000) {
52+
return false;
53+
}
54+
4855
return true;
4956
}
5057

0 commit comments

Comments
 (0)