4 changes: 2 additions & 2 deletions cpp/benchmarks/core/HashMap.cpp
@@ -253,8 +253,8 @@ void HashReserveInt(benchmark::State& state,

class Int3 {
public:
Int3() : x_(0), y_(0), z_(0) {};
Int3(int k) : x_(k), y_(k * 2), z_(k * 4) {};
Int3() : x_(0), y_(0), z_(0){};
Int3(int k) : x_(k), y_(k * 2), z_(k * 4){};
bool operator==(const Int3& other) const {
return x_ == other.x_ && y_ == other.y_ && z_ == other.z_;
}
123 changes: 82 additions & 41 deletions cpp/open3d/core/CUDAUtils.cpp
@@ -11,6 +11,8 @@
#include "open3d/utility/Logging.h"

#ifdef BUILD_CUDA_MODULE
#include <unordered_set>

#include "open3d/core/MemoryManager.h"
#endif

@@ -141,41 +143,60 @@ static void SetDevice(int device_id) {
OPEN3D_CUDA_CHECK(cudaSetDevice(device_id));
}

class CUDAStream {
public:
static CUDAStream& GetInstance() {
// The global stream state is given per thread like CUDA's internal
// device state.
static thread_local CUDAStream instance;
return instance;
}
void Synchronize(const CUDAStream& stream) {
OPEN3D_CUDA_CHECK(cudaStreamSynchronize(stream.Get()));
}

cudaStream_t Get() { return stream_; }
void Set(cudaStream_t stream) { stream_ = stream; }
#endif

static cudaStream_t Default() { return static_cast<cudaStream_t>(0); }
} // namespace cuda

private:
CUDAStream() = default;
CUDAStream(const CUDAStream&) = delete;
CUDAStream& operator=(const CUDAStream&) = delete;
#ifdef BUILD_CUDA_MODULE

cudaStream_t stream_ = Default();
};
CUDAStream& CUDAStream::GetInstance() {
// The global stream state is given per thread like CUDA's internal
// device state.
thread_local CUDAStream instance = CUDAStream::Default();
return instance;
}

cudaStream_t GetStream() { return CUDAStream::GetInstance().Get(); }
CUDAStream CUDAStream::CreateNew() {
CUDAStream stream;
OPEN3D_CUDA_CHECK(cudaStreamCreate(&stream.stream_));
return stream;
}

static void SetStream(cudaStream_t stream) {
CUDAStream::GetInstance().Set(stream);
void CUDAStream::SetHostToDeviceMemcpyPolicy(CUDAMemoryCopyPolicy policy) {
OPEN3D_ASSERT(!IsDefaultStream());
memcpy_from_host_to_device_ = policy;
}

cudaStream_t GetDefaultStream() { return CUDAStream::Default(); }
CUDAMemoryCopyPolicy CUDAStream::GetHostToDeviceMemcpyPolicy() const {
return memcpy_from_host_to_device_;
}

#endif
CUDAMemoryCopyPolicy CUDAStream::GetDeviceToHostMemcpyPolicy() const {
return memcpy_from_device_to_host_;
}

} // namespace cuda
void CUDAStream::SetDeviceToHostMemcpyPolicy(CUDAMemoryCopyPolicy policy) {
OPEN3D_ASSERT(!IsDefaultStream());
memcpy_from_device_to_host_ = policy;
}

#ifdef BUILD_CUDA_MODULE
bool CUDAStream::IsDefaultStream() const {
return stream_ == static_cast<cudaStream_t>(nullptr);
}

cudaStream_t CUDAStream::Get() const { return stream_; }

void CUDAStream::Set(cudaStream_t stream) { stream_ = stream; }

void CUDAStream::Destroy() {
OPEN3D_ASSERT(!IsDefaultStream());
OPEN3D_CUDA_CHECK(cudaStreamDestroy(stream_));
*this = CUDAStream::Default();
}
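
For reference, a minimal sketch of the stream lifecycle that these member functions implement; the helper name `StreamLifecycleSketch` and the asserts are illustrative and not part of the diff:

```cpp
#include <cassert>

#include "open3d/core/CUDAUtils.h"

using namespace open3d::core;

// Illustrative only: create a non-default stream, use it, destroy it.
void StreamLifecycleSketch() {
    CUDAStream stream = CUDAStream::CreateNew();  // wraps cudaStreamCreate()
    assert(!stream.IsDefaultStream());  // a freshly created stream is never the default
    // ... enqueue work on stream.Get() ...
    stream.Destroy();                   // wraps cudaStreamDestroy()
    assert(stream.IsDefaultStream());   // Destroy() resets *this to the default stream
}
```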

CUDAScopedDevice::CUDAScopedDevice(int device_id)
: prev_device_id_(cuda::GetDevice()) {
@@ -189,27 +210,22 @@ CUDAScopedDevice::CUDAScopedDevice(const Device& device)

CUDAScopedDevice::~CUDAScopedDevice() { cuda::SetDevice(prev_device_id_); }

constexpr CUDAScopedStream::CreateNewStreamTag
CUDAScopedStream::CreateNewStream;

CUDAScopedStream::CUDAScopedStream(const CreateNewStreamTag&)
: prev_stream_(cuda::GetStream()), owns_new_stream_(true) {
OPEN3D_CUDA_CHECK(cudaStreamCreate(&new_stream_));
cuda::SetStream(new_stream_);
}

CUDAScopedStream::CUDAScopedStream(cudaStream_t stream)
: prev_stream_(cuda::GetStream()),
CUDAScopedStream::CUDAScopedStream(CUDAStream stream, bool destroy_on_exit)
: prev_stream_(CUDAStream::GetInstance()),
new_stream_(stream),
owns_new_stream_(false) {
cuda::SetStream(stream);
owns_new_stream_(destroy_on_exit) {
CUDAStream::GetInstance() = new_stream_;
}

CUDAScopedStream::~CUDAScopedStream() {
if (owns_new_stream_) {
OPEN3D_CUDA_CHECK(cudaStreamDestroy(new_stream_));
OPEN3D_ASSERT((prev_stream_.Get() != new_stream_.Get()) &&
"CUDAScopedStream destroy_on_exit would destroy the same "
"stream which was in place before the scoped stream was "
"created.");
new_stream_.Destroy();
}
cuda::SetStream(prev_stream_);
CUDAStream::GetInstance() = prev_stream_;
}
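
A sketch of the swap-and-restore behavior implemented above, assuming a CUDA-enabled build; the function name `ScopedStreamSketch` is illustrative:

```cpp
#include <cassert>

#include "open3d/core/CUDAUtils.h"

using namespace open3d::core;

// Illustrative only: the thread-local current stream is swapped for the scope
// and restored afterwards.
void ScopedStreamSketch() {
    cudaStream_t before = CUDAStream::GetInstance().Get();
    {
        // destroy_on_exit = true: the scoped stream owns the new stream and
        // destroys it when the scope ends.
        CUDAScopedStream scoped(CUDAStream::CreateNew(), /*destroy_on_exit=*/true);
        // Work submitted here runs on the new stream via CUDAStream::GetInstance().
    }
    // The previous current stream is restored on scope exit.
    assert(CUDAStream::GetInstance().Get() == before);
}
```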

CUDAState& CUDAState::GetInstance() {
@@ -304,10 +320,35 @@ size_t GetCUDACurrentTotalMemSize() {
namespace open3d {
namespace core {

const std::unordered_set<cudaError_t> kProcessEndingErrors = {
cudaErrorAssert,
cudaErrorLaunchTimeout,
cudaErrorHardwareStackError,
cudaErrorIllegalInstruction,
cudaErrorMisalignedAddress,
cudaErrorInvalidAddressSpace,
cudaErrorInvalidPc,
cudaErrorTensorMemoryLeak,
cudaErrorMpsClientTerminated,
cudaErrorExternalDevice,
cudaErrorContained,
cudaErrorIllegalAddress,
cudaErrorLaunchFailure,
cudaErrorECCUncorrectable,
cudaErrorUnknown};

void __OPEN3D_CUDA_CHECK(cudaError_t err, const char* file, const int line) {
if (err != cudaSuccess) {
utility::LogError("{}:{} CUDA runtime error: {}", file, line,
cudaGetErrorString(err));
if (kProcessEndingErrors.count(err)) {
utility::LogError(
"{}:{} CUDA runtime error: {}. This is a process-ending "
"error. All further operations will fail and the process "
"needs to be relaunched to be able to use CUDA.",
file, line, cudaGetErrorString(err));
} else {
utility::LogError("{}:{} CUDA runtime error: {}", file, line,
cudaGetErrorString(err));
}
}
}
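
As a usage reminder (not part of the diff), this function is normally reached through the `OPEN3D_CUDA_CHECK` macro used throughout the diff, which presumably expands to `__OPEN3D_CUDA_CHECK(expr, __FILE__, __LINE__)`; the sketch below assumes that macro and a CUDA-enabled build:

```cpp
#include "open3d/core/CUDAUtils.h"

// Illustrative only: a recoverable failure (e.g. cudaErrorMemoryAllocation)
// produces the plain error message, while a sticky error such as
// cudaErrorIllegalAddress now produces the process-ending message added above.
void AllocSketch() {
    void* ptr = nullptr;
    OPEN3D_CUDA_CHECK(cudaMalloc(&ptr, 1024));
    OPEN3D_CUDA_CHECK(cudaFree(ptr));
}
```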

142 changes: 125 additions & 17 deletions cpp/open3d/core/CUDAUtils.h
@@ -57,6 +57,124 @@ namespace core {

#ifdef BUILD_CUDA_MODULE

/// \enum CUDAMemoryCopyPolicy
///
/// Specifies the synchronization behavior of memory copies between the host
/// and the device.
///
enum class CUDAMemoryCopyPolicy {
// Default.
// Ensure all memory copy operations are finished by synchronizing the CUDA
// stream on which the copy occurred.
Sync = 0,
// Asynchronous memory copies. Unmanaged.
// No memory safety at all - you are responsible for your own actions.
// There are no guarantees about the lifetime of memory copied between the
// host and the device. If memory is freed before the copy finishes, you
// *will* have serious memory issues.
Async = 2
};

/// \class CUDAStream
///
/// An Open3D representation of a CUDA stream.
///
class CUDAStream {
public:
static CUDAStream& GetInstance();

/// Creates a new CUDA stream.
/// The caller is responsible for eventually destroying the stream by
/// calling Destroy().
static CUDAStream CreateNew();

/// Explicitly constructs a default stream. The default constructor could be
/// used, but this is clearer and closer to the old API.
static CUDAStream Default() { return {}; }

/// Default constructor. Refers to the default CUDA stream.
CUDAStream() = default;

/// Sets the behavior of memory copy operations device->host.
/// Sync by default. The default CUDA stream is implicitly synchronized with
/// every other stream. As such, it is invalid to call this function on the
/// default stream.
/// \param policy The desired behavior.
///
/// Having non-synchronous memory
/// copy from device to host can result in memory corruption and various
/// other problems if you do not know what you are doing. Example:
/// ```cpp
/// void pokingTheBear() {
/// CUDAScopedStream scoped_stream(CUDAStream::CreateNew(), true);
/// CUDAStream::GetInstance().SetDeviceToHostMemcpyPolicy(CUDAMemoryCopyPolicy::Async);
/// Tensor foo = Tensor::Init<float>({0.f}, "CUDA:0");
/// Tensor foo_cpu = foo.To("CPU:0"); // launches an async copy from
/// // device to cpu memory owned by foo_cpu. Until the async copy
/// // completes, the memory will be uninitialized (random garbage).
/// // Any operations on foo_cpu would be undefined here, as you cannot
/// // be sure whether the async memcpy has finished.
/// cuda::Synchronize(CUDAStream::GetInstance()); // force a manual sync
/// // It is now safe to perform operations on foo_cpu
/// }
/// ```
void SetDeviceToHostMemcpyPolicy(CUDAMemoryCopyPolicy policy);

/// Returns the current value of the memory synchronization flag for
/// device->host memory copies. The default stream will always return Sync,
/// because it is implicitly synchronized.
CUDAMemoryCopyPolicy GetDeviceToHostMemcpyPolicy() const;

/// Sets the behavior of memory copy operations host->device.
/// Sync by default. The default CUDA stream is implicitly synchronized with
/// every other stream. As such, it is invalid to call this function on the
/// default stream.
/// \param policy The desired behavior.
/// Having non-synchronous memory copy from host to device can result in
/// memory corruption and various other problems if you do not know what you
/// are doing. Example:
/// ```cpp
/// void pokingTheBear() {
/// CUDAScopedStream scoped_stream(CUDAStream::CreateNew(), true);
/// CUDAStream::GetInstance().SetHostToDeviceMemcpyPolicy(CUDAMemoryCopyPolicy::Async);
/// Tensor foo;
/// {
/// Tensor foo_cpu = Tensor::Init<float>({-1.f});
/// foo = foo_cpu.To("CUDA:0"); // launches an async copy from
/// // foo_cpu to foo
/// }
/// // foo_cpu goes out of scope; there is no guarantee that the data will
/// // be copied to the device memory pointed to by 'foo' before free is
/// // called on the host. CUDA may throw an illegal memory access
/// // error.
/// }
/// ```
void SetHostToDeviceMemcpyPolicy(CUDAMemoryCopyPolicy policy);

/// Returns the current value of the memory synchronization flag for
/// host->device memory copies. The default stream will always return Sync,
/// because it is implicitly synchronized.
CUDAMemoryCopyPolicy GetHostToDeviceMemcpyPolicy() const;

/// Returns true if this refers to the default CUDA stream.
bool IsDefaultStream() const;

cudaStream_t Get() const;
void Set(cudaStream_t stream);

/// Destroys the underlying CUDA stream. It is invalid to call this on the
/// default stream. After this call, this object refers to the default
/// stream.
void Destroy();

private:
cudaStream_t stream_ = static_cast<cudaStream_t>(nullptr);
CUDAMemoryCopyPolicy memcpy_from_device_to_host_ =
CUDAMemoryCopyPolicy::Sync;
CUDAMemoryCopyPolicy memcpy_from_host_to_device_ =
CUDAMemoryCopyPolicy::Sync;
};
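
A sketch of the thread-local behavior documented above; `PerThreadStreamSketch` and the asserts are illustrative, assuming a CUDA-enabled build:

```cpp
#include <cassert>
#include <thread>

#include "open3d/core/CUDAUtils.h"

using namespace open3d::core;

// Illustrative only: each thread has its own current stream, and the default
// stream always reports the Sync copy policy.
void PerThreadStreamSketch() {
    assert(CUDAStream::GetInstance().IsDefaultStream());
    assert(CUDAStream::GetInstance().GetDeviceToHostMemcpyPolicy() ==
           CUDAMemoryCopyPolicy::Sync);

    std::thread worker([] {
        // A new thread starts with its own, independent default stream.
        assert(CUDAStream::GetInstance().IsDefaultStream());
        CUDAScopedStream scoped(CUDAStream::CreateNew(), /*destroy_on_exit=*/true);
        // Policies may only be changed on a non-default stream.
        CUDAStream::GetInstance().SetDeviceToHostMemcpyPolicy(
                CUDAMemoryCopyPolicy::Async);
    });
    worker.join();

    // The main thread's current stream and its policies are unaffected.
    assert(CUDAStream::GetInstance().IsDefaultStream());
}
```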

/// \class CUDAScopedDevice
///
/// Switch CUDA device id in the current scope. The device id will be reset
@@ -135,29 +253,17 @@ class CUDAScopedDevice {
/// }
/// ```
class CUDAScopedStream {
private:
struct CreateNewStreamTag {
CreateNewStreamTag(const CreateNewStreamTag&) = delete;
CreateNewStreamTag& operator=(const CreateNewStreamTag&) = delete;
CreateNewStreamTag(CreateNewStreamTag&&) = delete;
CreateNewStreamTag& operator=(CreateNewStreamTag&&) = delete;
};

public:
constexpr static CreateNewStreamTag CreateNewStream = {};

explicit CUDAScopedStream(const CreateNewStreamTag&);

explicit CUDAScopedStream(cudaStream_t stream);
explicit CUDAScopedStream(CUDAStream stream, bool destroy_on_exit = false);

~CUDAScopedStream();

CUDAScopedStream(const CUDAScopedStream&) = delete;
CUDAScopedStream& operator=(const CUDAScopedStream&) = delete;

private:
cudaStream_t prev_stream_;
cudaStream_t new_stream_;
CUDAStream prev_stream_;
CUDAStream new_stream_;
bool owns_new_stream_ = false;
};
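
In addition to the example above, destroy_on_exit = false (the default) makes it possible to temporarily adopt a stream owned elsewhere; the wrapper function below is illustrative, not part of the API:

```cpp
// Illustrative only: wrap an externally owned cudaStream_t for one scope
// without destroying it on exit.
void RunOnExternalStream(cudaStream_t external_stream) {
    CUDAStream wrapped;
    wrapped.Set(external_stream);
    CUDAScopedStream scoped(wrapped);  // destroy_on_exit defaults to false
    // Open3D work in this scope is submitted to external_stream; the caller
    // remains responsible for destroying that stream.
}
```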

@@ -265,8 +371,10 @@ bool SupportsMemoryPools(const Device& device);
#ifdef BUILD_CUDA_MODULE

int GetDevice();
cudaStream_t GetStream();
cudaStream_t GetDefaultStream();

/// Calls cudaStreamSynchronize() for the specified CUDA stream.
/// \param stream The stream to be synchronized.
void Synchronize(const CUDAStream& stream);

#endif

2 changes: 1 addition & 1 deletion cpp/open3d/core/Indexer.h
@@ -638,7 +638,7 @@ class Indexer {
class IndexerIterator {
public:
struct Iterator {
Iterator() {};
Iterator(){};
Iterator(const Indexer& indexer);
Iterator(Iterator&& other) = default;
