4 changes: 2 additions & 2 deletions cpp/benchmarks/core/HashMap.cpp
@@ -253,8 +253,8 @@ void HashReserveInt(benchmark::State& state,

class Int3 {
public:
Int3() : x_(0), y_(0), z_(0) {};
Int3(int k) : x_(k), y_(k * 2), z_(k * 4) {};
Int3() : x_(0), y_(0), z_(0){};
Int3(int k) : x_(k), y_(k * 2), z_(k * 4){};
bool operator==(const Int3& other) const {
return x_ == other.x_ && y_ == other.y_ && z_ == other.z_;
}
123 changes: 82 additions & 41 deletions cpp/open3d/core/CUDAUtils.cpp
@@ -11,6 +11,8 @@
#include "open3d/utility/Logging.h"

#ifdef BUILD_CUDA_MODULE
#include <unordered_set>

#include "open3d/core/MemoryManager.h"
#endif

@@ -141,41 +143,60 @@ static void SetDevice(int device_id) {
OPEN3D_CUDA_CHECK(cudaSetDevice(device_id));
}

class CUDAStream {
public:
static CUDAStream& GetInstance() {
// The global stream state is given per thread like CUDA's internal
// device state.
static thread_local CUDAStream instance;
return instance;
}
void Synchronize(const CUDAStream& stream) {
OPEN3D_CUDA_CHECK(cudaStreamSynchronize(stream.Get()));
}

cudaStream_t Get() { return stream_; }
void Set(cudaStream_t stream) { stream_ = stream; }
#endif

static cudaStream_t Default() { return static_cast<cudaStream_t>(0); }
} // namespace cuda

private:
CUDAStream() = default;
CUDAStream(const CUDAStream&) = delete;
CUDAStream& operator=(const CUDAStream&) = delete;
#ifdef BUILD_CUDA_MODULE

cudaStream_t stream_ = Default();
};
CUDAStream& CUDAStream::GetInstance() {
// The global stream state is given per thread like CUDA's internal
// device state.
thread_local CUDAStream instance = CUDAStream::Default();
return instance;
}

cudaStream_t GetStream() { return CUDAStream::GetInstance().Get(); }
CUDAStream CUDAStream::CreateNew() {
CUDAStream stream;
OPEN3D_CUDA_CHECK(cudaStreamCreate(&stream.stream_));
return stream;
}

static void SetStream(cudaStream_t stream) {
CUDAStream::GetInstance().Set(stream);
void CUDAStream::SetHostToDeviceMemcpyPolicy(CUDAMemoryCopyPolicy policy) {
OPEN3D_ASSERT(!IsDefaultStream());
memcpy_from_host_to_device_ = policy;
}

cudaStream_t GetDefaultStream() { return CUDAStream::Default(); }
CUDAMemoryCopyPolicy CUDAStream::GetHostToDeviceMemcpyPolicy() const {
return memcpy_from_host_to_device_;
}

#endif
CUDAMemoryCopyPolicy CUDAStream::GetDeviceToHostMemcpyPolicy() const {
return memcpy_from_device_to_host_;
}

} // namespace cuda
void CUDAStream::SetDeviceToHostMemcpyPolicy(CUDAMemoryCopyPolicy policy) {
OPEN3D_ASSERT(!IsDefaultStream());
memcpy_from_device_to_host_ = policy;
}

#ifdef BUILD_CUDA_MODULE
bool CUDAStream::IsDefaultStream() const {
return stream_ == static_cast<cudaStream_t>(nullptr);
}

cudaStream_t CUDAStream::Get() const { return stream_; }

void CUDAStream::Set(cudaStream_t stream) { stream_ = stream; }

void CUDAStream::Destroy() {
OPEN3D_ASSERT(!IsDefaultStream());
OPEN3D_CUDA_CHECK(cudaStreamDestroy(stream_));
*this = CUDAStream::Default();
}
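
For reference, a minimal sketch of the stream lifecycle that these member functions implement; the helper name `StreamLifecycleSketch` and the asserts are illustrative and not part of the diff:

```cpp
#include <cassert>

#include "open3d/core/CUDAUtils.h"

using namespace open3d::core;

// Illustrative only: create a non-default stream, use it, destroy it.
void StreamLifecycleSketch() {
    CUDAStream stream = CUDAStream::CreateNew();  // wraps cudaStreamCreate()
    assert(!stream.IsDefaultStream());  // a freshly created stream is never the default
    // ... enqueue work on stream.Get() ...
    stream.Destroy();                   // wraps cudaStreamDestroy()
    assert(stream.IsDefaultStream());   // Destroy() resets *this to the default stream
}
```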

CUDAScopedDevice::CUDAScopedDevice(int device_id)
: prev_device_id_(cuda::GetDevice()) {
@@ -189,27 +210,22 @@ CUDAScopedDevice::CUDAScopedDevice(const Device& device)

CUDAScopedDevice::~CUDAScopedDevice() { cuda::SetDevice(prev_device_id_); }

constexpr CUDAScopedStream::CreateNewStreamTag
CUDAScopedStream::CreateNewStream;

CUDAScopedStream::CUDAScopedStream(const CreateNewStreamTag&)
: prev_stream_(cuda::GetStream()), owns_new_stream_(true) {
OPEN3D_CUDA_CHECK(cudaStreamCreate(&new_stream_));
cuda::SetStream(new_stream_);
}

CUDAScopedStream::CUDAScopedStream(cudaStream_t stream)
: prev_stream_(cuda::GetStream()),
CUDAScopedStream::CUDAScopedStream(CUDAStream stream, bool destroy_on_exit)
: prev_stream_(CUDAStream::GetInstance()),
new_stream_(stream),
owns_new_stream_(false) {
cuda::SetStream(stream);
owns_new_stream_(destroy_on_exit) {
CUDAStream::GetInstance() = new_stream_;
}

CUDAScopedStream::~CUDAScopedStream() {
if (owns_new_stream_) {
OPEN3D_CUDA_CHECK(cudaStreamDestroy(new_stream_));
OPEN3D_ASSERT((prev_stream_.Get() != new_stream_.Get()) &&
"CUDAScopedStream destroy_on_exit would destroy the same "
"stream which was in place before the scoped stream was "
"created.");
new_stream_.Destroy();
}
cuda::SetStream(prev_stream_);
CUDAStream::GetInstance() = prev_stream_;
}
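
A sketch of the swap-and-restore behavior implemented above, assuming a CUDA-enabled build; the function name `ScopedStreamSketch` is illustrative:

```cpp
#include <cassert>

#include "open3d/core/CUDAUtils.h"

using namespace open3d::core;

// Illustrative only: the thread-local current stream is swapped for the scope
// and restored afterwards.
void ScopedStreamSketch() {
    cudaStream_t before = CUDAStream::GetInstance().Get();
    {
        // destroy_on_exit = true: the scoped stream owns the new stream and
        // destroys it when the scope ends.
        CUDAScopedStream scoped(CUDAStream::CreateNew(), /*destroy_on_exit=*/true);
        // Work submitted here runs on the new stream via CUDAStream::GetInstance().
    }
    // The previous current stream is restored on scope exit.
    assert(CUDAStream::GetInstance().Get() == before);
}
```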

CUDAState& CUDAState::GetInstance() {
@@ -304,10 +320,35 @@ size_t GetCUDACurrentTotalMemSize() {
namespace open3d {
namespace core {

const std::unordered_set<cudaError_t> kProcessEndingErrors = {
cudaErrorAssert,
cudaErrorLaunchTimeout,
cudaErrorHardwareStackError,
cudaErrorIllegalInstruction,
cudaErrorMisalignedAddress,
cudaErrorInvalidAddressSpace,
cudaErrorInvalidPc,
cudaErrorTensorMemoryLeak,
cudaErrorMpsClientTerminated,
cudaErrorExternalDevice,
cudaErrorContained,
cudaErrorIllegalAddress,
cudaErrorLaunchFailure,
cudaErrorECCUncorrectable,
cudaErrorUnknown};

void __OPEN3D_CUDA_CHECK(cudaError_t err, const char* file, const int line) {
if (err != cudaSuccess) {
utility::LogError("{}:{} CUDA runtime error: {}", file, line,
cudaGetErrorString(err));
if (kProcessEndingErrors.count(err)) {
utility::LogError(
"{}:{} CUDA runtime error: {}. This is a process-ending "
"error. All further operations will fail and the process "
"needs to be relaunched to be able to use CUDA.",
file, line, cudaGetErrorString(err));
} else {
utility::LogError("{}:{} CUDA runtime error: {}", file, line,
cudaGetErrorString(err));
}
}
}
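
As a usage reminder (not part of the diff), this function is normally reached through the `OPEN3D_CUDA_CHECK` macro used throughout the diff, which presumably expands to `__OPEN3D_CUDA_CHECK(expr, __FILE__, __LINE__)`; the sketch below assumes that macro and a CUDA-enabled build:

```cpp
#include "open3d/core/CUDAUtils.h"

// Illustrative only: a recoverable failure (e.g. cudaErrorMemoryAllocation)
// produces the plain error message, while a sticky error such as
// cudaErrorIllegalAddress now produces the process-ending message added above.
void AllocSketch() {
    void* ptr = nullptr;
    OPEN3D_CUDA_CHECK(cudaMalloc(&ptr, 1024));
    OPEN3D_CUDA_CHECK(cudaFree(ptr));
}
```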

142 changes: 125 additions & 17 deletions cpp/open3d/core/CUDAUtils.h
@@ -57,6 +57,124 @@ namespace core {

#ifdef BUILD_CUDA_MODULE

/// \enum CUDAMemoryCopyPolicy
///
/// Specifies the synchronization behavior of memory copies between the host
/// and the device.
///
enum class CUDAMemoryCopyPolicy {
// Default.
// Ensure all memory copy operations are finished by synchronizing the CUDA
// stream on which the copy occurred.
Sync = 0,
// Asynchronous memory copies. Unmanaged.
// No memory safety at all - you are responsible for your own actions.
// There are no guarantees about the lifetime of memory copied between the
// host and the device. If memory is freed before the copy finishes, you
// *will* have serious memory issues.
Async = 2
};

/// \class CUDAStream
///
/// An Open3D representation of a CUDA stream.
///
class CUDAStream {
public:
static CUDAStream& GetInstance();

/// Creates a new CUDA stream.
/// The caller is responsible for eventually destroying the stream by
/// calling Destroy().
static CUDAStream CreateNew();

/// Explicitly constructs a default stream. The default constructor could be
/// used, but this is clearer and closer to the old API.
static CUDAStream Default() { return {}; }

/// Default constructor. Refers to the default CUDA stream.
CUDAStream() = default;

/// Sets the behavior of memory copy operations device->host.
/// Sync by default. The default CUDA stream is implicitly synchronized with
/// every other stream. As such, it is invalid to call this function on the
/// default stream.
/// \param policy The desired behavior.
///
/// Having non-synchronous memory
/// copy from device to host can result in memory corruption and various
/// other problems if you do not know what you are doing. Example:
/// ```cpp
/// void pokingTheBear() {
/// CUDAScopedStream scoped_stream(CUDAStream::CreateNew(), true);
/// CUDAStream::GetInstance().SetDeviceToHostMemcpyPolicy(CUDAMemoryCopyPolicy::Async);
/// Tensor foo = Tensor::Init<float>({0.f}, "CUDA:0");
/// Tensor foo_cpu = foo.To("CPU:0"); // launches an async copy from
/// // device to cpu memory owned by foo_cpu. Until the async copy
/// // completes, the memory will be uninitialized (random garbage).
/// // Any operations on foo_cpu would be undefined here, as you cannot
/// // be sure whether the async memcpy has finished.
/// cuda::Synchronize(CUDAStream::GetInstance()); // force a manual sync
/// // It is now safe to perform operations on foo_cpu
/// }
/// ```
void SetDeviceToHostMemcpyPolicy(CUDAMemoryCopyPolicy policy);

/// Returns the current value of the memory synchronization flag for
/// device->host memory copies. The default stream will always return Sync,
/// because it is implicitly synchronized.
CUDAMemoryCopyPolicy GetDeviceToHostMemcpyPolicy() const;

/// Sets the behavior of memory copy operations host->device.
/// Sync by default. The default CUDA stream is implicitly synchronized with
/// every other stream. As such, it is invalid to call this function on the
/// default stream.
/// \param policy The desired behavior.
/// Having non-synchronous memory copy from host to device can result in
/// memory corruption and various other problems if you do not know what you
/// are doing. Example:
/// ```cpp
/// void pokingTheBear() {
/// CUDAScopedStream scoped_stream(CUDAStream::CreateNew(), true);
/// CUDAStream::GetInstance().SetHostToDeviceMemcpyPolicy(CUDAMemoryCopyPolicy::Async);
/// Tensor foo;
/// {
/// Tensor foo_cpu = Tensor::Init<float>({-1.f});
/// foo = foo_cpu.To("CUDA:0"); // launches an async copy from
/// // foo_cpu to foo
/// }
/// // foo_cpu goes out of scope; there is no guarantee that the data will
/// // be copied to the device memory pointed to by 'foo' before free is
/// // called on the host. CUDA may throw an illegal memory access
/// // error.
/// }
/// ```
void SetHostToDeviceMemcpyPolicy(CUDAMemoryCopyPolicy policy);

/// Returns the current value of the memory synchronization flag for
/// host->device memory copies. The default stream will always return Sync,
/// because it is implicitly synchronized.
CUDAMemoryCopyPolicy GetHostToDeviceMemcpyPolicy() const;

/// Returns true if this refers to the default CUDA stream.
bool IsDefaultStream() const;

cudaStream_t Get() const;
void Set(cudaStream_t stream);

/// Destroys the underlying CUDA stream. It is invalid to call this on the
/// default stream. After this call, this object refers to the default
/// stream.
void Destroy();

private:
cudaStream_t stream_ = static_cast<cudaStream_t>(nullptr);
CUDAMemoryCopyPolicy memcpy_from_device_to_host_ =
CUDAMemoryCopyPolicy::Sync;
CUDAMemoryCopyPolicy memcpy_from_host_to_device_ =
CUDAMemoryCopyPolicy::Sync;
};
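
A sketch of the thread-local behavior documented above; `PerThreadStreamSketch` and the asserts are illustrative, assuming a CUDA-enabled build:

```cpp
#include <cassert>
#include <thread>

#include "open3d/core/CUDAUtils.h"

using namespace open3d::core;

// Illustrative only: each thread has its own current stream, and the default
// stream always reports the Sync copy policy.
void PerThreadStreamSketch() {
    assert(CUDAStream::GetInstance().IsDefaultStream());
    assert(CUDAStream::GetInstance().GetDeviceToHostMemcpyPolicy() ==
           CUDAMemoryCopyPolicy::Sync);

    std::thread worker([] {
        // A new thread starts with its own, independent default stream.
        assert(CUDAStream::GetInstance().IsDefaultStream());
        CUDAScopedStream scoped(CUDAStream::CreateNew(), /*destroy_on_exit=*/true);
        // Policies may only be changed on a non-default stream.
        CUDAStream::GetInstance().SetDeviceToHostMemcpyPolicy(
                CUDAMemoryCopyPolicy::Async);
    });
    worker.join();

    // The main thread's current stream and its policies are unaffected.
    assert(CUDAStream::GetInstance().IsDefaultStream());
}
```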

/// \class CUDAScopedDevice
///
/// Switch CUDA device id in the current scope. The device id will be reset
@@ -135,29 +253,17 @@ class CUDAScopedDevice {
/// }
/// ```
class CUDAScopedStream {
private:
struct CreateNewStreamTag {
CreateNewStreamTag(const CreateNewStreamTag&) = delete;
CreateNewStreamTag& operator=(const CreateNewStreamTag&) = delete;
CreateNewStreamTag(CreateNewStreamTag&&) = delete;
CreateNewStreamTag& operator=(CreateNewStreamTag&&) = delete;
};

public:
constexpr static CreateNewStreamTag CreateNewStream = {};

explicit CUDAScopedStream(const CreateNewStreamTag&);

explicit CUDAScopedStream(cudaStream_t stream);
explicit CUDAScopedStream(CUDAStream stream, bool destroy_on_exit = false);

~CUDAScopedStream();

CUDAScopedStream(const CUDAScopedStream&) = delete;
CUDAScopedStream& operator=(const CUDAScopedStream&) = delete;

private:
cudaStream_t prev_stream_;
cudaStream_t new_stream_;
CUDAStream prev_stream_;
CUDAStream new_stream_;
bool owns_new_stream_ = false;
};
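
In addition to the example above, destroy_on_exit = false (the default) makes it possible to temporarily adopt a stream owned elsewhere; the wrapper function below is illustrative, not part of the API:

```cpp
// Illustrative only: wrap an externally owned cudaStream_t for one scope
// without destroying it on exit.
void RunOnExternalStream(cudaStream_t external_stream) {
    CUDAStream wrapped;
    wrapped.Set(external_stream);
    CUDAScopedStream scoped(wrapped);  // destroy_on_exit defaults to false
    // Open3D work in this scope is submitted to external_stream; the caller
    // remains responsible for destroying that stream.
}
```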

@@ -265,8 +371,10 @@ bool SupportsMemoryPools(const Device& device);
#ifdef BUILD_CUDA_MODULE

int GetDevice();
cudaStream_t GetStream();
cudaStream_t GetDefaultStream();

/// Calls cudaStreamSynchronize() for the specified CUDA stream.
/// \param stream The stream to be synchronized.
void Synchronize(const CUDAStream& stream);

#endif

2 changes: 1 addition & 1 deletion cpp/open3d/core/Indexer.h
@@ -638,7 +638,7 @@ class Indexer {
class IndexerIterator {
public:
struct Iterator {
Iterator() {};
Iterator(){};
Iterator(const Indexer& indexer);
Iterator(Iterator&& other) = default;
