isl-org
diff --git a/‎cpp/benchmarks/core/HashMap.cpp‎
Lines changed: 2 additions & 2 deletions b/‎cpp/benchmarks/core/HashMap.cpp‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎cpp/open3d/core/CUDAUtils.cpp‎
Lines changed: 48 additions & 39 deletions b/‎cpp/open3d/core/CUDAUtils.cpp‎
Lines changed: 48 additions & 39 deletions
diff --git a/‎cpp/open3d/core/CUDAUtils.h‎
Lines changed: 66 additions & 17 deletions b/‎cpp/open3d/core/CUDAUtils.h‎
Lines changed: 66 additions & 17 deletions
diff --git a/‎cpp/open3d/core/Indexer.h‎
Lines changed: 1 addition & 1 deletion b/‎cpp/open3d/core/Indexer.h‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎cpp/open3d/core/MemoryManagerCUDA.cpp‎
Lines changed: 14 additions & 8 deletions b/‎cpp/open3d/core/MemoryManagerCUDA.cpp‎
Lines changed: 14 additions & 8 deletions
diff --git a/‎cpp/open3d/core/ParallelFor.h‎
Lines changed: 2 additions & 2 deletions b/‎cpp/open3d/core/ParallelFor.h‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎cpp/open3d/core/hashmap/CUDA/CUDAHashBackendBufferAccessor.h‎
Lines changed: 4 additions & 2 deletions b/‎cpp/open3d/core/hashmap/CUDA/CUDAHashBackendBufferAccessor.h‎
Lines changed: 4 additions & 2 deletions
@@ -253,8 +253,8 @@ void HashReserveInt(benchmark::State& state,
 
 class Int3 {
 public:
-    Int3() : x_(0), y_(0), z_(0) {};
-    Int3(int k) : x_(k), y_(k * 2), z_(k * 4) {};
+    Int3() : x_(0), y_(0), z_(0){};
+    Int3(int k) : x_(k), y_(k * 2), z_(k * 4){};
     bool operator==(const Int3& other) const {
         return x_ == other.x_ && y_ == other.y_ && z_ == other.z_;
     }
 
@@ -141,41 +141,55 @@ static void SetDevice(int device_id) {
     OPEN3D_CUDA_CHECK(cudaSetDevice(device_id));
 }
 
-class CUDAStream {
-public:
-    static CUDAStream& GetInstance() {
-        // The global stream state is given per thread like CUDA's internal
-        // device state.
-        static thread_local CUDAStream instance;
-        return instance;
-    }
+void Synchronize(const CUDAStream& stream) {
+    OPEN3D_CUDA_CHECK(cudaStreamSynchronize(stream.Get()));
+}
 
-    cudaStream_t Get() { return stream_; }
-    void Set(cudaStream_t stream) { stream_ = stream; }
+#endif
 
-    static cudaStream_t Default() { return static_cast<cudaStream_t>(0); }
+}  // namespace cuda
 
-private:
-    CUDAStream() = default;
-    CUDAStream(const CUDAStream&) = delete;
-    CUDAStream& operator=(const CUDAStream&) = delete;
+#ifdef BUILD_CUDA_MODULE
 
-    cudaStream_t stream_ = Default();
-};
+CUDAStream& CUDAStream::GetInstance() {
+    // The global stream state is given per thread like CUDA's internal
+    // device state.
+    thread_local CUDAStream instance = cuda::GetDefaultStream();
+    return instance;
+}
+
+CUDAStream CUDAStream::CreateNew() {
+    CUDAStream stream;
+    OPEN3D_CUDA_CHECK(cudaStreamCreate(&stream.stream_));
+    // Having async memcpy device->host is very dangerous if you don't know what
+    // you are doing.
+    stream.SetShouldSyncMemcpyFromDeviceToHost(true);
+    return stream;
+}
 
-cudaStream_t GetStream() { return CUDAStream::GetInstance().Get(); }
+void CUDAStream::SetShouldSyncMemcpyFromDeviceToHost(
+        bool sync_memcpy_device_to_host) {
+    OPEN3D_ASSERT(!IsDefaultStream());
+    sync_memcpy_from_device_to_host_ = sync_memcpy_device_to_host;
+}
 
-static void SetStream(cudaStream_t stream) {
-    CUDAStream::GetInstance().Set(stream);
+bool CUDAStream::ShouldSyncMemcpyFromDeviceToHost() const {
+    return sync_memcpy_from_device_to_host_;
 }
 
-cudaStream_t GetDefaultStream() { return CUDAStream::Default(); }
+bool CUDAStream::IsDefaultStream() const {
+    return stream_ == static_cast<cudaStream_t>(nullptr);
+}
 
-#endif
+cudaStream_t CUDAStream::Get() const { return stream_; }
 
-}  // namespace cuda
+void CUDAStream::Set(cudaStream_t stream) { stream_ = stream; }
 
-#ifdef BUILD_CUDA_MODULE
+void CUDAStream::Destroy() {
+    OPEN3D_ASSERT(!IsDefaultStream());
+    OPEN3D_CUDA_CHECK(cudaStreamDestroy(stream_));
+    *this = cuda::GetDefaultStream();
+}
 
 CUDAScopedDevice::CUDAScopedDevice(int device_id)
     : prev_device_id_(cuda::GetDevice()) {
@@ -189,27 +203,22 @@ CUDAScopedDevice::CUDAScopedDevice(const Device& device)
 
 CUDAScopedDevice::~CUDAScopedDevice() { cuda::SetDevice(prev_device_id_); }
 
-constexpr CUDAScopedStream::CreateNewStreamTag
-        CUDAScopedStream::CreateNewStream;
-
-CUDAScopedStream::CUDAScopedStream(const CreateNewStreamTag&)
-    : prev_stream_(cuda::GetStream()), owns_new_stream_(true) {
-    OPEN3D_CUDA_CHECK(cudaStreamCreate(&new_stream_));
-    cuda::SetStream(new_stream_);
-}
-
-CUDAScopedStream::CUDAScopedStream(cudaStream_t stream)
-    : prev_stream_(cuda::GetStream()),
+CUDAScopedStream::CUDAScopedStream(CUDAStream stream, bool destroy_on_exit)
+    : prev_stream_(CUDAStream::GetInstance()),
       new_stream_(stream),
-      owns_new_stream_(false) {
-    cuda::SetStream(stream);
+      owns_new_stream_(destroy_on_exit) {
+    CUDAStream::GetInstance() = new_stream_;
 }
 
 CUDAScopedStream::~CUDAScopedStream() {
     if (owns_new_stream_) {
-        OPEN3D_CUDA_CHECK(cudaStreamDestroy(new_stream_));
+        OPEN3D_ASSERT((prev_stream_.Get() != new_stream_.Get()) &&
+                      "CUDAScopedStream destroy_on_exit would destroy the same "
+                      "stream which was in place before the scoped stream was "
+                      "created.");
+        new_stream_.Destroy();
     }
-    cuda::SetStream(prev_stream_);
+    CUDAStream::GetInstance() = prev_stream_;
 }
 
 CUDAState& CUDAState::GetInstance() {
 
@@ -57,6 +57,64 @@ namespace core {
 
 #ifdef BUILD_CUDA_MODULE
 
+/// \class CUDAStream
+///
+/// An Open3D representation of a CUDA stream.
+///
+class CUDAStream {
+public:
+    static CUDAStream& GetInstance();
+
+    /// Creates a new CUDA stream.
+    /// The caller is responsible for eventually destroying the stream by
+    /// calling Destroy().
+    static CUDAStream CreateNew();
+
+    /// Sets the flag indicating if all memory copy operations done within
+    /// this stream from device -> host should be synchronized. True by
+    /// default. The default CUDA stream is implicitly synchronized with every
+    /// other stream. As such, it is invalid to call this function on the
+    /// default stream. \p sync_memcpy_device_to_host true or false to
+    /// enable or disable the synchronization. Having non-synchronous memory
+    /// copy from device to host can result in memory corruption and various
+    /// other problems if you do not know what you are doing. Example:
+    /// ```cpp
+    /// void pokingTheBear() {
+    ///     CUDAScopedStream scoped_stream(CUDAStream::CreateNew(), true);
+    ///     CUDAStream::GetInstance().SetShouldSyncMemcpyFromDeviceToHost(false);
+    ///     Tensor foo = Tensor::Init<float>({0.f}, "CUDA:0");
+    ///     Tensor foo_cpu = foo.To("CPU:0"); // launches an async copy from
+    ///     // device to cpu memory owned by foo_cpu. Until the async copy
+    ///     // completes, the memory will be uninitialized (random garbage).
+    ///     // Any operations on foo_cpu will be undefined here, as you cannot
+    ///     // be sure the async memcpy has finished or not
+    ///     cuda::Synchronize(CUDAStream::GetInstance()); // force a manual sync
+    ///     // It is now safe to perform operations on foo_cpu
+    /// }
+    /// ```
+    void SetShouldSyncMemcpyFromDeviceToHost(bool sync_memcpy_device_to_host);
+
+    /// Returns the current value of the memory synchronization flag.
+    /// The default stream will always return false, because it is implicitly
+    /// synchronized.
+    bool ShouldSyncMemcpyFromDeviceToHost() const;
+
+    /// Returns true if this refers to the default CUDA stream.
+    bool IsDefaultStream() const;
+
+    cudaStream_t Get() const;
+    void Set(cudaStream_t stream);
+
+    /// Destroys the underlying CUDA stream. It is invalid to call this on the
+    /// default stream. After this call, this object refers to the default
+    /// stream.
+    void Destroy();
+
+private:
+    cudaStream_t stream_ = static_cast<cudaStream_t>(nullptr);
+    bool sync_memcpy_from_device_to_host_ = false;
+};
+
 /// \class CUDAScopedDevice
 ///
 /// Switch CUDA device id in the current scope. The device id will be reset
@@ -135,29 +193,17 @@ class CUDAScopedDevice {
 /// }
 /// ```
 class CUDAScopedStream {
-private:
-    struct CreateNewStreamTag {
-        CreateNewStreamTag(const CreateNewStreamTag&) = delete;
-        CreateNewStreamTag& operator=(const CreateNewStreamTag&) = delete;
-        CreateNewStreamTag(CreateNewStreamTag&&) = delete;
-        CreateNewStreamTag& operator=(CreateNewStreamTag&&) = delete;
-    };
-
 public:
-    constexpr static CreateNewStreamTag CreateNewStream = {};
-
-    explicit CUDAScopedStream(const CreateNewStreamTag&);
-
-    explicit CUDAScopedStream(cudaStream_t stream);
+    explicit CUDAScopedStream(CUDAStream stream, bool destroy_on_exit = false);
 
     ~CUDAScopedStream();
 
     CUDAScopedStream(const CUDAScopedStream&) = delete;
     CUDAScopedStream& operator=(const CUDAScopedStream&) = delete;
 
 private:
-    cudaStream_t prev_stream_;
-    cudaStream_t new_stream_;
+    CUDAStream prev_stream_;
+    CUDAStream new_stream_;
     bool owns_new_stream_ = false;
 };
 
@@ -265,8 +311,11 @@ bool SupportsMemoryPools(const Device& device);
 #ifdef BUILD_CUDA_MODULE
 
 int GetDevice();
-cudaStream_t GetStream();
-cudaStream_t GetDefaultStream();
+CUDAStream GetDefaultStream();
+
+/// Calls cudaStreamSynchronize() for the specified CUDA stream.
+/// \param stream The stream to be synchronized.
+void Synchronize(const CUDAStream& stream);
 
 #endif
 
 
@@ -638,7 +638,7 @@ class Indexer {
 class IndexerIterator {
 public:
     struct Iterator {
-        Iterator() {};
+        Iterator(){};
         Iterator(const Indexer& indexer);
         Iterator(Iterator&& other) = default;
 
 
@@ -22,7 +22,8 @@ void* MemoryManagerCUDA::Malloc(size_t byte_size, const Device& device) {
 #if CUDART_VERSION >= 11020
         if (cuda::SupportsMemoryPools(device)) {
             OPEN3D_CUDA_CHECK(cudaMallocAsync(static_cast<void**>(&ptr),
-                                              byte_size, cuda::GetStream()));
+                                              byte_size,
+                                              CUDAStream::GetInstance().Get()));
         } else {
             OPEN3D_CUDA_CHECK(cudaMalloc(static_cast<void**>(&ptr), byte_size));
         }
@@ -43,7 +44,8 @@ void MemoryManagerCUDA::Free(void* ptr, const Device& device) {
         if (ptr && IsCUDAPointer(ptr, device)) {
 #if CUDART_VERSION >= 11020
             if (cuda::SupportsMemoryPools(device)) {
-                OPEN3D_CUDA_CHECK(cudaFreeAsync(ptr, cuda::GetStream()));
+                OPEN3D_CUDA_CHECK(
+                        cudaFreeAsync(ptr, CUDAStream::GetInstance().Get()));
             } else {
                 OPEN3D_CUDA_CHECK(cudaFree(ptr));
             }
@@ -62,22 +64,26 @@ void MemoryManagerCUDA::Memcpy(void* dst_ptr,
                                const void* src_ptr,
                                const Device& src_device,
                                size_t num_bytes) {
+    const CUDAStream& current_stream = CUDAStream::GetInstance();
     if (dst_device.IsCUDA() && src_device.IsCPU()) {
         if (!IsCUDAPointer(dst_ptr, dst_device)) {
             utility::LogError("dst_ptr is not a CUDA pointer.");
         }
         CUDAScopedDevice scoped_device(dst_device);
         OPEN3D_CUDA_CHECK(cudaMemcpyAsync(dst_ptr, src_ptr, num_bytes,
                                           cudaMemcpyHostToDevice,
-                                          cuda::GetStream()));
+                                          current_stream.Get()));
     } else if (dst_device.IsCPU() && src_device.IsCUDA()) {
         if (!IsCUDAPointer(src_ptr, src_device)) {
             utility::LogError("src_ptr is not a CUDA pointer.");
         }
         CUDAScopedDevice scoped_device(src_device);
         OPEN3D_CUDA_CHECK(cudaMemcpyAsync(dst_ptr, src_ptr, num_bytes,
                                           cudaMemcpyDeviceToHost,
-                                          cuda::GetStream()));
+                                          current_stream.Get()));
+        if (current_stream.ShouldSyncMemcpyFromDeviceToHost()) {
+            OPEN3D_CUDA_CHECK(cudaStreamSynchronize(current_stream.Get()));
+        }
     } else if (dst_device.IsCUDA() && src_device.IsCUDA()) {
         if (!IsCUDAPointer(dst_ptr, dst_device)) {
             utility::LogError("dst_ptr is not a CUDA pointer.");
@@ -90,25 +96,25 @@ void MemoryManagerCUDA::Memcpy(void* dst_ptr,
             CUDAScopedDevice scoped_device(src_device);
             OPEN3D_CUDA_CHECK(cudaMemcpyAsync(dst_ptr, src_ptr, num_bytes,
                                               cudaMemcpyDeviceToDevice,
-                                              cuda::GetStream()));
+                                              current_stream.Get()));
         } else if (CUDAState::GetInstance().IsP2PEnabled(src_device.GetID(),
                                                          dst_device.GetID())) {
             OPEN3D_CUDA_CHECK(cudaMemcpyPeerAsync(
                     dst_ptr, dst_device.GetID(), src_ptr, src_device.GetID(),
-                    num_bytes, cuda::GetStream()));
+                    num_bytes, current_stream.Get()));
         } else {
             void* cpu_buf = MemoryManager::Malloc(num_bytes, Device("CPU:0"));
             {
                 CUDAScopedDevice scoped_device(src_device);
                 OPEN3D_CUDA_CHECK(cudaMemcpyAsync(cpu_buf, src_ptr, num_bytes,
                                                   cudaMemcpyDeviceToHost,
-                                                  cuda::GetStream()));
+                                                  current_stream.Get()));
             }
             {
                 CUDAScopedDevice scoped_device(dst_device);
                 OPEN3D_CUDA_CHECK(cudaMemcpyAsync(dst_ptr, cpu_buf, num_bytes,
                                                   cudaMemcpyHostToDevice,
-                                                  cuda::GetStream()));
+                                                  current_stream.Get()));
             }
             MemoryManager::Free(cpu_buf, Device("CPU:0"));
         }
 
@@ -61,8 +61,8 @@ void ParallelForCUDA_(const Device& device, int64_t n, const func_t& func) {
     int64_t grid_size = (n + items_per_block - 1) / items_per_block;
 
     ElementWiseKernel_<OPEN3D_PARFOR_BLOCK, OPEN3D_PARFOR_THREAD>
-            <<<grid_size, OPEN3D_PARFOR_BLOCK, 0, core::cuda::GetStream()>>>(
-                    n, func);
+            <<<grid_size, OPEN3D_PARFOR_BLOCK, 0,
+               CUDAStream::GetInstance().Get()>>>(n, func);
     OPEN3D_GET_LAST_CUDA_ERROR("ParallelFor failed.");
 }
 
 
@@ -58,15 +58,17 @@ class CUDAHashBackendBufferAccessor {
         std::vector<uint8_t *> value_ptrs(n_values_);
         for (size_t i = 0; i < n_values_; ++i) {
             value_ptrs[i] = value_buffers[i].GetDataPtr<uint8_t>();
-            cudaMemset(value_ptrs[i], 0, capacity_ * value_dsizes_host[i]);
+            OPEN3D_CUDA_CHECK(cudaMemsetAsync(
+                    value_ptrs[i], 0, capacity_ * value_dsizes_host[i],
+                    core::CUDAStream::GetInstance().Get()));
         }
         values_ = static_cast<uint8_t **>(
                 MemoryManager::Malloc(n_values_ * sizeof(uint8_t *), device));
         MemoryManager::MemcpyFromHost(values_, device, value_ptrs.data(),
                                       n_values_ * sizeof(uint8_t *));
 
         heap_top_ = hashmap_buffer.GetHeapTop().cuda.GetDataPtr<int>();
-        cuda::Synchronize();
+        cuda::Synchronize(CUDAStream::GetInstance());
         OPEN3D_CUDA_CHECK(cudaGetLastError());
     }
Original file line number	Diff line number	Diff line change
`@@ -61,8 +61,8 @@ void ParallelForCUDA_(const Device& device, int64_t n, const func_t& func) {`
`61`	`61`	`int64_t grid_size = (n + items_per_block - 1) / items_per_block;`
`62`	`62`
`63`	`63`	`ElementWiseKernel_<OPEN3D_PARFOR_BLOCK, OPEN3D_PARFOR_THREAD>`
`64`		`- <<<grid_size, OPEN3D_PARFOR_BLOCK, 0, core::cuda::GetStream()>>>(`
`65`		`- n, func);`
	`64`	`+ <<<grid_size, OPEN3D_PARFOR_BLOCK, 0,`
	`65`	`+ CUDAStream::GetInstance().Get()>>>(n, func);`
`66`	`66`	`OPEN3D_GET_LAST_CUDA_ERROR("ParallelFor failed.");`
`67`	`67`	`}`
`68`	`68`