Fix multi-device GPU external source. (#3710)

mzient · web-flow · commit 023fe52a1d67 · 2022-02-28T22:14:28.000+01:00
* Fix multi-device GPU external source.
* Add cross-device test for PyTorch. Fix MXNet parallel external source test.

Signed-off-by: Michal Zientkiewicz <michalz@nvidia.com>
diff --git a/dali/core/access_order.cc b/dali/core/access_order.cc
@@ -16,6 +16,7 @@
 #include "dali/core/cuda_error.h"
 #include "dali/core/cuda_stream.h"
 #include "dali/core/cuda_event_pool.h"
+#include "dali/core/device_guard.h"
 
 namespace dali {
 
@@ -37,7 +38,18 @@ void AccessOrder::wait(const AccessOrder &other) const {
     int other_dev = other.device_id();
     auto event = pool.Get(other_dev);
     // Record an event in the preceding stream
-    CUDA_CALL(cudaEventRecord(event, other.stream()));
+
+    // If the stream handle has a special value, we can't refer to it directly - it is
+    // inherently associated with the concept of "current device" and it must be switched
+    if (other_dev != device_id_ &&
+        (other.stream_ == 0 ||
+         other.stream_ == cudaStreamPerThread ||
+         other.stream_ == cudaStreamLegacy)) {
+      DeviceGuard dg(other.device_id_);
+      CUDA_CALL(cudaEventRecord(event, other.stream()));
+    } else {
+      CUDA_CALL(cudaEventRecord(event, other.stream()));
+    }
     // and wait for it in this stream
     CUDA_CALL(cudaStreamWaitEvent(stream(), event, 0));
     pool.Put(std::move(event), other_dev);
diff --git a/dali/python/backend_impl.cc b/dali/python/backend_impl.cc
@@ -1619,14 +1619,23 @@ PYBIND11_MODULE(backend_impl, m) {
 
           // not the most beautiful but at least it doesn't throw as plain cast<T>()
           py::detail::make_caster<Tensor<CPUBackend>&> conv;
-          bool is_cpu_data = conv.load(static_cast<py::object>(list[0]), true);
+          bool is_cpu_data = list.empty() || conv.load(static_cast<py::object>(list[0]), true);
           if (is_cpu_data) {
             FeedPipeline<CPUBackend>(p, name, list, AccessOrder::host(), true);
           } else {
-            cudaStream_t stream = cuda_stream.is_none()
-                                ? UserStream::Get()->GetStream(list[0].cast<Tensor<GPUBackend>&>())
-                                : static_cast<cudaStream_t>(ctypes_void_ptr(cuda_stream));
-            FeedPipeline<GPUBackend>(p, name, list, stream, cuda_stream.is_none(), use_copy_kernel);
+            int device_id = p->device_id();
+            cudaStream_t stream = 0;
+            if (!cuda_stream.is_none())
+              stream = static_cast<cudaStream_t>(ctypes_void_ptr(cuda_stream));
+
+            if (!list.empty()) {
+              auto &sample0 = list[0].cast<Tensor<GPUBackend>&>();
+              if (cuda_stream.is_none())
+                stream = UserStream::Get()->GetStream(sample0);
+              device_id = sample0.device_id();
+            }
+            AccessOrder order(stream, device_id);
+            FeedPipeline<GPUBackend>(p, name, list, order, cuda_stream.is_none(), use_copy_kernel);
           }
         },
         "name"_a,
diff --git a/dali/python/nvidia/dali/_debug_mode.py b/dali/python/nvidia/dali/_debug_mode.py
@@ -126,8 +126,8 @@ def __rxor__(self, other):
         return _PipelineDebug.current()._wrap_op_call(_arithm_op, ["bitxor", other, self], {})
 
 
-def _transform_data_to_tensorlist(data, batch_size, layout=None):
-    data = _prep_data_for_feed_input(data, batch_size, layout)
+def _transform_data_to_tensorlist(data, batch_size, layout=None, device_id=None):
+    data = _prep_data_for_feed_input(data, batch_size, layout, device_id)
 
     if isinstance(data, list):
         if isinstance(data[0], _tensors.TensorGPU):
@@ -268,7 +268,7 @@ def __enter__(self):
 
     def build(self):
         """Build the pipeline.
-        
+
         Refer to :meth:`Pipeline.build() <nvidia.dali.Pipeline.build>` for details."""
         self._built = True
 
@@ -299,7 +299,7 @@ def run(self):
 
     def feed_input(self, data_node, data, **kwargs):
         """Pass data to an ExternalSource operator inside the pipeline.
-        
+
         Refer to :meth:`Pipeline.feed_input() <nvidia.dali.Pipeline.feed_input>` for details."""
         if not self._built:
             raise RuntimeError("Pipeline must be built first.")
diff --git a/dali/python/nvidia/dali/_utils/external_source_impl.py b/dali/python/nvidia/dali/_utils/external_source_impl.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -76,7 +76,7 @@ def assert_cpu_sample_data_type(sample, error_str="Unsupported callback return t
     if isinstance(sample, np.ndarray):
         return True
     if types._is_mxnet_array(sample):
-        if sample.ctx.device_type != 'cpu':
+        if sample.context.device_type != 'cpu':
             raise TypeError("Unsupported callback return type. "
                             "GPU tensors are not supported. Got an MXNet GPU tensor.")
         return True
@@ -111,7 +111,7 @@ def sample_to_numpy(sample, error_str="Unsupported callback return type. Got: `{
     if isinstance(sample, np.ndarray):
         return sample
     if types._is_mxnet_array(sample):
-        if sample.ctx.device_type != 'cpu':
+        if sample.context.device_type != 'cpu':
             raise TypeError("Unsupported callback return type. "
                             "GPU tensors are not supported. Got an MXNet GPU tensor.")
         return sample.asnumpy()
diff --git a/dali/python/nvidia/dali/external_source.py b/dali/python/nvidia/dali/external_source.py
@@ -54,8 +54,7 @@ def _check_data_batch(data, batch_size, layout):
         if layout is not None and layout != "" and dim != len(layout):
             raise RuntimeError("The layout '{}' cannot describe {}-dimensional data".format(layout, dim))
 
-
-def _prep_data_for_feed_input(data, batch_size, layout):
+def _prep_data_for_feed_input(data, batch_size, layout, device_id = None):
     def to_numpy(x):
         if _types._is_mxnet_array(x):
             return x.asnumpy()
@@ -82,7 +81,10 @@ def to_numpy(x):
             if isinstance(datum, (_tensors.TensorCPU, _tensors.TensorGPU)):
                 inp = type(datum)(datum, layout=layout) if layout is not None else datum
             elif hasattr(datum, "__cuda_array_interface__") or (info[0] and info[1]):
-                inp = _tensors.TensorGPU(datum, layout or "")
+                array_device_id = _types._get_device_id_for_array(datum)
+                if array_device_id is None:
+                    array_device_id = device_id
+                inp = _tensors.TensorGPU(datum, layout or "", array_device_id)
             else:
                 datum = to_numpy(datum)
                 inp = _tensors.TensorCPU(datum, layout or "")
diff --git a/dali/python/nvidia/dali/pipeline.py b/dali/python/nvidia/dali/pipeline.py
@@ -43,19 +43,6 @@ def _show_deprecation_warning(deprecated, in_favor_of):
                       Warning, stacklevel=2)
 
 
-def _get_default_stream_for_array(array):
-    if isinstance(array, list) and len(array):
-        array = array[0]
-    if types._is_torch_tensor(array):
-        import torch
-        return torch.cuda.current_stream().cuda_stream
-    elif types._is_cupy_array(array):
-        import cupy
-        return cupy.cuda.get_current_stream().ptr
-    else:
-        return None
-
-
 class Pipeline(object):
     """Pipeline class is the base of all DALI data pipelines. The pipeline
 encapsulates the data processing graph and the execution engine.
@@ -718,13 +705,13 @@ def build(self, define_graph = None):
     def _feed_input(self, name, data, layout=None, cuda_stream=None, use_copy_kernel=False):
         from nvidia.dali.external_source import _prep_data_for_feed_input
         if cuda_stream is None:
-            cuda_stream = _get_default_stream_for_array(data)
+            cuda_stream = types._get_default_stream_for_array(data)
         if cuda_stream == -1:
             cuda_stream = None
         else:
             cuda_stream = types._raw_cuda_stream(cuda_stream)
 
-        data = _prep_data_for_feed_input(data, self._max_batch_size, layout)
+        data = _prep_data_for_feed_input(data, self._max_batch_size, layout, self._device_id)
 
         if isinstance(data, list):
             self._pipe.SetExternalTensorInput(
diff --git a/dali/python/nvidia/dali/types.py b/dali/python/nvidia/dali/types.py
@@ -333,6 +333,30 @@ def _raw_cuda_stream(stream_obj):
     else:
         return stream_obj
 
+def _get_default_stream_for_array(array):  # current-stream handle of the array's framework, or None
+    if isinstance(array, list) and len(array):  # a non-empty list is represented by its first element
+        array = array[0]
+    if _is_torch_tensor(array):
+        import torch  # imported lazily, only when a torch tensor is actually seen
+        return _raw_cuda_stream(torch.cuda.current_stream())
+    elif _is_cupy_array(array):
+        import cupy  # imported lazily, only when a cupy array is actually seen
+        return _raw_cuda_stream(cupy.cuda.get_current_stream())
+    else:
+        return None  # any other input type: no stream is reported
+
+def _get_device_id_for_array(array):  # integer device id of a torch/cupy/mxnet array, or None
+    if isinstance(array, list) and len(array):  # a non-empty list is represented by its first element
+        array = array[0]
+    if _is_torch_tensor(array):
+        return array.device.index
+    elif _is_cupy_array(array):
+        return array.device.id  # ndarray.device is a cupy.cuda.Device object; .id is its integer id
+    elif _is_mxnet_array(array):
+        return array.context.device_id
+    else:
+        return None  # unrecognized type: the caller falls back to its own default device id
+
 _cupy_array_type_regex = re.compile('.*cupy\..*\.ndarray.*')
 
 def _is_cupy_array(value):
diff --git a/dali/test/python/test_external_source_cupy.py b/dali/test/python/test_external_source_cupy.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2020, 2022, NVIDIA CORPORATION. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -55,3 +55,32 @@ def generator(i):
     with check_output_pattern(pattern):
         for _ in range(iterations):
             pipe.run()
+
+
+def _test_cross_device(src, dst):  # feed CuPy data created under device `src` to a pipeline built with `dst`
+    import nvidia.dali.fn as fn
+    import numpy as np
+
+    pipe = Pipeline(1, 3, dst)  # NOTE(review): positional args presumably batch size, threads, device id - confirm
+
+    iter = 0
+    def get_data():  # per-sample callback: base matrix plus the number of earlier calls
+        nonlocal iter
+        with cp.cuda.Device(src):  # `src` is the current CuPy device while the array is created
+            data = cp.array([[1,2,3,4],[5,6,7,8]], dtype=cp.float32) + iter
+            iter += 1
+        return data
+
+    with pipe:
+        pipe.set_outputs(fn.external_source(get_data, batch=False, device='gpu'))
+
+    pipe.build()
+    for i in range(10):
+        out, = pipe.run()
+        assert np.array_equal(np.array(out[0].as_cpu()), np.array([[1,2,3,4],[5,6,7,8]]) + i)  # run i sees offset i
+
+def test_cross_device():  # generator test: yields one case per (src, dst) pair over devices 0 and 1
+    if cp.cuda.runtime.getDeviceCount() > 1:  # with fewer than two devices no cases are yielded
+        for src in [0,1]:
+            for dst in [0,1]:
+                yield _test_cross_device, src, dst
diff --git a/dali/test/python/test_external_source_pytorch_gpu.py b/dali/test/python/test_external_source_pytorch_gpu.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2020, 2022, NVIDIA CORPORATION. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -43,3 +43,30 @@ def gen_batch():
 
             for i in range(10):
                 check_output(pipe.run(), [np.array([attempt * 100 + (i + 1) * 10 + 1.5], dtype=np.float32)])
+
+def _test_cross_device(src, dst):  # feed a torch tensor living on GPU `src` to a pipeline built with `dst`
+    import nvidia.dali.fn as fn
+    import numpy as np
+
+    pipe = Pipeline(1, 3, dst)  # NOTE(review): positional args presumably batch size, threads, device id - confirm
+
+    iter = 0
+    def get_data():  # per-sample callback: base matrix plus the number of earlier calls
+        nonlocal iter
+        data = torch.tensor([[1,2,3,4],[5,6,7,8]], dtype=torch.float32).cuda(device=src) + iter  # source GPU, not dst
+        iter += 1
+        return data
+
+    with pipe:
+        pipe.set_outputs(fn.external_source(get_data, batch=False, device='gpu'))
+
+    pipe.build()
+    for i in range(10):
+        out, = pipe.run()
+        assert np.array_equal(np.array(out[0].as_cpu()), np.array([[1,2,3,4],[5,6,7,8]]) + i)  # run i sees offset i
+
+def test_cross_device():  # generator test: yields one case per (src, dst) pair over devices 0 and 1
+    if torch.cuda.device_count() > 1:  # with fewer than two devices no cases are yielded
+        for src in [0,1]:
+            for dst in [0,1]:
+                yield _test_cross_device, src, dst