Fix separated external source prefetch drain

JanuszL · JanuszL · commit fe2b2524cdc8 · 2026-06-17T10:53:01.000+02:00
Keep pipeline prefetching interleaved with backend runs so separated
execution does not leave CPU-prefetched external source batches without
scheduled Mixed/GPU work at end of epoch. Prime separated execution for
the maximum of CPU and GPU queue depths to avoid underfilling asymmetric
queue configurations.

Add a regression test that drains a batched external source through mixed
image decoding with symmetric and asymmetric separated CPU/GPU prefetch queues.

Signed-off-by: Janusz Lisiecki <jlisiecki@nvidia.com>
diff --git a/dali/python/nvidia/dali/pipeline.py b/dali/python/nvidia/dali/pipeline.py
@@ -1591,29 +1591,22 @@ def _prefetch(self):
             raise RuntimeError("The pipeline was destroyed.")
         self._schedule_py_workers()
 
-        # We probably need some benchmarking before we remove this code path
-        if not self._exec_separated:
-            self._legacy_interleaved_prefetch()
-            return
-
-        # The new way: try to run the inputs and then feed them, finally call _pipe.Prefetch()
-        # If this fails, we just run `_pipe.Run()` a bunch of times. This will likely blow up for
-        # separated queues, which are not properly supported anyway.
-        iters_fed = 0
-        self._first_iter = False
-        iters_fed, success = self._prefetch_inputs()
-        if success:
-            self._pipe.Prefetch()
-        else:
-            self._last_iter = True
-            for _ in range(iters_fed):
-                self._pipe.Run()
+        # Keep input feeding interleaved with backend runs. Feeding all inputs
+        # first can leave separated execution with CPU-prefetched batches that
+        # have no scheduled Mixed/GPU work when an external source reaches end
+        # of epoch.
+        self._legacy_interleaved_prefetch()
 
     # This is the old way of prefetching - the feeding and running steps are interleaved.
     # Running all callbacks at once, then feeding, then running - may affect the performance
     # of the 1st iteration.
     def _legacy_interleaved_prefetch(self):
-        for _ in range(self._cpu_queue_size):
+        prefetch_count = (
+            max(self._cpu_queue_size, self._gpu_queue_size)
+            if self._exec_separated
+            else self._cpu_queue_size
+        )
+        for _ in range(prefetch_count):
             try:
                 self._first_iter = False
                 self._iter_setup()
diff --git a/dali/test/python/test_pipeline.py b/dali/test/python/test_pipeline.py
@@ -1893,6 +1893,54 @@ def my_pipe():
     my_pipe(device_id=0, seed=1234, num_threads=3, set_affinity=True, py_num_workers=3)
 
 
+def test_separated_queue_external_source_drains_prefetched_batches():
+    batch_size = 4
+    num_batches = 10
+    image_pattern = os.path.join(jpeg_folder, "*", "*.jpg")
+    paths = sorted(glob.glob(image_pattern))[: batch_size * num_batches]  # stable file order
+    assert len(paths) == batch_size * num_batches  # enough images for every batch
+
+    def batches():
+        for i in range(num_batches):
+            batch_paths = paths[i * batch_size : (i + 1) * batch_size]
+            yield [np.fromfile(path, dtype=np.uint8) for path in batch_paths]  # raw encoded bytes
+
+    for cpu_size, gpu_size in [(2, 2), (3, 2), (2, 3)]:  # symmetric, CPU-deeper, GPU-deeper
+
+        @dali.pipeline_def(
+            batch_size=batch_size,
+            num_threads=4,
+            device_id=0,
+            prefetch_queue_depth={"cpu_size": cpu_size, "gpu_size": gpu_size},
+        )
+        def pipe():
+            encoded = fn.external_source(
+                source=batches,
+                batch=True,
+                cycle="raise",  # end of data is expected to surface as StopIteration (see below)
+            )
+            decoded = fn.decoders.image(
+                encoded,
+                device="mixed",
+                output_type=types.RGB,
+            )
+            return decoded
+
+        p = pipe()
+        p.build()
+        for _ in range(num_batches):  # every batch yielded by the source must come out
+            out = p.run()[0]
+            assert len(out) == batch_size
+            decoded = out.as_cpu()
+            for sample_idx in range(batch_size):
+                sample = decoded.at(sample_idx)
+                assert sample.ndim == 3
+                assert sample.shape[-1] == 3  # three channels, matching output_type=types.RGB
+                assert np.any(sample)  # decoded image is not all zeros
+        with assert_raises(StopIteration):  # one more run after num_batches must signal the end
+            p.run()
+
+
 def test_not_iterable():
     import nvidia.dali._utils.hacks as hacks
     import collections.abc