Align separated prefetch feed count

JanuszL · JanuszL · commit a0755d255ebb · 2026-06-17T15:23:23.000+02:00
Make async separated executor prefetch and InputFeedCount use the same maximum queue-depth contract. Keep the Python separated prefetch path for drainable queue shapes, but use interleaved prefetch when the CPU queue is longer than the GPU queue so end-of-epoch Python sources do not leave CPU-only work without scheduled Mixed/GPU stages.

Signed-off-by: Janusz Lisiecki <jlisiecki@nvidia.com>
diff --git a/dali/pipeline/executor/async_separated_pipelined_executor.cc b/dali/pipeline/executor/async_separated_pipelined_executor.cc
@@ -14,6 +14,8 @@
 
 #include "dali/pipeline/executor/async_separated_pipelined_executor.h"
 
+#include <algorithm>
+
 namespace dali {
 
 void AsyncSeparatedPipelinedExecutor::RunCPU() {
@@ -39,14 +41,16 @@ void AsyncSeparatedPipelinedExecutor::Prefetch() {
     RunGPU();
   }
 
-  for (int i = 0; i < queue_sizes_.cpu_size; i++) {
+  int cpu_only_prefetch_count =
+      std::max(0, queue_sizes_.cpu_size - queue_sizes_.gpu_size);
+  for (int i = 0; i < cpu_only_prefetch_count; i++) {
     RunCPU();
   }
 }
 
 int AsyncSeparatedPipelinedExecutor::InputFeedCount(std::string_view op_name) {
   (void)graph_->Node(op_name);
-  return queue_sizes_.cpu_size + queue_sizes_.gpu_size;
+  return std::max(queue_sizes_.cpu_size, queue_sizes_.gpu_size);
 }
 
 }  // namespace dali
diff --git a/dali/python/nvidia/dali/pipeline.py b/dali/python/nvidia/dali/pipeline.py
@@ -1591,22 +1591,34 @@ def _prefetch(self):
             raise RuntimeError("The pipeline was destroyed.")
         self._schedule_py_workers()
 
-        # Keep input feeding interleaved with backend runs. Feeding all inputs
-        # first can leave separated execution with CPU-prefetched batches that
-        # have no scheduled Mixed/GPU work when an external source reaches end
-        # of epoch.
-        self._legacy_interleaved_prefetch()
+        # A larger separated CPU queue leaves CPU-only iterations after backend
+        # Prefetch. If a Python source reaches end of epoch, those iterations
+        # cannot be advanced through Mixed/GPU without feeding more CPU work.
+        if (
+            not self._exec_separated
+            or self._cpu_queue_size > self._gpu_queue_size
+        ):
+            self._legacy_interleaved_prefetch()
+            return
+
+        # The new way: try to run the inputs and then feed them, finally call
+        # _pipe.Prefetch(). If this fails, we just run `_pipe.Run()` a bunch of
+        # times. This will likely blow up for separated queues, which are not
+        # properly supported anyway.
+        iters_fed = 0
+        self._first_iter = False
+        iters_fed, success = self._prefetch_inputs()
+        if success:
+            self._pipe.Prefetch()
+        else:
+            self._last_iter = True
+            for _ in range(iters_fed):
+                self._pipe.Run()
 
     # This is the old way of prefetching - the feeding and running steps are interleaved.
     # Running all callbacks at once, then feeding, then running - may affect the performance
     # of the 1st iteration.
     def _legacy_interleaved_prefetch(self):
-        # Separated execution has independent CPU and GPU queue depths, but an
-        # interleaved Run schedules one whole pipeline iteration through all
-        # stages. After max(cpu, gpu) runs each stage has seen enough iterations
-        # to fill its own queue. Using cpu + gpu would schedule extra full
-        # iterations, not just fill the GPU queue, and could over-read external
-        # inputs at an epoch boundary.
         prefetch_count = (
             max(self._cpu_queue_size, self._gpu_queue_size)
             if self._exec_separated
@@ -1624,6 +1636,27 @@ def _legacy_interleaved_prefetch(self):
                 self._last_iter = True
                 break
 
+    def _prefetch_inputs(self):
+        prefetched, success = self._run_input_callbacks(True)
+        self._batches_to_consume += prefetched
+
+        if success:
+            if self._exec_separated:
+                prefetch_count = max(self._cpu_queue_size, self._gpu_queue_size)
+            else:
+                prefetch_count = self._cpu_queue_size
+
+            for i in range(prefetched, prefetch_count):
+                try:
+                    self.iter_setup()
+                    prefetched = i + 1
+                    self._batches_to_consume += 1
+                except StopIteration:
+                    success = False
+                    break
+
+        return prefetched, success
+
     def _run_once(self):
         """Start running the whole pipeline once without waiting for its results.
 

Original file line number	Diff line number	Diff line change
`@@ -14,6 +14,8 @@`
`14`	`14`
`15`	`15`	`#include "dali/pipeline/executor/async_separated_pipelined_executor.h"`
`16`	`16`
	`17`	`+#include <algorithm>`
	`18`	`+`
`17`	`19`	`namespace dali {`
`18`	`20`
`19`	`21`	`void AsyncSeparatedPipelinedExecutor::RunCPU() {`
`@@ -39,14 +41,16 @@ void AsyncSeparatedPipelinedExecutor::Prefetch() {`
`39`	`41`	`RunGPU();`
`40`	`42`	`}`
`41`	`43`
`42`		`- for (int i = 0; i < queue_sizes_.cpu_size; i++) {`
	`44`	`+ int cpu_only_prefetch_count =`
	`45`	`+ std::max(0, queue_sizes_.cpu_size - queue_sizes_.gpu_size);`
	`46`	`+ for (int i = 0; i < cpu_only_prefetch_count; i++) {`
`43`	`47`	`RunCPU();`
`44`	`48`	`}`
`45`	`49`	`}`
`46`	`50`
`47`	`51`	`int AsyncSeparatedPipelinedExecutor::InputFeedCount(std::string_view op_name) {`
`48`	`52`	`(void)graph_->Node(op_name);`
`49`		`- return queue_sizes_.cpu_size + queue_sizes_.gpu_size;`
	`53`	`+ return std::max(queue_sizes_.cpu_size, queue_sizes_.gpu_size);`
`50`	`54`	`}`
`51`	`55`
`52`	`56`	`} // namespace dali`