Cast and copy kwargs automatically

rostan-t · rostan-t · commit 3af166bd903f · 2026-04-28T09:21:25.000Z
Signed-off-by: Rostan Tabet <rtabet@nvidia.com>
diff --git a/dali/python/nvidia/dali/experimental/dynamic/_compile.py b/dali/python/nvidia/dali/experimental/dynamic/_compile.py
@@ -22,6 +22,8 @@
 from typing import TYPE_CHECKING, Any, NamedTuple
 
 import nvidia.dali.backend_impl as _b
+import nvidia.dali.types as dali_types
+from nvidia.dali import fn
 from nvidia.dali.external_source import ExternalSource
 from nvidia.dali.pipeline import Pipeline
 
@@ -74,6 +76,7 @@ class CompileNode:
     backend: str
     inputs: Sequence[CompileRef | Any]
     kwargs: Mapping[str, CompileRef | Any]
+    kwarg_casts: dict[str, dali_types.DALIDataType]
     num_outputs: int
     device: Device | None = None
     pipeline_output_offset: int | None = dataclasses.field(default=None, repr=False)
@@ -201,6 +204,25 @@ def make_source_batches(self, tensor_lists: Sequence[Any]) -> tuple[CompiledBatc
             for i, tl in enumerate(tensor_lists)
         )
 
+    @staticmethod
+    def _compute_kwarg_casts(op: type["Operator"], raw_kwargs: Mapping[str, CompiledBatch | Any]):
+        casts: dict[str, dali_types.DALIDataType] = {}  # kwarg name -> dtype to cast it to
+        schema = op._schema
+        assert schema is not None
+        # Record a cast for each CompiledBatch kwarg whose dtype differs from the expected type.
+        for name, data in raw_kwargs.items():
+            if not isinstance(data, CompiledBatch):
+                continue  # only CompiledBatch values are considered
+            # Expected type: the schema's argument type, remapped via _vector_types if listed.
+            expected_type = schema.GetArgumentType(name)
+            expected_type = dali_types._vector_types.get(expected_type, expected_type)
+            if expected_type == data.dtype.type_id:
+                continue  # already the expected type, nothing to cast
+            # Types differ, so remember the expected type as this kwarg's cast target.
+            casts[name] = expected_type
+
+        return casts
+
     @_nvtx_range("Recording operator")
     def record(
         self,
@@ -209,18 +231,21 @@ def record(
         backend: str,
         inputs: Sequence[CompileRef | Any],
         kwargs: Mapping[str, CompileRef | Any],
+        raw_kwargs: Mapping[str, CompiledBatch | Any],
         num_outputs: int,
         device: Device | None = None,
     ) -> CompileNode | None:
         if existing := self._call_trie.find(call_chain):
             if existing.inputs == inputs and existing.kwargs == kwargs:
                 return existing
             return None
+
         node = CompileNode(
             op_class=op_class,
             backend=backend,
             inputs=inputs,
             kwargs=kwargs,
+            kwarg_casts=self._compute_kwarg_casts(op_class, raw_kwargs),
             num_outputs=num_outputs,
             device=device,
         )
@@ -466,6 +491,14 @@ def _wire_compile_graph(
         kw_scalars = {
             k: _scalar_decay(v) for k, v in node.kwargs.items() if not isinstance(v, CompileRef)
         }
+
+        # Cast kwargs when necessary
+        for name, dtype in node.kwarg_casts.items():
+            kw_nodes[name] = fn.cast(kw_nodes[name], dtype=dtype)
+        # All kwargs need to be on the CPU
+        for name, kw_node in kw_nodes.items():
+            kw_nodes[name] = kw_node.cpu()
+
         op = node.op_class._legacy_op(device=node.backend, **kw_scalars)
         out = op(*positional, **kw_nodes)
 
@@ -535,6 +568,7 @@ def _call():
             backend=backend,
             inputs=classified_inputs,
             kwargs=classified_kwargs,
+            raw_kwargs=raw_kwargs,
             num_outputs=len(results),
             device=device,
         )
diff --git a/dali/test/python/experimental_mode/test_compile.py b/dali/test/python/experimental_mode/test_compile.py
@@ -85,7 +85,7 @@ def test_compile_basic_pipeline():
 
     assert len(dynamic_results) == len(compiled_results)
     for dyn, comp in zip(dynamic_results, compiled_results):
-        np.testing.assert_array_almost_equal(dyn, comp)
+        np.testing.assert_array_equal(dyn, comp)
 
 
 @eval_modes()
@@ -333,3 +333,25 @@ def make_reader():
     assert len(dynamic_results) == len(compiled_results)
     for dyn, comp in zip(dynamic_results, compiled_results):
         np.testing.assert_array_equal(dyn, comp)
+
+
+def test_compile_incompatible_kwarg_dtype():
+    reader_dyn = ndd.readers.File(file_root=images_root)
+    reader_comp = ndd.readers.File(file_root=images_root)
+    # Reference run with compile=False; `sizes` receives ndd._shape(img) directly.
+    dynamic_results = []
+    for jpegs, _ in reader_dyn.next_epoch(batch_size=4, compile=False):
+        img = ndd.decoders.image(jpegs, device="gpu")
+        resized = ndd.tensor_resize(img, sizes=ndd._shape(img))
+        dynamic_results.append(ndd.as_tensor(resized, pad=True).cpu())
+    # Same ops with compile=True; presumably exercises the kwarg dtype cast (per test name).
+    compiled_results = []
+    for jpegs, _ in reader_comp.next_epoch(batch_size=4, compile=True):
+        img = ndd.decoders.image(jpegs, device="gpu")
+        resized = ndd.tensor_resize(img, sizes=ndd._shape(img))
+        assert _is_compiled(resized), resized  # failure message shows the offending value
+        compiled_results.append(ndd.as_tensor(resized, pad=True).cpu())
+    # Both runs must agree in iteration count and in exact array contents.
+    assert len(dynamic_results) == len(compiled_results)
+    for dyn, comp in zip(dynamic_results, compiled_results):
+        np.testing.assert_array_equal(dyn, comp)