[ez][ET-VK][partitioner] Allow layout-agnostic ops to accept quantized layouts

ssjia · SS-JIA · commit 24da2f65cd4f · 2026-05-09T02:08:46.000-04:00
Pull Request resolved: #19395 Two changes that together let the partitioner keep PACKED_INT8 layouts flowing through identity-like ops, eliminating spurious clone dispatches: 1. utils.py: ANY_STORAGE_INCL_PACKED_INT8 (renamed from ALL_STORAGES_REPSET) previously claimed every layout (including PACKED_INT8_*) on the texture side, but PACKED_INT8 is buffer-only by convention — the texture indexing helpers and required_image_extents don't know about quantized layouts. Narrow the texture side to all_memory_layouts (float-only). Every existing call site is either an intersection identity or a wildcard for non-tensor / not-yet-prepacked args, so this narrowing is non-breaking; and now the repset can act as a true universal set when intersected against quant-aware repsets. The new name slots cleanly next to ANY_STORAGE / ANY_BUFFER / ANY_TEXTURE and tells the reader exactly what is added: "like ANY_STORAGE, but also admits PACKED_INT8 (on the buffer side)". 2. op_registry.py: switch view_copy / clone / _clone_dim_order / alias_copy from inputs_storage=ANY_STORAGE to inputs_storage=ANY_STORAGE_INCL_PACKED_INT8. ANY_STORAGE is float-only, so when one of these no-op identity ops sits between two q8ta ops the BFS in TagMemoryMetaPass.constrain_op_*_repset short-circuits (zero overlap with PACKED_INT8_BUFFER) and forces transitions on both sides. With ANY_STORAGE_INCL_PACKED_INT8 they now admit both float and quantized layouts and the redundant-op transform folds them away. The 31 other ops using ANY_STORAGE are real compute ops (binaryop, comparison, softmax, argreduce, permute_copy, etc.) whose float-only kernels do not accept quantized int8x4 layouts (q8ta_* are separate ops); leaving those alone. On RefineNet 24feat (1x3x256x144) the 8 _clone_dim_order ops the partitioner had been inserting around the 4 fused q8ta_pixel_shuffle nodes are now folded by the delegate. Runtime q8ta_clone dispatches drop from 11 to 3 (the 3 residuals are unrelated, from the original model graph). 
ghstack-source-id: 379519734 @exported-using-ghexport Differential Revision: [D103770022](https://our.internmc.facebook.com/intern/diff/D103770022/)
diff --git a/backends/vulkan/_passes/tag_memory_meta_pass.py b/backends/vulkan/_passes/tag_memory_meta_pass.py
@@ -252,10 +252,10 @@ def get_arg_tensor_source_repset(
         """
         arg_node = op_node.args[arg_i]
 
-        # For non-tensor arguments, return ALL_STORAGES_REPSET so that the respset does
+        # For non-tensor arguments, return ANY_STORAGE_INCL_PACKED_INT8 so that the repset does
         # not appear to be empty.
         if not utils.is_tensor_arg_node(arg_node):
-            return utils.ALL_STORAGES_REPSET
+            return utils.ANY_STORAGE_INCL_PACKED_INT8
 
         # Special case for cat - use the first tensor in the list as representative
         if isinstance(arg_node, list):
diff --git a/backends/vulkan/op_registry.py b/backends/vulkan/op_registry.py
@@ -1158,7 +1158,7 @@ def register_permute_copy():
 @update_features(exir_ops.edge.aten.view_copy.default)
 def register_view_copy():
     return OpFeatures(
-        inputs_storage=utils.ANY_STORAGE,
+        inputs_storage=utils.ANY_STORAGE_INCL_PACKED_INT8,
         inputs_dtypes=utils.FP_INT_BOOL_T,
         supports_resize=True,
         supports_highdim=True,
@@ -1213,7 +1213,7 @@ def register_unsqueeze_copy():
 @update_features(exir_ops.edge.aten.clone.default)
 def register_clone():
     return OpFeatures(
-        inputs_storage=utils.ANY_STORAGE,
+        inputs_storage=utils.ANY_STORAGE_INCL_PACKED_INT8,
         inputs_dtypes=utils.FP_INT_BOOL_T,
         supports_resize=True,
         supports_highdim=True,
@@ -1223,7 +1223,7 @@ def register_clone():
 @update_features(exir_ops.edge.dim_order_ops._clone_dim_order.default)
 def register_clone_dim_order():
     return OpFeatures(
-        inputs_storage=utils.ANY_STORAGE,
+        inputs_storage=utils.ANY_STORAGE_INCL_PACKED_INT8,
         inputs_dtypes=utils.FP_INT_BOOL_T,
         supports_resize=True,
         supports_highdim=True,
@@ -1237,7 +1237,7 @@ def register_clone_dim_order():
 @update_features(exir_ops.edge.aten.alias_copy.default)
 def register_alias_copy():
     return OpFeatures(
-        inputs_storage=utils.ANY_STORAGE,
+        inputs_storage=utils.ANY_STORAGE_INCL_PACKED_INT8,
         inputs_dtypes=utils.FP_INT_BOOL_T,
         supports_resize=True,
         supports_highdim=True,
diff --git a/backends/vulkan/test/test_vulkan_tensor_repr.py b/backends/vulkan/test/test_vulkan_tensor_repr.py
@@ -649,7 +649,7 @@ def test_no_sync_primary_io_when_different_repsets(self):
     # -- Scalar args are skipped --
 
     def test_scalar_arg_skipped(self):
-        """Non-tensor args should be treated as ALL_STORAGES_REPSET."""
+        """Non-tensor args should be treated as ANY_STORAGE_INCL_PACKED_INT8."""
         tensor_arg = _make_tensor_arg_node((1, 3, 8, 8))
         # Second arg is a scalar (float)
         scalar_arg = 1.0
@@ -666,8 +666,8 @@ def test_scalar_arg_skipped(self):
             DEFAULT_TEXTURE_LIMITS,
         )
         self.assertFalse(op_repsets.any_is_empty())
-        # The scalar arg should get ALL_STORAGES_REPSET
-        # self.assertEqual(op_repsets.get_arg_repset(1), ALL_STORAGES_REPSET, f"""{op_repsets.get_arg_repset(1)}""")
+        # The scalar arg should get ANY_STORAGE_INCL_PACKED_INT8
+        # self.assertEqual(op_repsets.get_arg_repset(1), ANY_STORAGE_INCL_PACKED_INT8, f"""{op_repsets.get_arg_repset(1)}""")
 
     # -- pick_representations --
 
diff --git a/backends/vulkan/utils.py b/backends/vulkan/utils.py
@@ -1203,8 +1203,15 @@ def filter_invalid_reprs_for_node_list(
 # Special use RepSets
 
 NO_STORAGE = TensorRepSet(set(), set())
-ALL_STORAGES_REPSET = TensorRepSet(
-    universal_memory_layout_set, universal_memory_layout_set
+# Buffer side admits both float and quantized (PACKED_INT8_*) layouts; texture side
+# is float-only because the Vulkan backend has no quantized texture support
+# (required_image_extents and the texture indexing helpers only know about the
+# float layouts). Used as an intersection identity (e.g. common_arg_repset
+# accumulator) and as a placeholder for non-tensor / not-yet-prepacked args, so
+# narrowing the texture side is non-breaking for those uses while letting it act
+# as a true universal set when intersected against quant-aware repsets.
+ANY_STORAGE_INCL_PACKED_INT8 = TensorRepSet(
+    universal_memory_layout_set, all_memory_layouts
 )
 
 
@@ -1330,19 +1337,19 @@ def __init__(  # noqa: C901
         # Now, go through the arguments of the operator and create a filtered repset
         # for each based on the actual tensor value.
         args_repset_list = TensorRepSetList([])
-        common_arg_repset = ALL_STORAGES_REPSET
+        common_arg_repset = ANY_STORAGE_INCL_PACKED_INT8
         for i, arg_node in enumerate(op_node.args):
             arg_repset = inputs_repsets[i]
 
-            # Use ALL_STORAGES_REPSET for non-tensor nodes so they don't cause the op
+            # Use ANY_STORAGE_INCL_PACKED_INT8 for non-tensor nodes so they don't cause the op
             # repsets to appear empty
             if not is_tensor_arg_node(arg_node):
-                args_repset_list.append(ALL_STORAGES_REPSET)
+                args_repset_list.append(ANY_STORAGE_INCL_PACKED_INT8)
             # NO_STORAGE is used to denote that an input is either a non tensor arg or
             # a weight tensor that is not prepacked. Similar to the above, use
-            # ALL_STORAGES_REPSET in this case.
+            # ANY_STORAGE_INCL_PACKED_INT8 in this case.
             elif arg_repset.is_empty():
-                args_repset_list.append(ALL_STORAGES_REPSET)
+                args_repset_list.append(ANY_STORAGE_INCL_PACKED_INT8)
             else:
                 assert not arg_repset.is_empty()
 
@@ -1355,7 +1362,7 @@ def __init__(  # noqa: C901
 
         # Repeat for output tensors.
         outs_repset_list = TensorRepSetList([])
-        common_out_repset = ALL_STORAGES_REPSET
+        common_out_repset = ANY_STORAGE_INCL_PACKED_INT8
         if num_tensors_in_node(op_node) == 1:
             common_out_repset = filter_invalid_reprs(
                 op_node.meta["val"], outputs_repsets[0], texture_limits