@@ -6,10 +6,9 @@
6 | 6 |
7 | 7 | # pyre-strict |
8 | 8 |
9 | | -from typing import Any, cast, Dict, List, Tuple |
| 9 | +from typing import Any, Dict, List, Tuple |
10 | 10 |
11 | 11 | import torch |
12 | | -from executorch.backends.cadence.aot.compiler_utils import get_shape |
13 | 12 | from executorch.backends.cadence.aot.quantizer.patterns import ( |
14 | 13 | AddmmPattern, |
15 | 14 | AddPattern, |
@@ -26,7 +25,6 @@
26 | 25 | MatmulPattern, |
27 | 26 | ReluPattern0, |
28 | 27 | ReluPattern1, |
29 | | - SoftmaxPattern, |
30 | 28 | ) |
31 | 29 | from executorch.backends.cadence.aot.quantizer.utils import ( |
32 | 30 | check_out_zero_point_is_min_range, |
@@ -390,73 +388,6 @@ def get_args_and_kwargs_relu( |
390 | 388 | return args, kwargs |
391 | 389 |
392 | 390 |
393 | | -def get_args_and_kwargs_softmax( |
394 | | - graph_module: GraphModule, |
395 | | - inputs_inputs: List[fx.Node], |
396 | | - dequants_inputs: List[fx.Node], |
397 | | - quant_node: fx.Node, |
398 | | - op_node: fx.Node, |
399 | | -) -> Tuple[Tuple[ArgsType, ...], Dict[str, ArgsType]]: |
400 | | - # Make a dummy mask tensor |
401 | | - mask_shape = get_shape(graph_module, cast(fx.Node, quant_node.args[0])) |
402 | | - mask_shape = list(mask_shape) if mask_shape else [] |
403 | | - mask_shape[-1] = mask_shape[-1] // 16 |
404 | | - mask_tensor = graph_module.graph.call_function( |
405 | | - torch.ops.aten.full.default, |
406 | | - ( |
407 | | - mask_shape, |
408 | | - 0.0, |
409 | | - ), |
410 | | - {"dtype": torch.int32}, |
411 | | - ) |
412 | | - # Make the scale and zero_point tensors |
413 | | - in_scale_tensor = graph_module.graph.call_function( |
414 | | - torch.ops.aten.full.default, |
415 | | - ( |
416 | | - [1], |
417 | | - dequants_inputs[0].args[1], |
418 | | - ), |
419 | | - {"dtype": torch.float32}, |
420 | | - ) |
421 | | - in_zero_point_tensor = graph_module.graph.call_function( |
422 | | - torch.ops.aten.full.default, |
423 | | - ( |
424 | | - [1], |
425 | | - dequants_inputs[0].args[2], |
426 | | - ), |
427 | | - {"dtype": torch.int32}, |
428 | | - ) |
429 | | - out_scale_tensor = graph_module.graph.call_function( |
430 | | - torch.ops.aten.full.default, |
431 | | - ( |
432 | | - [1], |
433 | | - quant_node.args[1], |
434 | | - ), |
435 | | - {"dtype": torch.float32}, |
436 | | - ) |
437 | | - out_zero_point_tensor = graph_module.graph.call_function( |
438 | | - torch.ops.aten.full.default, |
439 | | - ( |
440 | | - [1], |
441 | | - quant_node.args[2], |
442 | | - ), |
443 | | - {"dtype": torch.int32}, |
444 | | - ) |
445 | | - |
446 | | - # Make the args and kwargs for the replacement op |
447 | | - args = ( |
448 | | - inputs_inputs[0], |
449 | | - mask_tensor, |
450 | | - op_node.args[1], |
451 | | - in_scale_tensor, |
452 | | - in_zero_point_tensor, |
453 | | - out_scale_tensor, |
454 | | - out_zero_point_tensor, |
455 | | - ) |
456 | | - kwargs = {} |
457 | | - return args, kwargs |
458 | | - |
459 | | - |
460 | 391 | class QuantFusion(ExportPass): |
461 | 392 | # pyre-ignore[2]: Parameter `patterns` has no type specified |
462 | 393 | def __init__(self, patterns) -> None: |
@@ -612,14 +543,6 @@ def call(self, graph_module: fx.GraphModule) -> PassResult: # noqa: C901 |
612 | 543 | dequants_inputs, |
613 | 544 | quant_node, |
614 | 545 | ) |
615 | | - elif isinstance(pattern, SoftmaxPattern): |
616 | | - args, kwargs = get_args_and_kwargs_softmax( |
617 | | - graph_module, |
618 | | - inputs_inputs, |
619 | | - dequants_inputs, |
620 | | - quant_node, |
621 | | - anchor_output_node, |
622 | | - ) |
623 | 546 | fused = graph_module.graph.call_function( |
624 | 547 | pattern.replacement_op(), |
625 | 548 | args, |