diff --git a/backends/cadence/aot/ops_registrations.py b/backends/cadence/aot/ops_registrations.py
index 131c85c9ab1..b7a772421bf 100644
--- a/backends/cadence/aot/ops_registrations.py
+++ b/backends/cadence/aot/ops_registrations.py
@@ -2527,7 +2527,7 @@ def quantized_max_pool2d_nhwc_meta(
 def fully_connected_meta(
     src: torch.Tensor,
     weight: torch.Tensor,
-    bias: torch.Tensor,
+    bias: Optional[torch.Tensor] = None,
 ) -> torch.Tensor:
     # src comes in shape [leading_dims, in_dim]
     # weight comes in shape [out_dim, in_dim]
diff --git a/backends/cadence/aot/ref_implementations.py b/backends/cadence/aot/ref_implementations.py
index 6c780782070..c6bdcd244ca 100644
--- a/backends/cadence/aot/ref_implementations.py
+++ b/backends/cadence/aot/ref_implementations.py
@@ -633,10 +633,8 @@ def quantized_fully_connected_asym8uxasym8u_asym8u_per_tensor() -> torch.Tensor:
 
 
 def fully_connected(
     input_tensor: torch.Tensor,
     weight: torch.Tensor,
-    bias: torch.Tensor,
+    bias: Optional[torch.Tensor] = None,
 ) -> torch.Tensor:
-    if input_tensor.shape[0] != 1:
-        raise ValueError("Fully connected linear only supports batch size of 1")
     return F.linear(input_tensor, weight, bias)
diff --git a/backends/cadence/generic/operators/op_fully_connected.cpp b/backends/cadence/generic/operators/op_fully_connected.cpp
index f1e53ad5f76..36befc52102 100644
--- a/backends/cadence/generic/operators/op_fully_connected.cpp
+++ b/backends/cadence/generic/operators/op_fully_connected.cpp
@@ -27,7 +27,8 @@ void linear(
     Tensor& output) {
   const float* __restrict__ input_data = input.const_data_ptr<float>();
   const float* __restrict__ weight_data = weight.const_data_ptr<float>();
-  const float* __restrict__ bias_data = bias.value().const_data_ptr<float>();
+  const float* __restrict__ bias_data =
+      bias.has_value() ? bias.value().const_data_ptr<float>() : nullptr;
   float* __restrict__ output_data = output.mutable_data_ptr<float>();
 
   // input comes in shape [batch_size, in_dim]
@@ -43,7 +44,7 @@
 
   for (int i = 0; i < leading_dims; ++i) {
     for (int j = 0; j < M; ++j) {
-      float sum = bias_data[j];
+      float sum = bias_data != nullptr ? bias_data[j] : 0.0f;
       for (int k = 0; k < N; ++k) {
         sum += input_data[i * N + k] * weight_data[j * N + k];
       }
diff --git a/backends/cadence/hifi/operators/op_quantized_conv1d_ncl.cpp b/backends/cadence/hifi/operators/op_quantized_conv1d_ncl.cpp
index ccc81a35aba..a0bed1e0b70 100644
--- a/backends/cadence/hifi/operators/op_quantized_conv1d_ncl.cpp
+++ b/backends/cadence/hifi/operators/op_quantized_conv1d_ncl.cpp
@@ -240,7 +240,10 @@ void xa_opt_quantized_conv1d_ncl_asym8uxsym8u_asym8u(
   WORD32 x_stride = stride[0];
   WORD32 x_padding = padding[0];
   WORD32 input_zero_bias = -in_zero_point;
-  WORD32 out_multiplier32 = bias_scale * (1. / output_scale) * 2147483648;
+  const float eff_scale = bias_scale * (1.0f / output_scale);
+  WORD32 out_multiplier32 = (eff_scale >= 1.0f)
+      ? static_cast<WORD32>(2147483647)
+      : static_cast<WORD32>(eff_scale * 2147483648.0f);
   WORD32 out_shift32 = 0;
   WORD32 kernel_zero_bias = -weight_zero_point;
 
@@ -419,9 +422,9 @@ void quantized_conv1d_ncl_per_tensor_out(
           out);
     }
   } else if (dtype == ScalarType::Byte) {
-    // HiFi nnlib conv1d_std kernel does not support depthwise (groups > 1).
-    // Fall back to generic implementation.
-    if (groups > 1) {
+    // HiFi nnlib conv1d_std kernel does not support depthwise (groups > 1)
+    // or stride > 1. Fall back to generic implementation.
+    if (groups > 1 || stride[0] > 1) {
       impl::generic::native::quantized_conv1d_ncl_per_tensor_out(
           ctx,
           input,
diff --git a/backends/cadence/hifi/operators/op_quantized_conv1d_nlc.cpp b/backends/cadence/hifi/operators/op_quantized_conv1d_nlc.cpp
index 2a11dbf358d..a85dd89de39 100644
--- a/backends/cadence/hifi/operators/op_quantized_conv1d_nlc.cpp
+++ b/backends/cadence/hifi/operators/op_quantized_conv1d_nlc.cpp
@@ -176,7 +176,10 @@ void xa_opt_quantized_conv1d_nlc_asym8uxsym8u_asym8u(
   WORD32 x_stride = stride[stride.size() - 1];
   WORD32 x_padding = padding[padding.size() - 1];
   WORD32 input_zero_bias = -in_zero_point;
-  WORD32 out_multiplier32 = bias_scale * (1. / output_scale) * 2147483648;
+  const float eff_scale = bias_scale * (1.0f / output_scale);
+  WORD32 out_multiplier32 = (eff_scale >= 1.0f)
+      ? static_cast<WORD32>(2147483647)
+      : static_cast<WORD32>(eff_scale * 2147483648.0f);
   WORD32 out_shift32 = 0;
   WORD32 kernel_zero_bias = -weight_zero_point;
 
@@ -298,9 +301,9 @@ void quantized_conv1d_nlc_per_tensor_out(
           out);
     }
   } else if (dtype == ScalarType::Byte) {
-    // HiFi nnlib conv1d_std kernel does not support depthwise (groups > 1).
-    // Fall back to generic implementation.
-    if (groups > 1) {
+    // HiFi nnlib conv1d_std kernel does not support depthwise (groups > 1)
+    // or stride > 1. Fall back to generic implementation.
+    if (groups > 1 || stride[0] > 1) {
      impl::generic::native::quantized_conv1d_nlc_per_tensor_out(
          ctx,
          input,
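Note on the `fully_connected` change: the reference implementation previously rejected any batch size other than 1, but `F.linear` already broadcasts over arbitrary leading dimensions and accepts `bias=None`, so both the guard and the mandatory bias were unnecessary. A minimal sanity-check sketch (shapes are illustrative, not taken from the patch):

```python
import torch
import torch.nn.functional as F

x = torch.randn(4, 8)   # [leading_dims, in_dim]; batch size > 1 was previously rejected
w = torch.randn(16, 8)  # [out_dim, in_dim]

# With bias=None (the new default), F.linear reduces to a plain matmul.
assert torch.allclose(F.linear(x, w, None), x @ w.t())
print(F.linear(x, w, None).shape)  # torch.Size([4, 16])
```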
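Note on the `out_multiplier32` change: with `out_shift32` fixed at 0, the effective rescale factor `bias_scale / output_scale` is encoded as a Q31-style fixed-point multiplier, i.e. `eff_scale * 2^31` truncated to an integer. The old expression converted that product straight to a signed 32-bit `WORD32`, which overflows (undefined behavior in C++) whenever `eff_scale >= 1.0`; the patch saturates at `INT32_MAX` instead. A small Python sketch of the saturating computation (the helper name is hypothetical; `int()` truncation matches the C++ `static_cast`):

```python
INT32_MAX = 2**31 - 1

def q31_multiplier(bias_scale: float, output_scale: float) -> int:
    # Mirror of the patched C++ logic: saturate instead of overflowing.
    eff_scale = bias_scale * (1.0 / output_scale)
    if eff_scale >= 1.0:
        return INT32_MAX
    return int(eff_scale * 2**31)  # eff_scale < 1.0, so the product fits in int32

print(q31_multiplier(0.5, 1.0))   # 1073741824 == 2**30
print(q31_multiplier(0.25, 0.5))  # 1073741824: eff_scale = 0.5
print(q31_multiplier(2.0, 1.0))   # 2147483647 (saturated; the old code overflowed here)
```

The saturation pins the multiplier at roughly 1.0: since `out_shift32` stays 0, effective scales above 1.0 are clamped rather than represented exactly.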