Skip to content

Commit 57b265e

Browse files
hariharans29, Copilot, and github-actions[bot]
authored
[MLAS] Add depthwise with multiplier conv special kernel for NCHW data layout on Avx512 (#27874)
### Description Adds a special AVX512 kernel for depthwise conv with multiplier = 2. These improve the performance of 3 costly conv operations (7x7 kernels) in the MobileClip model by approx 2.4x (will share MLAS benchmark numbers). These are 3 ops with 1) Cin=64, Cout=128, group=64, H=64, W=64, kH=7, kW=7 2) Cin=128, Cout=256, group=128, H=32, W=32, kH=7, kW=7 3) Cin=256, Cout=512, group=256, H=16, W=16, kH=7, kW=7 These Conv operations cannot be dispatched to NCHWc as the Cout per group is sub-block size. On AVX512, the block size is 16 and the Cout per group is only 2. There is a special depthwise kernel in the NCHWc suite but it can only handle Cout per group = 1. MLAS Benchmark Before and After comparison: | Benchmark | BEFORE mean (ns) | AFTER mean (ns) | Speedup | |---|---:|---:|---:| | SCONV_NCHW G64 | 3,151,190 | 1,391,419 | 2.26x | | SCONV_NCHW G128 | 1,646,040 | 824,654 | 2.00x | | SCONV_NCHW G256 | 978,843 | 533,375 | 1.84x | | SCONV_NCHW_THREADED G64 | 873,283 | 367,722 | 2.37x | | SCONV_NCHW_THREADED G128 | 445,786 | 226,777 | 1.97x | | SCONV_NCHW_THREADED G256 | 264,473 | 147,997 | 1.79x | ### Motivation and Context Just by optimizing these 3 conv operations, MobileClip is about 700us-850us faster and the entire model is <14ms on an AVX512 machine. --------- Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
1 parent bec2792 commit 57b265e

13 files changed

Lines changed: 998 additions & 90 deletions

cmake/onnxruntime_mlas.cmake

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ onnxruntime_add_static_library(onnxruntime_mlas
2323
${MLAS_SRC_DIR}/qgemm.cpp
2424
${MLAS_SRC_DIR}/qdwconv.cpp
2525
${MLAS_SRC_DIR}/convolve.cpp
26+
${MLAS_SRC_DIR}/sconv_nchw_depthwise_multiplier_greater_than_1.cpp
2627
${MLAS_SRC_DIR}/convsym.cpp
2728
${MLAS_SRC_DIR}/pooling.cpp
2829
${MLAS_SRC_DIR}/transpose.cpp
@@ -118,7 +119,7 @@ function(setup_mlas_source_for_windows)
118119
${MLAS_SRC_DIR}/eltwise_kernel_neon.cpp
119120
${MLAS_SRC_DIR}/eltwise_kernel_neon_fp16.cpp
120121
${MLAS_SRC_DIR}/sqnbitgemm_kernel_neon_int8_i8mm.cpp
121-
${MLAS_SRC_DIR}/sconv_nchw_kernel_neon.cpp
122+
${MLAS_SRC_DIR}/sconv_nchw_depthwise_multiplier_1.cpp
122123
)
123124

124125
set(mlas_platform_preprocess_srcs
@@ -207,6 +208,7 @@ function(setup_mlas_source_for_windows)
207208
${MLAS_SRC_DIR}/intrinsics/avx512/gelu_avx512f.cpp
208209
${MLAS_SRC_DIR}/intrinsics/avx512/silu_avx512f.cpp
209210
${MLAS_SRC_DIR}/intrinsics/avx512/quantize_avx512f.cpp
211+
${MLAS_SRC_DIR}/intrinsics/avx512/sconv_nchw_depthwise_multiplier_greater_than_1_avx512f.cpp
210212
)
211213

212214
set_source_files_properties(${mlas_platform_srcs_avx512} PROPERTIES COMPILE_FLAGS "/arch:AVX512")
@@ -501,7 +503,7 @@ else()
501503
${MLAS_SRC_DIR}/eltwise_kernel_neon.h
502504
${MLAS_SRC_DIR}/eltwise_kernel_neon.cpp
503505
${MLAS_SRC_DIR}/sqnbitgemm_kernel_neon_int8_i8mm.cpp
504-
${MLAS_SRC_DIR}/sconv_nchw_kernel_neon.cpp
506+
${MLAS_SRC_DIR}/sconv_nchw_depthwise_multiplier_1.cpp
505507
)
506508

507509
# Conditionally add the SVE implementation if compiler supports it
@@ -778,6 +780,7 @@ endif()
778780
${MLAS_SRC_DIR}/intrinsics/avx512/gelu_avx512f.cpp
779781
${MLAS_SRC_DIR}/intrinsics/avx512/silu_avx512f.cpp
780782
${MLAS_SRC_DIR}/intrinsics/avx512/quantize_avx512f.cpp
783+
${MLAS_SRC_DIR}/intrinsics/avx512/sconv_nchw_depthwise_multiplier_greater_than_1_avx512f.cpp
781784
)
782785
set_source_files_properties(${mlas_platform_srcs_avx512f} PROPERTIES COMPILE_FLAGS "-mavx512f")
783786

onnxruntime/core/mlas/inc/mlas.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -877,6 +877,7 @@ enum MLAS_CONV_ALGORITHM {
877877
MlasConvAlgorithmGemmDirect,
878878
MlasConvAlgorithmExpandThenGemm,
879879
MlasConvAlgorithmExpandThenGemmSegmented,
880+
MlasConvAlgorithmDepthwiseMultiplierGreaterThan1,
880881
#if defined(MLAS_TARGET_WASM_SCALAR) || defined(MLAS_TARGET_ARM64)
881882
MlasConvAlgorithmDepthwise,
882883
#endif

onnxruntime/core/mlas/lib/convolve.cpp

Lines changed: 96 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,53 @@ struct MLAS_CONV_WORK_BLOCK {
4242
ptrdiff_t TargetThreadCount;
4343
};
4444

45+
static
46+
void
47+
MlasDepthwiseMultiplierGreaterThan1Threaded(
48+
void* Context,
49+
ptrdiff_t Index
50+
)
51+
{
52+
MLAS_CONV_WORK_BLOCK* WorkBlock = (MLAS_CONV_WORK_BLOCK*)Context;
53+
54+
const MLAS_CONV_PARAMETERS* Parameters = WorkBlock->Parameters;
55+
const float* Zeros = nullptr;
56+
57+
const size_t GroupCount = Parameters->GroupCount;
58+
const size_t BatchGroupCount = Parameters->BatchCount * GroupCount;
59+
60+
size_t BatchGroupStart;
61+
size_t BatchGroupRemaining;
62+
63+
MlasPartitionWork(Index, WorkBlock->TargetThreadCount, BatchGroupCount,
64+
&BatchGroupStart, &BatchGroupRemaining);
65+
66+
const size_t BatchGroupEnd = BatchGroupStart + BatchGroupRemaining;
67+
68+
const size_t FilterCount = Parameters->FilterCount;
69+
const size_t OutputSize = Parameters->OutputSize;
70+
const size_t K = Parameters->K;
71+
72+
const size_t InputGroupSize = Parameters->InputChannels * Parameters->InputSize;
73+
const size_t OutputGroupSize = FilterCount * OutputSize;
74+
const size_t FilterGroupSize = FilterCount * K;
75+
76+
for (size_t bg = BatchGroupStart; bg < BatchGroupEnd; bg++) {
77+
size_t group = bg % GroupCount;
78+
79+
const float* input = WorkBlock->Input + bg * InputGroupSize;
80+
const float* filter = WorkBlock->Filter + group * FilterGroupSize;
81+
float* output = WorkBlock->Output + bg * OutputGroupSize;
82+
const float* bias = WorkBlock->Bias;
83+
if (bias != nullptr) {
84+
bias += group * FilterCount;
85+
}
86+
87+
MlasConvDepthwiseWithMultiplierFloat_CHW(Parameters, input, filter, output, Zeros);
88+
MlasActivation(Parameters->Activation, output, bias, FilterCount, OutputSize, OutputSize);
89+
}
90+
}
91+
4592
void
4693
MlasConvIm2Col(
4794
const MLAS_CONV_PARAMETERS* Parameters,
@@ -1106,6 +1153,30 @@ Return Value:
11061153
return;
11071154
}
11081155

1156+
if (Algorithm == MlasConvAlgorithmDepthwiseMultiplierGreaterThan1 && ((BatchCount > 1) || (GroupCount > 1))) {
1157+
const size_t BatchGroupCount = BatchCount * GroupCount;
1158+
1159+
ptrdiff_t TargetThreadCount = MlasGetMaximumThreadCount(ThreadPool);
1160+
1161+
if (static_cast<size_t>(TargetThreadCount) >= BatchGroupCount) {
1162+
TargetThreadCount = static_cast<ptrdiff_t>(BatchGroupCount);
1163+
}
1164+
1165+
MLAS_CONV_WORK_BLOCK WorkBlock;
1166+
1167+
WorkBlock.Parameters = Parameters;
1168+
WorkBlock.Input = Input;
1169+
WorkBlock.Filter = Filter;
1170+
WorkBlock.Bias = Bias;
1171+
WorkBlock.WorkingBuffer = nullptr;
1172+
WorkBlock.Output = Output;
1173+
WorkBlock.TargetThreadCount = TargetThreadCount;
1174+
1175+
MlasExecuteThreaded(MlasDepthwiseMultiplierGreaterThan1Threaded, &WorkBlock, TargetThreadCount, ThreadPool);
1176+
1177+
return;
1178+
}
1179+
11091180

11101181
#if defined(MLAS_TARGET_WASM_SCALAR) || defined(MLAS_TARGET_ARM64)
11111182

@@ -1198,6 +1269,14 @@ Return Value:
11981269
break;
11991270
}
12001271

1272+
case MlasConvAlgorithmDepthwiseMultiplierGreaterThan1:
1273+
{
1274+
const float* Zeros = nullptr;
1275+
MlasConvDepthwiseWithMultiplierFloat_CHW(Parameters, Input, filter, Output, Zeros);
1276+
MlasActivation(Parameters->Activation, Output, bias, FilterCount, OutputSize, OutputSize);
1277+
break;
1278+
}
1279+
12011280
#if defined(MLAS_TARGET_WASM_SCALAR) || defined(MLAS_TARGET_ARM64)
12021281

12031282
case MlasConvAlgorithmDepthwise:
@@ -1453,6 +1532,23 @@ Return Value:
14531532

14541533
} else {
14551534

1535+
#if defined(MLAS_TARGET_AMD64)
1536+
1537+
if (Dimensions == 2
1538+
&& GroupCount > 1
1539+
&& Parameters->FilterCount == 2 && Parameters->InputChannels == 1
1540+
&& Parameters->KernelShape[0] == 7 && Parameters->KernelShape[1] == 7
1541+
&& Parameters->Padding[0] == 3 && Parameters->Padding[1] == 3
1542+
&& Parameters->Padding[2] == 3 && Parameters->Padding[3] == 3
1543+
&& Parameters->StrideShape[0] == 2 && Parameters->StrideShape[1] == 2
1544+
&& Parameters->DilationShape[0] == 1 && Parameters->DilationShape[1] == 1
1545+
&& GetMlasPlatform().ConvNchwFloatKernel == MlasConvNchwFloatKernelAvx512F) {
1546+
1547+
Parameters->Algorithm = MlasConvAlgorithmDepthwiseMultiplierGreaterThan1;
1548+
return;
1549+
}
1550+
#endif
1551+
14561552
#if defined(MLAS_TARGET_WASM_SCALAR) || defined(MLAS_TARGET_ARM64)
14571553

14581554
// Scalar (WASM_SCALAR) / vectorized (ARM64) direct conv for depthwise convolution.
Lines changed: 172 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,172 @@
1+
/*++
2+
Copyright (c) Microsoft Corporation. All rights reserved.
3+
Licensed under the MIT License.
4+
Module Name:
5+
sconv_nchw_depthwise_multiplier_greater_than_1_avx512f.cpp
6+
Abstract:
7+
This module implements the AVX512F kernel for the exact MobileClip grouped
8+
projection case:
9+
10+
- CHW input/output layout per group slice
11+
- input channels per group = 1
12+
- output channels per group = 2
13+
- kernel = 7x7
14+
- stride = 2x2
15+
- padding = 3,3,3,3
16+
- dilation = 1x1
17+
18+
The outer dispatch is expected to guarantee these constraints.
19+
--*/
20+
21+
#include "mlasi.h"
22+
23+
#if defined(MLAS_TARGET_AMD64)
24+
25+
namespace {
26+
27+
MLAS_FORCEINLINE
28+
void
29+
MlasConv2dSingleChannelCHWKernel7x7Pad3Stride2Dilation1DepthMultiplier2Scalar(
30+
const float* Input,
31+
size_t InputHeight,
32+
size_t InputWidth,
33+
const float* Filter0,
34+
const float* Filter1,
35+
float* Output0,
36+
float* Output1,
37+
size_t OutputWidth,
38+
size_t oh,
39+
size_t ow,
40+
float Beta
41+
)
42+
/*++
43+
44+
Routine Description:
45+
46+
Computes one border output point for the exact MobileClip
47+
7x7/pad-3/stride-2/dilation-1, multiplier-2 case.
48+
49+
This helper is only used by the AVX512 implementation for border handling;
50+
it is not a generic fallback dispatch path despite the scalar
51+
implementation.
52+
53+
--*/
54+
{
55+
const ptrdiff_t input_origin_y = static_cast<ptrdiff_t>(oh * 2) - 3;
56+
const ptrdiff_t input_origin_x = static_cast<ptrdiff_t>(ow * 2) - 3;
57+
const size_t output_index = oh * OutputWidth + ow;
58+
59+
float acc0 = (Beta == 0.0f) ? 0.0f : Output0[output_index] * Beta;
60+
float acc1 = (Beta == 0.0f) ? 0.0f : Output1[output_index] * Beta;
61+
62+
for (size_t kh = 0; kh < 7; ++kh) {
63+
const ptrdiff_t ih = input_origin_y + static_cast<ptrdiff_t>(kh);
64+
if (ih < 0 || ih >= static_cast<ptrdiff_t>(InputHeight)) {
65+
continue;
66+
}
67+
68+
const float* input_row = Input + static_cast<size_t>(ih) * InputWidth;
69+
const float* filter0_row = Filter0 + kh * 7;
70+
const float* filter1_row = Filter1 + kh * 7;
71+
72+
for (size_t kw = 0; kw < 7; ++kw) {
73+
const ptrdiff_t iw = input_origin_x + static_cast<ptrdiff_t>(kw);
74+
if (iw < 0 || iw >= static_cast<ptrdiff_t>(InputWidth)) {
75+
continue;
76+
}
77+
78+
const float input_value = input_row[static_cast<size_t>(iw)];
79+
acc0 += input_value * filter0_row[kw];
80+
acc1 += input_value * filter1_row[kw];
81+
}
82+
}
83+
84+
Output0[output_index] = acc0;
85+
Output1[output_index] = acc1;
86+
}
87+
88+
} // namespace
89+
90+
void
91+
MlasConvDepthwiseMultiplier2CHWKernel7x7S2Avx512F(
92+
const float* Input,
93+
size_t InputHeight,
94+
size_t InputWidth,
95+
const float* Filter,
96+
float* Output,
97+
size_t OutputHeight,
98+
size_t OutputWidth,
99+
float Beta
100+
)
101+
/*++
102+
103+
Routine Description:
104+
105+
Computes one group slice of the exact MobileClip grouped projection case.
106+
107+
Assumptions:
108+
109+
- Input and output are CHW tensors for a single group slice.
110+
- Filter is OIHW for a single group slice with exactly two output channels.
111+
- Kernel = 7x7, stride = 2, padding = 3, dilation = 1.
112+
- OutputHeight and OutputWidth match the supplied input geometry.
113+
114+
Return Value:
115+
116+
None.
117+
118+
--*/
119+
{
120+
constexpr size_t KernelSize = 7;
121+
constexpr __mmask16 ValidKernelMask = 0x007F;
122+
123+
const float* Filter0 = Filter;
124+
const float* Filter1 = Filter + KernelSize * KernelSize;
125+
float* Output0 = Output;
126+
float* Output1 = Output + (OutputHeight * OutputWidth);
127+
128+
for (size_t oh = 0; oh < OutputHeight; ++oh) {
129+
const ptrdiff_t input_origin_y = static_cast<ptrdiff_t>(oh * 2) - 3;
130+
const bool interior_y = input_origin_y >= 0 &&
131+
(input_origin_y + static_cast<ptrdiff_t>(KernelSize)) <= static_cast<ptrdiff_t>(InputHeight);
132+
133+
for (size_t ow = 0; ow < OutputWidth; ++ow) {
134+
const ptrdiff_t input_origin_x = static_cast<ptrdiff_t>(ow * 2) - 3;
135+
const bool interior_x = input_origin_x >= 0 &&
136+
(input_origin_x + static_cast<ptrdiff_t>(KernelSize)) <= static_cast<ptrdiff_t>(InputWidth);
137+
138+
if (!(interior_y && interior_x)) {
139+
MlasConv2dSingleChannelCHWKernel7x7Pad3Stride2Dilation1DepthMultiplier2Scalar(
140+
Input, InputHeight, InputWidth, Filter0, Filter1, Output0, Output1, OutputWidth, oh, ow, Beta);
141+
continue;
142+
}
143+
144+
__m512 acc0 = _mm512_setzero_ps();
145+
__m512 acc1 = _mm512_setzero_ps();
146+
147+
for (size_t kh = 0; kh < KernelSize; ++kh) {
148+
const float* input_row = Input + (static_cast<size_t>(input_origin_y) + kh) * InputWidth + static_cast<size_t>(input_origin_x);
149+
const __m512 input_vec = _mm512_maskz_loadu_ps(ValidKernelMask, input_row);
150+
const __m512 filter0_vec = _mm512_maskz_loadu_ps(ValidKernelMask, Filter0 + kh * KernelSize);
151+
const __m512 filter1_vec = _mm512_maskz_loadu_ps(ValidKernelMask, Filter1 + kh * KernelSize);
152+
153+
acc0 = _mm512_fmadd_ps(input_vec, filter0_vec, acc0);
154+
acc1 = _mm512_fmadd_ps(input_vec, filter1_vec, acc1);
155+
}
156+
157+
const size_t output_index = oh * OutputWidth + ow;
158+
float acc0_scalar = _mm512_reduce_add_ps(acc0);
159+
float acc1_scalar = _mm512_reduce_add_ps(acc1);
160+
161+
if (Beta != 0.0f) {
162+
acc0_scalar += Output0[output_index] * Beta;
163+
acc1_scalar += Output1[output_index] * Beta;
164+
}
165+
166+
Output0[output_index] = acc0_scalar;
167+
Output1[output_index] = acc1_scalar;
168+
}
169+
}
170+
}
171+
172+
#endif

onnxruntime/core/mlas/lib/mlasi.h

Lines changed: 22 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1688,8 +1688,6 @@ MlasFp32FromBits(
16881688
#endif
16891689

16901690
#if defined(MLAS_TARGET_WASM_SCALAR) || defined(MLAS_TARGET_ARM64)
1691-
1692-
16931691
void
16941692
MLASCALL
16951693
MlasConvDepthwiseFloat_CHW(
@@ -1702,6 +1700,28 @@ MlasConvDepthwiseFloat_CHW(
17021700

17031701
#endif
17041702

1703+
void
1704+
MlasConvDepthwiseWithMultiplierFloat_CHW(
1705+
const MLAS_CONV_PARAMETERS* Parameters,
1706+
const float* Input,
1707+
const float* Filter,
1708+
float* Output,
1709+
const float* Zeros
1710+
);
1711+
1712+
#if defined(MLAS_TARGET_AMD64)
1713+
void
1714+
MlasConvDepthwiseMultiplier2CHWKernel7x7S2Avx512F(
1715+
const float* Input,
1716+
size_t InputHeight,
1717+
size_t InputWidth,
1718+
const float* Filter,
1719+
float* Output,
1720+
size_t OutputHeight,
1721+
size_t OutputWidth,
1722+
float Beta
1723+
);
1724+
#endif
17051725

17061726
//
17071727
// Define the missing ARM64 NEON intrinsic macros from arm64_neon.h that enable

0 commit comments

Comments
 (0)