Skip to content

Commit afaf212

Browse files
alankelly authored and xnnpack-bot committed
Add QD8 F32 QC8W convolution operator
The GEMM path is disabled because each GEMM microkernel supports per-row quantization, whereas convolution requires per-batch quantization. New GEMM microkernels with per-batch quantization were generated and benchmarked. The largest performance difference found was 4% faster than IGEMM. Using the existing GEMM microkernels with padded quantization parameters was slower than IGEMM. Given the quantity of new code and the associated increase in binary size, we decided that QD8_F32_QC8W will always take the IGEMM path. We can revisit this decision if a use case is found for GEMM-based dynamically quantized convolution. PiperOrigin-RevId: 573817352
1 parent 93cb2fe commit afaf212

15 files changed

Lines changed: 1937 additions & 105 deletions

BUILD.bazel

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -570,6 +570,7 @@ xnnpack_cc_library(
570570
msvc_copts = xnnpack_msvc_std_copts(),
571571
deps = [
572572
":common",
573+
":math",
573574
":microparams",
574575
],
575576
)

CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -515,6 +515,7 @@ IF(XNNPACK_BUILD_LIBRARY)
515515
# Need C_EXTENSIONS to get constants for mmap (MAP_ANONYMOUS).
516516
SET_TARGET_PROPERTIES(memory PROPERTIES C_EXTENSIONS YES)
517517
ADD_LIBRARY(convolution-test-helpers OBJECT test/convolution-test-helpers.cc)
518+
TARGET_INCLUDE_DIRECTORIES(convolution-test-helpers PRIVATE include src)
518519
ADD_LIBRARY(post-operation OBJECT src/operators/post-operation.c)
519520
IF(XNNPACK_LIBRARY_TYPE STREQUAL "default")
520521
ADD_LIBRARY(XNNPACK ${XNNPACK_SRCS})

include/xnnpack.h

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3004,6 +3004,18 @@ enum xnn_status xnn_setup_convolution2d_nhwc_f32(
30043004
const float* input,
30053005
float* output);
30063006

3007+
enum xnn_status xnn_create_convolution2d_nhwc_qd8_f32_qc8w(
3008+
uint32_t input_padding_top, uint32_t input_padding_right,
3009+
uint32_t input_padding_bottom, uint32_t input_padding_left,
3010+
uint32_t kernel_height, uint32_t kernel_width, uint32_t subsampling_height,
3011+
uint32_t subsampling_width, uint32_t dilation_height,
3012+
uint32_t dilation_width, uint32_t groups, size_t group_input_channels,
3013+
size_t group_output_channels, size_t input_channel_stride,
3014+
size_t output_channel_stride, const float* kernel_scale,
3015+
const int8_t* kernel, const float* bias, float output_min, float output_max,
3016+
uint32_t flags, xnn_code_cache_t code_cache,
3017+
xnn_weights_cache_t weights_cache, xnn_operator_t* convolution_op_out);
3018+
30073019
enum xnn_status xnn_create_convolution2d_nhwc_qs8(
30083020
uint32_t input_padding_top,
30093021
uint32_t input_padding_right,
@@ -3034,6 +3046,12 @@ enum xnn_status xnn_create_convolution2d_nhwc_qs8(
30343046
xnn_weights_cache_t weights_cache,
30353047
xnn_operator_t* convolution_op_out);
30363048

3049+
enum xnn_status xnn_reshape_convolution2d_nhwc_qd8_f32_qc8w(
3050+
xnn_operator_t convolution_op, size_t batch_size, size_t input_height,
3051+
size_t input_width, size_t* workspace_size, size_t* workspace_alignment,
3052+
size_t* output_height_out, size_t* output_width_out,
3053+
pthreadpool_t threadpool);
3054+
30373055
enum xnn_status xnn_reshape_convolution2d_nhwc_qs8(
30383056
xnn_operator_t convolution_op,
30393057
size_t batch_size,
@@ -3045,6 +3063,11 @@ enum xnn_status xnn_reshape_convolution2d_nhwc_qs8(
30453063
size_t* output_width_out,
30463064
pthreadpool_t threadpool);
30473065

3066+
enum xnn_status xnn_setup_convolution2d_nhwc_qd8_f32_qc8w(
3067+
xnn_operator_t convolution_op, void* workspace, const int8_t* input,
3068+
float* output,
3069+
const struct xnn_dynamic_quantization_params* quantization_params);
3070+
30483071
enum xnn_status xnn_setup_convolution2d_nhwc_qs8(
30493072
xnn_operator_t convolution_op,
30503073
void* workspace,

src/enums/operator-type.c

Lines changed: 9 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -14,15 +14,15 @@
1414
#include <xnnpack/operator-type.h>
1515

1616

17-
static const uint16_t offset[148] = {
17+
static const uint16_t offset[149] = {
1818
0, 8, 22, 36, 50, 64, 78, 92, 119, 147, 175, 203, 230, 257, 289, 321, 339, 357, 382, 408, 424, 440, 455, 470, 492,
19-
515, 538, 561, 584, 607, 630, 653, 671, 694, 718, 736, 759, 783, 807, 831, 855, 879, 903, 927, 941, 956, 971, 997,
20-
1023, 1049, 1075, 1107, 1139, 1165, 1192, 1219, 1236, 1253, 1287, 1321, 1335, 1349, 1363, 1379, 1395, 1421, 1447,
21-
1479, 1511, 1548, 1585, 1611, 1643, 1669, 1703, 1737, 1771, 1805, 1839, 1873, 1903, 1933, 1953, 1973, 1994, 2015,
22-
2036, 2057, 2081, 2105, 2128, 2151, 2169, 2187, 2202, 2217, 2235, 2253, 2272, 2291, 2310, 2329, 2346, 2363, 2379,
23-
2395, 2423, 2451, 2479, 2507, 2534, 2561, 2578, 2619, 2660, 2678, 2696, 2714, 2732, 2747, 2763, 2779, 2797, 2815,
24-
2833, 2859, 2886, 2913, 2930, 2947, 2969, 2991, 3020, 3049, 3068, 3087, 3106, 3125, 3140, 3155, 3170, 3185, 3204,
25-
3224, 3244, 3264, 3285, 3306
19+
515, 538, 561, 584, 607, 630, 653, 671, 694, 718, 736, 759, 783, 807, 831, 855, 890, 914, 938, 962, 976, 991, 1006,
20+
1032, 1058, 1084, 1110, 1142, 1174, 1200, 1227, 1254, 1271, 1288, 1322, 1356, 1370, 1384, 1398, 1414, 1430, 1456,
21+
1482, 1514, 1546, 1583, 1620, 1646, 1678, 1704, 1738, 1772, 1806, 1840, 1874, 1908, 1938, 1968, 1988, 2008, 2029,
22+
2050, 2071, 2092, 2116, 2140, 2163, 2186, 2204, 2222, 2237, 2252, 2270, 2288, 2307, 2326, 2345, 2364, 2381, 2398,
23+
2414, 2430, 2458, 2486, 2514, 2542, 2569, 2596, 2613, 2654, 2695, 2713, 2731, 2749, 2767, 2782, 2798, 2814, 2832,
24+
2850, 2868, 2894, 2921, 2948, 2965, 2982, 3004, 3026, 3055, 3084, 3103, 3122, 3141, 3160, 3175, 3190, 3205, 3220,
25+
3239, 3259, 3279, 3299, 3320, 3341
2626
};
2727

2828
static const char data[] =
@@ -66,6 +66,7 @@ static const char data[] =
6666
"Convolution (NCHW, F32)\0"
6767
"Convolution (NHWC, F16)\0"
6868
"Convolution (NHWC, F32)\0"
69+
"Convolution (NHWC, QD8, F32, QC8W)\0"
6970
"Convolution (NHWC, QC8)\0"
7071
"Convolution (NHWC, QS8)\0"
7172
"Convolution (NHWC, QU8)\0"

src/enums/operator-type.yaml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -85,6 +85,8 @@
8585
string: "Convolution (NHWC, F16)"
8686
- name: xnn_operator_type_convolution_nhwc_f32
8787
string: "Convolution (NHWC, F32)"
88+
- name: xnn_operator_type_convolution_nhwc_qd8_f32_qc8w
89+
string: "Convolution (NHWC, QD8, F32, QC8W)"
8890
- name: xnn_operator_type_convolution_nhwc_qc8
8991
string: "Convolution (NHWC, QC8)"
9092
- name: xnn_operator_type_convolution_nhwc_qs8

src/operator-run.c

Lines changed: 220 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -500,6 +500,34 @@ void xnn_compute_grouped_batch_igemm(
500500
&context->params);
501501
}
502502

503+
void xnn_compute_grouped_batch_dqigemm(
504+
const struct igemm_context context[restrict XNN_MIN_ELEMENTS(1)],
505+
size_t batch_index,
506+
size_t group_index,
507+
size_t mr_block_start,
508+
size_t nr_block_start,
509+
size_t mr_block_size,
510+
size_t nr_block_size)
511+
{
512+
const size_t ks = context->ks;
513+
const size_t cm_stride = context->cm_stride;
514+
515+
context->dq_ukernel.function[XNN_UARCH_DEFAULT](
516+
mr_block_size,
517+
nr_block_size,
518+
context->kc,
519+
context->ks_scaled,
520+
(const void**) ((uintptr_t) context->indirect_a + mr_block_start * ks * sizeof(void*)),
521+
(const void*) ((uintptr_t) context->packed_w + nr_block_start * context->w_stride + group_index * context->gw_stride),
522+
(void*) ((uintptr_t) context->c + group_index * context->gc_stride + batch_index * context->bc_stride + mr_block_start * cm_stride + (nr_block_start << context->log2_csize)),
523+
cm_stride,
524+
context->cn_stride,
525+
context->a_offset + group_index * context->ga_stride + batch_index * context->ba_stride,
526+
context->zero,
527+
&context->params,
528+
(const void*) ((uintptr_t) &context->quantization_params[batch_index]));
529+
}
530+
503531
void xnn_compute_grouped_igemm(
504532
const struct igemm_context context[restrict XNN_MIN_ELEMENTS(1)],
505533
size_t group_index,
@@ -526,6 +554,33 @@ void xnn_compute_grouped_igemm(
526554
&context->params);
527555
}
528556

557+
void xnn_compute_grouped_dqigemm(
558+
const struct igemm_context context[restrict XNN_MIN_ELEMENTS(1)],
559+
size_t group_index,
560+
size_t mr_block_start,
561+
size_t nr_block_start,
562+
size_t mr_block_size,
563+
size_t nr_block_size)
564+
{
565+
const size_t ks = context->ks;
566+
const size_t cm_stride = context->cm_stride;
567+
568+
context->dq_ukernel.function[XNN_UARCH_DEFAULT](
569+
mr_block_size,
570+
nr_block_size,
571+
context->kc,
572+
context->ks_scaled,
573+
(const void**) ((uintptr_t) context->indirect_a + mr_block_start * ks * sizeof(void*)),
574+
(const void*) ((uintptr_t) context->packed_w + nr_block_start * context->w_stride + group_index * context->gw_stride),
575+
(void*) ((uintptr_t) context->c + group_index * context->gc_stride + mr_block_start * cm_stride + (nr_block_start << context->log2_csize)),
576+
cm_stride,
577+
context->cn_stride,
578+
context->a_offset + group_index * context->ga_stride,
579+
context->zero,
580+
&context->params,
581+
(const void*) ((uintptr_t) context->quantization_params));
582+
}
583+
529584
void xnn_compute_batch_igemm(
530585
const struct igemm_context context[restrict XNN_MIN_ELEMENTS(1)],
531586
size_t batch_index,
@@ -552,6 +607,33 @@ void xnn_compute_batch_igemm(
552607
&context->params);
553608
}
554609

610+
void xnn_compute_batch_dqigemm(
611+
const struct igemm_context context[restrict XNN_MIN_ELEMENTS(1)],
612+
size_t batch_index,
613+
size_t mr_block_start,
614+
size_t nr_block_start,
615+
size_t mr_block_size,
616+
size_t nr_block_size)
617+
{
618+
const size_t ks = context->ks;
619+
const size_t cm_stride = context->cm_stride;
620+
621+
context->dq_ukernel.function[XNN_UARCH_DEFAULT](
622+
mr_block_size,
623+
nr_block_size,
624+
context->kc,
625+
context->ks_scaled,
626+
(const void**) ((uintptr_t) context->indirect_a + mr_block_start * ks * sizeof(void*)),
627+
(const void*) ((uintptr_t) context->packed_w + nr_block_start * context->w_stride),
628+
(void*) ((uintptr_t) context->c + batch_index * context->bc_stride + mr_block_start * cm_stride + (nr_block_start << context->log2_csize)),
629+
cm_stride,
630+
context->cn_stride,
631+
context->a_offset + batch_index * context->ba_stride,
632+
context->zero,
633+
&context->params,
634+
(const void*) ((uintptr_t) &context->quantization_params[batch_index]));
635+
}
636+
555637
void xnn_compute_igemm(
556638
const struct igemm_context context[restrict XNN_MIN_ELEMENTS(1)],
557639
size_t mr_block_start,
@@ -577,6 +659,31 @@ void xnn_compute_igemm(
577659
&context->params);
578660
}
579661

662+
void xnn_compute_dqigemm(
663+
const struct igemm_context context[restrict XNN_MIN_ELEMENTS(1)],
664+
size_t mr_block_start,
665+
size_t nr_block_start,
666+
size_t mr_block_size,
667+
size_t nr_block_size)
668+
{
669+
const size_t ks = context->ks;
670+
const size_t cm_stride = context->cm_stride;
671+
672+
context->dq_ukernel.function[XNN_UARCH_DEFAULT](
673+
mr_block_size,
674+
nr_block_size,
675+
context->kc,
676+
context->ks_scaled,
677+
(const void**) ((uintptr_t) context->indirect_a + mr_block_start * ks * sizeof(void*)),
678+
(const void*) ((uintptr_t) context->packed_w + nr_block_start * context->w_stride),
679+
(void*) ((uintptr_t) context->c + mr_block_start * cm_stride + (nr_block_start << context->log2_csize)),
680+
cm_stride,
681+
context->cn_stride,
682+
context->a_offset,
683+
context->zero,
684+
&context->params,
685+
(const void*) ((uintptr_t) &context->quantization_params[/*mr_block_start=*/0]));
686+
}
580687
// `output_tile_start` should be a multiple of igemm.mr (tile size).
581688
void xnn_compute_conv2d_igemm_indirection(
582689
const struct conv2d_igemm_indirection_init_context context[restrict XNN_MIN_ELEMENTS(1)],
@@ -2028,7 +2135,7 @@ void xnn_compute_rope(
20282135
cm_stride,
20292136
context->cn_stride,
20302137
context->fused_params,
2031-
(const void*) ((uintptr_t) &context->quantization_params[mr_block_start]));
2138+
(const void*) ((uintptr_t) &context->quantization_params[mr_block_start]));
20322139
}
20332140

20342141
void xnn_compute_hmp_grouped_batch_igemm(
@@ -2059,6 +2166,35 @@ void xnn_compute_rope(
20592166
&context->params);
20602167
}
20612168

2169+
void xnn_compute_hmp_grouped_batch_dqigemm(
2170+
const struct igemm_context context[restrict XNN_MIN_ELEMENTS(1)],
2171+
uint32_t uarch_index,
2172+
size_t batch_index,
2173+
size_t group_index,
2174+
size_t mr_block_start,
2175+
size_t nr_block_start,
2176+
size_t mr_block_size,
2177+
size_t nr_block_size)
2178+
{
2179+
const size_t ks = context->ks;
2180+
const size_t cm_stride = context->cm_stride;
2181+
2182+
context->dq_ukernel.function[uarch_index](
2183+
mr_block_size,
2184+
nr_block_size,
2185+
context->kc,
2186+
context->ks_scaled,
2187+
(const void**) ((uintptr_t) context->indirect_a + mr_block_start * ks * sizeof(void*)),
2188+
(const void*) ((uintptr_t) context->packed_w + nr_block_start * context->w_stride + group_index * context->gw_stride),
2189+
(void*) ((uintptr_t) context->c + group_index * context->gc_stride + batch_index * context->bc_stride + mr_block_start * cm_stride + (nr_block_start << context->log2_csize)),
2190+
cm_stride,
2191+
context->cn_stride,
2192+
context->a_offset + group_index * context->ga_stride + batch_index * context->ba_stride,
2193+
context->zero,
2194+
&context->params,
2195+
(const void*) ((uintptr_t) &context->quantization_params[batch_index]));
2196+
}
2197+
20622198
void xnn_compute_hmp_grouped_igemm(
20632199
const struct igemm_context context[restrict XNN_MIN_ELEMENTS(1)],
20642200
uint32_t uarch_index,
@@ -2086,6 +2222,34 @@ void xnn_compute_rope(
20862222
&context->params);
20872223
}
20882224

2225+
void xnn_compute_hmp_grouped_dqigemm(
2226+
const struct igemm_context context[restrict XNN_MIN_ELEMENTS(1)],
2227+
uint32_t uarch_index,
2228+
size_t group_index,
2229+
size_t mr_block_start,
2230+
size_t nr_block_start,
2231+
size_t mr_block_size,
2232+
size_t nr_block_size)
2233+
{
2234+
const size_t ks = context->ks;
2235+
const size_t cm_stride = context->cm_stride;
2236+
2237+
context->dq_ukernel.function[uarch_index](
2238+
mr_block_size,
2239+
nr_block_size,
2240+
context->kc,
2241+
context->ks_scaled,
2242+
(const void**) ((uintptr_t) context->indirect_a + mr_block_start * ks * sizeof(void*)),
2243+
(const void*) ((uintptr_t) context->packed_w + nr_block_start * context->w_stride + group_index * context->gw_stride),
2244+
(void*) ((uintptr_t) context->c + group_index * context->gc_stride + mr_block_start * cm_stride + (nr_block_start << context->log2_csize)),
2245+
cm_stride,
2246+
context->cn_stride,
2247+
context->a_offset + group_index * context->ga_stride,
2248+
context->zero,
2249+
&context->params,
2250+
(const void*) ((uintptr_t) context->quantization_params));
2251+
}
2252+
20892253
void xnn_compute_batch_hmp_igemm(
20902254
const struct igemm_context context[restrict XNN_MIN_ELEMENTS(1)],
20912255
uint32_t uarch_index,
@@ -2113,6 +2277,34 @@ void xnn_compute_rope(
21132277
&context->params);
21142278
}
21152279

2280+
void xnn_compute_batch_hmp_dqigemm(
2281+
const struct igemm_context context[restrict XNN_MIN_ELEMENTS(1)],
2282+
uint32_t uarch_index,
2283+
size_t batch_index,
2284+
size_t mr_block_start,
2285+
size_t nr_block_start,
2286+
size_t mr_block_size,
2287+
size_t nr_block_size)
2288+
{
2289+
const size_t ks = context->ks;
2290+
const size_t cm_stride = context->cm_stride;
2291+
2292+
context->dq_ukernel.function[uarch_index](
2293+
mr_block_size,
2294+
nr_block_size,
2295+
context->kc,
2296+
context->ks_scaled,
2297+
(const void**) ((uintptr_t) context->indirect_a + mr_block_start * ks * sizeof(void*)),
2298+
(const void*) ((uintptr_t) context->packed_w + nr_block_start * context->w_stride),
2299+
(void*) ((uintptr_t) context->c + batch_index * context->bc_stride + mr_block_start * cm_stride + (nr_block_start << context->log2_csize)),
2300+
cm_stride,
2301+
context->cn_stride,
2302+
context->a_offset + batch_index * context->ba_stride,
2303+
context->zero,
2304+
&context->params,
2305+
(const void*) ((uintptr_t) &context->quantization_params[batch_index]));
2306+
}
2307+
21162308
void xnn_compute_hmp_igemm(
21172309
const struct igemm_context context[restrict XNN_MIN_ELEMENTS(1)],
21182310
uint32_t uarch_index,
@@ -2139,6 +2331,33 @@ void xnn_compute_rope(
21392331
&context->params);
21402332
}
21412333

2334+
void xnn_compute_hmp_dqigemm(
2335+
const struct igemm_context context[restrict XNN_MIN_ELEMENTS(1)],
2336+
uint32_t uarch_index,
2337+
size_t mr_block_start,
2338+
size_t nr_block_start,
2339+
size_t mr_block_size,
2340+
size_t nr_block_size)
2341+
{
2342+
const size_t ks = context->ks;
2343+
const size_t cm_stride = context->cm_stride;
2344+
2345+
context->dq_ukernel.function[uarch_index](
2346+
mr_block_size,
2347+
nr_block_size,
2348+
context->kc,
2349+
context->ks_scaled,
2350+
(const void**) ((uintptr_t) context->indirect_a + mr_block_start * ks * sizeof(void*)),
2351+
(const void*) ((uintptr_t) context->packed_w + nr_block_start * context->w_stride),
2352+
(void*) ((uintptr_t) context->c + mr_block_start * cm_stride + (nr_block_start << context->log2_csize)),
2353+
cm_stride,
2354+
context->cn_stride,
2355+
context->a_offset,
2356+
context->zero,
2357+
&context->params,
2358+
(const void*) ((uintptr_t) context->quantization_params));
2359+
}
2360+
21422361
void xnn_compute_hmp_scaled_dot_product_attention(
21432362
const struct scaled_dot_product_attention_context context[restrict XNN_MIN_ELEMENTS(1)],
21442363
uint32_t uarch_index,

0 commit comments

Comments
 (0)