openvinotoolkit
diff --git a/src/cpu/x64/jit_uni_x8s8s32x_1x1_conv_kernel.cpp b/src/cpu/x64/jit_uni_x8s8s32x_1x1_conv_kernel.cpp
Lines changed: 43 additions & 104 deletions
diff --git a/src/cpu/x64/jit_uni_x8s8s32x_1x1_conv_kernel.hpp b/src/cpu/x64/jit_uni_x8s8s32x_1x1_conv_kernel.hpp
Lines changed: 13 additions & 16 deletions
@@ -318,6 +318,7 @@ void jit_uni_x8s8s32x_1x1_conv_kernel_vmm_t<isa, Vmm>::reduce_loop(
         const int32_t *p_sum_zp
                 = (sum_idx != -1) ? &p.entry_[sum_idx].sum.zero_point : nullptr;
         mov(ptr[rsp + reg_bcast_data_off], reg_bcast_data);
+        mov(reg_ptr_scales, ptr[rsp + reg_ptr_sum_scale_off]);
         if (p_sum_scale && *p_sum_scale != 1.f) {
             mov(ptr[rsp + reg_load_data_off], reg_load_data);
             mov(reg_ptr_sum_scale, reinterpret_cast<size_t>(p_sum_scale));
@@ -332,6 +333,14 @@ void jit_uni_x8s8s32x_1x1_conv_kernel_vmm_t<isa, Vmm>::reduce_loop(
             }
             const bool mask_flag = mask_flag_in && i_load == load_loop_blk - 1;
             const int load_size = mask_flag ? get_tail_size() : simd_w;
+            const auto ptr_scales_offset
+                    = jcp.is_oc_scale * (sizeof(float) * jcp.oc_block * i_load);
+            if (jcp.with_bias) {
+                if (jcp.signed_input || jcp.dst_scale || jcp.with_input_zp)
+                    mov(reg_bias_data, ptr[rsp + reg_bias_data_off]);
+                cvt2ps(jcp.bia_dt, vmm_bias, reg_bias_data,
+                        jcp.typesize_bia * jcp.oc_block * i_load, load_size);
+            }
             if (jcp.signed_input || jcp.with_input_zp) {
                 mov(reg_comp_data, ptr[rsp + reg_comp_data_off]);
                 cvt2ps(data_type::s32, vmm_comp, reg_comp_data,
@@ -347,75 +356,12 @@ void jit_uni_x8s8s32x_1x1_conv_kernel_vmm_t<isa, Vmm>::reduce_loop(
                 uni_vcvtdq2ps(vmm_zp_comp, vmm_zp_comp);
             }
 
-            // TODO: scales support is done not in the most optimal way.
-            // If there're two free Vmm registers, one can be used to store
-            // scale_adjust value permanently, the second one can re-use data
-            // from it and multiply by src_scale that can be obtained at the
-            // point of scales loading. Then it can be used when multiplying
-            // by wei_scales. And further re-used for dst scales to avoid
-            // reading from the same address, but reading from the Vmm instead.
-            // This would save 1st and 3rd sections for every output Vmm.
-            //
-            // If only one Vmm is found, it will add scale_adjust overhead per
-            // src_scale loading, but the second part of the idea holds.
-            //
-            // Note: attempts to identify these Vmms were not taken.
-
-            // `avx2` is less flexible ISA in terms of tail and broadcast handling.
-            // Thus, need to save scales values in Vmm registers.
-            bool is_vmm_scales_set = false;
-            if (jcp.with_src_scales) {
-                mov(reg_src_scales, ptr[rsp + reg_src_scales_off]);
-                uni_vbroadcastss(vmm_scales, ptr[reg_src_scales]);
-                is_vmm_scales_set = true;
-            }
-            if (jcp.with_wei_scales) {
-                mov(reg_wei_scales, ptr[rsp + reg_wei_scales_off]);
-
-                if (!jcp.is_oc_scale) {
-                    uni_vbroadcastss(vmm_scales_tmp, ptr[reg_wei_scales]);
-                } else {
-                    int scale_offset = jcp.is_oc_scale
-                            * (sizeof(float) * jcp.oc_block * i_load);
-                    if (mask_flag) {
-                        uni_vpxor(
-                                vmm_scales_tmp, vmm_scales_tmp, vmm_scales_tmp);
-                        cvt2ps(data_type::f32, vmm_scales_tmp, reg_wei_scales,
-                                scale_offset, get_tail_size());
-                    } else {
-                        uni_vmovups(vmm_scales_tmp,
-                                ptr[reg_wei_scales + scale_offset]);
-                    }
-                }
-                if (is_vmm_scales_set) {
-                    uni_vmulps(vmm_scales, vmm_scales, vmm_scales_tmp);
-                } else {
-                    uni_vmovups(vmm_scales, vmm_scales_tmp);
-                }
-                is_vmm_scales_set = true;
-            }
-            if (jcp.wei_adj_scale != 1.f) {
-                mov(reg_scale_adjust, float2int(1.f / jcp.wei_adj_scale));
-                auto vmm_scale_adjust = vmm_scales_tmp;
-                auto xmm_scale_adjust = Xmm(vmm_scale_adjust.getIdx());
-                uni_vmovq(xmm_scale_adjust, reg_scale_adjust);
-                uni_vbroadcastss(vmm_scale_adjust, xmm_scale_adjust);
-                if (is_vmm_scales_set) {
-                    uni_vmulps(vmm_scales, vmm_scales, vmm_scale_adjust);
-                } else {
-                    uni_vmovups(vmm_scales, vmm_scale_adjust);
-                }
-                is_vmm_scales_set = true;
-            }
-
-            // The order of this load is important. `vmm_bias` is used as a
-            // temporary vector register for scales. Load bias data into it
-            // after scales are processed.
-            if (jcp.with_bias) {
-                if (jcp.signed_input || jcp.with_dst_scales)
-                    mov(reg_bias_data, ptr[rsp + reg_bias_data_off]);
-                cvt2ps(jcp.bia_dt, vmm_bias, reg_bias_data,
-                        jcp.typesize_bia * jcp.oc_block * i_load, load_size);
+            if (mask_flag) {
+                uni_vpxor(vmm_scale, vmm_scale, vmm_scale);
+                cvt2ps(data_type::f32, vmm_scale, reg_ptr_scales,
+                        ptr_scales_offset, get_tail_size());
+            } else {
+                uni_vmovups(vmm_scale, ptr[reg_ptr_scales + ptr_scales_offset]);
             }
 
             for (int i_ur = 0; i_ur < ur; ++i_ur) {
@@ -424,23 +370,23 @@ void jit_uni_x8s8s32x_1x1_conv_kernel_vmm_t<isa, Vmm>::reduce_loop(
                 if (jcp.signed_input || jcp.with_input_zp) uni_vaddps(r, r, vmm_comp);
                 if (jcp.src_zero_point) uni_vaddps(r, r, vmm_zp_comp);
 
-                if (is_vmm_scales_set) uni_vmulps(r, r, vmm_scales);
+                uni_vmulps(r, r, vmm_scale);
 
                 if (jcp.with_bias) uni_vaddps(r, r, vmm_bias);
             }
         }
 
         apply_postops(ur, load_loop_blk, mask_flag_in, p_sum_scale, p_sum_zp);
 
-        if (jcp.with_dst_scales) {
-            mov(reg_dst_scales, ptr[rsp + reg_dst_scales_off]);
-            uni_vbroadcastss(vmm_dst_scales, ptr[reg_dst_scales]);
+        if (jcp.dst_scale) {
+            mov(reg_ptr_dst_scale, ptr[rsp + reg_dst_scale_off]);
+            uni_vmovups(vmm_dst_scale, ptr[reg_ptr_dst_scale]);
 
             /* Apply dst scale to accumulator */
             for (int i_load = 0; i_load < load_loop_blk; ++i_load) {
                 for (int i_ur = 0; i_ur < ur; ++i_ur) {
                     const auto r = vreg_accum(load_loop_blk, i_load, i_ur);
-                    uni_vmulps(r, r, vmm_dst_scales);
+                    uni_vmulps(r, r, vmm_dst_scale);
                 }
             }
         }
@@ -604,23 +550,18 @@ void jit_uni_x8s8s32x_1x1_conv_kernel_vmm_t<isa, Vmm>::generate() {
         mov(reg_src_zero_point, ptr[param1 + GET_OFF(src_zero_point)]);
         mov(ptr[rsp + reg_src_zero_point_off], reg_src_zero_point);
     }
-    if (jcp.with_src_scales) {
-        mov(reg_src_scales, ptr[param1 + GET_OFF(src_scales)]);
-        mov(ptr[rsp + reg_src_scales_off], reg_src_scales);
-    }
-    if (jcp.with_wei_scales) {
-        mov(reg_wei_scales, ptr[param1 + GET_OFF(wei_scales)]);
-        mov(ptr[rsp + reg_wei_scales_off], reg_wei_scales);
-    }
-    if (jcp.with_dst_scales) {
-        if (!jcp.signed_input && !jcp.with_input_zp) mov(ptr[rsp + reg_bias_data_off], reg_bias_data);
-        mov(reg_dst_scales, ptr[param1 + GET_OFF(dst_scales)]);
-        mov(ptr[rsp + reg_dst_scales_off], reg_dst_scales);
+    if (jcp.dst_scale) {
+        if (!jcp.signed_input && !jcp.with_input_zp)
+            mov(ptr[rsp + reg_bias_data_off], reg_bias_data);
+        mov(reg_ptr_dst_scale, ptr[param1 + GET_OFF(dst_scale)]);
+        mov(ptr[rsp + reg_dst_scale_off], reg_ptr_dst_scale);
     }
     if (jcp.dst_zero_point) {
         mov(reg_dst_zero_point, ptr[param1 + GET_OFF(dst_zero_point)]);
         mov(ptr[rsp + reg_dst_zero_point_off], reg_dst_zero_point);
     }
+    mov(reg_ptr_scales, ptr[param1 + GET_OFF(scales)]);
+    mov(ptr[rsp + reg_ptr_sum_scale_off], reg_ptr_scales);
     mov(reg_bcast_data, ptr[param1 + GET_OFF(bcast_data)]);
     mov(reg_load_data, ptr[param1 + GET_OFF(load_data)]);
     mov(reg_output_data, ptr[param1 + GET_OFF(output_data)]);
@@ -636,11 +577,11 @@ void jit_uni_x8s8s32x_1x1_conv_kernel_vmm_t<isa, Vmm>::generate() {
         bcast_loop(load_loop_blk);
         add(reg_load_data, load_loop_blk * jcp.load_loop_load_step);
         if (jcp.with_bias) {
-            if (jcp.signed_input || jcp.with_dst_scales || jcp.with_input_zp)
+            if (jcp.signed_input || jcp.dst_scale || jcp.with_input_zp)
                 mov(reg_bias_data, ptr[rsp + reg_bias_data_off]);
             add(reg_bias_data,
                     load_loop_blk * jcp.load_block * jcp.typesize_bia);
-            if (jcp.signed_input || jcp.with_dst_scales || jcp.with_input_zp)
+            if (jcp.signed_input || jcp.dst_scale || jcp.with_input_zp)
                 mov(ptr[rsp + reg_bias_data_off], reg_bias_data);
         }
         if (jcp.signed_input || jcp.with_input_zp) {
@@ -656,13 +597,11 @@ void jit_uni_x8s8s32x_1x1_conv_kernel_vmm_t<isa, Vmm>::generate() {
             mov(ptr[rsp + reg_zp_compensation_off], reg_zp_compensation);
         }
         mov(ptr[rsp + reg_bcast_data_off], reg_bcast_data);
-        if (jcp.with_wei_scales) {
-            mov(reg_wei_scales, ptr[rsp + reg_wei_scales_off]);
-            add(reg_wei_scales,
-                    jcp.is_oc_scale * load_loop_blk * jcp.load_block
-                            * sizeof(float));
-            mov(ptr[rsp + reg_wei_scales_off], reg_wei_scales);
-        }
+        mov(reg_ptr_scales, ptr[rsp + reg_ptr_sum_scale_off]);
+        add(reg_ptr_scales,
+                jcp.is_oc_scale * load_loop_blk * jcp.load_block
+                        * sizeof(float));
+        mov(ptr[rsp + reg_ptr_sum_scale_off], reg_ptr_scales);
         mov(reg_bcast_data, ptr[rsp + reg_bcast_data_off]);
         add(reg_output_data, load_loop_blk * jcp.load_block * jcp.typesize_out);
         sub(reg_load_loop_work, load_loop_blk * jcp.load_loop_iter_step);
@@ -1023,11 +962,10 @@ status_t jit_uni_x8s8s32x_1x1_conv_kernel_t<isa>::init_conf(
     // miniumum size of load dim chunk for work distribution within threads
     jcp.nb_load_chunk = 1;
 
-    jcp.is_oc_scale = attr.scales_.get_mask(DNNL_ARG_WEIGHTS) > 0;
-    jcp.with_src_scales = !attr.scales_.get(DNNL_ARG_SRC).has_default_values();
-    jcp.with_wei_scales
-            = !attr.scales_.get(DNNL_ARG_WEIGHTS).has_default_values();
-    jcp.with_dst_scales = !attr.scales_.get(DNNL_ARG_DST).has_default_values();
+    const auto &wei_scales = attr.scales_.get(DNNL_ARG_WEIGHTS);
+    const auto &dst_scales = attr.scales_.get(DNNL_ARG_DST);
+    jcp.is_oc_scale = wei_scales.get_mask() > 0;
+    jcp.dst_scale = !dst_scales.has_default_values();
 
     jcp.wei_adj_scale
             = (weights_d.extra().flags & memory_extra_flags::scale_adjust)
@@ -1043,11 +981,12 @@ void jit_uni_x8s8s32x_1x1_conv_kernel_t<isa>::init_scratchpad(
         const jit_1x1_conv_conf_t &jcp, const primitive_attr_t &attr) {
     using namespace dnnl::impl::memory_tracking::names;
 
-    if (jcp.with_dst_scales) {
-        // See brgemm_types.hpp comment for `with_dst_scales`.
-        scratchpad.book(key_conv_dst_scales,
-                static_cast<size_t>(jcp.nthr) * sizeof(float), 4096);
+    dim_t count = 8;
+    if (!attr.scales_.has_default_values(DNNL_ARG_WEIGHTS)) {
+        const int wei_mask = attr.scales_.get_mask(DNNL_ARG_WEIGHTS);
+        if (wei_mask > 0) count = static_cast<dim_t>(jcp.oc) * jcp.ngroups;
     }
+    scratchpad.book<float>(key_conv_adjusted_scales, count);
 }
 
 template struct jit_uni_x8s8s32x_1x1_conv_kernel_vmm_t<avx2, Ymm>;
 
@@ -48,17 +48,15 @@ struct jit_uni_x8s8s32x_1x1_conv_kernel_vmm_t : public jit_generator_t {
         ker_max_reg_idx = 13,
     };
     const Xbyak::Reg64 reg_bcast_data = r8;
+    const Xbyak::Reg64 reg_ptr_scales = r8;
     const Xbyak::Reg64 reg_output_data = r9;
-    const Xbyak::Reg64 reg_src_scales = r8;
-    const Xbyak::Reg64 reg_wei_scales = r8;
-    const Xbyak::Reg64 reg_scale_adjust = r8;
-    const Xbyak::Reg64 reg_dst_scales = r12;
     const Xbyak::Reg64 reg_load_data = r10;
     const Xbyak::Reg64 reg_ptr_sum_scale = r10;
     const Xbyak::Reg64 reg_ptr_sum_zp = rdx;
     const Xbyak::Reg64 reg_reduce_loop_work = r11;
     const Xbyak::Reg64 reg_bias_data = r12;
     const Xbyak::Reg64 reg_comp_data = r12;
+    const Xbyak::Reg64 reg_ptr_dst_scale = r12;
     const Xbyak::Reg64 reg_init_bcast = r13;
     const Xbyak::Reg64 reg_store_bcast = r13;
     const Xbyak::Reg64 reg_reduce_loop_iter = r13;
@@ -91,9 +89,7 @@ struct jit_uni_x8s8s32x_1x1_conv_kernel_vmm_t : public jit_generator_t {
     const Vmm vmm_bcast = Vmm(0);
     const Vmm vmm_saturation = Vmm(0);
     /* used during scale section of store_output */
-    const Vmm vmm_scales = Vmm(1);
-    const Vmm vmm_scales_tmp = Vmm(3); // Has dependency on `vmm_bias`.
-    const Vmm vmm_dst_scales = Vmm(1);
+    const Vmm vmm_scale = Vmm(1);
     /* used during post_op sum section of store_output */
     const Vmm vmm_prev_dst = Vmm(1);
     /* used during bias section of store_output */
@@ -102,22 +98,23 @@ struct jit_uni_x8s8s32x_1x1_conv_kernel_vmm_t : public jit_generator_t {
     /* zero-point */
     const Vmm vmm_zp = Vmm(1);
     const Vmm vmm_zp_comp = Vmm(2);
+    /* dst scale */
+    const Vmm vmm_dst_scale = Vmm(1);
 
     constexpr static int simd_w = isa == avx2 ? 8 : 4;
     constexpr static int reg64_size = sizeof(int64_t);
     constexpr static int bcast_loop_work_off = 0;
     constexpr static int reg_bias_data_off = 1 * reg64_size;
     constexpr static int reg_bcast_data_off = 2 * reg64_size;
     constexpr static int reg_load_data_off = 3 * reg64_size;
-    constexpr static int reg_src_scales_off = 4 * reg64_size;
-    constexpr static int reg_wei_scales_off = 5 * reg64_size;
-    constexpr static int reg_dst_scales_off = 6 * reg64_size;
-    constexpr static int reg_bcast_loop_iter_off = 7 * reg64_size;
-    constexpr static int reg_comp_data_off = 8 * reg64_size;
-    constexpr static int reg_zp_compensation_off = 9 * reg64_size;
-    constexpr static int reg_src_zero_point_off = 10 * reg64_size;
-    constexpr static int reg_dst_zero_point_off = 11 * reg64_size;
-    constexpr static int stack_space_needed = 12 * reg64_size;
+    constexpr static int reg_ptr_sum_scale_off = 4 * reg64_size;
+    constexpr static int reg_bcast_loop_iter_off = 5 * reg64_size;
+    constexpr static int reg_comp_data_off = 6 * reg64_size;
+    constexpr static int reg_zp_compensation_off = 7 * reg64_size;
+    constexpr static int reg_src_zero_point_off = 8 * reg64_size;
+    constexpr static int reg_dst_zero_point_off = 9 * reg64_size;
+    constexpr static int reg_dst_scale_off = 10 * reg64_size;
+    constexpr static int stack_space_needed = 11 * reg64_size;
 
     int vreg_accum_idx(
             const int load_loop_blk, const int i_load, const int i_ur);