@@ -1682,31 +1682,35 @@ IF_AVX(DEF_ISEL(VMOVSHDUP_YMMqq_YMMqq) = MOVSHDUP<VV256W, V256>;)
16821682
16831683namespace {
16841684
1685- template <typename S1>
1686- DEF_SEM (SQRTSS, V128W dst, S1 src1) {
1685+ template <typename D, typename S1, typename S2>
1686+ DEF_SEM (SQRTSS, D dst, S1 src1, S2 src2) {
1687+
1688+ // Extract a "single-precision" (32-bit) float from [31:0] of src2 vector:
1689+ auto src_float = FExtractV32 (FReadV32 (src2), 0 );
16871690
1688- // Extract a "single-precision" (32-bit) float from [31:0] of src1 vector:
1689- auto src_float = FExtractV32 ( FReadV32 (src1), 0 );
1691+ // Initialize dest vector, while also copying src1[127:32] -> dst[127:32].
1692+ auto temp_vec = FReadV32 (src1);
16901693
1691- // Store the square root result in dest[32 :0]:
1694+ // Store the square root result in dest[31:0]:
16921695 auto square_root = SquareRoot32 (memory, state, src_float);
1693- auto temp_vec = FReadV32 (dst); // initialize a destination vector
16941696 temp_vec = FInsertV32 (temp_vec, 0 , square_root);
16951697
16961698 // Write out the result and return memory state:
16971699 FWriteV32 (dst, temp_vec); // SSE: Writes to XMM, AVX: Zero-extends XMM.
16981700 return memory;
16991701}
17001702
1701- template <typename S1 >
1702- DEF_SEM (RSQRTSS, V128W dst, S1 src1) {
1703+ template <typename D, typename S1, typename S2 >
1704+ DEF_SEM (RSQRTSS, D dst, S1 src1, S2 src2 ) {
17031705
1704- // Extract a "single-precision" (32-bit) float from [31:0] of src1 vector:
1705- auto src_float = FExtractV32 (FReadV32 (src1 ), 0 );
1706+ // Extract a "single-precision" (32-bit) float from [31:0] of src2 vector:
1707+ auto src_float = FExtractV32 (FReadV32 (src2 ), 0 );
17061708
1707- // Store the square root result in dest[32:0]:
1709+ // Initialize dest vector, while also copying src1[127:32] -> dst[127:32].
1710+ auto temp_vec = FReadV32 (src1);
1711+
1712+ // Store the reciprocal square root (1 / sqrt) result in dest[31:0]:
17081713 auto square_root = SquareRoot32 (memory, state, src_float);
1709- auto temp_vec = FReadV32 (dst); // initialize a destination vector
17101714 temp_vec = FInsertV32 (temp_vec, 0 , FDiv (1 .0f , square_root));
17111715
17121716 // Write out the result and return memory state:
@@ -1753,8 +1757,8 @@ DEF_SEM(VRSQRTSS, D dst, S1 src1, S2 src2) {
17531757#endif // HAS_FEATURE_AVX
17541758} // namespace
17551759
1756- DEF_ISEL (SQRTSS_XMMss_MEMss) = SQRTSS<MV32>;
1757- DEF_ISEL (SQRTSS_XMMss_XMMss) = SQRTSS<V128>;
1760+ DEF_ISEL (SQRTSS_XMMss_MEMss) = SQRTSS<V128W, V128, MV32>;
1761+ DEF_ISEL (SQRTSS_XMMss_XMMss) = SQRTSS<V128W, V128, V128>;
17581762IF_AVX (DEF_ISEL(VSQRTSS_XMMdq_XMMdq_MEMd) = VSQRTSS<VV128W, V128, MV32>;)
17591763IF_AVX (DEF_ISEL(VSQRTSS_XMMdq_XMMdq_XMMd) = VSQRTSS<VV128W, V128, V128>;)
17601764/*
@@ -1763,8 +1767,8 @@ IF_AVX(DEF_ISEL(VSQRTSS_XMMdq_XMMdq_XMMd) = VSQRTSS<VV128W, V128, V128>;)
176317674318 VSQRTSS VSQRTSS_XMMf32_MASKmskw_XMMf32_MEMf32_AVX512 AVX512 AVX512EVEX AVX512F_SCALAR ATTRIBUTES: DISP8_SCALAR MASKOP_EVEX MEMORY_FAULT_SUPPRESSION MXCSR SIMD_SCALAR
17641768*/
17651769
1766- DEF_ISEL (RSQRTSS_XMMss_MEMss) = RSQRTSS<MV32>;
1767- DEF_ISEL (RSQRTSS_XMMss_XMMss) = RSQRTSS<V128>;
1770+ DEF_ISEL (RSQRTSS_XMMss_MEMss) = RSQRTSS<V128W, V128, MV32>;
1771+ DEF_ISEL (RSQRTSS_XMMss_XMMss) = RSQRTSS<V128W, V128, V128>;
17681772IF_AVX (DEF_ISEL(VRSQRTSS_XMMdq_XMMdq_MEMd) = VRSQRTSS<VV128W, V128, MV32>;)
17691773IF_AVX (DEF_ISEL(VRSQRTSS_XMMdq_XMMdq_XMMd) = VRSQRTSS<VV128W, V128, V128>;)
17701774
@@ -1801,15 +1805,17 @@ DEF_HELPER(SquareRoot64, float64_t src_float)->float64_t {
18011805 return square_root;
18021806}
18031807
1804- template <typename S1>
1805- DEF_SEM (SQRTSD, V128W dst, S1 src1) {
1808+ template <typename D, typename S1, typename S2>
1809+ DEF_SEM (SQRTSD, D dst, S1 src1, S2 src2) {
1810+
1811+ // Extract a "double-precision" (64-bit) float from [63:0] of src2 vector:
1812+ auto src_float = FExtractV64 (FReadV64 (src2), 0 );
18061813
1807- // Extract a "double-precision" (64-bit) float from [63:0] of src1 vector:
1808- auto src_float = FExtractV64 ( FReadV64 (src1), 0 );
1814+ // Initialize dest vector, while also copying src1[127:64] -> dst[127:64].
1815+ auto temp_vec = FReadV64 (src1);
18091816
18101817 // Store the square root result in dest[63:0]:
18111818 auto square_root = SquareRoot64 (memory, state, src_float);
1812- auto temp_vec = FReadV64 (dst); // initialize a destination vector
18131819 temp_vec = FInsertV64 (temp_vec, 0 , square_root);
18141820
18151821 // Write out the result and return memory state:
@@ -1839,8 +1845,8 @@ DEF_SEM(VSQRTSD, D dst, S1 src1, S2 src2) {
18391845
18401846} // namespace
18411847
1842- DEF_ISEL (SQRTSD_XMMsd_MEMsd) = SQRTSD<MV64>;
1843- DEF_ISEL (SQRTSD_XMMsd_XMMsd) = SQRTSD<V128>;
1848+ DEF_ISEL (SQRTSD_XMMsd_MEMsd) = SQRTSD<V128W, V128, MV64>;
1849+ DEF_ISEL (SQRTSD_XMMsd_XMMsd) = SQRTSD<V128W, V128, V128>;
18441850IF_AVX (DEF_ISEL(VSQRTSD_XMMdq_XMMdq_MEMq) = VSQRTSD<VV128W, V128, MV64>;)
18451851IF_AVX (DEF_ISEL(VSQRTSD_XMMdq_XMMdq_XMMq) = VSQRTSD<VV128W, V128, V128>;)
18461852/*
0 commit comments