ZJIT: Replace fragile Mul+RShift+JoMul pattern with speculative smulh

tekknolagi · tekknolagi · commit b4a2de378e70 · 2026-03-11T01:49:27.000-07:00
Instead of scratch_split inserting an RShift between Mul and JoMul (which broke when a spill Store disrupted the pattern), make each instruction emit independently:

- Mul always speculatively emits smulh into X16 before mul
- JoMul carries the Mul output operand and emits a barrel-shifted cmp (CMP X16, val, ASR #62) to check overflow in one instruction
- No cross-pass coordination, no pattern matching, no synthetic RShift

Also adds cmp_shifted (CMP with ASR) to the ARM64 assembler.
diff --git a/zjit/src/asm/arm64/inst/data_reg.rs b/zjit/src/asm/arm64/inst/data_reg.rs
@@ -92,6 +92,26 @@ impl DataReg {
         Self::subs(31, rn, rm, num_bits)
     }
 
+    /// CMP (shifted register) with explicit shift
+    /// Encodes: CMP <Xn>, <Xm>, <shift> #<amount>
+    pub fn cmp_shifted(rn: u8, rm: u8, shift: u8, amount: u8, num_bits: u8) -> Self {
+        Self {
+            rd: 31,
+            rn,
+            imm6: amount,
+            rm,
+            shift: match shift {
+                0b00 => Shift::LSL,
+                0b01 => Shift::LSR,
+                0b10 => Shift::ASR,
+                _ => panic!("Invalid shift type"),
+            },
+            s: S::UpdateFlags,
+            op: Op::Sub,
+            sf: num_bits.into()
+        }
+    }
+
     /// SUB (shifted register)
     /// <https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/SUB--shifted-register---Subtract--shifted-register--?lang=en>
     pub fn sub(rd: u8, rn: u8, rm: u8, num_bits: u8) -> Self {
diff --git a/zjit/src/asm/arm64/mod.rs b/zjit/src/asm/arm64/mod.rs
@@ -344,6 +344,20 @@ pub fn cmp(cb: &mut CodeBlock, rn: A64Opnd, rm: A64Opnd) {
     cb.write_bytes(&bytes);
 }
 
+/// CMP (shifted register) - compare with shifted second operand
+/// shift: 0b00=LSL, 0b01=LSR, 0b10=ASR
+pub fn cmp_shifted(cb: &mut CodeBlock, rn: A64Opnd, rm: A64Opnd, shift: u8, amount: u8) {
+    let bytes: [u8; 4] = match (rn, rm) {
+        (A64Opnd::Reg(rn), A64Opnd::Reg(rm)) => {
+            assert!(rn.num_bits == rm.num_bits, "All operands must be of the same size.");
+            DataReg::cmp_shifted(rn.reg_no, rm.reg_no, shift, amount, rn.num_bits).into()
+        },
+        _ => panic!("Invalid operand combination to cmp_shifted instruction."),
+    };
+
+    cb.write_bytes(&bytes);
+}
+
 /// CSEL - conditionally select between two registers
 pub fn csel(cb: &mut CodeBlock, rd: A64Opnd, rn: A64Opnd, rm: A64Opnd, cond: u8) {
     let bytes: [u8; 4] = match (rd, rn, rm) {
diff --git a/zjit/src/backend/arm64/mod.rs b/zjit/src/backend/arm64/mod.rs
@@ -797,33 +797,12 @@ impl Assembler {
                     *left = split_memory_read(asm, *left, SCRATCH0_OPND);
                     *right = split_memory_read(asm, *right, SCRATCH1_OPND);
                     let mem_out = split_memory_write(out, SCRATCH0_OPND);
-                    let reg_out = out.clone();
-
-                    let has_jo_mul = idx + 1 < linearized_insns.len() && matches!(linearized_insns[idx + 1], Insn::JoMul(_));
 
                     asm.push_insn(insn);
 
-                    // When JoMul follows, the emit pass needs Mul → RShift → JoMul
-                    // to be contiguous so it can pair smulh+mul+asr+cmp. The spill
-                    // Store must NOT be between Mul and RShift. Instead, we record
-                    // the spill destination in the RShift and have the emit pass
-                    // emit the store between mul and asr (before asr clobbers the
-                    // mul output register).
-                    if has_jo_mul {
-                        // Emit RShift immediately after Mul (before any Store)
-                        asm.push_insn(Insn::RShift { out: SCRATCH0_OPND, opnd: reg_out, shift: Opnd::UImm(63) });
-                        // Emit spill Store after RShift. The emit pass will
-                        // skip it along with the RShift, and emit the spill
-                        // at the right point (between mul and asr).
-                        if let Some(mem_out) = mem_out {
-                            let mem_out = split_large_disp(asm, mem_out, SCRATCH1_OPND);
-                            asm.store(mem_out, reg_out);
-                        }
-                    } else {
-                        if let Some(mem_out) = mem_out {
-                            let mem_out = split_large_disp(asm, mem_out, SCRATCH1_OPND);
-                            asm.store(mem_out, SCRATCH0_OPND);
-                        }
+                    if let Some(mem_out) = mem_out {
+                        let mem_out = split_large_disp(asm, mem_out, SCRATCH1_OPND);
+                        asm.store(mem_out, SCRATCH0_OPND);
                     }
                 }
                 Insn::LShift { opnd, out, .. } |
@@ -928,6 +907,10 @@ impl Assembler {
                         }
                     }
                 }
+                Insn::JoMul(opnd, _) => {
+                    *opnd = split_memory_read(asm, *opnd, SCRATCH0_OPND);
+                    asm.push_insn(insn);
+                }
                 &mut Insn::PatchPoint { ref target, invariant, version } => {
                     split_patch_point(asm, target, invariant, version);
                 }
@@ -1252,49 +1235,12 @@ impl Assembler {
                     }
                 },
                 Insn::Mul { left, right, out } => {
-                    // Look for the RShift+JoMul overflow check sequence inserted
-                    // by arm64_scratch_split. When the Mul output is spilled,
-                    // scratch_split emits [Mul, RShift, Store, JoMul] with the
-                    // Store after the RShift. Without a spill, it's just
-                    // [Mul, RShift, JoMul].
-                    let rshift_insn = match (insns.get(insn_idx + 1), insns.get(insn_idx + 2), insns.get(insn_idx + 3)) {
-                        (Some(&Insn::RShift { out: out_sign, opnd: out_opnd, shift: out_shift }), Some(&Insn::Store { dest: spill_dest, src: spill_src }), Some(Insn::JoMul(_))) => {
-                            Some((out_sign, out_opnd, out_shift, Some((spill_dest, spill_src))))
-                        }
-                        (Some(&Insn::RShift { out: out_sign, opnd: out_opnd, shift: out_shift }), Some(Insn::JoMul(_)), _) => {
-                            Some((out_sign, out_opnd, out_shift, None))
-                        }
-                        _ => None,
-                    };
-
-                    if let Some((out_sign, out_opnd, out_shift, spill)) = rshift_insn {
-                        // Compute the high 64 bits into EMIT_OPND (X16)
-                        smulh(cb, Self::EMIT_OPND, left.into(), right.into());
-
-                        // Compute the low 64 bits into `out` (may clobber inputs,
-                        // so this must come after smulh)
-                        mul(cb, out.into(), left.into(), right.into());
-
-                        // If the mul result was spilled, emit the store now
-                        // BEFORE asr clobbers the output register with the sign
-                        // bit. The spill source is always a register (SCRATCH0),
-                        // not EMIT_OPND (X16), so the smulh result is preserved.
-                        if let Some((spill_dest, spill_src)) = spill {
-                            stur(cb, spill_src.into(), spill_dest.into());
-                            insn_idx += 1; // will skip the Store insn
-                        }
-
-                        // Shift to extract the sign bit of the 64-bit mul result
-                        asr(cb, out_sign.into(), out_opnd.into(), out_shift.into());
-                        insn_idx += 1; // skip the RShift
-
-                        // If the high 64-bits are not all zeros or all ones,
-                        // matching the sign bit, then we have an overflow
-                        cmp(cb, Self::EMIT_OPND, out_sign.into());
-                        // JoMul will emit_conditional_jump::<{Condition::NE}>
-                    } else {
-                        mul(cb, out.into(), left.into(), right.into());
-                    }
+                    // Speculatively emit smulh into EMIT_OPND (X16) for a
+                    // potential following JoMul. If no JoMul follows, X16 is
+                    // simply overwritten later. Must come before mul since mul
+                    // may clobber an input register.
+                    smulh(cb, Self::EMIT_OPND, left.into(), right.into());
+                    mul(cb, out.into(), left.into(), right.into());
                 },
                 Insn::And { left, right, out } => {
                     and(cb, out.into(), left.into(), right.into());
@@ -1558,7 +1504,14 @@ impl Assembler {
                 Insn::Je(target) | Insn::Jz(target) => {
                     emit_conditional_jump::<{Condition::EQ}>(self, cb, target.clone());
                 },
-                Insn::Jne(target) | Insn::Jnz(target) | Insn::JoMul(target) => {
+                Insn::Jne(target) | Insn::Jnz(target) => {
+                    emit_conditional_jump::<{Condition::NE}>(self, cb, target.clone());
+                },
+                Insn::JoMul(val, target) => {
+                    // Compare smulh result (in EMIT_OPND/X16 from preceding Mul)
+                    // with the mul output sign-extended from bit 62. Uses the
+                    // barrel shifter built into CMP for a single instruction.
+                    cmp_shifted(cb, Self::EMIT_OPND, val.into(), 0b10, 62); // ASR #62
                     emit_conditional_jump::<{Condition::NE}>(self, cb, target.clone());
                 },
                 Insn::Jl(target) => {
@@ -1809,11 +1762,12 @@ mod tests {
         asm.compile_with_num_regs(&mut cb, 2);
 
         assert_disasm_snapshot!(cb.disasm(), @"
-            0x0: mov x0, #3
-            0x4: mul x0, x9, x0
-            0x8: mov x1, x0
+        0x0: mov x0, #3
+        0x4: smulh x16, x9, x0
+        0x8: mul x0, x9, x0
+        0xc: mov x1, x0
         ");
-        assert_snapshot!(cb.hexdump(), @"600080d2207d009be10300aa");
+        assert_snapshot!(cb.hexdump(), @"600080d2307d409b207d009be10300aa");
     }
 
     #[test]
diff --git a/zjit/src/backend/lir.rs b/zjit/src/backend/lir.rs
@@ -621,8 +621,9 @@ pub enum Insn {
     /// Jump if overflow
     Jo(Target),
 
-    /// Jump if overflow in multiplication
-    JoMul(Target),
+    /// Jump if overflow in multiplication.
+    /// The operand is the Mul output, used on ARM64 for the barrel-shifted compare.
+    JoMul(Opnd, Target),
 
     /// Jump if zero
     Jz(Target),
@@ -734,7 +735,7 @@ impl Insn {
             Insn::Jne(target) |
             Insn::Jnz(target) |
             Insn::Jo(target) |
-            Insn::JoMul(target) |
+            Insn::JoMul(_, target) |
             Insn::Jz(target) |
             Insn::Joz(_, target) |
             Insn::Jonz(_, target) |
@@ -786,7 +787,7 @@ impl Insn {
             Insn::Jne(_) => "Jne",
             Insn::Jnz(_) => "Jnz",
             Insn::Jo(_) => "Jo",
-            Insn::JoMul(_) => "JoMul",
+            Insn::JoMul(..) => "JoMul",
             Insn::Jz(_) => "Jz",
             Insn::Joz(..) => "Joz",
             Insn::Jonz(..) => "Jonz",
@@ -894,7 +895,7 @@ impl Insn {
             Insn::Jne(target) |
             Insn::Jnz(target) |
             Insn::Jo(target) |
-            Insn::JoMul(target) |
+            Insn::JoMul(_, target) |
             Insn::Jz(target) |
             Insn::Joz(_, target) |
             Insn::Jonz(_, target) |
@@ -928,7 +929,7 @@ impl Insn {
             Insn::Jne(_) |
             Insn::Jnz(_) |
             Insn::Jo(_) |
-            Insn::JoMul(_) |
+            Insn::JoMul(..) |
             Insn::Jz(_) |
             Insn::Joz(..) |
             Insn::Jonz(..) |
@@ -966,7 +967,7 @@ impl<'a> Iterator for InsnOpndIterator<'a> {
             Insn::Jne(target) |
             Insn::Jnz(target) |
             Insn::Jo(target) |
-            Insn::JoMul(target) |
+            Insn::JoMul(_, target) |
             Insn::Jz(target) |
             Insn::Label(target) |
             Insn::LeaJumpTarget { target, .. } |
@@ -1158,7 +1159,6 @@ impl<'a> InsnOpndMutIterator<'a> {
             Insn::Jne(target) |
             Insn::Jnz(target) |
             Insn::Jo(target) |
-            Insn::JoMul(target) |
             Insn::Jz(target) |
             Insn::Label(target) |
             Insn::LeaJumpTarget { target, .. } |
@@ -1192,6 +1192,7 @@ impl<'a> InsnOpndMutIterator<'a> {
                 }
             }
 
+            Insn::JoMul(opnd, target) |
             Insn::Joz(opnd, target) |
             Insn::Jonz(opnd, target) => {
                 if self.idx == 0 {
@@ -1798,7 +1799,7 @@ impl Assembler
             Insn::Jbe(Target::Block(edge)) => Insn::Jbe(Target::Label(process_edge(edge))),
             Insn::Jb(Target::Block(edge)) => Insn::Jb(Target::Label(process_edge(edge))),
             Insn::Jo(Target::Block(edge)) => Insn::Jo(Target::Label(process_edge(edge))),
-            Insn::JoMul(Target::Block(edge)) => Insn::JoMul(Target::Label(process_edge(edge))),
+            Insn::JoMul(opnd, Target::Block(edge)) => Insn::JoMul(*opnd, Target::Label(process_edge(edge))),
             Insn::Joz(opnd, Target::Block(edge)) => Insn::Joz(*opnd, Target::Label(process_edge(edge))),
             Insn::Jonz(opnd, Target::Block(edge)) => Insn::Jonz(*opnd, Target::Label(process_edge(edge))),
             _ => insn.clone()
@@ -2452,6 +2453,7 @@ impl fmt::Display for Assembler {
                         // If the instruction has a SideExit, avoid using opnd_iter(), which has stack/locals.
                         // Here, only handle instructions that have both Opnd and Target.
                         match insn {
+                            Insn::JoMul(opnd, _) |
                             Insn::Joz(opnd, _) |
                             Insn::Jonz(opnd, _) |
                             Insn::LeaJumpTarget { out: opnd, target: _ } => {
@@ -2463,6 +2465,7 @@ impl fmt::Display for Assembler {
                         // If the instruction has a Block target, avoid using opnd_iter() for branch args
                         // since they're already printed inline with the target. Only print non-target operands.
                         match insn {
+                            Insn::JoMul(opnd, _) |
                             Insn::Joz(opnd, _) |
                             Insn::Jonz(opnd, _) |
                             Insn::LeaJumpTarget { out: opnd, target: _ } => {
@@ -2787,8 +2790,8 @@ impl Assembler {
         self.push_insn(Insn::Jo(target));
     }
 
-    pub fn jo_mul(&mut self, target: Target) {
-        self.push_insn(Insn::JoMul(target));
+    pub fn jo_mul(&mut self, val: Opnd, target: Target) {
+        self.push_insn(Insn::JoMul(val, target));
     }
 
     pub fn jz(&mut self, target: Target) {
diff --git a/zjit/src/backend/x86_64/mod.rs b/zjit/src/backend/x86_64/mod.rs
@@ -1007,7 +1007,7 @@ impl Assembler {
                 }
 
                 Insn::Jo(target) |
-                Insn::JoMul(target) => {
+                Insn::JoMul(_, target) => {
                     match *target {
                         Target::CodePtr(code_ptr) => jo_ptr(cb, code_ptr),
                         Target::Label(label) => jo_label(cb, label),
diff --git a/zjit/src/codegen.rs b/zjit/src/codegen.rs
@@ -1984,8 +1984,8 @@ fn gen_fixnum_mult(jit: &mut JITState, asm: &mut Assembler, left: lir::Opnd, rig
     let right_untag = asm.sub(right, Opnd::UImm(1));
     let out_val = asm.mul(left_untag, right_untag);
 
-    // Test for overflow
-    asm.jo_mul(side_exit(jit, state, FixnumMultOverflow));
+    // Test for overflow (on ARM64, JoMul uses out_val for barrel-shifted cmp)
+    asm.jo_mul(out_val, side_exit(jit, state, FixnumMultOverflow));
     asm.add(out_val, Opnd::UImm(1))
 }
 

| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
| | | `@@ -1007,7 +1007,7 @@ impl Assembler {` |
| `1007` | `1007` | `}` |
| `1008` | `1008` | |
| `1009` | `1009` | `Insn::Jo(target) \|` |
| `1010` | | `- Insn::JoMul(target) => {` |
| | `1010` | `+ Insn::JoMul(_, target) => {` |
| `1011` | `1011` | `match *target {` |
| `1012` | `1012` | `Target::CodePtr(code_ptr) => jo_ptr(cb, code_ptr),` |
| `1013` | `1013` | `Target::Label(label) => jo_label(cb, label),` |
| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
| | | `@@ -1984,8 +1984,8 @@ fn gen_fixnum_mult(jit: &mut JITState, asm: &mut Assembler, left: lir::Opnd, rig` |
| `1984` | `1984` | `let right_untag = asm.sub(right, Opnd::UImm(1));` |
| `1985` | `1985` | `let out_val = asm.mul(left_untag, right_untag);` |
| `1986` | `1986` | |
| `1987` | | `- // Test for overflow` |
| `1988` | | `- asm.jo_mul(side_exit(jit, state, FixnumMultOverflow));` |
| | `1987` | `+ // Test for overflow (on ARM64, JoMul uses out_val for barrel-shifted cmp)` |
| | `1988` | `+ asm.jo_mul(out_val, side_exit(jit, state, FixnumMultOverflow));` |
| `1989` | `1989` | `asm.add(out_val, Opnd::UImm(1))` |
| `1990` | `1990` | `}` |
| `1991` | `1991` | |