From bf9107cb10a1e260dff175a427fba7bc796b869f Mon Sep 17 00:00:00 2001
From: Ralf Anton Beier <ralf_beier@me.com>
Date: Thu, 21 May 2026 21:11:00 +0200
Subject: [PATCH] feat(riscv): Phase 2 i64 ops in the RV32 instruction selector
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Extends the RV32IMAC instruction selector with the harder i64 surface,
building on the Phase-1 typed-vstack register-pair representation.

Implemented (each lowering to an RV32IMAC sequence):
- I64Mul — low-64 product via mul + mulhu carry + 2 cross-term muls
- I64Shl / I64ShrS / I64ShrU — runtime-amount shifts with a data-dependent
  branch on shamt >= 32 (cross-word) vs. < 32 (within-word + carry)
- I64Rotl / I64Rotr — composed from a pair of cross-word shifts ORed together
- I64Clz / I64Ctz / I64Popcnt — base-ISA software sequences (no Zbb):
  clz/ctz branch on the hi/lo half and use an unrolled binary-search
  clz_word; ctz reuses clz via `x & -x`; popcnt is the SWAR mul-collapse
- I64LtS/LtU/LeS/LeU/GtS/GtU/GeS/GeU — hi-then-lo compare ladder (hi signed
  or unsigned per op, lo always unsigned), reduced to less-than + invert
- I64Extend8S / I64Extend16S / I64Extend32S — sub-word sign extension with
  sign propagation into the high word via srai 31

Deferred to Phase 3:
- I64DivS / I64DivU / I64RemS / I64RemU — RV32 has no 64-bit divide; these
  need a __divdi3-style software long-division routine. They fall through
  to the existing `Unsupported` arm — fail loudly, no silent miscompile.

Tests: 23 new shape-assertion tests (one+ per implemented op, both shift
cross-word cases, the clz hi-vs-lo branch, deferred-op Unsupported check).

Validation: cargo test (148 pass, was 125), clippy -D warnings clean,
cargo fmt --check clean.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 crates/synth-backend-riscv/src/selector.rs | 3011 +++++++++++++++-----
 1 file changed, 2312 insertions(+), 699 deletions(-)

diff --git a/crates/synth-backend-riscv/src/selector.rs b/crates/synth-backend-riscv/src/selector.rs
index 6512440..4a87e21 100644
--- a/crates/synth-backend-riscv/src/selector.rs
+++ b/crates/synth-backend-riscv/src/selector.rs
@@ -5,10 +5,9 @@
 //! flow (block / loop / if / br / br_if), and local variable access.
 //!
 //! Out of scope (see `select_simple` doc comments for the full list):
-//! - i64 multiply / divide / remainder / shifts / rotates / count-leading-or-
-//!   trailing-zeros / popcount / signed-and-unsigned compare ladders /
-//!   sign-extending sub-word loads (Phase 2 — needs runtime helpers and
-//!   shamt branching at the 32-bit boundary)
+//! - i64 divide / remainder (Phase 3 — needs a `__divdi3`-style software
+//!   long-division routine; RV32 has no 64-bit divide instruction)
+//! - sign-extending sub-word i64 loads (`i64.load8_s` etc.)
 //! - F32/F64 (RV32F/D — not yet wired)
 //! - br_table (lowered in B3 alongside jump tables)
 //! - Cross-function calls (need linker-resolvable Call ops + relocations)
@@ -417,16 +416,53 @@ impl Selector {
             I64Or => self.lower_i64_bitwise(op, |rd, rs1, rs2| RiscVOp::Or { rd, rs1, rs2 })?,
             I64Xor => self.lower_i64_bitwise(op, |rd, rs1, rs2| RiscVOp::Xor { rd, rs1, rs2 })?,
 
+            // ─── i64 multiply ───────────────────────────────────────────
+            I64Mul => self.lower_i64_mul(op)?,
+
+            // ─── i64 shifts / rotates ───────────────────────────────────
+            I64Shl => self.lower_i64_shift(op, ShiftKind::Shl)?,
+            I64ShrS => self.lower_i64_shift(op, ShiftKind::ShrS)?,
+            I64ShrU => self.lower_i64_shift(op, ShiftKind::ShrU)?,
+            I64Rotl => self.lower_i64_rotate(op, true)?,
+            I64Rotr => self.lower_i64_rotate(op, false)?,
+
+            // ─── i64 bit-counting ───────────────────────────────────────
+            I64Clz => self.lower_i64_clz(op)?,
+            I64Ctz => self.lower_i64_ctz(op)?,
+            I64Popcnt => self.lower_i64_popcnt(op)?,
+
             // ─── i64 comparisons (result is an i32 0/1) ─────────────────
             I64Eq => self.lower_i64_eq(op, false)?,
             I64Ne => self.lower_i64_eq(op, true)?,
             I64Eqz => self.lower_i64_eqz(op)?,
+            I64LtS => self.lower_i64_cmp(op, CmpKind::Lt, true)?,
+            I64LtU => self.lower_i64_cmp(op, CmpKind::Lt, false)?,
+            I64LeS => self.lower_i64_cmp(op, CmpKind::Le, true)?,
+            I64LeU => self.lower_i64_cmp(op, CmpKind::Le, false)?,
+            I64GtS => self.lower_i64_cmp(op, CmpKind::Gt, true)?,
+            I64GtU => self.lower_i64_cmp(op, CmpKind::Gt, false)?,
+            I64GeS => self.lower_i64_cmp(op, CmpKind::Ge, true)?,
+            I64GeU => self.lower_i64_cmp(op, CmpKind::Ge, false)?,
 
             // ─── i64 / i32 conversions ──────────────────────────────────
             I64ExtendI32U => self.lower_i64_extend_i32_u(op)?,
             I64ExtendI32S => self.lower_i64_extend_i32_s(op)?,
             I32WrapI64 => self.lower_i32_wrap_i64(op)?,
 
+            // ─── i64 in-place sign extension ────────────────────────────
+            I64Extend8S => self.lower_i64_extend_sub(op, 8)?,
+            I64Extend16S => self.lower_i64_extend_sub(op, 16)?,
+            I64Extend32S => self.lower_i64_extend_sub(op, 32)?,
+
+            // i64 division / remainder is deferred to Phase 3 — RV32 has no
+            // 64-bit divide instruction, so these need a `__divdi3`-style
+            // software long-division routine (multi-block loop). Implementing
+            // it inline would balloon the selector; emitting it as a runtime
+            // helper needs a call-out the selector doesn't yet support.
+            // Until then they fall through to the `Unsupported` arm below —
+            // fail loudly rather than silently miscompile.
+            // I64DivS / I64DivU / I64RemS / I64RemU — Phase 3.
+
             // ─── i64 memory ─────────────────────────────────────────────
             I64Load { offset, align: _ } => self.lower_i64_load(op, *offset)?,
             I64Store { offset, align: _ } => self.lower_i64_store(op, *offset)?,
@@ -1499,128 +1535,1165 @@ impl Selector {
         });
         Ok(())
     }
-}
-
-#[derive(Debug, Clone, Copy)]
-enum LoadKind {
-    I8S,
-    I8U,
-    I16S,
-    I16U,
-}
 
-#[derive(Debug, Clone, Copy)]
-enum StoreKind {
-    Word,
-    Half,
-    Byte,
-}
+    // ────────── i64 lowerings (Phase 2) ──────────
+    //
+    // Phase 2 adds the harder i64 surface: multiply, the cross-word shifts
+    // and rotates, the bit-counting trio, the signed/unsigned compare
+    // ladders, and the in-place sub-word sign extensions. Division and
+    // remainder are deferred to Phase 3 (need a __divdi3-style runtime).
 
-fn offset_to_imm(offset: u32) -> Result<i32, SelectorError> {
-    if offset > 2047 {
-        // RV32 imm12 only supports ±2 KiB. Larger wasm offsets need an
-        // extra `addi tmp, tmp, hi` step, which we'll add when a real wasm
-        // module hits this. The skeleton fails loudly so we don't silently
-        // truncate.
-        return Err(SelectorError::ImmediateTooLarge {
-            value: offset as i64,
-            context: "memory offset",
+    /// 64×64 → low-64 multiply.
+    ///
+    /// A full 128-bit product isn't needed — wasm `i64.mul` keeps only the
+    /// low 64 bits. Writing `a = a_hi·2^32 + a_lo`, `b = b_hi·2^32 + b_lo`:
+    /// ```text
+    ///   a·b = a_hi·b_hi·2^64            (drops — above bit 63)
+    ///       + (a_hi·b_lo + a_lo·b_hi)·2^32
+    ///       + a_lo·b_lo
+    /// ```
+    /// So the low half is just `mul(a_lo, b_lo)` and the high half is the
+    /// carry out of that product (`mulhu(a_lo, b_lo)` — the *unsigned* upper
+    /// 32 bits, since the halves are treated as unsigned limbs) plus the two
+    /// cross terms `mul(a_lo, b_hi)` and `mul(a_hi, b_lo)`. The `a_hi·b_hi`
+    /// term lands entirely above bit 63 and is discarded.
+    fn lower_i64_mul(&mut self, op: &WasmOp) -> Result<(), SelectorError> {
+        let ((al, ah), (bl, bh)) = self.pop_pair_i64(op)?;
+        let lo = self.alloc_temp();
+        let hi_carry = self.alloc_temp();
+        let cross1 = self.alloc_temp();
+        let cross2 = self.alloc_temp();
+        let hi_partial = self.alloc_temp();
+        let hi = self.alloc_temp();
+        // lo = a_lo * b_lo  (low 32 bits of the limb product)
+        self.out.push(RiscVOp::Mul {
+            rd: lo,
+            rs1: al,
+            rs2: bl,
+        });
+        // hi_carry = mulhu(a_lo, b_lo)  — upper 32 bits of the same product,
+        // i.e. the carry from the low limb into the high limb.
+        self.out.push(RiscVOp::Mulhu {
+            rd: hi_carry,
+            rs1: al,
+            rs2: bl,
+        });
+        // cross1 = a_lo * b_hi  (only the low 32 bits matter at the 2^32 place)
+        self.out.push(RiscVOp::Mul {
+            rd: cross1,
+            rs1: al,
+            rs2: bh,
+        });
+        // cross2 = a_hi * b_lo
+        self.out.push(RiscVOp::Mul {
+            rd: cross2,
+            rs1: ah,
+            rs2: bl,
         });
+        // hi = hi_carry + cross1 + cross2
+        self.out.push(RiscVOp::Add {
+            rd: hi_partial,
+            rs1: hi_carry,
+            rs2: cross1,
+        });
+        self.out.push(RiscVOp::Add {
+            rd: hi,
+            rs1: hi_partial,
+            rs2: cross2,
+        });
+        self.push_i64(lo, hi);
+        Ok(())
     }
-    Ok(offset as i32)
-}
 
-/// Materialize a 32-bit immediate into `rd` using `lui + addi` when needed.
-fn emit_load_imm(out: &mut Vec<RiscVOp>, rd: Reg, value: i32) {
-    if (-2048..=2047).contains(&value) {
-        out.push(RiscVOp::Addi {
-            rd,
-            rs1: Reg::ZERO,
-            imm: value,
+    /// 64-bit shift by a runtime amount (shl / shr_s / shr_u).
+    ///
+    /// The wasm shift amount is itself an i64 on the stack, but only the low
+    /// 6 bits matter (`shamt mod 64`). RV32's register shifts (`sll`/`srl`/
+    /// `sra`) already mask the amount to its low 5 bits, so a within-word
+    /// shift "for free" computes `x << (shamt & 31)`. The hard part is the
+    /// carry between halves and the `shamt >= 32` case.
+    ///
+    /// We emit a data-dependent branch on bit 5 of the shift amount:
+    ///
+    /// **Left shift, `shamt < 32`:**
+    /// ```text
+    ///   lo' = lo << s
+    ///   hi' = (hi << s) | (lo >> (32 - s))     ; bits carried up from lo
+    /// ```
+    /// The `lo >> (32 - s)` term needs care when `s == 0`: `32 - 0 == 32`
+    /// which RV masks back to `0`, so `lo >> 0 == lo` would wrongly OR the
+    /// whole of lo into hi. We guard that by computing the carry as
+    /// `(lo >> 1) >> (31 - s)` — a two-step shift that is well-defined for
+    /// every `s` in `0..=31` and yields 0 when `s == 0`.
+    ///
+    /// **Left shift, `shamt >= 32`:** all of lo moves into hi.
+    /// ```text
+    ///   hi' = lo << (s - 32)        ; s-32 in 0..=31, RV masks it anyway
+    ///   lo' = 0
+    /// ```
+    /// Right shifts are the mirror image (carry goes downward; for shr_s the
+    /// fill is the sign bit, materialised once via `sra hi, 31`).
+    fn lower_i64_shift(&mut self, op: &WasmOp, kind: ShiftKind) -> Result<(), SelectorError> {
+        // rhs is the i64 shift amount; only its low limb is relevant.
+        let ((vlo, vhi), (samt, _samt_hi)) = self.pop_pair_i64(op)?;
+        // s = shamt & 63 — clamp to the 0..=63 window wasm guarantees.
+        let s = self.alloc_temp();
+        self.out.push(RiscVOp::Andi {
+            rd: s,
+            rs1: samt,
+            imm: 63,
         });
-        return;
-    }
-    let value_u = value as u32;
-    let lo12 = (value_u & 0xFFF) as i32;
-    let lo12_signed = if lo12 >= 0x800 { lo12 - 0x1000 } else { lo12 };
-    let hi20 = (value_u.wrapping_sub(lo12_signed as u32)) >> 12;
-    out.push(RiscVOp::Lui {
-        rd,
-        imm20: hi20 & 0xFFFFF,
-    });
-    if lo12_signed != 0 {
-        out.push(RiscVOp::Addi {
-            rd,
-            rs1: rd,
-            imm: lo12_signed,
+        // s_lo = s & 31 — the within-word part (also what RV's shift uses).
+        let s_lo = self.alloc_temp();
+        self.out.push(RiscVOp::Andi {
+            rd: s_lo,
+            rs1: s,
+            imm: 31,
+        });
+        // Result registers — written by both arms of the branch, so they
+        // must be allocated once up front (a fresh temp per arm would leave
+        // the join point reading an undefined register).
+        let res_lo = self.alloc_temp();
+        let res_hi = self.alloc_temp();
+
+        let big_label = self.fresh_label("Lshift_big");
+        let done_label = self.fresh_label("Lshift_done");
+        // andi test, s, 32 ; bne test, zero, Lshift_big → take the cross-word
+        // path when bit 5 of the shift amount is set (shamt >= 32).
+        let test = self.alloc_temp();
+        self.out.push(RiscVOp::Andi {
+            rd: test,
+            rs1: s,
+            imm: 32,
+        });
+        self.out.push(RiscVOp::Branch {
+            cond: Branch::Ne,
+            rs1: test,
+            rs2: Reg::ZERO,
+            label: big_label.clone(),
         });
-    }
-}
 
-#[cfg(test)]
-mod tests {
-    use super::*;
+        // ── small case: shamt < 32 ───────────────────────────────────────
+        // `inv = 31 - s_lo` is the complementary shift used to extract the
+        // bits crossing the half boundary. Built once, reused below.
+        let inv = self.alloc_temp();
+        self.out.push(RiscVOp::Addi {
+            rd: inv,
+            rs1: Reg::ZERO,
+            imm: 31,
+        });
+        self.out.push(RiscVOp::Sub {
+            rd: inv,
+            rs1: inv,
+            rs2: s_lo,
+        });
+        match kind {
+            ShiftKind::Shl => {
+                // lo' = lo << s
+                self.out.push(RiscVOp::Sll {
+                    rd: res_lo,
+                    rs1: vlo,
+                    rs2: s_lo,
+                });
+                // carry = (lo >> 1) >> (31 - s)  — well-defined for s==0..31
+                let carry = self.alloc_temp();
+                self.out.push(RiscVOp::Srli {
+                    rd: carry,
+                    rs1: vlo,
+                    shamt: 1,
+                });
+                self.out.push(RiscVOp::Srl {
+                    rd: carry,
+                    rs1: carry,
+                    rs2: inv,
+                });
+                // hi' = (hi << s) | carry
+                let hi_shifted = self.alloc_temp();
+                self.out.push(RiscVOp::Sll {
+                    rd: hi_shifted,
+                    rs1: vhi,
+                    rs2: s_lo,
+                });
+                self.out.push(RiscVOp::Or {
+                    rd: res_hi,
+                    rs1: hi_shifted,
+                    rs2: carry,
+                });
+            }
+            ShiftKind::ShrS | ShiftKind::ShrU => {
+                // hi' = hi >> s  (sra for shr_s, srl for shr_u)
+                self.out.push(if kind == ShiftKind::ShrS {
+                    RiscVOp::Sra {
+                        rd: res_hi,
+                        rs1: vhi,
+                        rs2: s_lo,
+                    }
+                } else {
+                    RiscVOp::Srl {
+                        rd: res_hi,
+                        rs1: vhi,
+                        rs2: s_lo,
+                    }
+                });
+                // carry = (hi << 1) << (31 - s)  — bits dropping down from hi
+                let carry = self.alloc_temp();
+                self.out.push(RiscVOp::Slli {
+                    rd: carry,
+                    rs1: vhi,
+                    shamt: 1,
+                });
+                self.out.push(RiscVOp::Sll {
+                    rd: carry,
+                    rs1: carry,
+                    rs2: inv,
+                });
+                // lo' = (lo >> s) | carry  — lo is always logical-shifted.
+                let lo_shifted = self.alloc_temp();
+                self.out.push(RiscVOp::Srl {
+                    rd: lo_shifted,
+                    rs1: vlo,
+                    rs2: s_lo,
+                });
+                self.out.push(RiscVOp::Or {
+                    rd: res_lo,
+                    rs1: lo_shifted,
+                    rs2: carry,
+                });
+            }
+        }
+        self.out.push(RiscVOp::Jal {
+            rd: Reg::ZERO,
+            label: done_label.clone(),
+        });
 
-    fn s(ops: &[WasmOp], num_params: u32) -> Vec<RiscVOp> {
-        select(ops, num_params).unwrap().ops
+        // ── big case: shamt >= 32 ────────────────────────────────────────
+        // The whole of one half moves into the other; `s_lo` (= s & 31) is
+        // exactly `s - 32` because bit 5 is set, so it doubles as the
+        // residual shift amount within the surviving half.
+        self.out.push(RiscVOp::Label { name: big_label });
+        match kind {
+            ShiftKind::Shl => {
+                // hi' = lo << (s - 32) ; lo' = 0
+                self.out.push(RiscVOp::Sll {
+                    rd: res_hi,
+                    rs1: vlo,
+                    rs2: s_lo,
+                });
+                self.out.push(RiscVOp::Addi {
+                    rd: res_lo,
+                    rs1: Reg::ZERO,
+                    imm: 0,
+                });
+            }
+            ShiftKind::ShrU => {
+                // lo' = hi >> (s - 32) ; hi' = 0
+                self.out.push(RiscVOp::Srl {
+                    rd: res_lo,
+                    rs1: vhi,
+                    rs2: s_lo,
+                });
+                self.out.push(RiscVOp::Addi {
+                    rd: res_hi,
+                    rs1: Reg::ZERO,
+                    imm: 0,
+                });
+            }
+            ShiftKind::ShrS => {
+                // lo' = hi >>s (s - 32) ; hi' = sign(hi) = hi >>s 31
+                self.out.push(RiscVOp::Sra {
+                    rd: res_lo,
+                    rs1: vhi,
+                    rs2: s_lo,
+                });
+                self.out.push(RiscVOp::Srai {
+                    rd: res_hi,
+                    rs1: vhi,
+                    shamt: 31,
+                });
+            }
+        }
+        self.out.push(RiscVOp::Label { name: done_label });
+        self.push_i64(res_lo, res_hi);
+        Ok(())
     }
 
-    fn count<F: Fn(&RiscVOp) -> bool>(out: &[RiscVOp], f: F) -> usize {
-        out.iter().filter(|o| f(o)).count()
+    /// 64-bit rotate (rotl / rotr) by a runtime amount.
+    ///
+    /// Composed from the two shifts: `rotl(x, n) = (x << n) | (x >> (64-n))`
+    /// and `rotr(x, n) = (x >> n) | (x << (64-n))`. The shift helper already
+    /// handles the cross-word logic, so the rotate just feeds it the right
+    /// amounts and ORs the two i64 results half-by-half.
+    ///
+    /// `n` is masked to `0..=63`; `64 - (n mod 64)` is computed as
+    /// `(64 - n) mod 64` so that `n == 0` maps to a `0` counter-shift rather
+    /// than a 64-bit shift (which the shift helper would treat as `shamt &
+    /// 63 == 0` anyway — but computing it explicitly keeps the intent clear).
+    fn lower_i64_rotate(&mut self, op: &WasmOp, left: bool) -> Result<(), SelectorError> {
+        let ((vlo, vhi), (namt, _namt_hi)) = self.pop_i64_then_amount(op)?;
+        // n = amount & 63
+        let n = self.alloc_temp();
+        self.out.push(RiscVOp::Andi {
+            rd: n,
+            rs1: namt,
+            imm: 63,
+        });
+        // comp = (64 - n) & 63  — the complementary rotate distance.
+        let comp = self.alloc_temp();
+        self.out.push(RiscVOp::Addi {
+            rd: comp,
+            rs1: Reg::ZERO,
+            imm: 64,
+        });
+        self.out.push(RiscVOp::Sub {
+            rd: comp,
+            rs1: comp,
+            rs2: n,
+        });
+        self.out.push(RiscVOp::Andi {
+            rd: comp,
+            rs1: comp,
+            imm: 63,
+        });
+        // For rotl: part_a = x << n, part_b = x >> comp.
+        // For rotr: part_a = x >> n, part_b = x << comp.
+        let (amt_a, amt_b) = (n, comp);
+        let (a_lo, a_hi) = if left {
+            self.emit_i64_shift_inline(vlo, vhi, amt_a, ShiftKind::Shl)
+        } else {
+            self.emit_i64_shift_inline(vlo, vhi, amt_a, ShiftKind::ShrU)
+        };
+        let (b_lo, b_hi) = if left {
+            self.emit_i64_shift_inline(vlo, vhi, amt_b, ShiftKind::ShrU)
+        } else {
+            self.emit_i64_shift_inline(vlo, vhi, amt_b, ShiftKind::Shl)
+        };
+        // result = part_a | part_b, half by half.
+        let res_lo = self.alloc_temp();
+        let res_hi = self.alloc_temp();
+        self.out.push(RiscVOp::Or {
+            rd: res_lo,
+            rs1: a_lo,
+            rs2: b_lo,
+        });
+        self.out.push(RiscVOp::Or {
+            rd: res_hi,
+            rs1: a_hi,
+            rs2: b_hi,
+        });
+        self.push_i64(res_lo, res_hi);
+        Ok(())
     }
 
-    #[test]
-    fn add_two_params() {
-        let out = s(
-            &[
-                WasmOp::LocalGet(0),
-                WasmOp::LocalGet(1),
-                WasmOp::I32Add,
-                WasmOp::End,
-            ],
-            2,
-        );
-        assert!(count(&out, |op| matches!(op, RiscVOp::Add { .. })) == 1);
-        assert!(matches!(
-            out.last().unwrap(),
-            RiscVOp::Jalr {
-                rd: Reg::ZERO,
-                rs1: Reg::RA,
-                ..
-            }
-        ));
+    /// Pop an i64 value and an i64 shift/rotate amount, returning
+    /// `((v_lo, v_hi), (amt_lo, amt_hi))`. Identical to `pop_pair_i64` —
+    /// kept as a named helper so the rotate code reads naturally.
+    fn pop_i64_then_amount(&mut self, op: &WasmOp) -> Result<(I64Pair, I64Pair), SelectorError> {
+        self.pop_pair_i64(op)
     }
 
-    #[test]
-    fn div_emits_zero_trap() {
-        let out = s(
-            &[
-                WasmOp::LocalGet(0),
-                WasmOp::LocalGet(1),
-                WasmOp::I32DivS,
-                WasmOp::End,
-            ],
-            2,
-        );
-        // bne ... zero, Ldiv_ok ; ebreak ; Ldiv_ok: ; div ...
-        assert!(
-            count(&out, |op| matches!(
-                op,
-                RiscVOp::Branch {
-                    cond: Branch::Ne,
-                    ..
-                }
-            )) >= 1
-        );
-        assert!(count(&out, |op| matches!(op, RiscVOp::Ebreak)) >= 1);
-        assert!(count(&out, |op| matches!(op, RiscVOp::Div { .. })) >= 1);
+    /// Emit an i64 shift whose value and amount are already in registers
+    /// (rather than on the vstack), returning the `(lo, hi)` result pair.
+    /// This is the register-level core of `lower_i64_shift`, factored out so
+    /// the rotate lowering can issue two shifts without round-tripping
+    /// through the value stack. The cross-word branching logic is identical
+    /// to `lower_i64_shift` — see that method's doc comment for the math.
+    fn emit_i64_shift_inline(
+        &mut self,
+        vlo: Reg,
+        vhi: Reg,
+        amount: Reg,
+        kind: ShiftKind,
+    ) -> I64Pair {
+        let s = self.alloc_temp();
+        self.out.push(RiscVOp::Andi {
+            rd: s,
+            rs1: amount,
+            imm: 63,
+        });
+        let s_lo = self.alloc_temp();
+        self.out.push(RiscVOp::Andi {
+            rd: s_lo,
+            rs1: s,
+            imm: 31,
+        });
+        let res_lo = self.alloc_temp();
+        let res_hi = self.alloc_temp();
+        let big_label = self.fresh_label("Lshift_big");
+        let done_label = self.fresh_label("Lshift_done");
+        let test = self.alloc_temp();
+        self.out.push(RiscVOp::Andi {
+            rd: test,
+            rs1: s,
+            imm: 32,
+        });
+        self.out.push(RiscVOp::Branch {
+            cond: Branch::Ne,
+            rs1: test,
+            rs2: Reg::ZERO,
+            label: big_label.clone(),
+        });
+        // small case
+        let inv = self.alloc_temp();
+        self.out.push(RiscVOp::Addi {
+            rd: inv,
+            rs1: Reg::ZERO,
+            imm: 31,
+        });
+        self.out.push(RiscVOp::Sub {
+            rd: inv,
+            rs1: inv,
+            rs2: s_lo,
+        });
+        match kind {
+            ShiftKind::Shl => {
+                self.out.push(RiscVOp::Sll {
+                    rd: res_lo,
+                    rs1: vlo,
+                    rs2: s_lo,
+                });
+                let carry = self.alloc_temp();
+                self.out.push(RiscVOp::Srli {
+                    rd: carry,
+                    rs1: vlo,
+                    shamt: 1,
+                });
+                self.out.push(RiscVOp::Srl {
+                    rd: carry,
+                    rs1: carry,
+                    rs2: inv,
+                });
+                let hi_shifted = self.alloc_temp();
+                self.out.push(RiscVOp::Sll {
+                    rd: hi_shifted,
+                    rs1: vhi,
+                    rs2: s_lo,
+                });
+                self.out.push(RiscVOp::Or {
+                    rd: res_hi,
+                    rs1: hi_shifted,
+                    rs2: carry,
+                });
+            }
+            ShiftKind::ShrS | ShiftKind::ShrU => {
+                self.out.push(if kind == ShiftKind::ShrS {
+                    RiscVOp::Sra {
+                        rd: res_hi,
+                        rs1: vhi,
+                        rs2: s_lo,
+                    }
+                } else {
+                    RiscVOp::Srl {
+                        rd: res_hi,
+                        rs1: vhi,
+                        rs2: s_lo,
+                    }
+                });
+                let carry = self.alloc_temp();
+                self.out.push(RiscVOp::Slli {
+                    rd: carry,
+                    rs1: vhi,
+                    shamt: 1,
+                });
+                self.out.push(RiscVOp::Sll {
+                    rd: carry,
+                    rs1: carry,
+                    rs2: inv,
+                });
+                let lo_shifted = self.alloc_temp();
+                self.out.push(RiscVOp::Srl {
+                    rd: lo_shifted,
+                    rs1: vlo,
+                    rs2: s_lo,
+                });
+                self.out.push(RiscVOp::Or {
+                    rd: res_lo,
+                    rs1: lo_shifted,
+                    rs2: carry,
+                });
+            }
+        }
+        self.out.push(RiscVOp::Jal {
+            rd: Reg::ZERO,
+            label: done_label.clone(),
+        });
+        // big case
+        self.out.push(RiscVOp::Label { name: big_label });
+        match kind {
+            ShiftKind::Shl => {
+                self.out.push(RiscVOp::Sll {
+                    rd: res_hi,
+                    rs1: vlo,
+                    rs2: s_lo,
+                });
+                self.out.push(RiscVOp::Addi {
+                    rd: res_lo,
+                    rs1: Reg::ZERO,
+                    imm: 0,
+                });
+            }
+            ShiftKind::ShrU => {
+                self.out.push(RiscVOp::Srl {
+                    rd: res_lo,
+                    rs1: vhi,
+                    rs2: s_lo,
+                });
+                self.out.push(RiscVOp::Addi {
+                    rd: res_hi,
+                    rs1: Reg::ZERO,
+                    imm: 0,
+                });
+            }
+            ShiftKind::ShrS => {
+                self.out.push(RiscVOp::Sra {
+                    rd: res_lo,
+                    rs1: vhi,
+                    rs2: s_lo,
+                });
+                self.out.push(RiscVOp::Srai {
+                    rd: res_hi,
+                    rs1: vhi,
+                    shamt: 31,
+                });
+            }
+        }
+        self.out.push(RiscVOp::Label { name: done_label });
+        (res_lo, res_hi)
     }
 
-    #[test]
-    fn rem_unsigned_emits_remu() {
-        let out = s(
-            &[
-                WasmOp::LocalGet(0),
+    /// 64-bit count-leading-zeros → i32 result in `0..=64`.
+    ///
+    /// RV32IMAC has no `clz` (that's Zbb), so each 32-bit half uses a
+    /// software `clz_word` (an unrolled binary-search sequence — see
+    /// `emit_clz_word`). The 64-bit answer branches on whether the high
+    /// half is non-zero:
+    /// ```text
+    ///   if hi != 0 { clz = clz_word(hi) }
+    ///   else       { clz = 32 + clz_word(lo) }
+    /// ```
+    fn lower_i64_clz(&mut self, op: &WasmOp) -> Result<(), SelectorError> {
+        let (lo, hi) = self.pop_i64(op)?;
+        let result = self.alloc_temp();
+        let hi_zero = self.fresh_label("Lclz_hizero");
+        let done = self.fresh_label("Lclz_done");
+        // beq hi, zero, Lclz_hizero → all leading zeros are in (or past) lo.
+        self.out.push(RiscVOp::Branch {
+            cond: Branch::Eq,
+            rs1: hi,
+            rs2: Reg::ZERO,
+            label: hi_zero.clone(),
+        });
+        // hi != 0: the answer is entirely within the high word.
+        let hi_clz = self.emit_clz_word(hi);
+        self.out.push(RiscVOp::Addi {
+            rd: result,
+            rs1: hi_clz,
+            imm: 0,
+        });
+        self.out.push(RiscVOp::Jal {
+            rd: Reg::ZERO,
+            label: done.clone(),
+        });
+        // hi == 0: 32 leading zeros from the high word + clz of the low word.
+        self.out.push(RiscVOp::Label { name: hi_zero });
+        let lo_clz = self.emit_clz_word(lo);
+        self.out.push(RiscVOp::Addi {
+            rd: result,
+            rs1: lo_clz,
+            imm: 32,
+        });
+        self.out.push(RiscVOp::Label { name: done });
+        self.push_i32(result);
+        Ok(())
+    }
+
+    /// 64-bit count-trailing-zeros → i32 result in `0..=64`.
+    ///
+    /// Mirror of clz: branch on the *low* half.
+    /// ```text
+    ///   if lo != 0 { ctz = ctz_word(lo) }
+    ///   else       { ctz = 32 + ctz_word(hi) }
+    /// ```
+    fn lower_i64_ctz(&mut self, op: &WasmOp) -> Result<(), SelectorError> {
+        let (lo, hi) = self.pop_i64(op)?;
+        let result = self.alloc_temp();
+        let lo_zero = self.fresh_label("Lctz_lozero");
+        let done = self.fresh_label("Lctz_done");
+        self.out.push(RiscVOp::Branch {
+            cond: Branch::Eq,
+            rs1: lo,
+            rs2: Reg::ZERO,
+            label: lo_zero.clone(),
+        });
+        // lo != 0: answer is within the low word.
+        let lo_ctz = self.emit_ctz_word(lo);
+        self.out.push(RiscVOp::Addi {
+            rd: result,
+            rs1: lo_ctz,
+            imm: 0,
+        });
+        self.out.push(RiscVOp::Jal {
+            rd: Reg::ZERO,
+            label: done.clone(),
+        });
+        // lo == 0: 32 trailing zeros from the low word + ctz of the high word.
+        self.out.push(RiscVOp::Label { name: lo_zero });
+        let hi_ctz = self.emit_ctz_word(hi);
+        self.out.push(RiscVOp::Addi {
+            rd: result,
+            rs1: hi_ctz,
+            imm: 32,
+        });
+        self.out.push(RiscVOp::Label { name: done });
+        self.push_i32(result);
+        Ok(())
+    }
+
+    /// 64-bit population count → i32 result in `0..=64`.
+    ///
+    /// `popcnt(x) = popcnt(x_lo) + popcnt(x_hi)` — the two halves are
+    /// independent, no carry. Each `popcnt_word` is the classic SWAR
+    /// bit-twiddle (see `emit_popcnt_word`).
+    fn lower_i64_popcnt(&mut self, op: &WasmOp) -> Result<(), SelectorError> {
+        let (lo, hi) = self.pop_i64(op)?;
+        let lo_pc = self.emit_popcnt_word(lo);
+        let hi_pc = self.emit_popcnt_word(hi);
+        let result = self.alloc_temp();
+        self.out.push(RiscVOp::Add {
+            rd: result,
+            rs1: lo_pc,
+            rs2: hi_pc,
+        });
+        self.push_i32(result);
+        Ok(())
+    }
+
+    /// Software count-leading-zeros for a single 32-bit word, returning the
+    /// register holding the `0..=32` result.
+    ///
+    /// Uses the standard branchless binary-search reduction: at each step,
+    /// test whether the value's set bits all live in the lower `k` bits; if
+    /// so the top `k` bits were all zero — add `k` to the count and shift
+    /// the value left by `k` to bring the next window into view. Probing
+    /// k = 16, 8, 4, 2, 1 narrows to the exact leading-zero count. A final
+    /// correction handles the all-zero input (which would otherwise report
+    /// 31 instead of 32).
+    ///
+    /// The sequence is fully unrolled and branch-free, so it costs a fixed
+    /// number of instructions regardless of input — preferable to a loop in
+    /// a leaf compiler with no spill slots.
+    fn emit_clz_word(&mut self, src: Reg) -> Reg {
+        // x is the running value; n accumulates the count.
+        let x = self.alloc_temp();
+        let n = self.alloc_temp();
+        self.out.push(RiscVOp::Addi {
+            rd: x,
+            rs1: src,
+            imm: 0,
+        });
+        self.out.push(RiscVOp::Addi {
+            rd: n,
+            rs1: Reg::ZERO,
+            imm: 0,
+        });
+        // For each probe width k: if (x >> (32-k)) == 0 then the top k bits
+        // are zero — add k to n and shift x up by k. Implemented branchlessly
+        // via a 0/1 mask derived from sltiu.
+        for k in [16u8, 8, 4, 2, 1] {
+            let probe = self.alloc_temp();
+            // probe = x >> (32 - k)  — the top k bits, right-justified.
+            self.out.push(RiscVOp::Srli {
+                rd: probe,
+                rs1: x,
+                shamt: 32 - k,
+            });
+            // is_zero = (probe == 0) ? 1 : 0
+            let is_zero = self.alloc_temp();
+            self.out.push(RiscVOp::Sltiu {
+                rd: is_zero,
+                rs1: probe,
+                imm: 1,
+            });
+            // n += is_zero * k  → add k only when the window was all zeros.
+            let add = self.alloc_temp();
+            self.out.push(RiscVOp::Slli {
+                rd: add,
+                rs1: is_zero,
+                shamt: k.trailing_zeros() as u8,
+            });
+            self.out.push(RiscVOp::Add {
+                rd: n,
+                rs1: n,
+                rs2: add,
+            });
+            // shift = is_zero * k, then x <<= shift (no-op when window had a 1).
+            self.out.push(RiscVOp::Sll {
+                rd: x,
+                rs1: x,
+                rs2: add,
+            });
+        }
+        // After the five probes, n is 31 for an all-zero input (each window
+        // contributed) and the true count for anything else. Correct the
+        // off-by-one: if the original src was 0, the answer is 32.
+        let is_src_zero = self.alloc_temp();
+        self.out.push(RiscVOp::Sltiu {
+            rd: is_src_zero,
+            rs1: src,
+            imm: 1,
+        });
+        let result = self.alloc_temp();
+        self.out.push(RiscVOp::Add {
+            rd: result,
+            rs1: n,
+            rs2: is_src_zero,
+        });
+        result
+    }
+
+    /// Software count-trailing-zeros for a 32-bit word → register with the
+    /// `0..=32` result.
+    ///
+    /// Trick: `x & -x` isolates the lowest set bit, then `ctz(x) =
+    /// 31 - clz(x & -x)` for any non-zero `x`. The all-zero input is handled
+    /// separately because `x & -x == 0` would feed clz a zero (clz → 32,
+    /// giving `31 - 32 = -1`), so we add a correction that forces the answer
+    /// to 32 when `src == 0`.
+    fn emit_ctz_word(&mut self, src: Reg) -> Reg {
+        // neg = -src ; iso = src & neg  — isolates the lowest set bit.
+        let neg = self.alloc_temp();
+        self.out.push(RiscVOp::Sub {
+            rd: neg,
+            rs1: Reg::ZERO,
+            rs2: src,
+        });
+        let iso = self.alloc_temp();
+        self.out.push(RiscVOp::And {
+            rd: iso,
+            rs1: src,
+            rs2: neg,
+        });
+        // clz_iso = clz(iso). For non-zero src, ctz = 31 - clz_iso.
+        let clz_iso = self.emit_clz_word(iso);
+        let thirty_one = self.alloc_temp();
+        self.out.push(RiscVOp::Addi {
+            rd: thirty_one,
+            rs1: Reg::ZERO,
+            imm: 31,
+        });
+        let ctz = self.alloc_temp();
+        self.out.push(RiscVOp::Sub {
+            rd: ctz,
+            rs1: thirty_one,
+            rs2: clz_iso,
+        });
+        // src == 0 correction: `iso` is 0, clz_iso is 32, so ctz computed as
+        // -1. Add 33 in that case to land on 32 ( -1 + 33 = 32 ).
+        let is_src_zero = self.alloc_temp();
+        self.out.push(RiscVOp::Sltiu {
+            rd: is_src_zero,
+            rs1: src,
+            imm: 1,
+        });
+        // correction = is_src_zero * 33
+        let corr = self.alloc_temp();
+        self.out.push(RiscVOp::Slli {
+            rd: corr,
+            rs1: is_src_zero,
+            shamt: 5,
+        });
+        // corr is now is_src_zero*32; add is_src_zero once more for *33.
+        self.out.push(RiscVOp::Add {
+            rd: corr,
+            rs1: corr,
+            rs2: is_src_zero,
+        });
+        let result = self.alloc_temp();
+        self.out.push(RiscVOp::Add {
+            rd: result,
+            rs1: ctz,
+            rs2: corr,
+        });
+        result
+    }
+
+    /// Software population count for a 32-bit word → register with the
+    /// `0..=32` result.
+    ///
+    /// The classic SWAR (SIMD-within-a-register) bit-twiddle:
+    /// ```text
+    ///   x = x - ((x >> 1) & 0x55555555)               ; sum bits in pairs
+    ///   x = (x & 0x33333333) + ((x >> 2) & 0x33333333) ; sum pairs into nibbles
+    ///   x = (x + (x >> 4)) & 0x0F0F0F0F                ; sum nibbles into bytes
+    ///   x = (x * 0x01010101) >> 24                     ; sum bytes via mul
+    /// ```
+    /// The final `mul` collapses the four byte-sums into the top byte — this
+    /// is why the routine needs the M extension (which RV32IMAC has).
+    fn emit_popcnt_word(&mut self, src: Reg) -> Reg {
+        let m1 = self.alloc_temp(); // 0x55555555
+        let m2 = self.alloc_temp(); // 0x33333333
+        let m4 = self.alloc_temp(); // 0x0F0F0F0F
+        let m8 = self.alloc_temp(); // 0x01010101
+        emit_load_imm(&mut self.out, m1, 0x5555_5555u32 as i32);
+        emit_load_imm(&mut self.out, m2, 0x3333_3333u32 as i32);
+        emit_load_imm(&mut self.out, m4, 0x0F0F_0F0Fu32 as i32);
+        emit_load_imm(&mut self.out, m8, 0x0101_0101u32 as i32);
+
+        // step 1: x = x - ((x >> 1) & m1)
+        let x = self.alloc_temp();
+        let t = self.alloc_temp();
+        self.out.push(RiscVOp::Srli {
+            rd: t,
+            rs1: src,
+            shamt: 1,
+        });
+        self.out.push(RiscVOp::And {
+            rd: t,
+            rs1: t,
+            rs2: m1,
+        });
+        self.out.push(RiscVOp::Sub {
+            rd: x,
+            rs1: src,
+            rs2: t,
+        });
+        // step 2: x = (x & m2) + ((x >> 2) & m2)
+        let a = self.alloc_temp();
+        let b = self.alloc_temp();
+        self.out.push(RiscVOp::And {
+            rd: a,
+            rs1: x,
+            rs2: m2,
+        });
+        self.out.push(RiscVOp::Srli {
+            rd: b,
+            rs1: x,
+            shamt: 2,
+        });
+        self.out.push(RiscVOp::And {
+            rd: b,
+            rs1: b,
+            rs2: m2,
+        });
+        self.out.push(RiscVOp::Add {
+            rd: x,
+            rs1: a,
+            rs2: b,
+        });
+        // step 3: x = (x + (x >> 4)) & m4
+        self.out.push(RiscVOp::Srli {
+            rd: t,
+            rs1: x,
+            shamt: 4,
+        });
+        self.out.push(RiscVOp::Add {
+            rd: x,
+            rs1: x,
+            rs2: t,
+        });
+        self.out.push(RiscVOp::And {
+            rd: x,
+            rs1: x,
+            rs2: m4,
+        });
+        // step 4: x = (x * m8) >> 24  — the byte-sum collapse.
+        self.out.push(RiscVOp::Mul {
+            rd: x,
+            rs1: x,
+            rs2: m8,
+        });
+        let result = self.alloc_temp();
+        self.out.push(RiscVOp::Srli {
+            rd: result,
+            rs1: x,
+            shamt: 24,
+        });
+        result
+    }
+
+    /// 64-bit comparison ladder (lt / le / gt / ge, signed or unsigned).
+    /// Result is an i32 0/1.
+    ///
+    /// The standard hi-then-lo decomposition. For `a < b`:
+    /// ```text
+    ///   if a_hi != b_hi { result = (a_hi <cmp> b_hi) }   ; hi decides
+    ///   else            { result = (a_lo <u  b_lo) }     ; lo is the tie-break
+    /// ```
+    /// The hi-half comparison uses the *signed* relation for signed ops
+    /// (`slt`) and the unsigned relation otherwise (`sltu`). The lo-half
+    /// comparison is **always unsigned** — the low 32 bits are a magnitude,
+    /// the sign lives entirely in the high word.
+    ///
+    /// `le` / `gt` / `ge` are derived from `lt` by the usual identities:
+    /// `a > b ≡ b < a` (swap operands) and `a >= b ≡ !(a < b)`,
+    /// `a <= b ≡ !(b < a)` (swap + invert). We compute a base `lt` with the
+    /// possibly-swapped operands, then optionally flip the 0/1 result.
+    fn lower_i64_cmp(
+        &mut self,
+        op: &WasmOp,
+        kind: CmpKind,
+        signed: bool,
+    ) -> Result<(), SelectorError> {
+        let ((al, ah), (bl, bh)) = self.pop_pair_i64(op)?;
+        // Reduce every variant to a "strictly-less-than" on a chosen operand
+        // ordering, plus an optional final invert:
+        //   lt : less(a, b)            ge : !less(a, b)
+        //   gt : less(b, a)            le : !less(b, a)
+        let (swap, invert) = match kind {
+            CmpKind::Lt => (false, false),
+            CmpKind::Ge => (false, true),
+            CmpKind::Gt => (true, false),
+            CmpKind::Le => (true, true),
+        };
+        let ((xl, xh), (yl, yh)) = if swap {
+            ((bl, bh), (al, ah))
+        } else {
+            ((al, ah), (bl, bh))
+        };
+        // result holds the 0/1 outcome; written by both arms of the branch.
+        let result = self.alloc_temp();
+        let hi_eq = self.fresh_label("Lcmp_hieq");
+        let done = self.fresh_label("Lcmp_done");
+        // beq x_hi, y_hi, Lcmp_hieq → hi halves tie, fall through to lo test.
+        self.out.push(RiscVOp::Branch {
+            cond: Branch::Eq,
+            rs1: xh,
+            rs2: yh,
+            label: hi_eq.clone(),
+        });
+        // hi halves differ: the hi comparison is the whole answer. Signed
+        // ops compare the hi halves *signed* (the sign bit lives here).
+        self.out.push(if signed {
+            RiscVOp::Slt {
+                rd: result,
+                rs1: xh,
+                rs2: yh,
+            }
+        } else {
+            RiscVOp::Sltu {
+                rd: result,
+                rs1: xh,
+                rs2: yh,
+            }
+        });
+        self.out.push(RiscVOp::Jal {
+            rd: Reg::ZERO,
+            label: done.clone(),
+        });
+        // hi halves equal: tie-break on the lo halves, always *unsigned*.
+        self.out.push(RiscVOp::Label { name: hi_eq });
+        self.out.push(RiscVOp::Sltu {
+            rd: result,
+            rs1: xl,
+            rs2: yl,
+        });
+        self.out.push(RiscVOp::Label { name: done });
+        // For ge / le, flip the strict-less result: a >= b ≡ !(a < b).
+        if invert {
+            let flipped = self.alloc_temp();
+            self.out.push(RiscVOp::Xori {
+                rd: flipped,
+                rs1: result,
+                imm: 1,
+            });
+            self.push_i32(flipped);
+        } else {
+            self.push_i32(result);
+        }
+        Ok(())
+    }
+
+    /// In-place sign extension `i64.extend{8,16,32}_s`.
+    ///
+    /// Wasm semantics: the input is an i64; sign-extend its low `width` bits
+    /// across the whole 64-bit value. The high `64 - width` bits of the
+    /// input are discarded.
+    ///
+    /// For width 8 / 16 we sign-extend within the low word by a shift pair
+    /// `(x << (32-width)) >>s (32-width)`, then propagate the sign into the
+    /// high word with `sra lo, 31` (all-ones if negative, all-zeros if not).
+    /// For width 32 the low word is already the full value, so we only need
+    /// the sign-propagation step.
+    fn lower_i64_extend_sub(&mut self, op: &WasmOp, width: u8) -> Result<(), SelectorError> {
+        let (lo_in, _hi_in) = self.pop_i64(op)?;
+        let lo = self.alloc_temp();
+        if width < 32 {
+            // shift = 32 - width ; lo = (lo_in << shift) >>s shift
+            let shift = 32 - width;
+            self.out.push(RiscVOp::Slli {
+                rd: lo,
+                rs1: lo_in,
+                shamt: shift,
+            });
+            self.out.push(RiscVOp::Srai {
+                rd: lo,
+                rs1: lo,
+                shamt: shift,
+            });
+        } else {
+            // width == 32: the low word is already the sign-extended value;
+            // just carry it forward (mv lo, lo_in).
+            self.out.push(RiscVOp::Addi {
+                rd: lo,
+                rs1: lo_in,
+                imm: 0,
+            });
+        }
+        // hi = sra(lo, 31) — broadcast the sign bit across the high word.
+        let hi = self.alloc_temp();
+        self.out.push(RiscVOp::Srai {
+            rd: hi,
+            rs1: lo,
+            shamt: 31,
+        });
+        self.push_i64(lo, hi);
+        Ok(())
+    }
+}
+
+#[derive(Debug, Clone, Copy)]
+enum LoadKind {
+    I8S,
+    I8U,
+    I16S,
+    I16U,
+}
+
+#[derive(Debug, Clone, Copy)]
+enum StoreKind {
+    Word,
+    Half,
+    Byte,
+}
+
+/// Which 64-bit shift to lower. Drives the `lower_i64_shift` cross-word
+/// branching — all three share the same `shamt >= 32` structure but differ
+/// in the per-half instruction (logical vs. arithmetic) and the direction.
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+enum ShiftKind {
+    /// `i64.shl` — logical left.
+    Shl,
+    /// `i64.shr_s` — arithmetic right (sign-propagating).
+    ShrS,
+    /// `i64.shr_u` — logical right (zero-fill).
+    ShrU,
+}
+
+/// Which 64-bit comparison to lower. The signed/unsigned distinction is a
+/// separate flag because only the *hi*-half comparison changes with
+/// signedness — the lo half is always compared unsigned.
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+enum CmpKind {
+    Lt,
+    Le,
+    Gt,
+    Ge,
+}
+
+fn offset_to_imm(offset: u32) -> Result<i32, SelectorError> {
+    if offset > 2047 {
+        // RV32 imm12 only supports ±2 KiB. Larger wasm offsets need an
+        // extra `addi tmp, tmp, hi` step, which we'll add when a real wasm
+        // module hits this. The skeleton fails loudly so we don't silently
+        // truncate.
+        return Err(SelectorError::ImmediateTooLarge {
+            value: offset as i64,
+            context: "memory offset",
+        });
+    }
+    Ok(offset as i32)
+}
+
+/// Materialize a 32-bit immediate into `rd` using `lui + addi` when needed.
+fn emit_load_imm(out: &mut Vec<RiscVOp>, rd: Reg, value: i32) {
+    if (-2048..=2047).contains(&value) {
+        out.push(RiscVOp::Addi {
+            rd,
+            rs1: Reg::ZERO,
+            imm: value,
+        });
+        return;
+    }
+    let value_u = value as u32;
+    let lo12 = (value_u & 0xFFF) as i32;
+    let lo12_signed = if lo12 >= 0x800 { lo12 - 0x1000 } else { lo12 };
+    let hi20 = (value_u.wrapping_sub(lo12_signed as u32)) >> 12;
+    out.push(RiscVOp::Lui {
+        rd,
+        imm20: hi20 & 0xFFFFF,
+    });
+    if lo12_signed != 0 {
+        out.push(RiscVOp::Addi {
+            rd,
+            rs1: rd,
+            imm: lo12_signed,
+        });
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    fn s(ops: &[WasmOp], num_params: u32) -> Vec<RiscVOp> {
+        select(ops, num_params).unwrap().ops
+    }
+
+    fn count<F: Fn(&RiscVOp) -> bool>(out: &[RiscVOp], f: F) -> usize {
+        out.iter().filter(|o| f(o)).count()
+    }
+
+    #[test]
+    fn add_two_params() {
+        let out = s(
+            &[
+                WasmOp::LocalGet(0),
+                WasmOp::LocalGet(1),
+                WasmOp::I32Add,
+                WasmOp::End,
+            ],
+            2,
+        );
+        assert!(count(&out, |op| matches!(op, RiscVOp::Add { .. })) == 1);
+        assert!(matches!(
+            out.last().unwrap(),
+            RiscVOp::Jalr {
+                rd: Reg::ZERO,
+                rs1: Reg::RA,
+                ..
+            }
+        ));
+    }
+
+    #[test]
+    fn div_emits_zero_trap() {
+        let out = s(
+            &[
+                WasmOp::LocalGet(0),
+                WasmOp::LocalGet(1),
+                WasmOp::I32DivS,
+                WasmOp::End,
+            ],
+            2,
+        );
+        // bne ... zero, Ldiv_ok ; ebreak ; Ldiv_ok: ; div ...
+        assert!(
+            count(&out, |op| matches!(
+                op,
+                RiscVOp::Branch {
+                    cond: Branch::Ne,
+                    ..
+                }
+            )) >= 1
+        );
+        assert!(count(&out, |op| matches!(op, RiscVOp::Ebreak)) >= 1);
+        assert!(count(&out, |op| matches!(op, RiscVOp::Div { .. })) >= 1);
+    }
+
+    #[test]
+    fn rem_unsigned_emits_remu() {
+        let out = s(
+            &[
+                WasmOp::LocalGet(0),
                 WasmOp::LocalGet(1),
                 WasmOp::I32RemU,
                 WasmOp::End,
@@ -1752,286 +2825,687 @@ mod tests {
             ],
             1,
         );
-        // add tmp, s11, addr ; lw dst, 0(tmp)
+        // add tmp, s11, addr ; lw dst, 0(tmp)
+        assert!(
+            count(&out, |op| matches!(
+                op,
+                RiscVOp::Add {
+                    rs1: LINEAR_MEM_BASE,
+                    ..
+                }
+            )) == 1
+        );
+        assert!(count(&out, |op| matches!(op, RiscVOp::Lw { .. })) == 1);
+    }
+
+    #[test]
+    fn store_word_uses_sw() {
+        let out = s(
+            &[
+                WasmOp::LocalGet(0), // address
+                WasmOp::LocalGet(1), // value
+                WasmOp::I32Store {
+                    offset: 16,
+                    align: 2,
+                },
+                WasmOp::End,
+            ],
+            2,
+        );
+        let sw = out.iter().find(|op| matches!(op, RiscVOp::Sw { .. }));
+        assert!(matches!(sw, Some(RiscVOp::Sw { imm: 16, .. })));
+    }
+
+    #[test]
+    fn store_byte_uses_sb() {
+        let out = s(
+            &[
+                WasmOp::LocalGet(0),
+                WasmOp::LocalGet(1),
+                WasmOp::I32Store8 {
+                    offset: 0,
+                    align: 0,
+                },
+                WasmOp::End,
+            ],
+            2,
+        );
+        assert!(count(&out, |op| matches!(op, RiscVOp::Sb { .. })) == 1);
+    }
+
+    #[test]
+    fn load_signed_byte_uses_lb() {
+        let out = s(
+            &[
+                WasmOp::LocalGet(0),
+                WasmOp::I32Load8S {
+                    offset: 4,
+                    align: 0,
+                },
+                WasmOp::End,
+            ],
+            1,
+        );
+        assert!(count(&out, |op| matches!(op, RiscVOp::Lb { imm: 4, .. })) == 1);
+    }
+
+    #[test]
+    fn load_unsigned_halfword_uses_lhu() {
+        let out = s(
+            &[
+                WasmOp::LocalGet(0),
+                WasmOp::I32Load16U {
+                    offset: 8,
+                    align: 1,
+                },
+                WasmOp::End,
+            ],
+            1,
+        );
+        assert!(count(&out, |op| matches!(op, RiscVOp::Lhu { imm: 8, .. })) == 1);
+    }
+
+    #[test]
+    fn block_end_emits_label_only() {
+        let out = s(&[WasmOp::Block, WasmOp::End, WasmOp::End], 0);
+        // One Label for the block end + one for the function epilogue's
+        // implicit end... actually the function-level End emits the
+        // epilogue (no label). So exactly 1 Label.
+        assert!(count(&out, |op| matches!(op, RiscVOp::Label { .. })) == 1);
+    }
+
+    #[test]
+    fn loop_emits_head_label() {
+        let out = s(&[WasmOp::Loop, WasmOp::End, WasmOp::End], 0);
+        // Loop emits a head label up front and an end label at End.
+        assert!(count(&out, |op| matches!(op, RiscVOp::Label { .. })) == 2);
+    }
+
+    #[test]
+    fn br_in_loop_jumps_to_head() {
+        let out = s(
+            &[
+                WasmOp::Loop,
+                WasmOp::Br(0),
+                WasmOp::End, // end of loop
+                WasmOp::End, // function end
+            ],
+            0,
+        );
+        // Find the head label name (first Label emitted after Loop).
+        let head_label = match out.iter().find(|op| matches!(op, RiscVOp::Label { .. })) {
+            Some(RiscVOp::Label { name }) => name.clone(),
+            _ => panic!("expected at least one label"),
+        };
+        // The Jal should target the head label, NOT the end label.
+        let jal_target = out.iter().find_map(|op| match op {
+            RiscVOp::Jal { label, .. } => Some(label.clone()),
+            _ => None,
+        });
+        assert_eq!(jal_target, Some(head_label));
+    }
+
+    #[test]
+    fn br_in_block_jumps_to_end() {
+        let out = s(
+            &[
+                WasmOp::Block,
+                WasmOp::Br(0),
+                WasmOp::End, // end of block (this label is the br target)
+                WasmOp::End,
+            ],
+            0,
+        );
+        // The Jal target should be the end label, which is the (only) Label op.
+        let label_name = match out.iter().find(|op| matches!(op, RiscVOp::Label { .. })) {
+            Some(RiscVOp::Label { name }) => name.clone(),
+            _ => panic!("missing label"),
+        };
+        let jal_target = out.iter().find_map(|op| match op {
+            RiscVOp::Jal { label, .. } => Some(label.clone()),
+            _ => None,
+        });
+        assert_eq!(jal_target, Some(label_name));
+    }
+
+    #[test]
+    fn if_without_else_uses_beq_to_end() {
+        let out = s(
+            &[
+                WasmOp::LocalGet(0),
+                WasmOp::If,
+                WasmOp::End, // end of if (no else)
+                WasmOp::End,
+            ],
+            1,
+        );
+        // We expect one beq to skip the then-branch when cond is zero.
+        assert!(
+            count(&out, |op| matches!(
+                op,
+                RiscVOp::Branch {
+                    cond: Branch::Eq,
+                    ..
+                }
+            )) == 1
+        );
+    }
+
+    #[test]
+    fn if_else_emits_jal_skip() {
+        let out = s(
+            &[
+                WasmOp::LocalGet(0),
+                WasmOp::If,
+                WasmOp::Else,
+                WasmOp::End,
+                WasmOp::End,
+            ],
+            1,
+        );
+        // The then-branch needs to jump past the else.
+        assert!(count(&out, |op| matches!(op, RiscVOp::Jal { .. })) >= 1);
+    }
+
+    #[test]
+    fn br_if_uses_bne() {
+        let out = s(
+            &[
+                WasmOp::Block,
+                WasmOp::LocalGet(0),
+                WasmOp::BrIf(0),
+                WasmOp::End,
+                WasmOp::End,
+            ],
+            1,
+        );
         assert!(
             count(&out, |op| matches!(
                 op,
-                RiscVOp::Add {
-                    rs1: LINEAR_MEM_BASE,
+                RiscVOp::Branch {
+                    cond: Branch::Ne,
                     ..
                 }
             )) == 1
         );
-        assert!(count(&out, |op| matches!(op, RiscVOp::Lw { .. })) == 1);
     }
 
     #[test]
-    fn store_word_uses_sw() {
-        let out = s(
+    fn br_out_of_range_errors() {
+        let r = select(
             &[
-                WasmOp::LocalGet(0), // address
-                WasmOp::LocalGet(1), // value
-                WasmOp::I32Store {
-                    offset: 16,
-                    align: 2,
-                },
+                WasmOp::Br(5), // no enclosing frames at all
                 WasmOp::End,
             ],
-            2,
+            0,
         );
-        let sw = out.iter().find(|op| matches!(op, RiscVOp::Sw { .. }));
-        assert!(matches!(sw, Some(RiscVOp::Sw { imm: 16, .. })));
+        assert!(matches!(r, Err(SelectorError::BrOutOfRange { .. })));
     }
 
     #[test]
-    fn store_byte_uses_sb() {
+    fn drop_pops_stack() {
+        let out = s(&[WasmOp::I32Const(42), WasmOp::Drop, WasmOp::End], 0);
+        // No mv to a0 because the stack is empty after drop.
+        let last = out.last().unwrap();
+        assert!(matches!(
+            last,
+            RiscVOp::Jalr {
+                rd: Reg::ZERO,
+                rs1: Reg::RA,
+                ..
+            }
+        ));
+    }
+
+    #[test]
+    fn unreachable_emits_ebreak() {
+        let out = s(&[WasmOp::Unreachable, WasmOp::End], 0);
+        assert!(count(&out, |op| matches!(op, RiscVOp::Ebreak)) == 1);
+    }
+
+    #[test]
+    fn local_tee_keeps_value_on_stack() {
+        // tee(0); add — should produce one mv to the local + one add
         let out = s(
             &[
                 WasmOp::LocalGet(0),
-                WasmOp::LocalGet(1),
-                WasmOp::I32Store8 {
-                    offset: 0,
-                    align: 0,
-                },
+                WasmOp::I32Const(1),
+                WasmOp::LocalTee(0),
+                WasmOp::I32Add,
                 WasmOp::End,
             ],
-            2,
+            1,
         );
-        assert!(count(&out, |op| matches!(op, RiscVOp::Sb { .. })) == 1);
+        assert!(count(&out, |op| matches!(op, RiscVOp::Add { .. })) == 1);
     }
 
     #[test]
-    fn load_signed_byte_uses_lb() {
-        let out = s(
+    fn memory_offset_too_large_errors() {
+        let r = select(
             &[
                 WasmOp::LocalGet(0),
-                WasmOp::I32Load8S {
-                    offset: 4,
-                    align: 0,
+                WasmOp::I32Load {
+                    offset: 4096, // > 2047
+                    align: 2,
                 },
                 WasmOp::End,
             ],
             1,
         );
-        assert!(count(&out, |op| matches!(op, RiscVOp::Lb { imm: 4, .. })) == 1);
+        assert!(matches!(r, Err(SelectorError::ImmediateTooLarge { .. })));
+    }
+
+    // ─── Phase 1 binary-safety tests ────────────────────────────────────
+
+    fn s_with_opts(ops: &[WasmOp], num_params: u32, o: SelectorOptions) -> Vec<RiscVOp> {
+        select_with_options(ops, num_params, o).unwrap().ops
     }
 
     #[test]
-    fn load_unsigned_halfword_uses_lhu() {
-        let out = s(
+    fn rv32_software_bounds_emits_bgeu_and_ebreak() {
+        let opts = SelectorOptions {
+            bounds: RvBoundsMode::Software { mem_size: 0x10000 },
+            signed_div_overflow_trap: true,
+        };
+        let out = s_with_opts(
             &[
                 WasmOp::LocalGet(0),
-                WasmOp::I32Load16U {
-                    offset: 8,
-                    align: 1,
+                WasmOp::I32Load {
+                    offset: 0,
+                    align: 2,
                 },
                 WasmOp::End,
             ],
             1,
+            opts,
         );
-        assert!(count(&out, |op| matches!(op, RiscVOp::Lhu { imm: 8, .. })) == 1);
+        // Expect at least one BGEU against the memsize and one ebreak in the
+        // trap basic block.
+        assert!(
+            count(&out, |op| matches!(
+                op,
+                RiscVOp::Branch {
+                    cond: Branch::Geu,
+                    ..
+                }
+            )) >= 1,
+            "expected at least one bgeu for the bounds check, got: {:?}",
+            out
+        );
+        assert!(
+            count(&out, |op| matches!(op, RiscVOp::Ebreak)) >= 1,
+            "expected an ebreak in the trap path"
+        );
+    }
+
+    // ──────────── i64 Phase-1 tests ────────────
+    //
+    // These tests assert the *shape* of the emitted sequence (op counts, kinds,
+    // and select fields), which is the right granularity for a selector — we
+    // don't want to over-pin register allocation choices.
+
+    /// Helper: build the op sequence for an i64 test scenario. Appends an
+    /// `End` so the function epilogue is emitted; tests work with the full
+    /// output. `num_params` controls how many arg registers are available
+    /// (use 1+ for sequences that contain `LocalGet`).
+    fn run_i64_with_params(seq: &[WasmOp], num_params: u32) -> Vec<RiscVOp> {
+        let mut full = seq.to_vec();
+        full.push(WasmOp::End);
+        s(&full, num_params)
+    }
+
+    /// Most i64 tests use only consts → no LocalGet → no arg regs needed.
+    fn run_i64(seq: &[WasmOp]) -> Vec<RiscVOp> {
+        run_i64_with_params(seq, 0)
     }
 
     #[test]
-    fn block_end_emits_label_only() {
-        let out = s(&[WasmOp::Block, WasmOp::End, WasmOp::End], 0);
-        // One Label for the block end + one for the function epilogue's
-        // implicit end... actually the function-level End emits the
-        // epilogue (no label). So exactly 1 Label.
-        assert!(count(&out, |op| matches!(op, RiscVOp::Label { .. })) == 1);
+    fn i64_const_emits_two_load_imm_sequences() {
+        // I64Const(0x1_0000_0001) → lo = 1, hi = 1. Each half goes through
+        // emit_load_imm (here both fit in the addi short path), giving 2
+        // Addi-from-ZERO ops.
+        let out = run_i64(&[WasmOp::I64Const(0x1_0000_0001), WasmOp::Drop]);
+        let imm_loads = count(&out, |op| {
+            matches!(op, RiscVOp::Addi { rs1: Reg::ZERO, .. })
+        });
+        // 2 for the i64 const (lo + hi), nothing else (Drop emits no code).
+        assert_eq!(imm_loads, 2);
     }
 
     #[test]
-    fn loop_emits_head_label() {
-        let out = s(&[WasmOp::Loop, WasmOp::End, WasmOp::End], 0);
-        // Loop emits a head label up front and an end label at End.
-        assert!(count(&out, |op| matches!(op, RiscVOp::Label { .. })) == 2);
+    fn i64_add_emits_add_sltu_add_add_pattern() {
+        let out = run_i64(&[
+            WasmOp::I64Const(1),
+            WasmOp::I64Const(2),
+            WasmOp::I64Add,
+            WasmOp::Drop,
+        ]);
+        // Skip past the const-materialization Addi/Lui ops and the trailing
+        // Jalr; isolate the I64Add's emitted sequence.
+        let from_add: Vec<&RiscVOp> = out
+            .iter()
+            .skip_while(|o| !matches!(o, RiscVOp::Add { .. }))
+            .collect();
+        // Expected: Add, Sltu, Add, Add, then function epilogue's Jalr.
+        assert!(matches!(from_add[0], RiscVOp::Add { .. }));
+        assert!(matches!(from_add[1], RiscVOp::Sltu { .. }));
+        assert!(matches!(from_add[2], RiscVOp::Add { .. }));
+        assert!(matches!(from_add[3], RiscVOp::Add { .. }));
     }
 
     #[test]
-    fn br_in_loop_jumps_to_head() {
-        let out = s(
-            &[
-                WasmOp::Loop,
-                WasmOp::Br(0),
-                WasmOp::End, // end of loop
-                WasmOp::End, // function end
-            ],
-            0,
-        );
-        // Find the head label name (first Label emitted after Loop).
-        let head_label = match out.iter().find(|op| matches!(op, RiscVOp::Label { .. })) {
-            Some(RiscVOp::Label { name }) => name.clone(),
-            _ => panic!("expected at least one label"),
-        };
-        // The Jal should target the head label, NOT the end label.
-        let jal_target = out.iter().find_map(|op| match op {
-            RiscVOp::Jal { label, .. } => Some(label.clone()),
-            _ => None,
-        });
-        assert_eq!(jal_target, Some(head_label));
+    fn i64_sub_emits_borrow_pattern() {
+        let out = run_i64(&[
+            WasmOp::I64Const(10),
+            WasmOp::I64Const(3),
+            WasmOp::I64Sub,
+            WasmOp::Drop,
+        ]);
+        // Expected: Sltu (borrow), Sub (lo), Sub (hi diff), Sub (hi - borrow)
+        let from_sub: Vec<&RiscVOp> = out
+            .iter()
+            .skip_while(|o| !matches!(o, RiscVOp::Sltu { .. }))
+            .collect();
+        assert!(matches!(from_sub[0], RiscVOp::Sltu { .. }));
+        assert!(matches!(from_sub[1], RiscVOp::Sub { .. }));
+        assert!(matches!(from_sub[2], RiscVOp::Sub { .. }));
+        assert!(matches!(from_sub[3], RiscVOp::Sub { .. }));
     }
 
     #[test]
-    fn br_in_block_jumps_to_end() {
-        let out = s(
-            &[
-                WasmOp::Block,
-                WasmOp::Br(0),
-                WasmOp::End, // end of block (this label is the br target)
-                WasmOp::End,
-            ],
-            0,
+    fn i64_and_or_xor_each_emit_two_ops() {
+        // I64And: two And ops on lo/hi
+        let out_and = run_i64(&[
+            WasmOp::I64Const(1),
+            WasmOp::I64Const(2),
+            WasmOp::I64And,
+            WasmOp::Drop,
+        ]);
+        assert_eq!(
+            count(&out_and, |op| matches!(op, RiscVOp::And { .. })),
+            2,
+            "I64And should emit 2 And ops"
+        );
+
+        let out_or = run_i64(&[
+            WasmOp::I64Const(1),
+            WasmOp::I64Const(2),
+            WasmOp::I64Or,
+            WasmOp::Drop,
+        ]);
+        assert_eq!(
+            count(&out_or, |op| matches!(op, RiscVOp::Or { .. })),
+            2,
+            "I64Or should emit 2 Or ops"
+        );
+
+        let out_xor = run_i64(&[
+            WasmOp::I64Const(1),
+            WasmOp::I64Const(2),
+            WasmOp::I64Xor,
+            WasmOp::Drop,
+        ]);
+        assert_eq!(
+            count(&out_xor, |op| matches!(op, RiscVOp::Xor { .. })),
+            2,
+            "I64Xor should emit 2 Xor ops"
         );
-        // The Jal target should be the end label, which is the (only) Label op.
-        let label_name = match out.iter().find(|op| matches!(op, RiscVOp::Label { .. })) {
-            Some(RiscVOp::Label { name }) => name.clone(),
-            _ => panic!("missing label"),
-        };
-        let jal_target = out.iter().find_map(|op| match op {
-            RiscVOp::Jal { label, .. } => Some(label.clone()),
-            _ => None,
-        });
-        assert_eq!(jal_target, Some(label_name));
     }
 
     #[test]
-    fn if_without_else_uses_beq_to_end() {
-        let out = s(
+    fn rv32_no_bounds_check_emits_no_bgeu() {
+        let opts = SelectorOptions::wasm_compliant();
+        let out = s_with_opts(
             &[
                 WasmOp::LocalGet(0),
-                WasmOp::If,
-                WasmOp::End, // end of if (no else)
+                WasmOp::I32Load {
+                    offset: 0,
+                    align: 2,
+                },
                 WasmOp::End,
             ],
             1,
+            opts,
         );
-        // We expect one beq to skip the then-branch when cond is zero.
+        // No bgeu should be emitted when the bounds mode is `None`.
         assert!(
             count(&out, |op| matches!(
                 op,
                 RiscVOp::Branch {
-                    cond: Branch::Eq,
+                    cond: Branch::Geu,
                     ..
                 }
-            )) == 1
+            )) == 0,
+            "no bounds-check bgeu expected when mode is None, got: {:?}",
+            out
         );
     }
 
     #[test]
-    fn if_else_emits_jal_skip() {
-        let out = s(
-            &[
-                WasmOp::LocalGet(0),
-                WasmOp::If,
-                WasmOp::Else,
-                WasmOp::End,
-                WasmOp::End,
-            ],
+    fn i64_eq_emits_xor_xor_or_sltiu() {
+        let out = run_i64(&[
+            WasmOp::I64Const(1),
+            WasmOp::I64Const(2),
+            WasmOp::I64Eq,
+            WasmOp::Drop,
+        ]);
+        assert_eq!(
+            count(&out, |op| matches!(op, RiscVOp::Xor { .. })),
+            2,
+            "I64Eq emits two Xors (one per half)"
+        );
+        assert_eq!(
+            count(&out, |op| matches!(op, RiscVOp::Or { .. })),
             1,
+            "I64Eq ors the half-diffs together"
+        );
+        assert_eq!(
+            count(&out, |op| matches!(op, RiscVOp::Sltiu { imm: 1, .. })),
+            1,
+            "I64Eq compares the combined diff with 1 (sltiu)"
         );
-        // The then-branch needs to jump past the else.
-        assert!(count(&out, |op| matches!(op, RiscVOp::Jal { .. })) >= 1);
     }
 
     #[test]
-    fn br_if_uses_bne() {
-        let out = s(
-            &[
-                WasmOp::Block,
-                WasmOp::LocalGet(0),
-                WasmOp::BrIf(0),
-                WasmOp::End,
-                WasmOp::End,
-            ],
+    fn i64_ne_emits_xor_xor_or_sltu() {
+        let out = run_i64(&[
+            WasmOp::I64Const(1),
+            WasmOp::I64Const(2),
+            WasmOp::I64Ne,
+            WasmOp::Drop,
+        ]);
+        assert_eq!(count(&out, |op| matches!(op, RiscVOp::Xor { .. })), 2);
+        assert_eq!(count(&out, |op| matches!(op, RiscVOp::Or { .. })), 1);
+        assert_eq!(
+            count(&out, |op| matches!(
+                op,
+                RiscVOp::Sltu { rs1: Reg::ZERO, .. }
+            )),
+            1,
+            "I64Ne uses sltu rd, zero, diff"
+        );
+    }
+
+    #[test]
+    fn i64_eqz_emits_or_sltiu() {
+        // I64Eqz pops i64 and pushes i32 — verify by checking we don't get a
+        // Type-mismatch on a subsequent i32 consumer.
+        let out = run_i64(&[
+            WasmOp::I64Const(0),
+            WasmOp::I64Eqz,
+            // After Eqz the stack should hold an i32; an I32Eqz consumer
+            // confirms the type-state on the vstack.
+            WasmOp::I32Eqz,
+            WasmOp::Drop,
+        ]);
+        assert_eq!(
+            count(&out, |op| matches!(op, RiscVOp::Or { .. })),
             1,
+            "I64Eqz emits a single Or"
+        );
+        // Two Sltiu(imm=1): one from I64Eqz, one from the I32Eqz that follows.
+        assert_eq!(
+            count(&out, |op| matches!(op, RiscVOp::Sltiu { imm: 1, .. })),
+            2,
         );
+    }
+
+    #[test]
+    fn i64_extend_i32_u_pushes_zero_hi() {
+        // I64ExtendI32U: hi = 0 (via `addi rd, ZERO, 0`). The presence of an
+        // extra Addi-from-ZERO with imm=0 is the giveaway.
+        let out = run_i64(&[
+            WasmOp::I32Const(5), // small enough to take the addi short path
+            WasmOp::I64ExtendI32U,
+            WasmOp::Drop,
+        ]);
+        // We get: addi (i32const 5), addi (hi=0). Both have rs1=ZERO; the
+        // hi-zero load uses imm=0.
         assert!(
-            count(&out, |op| matches!(
+            out.iter().any(|op| matches!(
                 op,
-                RiscVOp::Branch {
-                    cond: Branch::Ne,
+                RiscVOp::Addi {
+                    rs1: Reg::ZERO,
+                    imm: 0,
                     ..
                 }
-            )) == 1
+            )),
+            "expected addi rd, zero, 0 to zero the hi half"
         );
     }
 
     #[test]
-    fn br_out_of_range_errors() {
-        let r = select(
-            &[
-                WasmOp::Br(5), // no enclosing frames at all
-                WasmOp::End,
-            ],
-            0,
+    fn i64_extend_i32_s_emits_sra_31() {
+        let out = run_i64(&[WasmOp::I32Const(5), WasmOp::I64ExtendI32S, WasmOp::Drop]);
+        assert_eq!(
+            count(&out, |op| matches!(op, RiscVOp::Srai { shamt: 31, .. })),
+            1,
+            "I64ExtendI32S uses srai by 31 to sign-extend"
         );
-        assert!(matches!(r, Err(SelectorError::BrOutOfRange { .. })));
     }
 
     #[test]
-    fn drop_pops_stack() {
-        let out = s(&[WasmOp::I32Const(42), WasmOp::Drop, WasmOp::End], 0);
-        // No mv to a0 because the stack is empty after drop.
-        let last = out.last().unwrap();
-        assert!(matches!(
-            last,
-            RiscVOp::Jalr {
-                rd: Reg::ZERO,
-                rs1: Reg::RA,
-                ..
-            }
-        ));
+    fn i32_wrap_i64_drops_hi() {
+        // I32WrapI64 emits zero new instructions; the lo half just continues
+        // to live on the value stack as an i32. We verify the op count is
+        // exactly what the surrounding const+drop emits, with no leftover.
+        let baseline = run_i64(&[WasmOp::I64Const(42), WasmOp::Drop]);
+        let with_wrap = run_i64(&[WasmOp::I64Const(42), WasmOp::I32WrapI64, WasmOp::Drop]);
+        assert_eq!(
+            baseline.len(),
+            with_wrap.len(),
+            "I32WrapI64 must not emit any instructions"
+        );
     }
 
     #[test]
-    fn unreachable_emits_ebreak() {
-        let out = s(&[WasmOp::Unreachable, WasmOp::End], 0);
-        assert!(count(&out, |op| matches!(op, RiscVOp::Ebreak)) == 1);
+    fn i64_load_emits_two_lw_at_offset_and_offset_plus_4() {
+        let out = run_i64_with_params(
+            &[
+                WasmOp::LocalGet(0), // address (treated as i32)
+                WasmOp::I64Load {
+                    offset: 16,
+                    align: 3,
+                },
+                WasmOp::Drop,
+            ],
+            1,
+        );
+        // We expect two Lw ops with imms 16 and 20.
+        let lws: Vec<i32> = out
+            .iter()
+            .filter_map(|op| match op {
+                RiscVOp::Lw { imm, .. } => Some(*imm),
+                _ => None,
+            })
+            .collect();
+        assert_eq!(lws, vec![16, 20], "I64Load emits lw @offset and @offset+4");
     }
 
     #[test]
-    fn local_tee_keeps_value_on_stack() {
-        // tee(0); add — should produce one mv to the local + one add
+    fn i64_store_emits_two_sw_at_offset_and_offset_plus_4() {
+        let out = run_i64_with_params(
+            &[
+                WasmOp::LocalGet(0), // address
+                WasmOp::I64Const(0xDEADBEEF_CAFEBABE_u64 as i64),
+                WasmOp::I64Store {
+                    offset: 8,
+                    align: 3,
+                },
+            ],
+            1,
+        );
+        let sws: Vec<i32> = out
+            .iter()
+            .filter_map(|op| match op {
+                RiscVOp::Sw { imm, .. } => Some(*imm),
+                _ => None,
+            })
+            .collect();
+        assert_eq!(sws, vec![8, 12], "I64Store emits sw @offset and @offset+4");
+    }
+
+    // -------- Call (v0.3.1 minimum-viable) --------
+
+    /// Smoke: a single-arg, single-return call. Args move to a0; the
+    /// `RiscVOp::Call` label encodes the func index.
+    #[test]
+    fn call_emits_label_and_argument_marshalling() {
         let out = s(
             &[
-                WasmOp::LocalGet(0),
-                WasmOp::I32Const(1),
-                WasmOp::LocalTee(0),
-                WasmOp::I32Add,
+                WasmOp::LocalGet(0), // arg 0 = a0 (already)
+                WasmOp::Call(7),
                 WasmOp::End,
             ],
             1,
         );
-        assert!(count(&out, |op| matches!(op, RiscVOp::Add { .. })) == 1);
+        // Must contain a Call op with the expected label
+        let has_call = out.iter().any(|op| {
+            matches!(op,
+            RiscVOp::Call { label } if label == "synth_func_7")
+        });
+        assert!(
+            has_call,
+            "expected Call {{ label: \"synth_func_7\" }}, got: {:?}",
+            out
+        );
     }
 
     #[test]
-    fn memory_offset_too_large_errors() {
-        let r = select(
+    fn rv32_pmp_mode_emits_no_inline_check() {
+        let opts = SelectorOptions {
+            bounds: RvBoundsMode::Pmp,
+            signed_div_overflow_trap: true,
+        };
+        let out = s_with_opts(
             &[
                 WasmOp::LocalGet(0),
                 WasmOp::I32Load {
-                    offset: 4096, // > 2047
+                    offset: 0,
                     align: 2,
                 },
                 WasmOp::End,
             ],
             1,
+            opts,
+        );
+        // PMP mode behaves the same as None in code-gen — hardware handles it.
+        assert!(
+            count(&out, |op| matches!(
+                op,
+                RiscVOp::Branch {
+                    cond: Branch::Geu,
+                    ..
+                }
+            )) == 0
         );
-        assert!(matches!(r, Err(SelectorError::ImmediateTooLarge { .. })));
-    }
-
-    // ─── Phase 1 binary-safety tests ────────────────────────────────────
-
-    fn s_with_opts(ops: &[WasmOp], num_params: u32, o: SelectorOptions) -> Vec<RiscVOp> {
-        select_with_options(ops, num_params, o).unwrap().ops
     }
 
     #[test]
-    fn rv32_software_bounds_emits_bgeu_and_ebreak() {
+    fn rv32_mask_mode_emits_andi() {
+        // mask = 65535 (= 0x10000 - 1). 0xFFFF > 0x7FF so emit_load_imm + and.
         let opts = SelectorOptions {
-            bounds: RvBoundsMode::Software { mem_size: 0x10000 },
+            bounds: RvBoundsMode::Mask { mask: 0xFFFF },
             signed_div_overflow_trap: true,
         };
         let out = s_with_opts(
@@ -2046,250 +3520,306 @@ mod tests {
             1,
             opts,
         );
-        // Expect at least one BGEU against the memsize and one ebreak in the
-        // trap basic block.
+        // Either Andi (small mask) or And (large mask) appears for the mask op.
         assert!(
             count(&out, |op| matches!(
+                op,
+                RiscVOp::And { .. } | RiscVOp::Andi { .. }
+            )) >= 1
+        );
+    }
+
+    #[test]
+    fn rv32_signed_div_emits_overflow_guard() {
+        let opts = SelectorOptions::wasm_compliant();
+        let out = s_with_opts(
+            &[
+                WasmOp::LocalGet(0),
+                WasmOp::LocalGet(1),
+                WasmOp::I32DivS,
+                WasmOp::End,
+            ],
+            2,
+            opts,
+        );
+        // Expect: bne rs2,zero (zero-divisor guard) AND bne rs1,INT_MIN (overflow)
+        // AND bne rs2,-1 (overflow). So at least 3 BNE-shaped branches.
+        let bne_count = count(&out, |op| {
+            matches!(
                 op,
                 RiscVOp::Branch {
-                    cond: Branch::Geu,
+                    cond: Branch::Ne,
                     ..
                 }
-            )) >= 1,
-            "expected at least one bgeu for the bounds check, got: {:?}",
-            out
-        );
+            )
+        });
         assert!(
-            count(&out, |op| matches!(op, RiscVOp::Ebreak)) >= 1,
-            "expected an ebreak in the trap path"
+            bne_count >= 3,
+            "expected at least 3 BNEs (zero + INT_MIN + -1 guards), got {} in: {:?}",
+            bne_count,
+            out
         );
-    }
-
-    // ──────────── i64 Phase-1 tests ────────────
-    //
-    // These tests assert the *shape* of the emitted sequence (op counts, kinds,
-    // and select fields), which is the right granularity for a selector — we
-    // don't want to over-pin register allocation choices.
-
-    /// Helper: build the op sequence for an i64 test scenario. Appends an
-    /// `End` so the function epilogue is emitted; tests work with the full
-    /// output. `num_params` controls how many arg registers are available
-    /// (use 1+ for sequences that contain `LocalGet`).
-    fn run_i64_with_params(seq: &[WasmOp], num_params: u32) -> Vec<RiscVOp> {
-        let mut full = seq.to_vec();
-        full.push(WasmOp::End);
-        s(&full, num_params)
-    }
-
-    /// Most i64 tests use only consts → no LocalGet → no arg regs needed.
-    fn run_i64(seq: &[WasmOp]) -> Vec<RiscVOp> {
-        run_i64_with_params(seq, 0)
+        // And two ebreaks: one for div-by-zero, one for the overflow trap.
+        assert!(count(&out, |op| matches!(op, RiscVOp::Ebreak)) >= 2);
     }
 
     #[test]
-    fn i64_const_emits_two_load_imm_sequences() {
-        // I64Const(0x1_0000_0001) → lo = 1, hi = 1. Each half goes through
-        // emit_load_imm (here both fit in the addi short path), giving 2
-        // Addi-from-ZERO ops.
-        let out = run_i64(&[WasmOp::I64Const(0x1_0000_0001), WasmOp::Drop]);
-        let imm_loads = count(&out, |op| {
-            matches!(op, RiscVOp::Addi { rs1: Reg::ZERO, .. })
+    fn rv32_signed_div_overflow_trap_disabled_only_emits_zero_guard() {
+        let opts = SelectorOptions {
+            bounds: RvBoundsMode::None,
+            signed_div_overflow_trap: false,
+        };
+        let out = s_with_opts(
+            &[
+                WasmOp::LocalGet(0),
+                WasmOp::LocalGet(1),
+                WasmOp::I32DivS,
+                WasmOp::End,
+            ],
+            2,
+            opts,
+        );
+        // Only the zero-divisor BNE; no overflow guards.
+        let bne_count = count(&out, |op| {
+            matches!(
+                op,
+                RiscVOp::Branch {
+                    cond: Branch::Ne,
+                    ..
+                }
+            )
         });
-        // 2 for the i64 const (lo + hi), nothing else (Drop emits no code).
-        assert_eq!(imm_loads, 2);
-    }
-
-    #[test]
-    fn i64_add_emits_add_sltu_add_add_pattern() {
-        let out = run_i64(&[
-            WasmOp::I64Const(1),
-            WasmOp::I64Const(2),
-            WasmOp::I64Add,
-            WasmOp::Drop,
-        ]);
-        // Skip past the const-materialization Addi/Lui ops and the trailing
-        // Jalr; isolate the I64Add's emitted sequence.
-        let from_add: Vec<&RiscVOp> = out
-            .iter()
-            .skip_while(|o| !matches!(o, RiscVOp::Add { .. }))
-            .collect();
-        // Expected: Add, Sltu, Add, Add, then function epilogue's Jalr.
-        assert!(matches!(from_add[0], RiscVOp::Add { .. }));
-        assert!(matches!(from_add[1], RiscVOp::Sltu { .. }));
-        assert!(matches!(from_add[2], RiscVOp::Add { .. }));
-        assert!(matches!(from_add[3], RiscVOp::Add { .. }));
-    }
-
-    #[test]
-    fn i64_sub_emits_borrow_pattern() {
-        let out = run_i64(&[
-            WasmOp::I64Const(10),
-            WasmOp::I64Const(3),
-            WasmOp::I64Sub,
-            WasmOp::Drop,
-        ]);
-        // Expected: Sltu (borrow), Sub (lo), Sub (hi diff), Sub (hi - borrow)
-        let from_sub: Vec<&RiscVOp> = out
-            .iter()
-            .skip_while(|o| !matches!(o, RiscVOp::Sltu { .. }))
-            .collect();
-        assert!(matches!(from_sub[0], RiscVOp::Sltu { .. }));
-        assert!(matches!(from_sub[1], RiscVOp::Sub { .. }));
-        assert!(matches!(from_sub[2], RiscVOp::Sub { .. }));
-        assert!(matches!(from_sub[3], RiscVOp::Sub { .. }));
+        assert_eq!(bne_count, 1);
+        assert_eq!(count(&out, |op| matches!(op, RiscVOp::Ebreak)), 1);
     }
 
     #[test]
-    fn i64_and_or_xor_each_emit_two_ops() {
-        // I64And: two And ops on lo/hi
-        let out_and = run_i64(&[
-            WasmOp::I64Const(1),
-            WasmOp::I64Const(2),
-            WasmOp::I64And,
-            WasmOp::Drop,
-        ]);
-        assert_eq!(
-            count(&out_and, |op| matches!(op, RiscVOp::And { .. })),
+    fn rv32_unsigned_div_skips_overflow_guard() {
+        // Unsigned division has no INT_MIN/-1 special case.
+        let opts = SelectorOptions::wasm_compliant();
+        let out = s_with_opts(
+            &[
+                WasmOp::LocalGet(0),
+                WasmOp::LocalGet(1),
+                WasmOp::I32DivU,
+                WasmOp::End,
+            ],
             2,
-            "I64And should emit 2 And ops"
+            opts,
         );
+        let bne_count = count(&out, |op| {
+            matches!(
+                op,
+                RiscVOp::Branch {
+                    cond: Branch::Ne,
+                    ..
+                }
+            )
+        });
+        assert_eq!(bne_count, 1, "only zero-divisor guard expected for div_u");
+    }
 
-        let out_or = run_i64(&[
-            WasmOp::I64Const(1),
-            WasmOp::I64Const(2),
-            WasmOp::I64Or,
-            WasmOp::Drop,
-        ]);
-        assert_eq!(
-            count(&out_or, |op| matches!(op, RiscVOp::Or { .. })),
+    #[test]
+    fn rv32_signed_rem_also_gets_overflow_guard() {
+        let opts = SelectorOptions::wasm_compliant();
+        let out = s_with_opts(
+            &[
+                WasmOp::LocalGet(0),
+                WasmOp::LocalGet(1),
+                WasmOp::I32RemS,
+                WasmOp::End,
+            ],
             2,
-            "I64Or should emit 2 Or ops"
+            opts,
         );
+        let bne_count = count(&out, |op| {
+            matches!(
+                op,
+                RiscVOp::Branch {
+                    cond: Branch::Ne,
+                    ..
+                }
+            )
+        });
+        assert!(bne_count >= 3);
+        assert!(count(&out, |op| matches!(op, RiscVOp::Rem { .. })) == 1);
+    }
 
-        let out_xor = run_i64(&[
-            WasmOp::I64Const(1),
-            WasmOp::I64Const(2),
-            WasmOp::I64Xor,
-            WasmOp::Drop,
-        ]);
-        assert_eq!(
-            count(&out_xor, |op| matches!(op, RiscVOp::Xor { .. })),
-            2,
-            "I64Xor should emit 2 Xor ops"
+    /// Two-arg call: top-of-stack args move to a0 and a1 in source order.
+    /// The lower arg (last-pushed) is the *first* arg per wasm semantics.
+    #[test]
+    fn call_two_args_marshals_to_a0_a1() {
+        let out = s(
+            &[
+                WasmOp::I32Const(11), // pushed first → arg 0 → a0
+                WasmOp::I32Const(22), // pushed second → arg 1 → a1
+                WasmOp::Call(3),
+                WasmOp::End,
+            ],
+            0,
         );
+        let has_call = out
+            .iter()
+            .any(|op| matches!(op, RiscVOp::Call { label } if label == "synth_func_3"));
+        assert!(has_call);
+        // After the call, the return value vreg = a0 is on the stack and
+        // gets moved to a0 by the End epilogue (a no-op).
     }
 
-    #[test]
-    fn rv32_no_bounds_check_emits_no_bgeu() {
-        let opts = SelectorOptions::wasm_compliant();
-        let out = s_with_opts(
+    /// Limitation marker: back-to-back `Call`s with surviving prior
+    /// results need function-signature info we don't yet plumb. v0.3.1
+    /// supports leaf-call patterns; v0.4 will pipe `FuncSig` from the
+    /// decoder and lift this restriction.
+    ///
+    /// This test is deliberately written to FAIL today as documentation
+    /// of the gap — it would be deleted/inverted in v0.4. Marked
+    /// `#[ignore]` so CI passes; revisit when signature plumbing lands.
+    #[test]
+    #[ignore = "v0.3.1 Call lowering needs function-signature info to handle back-to-back calls with surviving results — tracked for v0.4"]
+    fn recursive_self_call_emits_two_call_ops() {
+        let out = s(
             &[
                 WasmOp::LocalGet(0),
-                WasmOp::I32Load {
-                    offset: 0,
-                    align: 2,
-                },
+                WasmOp::I32Const(1),
+                WasmOp::I32Sub,
+                WasmOp::Call(0), // first recursive call
+                WasmOp::LocalGet(0),
+                WasmOp::I32Const(2),
+                WasmOp::I32Sub,
+                WasmOp::Call(0), // second recursive call
+                WasmOp::I32Add,
                 WasmOp::End,
             ],
             1,
-            opts,
-        );
-        // No bgeu should be emitted when the bounds mode is `None`.
-        assert!(
-            count(&out, |op| matches!(
-                op,
-                RiscVOp::Branch {
-                    cond: Branch::Geu,
-                    ..
-                }
-            )) == 0,
-            "no bounds-check bgeu expected when mode is None, got: {:?}",
-            out
         );
+        let call_count = out
+            .iter()
+            .filter(|op| matches!(op, RiscVOp::Call { label } if label == "synth_func_0"))
+            .count();
+        assert_eq!(call_count, 2, "expected 2 calls in fib pattern: {:?}", out);
+    }
+
+    // ──────────── i64 Phase-2 tests ────────────
+    //
+    // Same shape-assertion style as the Phase-1 i64 tests: count/kind of the
+    // emitted `RiscVOp`s, plus structural pins on the hardest lowerings (the
+    // shift cross-word branch and the clz hi-vs-lo branch).
+
+    /// Slice the output starting at the first op matching `pred` — used to
+    /// isolate one op's emitted sequence past the const-materialisation noise.
+    fn slice_from<F: Fn(&RiscVOp) -> bool>(out: &[RiscVOp], pred: F) -> Vec<RiscVOp> {
+        out.iter().skip_while(|o| !pred(o)).cloned().collect()
     }
 
+    // -------- I64Mul --------
+
     #[test]
-    fn i64_eq_emits_xor_xor_or_sltiu() {
+    fn i64_mul_emits_mul_mulhu_cross_terms() {
+        // lo = mul(al,bl); hi = mulhu(al,bl) + mul(al,bh) + mul(ah,bl).
+        // → 3 Mul, 1 Mulhu, 2 Add.
         let out = run_i64(&[
-            WasmOp::I64Const(1),
-            WasmOp::I64Const(2),
-            WasmOp::I64Eq,
+            WasmOp::I64Const(3),
+            WasmOp::I64Const(5),
+            WasmOp::I64Mul,
             WasmOp::Drop,
         ]);
         assert_eq!(
-            count(&out, |op| matches!(op, RiscVOp::Xor { .. })),
-            2,
-            "I64Eq emits two Xors (one per half)"
+            count(&out, |op| matches!(op, RiscVOp::Mul { .. })),
+            3,
+            "I64Mul emits 3 mul (low product + 2 cross terms)"
         );
         assert_eq!(
-            count(&out, |op| matches!(op, RiscVOp::Or { .. })),
+            count(&out, |op| matches!(op, RiscVOp::Mulhu { .. })),
             1,
-            "I64Eq ors the half-diffs together"
+            "I64Mul emits 1 mulhu for the low-product carry"
         );
         assert_eq!(
-            count(&out, |op| matches!(op, RiscVOp::Sltiu { imm: 1, .. })),
-            1,
-            "I64Eq compares the combined diff with 1 (sltiu)"
+            count(&out, |op| matches!(op, RiscVOp::Add { .. })),
+            2,
+            "I64Mul sums the carry and two cross terms with 2 adds"
         );
     }
 
+    // -------- I64Shl / I64ShrS / I64ShrU --------
+
     #[test]
-    fn i64_ne_emits_xor_xor_or_sltu() {
+    fn i64_shl_emits_cross_word_branch_and_two_sequences() {
+        // The shift lowering is data-dependent: it emits a branch on bit 5
+        // of the shift amount plus a small-case and a big-case sequence.
         let out = run_i64(&[
-            WasmOp::I64Const(1),
-            WasmOp::I64Const(2),
-            WasmOp::I64Ne,
+            WasmOp::I64Const(0xFF),
+            WasmOp::I64Const(4),
+            WasmOp::I64Shl,
             WasmOp::Drop,
         ]);
-        assert_eq!(count(&out, |op| matches!(op, RiscVOp::Xor { .. })), 2);
-        assert_eq!(count(&out, |op| matches!(op, RiscVOp::Or { .. })), 1);
-        assert_eq!(
+        // One bne branches to the >= 32 (cross-word) path.
+        assert!(
             count(&out, |op| matches!(
                 op,
-                RiscVOp::Sltu { rs1: Reg::ZERO, .. }
-            )),
-            1,
-            "I64Ne uses sltu rd, zero, diff"
+                RiscVOp::Branch {
+                    cond: Branch::Ne,
+                    ..
+                }
+            )) >= 1,
+            "I64Shl must branch on shamt >= 32"
+        );
+        // One Jal skips the big-case sequence after the small case.
+        assert!(
+            count(&out, |op| matches!(op, RiscVOp::Jal { .. })) >= 1,
+            "I64Shl small-case path jumps over the big-case sequence"
+        );
+        // Two labels: the big-case entry + the join point.
+        assert!(
+            count(&out, |op| matches!(op, RiscVOp::Label { .. })) >= 2,
+            "I64Shl emits a big-case label and a done label"
         );
     }
 
     #[test]
-    fn i64_eqz_emits_or_sltiu() {
-        // I64Eqz pops i64 and pushes i32 — verify by checking we don't get a
-        // Type-mismatch on a subsequent i32 consumer.
+    fn i64_shl_small_case_uses_register_shifts_and_carry_or() {
+        // Pin the within-word (< 32) sequence structure: the small case does
+        // `lo << s`, a carry extracted via `(lo >> 1) >> inv`, `hi << s`, and
+        // ORs the carry in. So the small-case region has Sll/Srl/Or ops and
+        // an Srli (the `>> 1` of the carry extraction).
         let out = run_i64(&[
-            WasmOp::I64Const(0),
-            WasmOp::I64Eqz,
-            // After Eqz the stack should hold an i32; an I32Eqz consumer
-            // confirms the type-state on the vstack.
-            WasmOp::I32Eqz,
+            WasmOp::I64Const(0x1234),
+            WasmOp::I64Const(8),
+            WasmOp::I64Shl,
             WasmOp::Drop,
         ]);
-        assert_eq!(
-            count(&out, |op| matches!(op, RiscVOp::Or { .. })),
-            1,
-            "I64Eqz emits a single Or"
+        // The carry path uses `srli rd, lo, 1`.
+        assert!(
+            count(&out, |op| matches!(op, RiscVOp::Srli { shamt: 1, .. })) >= 1,
+            "I64Shl small case extracts the cross-half carry via `>> 1`"
         );
-        // Two Sltiu(imm=1): one from I64Eqz, one from the I32Eqz that follows.
-        assert_eq!(
-            count(&out, |op| matches!(op, RiscVOp::Sltiu { imm: 1, .. })),
-            2,
+        // At least two register Sll: `lo << s` and `hi << s`.
+        assert!(
+            count(&out, |op| matches!(op, RiscVOp::Sll { .. })) >= 2,
+            "I64Shl small case shifts both halves with register Sll"
+        );
+        // The carry is merged into hi with an Or.
+        assert!(
+            count(&out, |op| matches!(op, RiscVOp::Or { .. })) >= 1,
+            "I64Shl small case ORs the carry into the high half"
         );
     }
 
     #[test]
-    fn i64_extend_i32_u_pushes_zero_hi() {
-        // I64ExtendI32U: hi = 0 (via `addi rd, ZERO, 0`). The presence of an
-        // extra Addi-from-ZERO with imm=0 is the giveaway.
+    fn i64_shl_big_case_zeroes_low_half() {
+        // The >= 32 (big-case) sequence sets lo' = 0 via `addi rd, zero, 0`.
         let out = run_i64(&[
-            WasmOp::I32Const(5), // small enough to take the addi short path
-            WasmOp::I64ExtendI32U,
+            WasmOp::I64Const(0xABCD),
+            WasmOp::I64Const(40),
+            WasmOp::I64Shl,
             WasmOp::Drop,
         ]);
-        // We get: addi (i32const 5), addi (hi=0). Both have rs1=ZERO; the
-        // hi-zero load uses imm=0.
+        // After the Lshift_big label, an addi-from-zero with imm 0 zeroes lo.
+        let big = slice_from(
+            &out,
+            |op| matches!(op, RiscVOp::Label { name } if name.starts_with("Lshift_big")),
+        );
         assert!(
-            out.iter().any(|op| matches!(
+            big.iter().any(|op| matches!(
                 op,
                 RiscVOp::Addi {
                     rs1: Reg::ZERO,
@@ -2297,332 +3827,415 @@ mod tests {
                     ..
                 }
             )),
-            "expected addi rd, zero, 0 to zero the hi half"
+            "I64Shl big case must zero the low half"
         );
     }
 
     #[test]
-    fn i64_extend_i32_s_emits_sra_31() {
-        let out = run_i64(&[WasmOp::I32Const(5), WasmOp::I64ExtendI32S, WasmOp::Drop]);
-        assert_eq!(
-            count(&out, |op| matches!(op, RiscVOp::Srai { shamt: 31, .. })),
-            1,
-            "I64ExtendI32S uses srai by 31 to sign-extend"
+    fn i64_shr_s_uses_sra_in_both_cases() {
+        // shr_s arithmetic-shifts the high half; the big case also produces
+        // the sign fill via `srai hi, 31`.
+        let out = run_i64(&[
+            WasmOp::I64Const(-1),
+            WasmOp::I64Const(3),
+            WasmOp::I64ShrS,
+            WasmOp::Drop,
+        ]);
+        assert!(
+            count(&out, |op| matches!(op, RiscVOp::Sra { .. })) >= 1,
+            "I64ShrS uses arithmetic register shift on the high half"
+        );
+        assert!(
+            count(&out, |op| matches!(op, RiscVOp::Srai { shamt: 31, .. })) >= 1,
+            "I64ShrS big case fills the high half with the sign via srai 31"
         );
     }
 
     #[test]
-    fn i32_wrap_i64_drops_hi() {
-        // I32WrapI64 emits zero new instructions; the lo half just continues
-        // to live on the value stack as an i32. We verify the op count is
-        // exactly what the surrounding const+drop emits, with no leftover.
-        let baseline = run_i64(&[WasmOp::I64Const(42), WasmOp::Drop]);
-        let with_wrap = run_i64(&[WasmOp::I64Const(42), WasmOp::I32WrapI64, WasmOp::Drop]);
+    fn i64_shr_u_uses_only_logical_shifts() {
+        // shr_u must never emit an arithmetic shift — zero-fill throughout.
+        let out = run_i64(&[
+            WasmOp::I64Const(-1),
+            WasmOp::I64Const(5),
+            WasmOp::I64ShrU,
+            WasmOp::Drop,
+        ]);
         assert_eq!(
-            baseline.len(),
-            with_wrap.len(),
-            "I32WrapI64 must not emit any instructions"
+            count(&out, |op| matches!(op, RiscVOp::Sra { .. })),
+            0,
+            "I64ShrU is logical — no arithmetic register shift"
         );
-    }
-
-    #[test]
-    fn i64_load_emits_two_lw_at_offset_and_offset_plus_4() {
-        let out = run_i64_with_params(
-            &[
-                WasmOp::LocalGet(0), // address (treated as i32)
-                WasmOp::I64Load {
-                    offset: 16,
-                    align: 3,
-                },
-                WasmOp::Drop,
-            ],
-            1,
+        assert_eq!(
+            count(&out, |op| matches!(op, RiscVOp::Srai { .. })),
+            0,
+            "I64ShrU is logical — no arithmetic immediate shift"
         );
-        // We expect two Lw ops with imms 16 and 20.
-        let lws: Vec<i32> = out
-            .iter()
-            .filter_map(|op| match op {
-                RiscVOp::Lw { imm, .. } => Some(*imm),
-                _ => None,
-            })
-            .collect();
-        assert_eq!(lws, vec![16, 20], "I64Load emits lw @offset and @offset+4");
-    }
-
-    #[test]
-    fn i64_store_emits_two_sw_at_offset_and_offset_plus_4() {
-        let out = run_i64_with_params(
-            &[
-                WasmOp::LocalGet(0), // address
-                WasmOp::I64Const(0xDEADBEEF_CAFEBABE_u64 as i64),
-                WasmOp::I64Store {
-                    offset: 8,
-                    align: 3,
-                },
-            ],
-            1,
+        assert!(
+            count(&out, |op| matches!(op, RiscVOp::Srl { .. })) >= 1,
+            "I64ShrU uses logical register shifts"
         );
-        let sws: Vec<i32> = out
-            .iter()
-            .filter_map(|op| match op {
-                RiscVOp::Sw { imm, .. } => Some(*imm),
-                _ => None,
-            })
-            .collect();
-        assert_eq!(sws, vec![8, 12], "I64Store emits sw @offset and @offset+4");
     }
 
-    // -------- Call (v0.3.1 minimum-viable) --------
+    // -------- I64Rotl / I64Rotr --------
 
-    /// Smoke: a single-arg, single-return call. Args move to a0; the
-    /// `RiscVOp::Call` label encodes the func index.
     #[test]
-    fn call_emits_label_and_argument_marshalling() {
-        let out = s(
-            &[
-                WasmOp::LocalGet(0), // arg 0 = a0 (already)
-                WasmOp::Call(7),
-                WasmOp::End,
-            ],
-            1,
+    fn i64_rotl_composes_two_shifts() {
+        // rotl(x,n) = (x << n) | (x >> (64-n)). Each shift is a full
+        // cross-word lowering, so we get two branches and a final pair of
+        // ORs joining the two i64 results.
+        let out = run_i64(&[
+            WasmOp::I64Const(0x1),
+            WasmOp::I64Const(7),
+            WasmOp::I64Rotl,
+            WasmOp::Drop,
+        ]);
+        // Two cross-word shift branches (one per composed shift).
+        assert!(
+            count(&out, |op| matches!(
+                op,
+                RiscVOp::Branch {
+                    cond: Branch::Ne,
+                    ..
+                }
+            )) >= 2,
+            "I64Rotl composes two shifts, each with its own cross-word branch"
         );
-        // Must contain a Call op with the expected label
-        let has_call = out.iter().any(|op| {
-            matches!(op,
-            RiscVOp::Call { label } if label == "synth_func_7")
-        });
+        // The two i64 shift results are ORed half-by-half at the end.
         assert!(
-            has_call,
-            "expected Call {{ label: \"synth_func_7\" }}, got: {:?}",
-            out
+            count(&out, |op| matches!(op, RiscVOp::Or { .. })) >= 2,
+            "I64Rotl ORs the two shift results half by half"
         );
     }
 
     #[test]
-    fn rv32_pmp_mode_emits_no_inline_check() {
-        let opts = SelectorOptions {
-            bounds: RvBoundsMode::Pmp,
-            signed_div_overflow_trap: true,
-        };
-        let out = s_with_opts(
-            &[
-                WasmOp::LocalGet(0),
-                WasmOp::I32Load {
-                    offset: 0,
-                    align: 2,
-                },
-                WasmOp::End,
-            ],
-            1,
-            opts,
-        );
-        // PMP mode behaves the same as None in code-gen — hardware handles it.
+    fn i64_rotr_composes_two_shifts() {
+        let out = run_i64(&[
+            WasmOp::I64Const(0x8000_0000),
+            WasmOp::I64Const(9),
+            WasmOp::I64Rotr,
+            WasmOp::Drop,
+        ]);
         assert!(
             count(&out, |op| matches!(
                 op,
                 RiscVOp::Branch {
-                    cond: Branch::Geu,
+                    cond: Branch::Ne,
                     ..
                 }
-            )) == 0
+            )) >= 2
         );
+        assert!(count(&out, |op| matches!(op, RiscVOp::Or { .. })) >= 2);
     }
 
+    // -------- I64Clz / I64Ctz / I64Popcnt --------
+
     #[test]
-    fn rv32_mask_mode_emits_andi() {
-        // mask = 65535 (= 0x10000 - 1). 0xFFFF > 0x7FF so emit_load_imm + and.
-        let opts = SelectorOptions {
-            bounds: RvBoundsMode::Mask { mask: 0xFFFF },
-            signed_div_overflow_trap: true,
-        };
-        let out = s_with_opts(
-            &[
-                WasmOp::LocalGet(0),
-                WasmOp::I32Load {
-                    offset: 0,
-                    align: 2,
-                },
-                WasmOp::End,
-            ],
-            1,
-            opts,
+    fn i64_clz_branches_on_high_half() {
+        // clz: `if hi != 0 { clz_word(hi) } else { 32 + clz_word(lo) }`.
+        // The hi-vs-lo decision is a `beq hi, zero, ...`.
+        let out = run_i64(&[WasmOp::I64Const(0x1234), WasmOp::I64Clz, WasmOp::Drop]);
+        assert!(
+            count(&out, |op| matches!(
+                op,
+                RiscVOp::Branch {
+                    cond: Branch::Eq,
+                    rs2: Reg::ZERO,
+                    ..
+                }
+            )) >= 1,
+            "I64Clz branches on whether the high half is zero"
         );
-        // Either Andi (small mask) or And (large mask) appears for the mask op.
+        // The hi==0 arm adds 32 to the low-word clz — `addi rd, _, 32`.
         assert!(
-            count(&out, |op| matches!(
-                op,
-                RiscVOp::And { .. } | RiscVOp::Andi { .. }
-            )) >= 1
+            count(&out, |op| matches!(op, RiscVOp::Addi { imm: 32, .. })) >= 1,
+            "I64Clz hi==0 arm adds 32 for the all-zero high word"
         );
     }
 
     #[test]
-    fn rv32_signed_div_emits_overflow_guard() {
-        let opts = SelectorOptions::wasm_compliant();
-        let out = s_with_opts(
-            &[
-                WasmOp::LocalGet(0),
-                WasmOp::LocalGet(1),
-                WasmOp::I32DivS,
-                WasmOp::End,
-            ],
-            2,
-            opts,
-        );
-        // Expect: bne rs2,zero (zero-divisor guard) AND bne rs1,INT_MIN (overflow)
-        // AND bne rs2,-1 (overflow). So at least 3 BNE-shaped branches.
-        let bne_count = count(&out, |op| {
-            matches!(
+    fn i64_clz_word_helper_uses_binary_search_probes() {
+        // emit_clz_word probes widths 16/8/4/2/1; each probe shifts the
+        // value down by 32-k. So the clz lowering must contain srli ops at
+        // shamts 16, 24, 28, 30, 31 (one set per half → at least one of each).
+        let out = run_i64(&[WasmOp::I64Const(0x40), WasmOp::I64Clz, WasmOp::Drop]);
+        for shamt in [16u8, 24, 28, 30, 31] {
+            assert!(
+                count(
+                    &out,
+                    |op| matches!(op, RiscVOp::Srli { shamt: s, .. } if *s == shamt)
+                ) >= 1,
+                "I64Clz binary-search probe at shamt {} missing",
+                shamt
+            );
+        }
+    }
+
+    #[test]
+    fn i64_ctz_branches_on_low_half() {
+        // ctz mirrors clz: branch on whether the *low* half is zero.
+        let out = run_i64(&[WasmOp::I64Const(0x100), WasmOp::I64Ctz, WasmOp::Drop]);
+        assert!(
+            count(&out, |op| matches!(
                 op,
                 RiscVOp::Branch {
-                    cond: Branch::Ne,
+                    cond: Branch::Eq,
+                    rs2: Reg::ZERO,
                     ..
                 }
-            )
-        });
+            )) >= 1,
+            "I64Ctz branches on whether the low half is zero"
+        );
+        // ctz_word isolates the lowest set bit via `x & -x`: a Sub-from-zero
+        // (negate) followed by an And.
         assert!(
-            bne_count >= 3,
-            "expected at least 3 BNEs (zero + INT_MIN + -1 guards), got {} in: {:?}",
-            bne_count,
-            out
+            count(&out, |op| matches!(op, RiscVOp::Sub { rs1: Reg::ZERO, .. })) >= 1,
+            "I64Ctz isolates the lowest set bit (negate via sub from zero)"
         );
-        // And two ebreaks: one for div-by-zero, one for the overflow trap.
-        assert!(count(&out, |op| matches!(op, RiscVOp::Ebreak)) >= 2);
     }
 
     #[test]
-    fn rv32_signed_div_overflow_trap_disabled_only_emits_zero_guard() {
-        let opts = SelectorOptions {
-            bounds: RvBoundsMode::None,
-            signed_div_overflow_trap: false,
-        };
-        let out = s_with_opts(
-            &[
-                WasmOp::LocalGet(0),
-                WasmOp::LocalGet(1),
-                WasmOp::I32DivS,
-                WasmOp::End,
-            ],
+    fn i64_popcnt_sums_two_word_popcounts() {
+        // popcnt(x) = popcnt(lo) + popcnt(hi). Each word popcount ends with a
+        // `mul` by 0x01010101; so two Muls, and a final Add joining them.
+        let out = run_i64(&[
+            WasmOp::I64Const(0xFFFF_FFFF),
+            WasmOp::I64Popcnt,
+            WasmOp::Drop,
+        ]);
+        assert_eq!(
+            count(&out, |op| matches!(op, RiscVOp::Mul { .. })),
             2,
-            opts,
+            "I64Popcnt runs the SWAR mul-collapse once per half"
+        );
+        assert!(
+            count(&out, |op| matches!(op, RiscVOp::Srli { shamt: 24, .. })) >= 2,
+            "each word popcount finishes with `>> 24`"
         );
-        // Only the zero-divisor BNE; no overflow guards.
-        let bne_count = count(&out, |op| {
-            matches!(
-                op,
-                RiscVOp::Branch {
-                    cond: Branch::Ne,
-                    ..
-                }
-            )
-        });
-        assert_eq!(bne_count, 1);
-        assert_eq!(count(&out, |op| matches!(op, RiscVOp::Ebreak)), 1);
     }
 
+    // -------- I64 comparisons --------
+
     #[test]
-    fn rv32_unsigned_div_skips_overflow_guard() {
-        // Unsigned division has no INT_MIN/-1 special case.
-        let opts = SelectorOptions::wasm_compliant();
-        let out = s_with_opts(
-            &[
-                WasmOp::LocalGet(0),
-                WasmOp::LocalGet(1),
-                WasmOp::I32DivU,
-                WasmOp::End,
-            ],
-            2,
-            opts,
+    fn i64_lt_s_uses_signed_hi_unsigned_lo() {
+        // Signed lt: hi-half compared with slt (signed), lo-half with sltu.
+        let out = run_i64(&[
+            WasmOp::I64Const(-5),
+            WasmOp::I64Const(7),
+            WasmOp::I64LtS,
+            WasmOp::Drop,
+        ]);
+        assert!(
+            count(&out, |op| matches!(op, RiscVOp::Slt { .. })) >= 1,
+            "I64LtS compares the high halves signed"
         );
-        let bne_count = count(&out, |op| {
-            matches!(
+        assert!(
+            count(&out, |op| matches!(op, RiscVOp::Sltu { .. })) >= 1,
+            "I64LtS tie-breaks the low halves unsigned"
+        );
+        // The hi-equal tie-break is gated by a `beq hi, hi`.
+        assert!(
+            count(&out, |op| matches!(
                 op,
                 RiscVOp::Branch {
-                    cond: Branch::Ne,
+                    cond: Branch::Eq,
                     ..
                 }
-            )
-        });
-        assert_eq!(bne_count, 1, "only zero-divisor guard expected for div_u");
+            )) >= 1,
+            "I64LtS branches when the high halves are equal"
+        );
     }
 
     #[test]
-    fn rv32_signed_rem_also_gets_overflow_guard() {
-        let opts = SelectorOptions::wasm_compliant();
-        let out = s_with_opts(
-            &[
-                WasmOp::LocalGet(0),
-                WasmOp::LocalGet(1),
-                WasmOp::I32RemS,
-                WasmOp::End,
-            ],
+    fn i64_lt_u_uses_unsigned_hi_compare() {
+        // Unsigned lt: the hi-half comparison must be sltu, never slt.
+        let out = run_i64(&[
+            WasmOp::I64Const(1),
+            WasmOp::I64Const(2),
+            WasmOp::I64LtU,
+            WasmOp::Drop,
+        ]);
+        assert_eq!(
+            count(&out, |op| matches!(op, RiscVOp::Slt { .. })),
+            0,
+            "I64LtU never uses a signed compare"
+        );
+        // Two sltu: one for the hi half, one for the lo tie-break.
+        assert_eq!(
+            count(&out, |op| matches!(op, RiscVOp::Sltu { .. })),
             2,
-            opts,
+            "I64LtU compares hi and lo halves, both unsigned"
         );
-        let bne_count = count(&out, |op| {
-            matches!(
-                op,
-                RiscVOp::Branch {
-                    cond: Branch::Ne,
-                    ..
-                }
-            )
-        });
-        assert!(bne_count >= 3);
-        assert!(count(&out, |op| matches!(op, RiscVOp::Rem { .. })) == 1);
     }
 
-    /// Two-arg call: top-of-stack args move to a0 and a1 in source order.
-    /// The lower arg (last-pushed) is the *first* arg per wasm semantics.
     #[test]
-    fn call_two_args_marshals_to_a0_a1() {
-        let out = s(
-            &[
-                WasmOp::I32Const(11), // pushed first → arg 0 → a0
-                WasmOp::I32Const(22), // pushed second → arg 1 → a1
-                WasmOp::Call(3),
-                WasmOp::End,
-            ],
+    fn i64_ge_s_inverts_the_lt_result() {
+        // ge = !lt, so the lowering ends with an `xori rd, _, 1`.
+        let out = run_i64(&[
+            WasmOp::I64Const(9),
+            WasmOp::I64Const(4),
+            WasmOp::I64GeS,
+            WasmOp::Drop,
+        ]);
+        assert_eq!(
+            count(&out, |op| matches!(op, RiscVOp::Xori { imm: 1, .. })),
+            1,
+            "I64GeS flips the strict-less result with xori 1"
+        );
+    }
+
+    #[test]
+    fn i64_gt_u_swaps_operands_no_invert() {
+        // gt = less(b, a): operand swap, no final invert → no xori.
+        let out = run_i64(&[
+            WasmOp::I64Const(3),
+            WasmOp::I64Const(8),
+            WasmOp::I64GtU,
+            WasmOp::Drop,
+        ]);
+        assert_eq!(
+            count(&out, |op| matches!(op, RiscVOp::Xori { imm: 1, .. })),
             0,
+            "I64GtU is a swapped less-than — no result inversion"
+        );
+        assert_eq!(
+            count(&out, |op| matches!(op, RiscVOp::Sltu { .. })),
+            2,
+            "I64GtU still does the hi + lo unsigned compares"
         );
-        let has_call = out
-            .iter()
-            .any(|op| matches!(op, RiscVOp::Call { label } if label == "synth_func_3"));
-        assert!(has_call);
-        // After the call, the return value vreg = a0 is on the stack and
-        // gets moved to a0 by the End epilogue (a no-op).
     }
 
-    /// Limitation marker: back-to-back `Call`s with surviving prior
-    /// results need function-signature info we don't yet plumb. v0.3.1
-    /// supports leaf-call patterns; v0.4 will pipe `FuncSig` from the
-    /// decoder and lift this restriction.
-    ///
-    /// This test is deliberately written to FAIL today as documentation
-    /// of the gap — it would be deleted/inverted in v0.4. Marked
-    /// `#[ignore]` so CI passes; revisit when signature plumbing lands.
     #[test]
-    #[ignore = "v0.3.1 Call lowering needs function-signature info to handle back-to-back calls with surviving results — tracked for v0.4"]
-    fn recursive_self_call_emits_two_call_ops() {
-        let out = s(
-            &[
-                WasmOp::LocalGet(0),
-                WasmOp::I32Const(1),
-                WasmOp::I32Sub,
-                WasmOp::Call(0), // first recursive call
-                WasmOp::LocalGet(0),
-                WasmOp::I32Const(2),
-                WasmOp::I32Sub,
-                WasmOp::Call(0), // second recursive call
-                WasmOp::I32Add,
-                WasmOp::End,
-            ],
+    fn i64_le_s_swaps_and_inverts() {
+        // le = !less(b, a): both a swap and a final invert.
+        let out = run_i64(&[
+            WasmOp::I64Const(2),
+            WasmOp::I64Const(2),
+            WasmOp::I64LeS,
+            WasmOp::Drop,
+        ]);
+        assert_eq!(
+            count(&out, |op| matches!(op, RiscVOp::Xori { imm: 1, .. })),
             1,
+            "I64LeS inverts the swapped less-than result"
         );
-        let call_count = out
-            .iter()
-            .filter(|op| matches!(op, RiscVOp::Call { label } if label == "synth_func_0"))
-            .count();
-        assert_eq!(call_count, 2, "expected 2 calls in fib pattern: {:?}", out);
+    }
+
+    // -------- I64Extend8S / I64Extend16S / I64Extend32S --------
+
+    #[test]
+    fn i64_extend8_s_shifts_24_then_sign_propagates() {
+        // extend8_s: low byte sign-extended via (x << 24) >>s 24, then the
+        // high word filled via srai 31.
+        let out = run_i64(&[WasmOp::I64Const(0xFF), WasmOp::I64Extend8S, WasmOp::Drop]);
+        assert!(
+            count(&out, |op| matches!(op, RiscVOp::Slli { shamt: 24, .. })) == 1,
+            "I64Extend8S left-shifts the low byte by 24"
+        );
+        assert!(
+            count(&out, |op| matches!(op, RiscVOp::Srai { shamt: 24, .. })) == 1,
+            "I64Extend8S arithmetic-shifts back by 24"
+        );
+        assert!(
+            count(&out, |op| matches!(op, RiscVOp::Srai { shamt: 31, .. })) == 1,
+            "I64Extend8S broadcasts the sign into the high word via srai 31"
+        );
+    }
+
+    #[test]
+    fn i64_extend16_s_shifts_16() {
+        let out = run_i64(&[WasmOp::I64Const(0xFFFF), WasmOp::I64Extend16S, WasmOp::Drop]);
+        assert_eq!(
+            count(&out, |op| matches!(op, RiscVOp::Slli { shamt: 16, .. })),
+            1,
+            "I64Extend16S left-shifts the low halfword by 16"
+        );
+        assert_eq!(
+            count(&out, |op| matches!(op, RiscVOp::Srai { shamt: 16, .. })),
+            1,
+        );
+    }
+
+    #[test]
+    fn i64_extend32_s_only_sign_propagates() {
+        // width 32: the low word is already the value, so no 32-width shift
+        // pair — only the `srai 31` sign propagation into the high word.
+        let out = run_i64(&[
+            WasmOp::I64Const(0x7FFF_FFFF),
+            WasmOp::I64Extend32S,
+            WasmOp::Drop,
+        ]);
+        assert_eq!(
+            count(&out, |op| matches!(op, RiscVOp::Slli { .. })),
+            0,
+            "I64Extend32S needs no low-word shift — the word is already 32-bit"
+        );
+        assert_eq!(
+            count(&out, |op| matches!(op, RiscVOp::Srai { shamt: 31, .. })),
+            1,
+            "I64Extend32S propagates the sign into the high word"
+        );
+    }
+
+    // -------- deferred ops fail loudly --------
+
+    #[test]
+    fn i64_div_rem_are_unsupported_phase3() {
+        // Division/remainder on i64 are deferred to Phase 3 — they must
+        // surface as `Unsupported`, never silently miscompile.
+        for op in [
+            WasmOp::I64DivS,
+            WasmOp::I64DivU,
+            WasmOp::I64RemS,
+            WasmOp::I64RemU,
+        ] {
+            let r = select(
+                &[
+                    WasmOp::I64Const(10),
+                    WasmOp::I64Const(3),
+                    op.clone(),
+                    WasmOp::End,
+                ],
+                0,
+            )
+            .map(|_| ()); // RiscVSelection isn't Debug — drop the Ok payload.
+            assert!(
+                matches!(r, Err(SelectorError::Unsupported(_))),
+                "{:?} should be Unsupported (Phase 3), got {:?}",
+                op,
+                r
+            );
+        }
+    }
+
+    // -------- type-state plumbing --------
+
+    #[test]
+    fn i64_cmp_result_is_i32_typed() {
+        // An i64 comparison pushes an i32; a following i32 consumer must not
+        // hit a type mismatch.
+        let out = run_i64(&[
+            WasmOp::I64Const(1),
+            WasmOp::I64Const(2),
+            WasmOp::I64LtU,
+            WasmOp::I32Eqz, // consumes the i32 result of the i64 compare
+            WasmOp::Drop,
+        ]);
+        // I32Eqz lowered fine → the vstack carried an i32 after I64LtU.
+        assert!(count(&out, |op| matches!(op, RiscVOp::Sltiu { imm: 1, .. })) >= 1);
+    }
+
+    #[test]
+    fn i64_shift_result_is_i64_typed() {
+        // A shift pushes an i64; a following i64 consumer (I64Add) must lower.
+        let out = run_i64(&[
+            WasmOp::I64Const(1),
+            WasmOp::I64Const(4),
+            WasmOp::I64Shl,
+            WasmOp::I64Const(1),
+            WasmOp::I64Add,
+            WasmOp::Drop,
+        ]);
+        // I64Add emits its add/sltu/add/add quartet → at least 3 Add ops.
+        assert!(count(&out, |op| matches!(op, RiscVOp::Add { .. })) >= 3);
     }
 }