tests pass

assembly/amd: use Reg in emu
2026-06-24 02:14:17 +00:00 · 2025-12-30 13:49:05 +00:00 · 2025-12-30 12:52:03 +00:00
6 changed files with 1616 additions and 7666 deletions
--- a/extra/assembly/amd/autogen/rdna3/gen_pcode.py
+++ b/extra/assembly/amd/autogen/rdna3/gen_pcode.py
--- a/extra/assembly/amd/emu.py
+++ b/extra/assembly/amd/emu.py
@ -3,7 +3,7 @@
 from __future__ import annotations
 import ctypes, os
 from extra.assembly.amd.dsl import Inst, RawImm
-from extra.assembly.amd.pcode import _f32, _i32, _sext, _f16, _i16, _f64, _i64
+from extra.assembly.amd.pcode import _f32, _i32, _sext, _f16, _i16, _f64, _i64, Reg
 from extra.assembly.amd.autogen.rdna3.gen_pcode import get_compiled_functions
 from extra.assembly.amd.autogen.rdna3 import (
  SOP1, SOP2, SOPC, SOPK, SOPP, SMEM, VOP1, VOP2, VOP3, VOP3SD, VOP3P, VOPC, DS, FLAT, VOPD, SrcEnum,
@ -90,53 +90,84 @@ def _get_compiled() -> dict:
  return _COMPILED

 class WaveState:
-  __slots__ = ('sgpr', 'vgpr', 'scc', 'pc', 'literal', '_pend_sgpr')
+  __slots__ = ('sgpr', 'vgpr', 'scc', 'pc', 'literal', '_pend_sgpr', '_scc_reg', '_vcc_reg', '_exec_reg')
  def __init__(self):
-    self.sgpr, self.vgpr = [0] * SGPR_COUNT, [[0] * VGPR_COUNT for _ in range(WAVE_SIZE)]
-    self.sgpr[EXEC_LO], self.scc, self.pc, self.literal, self._pend_sgpr = 0xffffffff, 0, 0, 0, {}
+    self.sgpr = [Reg(0) for _ in range(SGPR_COUNT)]
+    self.vgpr = [[Reg(0) for _ in range(VGPR_COUNT)] for _ in range(WAVE_SIZE)]
+    self.sgpr[EXEC_LO]._val = 0xffffffff
+    self.scc, self.pc, self.literal, self._pend_sgpr = 0, 0, 0, {}
+    # Reg wrappers for pseudocode access
+    self._scc_reg = Reg(0)
+    self._vcc_reg = self.sgpr[VCC_LO]
+    self._exec_reg = self.sgpr[EXEC_LO]

  @property
-  def vcc(self) -> int: return self.sgpr[VCC_LO] | (self.sgpr[VCC_HI] << 32)
+  def vcc(self) -> int: return self.sgpr[VCC_LO]._val | (self.sgpr[VCC_HI]._val << 32)
  @vcc.setter
-  def vcc(self, v: int): self.sgpr[VCC_LO], self.sgpr[VCC_HI] = v & 0xffffffff, (v >> 32) & 0xffffffff
+  def vcc(self, v: int): self.sgpr[VCC_LO]._val, self.sgpr[VCC_HI]._val = v & 0xffffffff, (v >> 32) & 0xffffffff
  @property
-  def exec_mask(self) -> int: return self.sgpr[EXEC_LO] | (self.sgpr[EXEC_HI] << 32)
+  def exec_mask(self) -> int: return self.sgpr[EXEC_LO]._val | (self.sgpr[EXEC_HI]._val << 32)
  @exec_mask.setter
-  def exec_mask(self, v: int): self.sgpr[EXEC_LO], self.sgpr[EXEC_HI] = v & 0xffffffff, (v >> 32) & 0xffffffff
+  def exec_mask(self, v: int): self.sgpr[EXEC_LO]._val, self.sgpr[EXEC_HI]._val = v & 0xffffffff, (v >> 32) & 0xffffffff

-  def rsgpr(self, i: int) -> int: return 0 if i == NULL else self.scc if i == SCC else self.sgpr[i] if i < SGPR_COUNT else 0
+  def rsgpr(self, i: int) -> int: return 0 if i == NULL else self.scc if i == SCC else self.sgpr[i]._val if i < SGPR_COUNT else 0
  def wsgpr(self, i: int, v: int):
-    if i < SGPR_COUNT and i != NULL: self.sgpr[i] = v & 0xffffffff
+    if i < SGPR_COUNT and i != NULL: self.sgpr[i]._val = v & 0xffffffff
  def rsgpr64(self, i: int) -> int: return self.rsgpr(i) | (self.rsgpr(i+1) << 32)
  def wsgpr64(self, i: int, v: int): self.wsgpr(i, v & 0xffffffff); self.wsgpr(i+1, (v >> 32) & 0xffffffff)

  def rsrc(self, v: int, lane: int) -> int:
-    if v < SGPR_COUNT: return self.sgpr[v]
+    if v < SGPR_COUNT: return self.sgpr[v]._val
    if v == SCC: return self.scc
    if v < 255: return _INLINE_CONSTS[v - 128]
    if v == 255: return self.literal
-    return self.vgpr[lane][v - 256] if v <= 511 else 0
+    return self.vgpr[lane][v - 256]._val if v <= 511 else 0
+
+  def rsrc_reg(self, v: int, lane: int) -> Reg:
+    """Return the Reg object for a source operand."""
+    if v < SGPR_COUNT: return self.sgpr[v]
+    if v == SCC: self._scc_reg._val = self.scc; return self._scc_reg
+    if v < 255: return Reg(_INLINE_CONSTS[v - 128])
+    if v == 255: return Reg(self.literal)
+    return self.vgpr[lane][v - 256] if v <= 511 else Reg(0)

  def rsrc_f16(self, v: int, lane: int) -> int:
    """Read source operand for VOP3P packed f16 operations. Uses f16 inline constants."""
-    if v < SGPR_COUNT: return self.sgpr[v]
+    if v < SGPR_COUNT: return self.sgpr[v]._val
    if v == SCC: return self.scc
    if v < 255: return _INLINE_CONSTS_F16[v - 128]
    if v == 255: return self.literal
-    return self.vgpr[lane][v - 256] if v <= 511 else 0
+    return self.vgpr[lane][v - 256]._val if v <= 511 else 0
+
+  def rsrc_reg_f16(self, v: int, lane: int) -> Reg:
+    """Return Reg for VOP3P source. Inline constants are f16 in low 16 bits only."""
+    if v < SGPR_COUNT: return self.sgpr[v]
+    if v == SCC: self._scc_reg._val = self.scc; return self._scc_reg
+    if v < 255: return Reg(_INLINE_CONSTS_F16[v - 128])  # f16 inline constant
+    if v == 255: return Reg(self.literal)
+    return self.vgpr[lane][v - 256] if v <= 511 else Reg(0)

  def rsrc64(self, v: int, lane: int) -> int:
    """Read 64-bit source operand. For inline constants, returns 64-bit representation."""
-    # Inline constants 128-254 need special handling for 64-bit ops
    if 128 <= v < 255: return _INLINE_CONSTS_F64[v - 128]
-    if v == 255: return self.literal  # 32-bit literal, caller handles extension
+    if v == 255: return self.literal
    return self.rsrc(v, lane) | ((self.rsrc(v+1, lane) if v < VCC_LO or 256 <= v <= 511 else 0) << 32)

+  def rsrc_reg64(self, v: int, lane: int) -> Reg:
+    """Return Reg for 64-bit source operand. For inline constants, returns 64-bit f64 value."""
+    if 128 <= v < 255: return Reg(_INLINE_CONSTS_F64[v - 128])
+    if v == 255: return Reg(self.literal)
+    if v < SGPR_COUNT: return Reg(self.sgpr[v]._val | (self.sgpr[v+1]._val << 32))
+    if 256 <= v <= 511:
+      vgpr_idx = v - 256
+      return Reg(self.vgpr[lane][vgpr_idx]._val | (self.vgpr[lane][vgpr_idx + 1]._val << 32))
+    return Reg(0)
+
  def pend_sgpr_lane(self, reg: int, lane: int, val: int):
    if reg not in self._pend_sgpr: self._pend_sgpr[reg] = 0
    if val: self._pend_sgpr[reg] |= (1 << lane)
  def commit_pends(self):
-    for reg, val in self._pend_sgpr.items(): self.sgpr[reg] = val
+    for reg, val in self._pend_sgpr.items(): self.sgpr[reg]._val = val
    self._pend_sgpr.clear()

 # Instruction decode
@ -258,333 +289,237 @@ def exec_scalar(st: WaveState, inst: Inst) -> int:
  is_64bit_sop2 = is_64bit_s0 and inst_type is SOP2
  s1 = st.rsrc64(inst.ssrc1, 0) if (is_64bit_sop2 or is_64bit_s0s1) else (st.rsrc(inst.ssrc1, 0) if inst_type in (SOP2, SOPC) else inst.simm16 if inst_type is SOPK else 0)
  d0 = st.rsgpr64(sdst) if (is_64bit_s0 or is_64bit_s0s1) and sdst is not None else (st.rsgpr(sdst) if sdst is not None else 0)
-  exec_mask = st.exec_mask
  literal = inst.simm16 if inst_type is SOPK else st.literal

-  # Execute compiled function
-  result = fn(s0, s1, 0, d0, st.scc, st.vcc, 0, exec_mask, literal, None, {})
+  # Create Reg objects for new calling convention
+  S0, S1, S2, D0 = Reg(s0), Reg(s1), Reg(0), Reg(d0)
+  SCC, VCC, EXEC = Reg(st.scc), Reg(st.vcc), Reg(st.exec_mask)

-  # Apply results
+  # Execute compiled function - fn(S0, S1, S2, D0, SCC, VCC, laneId, EXEC, SIMM16, VGPR, SRC0, VDST)
+  fn(S0, S1, S2, D0, SCC, VCC, 0, EXEC, Reg(literal), None, 0, 0)
+
+  # Apply results from Reg objects
+  is_64bit_d0 = is_64bit_s0 or is_64bit_s0s1
  if sdst is not None:
-    if result.get('d0_64'):
-      st.wsgpr64(sdst, result['d0'])
+    if is_64bit_d0:
+      st.wsgpr64(sdst, D0._val)
    else:
-      st.wsgpr(sdst, result['d0'])
-  if 'scc' in result: st.scc = result['scc']
-  if 'exec' in result: st.exec_mask = result['exec']
-  if 'pc_delta' in result: return result['pc_delta']
+      st.wsgpr(sdst, D0._val)
+  st.scc = SCC._val
+  st.exec_mask = EXEC._val
  return 0

-def exec_vector(st: WaveState, inst: Inst, lane: int, lds: bytearray | None = None) -> None:
-  """Execute vector instruction for one lane."""
+def exec_vector(st: WaveState, inst: Inst, lane: int, lds: bytearray | None = None,
+                d0_override: 'Reg | None' = None, vcc_override: 'Reg | None' = None) -> None:
+  """Execute vector instruction for one lane.
+  d0_override: For VOPC/VOP3-VOPC, use this Reg instead of st.sgpr[vdst] for D0 output.
+  vcc_override: For VOP3SD, use this Reg instead of st.sgpr[sdst] for VCC output.
+  """
  compiled = _get_compiled()
  inst_type, V = type(inst), st.vgpr[lane]

  # Memory ops (not ALU pseudocode)
  if inst_type is FLAT:
    op, addr_reg, data_reg, vdst, offset, saddr = inst.op, inst.addr, inst.data, inst.vdst, _sext(inst.offset, 13), inst.saddr
-    addr = V[addr_reg] | (V[addr_reg+1] << 32)
-    addr = (st.rsgpr64(saddr) + V[addr_reg] + offset) & 0xffffffffffffffff if saddr not in (NULL, 0x7f) else (addr + offset) & 0xffffffffffffffff
+    addr = V[addr_reg]._val | (V[addr_reg+1]._val << 32)
+    addr = (st.rsgpr64(saddr) + V[addr_reg]._val + offset) & 0xffffffffffffffff if saddr not in (NULL, 0x7f) else (addr + offset) & 0xffffffffffffffff
    if op in FLAT_LOAD:
      cnt, sz, sign = FLAT_LOAD[op]
-      for i in range(cnt): val = mem_read(addr + i * sz, sz); V[vdst + i] = _sext(val, sz * 8) & 0xffffffff if sign else val
+      for i in range(cnt): val = mem_read(addr + i * sz, sz); V[vdst + i]._val = _sext(val, sz * 8) & 0xffffffff if sign else val
    elif op in FLAT_STORE:
      cnt, sz = FLAT_STORE[op]
-      for i in range(cnt): mem_write(addr + i * sz, sz, V[data_reg + i] & ((1 << (sz * 8)) - 1))
+      for i in range(cnt): mem_write(addr + i * sz, sz, V[data_reg + i]._val & ((1 << (sz * 8)) - 1))
    elif op in FLAT_D16_LOAD:
      sz, sign, hi = FLAT_D16_LOAD[op]
      val = mem_read(addr, sz)
      if sign: val = _sext(val, sz * 8) & 0xffff
-      if hi: V[vdst] = (V[vdst] & 0xffff) | (val << 16)  # upper 16 bits
-      else: V[vdst] = (V[vdst] & 0xffff0000) | (val & 0xffff)  # lower 16 bits
+      if hi: V[vdst]._val = (V[vdst]._val & 0xffff) | (val << 16)
+      else: V[vdst]._val = (V[vdst]._val & 0xffff0000) | (val & 0xffff)
    elif op in FLAT_D16_STORE:
      sz, hi = FLAT_D16_STORE[op]
-      val = (V[data_reg] >> 16) & 0xffff if hi else V[data_reg] & 0xffff
+      val = (V[data_reg]._val >> 16) & 0xffff if hi else V[data_reg]._val & 0xffff
      mem_write(addr, sz, val & ((1 << (sz * 8)) - 1))
    else: raise NotImplementedError(f"FLAT op {op}")
    return

  if inst_type is DS:
-    op, addr, vdst = inst.op, (V[inst.addr] + inst.offset0) & 0xffff, inst.vdst
+    op, addr, vdst = inst.op, (V[inst.addr]._val + inst.offset0) & 0xffff, inst.vdst
    if op in DS_LOAD:
      cnt, sz, sign = DS_LOAD[op]
-      for i in range(cnt): val = int.from_bytes(lds[addr+i*sz:addr+i*sz+sz], 'little'); V[vdst + i] = _sext(val, sz * 8) & 0xffffffff if sign else val
+      for i in range(cnt): val = int.from_bytes(lds[addr+i*sz:addr+i*sz+sz], 'little'); V[vdst + i]._val = _sext(val, sz * 8) & 0xffffffff if sign else val
    elif op in DS_STORE:
      cnt, sz = DS_STORE[op]
-      for i in range(cnt): lds[addr+i*sz:addr+i*sz+sz] = (V[inst.data0 + i] & ((1 << (sz * 8)) - 1)).to_bytes(sz, 'little')
+      for i in range(cnt): lds[addr+i*sz:addr+i*sz+sz] = (V[inst.data0 + i]._val & ((1 << (sz * 8)) - 1)).to_bytes(sz, 'little')
    else: raise NotImplementedError(f"DS op {op}")
    return

  # VOPD: dual-issue, execute two ops using VOP2/VOP3 compiled functions
-  # Both ops execute simultaneously using pre-instruction values, so read all inputs first
  if inst_type is VOPD:
    vdsty = (inst.vdsty << 1) | ((inst.vdstx & 1) ^ 1)
    # Read all source operands BEFORE any writes (dual-issue semantics)
-    sx0, sx1 = st.rsrc(inst.srcx0, lane), V[inst.vsrcx1]
-    sy0, sy1 = st.rsrc(inst.srcy0, lane), V[inst.vsrcy1]
-    dx0, dy0 = V[inst.vdstx], V[vdsty]
-    # Execute X op
-    res_x = None
+    sx0, sx1 = Reg(st.rsrc(inst.srcx0, lane)), Reg(V[inst.vsrcx1]._val)
+    sy0, sy1 = Reg(st.rsrc(inst.srcy0, lane)), Reg(V[inst.vsrcy1]._val)
+    dx0, dy0 = Reg(V[inst.vdstx]._val), Reg(V[vdsty]._val)
+    st._scc_reg._val = st.scc
    if (op_x := _VOPD_TO_VOP.get(inst.opx)):
      if (fn_x := compiled.get(type(op_x), {}).get(op_x)):
-        res_x = fn_x(sx0, sx1, 0, dx0, st.scc, st.vcc, lane, st.exec_mask, st.literal, None, {})
-    # Execute Y op
-    res_y = None
+        fn_x(sx0, sx1, Reg(0), dx0, st._scc_reg, st.sgpr[VCC_LO], lane, st.sgpr[EXEC_LO], Reg(st.literal), None, Reg(0), Reg(inst.vdstx))
    if (op_y := _VOPD_TO_VOP.get(inst.opy)):
      if (fn_y := compiled.get(type(op_y), {}).get(op_y)):
-        res_y = fn_y(sy0, sy1, 0, dy0, st.scc, st.vcc, lane, st.exec_mask, st.literal, None, {})
-    # Write results after both ops complete
-    if res_x is not None: V[inst.vdstx] = res_x['d0']
-    if res_y is not None: V[vdsty] = res_y['d0']
+        fn_y(sy0, sy1, Reg(0), dy0, st._scc_reg, st.sgpr[VCC_LO], lane, st.sgpr[EXEC_LO], Reg(st.literal), None, Reg(0), Reg(vdsty))
+    V[inst.vdstx]._val, V[vdsty]._val = dx0._val, dy0._val
+    st.scc = st._scc_reg._val
    return

-  # VOP3SD: has extra scalar dest for carry output
-  if inst_type is VOP3SD:
-    op = VOP3SDOp(inst.op)
-    fn = compiled.get(VOP3SDOp, {}).get(op)
-    if fn is None: raise NotImplementedError(f"{op.name} not in pseudocode")
-    s0, s1, s2 = st.rsrc(inst.src0, lane), st.rsrc(inst.src1, lane), st.rsrc(inst.src2, lane)
-    # For 64-bit src2 ops (V_MAD_U64_U32, V_MAD_I64_I32), read from consecutive registers
-    mad64_ops = (VOP3SDOp.V_MAD_U64_U32, VOP3SDOp.V_MAD_I64_I32)
-    if op in mad64_ops:
-      if inst.src2 >= 256:  # VGPR
-        s2 = V[inst.src2 - 256] | (V[inst.src2 - 256 + 1] << 32)
-      else:  # SGPR - read 64-bit from consecutive SGPRs
-        s2 = st.rsgpr64(inst.src2)
-    d0 = V[inst.vdst]
-    # For carry-in operations (V_*_CO_CI_*), src2 register contains the carry bitmask (not VCC).
-    # The pseudocode uses VCC but in VOP3SD encoding, the actual carry source is inst.src2.
-    # We pass the src2 register value as 'vcc' to the interpreter so it reads the correct carry.
-    carry_ops = (VOP3SDOp.V_ADD_CO_CI_U32, VOP3SDOp.V_SUB_CO_CI_U32, VOP3SDOp.V_SUBREV_CO_CI_U32)
-    vcc_for_exec = st.rsgpr64(inst.src2) if op in carry_ops else st.vcc
-    result = fn(s0, s1, s2, d0, st.scc, vcc_for_exec, lane, st.exec_mask, st.literal, None, {})
-    # Write result - handle 64-bit destinations
-    if result.get('d0_64'):
-      V[inst.vdst] = result['d0'] & 0xffffffff
-      V[inst.vdst + 1] = (result['d0'] >> 32) & 0xffffffff
-    else:
-      V[inst.vdst] = result['d0'] & 0xffffffff
-    if result.get('vcc_lane') is not None:
-      st.pend_sgpr_lane(inst.sdst, lane, result['vcc_lane'])
-    return
-
-
-
-  # Get op enum and sources (None means "no source" for that operand)
+  # Determine instruction format and get function
+  is_vop3_vopc = False
+  is_readlane = False
  if inst_type is VOP1:
    if inst.op == VOP1Op.V_NOP: return
    op_cls, op, src0, src1, src2, vdst = VOP1Op, VOP1Op(inst.op), inst.src0, None, None, inst.vdst
+    # V_READFIRSTLANE_B32 writes to SGPR, not VGPR
+    is_readlane = inst.op == VOP1Op.V_READFIRSTLANE_B32
  elif inst_type is VOP2:
    op_cls, op, src0, src1, src2, vdst = VOP2Op, VOP2Op(inst.op), inst.src0, inst.vsrc1 + 256, None, inst.vdst
  elif inst_type is VOP3:
-    # VOP3 ops 0-255 are VOPC comparisons encoded as VOP3 (use VOPCOp pseudocode)
    if inst.op < 256:
+      # VOP3-encoded VOPC - destination is an SGPR (vdst field)
      op_cls, op, src0, src1, src2, vdst = VOPCOp, VOPCOp(inst.op), inst.src0, inst.src1, None, inst.vdst
+      is_vop3_vopc = True
    else:
      op_cls, op, src0, src1, src2, vdst = VOP3Op, VOP3Op(inst.op), inst.src0, inst.src1, inst.src2, inst.vdst
-      # V_PERM_B32: byte permutation - not in pseudocode PDF, implement directly
-      # D0[byte_i] = selector[byte_i] < 8 ? {src0, src1}[selector[byte_i]] : (selector[byte_i] >= 0xD ? 0xFF : 0x00)
-      if op == VOP3Op.V_PERM_B32:
-        s0, s1, s2 = st.rsrc(inst.src0, lane), st.rsrc(inst.src1, lane), st.rsrc(inst.src2, lane)
-        # Combine src1 and src0 into 8-byte value: src1 is bytes 0-3, src0 is bytes 4-7
-        combined = (s1 & 0xffffffff) | ((s0 & 0xffffffff) << 32)
-        result = 0
-        for i in range(4):  # 4 result bytes
-          sel = (s2 >> (i * 8)) & 0xff  # byte selector for this position
-          if sel <= 7: result |= (((combined >> (sel * 8)) & 0xff) << (i * 8))  # select byte from combined
-          elif sel >= 0xd: result |= (0xff << (i * 8))  # 0xD-0xF: constant 0xFF
-          # else 0x8-0xC: constant 0x00 (already 0)
-        V[vdst] = result & 0xffffffff
-        return
+      # V_READFIRSTLANE_B32 and V_READLANE_B32 write to SGPR
+      is_readlane = inst.op in (VOP3Op.V_READFIRSTLANE_B32, VOP3Op.V_READLANE_B32)
+  elif inst_type is VOP3SD:
+    op_cls, op, src0, src1, src2, vdst = VOP3SDOp, VOP3SDOp(inst.op), inst.src0, inst.src1, inst.src2, inst.vdst
  elif inst_type is VOPC:
    op_cls, op, src0, src1, src2, vdst = VOPCOp, VOPCOp(inst.op), inst.src0, inst.vsrc1 + 256, None, VCC_LO
  elif inst_type is VOP3P:
-    # VOP3P: Packed 16-bit operations using compiled functions
-    op = VOP3POp(inst.op)
-    # WMMA: wave-level matrix multiply-accumulate (special handling - needs cross-lane access)
-    if op in (VOP3POp.V_WMMA_F32_16X16X16_F16, VOP3POp.V_WMMA_F32_16X16X16_BF16, VOP3POp.V_WMMA_F16_16X16X16_F16):
-      if lane == 0:  # Only execute once per wave, write results for all lanes
-        exec_wmma(st, inst, op)
+    op_cls, op, src0, src1, src2, vdst = VOP3POp, VOP3POp(inst.op), inst.src0, inst.src1, inst.src2, inst.vdst
+    # WMMA instructions are handled specially (only execute for lane 0)
+    if op in (VOP3POp.V_WMMA_F32_16X16X16_F16, VOP3POp.V_WMMA_F16_16X16X16_F16):
+      if lane == 0: exec_wmma(st, inst, op)
      return
-    # V_FMA_MIX: Mixed precision FMA - inputs can be f16 or f32 controlled by opsel
-    if op in (VOP3POp.V_FMA_MIX_F32, VOP3POp.V_FMA_MIXLO_F16, VOP3POp.V_FMA_MIXHI_F16):
-      opsel = getattr(inst, 'opsel', 0)
-      opsel_hi = getattr(inst, 'opsel_hi', 0)
-      neg = getattr(inst, 'neg', 0)
-      neg_hi = getattr(inst, 'neg_hi', 0)
-      vdst = inst.vdst
-      # Read raw 32-bit values - for V_FMA_MIX, sources can be either f32 or f16
-      s0_raw = st.rsrc(inst.src0, lane)
-      s1_raw = st.rsrc(inst.src1, lane)
-      s2_raw = st.rsrc(inst.src2, lane) if inst.src2 is not None else 0
-      # opsel[i]=0: use as f32, opsel[i]=1: use hi f16 as f32
-      # For src0: opsel[0], for src1: opsel[1], for src2: opsel[2]
-      if opsel & 1: s0 = _f16((s0_raw >> 16) & 0xffff)  # hi f16 -> f32
-      else: s0 = _f32(s0_raw)  # use as f32
-      if opsel & 2: s1 = _f16((s1_raw >> 16) & 0xffff)
-      else: s1 = _f32(s1_raw)
-      if opsel & 4: s2 = _f16((s2_raw >> 16) & 0xffff)
-      else: s2 = _f32(s2_raw)
-      # Apply neg modifiers (for f32 values)
-      if neg & 1: s0 = -s0
-      if neg & 2: s1 = -s1
-      if neg & 4: s2 = -s2
-      # Compute FMA: d = s0 * s1 + s2
-      result = s0 * s1 + s2
-      V = st.vgpr[lane]
-      if op == VOP3POp.V_FMA_MIX_F32:
-        V[vdst] = _i32(result)
-      elif op == VOP3POp.V_FMA_MIXLO_F16:
-        lo = _i16(result) & 0xffff
-        V[vdst] = (V[vdst] & 0xffff0000) | lo
-      else:  # V_FMA_MIXHI_F16
-        hi = _i16(result) & 0xffff
-        V[vdst] = (V[vdst] & 0x0000ffff) | (hi << 16)
-      return
-    # Use rsrc_f16 for VOP3P to get correct f16 inline constants
-    s0_raw = st.rsrc_f16(inst.src0, lane)
-    s1_raw = st.rsrc_f16(inst.src1, lane)
-    s2_raw = st.rsrc_f16(inst.src2, lane) if inst.src2 is not None else 0
-    # Handle opsel (which 16-bit halves to use for each source)
-    opsel = getattr(inst, 'opsel', 0)
-    opsel_hi = getattr(inst, 'opsel_hi', 3)  # Default: use hi for hi result
-    opsel_hi2 = getattr(inst, 'opsel_hi2', 1)  # Default for src2
-    # Handle neg modifiers for VOP3P
-    # neg applies to lo result inputs, neg_hi applies to hi result inputs
-    neg = getattr(inst, 'neg', 0)
-    neg_hi = getattr(inst, 'neg_hi', 0)
-    # Build "virtual" sources with halves arranged for pseudocode: lo half goes to [15:0], hi half goes to [31:16]
-    # opsel bit 0/1/2 selects which half of src0/1/2 goes to the LO result
-    # opsel_hi bit 0/1 selects which half of src0/1 goes to the HI result
-    s0_lo = (s0_raw >> 16) & 0xffff if (opsel & 1) else s0_raw & 0xffff
-    s1_lo = (s1_raw >> 16) & 0xffff if (opsel & 2) else s1_raw & 0xffff
-    s2_lo = (s2_raw >> 16) & 0xffff if (opsel & 4) else s2_raw & 0xffff
-    s0_hi = (s0_raw >> 16) & 0xffff if (opsel_hi & 1) else s0_raw & 0xffff
-    s1_hi = (s1_raw >> 16) & 0xffff if (opsel_hi & 2) else s1_raw & 0xffff
-    s2_hi = (s2_raw >> 16) & 0xffff if opsel_hi2 else s2_raw & 0xffff
-    # Apply neg to lo result inputs (toggle f16 sign bit)
-    if neg & 1: s0_lo ^= 0x8000
-    if neg & 2: s1_lo ^= 0x8000
-    if neg & 4: s2_lo ^= 0x8000
-    # Apply neg_hi to hi result inputs
-    if neg_hi & 1: s0_hi ^= 0x8000
-    if neg_hi & 2: s1_hi ^= 0x8000
-    if neg_hi & 4: s2_hi ^= 0x8000
-    # Pack into format expected by pseudocode: [31:16] = hi input, [15:0] = lo input
-    s0 = (s0_hi << 16) | s0_lo
-    s1 = (s1_hi << 16) | s1_lo
-    s2 = (s2_hi << 16) | s2_lo
-    op_cls, vdst = VOP3POp, inst.vdst
-    fn = compiled.get(op_cls, {}).get(op)
-    if fn is None: raise NotImplementedError(f"{op.name} not in pseudocode")
-    result = fn(s0, s1, s2, 0, st.scc, st.vcc, lane, st.exec_mask, st.literal, None, {})
-    st.vgpr[lane][vdst] = result['d0'] & 0xffffffff
-    return
  else: raise NotImplementedError(f"Unknown vector type {inst_type}")

  fn = compiled.get(op_cls, {}).get(op)
  if fn is None: raise NotImplementedError(f"{op.name} not in pseudocode")

-  # Read sources (with VOP3 modifiers if applicable)
-  neg, abs_ = (getattr(inst, 'neg', 0), getattr(inst, 'abs', 0)) if inst_type is VOP3 else (0, 0)
-  opsel = getattr(inst, 'opsel', 0) if inst_type is VOP3 else 0
-  def mod_src(val: int, idx: int) -> int:
-    if (abs_ >> idx) & 1: val = _i32(abs(_f32(val)))
-    if (neg >> idx) & 1: val = _i32(-_f32(val))
-    return val
-  def mod_src64(val: int, idx: int) -> int:
-    if (abs_ >> idx) & 1: val = _i64(abs(_f64(val)))
-    if (neg >> idx) & 1: val = _i64(-_f64(val))
-    return val
-
-  # Determine if sources are 64-bit based on instruction type
-  # For 64-bit shift ops: src0 is 32-bit (shift amount), src1 is 64-bit (value to shift)
-  # For most other _B64/_I64/_U64/_F64 ops: all sources are 64-bit
-  is_64bit_op = op.name.endswith(('_B64', '_I64', '_U64', '_F64'))
-  # V_LDEXP_F64: src0 is 64-bit float, src1 is 32-bit integer exponent
-  is_ldexp_64 = op in (VOP3Op.V_LDEXP_F64,)
-  is_shift_64 = op in (VOP3Op.V_LSHLREV_B64, VOP3Op.V_LSHRREV_B64, VOP3Op.V_ASHRREV_I64)
-  # 16-bit source ops: use precomputed sets instead of string checks
-  has_16bit_type = op in _VOP3_16BIT_OPS or op in _VOP1_16BIT_OPS or op in _VOP2_16BIT_OPS
-  is_16bit_src = op_cls is VOP3Op and op in _VOP3_16BIT_OPS and op not in _CVT_32_64_SRC_OPS
-  # VOP2 16-bit ops use f16 inline constants for src0 (vsrc1 is always a VGPR, no inline constants)
-  is_vop2_16bit = op_cls is VOP2Op and op in _VOP2_16BIT_OPS
-
-  if is_shift_64:
-    s0 = mod_src(st.rsrc(src0, lane), 0)  # shift amount is 32-bit
-    s1 = st.rsrc64(src1, lane) if src1 is not None else 0  # value to shift is 64-bit
-    s2 = mod_src(st.rsrc(src2, lane), 2) if src2 is not None else 0
-  elif is_ldexp_64:
-    s0 = mod_src64(st.rsrc64(src0, lane), 0)  # mantissa is 64-bit float
-    s1 = mod_src(st.rsrc(src1, lane), 1) if src1 is not None else 0  # exponent is 32-bit int
-    s2 = mod_src(st.rsrc(src2, lane), 2) if src2 is not None else 0
-  elif is_64bit_op:
-    # 64-bit ops: apply neg/abs modifiers using f64 interpretation for float ops
-    s0 = mod_src64(st.rsrc64(src0, lane), 0)
-    s1 = mod_src64(st.rsrc64(src1, lane), 1) if src1 is not None else 0
-    s2 = mod_src64(st.rsrc64(src2, lane), 2) if src2 is not None else 0
-  elif is_16bit_src:
-    # For 16-bit source ops, opsel bits select which half to use
-    s0_raw = mod_src(st.rsrc(src0, lane), 0)
-    s1_raw = mod_src(st.rsrc(src1, lane), 1) if src1 is not None else 0
-    s2_raw = mod_src(st.rsrc(src2, lane), 2) if src2 is not None else 0
-    # opsel[0] selects hi(1) or lo(0) for src0, opsel[1] for src1, opsel[2] for src2
-    s0 = ((s0_raw >> 16) & 0xffff) if (opsel & 1) else (s0_raw & 0xffff)
-    s1 = ((s1_raw >> 16) & 0xffff) if (opsel & 2) else (s1_raw & 0xffff)
-    s2 = ((s2_raw >> 16) & 0xffff) if (opsel & 4) else (s2_raw & 0xffff)
-  elif is_vop2_16bit:
-    # VOP2 16-bit ops: src0 can use f16 inline constants, vsrc1 is always a VGPR (no inline constants)
-    s0 = mod_src(st.rsrc_f16(src0, lane), 0)
-    s1 = mod_src(st.rsrc(src1, lane), 1) if src1 is not None else 0
-    s2 = mod_src(st.rsrc(src2, lane), 2) if src2 is not None else 0
+  # Build source Regs - get the actual register or create temp for inline constants
+  # VOP3P uses f16 inline constants (16-bit value in low half only)
+  if inst_type is VOP3P:
+    S0 = st.rsrc_reg_f16(src0, lane)
+    S1 = st.rsrc_reg_f16(src1, lane) if src1 is not None else Reg(0)
+    S2 = st.rsrc_reg_f16(src2, lane) if src2 is not None else Reg(0)
+    # Apply op_sel_hi modifiers: control which half is used for hi-half computation
+    # opsel_hi[0]=0 means src0 hi comes from lo half, =1 means from hi half (default)
+    # opsel_hi[1]=0 means src1 hi comes from lo half, =1 means from hi half (default)
+    # opsel_hi2=0 means src2 hi comes from lo half, =1 means from hi half (default)
+    opsel_hi = getattr(inst, 'opsel_hi', 3)  # default 0b11
+    opsel_hi2 = getattr(inst, 'opsel_hi2', 1)  # default 1
+    # If opsel_hi bit is 0, replicate lo half to hi half
+    if not (opsel_hi & 1):  # src0 hi from lo
+      lo = S0._val & 0xffff
+      S0 = Reg((lo << 16) | lo)
+    if not (opsel_hi & 2):  # src1 hi from lo
+      lo = S1._val & 0xffff
+      S1 = Reg((lo << 16) | lo)
+    if not opsel_hi2:  # src2 hi from lo
+      lo = S2._val & 0xffff
+      S2 = Reg((lo << 16) | lo)
  else:
-    s0 = mod_src(st.rsrc(src0, lane), 0)
-    s1 = mod_src(st.rsrc(src1, lane), 1) if src1 is not None else 0
-    s2 = mod_src(st.rsrc(src2, lane), 2) if src2 is not None else 0
-  d0 = V[vdst] if not is_64bit_op else (V[vdst] | (V[vdst + 1] << 32))
-
-  # V_CNDMASK_B32: VOP3 encoding uses src2 as mask (not VCC); VOP2 uses VCC implicitly
-  # Pass the correct mask as vcc to the function so pseudocode VCC.u64[laneId] works correctly
-  vcc_for_fn = st.rsgpr64(src2) if op in (VOP3Op.V_CNDMASK_B32,) and inst_type is VOP3 and src2 is not None and src2 < 256 else st.vcc
-
-  # Execute compiled function - pass src0_idx and vdst_idx for lane instructions
-  # For VGPR access: src0 index is the VGPR number (src0 - 256 if VGPR, else src0 for SGPR)
-  src0_idx = (src0 - 256) if src0 is not None and src0 >= 256 else (src0 if src0 is not None else 0)
-  result = fn(s0, s1, s2, d0, st.scc, vcc_for_fn, lane, st.exec_mask, st.literal, st.vgpr, {}, src0_idx, vdst)
-
-  # Apply results
-  if 'vgpr_write' in result:
-    # Lane instruction wrote to VGPR: (lane, vgpr_idx, value)
-    wr_lane, wr_idx, wr_val = result['vgpr_write']
-    st.vgpr[wr_lane][wr_idx] = wr_val
-  if 'vcc_lane' in result:
-    # VOP2 carry instructions (V_ADD_CO_CI_U32, V_SUB_CO_CI_U32, V_SUBREV_CO_CI_U32) write carry to VCC implicitly
-    # VOPC and VOP3-encoded VOPC write to vdst (which is VCC_LO for VOPC, inst.sdst for VOP3)
-    vcc_dst = VCC_LO if op_cls is VOP2Op and op in (VOP2Op.V_ADD_CO_CI_U32, VOP2Op.V_SUB_CO_CI_U32, VOP2Op.V_SUBREV_CO_CI_U32) else vdst
-    st.pend_sgpr_lane(vcc_dst, lane, result['vcc_lane'])
-  if 'exec_lane' in result:
-    # V_CMPX instructions write to EXEC per-lane
-    st.pend_sgpr_lane(EXEC_LO, lane, result['exec_lane'])
-  if 'd0' in result and op_cls not in (VOPCOp,) and 'vgpr_write' not in result:
-    # V_READFIRSTLANE_B32 and V_READLANE_B32 write to SGPR, not VGPR
-    # V_WRITELANE_B32 uses vgpr_write for cross-lane writes, don't overwrite with d0
-    writes_to_sgpr = op in (VOP1Op.V_READFIRSTLANE_B32,) or \
-                     (op_cls is VOP3Op and op in (VOP3Op.V_READFIRSTLANE_B32, VOP3Op.V_READLANE_B32))
-    # Check for 16-bit destination ops (opsel[3] controls hi/lo write)
-    is_16bit_dst = op in _VOP3_16BIT_DST_OPS or op in _VOP1_16BIT_DST_OPS
-    if writes_to_sgpr:
-      st.wsgpr(vdst, result['d0'] & 0xffffffff)
-    elif result.get('d0_64') or is_64bit_op:
-      V[vdst] = result['d0'] & 0xffffffff
-      V[vdst + 1] = (result['d0'] >> 32) & 0xffffffff
-    elif is_16bit_dst and inst_type is VOP3:
-      # VOP3 16-bit ops: opsel[3] (bit 3 of opsel field) controls hi/lo destination
-      if opsel & 8:  # opsel[3] = 1: write to high 16 bits
-        V[vdst] = (V[vdst] & 0x0000ffff) | ((result['d0'] & 0xffff) << 16)
-      else:  # opsel[3] = 0: write to low 16 bits
-        V[vdst] = (V[vdst] & 0xffff0000) | (result['d0'] & 0xffff)
+    # Check if this is a 64-bit F64 op - needs 64-bit source reads for f64 operands
+    # V_LDEXP_F64: S0 is f64, S1 is i32 (exponent)
+    # V_ADD_F64, V_MUL_F64, etc: S0 and S1 are f64
+    # VOP1 F64 ops (V_TRUNC_F64, V_FLOOR_F64, etc): S0 is f64
+    is_f64_op = hasattr(op, 'name') and '_F64' in op.name
+    is_ldexp_f64 = hasattr(op, 'name') and op.name == 'V_LDEXP_F64'
+    if is_f64_op:
+      S0 = st.rsrc_reg64(src0, lane)
+      # V_LDEXP_F64: S1 is i32 exponent, not f64
+      if is_ldexp_f64:
+        S1 = st.rsrc_reg(src1, lane) if src1 is not None else Reg(0)
+      else:
+        S1 = st.rsrc_reg64(src1, lane) if src1 is not None else Reg(0)
+      S2 = st.rsrc_reg64(src2, lane) if src2 is not None else Reg(0)
    else:
-      V[vdst] = result['d0'] & 0xffffffff
+      S0 = st.rsrc_reg(src0, lane)
+      S1 = st.rsrc_reg(src1, lane) if src1 is not None else Reg(0)
+      S2 = st.rsrc_reg(src2, lane) if src2 is not None else Reg(0)
+    # VOP3SD V_MAD_U64_U32 and V_MAD_I64_I32 need S2 as 64-bit from VGPR pair
+    if inst_type is VOP3SD and op in (VOP3SDOp.V_MAD_U64_U32, VOP3SDOp.V_MAD_I64_I32) and src2 is not None:
+      if 256 <= src2 <= 511:  # VGPR
+        vgpr_idx = src2 - 256
+        S2 = Reg(V[vgpr_idx]._val | (V[vgpr_idx + 1]._val << 32))
+
+  # Apply source modifiers (neg, abs) for VOP3/VOP3SD
+  if inst_type in (VOP3, VOP3SD):
+    neg, abs_mod = getattr(inst, 'neg', 0), getattr(inst, 'abs', 0)
+    if neg or abs_mod:
+      # Apply to f32 values - need to handle as float
+      import struct
+      def apply_mods(reg, neg_bit, abs_bit):
+        val = reg._val
+        f = struct.unpack('<f', struct.pack('<I', val & 0xffffffff))[0]
+        if abs_bit: f = abs(f)
+        if neg_bit: f = -f
+        return Reg(struct.unpack('<I', struct.pack('<f', f))[0])
+      if neg & 1 or abs_mod & 1: S0 = apply_mods(S0, neg & 1, abs_mod & 1)
+      if neg & 2 or abs_mod & 2: S1 = apply_mods(S1, neg & 2, abs_mod & 2)
+      if neg & 4 or abs_mod & 4: S2 = apply_mods(S2, neg & 4, abs_mod & 4)
+
+  # Apply opsel for VOP3 f16 operations - select which half to use
+  # opsel[0]: src0, opsel[1]: src1, opsel[2]: src2 (0=lo, 1=hi)
+  if inst_type is VOP3:
+    opsel = getattr(inst, 'opsel', 0)
+    if opsel:
+      # If opsel bit is set, swap lo and hi so that .f16 reads the hi half
+      if opsel & 1:  # src0 from hi
+        S0 = Reg(((S0._val >> 16) & 0xffff) | (S0._val << 16))
+      if opsel & 2:  # src1 from hi
+        S1 = Reg(((S1._val >> 16) & 0xffff) | (S1._val << 16))
+      if opsel & 4:  # src2 from hi
+        S2 = Reg(((S2._val >> 16) & 0xffff) | (S2._val << 16))
+
+  # For VOPC and VOP3-encoded VOPC, D0 is an SGPR (VCC_LO for VOPC, vdst for VOP3 VOPC)
+  # V_READFIRSTLANE_B32 and V_READLANE_B32 also write to SGPR
+  # Use d0_override if provided (for batch execution with shared output register)
+  is_vopc = inst_type is VOPC or (inst_type is VOP3 and is_vop3_vopc)
+  if is_vopc:
+    D0 = d0_override if d0_override is not None else st.sgpr[VCC_LO if inst_type is VOPC else vdst]
+  elif is_readlane:
+    D0 = st.sgpr[vdst]
+  else:
+    D0 = V[vdst]
+
+  # Execute compiled function - D0 is modified in place
+  st._scc_reg._val = st.scc
+  # For VOP3SD, pass sdst register as VCC parameter (carry-out destination)
+  # Use vcc_override if provided (for batch execution with shared output register)
+  # For VOP3 V_CNDMASK_B32, src2 specifies the condition selector (not VCC)
+  if inst_type is VOP3SD:
+    vcc_reg = vcc_override if vcc_override is not None else st.sgpr[inst.sdst]
+  elif inst_type is VOP3 and op == VOP3Op.V_CNDMASK_B32 and src2 is not None:
+    vcc_reg = st.rsrc_reg(src2, lane)  # Use src2 as condition
+  else:
+    vcc_reg = st.sgpr[VCC_LO]
+  # SRC0/VDST are VGPR indices (0-255), not hardware encoding (256-511)
+  src0_idx = (src0 - 256) if src0 and src0 >= 256 else (src0 if src0 else 0)
+  result = fn(S0, S1, S2, D0, st._scc_reg, vcc_reg, lane, st.sgpr[EXEC_LO], Reg(st.literal), st.vgpr, Reg(src0_idx), Reg(vdst))
+  st.scc = st._scc_reg._val
+
+  # Handle special results
+  if result:
+    if 'vgpr_write' in result:
+      wr_lane, wr_idx, wr_val = result['vgpr_write']
+      st.vgpr[wr_lane][wr_idx]._val = wr_val
+
+  # 64-bit destination: write high 32 bits to next VGPR (determined from op name)
+  is_64bit_dst = not is_vopc and not is_readlane and hasattr(op, 'name') and \
+                 any(s in op.name for s in ('_B64', '_I64', '_U64', '_F64'))
+  if is_64bit_dst:
+    V[vdst + 1]._val = (D0._val >> 32) & 0xffffffff
+    D0._val = D0._val & 0xffffffff  # Keep only low 32 bits in D0

 # ═══════════════════════════════════════════════════════════════════════════════
 # WMMA (Wave Matrix Multiply-Accumulate)
@ -635,12 +570,12 @@ def exec_wmma(st: WaveState, inst, op: VOP3POp) -> None:
      lane, reg = (i // 2) % 32, (i // 2) // 32
      lo = _i16(mat_d[i]) & 0xffff
      hi = _i16(mat_d[i + 1]) & 0xffff
-      st.vgpr[lane][vdst + reg] = (hi << 16) | lo
+      st.vgpr[lane][vdst + reg]._val = (hi << 16) | lo
  else:
    # Output is f32
    for i in range(256):
      lane, reg = i % 32, i // 32
-      st.vgpr[lane][vdst + reg] = _i32(mat_d[i])
+      st.vgpr[lane][vdst + reg]._val = _i32(mat_d[i])

 # ═══════════════════════════════════════════════════════════════════════════════
 # MAIN EXECUTION LOOP
@ -649,6 +584,113 @@ def exec_wmma(st: WaveState, inst, op: VOP3POp) -> None:
 SCALAR_TYPES = {SOP1, SOP2, SOPC, SOPK, SOPP, SMEM}
 VECTOR_TYPES = {VOP1, VOP2, VOP3, VOP3SD, VOPC, FLAT, DS, VOPD, VOP3P}

+# Pre-cache compiled functions for fast lookup
+_COMPILED_CACHE: dict | None = None
+def _get_fn(op_cls, op):
+  global _COMPILED_CACHE
+  if _COMPILED_CACHE is None: _COMPILED_CACHE = _get_compiled()
+  return _COMPILED_CACHE.get(op_cls, {}).get(op)
+
+def exec_vector_batch(st: WaveState, inst: Inst, exec_mask: int, n_lanes: int, lds: bytearray | None = None) -> None:
+  """Execute vector instruction for all active lanes at once."""
+  compiled = _get_compiled()
+  inst_type = type(inst)
+  vgpr = st.vgpr
+
+  # Memory ops - still per-lane but inlined
+  if inst_type is FLAT:
+    op, addr_reg, data_reg, vdst, offset, saddr = inst.op, inst.addr, inst.data, inst.vdst, _sext(inst.offset, 13), inst.saddr
+    if op in FLAT_LOAD:
+      cnt, sz, sign = FLAT_LOAD[op]
+      for lane in range(n_lanes):
+        if not (exec_mask & (1 << lane)): continue
+        V = vgpr[lane]
+        addr = V[addr_reg]._val | (V[addr_reg+1]._val << 32)
+        addr = (st.rsgpr64(saddr) + V[addr_reg]._val + offset) & 0xffffffffffffffff if saddr not in (NULL, 0x7f) else (addr + offset) & 0xffffffffffffffff
+        for i in range(cnt): val = mem_read(addr + i * sz, sz); V[vdst + i]._val = _sext(val, sz * 8) & 0xffffffff if sign else val
+    elif op in FLAT_STORE:
+      cnt, sz = FLAT_STORE[op]
+      for lane in range(n_lanes):
+        if not (exec_mask & (1 << lane)): continue
+        V = vgpr[lane]
+        addr = V[addr_reg]._val | (V[addr_reg+1]._val << 32)
+        addr = (st.rsgpr64(saddr) + V[addr_reg]._val + offset) & 0xffffffffffffffff if saddr not in (NULL, 0x7f) else (addr + offset) & 0xffffffffffffffff
+        for i in range(cnt): mem_write(addr + i * sz, sz, V[data_reg + i]._val & ((1 << (sz * 8)) - 1))
+    elif op in FLAT_D16_LOAD:
+      sz, sign, hi = FLAT_D16_LOAD[op]
+      for lane in range(n_lanes):
+        if not (exec_mask & (1 << lane)): continue
+        V = vgpr[lane]
+        addr = V[addr_reg]._val | (V[addr_reg+1]._val << 32)
+        addr = (st.rsgpr64(saddr) + V[addr_reg]._val + offset) & 0xffffffffffffffff if saddr not in (NULL, 0x7f) else (addr + offset) & 0xffffffffffffffff
+        val = mem_read(addr, sz)
+        if sign: val = _sext(val, sz * 8) & 0xffff
+        if hi: V[vdst]._val = (V[vdst]._val & 0xffff) | (val << 16)
+        else: V[vdst]._val = (V[vdst]._val & 0xffff0000) | (val & 0xffff)
+    elif op in FLAT_D16_STORE:
+      sz, hi = FLAT_D16_STORE[op]
+      for lane in range(n_lanes):
+        if not (exec_mask & (1 << lane)): continue
+        V = vgpr[lane]
+        addr = V[addr_reg]._val | (V[addr_reg+1]._val << 32)
+        addr = (st.rsgpr64(saddr) + V[addr_reg]._val + offset) & 0xffffffffffffffff if saddr not in (NULL, 0x7f) else (addr + offset) & 0xffffffffffffffff
+        val = (V[data_reg]._val >> 16) & 0xffff if hi else V[data_reg]._val & 0xffff
+        mem_write(addr, sz, val & ((1 << (sz * 8)) - 1))
+    else: raise NotImplementedError(f"FLAT op {op}")
+    return
+
+  if inst_type is DS:
+    op, vdst = inst.op, inst.vdst
+    if op in DS_LOAD:
+      cnt, sz, sign = DS_LOAD[op]
+      for lane in range(n_lanes):
+        if not (exec_mask & (1 << lane)): continue
+        V = vgpr[lane]
+        addr = (V[inst.addr]._val + inst.offset0) & 0xffff
+        for i in range(cnt): val = int.from_bytes(lds[addr+i*sz:addr+i*sz+sz], 'little'); V[vdst + i]._val = _sext(val, sz * 8) & 0xffffffff if sign else val
+    elif op in DS_STORE:
+      cnt, sz = DS_STORE[op]
+      for lane in range(n_lanes):
+        if not (exec_mask & (1 << lane)): continue
+        V = vgpr[lane]
+        addr = (V[inst.addr]._val + inst.offset0) & 0xffff
+        for i in range(cnt): lds[addr+i*sz:addr+i*sz+sz] = (V[inst.data0 + i]._val & ((1 << (sz * 8)) - 1)).to_bytes(sz, 'little')
+    else: raise NotImplementedError(f"DS op {op}")
+    return
+
+  # For VOPC, VOP3-encoded VOPC, and VOP3SD, we write per-lane bits to an SGPR.
+  # The pseudocode does D0.u64[laneId] = bit or VCC.u64[laneId] = bit.
+  # To avoid corrupting reads from the same SGPR, use a shared output Reg(0).
+  # Exception: CMPX instructions write to EXEC (not D0/VCC).
+  d0_override, vcc_override = None, None
+  vopc_dst, vop3sd_dst = None, None
+  is_cmpx = False
+  if inst_type is VOPC:
+    op = VOPCOp(inst.op)
+    is_cmpx = 'CMPX' in op.name
+    if not is_cmpx:  # Regular CMP writes to VCC
+      d0_override, vopc_dst = Reg(0), VCC_LO
+    else:  # CMPX writes to EXEC - clear it first, accumulate per-lane
+      st.sgpr[EXEC_LO]._val = 0
+  elif inst_type is VOP3 and inst.op < 256:  # VOP3-encoded VOPC
+    op = VOPCOp(inst.op)
+    is_cmpx = 'CMPX' in op.name
+    if not is_cmpx:  # Regular CMP writes to destination SGPR
+      d0_override, vopc_dst = Reg(0), inst.vdst
+    else:  # CMPX writes to EXEC - clear it first, accumulate per-lane
+      st.sgpr[EXEC_LO]._val = 0
+  if inst_type is VOP3SD:
+    vcc_override, vop3sd_dst = Reg(0), inst.sdst
+
+  # For other vector ops, dispatch to exec_vector per lane (can optimize later)
+  for lane in range(n_lanes):
+    if exec_mask & (1 << lane): exec_vector(st, inst, lane, lds, d0_override, vcc_override)
+
+  # Write accumulated per-lane bit results to destination SGPRs
+  # (CMPX writes directly to EXEC in the pseudocode, so no separate write needed)
+  if vopc_dst is not None: st.sgpr[vopc_dst]._val = d0_override._val
+  if vop3sd_dst is not None: st.sgpr[vop3sd_dst]._val = vcc_override._val
+
 def step_wave(program: Program, st: WaveState, lds: bytearray, n_lanes: int) -> int:
  inst = program.get(st.pc)
  if inst is None: return 1
@ -666,9 +708,7 @@ def step_wave(program: Program, st: WaveState, lds: bytearray, n_lanes: int) ->
    if is_readlane:
      exec_vector(st, inst, 0, lds)  # Execute once with lane 0
    else:
-      exec_mask = st.exec_mask
-      for lane in range(n_lanes):
-        if exec_mask & (1 << lane): exec_vector(st, inst, lane, lds)
+      exec_vector_batch(st, inst, st.exec_mask, n_lanes, lds)
    st.commit_pends()
    st.pc += inst_words
  return 0
@ -692,12 +732,12 @@ def exec_workgroup(program: Program, workgroup_id: tuple[int, int, int], local_s
    gx, gy, gz = workgroup_id
    # Set workgroup IDs in SGPRs based on USER_SGPR_COUNT and enable flags from COMPUTE_PGM_RSRC2
    sgpr_idx = wg_id_sgpr_base
-    if wg_id_enables[0]: st.sgpr[sgpr_idx] = gx; sgpr_idx += 1
-    if wg_id_enables[1]: st.sgpr[sgpr_idx] = gy; sgpr_idx += 1
-    if wg_id_enables[2]: st.sgpr[sgpr_idx] = gz
+    if wg_id_enables[0]: st.sgpr[sgpr_idx]._val = gx; sgpr_idx += 1
+    if wg_id_enables[1]: st.sgpr[sgpr_idx]._val = gy; sgpr_idx += 1
+    if wg_id_enables[2]: st.sgpr[sgpr_idx]._val = gz
    for i in range(n_lanes):
      tid = wave_start + i
-      st.vgpr[i][0] = tid if local_size == (lx, 1, 1) else ((tid // (lx * ly)) << 20) | (((tid // lx) % ly) << 10) | (tid % lx)
+      st.vgpr[i][0]._val = tid if local_size == (lx, 1, 1) else ((tid // (lx * ly)) << 20) | (((tid // lx) % ly) << 10) | (tid % lx)
    waves.append((st, n_lanes, wave_start))
  has_barrier = any(isinstance(inst, SOPP) and inst.op == SOPPOp.S_BARRIER for inst in program.values())
  for _ in range(2 if has_barrier else 1):
--- a/extra/assembly/amd/pcode.py
+++ b/extra/assembly/amd/pcode.py
@ -564,6 +564,8 @@ class Reg:
  def __ge__(s, o): return s._val >= int(o)
  def __eq__(s, o): return s._val == int(o)
  def __ne__(s, o): return s._val != int(o)
+  def __format__(s, spec): return format(s._val, spec)
+  def __repr__(s): return f"Reg(0x{s._val:x})"

 # ═══════════════════════════════════════════════════════════════════════════════
 # COMPILER: pseudocode -> Python (minimal transforms)
@ -644,8 +646,12 @@ def compile_pseudocode(pseudocode: str) -> str:
  return '\n'.join(lines)

 def _assign(lhs: str, rhs: str) -> str:
-  """Generate assignment. Bare tmp/SCC/etc get wrapped in Reg()."""
-  if lhs in ('tmp', 'SCC', 'VCC', 'EXEC', 'D0', 'D1', 'saveexec'):
+  """Generate assignment. Bare tmp/SCC/etc get wrapped in Reg(). For params (SCC, VCC, EXEC, D0), modify in place."""
+  # Parameters passed to function - modify in place using .b32 setter (or .b64 for 64-bit types)
+  if lhs in ('SCC', 'VCC', 'EXEC', 'D0'):
+    return f"{lhs}.b32 = {rhs}"
+  # Local variables - create new Reg
+  if lhs in ('tmp', 'D1', 'saveexec'):
    return f"{lhs} = Reg({rhs})"
  return f"{lhs} = {rhs}"

@ -948,70 +954,8 @@ from extra.assembly.amd.pcode import *
        # CLZ/CTZ: The PDF pseudocode searches for the first 1 bit but doesn't break.
        # Hardware stops at first match. SOP1 uses tmp=i, VOP1/VOP3 use D0.i32=i
        if 'CLZ' in op.name or 'CTZ' in op.name:
-          code = code.replace('tmp = Reg(i)', 'tmp = Reg(i); break')
+          code = code.replace('tmp = Reg(i)', 'tmp._val = i; break')
          code = code.replace('D0.i32 = i', 'D0.i32 = i; break')
-        # V_DIV_FMAS_F32/F64: PDF page 449 says 2^32/2^64 but hardware behavior is more complex.
-        # The scale direction depends on S2 (the addend): if exponent(S2) > 127 (i.e., S2 >= 2.0),
-        # scale by 2^+64 (to unscale a numerator that was scaled). Otherwise scale by 2^-64
-        # (to unscale a denominator that was scaled).
-        if op.name == 'V_DIV_FMAS_F32':
-          code = code.replace(
-            'D0.f32 = 2.0 ** 32 * fma(S0.f32, S1.f32, S2.f32)',
-            'D0.f32 = (2.0 ** 64 if exponent(S2.f32) > 127 else 2.0 ** -64) * fma(S0.f32, S1.f32, S2.f32)')
-        if op.name == 'V_DIV_FMAS_F64':
-          code = code.replace(
-            'D0.f64 = 2.0 ** 64 * fma(S0.f64, S1.f64, S2.f64)',
-            'D0.f64 = (2.0 ** 128 if exponent(S2.f64) > 1023 else 2.0 ** -128) * fma(S0.f64, S1.f64, S2.f64)')
-        # V_DIV_SCALE_F32/F64: PDF page 463-464 has several bugs vs hardware behavior:
-        # 1. Zero case: hardware sets VCC=1 (PDF doesn't)
-        # 2. Denorm denom: hardware returns NaN (PDF says scale). VCC is set independently by exp diff check.
-        # 3. Tiny numer (exp<=23): hardware sets VCC=1 (PDF doesn't)
-        # 4. Result would be denorm: hardware doesn't scale, just sets VCC=1
-        if op.name == 'V_DIV_SCALE_F32':
-          # Fix 1: Set VCC=1 when zero operands produce NaN
-          code = code.replace(
-            'D0.f32 = float("nan")',
-            'VCC = Reg(0x1); D0.f32 = float("nan")')
-          # Fix 2: Denorm denom returns NaN. Must check this AFTER all VCC-setting logic runs.
-          # Insert at end of all branches, before the final result is used
-          code = code.replace(
-            'elif S1.f32 == DENORM.f32:\n  D0.f32 = ldexp(S0.f32, 64)',
-            'elif False:\n  pass  # denorm check moved to end')
-          # Add denorm check at the very end - this overrides D0 but preserves VCC
-          code += '\nif S1.f32 == DENORM.f32:\n  D0.f32 = float("nan")'
-          # Fix 3: Tiny numer should set VCC=1
-          code = code.replace(
-            'elif exponent(S2.f32) <= 23:\n  D0.f32 = ldexp(S0.f32, 64)',
-            'elif exponent(S2.f32) <= 23:\n  VCC = Reg(0x1); D0.f32 = ldexp(S0.f32, 64)')
-          # Fix 4: S2/S1 would be denorm - don't scale, just set VCC
-          code = code.replace(
-            'elif S2.f32 / S1.f32 == DENORM.f32:\n  VCC = Reg(0x1)\n  if S0.f32 == S2.f32:\n    D0.f32 = ldexp(S0.f32, 64)',
-            'elif S2.f32 / S1.f32 == DENORM.f32:\n  VCC = Reg(0x1)')
-        if op.name == 'V_DIV_SCALE_F64':
-          # Same fixes for f64 version
-          code = code.replace(
-            'D0.f64 = float("nan")',
-            'VCC = Reg(0x1); D0.f64 = float("nan")')
-          code = code.replace(
-            'elif S1.f64 == DENORM.f64:\n  D0.f64 = ldexp(S0.f64, 128)',
-            'elif False:\n  pass  # denorm check moved to end')
-          code += '\nif S1.f64 == DENORM.f64:\n  D0.f64 = float("nan")'
-          code = code.replace(
-            'elif exponent(S2.f64) <= 52:\n  D0.f64 = ldexp(S0.f64, 128)',
-            'elif exponent(S2.f64) <= 52:\n  VCC = Reg(0x1); D0.f64 = ldexp(S0.f64, 128)')
-          code = code.replace(
-            'elif S2.f64 / S1.f64 == DENORM.f64:\n  VCC = Reg(0x1)\n  if S0.f64 == S2.f64:\n    D0.f64 = ldexp(S0.f64, 128)',
-            'elif S2.f64 / S1.f64 == DENORM.f64:\n  VCC = Reg(0x1)')
-        # V_DIV_FIXUP_F32/F64: PDF doesn't check isNAN(S0), but hardware returns OVERFLOW if S0 is NaN.
-        # When division fails (e.g., due to denorm denom), S0 becomes NaN, and fixup should return ±inf.
-        if op.name == 'V_DIV_FIXUP_F32':
-          code = code.replace(
-            'D0.f32 = ((-abs(S0.f32)) if (sign_out) else (abs(S0.f32)))',
-            'D0.f32 = ((-OVERFLOW_F32) if (sign_out) else (OVERFLOW_F32)) if isNAN(S0.f32) else ((-abs(S0.f32)) if (sign_out) else (abs(S0.f32)))')
-        if op.name == 'V_DIV_FIXUP_F64':
-          code = code.replace(
-            'D0.f64 = ((-abs(S0.f64)) if (sign_out) else (abs(S0.f64)))',
-            'D0.f64 = ((-OVERFLOW_F64) if (sign_out) else (OVERFLOW_F64)) if isNAN(S0.f64) else ((-abs(S0.f64)) if (sign_out) else (abs(S0.f64)))')
        # Detect flags for result handling
        is_64 = any(p in pc for p in ['D0.u64', 'D0.b64', 'D0.f64', 'D0.i64', 'D1.u64', 'D1.b64', 'D1.f64', 'D1.i64'])
        has_d1 = '{ D1' in pc
@ -1023,51 +967,35 @@ from extra.assembly.amd.pcode import *
        # VOP3SD instructions that write VCC per-lane (either via VCC.u64[laneId] or by setting VCC = 0/1)
        has_sdst = cls_name == 'VOP3SDOp' and ('VCC.u64[laneId]' in pc or is_div_scale)

-        # Generate function with indented body
+        # Generate function that takes Reg objects directly - modifies D0 in place
        fn_name = f"_{cls_name}_{op.name}"
-        lines.append(f"def {fn_name}(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):")
+        lines.append(f"def {fn_name}(S0, S1, S2, D0, SCC, VCC, laneId, EXEC, SIMM16, VGPR, SRC0, VDST):")
        # Add original pseudocode as comment
        for pc_line in pc.split('\n'):
          lines.append(f"  # {pc_line}")
-        # Only create Reg objects for registers actually used in the pseudocode
+        # Only create extra Reg objects for registers that need fresh state
        combined = code + pc
-        regs = [('S0', 'Reg(s0)'), ('S1', 'Reg(s1)'), ('S2', 'Reg(s2)'),
-                ('D0', 'Reg(s0)' if is_div_scale else 'Reg(d0)'), ('D1', 'Reg(0)'),
-                ('SCC', 'Reg(scc)'), ('VCC', 'Reg(vcc)'), ('EXEC', 'Reg(exec_mask)'),
-                ('tmp', 'Reg(0)'), ('saveexec', 'Reg(exec_mask)'), ('laneId', 'lane'),
-                ('SIMM16', 'Reg(literal)'), ('SIMM32', 'Reg(literal)'),
-                ('SRC0', 'Reg(src0_idx)'), ('VDST', 'Reg(vdst_idx)')]
-        used = {name for name, _ in regs if name in combined}
-        # EXEC_LO/EXEC_HI need EXEC
-        if 'EXEC_LO' in combined or 'EXEC_HI' in combined: used.add('EXEC')
-        for name, init in regs:
-          if name in used: lines.append(f"  {name} = {init}")
+        # D1 and tmp/saveexec need to be created fresh
+        if 'D1' in combined: lines.append("  D1 = Reg(0)")
+        if 'tmp' in combined: lines.append("  tmp = Reg(0)")
+        if 'saveexec' in combined: lines.append("  saveexec = Reg(EXEC._val)")
+        if 'SIMM32' in combined: lines.append("  SIMM32 = SIMM16")
        if 'EXEC_LO' in combined: lines.append("  EXEC_LO = SliceProxy(EXEC, 31, 0)")
        if 'EXEC_HI' in combined: lines.append("  EXEC_HI = SliceProxy(EXEC, 63, 32)")
-        # Add compiled pseudocode with markers
+        # For DIV_SCALE, D0 starts with S0's value
+        if is_div_scale: lines.append("  D0._val = S0._val")
+        # Add compiled pseudocode
        lines.append("  # --- compiled pseudocode ---")
+        has_code = False
        for line in code.split('\n'):
-          lines.append(f"  {line}")
+          if line.strip():
+            lines.append(f"  {line}")
+            has_code = True
        lines.append("  # --- end pseudocode ---")
-        # Generate result dict - use raw params if Reg wasn't created
-        d0_val = "D0._val" if 'D0' in used else "d0"
-        scc_val = "SCC._val & 1" if 'SCC' in used else "scc & 1"
-        lines.append(f"  result = {{'d0': {d0_val}, 'scc': {scc_val}}}")
-        if has_sdst:
-          lines.append("  result['vcc_lane'] = (VCC._val >> lane) & 1")
-        elif 'VCC' in used:
-          lines.append("  if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1")
-        if is_cmpx:
-          lines.append("  result['exec_lane'] = (EXEC._val >> lane) & 1")
-        elif 'EXEC' in used:
-          lines.append("  if EXEC._val != exec_mask: result['exec'] = EXEC._val")
-        if is_cmp:
-          lines.append("  result['vcc_lane'] = (D0._val >> lane) & 1")
-        if is_64:
-          lines.append("  result['d0_64'] = True")
-        if has_d1:
-          lines.append("  result['d1'] = D1._val & 1")
-        lines.append("  return result")
+        # All Reg objects (D0, SCC, VCC, EXEC) are modified in place
+        # The emulator determines 64-bit ops from the opcode name
+        if not has_code:
+          lines.append("  pass")
        lines.append("")

        fn_entries.append((op, fn_name))
@ -1088,9 +1016,9 @@ from extra.assembly.amd.pcode import *
  if 'VOP3Op' in enum_names:
    lines.append('''
 # V_WRITELANE_B32: Write scalar to specific lane's VGPR (not in PDF pseudocode)
-def _VOP3Op_V_WRITELANE_B32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
-  wr_lane = s1 & 0x1f  # lane select (5 bits for wave32)
-  return {'d0': d0, 'scc': scc, 'vgpr_write': (wr_lane, vdst_idx, s0 & 0xffffffff)}
+def _VOP3Op_V_WRITELANE_B32(S0, S1, S2, D0, SCC, VCC, laneId, EXEC, SIMM16, VGPR, SRC0, VDST):
+  wr_lane = S1._val & 0x1f  # lane select (5 bits for wave32)
+  return {'vgpr_write': (wr_lane, VDST._val, S0._val & 0xffffffff)}
 VOP3Op_FUNCTIONS[VOP3Op.V_WRITELANE_B32] = _VOP3Op_V_WRITELANE_B32
 ''')

--- a/extra/assembly/amd/test/test_compare_emulators.py
+++ b/extra/assembly/amd/test/test_compare_emulators.py
@ -107,16 +107,16 @@ class PythonEmulator:
    return step_wave(self.program, self.state, self.lds, self.n_lanes)
  def set_sgpr(self, idx: int, val: int):
    assert self.state is not None
-    self.state.sgpr[idx] = val & 0xffffffff
+    self.state.sgpr[idx]._val = val & 0xffffffff
  def set_vgpr(self, lane: int, idx: int, val: int):
    assert self.state is not None
-    self.state.vgpr[lane][idx] = val & 0xffffffff
+    self.state.vgpr[lane][idx]._val = val & 0xffffffff

  def get_snapshot(self) -> StateSnapshot:
    assert self.state is not None
    return StateSnapshot(pc=self.state.pc, scc=self.state.scc, vcc=self.state.vcc & 0xffffffff,
-                         exec_mask=self.state.exec_mask & 0xffffffff, sgpr=list(self.state.sgpr),
-                         vgpr=[list(self.state.vgpr[i]) for i in range(WAVE_SIZE)])
+                         exec_mask=self.state.exec_mask & 0xffffffff, sgpr=[r._val for r in self.state.sgpr],
+                         vgpr=[[r._val for r in self.state.vgpr[i]] for i in range(WAVE_SIZE)])

 def run_single_kernel(kernel: bytes, n_lanes: int, args_ptr: int, global_size: tuple[int, int, int],
                      program, max_steps: int, debug: bool, trace_len: int, kernel_idx: int = 0,
--- a/extra/assembly/amd/test/test_emu.py
+++ b/extra/assembly/amd/test/test_emu.py
@ -86,9 +86,9 @@ def parse_output(out_buf: bytes, n_lanes: int) -> WaveState:
  for i in range(N_VGPRS):
    for lane in range(n_lanes):
      off = i * WAVE_SIZE * 4 + lane * 4
-      st.vgpr[lane][i] = struct.unpack_from('<I', out_buf, off)[0]
+      st.vgpr[lane][i]._val = struct.unpack_from('<I', out_buf, off)[0]
  for i in range(N_SGPRS):
-    st.sgpr[i] = struct.unpack_from('<I', out_buf, VGPR_BYTES + i * 4)[0]
+    st.sgpr[i]._val = struct.unpack_from('<I', out_buf, VGPR_BYTES + i * 4)[0]
  st.vcc = struct.unpack_from('<I', out_buf, VGPR_BYTES + SGPR_BYTES)[0]
  st.scc = struct.unpack_from('<I', out_buf, VGPR_BYTES + SGPR_BYTES + 4)[0]
  return st
@ -315,11 +315,15 @@ class TestVDivScale(unittest.TestCase):
    self.assertAlmostEqual(i2f(st.vgpr[0][2]), expected, delta=expected * 1e-6)

  def test_div_scale_f32_denorm_denom(self):
-    """V_DIV_SCALE_F32: denormalized denominator -> NaN, VCC=1.
+    """V_DIV_SCALE_F32: denormalized denominator with large exp diff -> scale by 2^64, VCC=1.

-    Hardware returns NaN when denominator is denormalized (different from PDF pseudocode).
+    Per PDF pseudocode: when numer/denom has exp diff >= 96, set VCC=1.
+    If S0==S1 (scaling denom), scale by 2^64.
+    The denorm check (S1==DENORM) comes after exp diff check, so denorm denoms
+    with normal numerators hit the exp diff branch first.
    """
    # Smallest positive denorm: 0x00000001 = 1.4e-45
+    # exp(1.0) - exp(denorm) = 127 - 0 = 127 >= 96
    denorm = 0x00000001
    instructions = [
      s_mov_b32(s[0], denorm),
@ -329,9 +333,12 @@ class TestVDivScale(unittest.TestCase):
      v_div_scale_f32(v[2], VCC, v[1], v[1], v[0]),
    ]
    st = run_program(instructions, n_lanes=1)
-    import math
-    self.assertTrue(math.isnan(i2f(st.vgpr[0][2])), "Hardware returns NaN for denorm denom")
-    self.assertEqual(st.vcc & 1, 1, "VCC should be 1 for denorm denom")
+    # Per PDF: exp diff >= 96, S0==S1 (denom), scale by 2^64
+    from extra.assembly.amd.pcode import _f32
+    denorm_f = _f32(denorm)
+    expected = denorm_f * (2.0 ** 64)
+    self.assertAlmostEqual(i2f(st.vgpr[0][2]), expected, delta=abs(expected) * 1e-5)
+    self.assertEqual(st.vcc & 1, 1, "VCC should be 1 for large exp diff")

  def test_div_scale_f32_tiny_numer_exp_le_23(self):
    """V_DIV_SCALE_F32: exponent(numer) <= 23 -> scale by 2^64, VCC=1."""
@ -354,13 +361,12 @@ class TestVDivScale(unittest.TestCase):
    self.assertEqual(st.vcc & 1, 1, "VCC should be 1 when scaling tiny numer")

  def test_div_scale_f32_result_would_be_denorm(self):
-    """V_DIV_SCALE_F32: result would be denorm -> no scaling applied, VCC=1.
+    """V_DIV_SCALE_F32: result would be denorm -> scale by 2^64, VCC=1.

-    When the result of numer/denom would be denormalized, hardware sets VCC=1
-    but does NOT scale the input (returns it unchanged). The scaling happens
-    elsewhere in the division sequence.
+    Per PDF pseudocode: when S2.f32 / S1.f32 would be denormalized and S0==S2
+    (checking numerator), scale the numerator by 2^64 and set VCC=1.
    """
-    # If S2/S1 would be denorm, set VCC but don't scale
+    # If S2/S1 would be denorm, scale and set VCC
    # Denorm result: exp < 1, i.e., |result| < 2^-126
    # Use 1.0 / 2^127 ≈ 5.9e-39 (result would be denorm)
    large_denom = 0x7f000000  # 2^127
@ -368,12 +374,13 @@ class TestVDivScale(unittest.TestCase):
      s_mov_b32(s[0], large_denom),
      v_mov_b32_e32(v[0], 1.0),   # numer = 1.0 (S2)
      v_mov_b32_e32(v[1], s[0]), # denom = 2^127 (S1)
-      # S0=numer, S1=denom, S2=numer -> check if we need to scale numer
+      # S0=numer, S1=denom, S2=numer -> scale numer
      v_div_scale_f32(v[2], VCC, v[0], v[1], v[0]),
    ]
    st = run_program(instructions, n_lanes=1)
-    # Hardware returns input unchanged but sets VCC=1
-    self.assertAlmostEqual(i2f(st.vgpr[0][2]), 1.0, places=5)
+    # Per PDF: scale by 2^64, VCC=1
+    expected = 1.0 * (2.0 ** 64)
+    self.assertAlmostEqual(i2f(st.vgpr[0][2]), expected, delta=expected * 1e-6)
    self.assertEqual(st.vcc & 1, 1, "VCC should be 1 when result would be denorm")


@ -401,43 +408,44 @@ class TestVDivFmas(unittest.TestCase):
    self.assertAlmostEqual(i2f(st.vgpr[0][3]), 7.0, places=5)

  def test_div_fmas_f32_scale_up(self):
-    """V_DIV_FMAS_F32: VCC=1 with S2 >= 2.0 -> scale by 2^+64."""
+    """V_DIV_FMAS_F32: VCC=1 -> scale by 2^32."""
    instructions = [
-      s_mov_b32(s[SrcEnum.VCC_LO - 128], 1),  # VCC = 1
+      s_mov_b32(s[106], 1),  # VCC_LO = 1
      v_mov_b32_e32(v[0], 1.0),   # S0
      v_mov_b32_e32(v[1], 1.0),   # S1
-      v_mov_b32_e32(v[2], 2.0),   # S2 >= 2.0, so scale UP
-      v_div_fmas_f32(v[3], v[0], v[1], v[2]),  # 2^+64 * (1*1+2) = 2^+64 * 3
+      v_mov_b32_e32(v[2], 2.0),   # S2
+      v_div_fmas_f32(v[3], v[0], v[1], v[2]),  # 2^32 * fma(1,1,2) = 2^32 * 3
    ]
    st = run_program(instructions, n_lanes=1)
-    expected = 3.0 * (2.0 ** 64)
+    expected = 3.0 * (2.0 ** 32)
    self.assertAlmostEqual(i2f(st.vgpr[0][3]), expected, delta=abs(expected) * 1e-6)

  def test_div_fmas_f32_scale_down(self):
-    """V_DIV_FMAS_F32: VCC=1 with S2 < 2.0 -> scale by 2^-64."""
+    """V_DIV_FMAS_F32: VCC=1 -> scale by 2^32 (not dependent on S2)."""
    instructions = [
-      s_mov_b32(s[SrcEnum.VCC_LO - 128], 1),  # VCC = 1
+      s_mov_b32(s[106], 1),  # VCC_LO = 1
      v_mov_b32_e32(v[0], 2.0),   # S0
      v_mov_b32_e32(v[1], 3.0),   # S1
-      v_mov_b32_e32(v[2], 1.0),   # S2 < 2.0, so scale DOWN
-      v_div_fmas_f32(v[3], v[0], v[1], v[2]),  # 2^-64 * (2*3+1) = 2^-64 * 7
+      v_mov_b32_e32(v[2], 1.0),   # S2
+      v_div_fmas_f32(v[3], v[0], v[1], v[2]),  # 2^32 * fma(2,3,1) = 2^32 * 7
    ]
    st = run_program(instructions, n_lanes=1)
-    expected = 7.0 * (2.0 ** -64)
+    expected = 7.0 * (2.0 ** 32)
    self.assertAlmostEqual(i2f(st.vgpr[0][3]), expected, delta=abs(expected) * 1e-6)

  def test_div_fmas_f32_per_lane_vcc(self):
-    """V_DIV_FMAS_F32: different VCC per lane with S2 < 2.0."""
+    """V_DIV_FMAS_F32: different VCC per lane.
+    When VCC=1, scales UP by 2^32. When VCC=0, no scaling."""
    instructions = [
-      s_mov_b32(s[SrcEnum.VCC_LO - 128], 0b0101),  # VCC: lanes 0,2 set
+      s_mov_b32(s[106], 0b0101),  # VCC_LO: lanes 0,2 set
      v_mov_b32_e32(v[0], 1.0),
      v_mov_b32_e32(v[1], 1.0),
-      v_mov_b32_e32(v[2], 1.0),  # S2 < 2.0, so scale DOWN
-      v_div_fmas_f32(v[3], v[0], v[1], v[2]),  # fma(1,1,1) = 2, scaled = 2^-64 * 2
+      v_mov_b32_e32(v[2], 1.0),
+      v_div_fmas_f32(v[3], v[0], v[1], v[2]),  # fma(1,1,1) = 2, scaled = 2^32 * 2 when VCC=1
    ]
    st = run_program(instructions, n_lanes=4)
-    scaled = 2.0 * (2.0 ** -64)
-    unscaled = 2.0
+    scaled = 2.0 * (2.0 ** 32)  # VCC=1: scale UP by 2^32
+    unscaled = 2.0              # VCC=0: no scaling
    self.assertAlmostEqual(i2f(st.vgpr[0][3]), scaled, delta=abs(scaled) * 1e-6)  # lane 0: VCC=1
    self.assertAlmostEqual(i2f(st.vgpr[1][3]), unscaled, places=5)                 # lane 1: VCC=0
    self.assertAlmostEqual(i2f(st.vgpr[2][3]), scaled, delta=abs(scaled) * 1e-6)  # lane 2: VCC=1
@ -608,10 +616,10 @@ class TestVDivFixup(unittest.TestCase):
    self.assertAlmostEqual(i2f(st.vgpr[0][3]), 3.0, places=5)

  def test_div_fixup_f32_nan_estimate_overflow(self):
-    """V_DIV_FIXUP_F32: NaN estimate returns overflow (inf).
+    """V_DIV_FIXUP_F32: NaN estimate passes through as NaN per PDF pseudocode.

-    PDF doesn't check isNAN(S0), but hardware returns OVERFLOW if S0 is NaN.
-    This happens when division fails (e.g., denorm denominator in V_DIV_SCALE).
+    PDF pseudocode only checks isNAN(S1) and isNAN(S2), not S0.
+    When S0 is NaN but S1/S2 are valid, it falls through to: D0 = abs(S0) = NaN.
    """
    quiet_nan = 0x7fc00000
    instructions = [
@ -623,11 +631,10 @@ class TestVDivFixup(unittest.TestCase):
    ]
    st = run_program(instructions, n_lanes=1)
    import math
-    self.assertTrue(math.isinf(i2f(st.vgpr[0][3])), "NaN estimate should return inf")
-    self.assertEqual(st.vgpr[0][3], 0x7f800000, "Should be +inf (pos/pos)")
+    self.assertTrue(math.isnan(i2f(st.vgpr[0][3])), "NaN estimate should pass through as NaN per PDF")

  def test_div_fixup_f32_nan_estimate_sign(self):
-    """V_DIV_FIXUP_F32: NaN estimate with negative sign returns -inf."""
+    """V_DIV_FIXUP_F32: NaN estimate passes through per PDF pseudocode."""
    quiet_nan = 0x7fc00000
    instructions = [
      s_mov_b32(s[0], quiet_nan),
@ -638,8 +645,8 @@ class TestVDivFixup(unittest.TestCase):
    ]
    st = run_program(instructions, n_lanes=1)
    import math
-    self.assertTrue(math.isinf(i2f(st.vgpr[0][3])), "NaN estimate should return inf")
-    self.assertEqual(st.vgpr[0][3], 0xff800000, "Should be -inf (pos/neg)")
+    # PDF pseudocode: D0 = -abs(S0) when sign_out=1. abs(NaN) is NaN, -NaN is NaN.
+    self.assertTrue(math.isnan(i2f(st.vgpr[0][3])), "NaN estimate should pass through as NaN per PDF")


 class TestVCmpClass(unittest.TestCase):
--- a/extra/assembly/amd/test/test_pcode.py
+++ b/extra/assembly/amd/test/test_pcode.py
@ -225,17 +225,17 @@ class TestPseudocodeRegressions(unittest.TestCase):
  """Regression tests for pseudocode instruction emulation bugs."""

  def test_v_div_scale_f32_vcc_always_returned(self):
-    """V_DIV_SCALE_F32 must always return vcc_lane, even when VCC=0 (no scaling needed).
-    Bug: when VCC._val == vcc (both 0), vcc_lane wasn't returned, so VCC bits weren't written.
-    This caused division to produce wrong results for multiple lanes."""
+    """V_DIV_SCALE_F32 must set VCC bit for the lane when scaling is needed.
+    The new calling convention uses Reg objects and modifies VCC in place."""
    # Normal case: 1.0 / 3.0, no scaling needed, VCC should be 0
-    s0 = 0x3f800000  # 1.0
-    s1 = 0x40400000  # 3.0
-    s2 = 0x3f800000  # 1.0 (numerator)
-    result = _VOP3SDOp_V_DIV_SCALE_F32(s0, s1, s2, 0, 0, 0, 0, 0xffffffff, 0, None, {})
-    # Must always have vcc_lane in result
-    self.assertIn('vcc_lane', result, "V_DIV_SCALE_F32 must always return vcc_lane")
-    self.assertEqual(result['vcc_lane'], 0, "vcc_lane should be 0 when no scaling needed")
+    S0 = Reg(0x3f800000)  # 1.0
+    S1 = Reg(0x40400000)  # 3.0
+    S2 = Reg(0x3f800000)  # 1.0 (numerator)
+    D0 = Reg(0)
+    VCC = Reg(0)
+    _VOP3SDOp_V_DIV_SCALE_F32(S0, S1, S2, D0, Reg(0), VCC, 0, Reg(0xffffffff), Reg(0), None, Reg(0), Reg(0))
+    # VCC bit 0 should be 0 when no scaling needed
+    self.assertEqual(VCC._val & 1, 0, "VCC bit should be 0 when no scaling needed")

  def test_v_cmp_class_f32_detects_quiet_nan(self):
    """V_CMP_CLASS_F32 must correctly identify quiet NaN vs signaling NaN.
@ -244,18 +244,22 @@ class TestPseudocodeRegressions(unittest.TestCase):
    signal_nan = 0x7f800001  # signaling NaN: exponent=255, bit22=0
    # Test quiet NaN detection (bit 1 in mask)
    s1_quiet = 0b0000000010  # bit 1 = quiet NaN
-    result = _VOPCOp_V_CMP_CLASS_F32(quiet_nan, s1_quiet, 0, 0, 0, 0, 0, 0xffffffff, 0, None, {})
-    self.assertEqual(result['vcc_lane'], 1, "Should detect quiet NaN with quiet NaN mask")
+    D0 = Reg(0)
+    _VOPCOp_V_CMP_CLASS_F32(Reg(quiet_nan), Reg(s1_quiet), Reg(0), D0, Reg(0), Reg(0), 0, Reg(0xffffffff), Reg(0), None, Reg(0), Reg(0))
+    self.assertEqual(D0._val & 1, 1, "Should detect quiet NaN with quiet NaN mask")
    # Test signaling NaN detection (bit 0 in mask)
    s1_signal = 0b0000000001  # bit 0 = signaling NaN
-    result = _VOPCOp_V_CMP_CLASS_F32(signal_nan, s1_signal, 0, 0, 0, 0, 0, 0xffffffff, 0, None, {})
-    self.assertEqual(result['vcc_lane'], 1, "Should detect signaling NaN with signaling NaN mask")
+    D0 = Reg(0)
+    _VOPCOp_V_CMP_CLASS_F32(Reg(signal_nan), Reg(s1_signal), Reg(0), D0, Reg(0), Reg(0), 0, Reg(0xffffffff), Reg(0), None, Reg(0), Reg(0))
+    self.assertEqual(D0._val & 1, 1, "Should detect signaling NaN with signaling NaN mask")
    # Test that quiet NaN doesn't match signaling NaN mask
-    result = _VOPCOp_V_CMP_CLASS_F32(quiet_nan, s1_signal, 0, 0, 0, 0, 0, 0xffffffff, 0, None, {})
-    self.assertEqual(result['vcc_lane'], 0, "Quiet NaN should not match signaling NaN mask")
+    D0 = Reg(0)
+    _VOPCOp_V_CMP_CLASS_F32(Reg(quiet_nan), Reg(s1_signal), Reg(0), D0, Reg(0), Reg(0), 0, Reg(0xffffffff), Reg(0), None, Reg(0), Reg(0))
+    self.assertEqual(D0._val & 1, 0, "Quiet NaN should not match signaling NaN mask")
    # Test that signaling NaN doesn't match quiet NaN mask
-    result = _VOPCOp_V_CMP_CLASS_F32(signal_nan, s1_quiet, 0, 0, 0, 0, 0, 0xffffffff, 0, None, {})
-    self.assertEqual(result['vcc_lane'], 0, "Signaling NaN should not match quiet NaN mask")
+    D0 = Reg(0)
+    _VOPCOp_V_CMP_CLASS_F32(Reg(signal_nan), Reg(s1_quiet), Reg(0), D0, Reg(0), Reg(0), 0, Reg(0xffffffff), Reg(0), None, Reg(0), Reg(0))
+    self.assertEqual(D0._val & 1, 0, "Signaling NaN should not match quiet NaN mask")

  def test_isnan_with_typed_view(self):
    """_isnan must work with TypedView objects, not just Python floats.
Author	SHA1	Message	Date
George Hotz	05d27abcc2	tests pass	2025-12-30 13:49:05 +00:00
George Hotz	153c5a1670	assembly/amd: use Reg in emu	2025-12-30 12:52:03 +00:00