"""Tests for VOPD instructions - dual-issue vector operations.

VOPD executes two operations simultaneously. Key behavior:
- Both ops read their sources BEFORE either writes (dual-issue semantics)
- This means if X writes to a register that Y reads, Y sees the OLD value
- Op X can use ops 0-15 (FMAC, MUL, ADD, MOV, etc.)
- Op Y can use ops 0-18 (includes ADD_NC_U32, LSHLREV, AND)
"""
import unittest
from test.amd.hw.helpers import run_program, v, v_mov_b32_e32
from tinygrad.runtime.autogen.amd.rdna3.ins import VOPD, VOPD_LIT, VOPDOp

class TestVOPDBasic(unittest.TestCase):
  """Smoke tests: a single VOPD whose X and Y halves write different registers."""

  def test_vopd_dual_mov(self):
    """Both halves are MOV: X copies v[0] into v[2], Y copies v[1] into v[3]."""
    setup = [
      v_mov_b32_e32(v[0], 0x12345678),
      v_mov_b32_e32(v[1], 0xDEADBEEF),
    ]
    # Operand order: opx, opy, vdstx, vdsty, srcx0, srcy0, vsrcx1, vsrcy1.
    # MOV only consumes src0, so the two trailing vsrc1 operands are filler.
    dual_mov = VOPD(VOPDOp.V_DUAL_MOV_B32, VOPDOp.V_DUAL_MOV_B32, v[2], v[3], v[0], v[1], v[0], v[0])
    lane0 = run_program([*setup, dual_mov], n_lanes=1).vgpr[0]
    self.assertEqual(lane0[2], 0x12345678)
    self.assertEqual(lane0[3], 0xDEADBEEF)

  def test_vopd_mov_and_add(self):
    """MOV in the X slot paired with ADD_NC_U32 in the Y slot (ADD_NC_U32 is a Y-only op)."""
    setup = [
      v_mov_b32_e32(v[0], 10),
      v_mov_b32_e32(v[1], 5),
    ]
    # X: v[2] = 100
    # Y: v[3] = srcy0 + vsrcy1 = v[0] + v[1] = 10 + 5 = 15
    mov_add = VOPD(VOPDOp.V_DUAL_MOV_B32, VOPDOp.V_DUAL_ADD_NC_U32, v[2], v[3], 100, v[0], v[0], v[1])
    lane0 = run_program([*setup, mov_add], n_lanes=1).vgpr[0]
    self.assertEqual(lane0[2], 100)
    self.assertEqual(lane0[3], 15)


class TestVOPDReadBeforeWrite(unittest.TestCase):
  """Hazard tests: the X half overwrites a register that the Y half uses as a source.

  Both halves of a VOPD fetch their operands before either result is written,
  so Y is expected to observe the value the register held before the instruction.
  """

  def test_vopd_x_writes_y_reads_same_reg(self):
    """X clears v[2] while Y computes v[2] + v[0] into v[1].

    Setup: v[0] = 1, v[2] = 0xFFFFFFFF.
      reads before writes -> v[1] = 0xFFFFFFFF + 1 = 0 (32-bit wrap)   <- expected
      writes before reads -> v[1] = 0 + 1 = 1
    """
    program = [
      v_mov_b32_e32(v[0], 1),
      v_mov_b32_e32(v[1], 0x99999999),  # sentinel; Y is expected to replace it
      v_mov_b32_e32(v[2], 0xFFFFFFFF),
    ]
    # X (MOV):        vdstx=v[2], srcx0=0; vsrcx1=v[0] is not used by MOV.
    # Y (ADD_NC_U32): srcy0=v[2], vsrcy1=v[0].
    # The vdsty argument is v[0] (field value 0), yet Y lands in v[1]:
    #   vdsty_reg = (vdsty << 1) | ((vdstx & 1) ^ 1) = (0 << 1) | ((2 & 1) ^ 1) = 1
    program.append(VOPD(VOPDOp.V_DUAL_MOV_B32, VOPDOp.V_DUAL_ADD_NC_U32, v[2], v[0], 0, v[2], v[0], v[0]))
    state = run_program(program, n_lanes=1)
    # The X half must still have taken effect.
    self.assertEqual(state.vgpr[0][2], 0, "X should write 0 to v[2]")
    # The Y half must have used the pre-instruction v[2]; the sum wraps to 0.
    self.assertEqual(state.vgpr[0][1], 0, "Y should read OLD v[2]=0xFFFFFFFF, compute 0xFFFFFFFF+1=0")

  def test_vopd_x_writes_y_reads_same_reg_v2(self):
    """Same hazard, but both Y sources are v[2], so no other register affects the sum.

    Setup: v[2] = 100. X clears v[2]; Y computes v[2] + v[2] into v[1].
      reads before writes -> v[1] = 100 + 100 = 200   <- expected
      writes before reads -> v[1] = 0 + 0 = 0
    """
    program = [
      v_mov_b32_e32(v[0], 0x88888888),  # sentinel; does not contribute to the result
      v_mov_b32_e32(v[1], 0x99999999),  # sentinel; Y is expected to replace it
      v_mov_b32_e32(v[2], 100),
    ]
    # X (MOV):        vdstx=v[2], srcx0=0.
    # Y (ADD_NC_U32): srcy0=v[2], vsrcy1=v[2]; written to v[1] as in the test above.
    program.append(VOPD(VOPDOp.V_DUAL_MOV_B32, VOPDOp.V_DUAL_ADD_NC_U32, v[2], v[0], 0, v[2], v[0], v[2]))
    state = run_program(program, n_lanes=1)
    self.assertEqual(state.vgpr[0][2], 0, "X should write 0 to v[2]")
    self.assertEqual(state.vgpr[0][1], 200, "Y should read OLD v[2]=100 twice, compute 100+100=200")


class TestVOPDLiterals(unittest.TestCase):
  """Tests for VOPD instructions that use SIMM32 literals (FMAAK, FMAMK).

  Each test pairs the literal-consuming op in the X slot with a plain MOV in the
  Y slot and checks BOTH results, so a literal-handling bug that disturbs the
  Y half is caught as well as one that corrupts the X result.
  """

  def test_vopd_fmaak_f32(self):
    """VOPD V_DUAL_FMAAK_F32: D = S0 * S1 + SIMM32 (literal addend).

    Tests that the 32-bit literal (SIMM32) is correctly passed to the instruction.
    fma(2.0, 3.0, 10.0) = 2*3 + 10 = 16.0
    The Y half (MOV) must still copy v[0] into v[3].
    """
    from test.amd.hw.helpers import f2i, i2f
    instructions = [
      v_mov_b32_e32(v[0], f2i(2.0)),  # v[0] = 2.0
      v_mov_b32_e32(v[1], f2i(3.0)),  # v[1] = 3.0
      # VOPD args: opx, opy, vdstx, vdsty, srcx0, srcy0, vsrcx1, vsrcy1
      # X: v[2] = fma(srcx0, vsrcx1, SIMM32) = v[0]*v[1]+10.0 = 2*3+10 = 16
      # Y: v[3] = srcy0 (MOV) = v[0] = 2.0
      VOPD_LIT(VOPDOp.V_DUAL_FMAAK_F32, VOPDOp.V_DUAL_MOV_B32, v[2], v[3], v[0], v[0], v[1], v[0], literal=f2i(10.0)),
    ]
    st = run_program(instructions, n_lanes=1)
    self.assertAlmostEqual(i2f(st.vgpr[0][2]), 16.0, places=5, msg="fma(2.0, 3.0, 10.0) should be 16.0")
    # MOV is a bit-exact copy, so compare raw bits rather than floats.
    self.assertEqual(st.vgpr[0][3], f2i(2.0), "Y MOV should copy v[0]=2.0 into v[3]")

  def test_vopd_fmamk_f32(self):
    """VOPD V_DUAL_FMAMK_F32: D = S0 * SIMM32 + S1 (literal multiplier).

    Tests that the 32-bit literal (SIMM32) is correctly used as the multiplier.
    fma(2.0, 5.0, 3.0) = 2*5 + 3 = 13.0
    The Y half (MOV) must still copy v[0] into v[3].
    """
    from test.amd.hw.helpers import f2i, i2f
    instructions = [
      v_mov_b32_e32(v[0], f2i(2.0)),  # v[0] = 2.0
      v_mov_b32_e32(v[1], f2i(3.0)),  # v[1] = 3.0
      # X: v[2] = fma(srcx0, SIMM32, vsrcx1) = v[0]*5.0+v[1] = 2*5+3 = 13
      # Y: v[3] = srcy0 (MOV) = v[0] = 2.0
      VOPD_LIT(VOPDOp.V_DUAL_FMAMK_F32, VOPDOp.V_DUAL_MOV_B32, v[2], v[3], v[0], v[0], v[1], v[0], literal=f2i(5.0)),
    ]
    st = run_program(instructions, n_lanes=1)
    self.assertAlmostEqual(i2f(st.vgpr[0][2]), 13.0, places=5, msg="fma(2.0, 5.0, 3.0) should be 13.0")
    # MOV is a bit-exact copy, so compare raw bits rather than floats.
    self.assertEqual(st.vgpr[0][3], f2i(2.0), "Y MOV should copy v[0]=2.0 into v[3]")


class TestVOPDDot2Acc(unittest.TestCase):
  """V_DUAL_DOT2ACC_F32_F16 in the Y slot: D += lo(S0)*lo(S1) + hi(S0)*hi(S1)."""

  @staticmethod
  def _pack_f16x2(lo, hi):
    """Convert two floats with f32_to_f16 and pack them into one word (lo in the low 16 bits)."""
    from test.amd.hw.helpers import f32_to_f16
    return f32_to_f16(lo) | (f32_to_f16(hi) << 16)

  @staticmethod
  def _dot2acc(s0, s1, acc):
    """Run the dot2acc VOPD on packed operands s0/s1 with float accumulator acc; return v[3] as a float."""
    from test.amd.hw.helpers import f2i, i2f
    program = [
      v_mov_b32_e32(v[0], s0),
      v_mov_b32_e32(v[1], s1),
      v_mov_b32_e32(v[3], f2i(acc)),  # accumulator lives in v[3] (vdsty, paired with vdstx=v[4])
      # X: v[4] = MOV v[0] (result not checked), Y: v[3] += dot2(v[0], v[1])
      VOPD(VOPDOp.V_DUAL_MOV_B32, VOPDOp.V_DUAL_DOT2ACC_F32_F16, v[4], v[3], v[0], v[0], v[0], v[1]),
    ]
    return i2f(run_program(program, n_lanes=1).vgpr[0][3])

  def test_vopd_dot2acc_f32_f16_basic(self):
    """Non-zero accumulator.

    S0 = pack(1.0h, 2.0h), S1 = pack(3.0h, 4.0h), D = 10.0f
    expected: 10.0 + 1.0*3.0 + 2.0*4.0 = 21.0
    """
    s0 = self._pack_f16x2(1.0, 2.0)
    s1 = self._pack_f16x2(3.0, 4.0)
    got = self._dot2acc(s0, s1, 10.0)
    self.assertAlmostEqual(got, 21.0, places=2, msg="10.0 + 1.0*3.0 + 2.0*4.0 = 21.0")

  def test_vopd_dot2acc_f32_f16_zero_accum(self):
    """Zero accumulator, so the result is the bare dot product.

    S0 = pack(0.5h, -1.0h), S1 = pack(2.0h, 3.0h), D = 0.0f
    expected: 0.0 + 0.5*2.0 + (-1.0)*3.0 = -2.0
    """
    s0 = self._pack_f16x2(0.5, -1.0)
    s1 = self._pack_f16x2(2.0, 3.0)
    got = self._dot2acc(s0, s1, 0.0)
    self.assertAlmostEqual(got, -2.0, places=2, msg="0.5*2.0 + (-1.0)*3.0 = -2.0")


class TestVOPDMultilane(unittest.TestCase):
  """VOPD executed with more than one active lane."""

  def test_vopd_multilane_mov_add(self):
    """MOV (X) + ADD_NC_U32 (Y) on four lanes; X's destination is not a Y source."""
    lane_count = 4
    program = [
      v_mov_b32_e32(v[0], 5),
      v_mov_b32_e32(v[1], 10),
      # X: v[2] = 100
      # Y: v[1] = srcy0 + vsrcy1 = v[0] + v[1] = 5 + 10 = 15
      # Y's destination is derived from the vdsty argument (v[0]) and vdstx's parity:
      #   vdsty_reg = (vdsty << 1) | ((vdstx.offset & 1) ^ 1) = (0 << 1) | ((258 & 1) ^ 1) = 1
      VOPD(VOPDOp.V_DUAL_MOV_B32, VOPDOp.V_DUAL_ADD_NC_U32, v[2], v[0], 100, v[0], v[2], v[1]),
    ]
    state = run_program(program, n_lanes=lane_count)
    # The same values are expected in every lane.
    for lane in range(lane_count):
      regs = state.vgpr[lane]
      self.assertEqual(regs[2], 100, f"Lane {lane}: v[2] should be 100")
      self.assertEqual(regs[1], 15, f"Lane {lane}: v[1] should be 15 (5+10)")


# Run the unittest runner when this module is executed directly as a script.
if __name__ == '__main__':
  unittest.main()