assembly/amd: validate dsl keyword args (#15608)

* assembly/amd: validate dsl keyword args

* hm, this should use the SOP2 s_waits

* use the sop2 s_waits
This commit is contained in:
qazal 2026-04-05 17:00:24 +03:00 committed by GitHub
commit b2d5b29f45
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
10 changed files with 299 additions and 292 deletions

View file

@ -19,9 +19,9 @@ class TestDS2Addr(unittest.TestCase):
s_mov_b32(s[0], 0xBBBBBBBB),
v_mov_b32_e32(v[1], s[0]),
DS(DSOp.DS_STORE_2ADDR_B32, addr=v[10], data0=v[0], data1=v[1], vdst=v[0], offset0=0, offset1=1),
s_waitcnt(lgkmcnt=0),
s_waitcnt_lgkmcnt(sdst=NULL, simm16=0),
DS(DSOp.DS_LOAD_2ADDR_B32, addr=v[10], vdst=v[2:3], offset0=0, offset1=1),
s_waitcnt(lgkmcnt=0),
s_waitcnt_lgkmcnt(sdst=NULL, simm16=0),
]
st = run_program(instructions, n_lanes=1)
self.assertEqual(st.vgpr[0][2], 0xAAAAAAAA)
@ -40,9 +40,9 @@ class TestDS2Addr(unittest.TestCase):
s_mov_b32(s[0], 0x9ABCDEF0),
v_mov_b32_e32(v[3], s[0]),
DS(DSOp.DS_STORE_2ADDR_B64, addr=v[10], data0=v[0:1], data1=v[2:3], vdst=v[0], offset0=0, offset1=2),
s_waitcnt(lgkmcnt=0),
s_waitcnt_lgkmcnt(sdst=NULL, simm16=0),
DS(DSOp.DS_LOAD_2ADDR_B64, addr=v[10], vdst=v[4:7], offset0=0, offset1=2),
s_waitcnt(lgkmcnt=0),
s_waitcnt_lgkmcnt(sdst=NULL, simm16=0),
]
st = run_program(instructions, n_lanes=1)
self.assertEqual(st.vgpr[0][4], 0xDEADBEEF)
@ -63,9 +63,9 @@ class TestDS2AddrMore(unittest.TestCase):
s_mov_b32(s[2], 0x22222222),
v_mov_b32_e32(v[1], s[2]),
DS(DSOp.DS_STORE_2ADDR_B32, addr=v[10], data0=v[0], data1=v[1], vdst=v[0], offset0=2, offset1=5),
s_waitcnt(lgkmcnt=0),
s_waitcnt_lgkmcnt(sdst=NULL, simm16=0),
DS(DSOp.DS_LOAD_2ADDR_B32, addr=v[10], vdst=v[2:3], offset0=2, offset1=5),
s_waitcnt(lgkmcnt=0),
s_waitcnt_lgkmcnt(sdst=NULL, simm16=0),
]
st = run_program(instructions, n_lanes=1)
self.assertEqual(st.vgpr[0][2], 0x11111111, "v2 should have value from offset 8 (2*4)")
@ -87,9 +87,9 @@ class TestDS2AddrMore(unittest.TestCase):
s_mov_b32(s[2], 0x44444444),
v_mov_b32_e32(v[0], s[2]),
ds_store_b32(addr=v[10], data0=v[0], offset0=12),
s_waitcnt(lgkmcnt=0),
s_waitcnt_lgkmcnt(sdst=NULL, simm16=0),
DS(DSOp.DS_LOAD_2ADDR_B64, addr=v[10], vdst=v[4:7], offset0=0, offset1=1),
s_waitcnt(lgkmcnt=0),
s_waitcnt_lgkmcnt(sdst=NULL, simm16=0),
]
st = run_program(instructions, n_lanes=1)
self.assertEqual(st.vgpr[0][4], 0x11111111, "v4 should be 0x11111111")
@ -106,11 +106,11 @@ class TestDS2AddrMore(unittest.TestCase):
s_mov_b32(s[2], 0xBBBBBBBB),
v_mov_b32_e32(v[1], s[2]),
DS(DSOp.DS_STORE_2ADDR_B32, addr=v[10], data0=v[0], data1=v[1], vdst=v[0], offset0=0, offset1=1),
s_waitcnt(lgkmcnt=0),
s_waitcnt_lgkmcnt(sdst=NULL, simm16=0),
s_mov_b32(s[2], 0xDEADBEEF),
v_mov_b32_e32(v[4], s[2]), # Sentinel
DS(DSOp.DS_LOAD_2ADDR_B32, addr=v[10], vdst=v[2:3], offset0=0, offset1=1),
s_waitcnt(lgkmcnt=0),
s_waitcnt_lgkmcnt(sdst=NULL, simm16=0),
]
st = run_program(instructions, n_lanes=1)
self.assertEqual(st.vgpr[0][2], 0xAAAAAAAA)
@ -137,11 +137,11 @@ class TestDS2AddrMore(unittest.TestCase):
s_mov_b32(s[2], 0xDDDDDDDD),
v_mov_b32_e32(v[0], s[2]),
ds_store_b32(addr=v[10], data0=v[0], offset0=12),
s_waitcnt(lgkmcnt=0),
s_waitcnt_lgkmcnt(sdst=NULL, simm16=0),
# addr=v[4] overlaps vdst=v[4:7]
v_mov_b32_e32(v[4], 0),
DS(DSOp.DS_LOAD_2ADDR_B64, addr=v[4], vdst=v[4:7], offset0=0, offset1=1),
s_waitcnt(lgkmcnt=0),
s_waitcnt_lgkmcnt(sdst=NULL, simm16=0),
]
st = run_program(instructions, n_lanes=1)
self.assertEqual(st.vgpr[0][4], 0xAAAAAAAA, "v4 = LDS[0:4]")
@ -159,11 +159,11 @@ class TestDS2AddrMore(unittest.TestCase):
s_mov_b32(s[2], 0xBBBBBBBB),
v_mov_b32_e32(v[0], s[2]),
ds_store_b32(addr=v[10], data0=v[0], offset0=4),
s_waitcnt(lgkmcnt=0),
s_waitcnt_lgkmcnt(sdst=NULL, simm16=0),
# addr=v[2] overlaps vdst=v[2:3]
v_mov_b32_e32(v[2], 0),
DS(DSOp.DS_LOAD_2ADDR_B32, addr=v[2], vdst=v[2:3], offset0=0, offset1=1),
s_waitcnt(lgkmcnt=0),
s_waitcnt_lgkmcnt(sdst=NULL, simm16=0),
]
st = run_program(instructions, n_lanes=1)
self.assertEqual(st.vgpr[0][2], 0xAAAAAAAA, "v2 = LDS[0:4]")
@ -178,11 +178,11 @@ class TestDS2AddrMore(unittest.TestCase):
s_mov_b32(s[2], 0xCAFEBABE),
v_mov_b32_e32(v[1], s[2]),
ds_store_b64(addr=v[10], data0=v[0:1], offset0=0),
s_waitcnt(lgkmcnt=0),
s_waitcnt_lgkmcnt(sdst=NULL, simm16=0),
s_mov_b32(s[2], 0x12345678),
v_mov_b32_e32(v[4], s[2]), # Sentinel
ds_load_b64(addr=v[10], vdst=v[2:3], offset0=0),
s_waitcnt(lgkmcnt=0),
s_waitcnt_lgkmcnt(sdst=NULL, simm16=0),
]
st = run_program(instructions, n_lanes=1)
self.assertEqual(st.vgpr[0][2], 0xDEADBEEF)
@ -204,9 +204,9 @@ class TestDSB96(unittest.TestCase):
s_mov_b32(s[0], 0x33333333),
v_mov_b32_e32(v[2], s[0]),
ds_store_b96(addr=v[10], data0=v[0:2]),
s_waitcnt(lgkmcnt=0),
s_waitcnt_lgkmcnt(sdst=NULL, simm16=0),
ds_load_b96(addr=v[10], vdst=v[4:6]),
s_waitcnt(lgkmcnt=0),
s_waitcnt_lgkmcnt(sdst=NULL, simm16=0),
]
st = run_program(instructions, n_lanes=1)
self.assertEqual(st.vgpr[0][4], 0x11111111, "v4 should have first dword")
@ -224,9 +224,9 @@ class TestDSB96(unittest.TestCase):
s_mov_b32(s[0], 0xCCCCCCCC),
v_mov_b32_e32(v[2], s[0]),
DS(DSOp.DS_STORE_B96, addr=v[10], data0=v[0:2], offset0=12),
s_waitcnt(lgkmcnt=0),
s_waitcnt_lgkmcnt(sdst=NULL, simm16=0),
DS(DSOp.DS_LOAD_B96, addr=v[10], vdst=v[4:6], offset0=12),
s_waitcnt(lgkmcnt=0),
s_waitcnt_lgkmcnt(sdst=NULL, simm16=0),
]
st = run_program(instructions, n_lanes=1)
self.assertEqual(st.vgpr[0][4], 0xAAAAAAAA)
@ -250,9 +250,9 @@ class TestDSB128(unittest.TestCase):
s_mov_b32(s[0], 0x44444444),
v_mov_b32_e32(v[3], s[0]),
ds_store_b128(addr=v[10], data0=v[0:3]),
s_waitcnt(lgkmcnt=0),
s_waitcnt_lgkmcnt(sdst=NULL, simm16=0),
ds_load_b128(addr=v[10], vdst=v[4:7]),
s_waitcnt(lgkmcnt=0),
s_waitcnt_lgkmcnt(sdst=NULL, simm16=0),
]
st = run_program(instructions, n_lanes=1)
self.assertEqual(st.vgpr[0][4], 0x11111111, "v4 should have first dword")
@ -273,9 +273,9 @@ class TestDSB128(unittest.TestCase):
s_mov_b32(s[0], 0xDDDDDDDD),
v_mov_b32_e32(v[3], s[0]),
DS(DSOp.DS_STORE_B128, addr=v[10], data0=v[0:3], offset0=16),
s_waitcnt(lgkmcnt=0),
s_waitcnt_lgkmcnt(sdst=NULL, simm16=0),
DS(DSOp.DS_LOAD_B128, addr=v[10], vdst=v[4:7], offset0=16),
s_waitcnt(lgkmcnt=0),
s_waitcnt_lgkmcnt(sdst=NULL, simm16=0),
]
st = run_program(instructions, n_lanes=1)
self.assertEqual(st.vgpr[0][4], 0xAAAAAAAA)
@ -294,13 +294,13 @@ class TestDSAtomic(unittest.TestCase):
s_mov_b32(s[2], 100),
v_mov_b32_e32(v[0], s[2]),
ds_store_b32(addr=v[10], data0=v[0], offset0=0),
s_waitcnt(lgkmcnt=0),
s_waitcnt_lgkmcnt(sdst=NULL, simm16=0),
s_mov_b32(s[2], 200),
v_mov_b32_e32(v[1], s[2]),
ds_max_rtn_u32(addr=v[10], data0=v[1], vdst=v[2], offset0=0),
s_waitcnt(lgkmcnt=0),
s_waitcnt_lgkmcnt(sdst=NULL, simm16=0),
ds_load_b32(addr=v[10], vdst=v[3], offset0=0),
s_waitcnt(lgkmcnt=0),
s_waitcnt_lgkmcnt(sdst=NULL, simm16=0),
]
st = run_program(instructions, n_lanes=1)
self.assertEqual(st.vgpr[0][2], 100, "v2 should have old value (100)")
@ -313,13 +313,13 @@ class TestDSAtomic(unittest.TestCase):
s_mov_b32(s[2], 200),
v_mov_b32_e32(v[0], s[2]),
ds_store_b32(addr=v[10], data0=v[0], offset0=0),
s_waitcnt(lgkmcnt=0),
s_waitcnt_lgkmcnt(sdst=NULL, simm16=0),
s_mov_b32(s[2], 100),
v_mov_b32_e32(v[1], s[2]),
ds_min_rtn_u32(addr=v[10], data0=v[1], vdst=v[2], offset0=0),
s_waitcnt(lgkmcnt=0),
s_waitcnt_lgkmcnt(sdst=NULL, simm16=0),
ds_load_b32(addr=v[10], vdst=v[3], offset0=0),
s_waitcnt(lgkmcnt=0),
s_waitcnt_lgkmcnt(sdst=NULL, simm16=0),
]
st = run_program(instructions, n_lanes=1)
self.assertEqual(st.vgpr[0][2], 200)
@ -332,13 +332,13 @@ class TestDSAtomic(unittest.TestCase):
s_mov_b32(s[2], 0xFF00FF00),
v_mov_b32_e32(v[0], s[2]),
ds_store_b32(addr=v[10], data0=v[0], offset0=0),
s_waitcnt(lgkmcnt=0),
s_waitcnt_lgkmcnt(sdst=NULL, simm16=0),
s_mov_b32(s[2], 0xFFFF0000),
v_mov_b32_e32(v[1], s[2]),
ds_and_rtn_b32(addr=v[10], data0=v[1], vdst=v[2], offset0=0),
s_waitcnt(lgkmcnt=0),
s_waitcnt_lgkmcnt(sdst=NULL, simm16=0),
ds_load_b32(addr=v[10], vdst=v[3], offset0=0),
s_waitcnt(lgkmcnt=0),
s_waitcnt_lgkmcnt(sdst=NULL, simm16=0),
]
st = run_program(instructions, n_lanes=1)
self.assertEqual(st.vgpr[0][2], 0xFF00FF00)
@ -351,13 +351,13 @@ class TestDSAtomic(unittest.TestCase):
s_mov_b32(s[2], 0x00FF0000),
v_mov_b32_e32(v[0], s[2]),
ds_store_b32(addr=v[10], data0=v[0], offset0=0),
s_waitcnt(lgkmcnt=0),
s_waitcnt_lgkmcnt(sdst=NULL, simm16=0),
s_mov_b32(s[2], 0x000000FF),
v_mov_b32_e32(v[1], s[2]),
ds_or_rtn_b32(addr=v[10], data0=v[1], vdst=v[2], offset0=0),
s_waitcnt(lgkmcnt=0),
s_waitcnt_lgkmcnt(sdst=NULL, simm16=0),
ds_load_b32(addr=v[10], vdst=v[3], offset0=0),
s_waitcnt(lgkmcnt=0),
s_waitcnt_lgkmcnt(sdst=NULL, simm16=0),
]
st = run_program(instructions, n_lanes=1)
self.assertEqual(st.vgpr[0][2], 0x00FF0000)
@ -370,13 +370,13 @@ class TestDSAtomic(unittest.TestCase):
s_mov_b32(s[2], 0xAAAAAAAA),
v_mov_b32_e32(v[0], s[2]),
ds_store_b32(addr=v[10], data0=v[0], offset0=0),
s_waitcnt(lgkmcnt=0),
s_waitcnt_lgkmcnt(sdst=NULL, simm16=0),
s_mov_b32(s[2], 0xFFFFFFFF),
v_mov_b32_e32(v[1], s[2]),
ds_xor_rtn_b32(addr=v[10], data0=v[1], vdst=v[2], offset0=0),
s_waitcnt(lgkmcnt=0),
s_waitcnt_lgkmcnt(sdst=NULL, simm16=0),
ds_load_b32(addr=v[10], vdst=v[3], offset0=0),
s_waitcnt(lgkmcnt=0),
s_waitcnt_lgkmcnt(sdst=NULL, simm16=0),
]
st = run_program(instructions, n_lanes=1)
self.assertEqual(st.vgpr[0][2], 0xAAAAAAAA)
@ -389,13 +389,13 @@ class TestDSAtomic(unittest.TestCase):
s_mov_b32(s[2], 5),
v_mov_b32_e32(v[0], s[2]),
ds_store_b32(addr=v[10], data0=v[0], offset0=0),
s_waitcnt(lgkmcnt=0),
s_waitcnt_lgkmcnt(sdst=NULL, simm16=0),
s_mov_b32(s[2], 10), # limit
v_mov_b32_e32(v[1], s[2]),
ds_inc_rtn_u32(addr=v[10], data0=v[1], vdst=v[2], offset0=0),
s_waitcnt(lgkmcnt=0),
s_waitcnt_lgkmcnt(sdst=NULL, simm16=0),
ds_load_b32(addr=v[10], vdst=v[3], offset0=0),
s_waitcnt(lgkmcnt=0),
s_waitcnt_lgkmcnt(sdst=NULL, simm16=0),
]
st = run_program(instructions, n_lanes=1)
self.assertEqual(st.vgpr[0][2], 5)
@ -408,13 +408,13 @@ class TestDSAtomic(unittest.TestCase):
s_mov_b32(s[2], 5),
v_mov_b32_e32(v[0], s[2]),
ds_store_b32(addr=v[10], data0=v[0], offset0=0),
s_waitcnt(lgkmcnt=0),
s_waitcnt_lgkmcnt(sdst=NULL, simm16=0),
s_mov_b32(s[2], 10), # limit
v_mov_b32_e32(v[1], s[2]),
ds_dec_rtn_u32(addr=v[10], data0=v[1], vdst=v[2], offset0=0),
s_waitcnt(lgkmcnt=0),
s_waitcnt_lgkmcnt(sdst=NULL, simm16=0),
ds_load_b32(addr=v[10], vdst=v[3], offset0=0),
s_waitcnt(lgkmcnt=0),
s_waitcnt_lgkmcnt(sdst=NULL, simm16=0),
]
st = run_program(instructions, n_lanes=1)
self.assertEqual(st.vgpr[0][2], 5)
@ -427,15 +427,15 @@ class TestDSAtomic(unittest.TestCase):
s_mov_b32(s[2], 100),
v_mov_b32_e32(v[0], s[2]),
ds_store_b32(addr=v[10], data0=v[0], offset0=0),
s_waitcnt(lgkmcnt=0),
s_waitcnt_lgkmcnt(sdst=NULL, simm16=0),
s_mov_b32(s[2], 200),
v_mov_b32_e32(v[1], s[2]), # new value
s_mov_b32(s[2], 100),
v_mov_b32_e32(v[2], s[2]), # compare = 100 (matches)
ds_cmpstore_b32(addr=v[10], data0=v[1], data1=v[2], vdst=v[3], offset0=0),
s_waitcnt(lgkmcnt=0),
s_waitcnt_lgkmcnt(sdst=NULL, simm16=0),
ds_load_b32(addr=v[10], vdst=v[4], offset0=0),
s_waitcnt(lgkmcnt=0),
s_waitcnt_lgkmcnt(sdst=NULL, simm16=0),
]
st = run_program(instructions, n_lanes=1)
self.assertEqual(st.vgpr[0][4], 200)
@ -447,15 +447,15 @@ class TestDSAtomic(unittest.TestCase):
s_mov_b32(s[2], 100),
v_mov_b32_e32(v[0], s[2]),
ds_store_b32(addr=v[10], data0=v[0], offset0=0),
s_waitcnt(lgkmcnt=0),
s_waitcnt_lgkmcnt(sdst=NULL, simm16=0),
s_mov_b32(s[2], 200),
v_mov_b32_e32(v[1], s[2]), # new value
s_mov_b32(s[2], 50),
v_mov_b32_e32(v[2], s[2]), # compare = 50 (doesn't match)
ds_cmpstore_b32(addr=v[10], data0=v[1], data1=v[2], vdst=v[3], offset0=0),
s_waitcnt(lgkmcnt=0),
s_waitcnt_lgkmcnt(sdst=NULL, simm16=0),
ds_load_b32(addr=v[10], vdst=v[4], offset0=0),
s_waitcnt(lgkmcnt=0),
s_waitcnt_lgkmcnt(sdst=NULL, simm16=0),
]
st = run_program(instructions, n_lanes=1)
self.assertEqual(st.vgpr[0][4], 100)
@ -467,13 +467,13 @@ class TestDSAtomic(unittest.TestCase):
s_mov_b32(s[2], 100),
v_mov_b32_e32(v[0], s[2]),
ds_store_b32(addr=v[10], data0=v[0], offset0=0),
s_waitcnt(lgkmcnt=0),
s_waitcnt_lgkmcnt(sdst=NULL, simm16=0),
s_mov_b32(s[2], 200),
v_mov_b32_e32(v[1], s[2]),
ds_max_u32(addr=v[10], data0=v[1], vdst=v[2], offset0=0),
s_waitcnt(lgkmcnt=0),
s_waitcnt_lgkmcnt(sdst=NULL, simm16=0),
ds_load_b32(addr=v[10], vdst=v[3], offset0=0),
s_waitcnt(lgkmcnt=0),
s_waitcnt_lgkmcnt(sdst=NULL, simm16=0),
]
st = run_program(instructions, n_lanes=1)
self.assertEqual(st.vgpr[0][3], 200, "v3 should have max(100, 200) = 200")
@ -487,13 +487,13 @@ class TestDSAtomic(unittest.TestCase):
s_mov_b32(s[2], 100),
v_mov_b32_e32(v[0], s[2]),
ds_store_b32(addr=v[10], data0=v[0], offset0=0),
s_waitcnt(lgkmcnt=0),
s_waitcnt_lgkmcnt(sdst=NULL, simm16=0),
s_mov_b32(s[2], 50),
v_mov_b32_e32(v[1], s[2]),
ds_add_u32(addr=v[10], data0=v[1], vdst=v[2], offset0=0),
s_waitcnt(lgkmcnt=0),
s_waitcnt_lgkmcnt(sdst=NULL, simm16=0),
ds_load_b32(addr=v[10], vdst=v[3], offset0=0),
s_waitcnt(lgkmcnt=0),
s_waitcnt_lgkmcnt(sdst=NULL, simm16=0),
]
st = run_program(instructions, n_lanes=1)
self.assertEqual(st.vgpr[0][2], 0xDEADBEEF, "v2 should preserve sentinel")
@ -508,13 +508,13 @@ class TestDSAtomic(unittest.TestCase):
s_mov_b32(s[2], 100),
v_mov_b32_e32(v[0], s[2]),
ds_store_b32(addr=v[10], data0=v[0], offset0=0),
s_waitcnt(lgkmcnt=0),
s_waitcnt_lgkmcnt(sdst=NULL, simm16=0),
s_mov_b32(s[2], 50),
v_mov_b32_e32(v[1], s[2]),
ds_add_rtn_u32(addr=v[10], data0=v[1], vdst=v[2], offset0=0),
s_waitcnt(lgkmcnt=0),
s_waitcnt_lgkmcnt(sdst=NULL, simm16=0),
ds_load_b32(addr=v[10], vdst=v[3], offset0=0),
s_waitcnt(lgkmcnt=0),
s_waitcnt_lgkmcnt(sdst=NULL, simm16=0),
]
st = run_program(instructions, n_lanes=1)
self.assertEqual(st.vgpr[0][2], 100, "v2 should have old value (100)")
@ -527,13 +527,13 @@ class TestDSAtomic(unittest.TestCase):
s_mov_b32(s[2], 0), # Start at 0
v_mov_b32_e32(v[0], s[2]),
ds_store_b32(addr=v[10], data0=v[0], offset0=0),
s_waitcnt(lgkmcnt=0),
s_waitcnt_lgkmcnt(sdst=NULL, simm16=0),
s_mov_b32(s[2], 10), # limit
v_mov_b32_e32(v[1], s[2]),
ds_dec_rtn_u32(addr=v[10], data0=v[1], vdst=v[2], offset0=0),
s_waitcnt(lgkmcnt=0),
s_waitcnt_lgkmcnt(sdst=NULL, simm16=0),
ds_load_b32(addr=v[10], vdst=v[3], offset0=0),
s_waitcnt(lgkmcnt=0),
s_waitcnt_lgkmcnt(sdst=NULL, simm16=0),
]
st = run_program(instructions, n_lanes=1)
self.assertEqual(st.vgpr[0][2], 0, "v2 should have old value (0)")
@ -551,13 +551,13 @@ class TestDSStorexchg(unittest.TestCase):
s_mov_b32(s[0], 0xAAAAAAAA),
v_mov_b32_e32(v[0], s[0]),
ds_store_b32(addr=v[10], data0=v[0], offset0=0),
s_waitcnt(lgkmcnt=0),
s_waitcnt_lgkmcnt(sdst=NULL, simm16=0),
s_mov_b32(s[0], 0xBBBBBBBB),
v_mov_b32_e32(v[1], s[0]),
DS(DSOp.DS_STOREXCHG_RTN_B32, addr=v[10], data0=v[1], vdst=v[2], offset0=0),
s_waitcnt(lgkmcnt=0),
s_waitcnt_lgkmcnt(sdst=NULL, simm16=0),
ds_load_b32(addr=v[10], vdst=v[3], offset0=0),
s_waitcnt(lgkmcnt=0),
s_waitcnt_lgkmcnt(sdst=NULL, simm16=0),
]
st = run_program(instructions, n_lanes=1)
self.assertEqual(st.vgpr[0][2], 0xAAAAAAAA)
@ -576,9 +576,9 @@ class TestDSRegisterWidth(unittest.TestCase):
s_mov_b32(s[0], 0x11111111),
v_mov_b32_e32(v[2], s[0]), # sentinel
ds_store_b32(addr=v[0], data0=v[1], offset0=0),
s_waitcnt(lgkmcnt=0),
s_waitcnt_lgkmcnt(sdst=NULL, simm16=0),
ds_load_b32(addr=v[0], vdst=v[1], offset0=0),
s_waitcnt(lgkmcnt=0),
s_waitcnt_lgkmcnt(sdst=NULL, simm16=0),
]
st = run_program(instructions, n_lanes=1)
self.assertEqual(st.vgpr[0][1], 0xDEADBEEF)
@ -597,9 +597,9 @@ class TestDS2AddrStride64(unittest.TestCase):
s_mov_b32(s[0], 0xBBBBBBBB),
v_mov_b32_e32(v[1], s[0]),
DS(DSOp.DS_STORE_2ADDR_STRIDE64_B32, addr=v[10], data0=v[0], data1=v[1], vdst=v[0], offset0=1, offset1=2),
s_waitcnt(lgkmcnt=0),
s_waitcnt_lgkmcnt(sdst=NULL, simm16=0),
DS(DSOp.DS_LOAD_2ADDR_STRIDE64_B32, addr=v[10], vdst=v[2:3], offset0=1, offset1=2),
s_waitcnt(lgkmcnt=0),
s_waitcnt_lgkmcnt(sdst=NULL, simm16=0),
]
st = run_program(instructions, n_lanes=1)
self.assertEqual(st.vgpr[0][2], 0xAAAAAAAA, "v2 from addr 256")
@ -618,9 +618,9 @@ class TestDS2AddrStride64(unittest.TestCase):
s_mov_b32(s[0], 0x9ABCDEF0),
v_mov_b32_e32(v[3], s[0]),
DS(DSOp.DS_STORE_2ADDR_STRIDE64_B64, addr=v[10], data0=v[0:1], data1=v[2:3], vdst=v[0], offset0=1, offset1=2),
s_waitcnt(lgkmcnt=0),
s_waitcnt_lgkmcnt(sdst=NULL, simm16=0),
DS(DSOp.DS_LOAD_2ADDR_STRIDE64_B64, addr=v[10], vdst=v[4:7], offset0=1, offset1=2),
s_waitcnt(lgkmcnt=0),
s_waitcnt_lgkmcnt(sdst=NULL, simm16=0),
]
st = run_program(instructions, n_lanes=1)
self.assertEqual(st.vgpr[0][4], 0xDEADBEEF)
@ -637,15 +637,15 @@ class TestDS2AddrStride64(unittest.TestCase):
s_mov_b32(s[0], 0x22222222),
v_mov_b32_e32(v[1], s[0]),
DS(DSOp.DS_STORE_2ADDR_B32, addr=v[10], data0=v[0], data1=v[1], vdst=v[0], offset0=0, offset1=1),
s_waitcnt(lgkmcnt=0),
s_waitcnt_lgkmcnt(sdst=NULL, simm16=0),
s_mov_b32(s[0], 0xAAAAAAAA),
v_mov_b32_e32(v[2], s[0]),
s_mov_b32(s[0], 0xBBBBBBBB),
v_mov_b32_e32(v[3], s[0]),
DS(DSOp.DS_STOREXCHG_2ADDR_RTN_B32, addr=v[10], data0=v[2], data1=v[3], vdst=v[4:5], offset0=0, offset1=1),
s_waitcnt(lgkmcnt=0),
s_waitcnt_lgkmcnt(sdst=NULL, simm16=0),
DS(DSOp.DS_LOAD_2ADDR_B32, addr=v[10], vdst=v[6:7], offset0=0, offset1=1),
s_waitcnt(lgkmcnt=0),
s_waitcnt_lgkmcnt(sdst=NULL, simm16=0),
]
st = run_program(instructions, n_lanes=1)
self.assertEqual(st.vgpr[0][4], 0x11111111, "old val 0")
@ -662,15 +662,15 @@ class TestDS2AddrStride64(unittest.TestCase):
s_mov_b32(s[0], 0xCAFEBABE),
v_mov_b32_e32(v[1], s[0]), # initial high
DS(DSOp.DS_STORE_B64, addr=v[10], data0=v[0:1], vdst=v[0], offset0=0),
s_waitcnt(lgkmcnt=0),
s_waitcnt_lgkmcnt(sdst=NULL, simm16=0),
s_mov_b32(s[0], 0x12345678),
v_mov_b32_e32(v[2], s[0]), # new low
s_mov_b32(s[0], 0x9ABCDEF0),
v_mov_b32_e32(v[3], s[0]), # new high
DS(DSOp.DS_STOREXCHG_RTN_B64, addr=v[10], data0=v[2:3], vdst=v[4:5], offset0=0),
s_waitcnt(lgkmcnt=0),
s_waitcnt_lgkmcnt(sdst=NULL, simm16=0),
DS(DSOp.DS_LOAD_B64, addr=v[10], vdst=v[6:7], offset0=0),
s_waitcnt(lgkmcnt=0),
s_waitcnt_lgkmcnt(sdst=NULL, simm16=0),
]
st = run_program(instructions, n_lanes=1)
self.assertEqual(st.vgpr[0][4], 0xDEADBEEF, "v4 should have old low dword")
@ -687,9 +687,9 @@ class TestDS2AddrStride64(unittest.TestCase):
s_mov_b32(s[0], 0x22222222),
v_mov_b32_e32(v[1], s[0]),
DS(DSOp.DS_STORE_2ADDR_STRIDE64_B64, addr=v[10], data0=v[0:1], data1=v[0:1], vdst=v[0], offset0=1, offset1=2),
s_waitcnt(lgkmcnt=0),
s_waitcnt_lgkmcnt(sdst=NULL, simm16=0),
DS(DSOp.DS_LOAD_2ADDR_STRIDE64_B64, addr=v[10], vdst=v[2:5], offset0=1, offset1=2),
s_waitcnt(lgkmcnt=0),
s_waitcnt_lgkmcnt(sdst=NULL, simm16=0),
]
st = run_program(instructions, n_lanes=1)
self.assertEqual(st.vgpr[0][2], 0x11111111, "v2 should have val1 low")
@ -706,15 +706,15 @@ class TestDS2AddrStride64(unittest.TestCase):
s_mov_b32(s[0], 0x22222222),
v_mov_b32_e32(v[1], s[0]),
DS(DSOp.DS_STORE_2ADDR_STRIDE64_B32, addr=v[10], data0=v[0], data1=v[1], vdst=v[0], offset0=1, offset1=2),
s_waitcnt(lgkmcnt=0),
s_waitcnt_lgkmcnt(sdst=NULL, simm16=0),
s_mov_b32(s[0], 0xAAAAAAAA),
v_mov_b32_e32(v[2], s[0]),
s_mov_b32(s[0], 0xBBBBBBBB),
v_mov_b32_e32(v[3], s[0]),
DS(DSOp.DS_STOREXCHG_2ADDR_STRIDE64_RTN_B32, addr=v[10], data0=v[2], data1=v[3], vdst=v[4:5], offset0=1, offset1=2),
s_waitcnt(lgkmcnt=0),
s_waitcnt_lgkmcnt(sdst=NULL, simm16=0),
DS(DSOp.DS_LOAD_2ADDR_STRIDE64_B32, addr=v[10], vdst=v[6:7], offset0=1, offset1=2),
s_waitcnt(lgkmcnt=0),
s_waitcnt_lgkmcnt(sdst=NULL, simm16=0),
]
st = run_program(instructions, n_lanes=1)
self.assertEqual(st.vgpr[0][4], 0x11111111, "v4 should have old value")
@ -731,13 +731,13 @@ class TestDS2AddrStride64(unittest.TestCase):
s_mov_b32(s[0], 0x22222222),
v_mov_b32_e32(v[1], s[0]),
DS(DSOp.DS_STORE_2ADDR_STRIDE64_B64, addr=v[10], data0=v[0:1], data1=v[0:1], vdst=v[0], offset0=1, offset1=2),
s_waitcnt(lgkmcnt=0),
s_waitcnt_lgkmcnt(sdst=NULL, simm16=0),
s_mov_b32(s[0], 0xAAAAAAAA),
v_mov_b32_e32(v[6], s[0]),
s_mov_b32(s[0], 0xBBBBBBBB),
v_mov_b32_e32(v[7], s[0]),
DS(DSOp.DS_STOREXCHG_2ADDR_STRIDE64_RTN_B64, addr=v[10], data0=v[6:7], data1=v[6:7], vdst=v[8:11], offset0=1, offset1=2),
s_waitcnt(lgkmcnt=0),
s_waitcnt_lgkmcnt(sdst=NULL, simm16=0),
]
st = run_program(instructions, n_lanes=1)
self.assertEqual(st.vgpr[0][8], 0x11111111, "v8 should have old val1 low")
@ -755,14 +755,14 @@ class TestAtomicOrdering(unittest.TestCase):
v_mov_b32_e32(v[10], 0),
v_mov_b32_e32(v[0], 100),
DS(DSOp.DS_STORE_B32, addr=v[10], data0=v[0], vdst=v[0], offset0=0),
s_waitcnt(lgkmcnt=0),
s_waitcnt_lgkmcnt(sdst=NULL, simm16=0),
v_mov_b32_e32(v[1], 25),
DS(DSOp.DS_ADD_RTN_U32, addr=v[10], data0=v[1], vdst=v[2], offset0=0),
s_waitcnt(lgkmcnt=0),
s_waitcnt_lgkmcnt(sdst=NULL, simm16=0),
DS(DSOp.DS_ADD_RTN_U32, addr=v[10], data0=v[1], vdst=v[3], offset0=0),
s_waitcnt(lgkmcnt=0),
s_waitcnt_lgkmcnt(sdst=NULL, simm16=0),
DS(DSOp.DS_LOAD_B32, addr=v[10], vdst=v[4], offset0=0),
s_waitcnt(lgkmcnt=0),
s_waitcnt_lgkmcnt(sdst=NULL, simm16=0),
]
st = run_program(instructions, n_lanes=1)
self.assertEqual(st.vgpr[0][2], 100, "First add should return 100")
@ -780,7 +780,7 @@ class TestDsPermute(unittest.TestCase):
v_mov_b32_e32(v[0], 0), # addr = 0 (lane 0)
v_mov_b32_e32(v[1], 0xDEADBEEF), # data
ds_permute_b32(v[2], v[0], v[1]),
s_waitcnt(lgkmcnt=0),
s_waitcnt_lgkmcnt(sdst=NULL, simm16=0),
]
st = run_program(instructions, n_lanes=1)
# Lane 0 sends to lane 0, so lane 0 gets 0xDEADBEEF
@ -792,7 +792,7 @@ class TestDsPermute(unittest.TestCase):
v_mov_b32_e32(v[0], 0), # addr = 0 (read from lane 0)
v_mov_b32_e32(v[1], 0xCAFEBABE), # data in lane 0
ds_bpermute_b32(v[2], v[0], v[1]),
s_waitcnt(lgkmcnt=0),
s_waitcnt_lgkmcnt(sdst=NULL, simm16=0),
]
st = run_program(instructions, n_lanes=1)
# Lane 0 reads from lane 0's v[1]
@ -805,7 +805,7 @@ class TestDsPermute(unittest.TestCase):
v_mov_b32_e32(v[0], 0), # All lanes send to addr 0 (lane 0)
v_mov_b32_e32(v[1], 0x11111111), # All lanes send same data
ds_permute_b32(v[2], v[0], v[1]),
s_waitcnt(lgkmcnt=0),
s_waitcnt_lgkmcnt(sdst=NULL, simm16=0),
]
st = run_program(instructions, n_lanes=4)
# Lane 0 receives data (highest numbered active lane wins)
@ -826,7 +826,7 @@ class TestDsPermute(unittest.TestCase):
v_add_nc_u32_e32(v[1], s[0], v[255]),
# ds_bpermute: v[2] = v[1] from lane (lane_id ^ 1)
ds_bpermute_b32(vdst=v[2], addr=v[0], data0=v[1]),
s_waitcnt(lgkmcnt=0),
s_waitcnt_lgkmcnt(sdst=NULL, simm16=0),
]
st = run_program(instructions, n_lanes=32)
for lane in range(32):
@ -845,9 +845,9 @@ class TestDSSubDword(unittest.TestCase):
v_mov_b32_e32(v[1], 0xBEEF1234),
DS(DSOp.DS_STORE_B16, addr=v[0], data0=v[1], offset0=0),
DS(DSOp.DS_STORE_B16_D16_HI, addr=v[0], data0=v[1], offset0=2),
s_waitcnt(lgkmcnt=0),
s_waitcnt_lgkmcnt(sdst=NULL, simm16=0),
ds_load_b32(vdst=v[2], addr=v[0], offset0=0),
s_waitcnt(lgkmcnt=0),
s_waitcnt_lgkmcnt(sdst=NULL, simm16=0),
]
st = run_program(instructions, n_lanes=1)
self.assertEqual(st.vgpr[0][2], 0xBEEF1234, "lo=0x1234 at byte 0, hi=0xBEEF at byte 2")
@ -867,9 +867,9 @@ class TestDSLargeOffset(unittest.TestCase):
s_mov_b32(s[0], 0xDEADBEEF),
v_mov_b32_e32(v[0], s[0]),
ds_store_b32(addr=v[10], data0=v[0], offset0=0, offset1=1), # offset = 256
s_waitcnt(lgkmcnt=0),
s_waitcnt_lgkmcnt(sdst=NULL, simm16=0),
ds_load_b32(addr=v[10], vdst=v[1], offset0=0, offset1=1), # offset = 256
s_waitcnt(lgkmcnt=0),
s_waitcnt_lgkmcnt(sdst=NULL, simm16=0),
]
st = run_program(instructions, n_lanes=1)
self.assertEqual(st.vgpr[0][1], 0xDEADBEEF)
@ -881,9 +881,9 @@ class TestDSLargeOffset(unittest.TestCase):
s_mov_b32(s[0], 0xCAFEBABE),
v_mov_b32_e32(v[0], s[0]),
ds_store_b32(addr=v[10], data0=v[0], offset0=44, offset1=1), # offset = 300
s_waitcnt(lgkmcnt=0),
s_waitcnt_lgkmcnt(sdst=NULL, simm16=0),
ds_load_b32(addr=v[10], vdst=v[1], offset0=44, offset1=1), # offset = 300
s_waitcnt(lgkmcnt=0),
s_waitcnt_lgkmcnt(sdst=NULL, simm16=0),
]
st = run_program(instructions, n_lanes=1)
self.assertEqual(st.vgpr[0][1], 0xCAFEBABE)
@ -897,9 +897,9 @@ class TestDSLargeOffset(unittest.TestCase):
s_mov_b32(s[0], 0x22222222),
v_mov_b32_e32(v[1], s[0]),
ds_store_b64(addr=v[10], data0=v[0:1], offset0=0, offset1=2), # offset = 512
s_waitcnt(lgkmcnt=0),
s_waitcnt_lgkmcnt(sdst=NULL, simm16=0),
ds_load_b64(addr=v[10], vdst=v[2:3], offset0=0, offset1=2), # offset = 512
s_waitcnt(lgkmcnt=0),
s_waitcnt_lgkmcnt(sdst=NULL, simm16=0),
]
st = run_program(instructions, n_lanes=1)
self.assertEqual(st.vgpr[0][2], 0x11111111)
@ -916,11 +916,11 @@ class TestDSLargeOffset(unittest.TestCase):
# Store 0xAAAAAAAA at offset=0, 0xBBBBBBBB at offset=256
ds_store_b32(addr=v[10], data0=v[0], offset0=0, offset1=0), # offset = 0
ds_store_b32(addr=v[10], data0=v[1], offset0=0, offset1=1), # offset = 256
s_waitcnt(lgkmcnt=0),
s_waitcnt_lgkmcnt(sdst=NULL, simm16=0),
# Read back both
ds_load_b32(addr=v[10], vdst=v[2], offset0=0, offset1=0), # offset = 0
ds_load_b32(addr=v[10], vdst=v[3], offset0=0, offset1=1), # offset = 256
s_waitcnt(lgkmcnt=0),
s_waitcnt_lgkmcnt(sdst=NULL, simm16=0),
]
st = run_program(instructions, n_lanes=1)
self.assertEqual(st.vgpr[0][2], 0xAAAAAAAA, "offset=0 should read 0xAAAAAAAA")
@ -933,9 +933,9 @@ class TestDSLargeOffset(unittest.TestCase):
s_mov_b32(s[0], 0x12345678),
v_mov_b32_e32(v[0], s[0]),
ds_store_b32(addr=v[10], data0=v[0], offset0=192, offset1=1), # offset = 448
s_waitcnt(lgkmcnt=0),
s_waitcnt_lgkmcnt(sdst=NULL, simm16=0),
ds_load_b32(addr=v[10], vdst=v[1], offset0=192, offset1=1), # offset = 448
s_waitcnt(lgkmcnt=0),
s_waitcnt_lgkmcnt(sdst=NULL, simm16=0),
]
st = run_program(instructions, n_lanes=1)
self.assertEqual(st.vgpr[0][1], 0x12345678)
@ -949,9 +949,9 @@ class TestDSLargeOffset(unittest.TestCase):
s_mov_b32(s[0], 0x11223344),
v_mov_b32_e32(v[1], s[0]),
ds_store_b64(addr=v[10], data0=v[0:1], offset0=136, offset1=1), # offset = 392
s_waitcnt(lgkmcnt=0),
s_waitcnt_lgkmcnt(sdst=NULL, simm16=0),
ds_load_b64(addr=v[10], vdst=v[2:3], offset0=136, offset1=1), # offset = 392
s_waitcnt(lgkmcnt=0),
s_waitcnt_lgkmcnt(sdst=NULL, simm16=0),
]
st = run_program(instructions, n_lanes=1)
self.assertEqual(st.vgpr[0][2], 0xAABBCCDD)

View file

@ -12,10 +12,10 @@ class TestFlatAtomic(unittest.TestCase):
"""Helper to create atomic test instructions."""
instructions = [
s_load_b64(s[2:3], s[80:81], 0, soffset=SrcEnum.NULL),
s_waitcnt(lgkmcnt=0),
s_waitcnt_lgkmcnt(sdst=NULL, simm16=0),
v_mov_b32_e32(v[0], s[2]),
v_mov_b32_e32(v[1], s[3]),
] + setup_instrs + [atomic_instr, s_waitcnt(vmcnt=0),
] + setup_instrs + [atomic_instr, s_waitcnt_vmcnt(sdst=NULL, simm16=0),
v_mov_b32_e32(v[0], 0),
v_mov_b32_e32(v[1], 0),
s_mov_b32(s[2], 0),
@ -31,7 +31,7 @@ class TestFlatAtomic(unittest.TestCase):
s_mov_b32(s[0], 100),
v_mov_b32_e32(v[2], s[0]),
global_store_b32(addr=v[0:1], data=v[2], saddr=SrcEnum.NULL, offset=TEST_OFFSET),
s_waitcnt(vmcnt=0),
s_waitcnt_vmcnt(sdst=NULL, simm16=0),
s_mov_b32(s[0], 50),
v_mov_b32_e32(v[3], s[0]),
]
@ -47,7 +47,7 @@ class TestFlatAtomic(unittest.TestCase):
s_mov_b32(s[0], 0xAAAAAAAA),
v_mov_b32_e32(v[2], s[0]),
global_store_b32(addr=v[0:1], data=v[2], saddr=SrcEnum.NULL, offset=TEST_OFFSET),
s_waitcnt(vmcnt=0),
s_waitcnt_vmcnt(sdst=NULL, simm16=0),
s_mov_b32(s[0], 0xBBBBBBBB),
v_mov_b32_e32(v[3], s[0]),
]
@ -63,7 +63,7 @@ class TestFlatAtomic(unittest.TestCase):
s_mov_b32(s[0], 0xFF00FF00),
v_mov_b32_e32(v[2], s[0]),
global_store_b32(addr=v[0:1], data=v[2], saddr=SrcEnum.NULL, offset=TEST_OFFSET),
s_waitcnt(vmcnt=0),
s_waitcnt_vmcnt(sdst=NULL, simm16=0),
s_mov_b32(s[0], 0xFFFF0000),
v_mov_b32_e32(v[3], s[0]),
]
@ -79,7 +79,7 @@ class TestFlatAtomic(unittest.TestCase):
s_mov_b32(s[0], 0x00FF0000),
v_mov_b32_e32(v[2], s[0]),
global_store_b32(addr=v[0:1], data=v[2], saddr=SrcEnum.NULL, offset=TEST_OFFSET),
s_waitcnt(vmcnt=0),
s_waitcnt_vmcnt(sdst=NULL, simm16=0),
s_mov_b32(s[0], 0x0000FF00),
v_mov_b32_e32(v[3], s[0]),
]
@ -95,7 +95,7 @@ class TestFlatAtomic(unittest.TestCase):
s_mov_b32(s[0], 10),
v_mov_b32_e32(v[2], s[0]),
global_store_b32(addr=v[0:1], data=v[2], saddr=SrcEnum.NULL, offset=TEST_OFFSET),
s_waitcnt(vmcnt=0),
s_waitcnt_vmcnt(sdst=NULL, simm16=0),
s_mov_b32(s[0], 100), # threshold
v_mov_b32_e32(v[3], s[0]),
]
@ -111,7 +111,7 @@ class TestFlatAtomic(unittest.TestCase):
s_mov_b32(s[0], 10),
v_mov_b32_e32(v[2], s[0]),
global_store_b32(addr=v[0:1], data=v[2], saddr=SrcEnum.NULL, offset=TEST_OFFSET),
s_waitcnt(vmcnt=0),
s_waitcnt_vmcnt(sdst=NULL, simm16=0),
s_mov_b32(s[0], 100),
v_mov_b32_e32(v[3], s[0]),
]
@ -127,7 +127,7 @@ class TestFlatAtomic(unittest.TestCase):
s_mov_b32(s[0], 100),
v_mov_b32_e32(v[2], s[0]),
global_store_b32(addr=v[0:1], data=v[2], saddr=SrcEnum.NULL, offset=TEST_OFFSET),
s_waitcnt(vmcnt=0),
s_waitcnt_vmcnt(sdst=NULL, simm16=0),
s_mov_b32(s[0], 30),
v_mov_b32_e32(v[3], s[0]), # sub 30
]
@ -143,7 +143,7 @@ class TestFlatAtomic(unittest.TestCase):
s_mov_b32(s[0], 0xAAAAAAAA),
v_mov_b32_e32(v[2], s[0]),
global_store_b32(addr=v[0:1], data=v[2], saddr=SrcEnum.NULL, offset=TEST_OFFSET),
s_waitcnt(vmcnt=0),
s_waitcnt_vmcnt(sdst=NULL, simm16=0),
s_mov_b32(s[0], 0xFFFFFFFF),
v_mov_b32_e32(v[3], s[0]), # XOR mask
]
@ -159,7 +159,7 @@ class TestFlatAtomic(unittest.TestCase):
s_mov_b32(s[0], 100),
v_mov_b32_e32(v[2], s[0]),
global_store_b32(addr=v[0:1], data=v[2], saddr=SrcEnum.NULL, offset=TEST_OFFSET),
s_waitcnt(vmcnt=0),
s_waitcnt_vmcnt(sdst=NULL, simm16=0),
s_mov_b32(s[0], 50),
v_mov_b32_e32(v[3], s[0]), # compare value (smaller)
]
@ -175,7 +175,7 @@ class TestFlatAtomic(unittest.TestCase):
s_mov_b32(s[0], 50),
v_mov_b32_e32(v[2], s[0]),
global_store_b32(addr=v[0:1], data=v[2], saddr=SrcEnum.NULL, offset=TEST_OFFSET),
s_waitcnt(vmcnt=0),
s_waitcnt_vmcnt(sdst=NULL, simm16=0),
s_mov_b32(s[0], 100),
v_mov_b32_e32(v[3], s[0]), # compare value (larger)
]
@ -194,7 +194,7 @@ class TestFlatAtomic(unittest.TestCase):
s_mov_b32(s[0], 0xCAFEBABE),
v_mov_b32_e32(v[3], s[0]),
global_store_b64(addr=v[0:1], data=v[2:3], saddr=SrcEnum.NULL, offset=TEST_OFFSET),
s_waitcnt(vmcnt=0),
s_waitcnt_vmcnt(sdst=NULL, simm16=0),
# Threshold: 0xFFFFFFFF_FFFFFFFF
s_mov_b32(s[0], 0xFFFFFFFF),
v_mov_b32_e32(v[4], s[0]),
@ -215,7 +215,7 @@ class TestFlatAtomic(unittest.TestCase):
s_mov_b32(s[0], 0x22222222),
v_mov_b32_e32(v[3], s[0]),
global_store_b64(addr=v[0:1], data=v[2:3], saddr=SrcEnum.NULL, offset=TEST_OFFSET),
s_waitcnt(vmcnt=0),
s_waitcnt_vmcnt(sdst=NULL, simm16=0),
s_mov_b32(s[0], 0x00000001), # add 1
v_mov_b32_e32(v[4], s[0]),
s_mov_b32(s[0], 0x00000000),
@ -236,7 +236,7 @@ class TestFlatAtomic(unittest.TestCase):
s_mov_b32(s[0], 0xBBBBBBBB),
v_mov_b32_e32(v[3], s[0]),
global_store_b64(addr=v[0:1], data=v[2:3], saddr=SrcEnum.NULL, offset=TEST_OFFSET),
s_waitcnt(vmcnt=0),
s_waitcnt_vmcnt(sdst=NULL, simm16=0),
s_mov_b32(s[0], 0xCCCCCCCC),
v_mov_b32_e32(v[4], s[0]),
s_mov_b32(s[0], 0xDDDDDDDD),
@ -257,15 +257,15 @@ class TestFlatLoad(unittest.TestCase):
TEST_OFFSET = 2000
instructions = [
s_load_b64(s[2:3], s[80:81], 0, soffset=SrcEnum.NULL),
s_waitcnt(lgkmcnt=0),
s_waitcnt_lgkmcnt(sdst=NULL, simm16=0),
v_mov_b32_e32(v[0], s[2]),
v_mov_b32_e32(v[1], s[3]),
s_mov_b32(s[0], 0xDEADBEEF),
v_mov_b32_e32(v[2], s[0]),
global_store_b32(addr=v[0:1], data=v[2], saddr=SrcEnum.NULL, offset=TEST_OFFSET),
s_waitcnt(vmcnt=0),
s_waitcnt_vmcnt(sdst=NULL, simm16=0),
FLAT(FLATOp.FLAT_LOAD_B32, addr=v[0:1], vdst=v[4], saddr=SrcEnum.NULL, offset=TEST_OFFSET),
s_waitcnt(vmcnt=0),
s_waitcnt_vmcnt(sdst=NULL, simm16=0),
v_mov_b32_e32(v[0], 0),
v_mov_b32_e32(v[1], 0),
s_mov_b32(s[2], 0),
@ -279,7 +279,7 @@ class TestFlatLoad(unittest.TestCase):
TEST_OFFSET = 2000
instructions = [
s_load_b64(s[2:3], s[80:81], 0, soffset=SrcEnum.NULL),
s_waitcnt(lgkmcnt=0),
s_waitcnt_lgkmcnt(sdst=NULL, simm16=0),
v_mov_b32_e32(v[0], s[2]),
v_mov_b32_e32(v[1], s[3]),
s_mov_b32(s[0], 0xDEADBEEF),
@ -287,9 +287,9 @@ class TestFlatLoad(unittest.TestCase):
s_mov_b32(s[0], 0xCAFEBABE),
v_mov_b32_e32(v[3], s[0]),
global_store_b64(addr=v[0:1], data=v[2:3], saddr=SrcEnum.NULL, offset=TEST_OFFSET),
s_waitcnt(vmcnt=0),
s_waitcnt_vmcnt(sdst=NULL, simm16=0),
FLAT(FLATOp.FLAT_LOAD_B64, addr=v[0:1], vdst=v[4:5], saddr=SrcEnum.NULL, offset=TEST_OFFSET),
s_waitcnt(vmcnt=0),
s_waitcnt_vmcnt(sdst=NULL, simm16=0),
v_mov_b32_e32(v[0], 0),
v_mov_b32_e32(v[1], 0),
s_mov_b32(s[2], 0),
@ -304,7 +304,7 @@ class TestFlatLoad(unittest.TestCase):
TEST_OFFSET = 2000
instructions = [
s_load_b64(s[2:3], s[80:81], 0, soffset=SrcEnum.NULL),
s_waitcnt(lgkmcnt=0),
s_waitcnt_lgkmcnt(sdst=NULL, simm16=0),
v_mov_b32_e32(v[0], s[2]),
v_mov_b32_e32(v[1], s[3]),
s_mov_b32(s[0], 0x11111111),
@ -314,9 +314,9 @@ class TestFlatLoad(unittest.TestCase):
s_mov_b32(s[0], 0x33333333),
v_mov_b32_e32(v[4], s[0]),
global_store_b96(addr=v[0:1], data=v[2:4], saddr=SrcEnum.NULL, offset=TEST_OFFSET),
s_waitcnt(vmcnt=0),
s_waitcnt_vmcnt(sdst=NULL, simm16=0),
FLAT(FLATOp.FLAT_LOAD_B96, addr=v[0:1], vdst=v[5:7], saddr=SrcEnum.NULL, offset=TEST_OFFSET),
s_waitcnt(vmcnt=0),
s_waitcnt_vmcnt(sdst=NULL, simm16=0),
v_mov_b32_e32(v[0], 0),
v_mov_b32_e32(v[1], 0),
s_mov_b32(s[2], 0),
@ -332,7 +332,7 @@ class TestFlatLoad(unittest.TestCase):
TEST_OFFSET = 2000
instructions = [
s_load_b64(s[2:3], s[80:81], 0, soffset=SrcEnum.NULL),
s_waitcnt(lgkmcnt=0),
s_waitcnt_lgkmcnt(sdst=NULL, simm16=0),
v_mov_b32_e32(v[0], s[2]),
v_mov_b32_e32(v[1], s[3]),
s_mov_b32(s[0], 0x11111111),
@ -344,9 +344,9 @@ class TestFlatLoad(unittest.TestCase):
s_mov_b32(s[0], 0x44444444),
v_mov_b32_e32(v[5], s[0]),
global_store_b128(addr=v[0:1], data=v[2:5], saddr=SrcEnum.NULL, offset=TEST_OFFSET),
s_waitcnt(vmcnt=0),
s_waitcnt_vmcnt(sdst=NULL, simm16=0),
FLAT(FLATOp.FLAT_LOAD_B128, addr=v[0:1], vdst=v[6:9], saddr=SrcEnum.NULL, offset=TEST_OFFSET),
s_waitcnt(vmcnt=0),
s_waitcnt_vmcnt(sdst=NULL, simm16=0),
v_mov_b32_e32(v[0], 0),
v_mov_b32_e32(v[1], 0),
s_mov_b32(s[2], 0),

View file

@ -12,10 +12,10 @@ class TestGlobalAtomic(unittest.TestCase):
"""Helper to create atomic test instructions."""
instructions = [
s_load_b64(s[2:3], s[80:81], 0, soffset=SrcEnum.NULL),
s_waitcnt(lgkmcnt=0),
s_waitcnt_lgkmcnt(sdst=NULL, simm16=0),
v_mov_b32_e32(v[0], s[2]),
v_mov_b32_e32(v[1], s[3]),
] + setup_instrs + [atomic_instr, s_waitcnt(vmcnt=0),
] + setup_instrs + [atomic_instr, s_waitcnt_vmcnt(sdst=NULL, simm16=0),
v_mov_b32_e32(v[0], 0),
v_mov_b32_e32(v[1], 0),
s_mov_b32(s[2], 0),
@ -31,7 +31,7 @@ class TestGlobalAtomic(unittest.TestCase):
s_mov_b32(s[0], 100),
v_mov_b32_e32(v[2], s[0]),
global_store_b32(addr=v[0:1], data=v[2], saddr=SrcEnum.NULL, offset=TEST_OFFSET),
s_waitcnt(vmcnt=0),
s_waitcnt_vmcnt(sdst=NULL, simm16=0),
s_mov_b32(s[0], 50),
v_mov_b32_e32(v[3], s[0]),
]
@ -49,7 +49,7 @@ class TestGlobalAtomic(unittest.TestCase):
s_mov_b32(s[0], 0x00000000),
v_mov_b32_e32(v[3], s[0]),
global_store_b64(addr=v[0:1], data=v[2:3], saddr=SrcEnum.NULL, offset=TEST_OFFSET),
s_waitcnt(vmcnt=0),
s_waitcnt_vmcnt(sdst=NULL, simm16=0),
s_mov_b32(s[0], 0x00000001),
v_mov_b32_e32(v[4], s[0]),
s_mov_b32(s[0], 0x00000000),
@ -70,7 +70,7 @@ class TestGlobalLoad(unittest.TestCase):
TEST_OFFSET = 2000
instructions = [
s_load_b64(s[2:3], s[80:81], 0, soffset=SrcEnum.NULL),
s_waitcnt(lgkmcnt=0),
s_waitcnt_lgkmcnt(sdst=NULL, simm16=0),
v_mov_b32_e32(v[0], s[2]),
v_mov_b32_e32(v[1], s[3]),
s_mov_b32(s[0], 0xAAAAAAAA),
@ -80,9 +80,9 @@ class TestGlobalLoad(unittest.TestCase):
s_mov_b32(s[0], 0xCCCCCCCC),
v_mov_b32_e32(v[4], s[0]),
global_store_b96(addr=v[0:1], data=v[2:4], saddr=SrcEnum.NULL, offset=TEST_OFFSET),
s_waitcnt(vmcnt=0),
s_waitcnt_vmcnt(sdst=NULL, simm16=0),
GLOBAL(GLOBALOp.GLOBAL_LOAD_B96, addr=v[0:1], vdst=v[5:7], saddr=SrcEnum.NULL, offset=TEST_OFFSET),
s_waitcnt(vmcnt=0),
s_waitcnt_vmcnt(sdst=NULL, simm16=0),
v_mov_b32_e32(v[0], 0),
v_mov_b32_e32(v[1], 0),
s_mov_b32(s[2], 0),
@ -98,7 +98,7 @@ class TestGlobalLoad(unittest.TestCase):
TEST_OFFSET = 2000
instructions = [
s_load_b64(s[2:3], s[80:81], 0, soffset=SrcEnum.NULL),
s_waitcnt(lgkmcnt=0),
s_waitcnt_lgkmcnt(sdst=NULL, simm16=0),
v_mov_b32_e32(v[0], s[2]),
v_mov_b32_e32(v[1], s[3]),
s_mov_b32(s[0], 0xDEADBEEF),
@ -110,9 +110,9 @@ class TestGlobalLoad(unittest.TestCase):
s_mov_b32(s[0], 0x9ABCDEF0),
v_mov_b32_e32(v[5], s[0]),
global_store_b128(addr=v[0:1], data=v[2:5], saddr=SrcEnum.NULL, offset=TEST_OFFSET),
s_waitcnt(vmcnt=0),
s_waitcnt_vmcnt(sdst=NULL, simm16=0),
GLOBAL(GLOBALOp.GLOBAL_LOAD_B128, addr=v[0:1], vdst=v[6:9], saddr=SrcEnum.NULL, offset=TEST_OFFSET),
s_waitcnt(vmcnt=0),
s_waitcnt_vmcnt(sdst=NULL, simm16=0),
v_mov_b32_e32(v[0], 0),
v_mov_b32_e32(v[1], 0),
s_mov_b32(s[2], 0),
@ -133,20 +133,20 @@ class TestGlobalStore(unittest.TestCase):
TEST_OFFSET = 256
instructions = [
s_load_b64(s[2:3], s[80:81], 0, soffset=SrcEnum.NULL),
s_waitcnt(lgkmcnt=0),
s_waitcnt_lgkmcnt(sdst=NULL, simm16=0),
# First store 0xDEADBEEF to memory
s_mov_b32(s[4], 0xDEADBEEF),
v_mov_b32_e32(v[2], s[4]),
v_mov_b32_e32(v[0], 0),
global_store_b32(addr=v[0], data=v[2], saddr=s[2:3], offset=TEST_OFFSET),
s_waitcnt(vmcnt=0),
s_waitcnt_vmcnt(sdst=NULL, simm16=0),
# Now store single byte 0x42 to same address (should only change byte 0)
v_mov_b32_e32(v[2], 0x42),
global_store_b8(addr=v[0], data=v[2], saddr=s[2:3], offset=TEST_OFFSET),
s_waitcnt(vmcnt=0),
s_waitcnt_vmcnt(sdst=NULL, simm16=0),
# Read back and check
GLOBAL(GLOBALOp.GLOBAL_LOAD_B32, addr=v[0], vdst=v[3], data=v[3], saddr=s[2:3], offset=TEST_OFFSET),
s_waitcnt(vmcnt=0),
s_waitcnt_vmcnt(sdst=NULL, simm16=0),
v_mov_b32_e32(v[0], v[3]),
s_mov_b32(s[2], 0),
s_mov_b32(s[3], 0),
@ -160,17 +160,17 @@ class TestGlobalStore(unittest.TestCase):
TEST_OFFSET = 256
instructions = [
s_load_b64(s[2:3], s[80:81], 0, soffset=SrcEnum.NULL),
s_waitcnt(lgkmcnt=0),
s_waitcnt_lgkmcnt(sdst=NULL, simm16=0),
s_mov_b32(s[4], 0xDEADBEEF),
v_mov_b32_e32(v[2], s[4]),
v_mov_b32_e32(v[0], 0),
global_store_b32(addr=v[0], data=v[2], saddr=s[2:3], offset=TEST_OFFSET),
s_waitcnt(vmcnt=0),
s_waitcnt_vmcnt(sdst=NULL, simm16=0),
v_mov_b32_e32(v[2], 0x42),
global_store_b8(addr=v[0], data=v[2], saddr=s[2:3], offset=TEST_OFFSET+1),
s_waitcnt(vmcnt=0),
s_waitcnt_vmcnt(sdst=NULL, simm16=0),
GLOBAL(GLOBALOp.GLOBAL_LOAD_B32, addr=v[0], vdst=v[3], data=v[3], saddr=s[2:3], offset=TEST_OFFSET),
s_waitcnt(vmcnt=0),
s_waitcnt_vmcnt(sdst=NULL, simm16=0),
v_mov_b32_e32(v[0], v[3]),
s_mov_b32(s[2], 0),
s_mov_b32(s[3], 0),
@ -183,18 +183,18 @@ class TestGlobalStore(unittest.TestCase):
TEST_OFFSET = 256
instructions = [
s_load_b64(s[2:3], s[80:81], 0, soffset=SrcEnum.NULL),
s_waitcnt(lgkmcnt=0),
s_waitcnt_lgkmcnt(sdst=NULL, simm16=0),
s_mov_b32(s[4], 0xDEADBEEF),
v_mov_b32_e32(v[2], s[4]),
v_mov_b32_e32(v[0], 0),
global_store_b32(addr=v[0], data=v[2], saddr=s[2:3], offset=TEST_OFFSET),
s_waitcnt(vmcnt=0),
s_waitcnt_vmcnt(sdst=NULL, simm16=0),
s_mov_b32(s[4], 0xCAFE),
v_mov_b32_e32(v[2], s[4]),
global_store_b16(addr=v[0], data=v[2], saddr=s[2:3], offset=TEST_OFFSET),
s_waitcnt(vmcnt=0),
s_waitcnt_vmcnt(sdst=NULL, simm16=0),
GLOBAL(GLOBALOp.GLOBAL_LOAD_B32, addr=v[0], vdst=v[3], data=v[3], saddr=s[2:3], offset=TEST_OFFSET),
s_waitcnt(vmcnt=0),
s_waitcnt_vmcnt(sdst=NULL, simm16=0),
v_mov_b32_e32(v[0], v[3]),
s_mov_b32(s[2], 0),
s_mov_b32(s[3], 0),
@ -207,18 +207,18 @@ class TestGlobalStore(unittest.TestCase):
TEST_OFFSET = 256
instructions = [
s_load_b64(s[2:3], s[80:81], 0, soffset=SrcEnum.NULL),
s_waitcnt(lgkmcnt=0),
s_waitcnt_lgkmcnt(sdst=NULL, simm16=0),
s_mov_b32(s[4], 0xDEADBEEF),
v_mov_b32_e32(v[2], s[4]),
v_mov_b32_e32(v[0], 0),
global_store_b32(addr=v[0], data=v[2], saddr=s[2:3], offset=TEST_OFFSET),
s_waitcnt(vmcnt=0),
s_waitcnt_vmcnt(sdst=NULL, simm16=0),
s_mov_b32(s[4], 0xCAFE),
v_mov_b32_e32(v[2], s[4]),
global_store_b16(addr=v[0], data=v[2], saddr=s[2:3], offset=TEST_OFFSET+2),
s_waitcnt(vmcnt=0),
s_waitcnt_vmcnt(sdst=NULL, simm16=0),
GLOBAL(GLOBALOp.GLOBAL_LOAD_B32, addr=v[0], vdst=v[3], data=v[3], saddr=s[2:3], offset=TEST_OFFSET),
s_waitcnt(vmcnt=0),
s_waitcnt_vmcnt(sdst=NULL, simm16=0),
v_mov_b32_e32(v[0], v[3]),
s_mov_b32(s[2], 0),
s_mov_b32(s[3], 0),
@ -231,19 +231,19 @@ class TestGlobalStore(unittest.TestCase):
TEST_OFFSET = 256
instructions = [
s_load_b64(s[2:3], s[80:81], 0, soffset=SrcEnum.NULL),
s_waitcnt(lgkmcnt=0),
s_waitcnt_lgkmcnt(sdst=NULL, simm16=0),
s_mov_b32(s[4], 0xDDCCBBAA),
v_mov_b32_e32(v[2], s[4]),
v_mov_b32_e32(v[0], 0),
global_store_b32(addr=v[0], data=v[2], saddr=s[2:3], offset=TEST_OFFSET),
s_waitcnt(vmcnt=0),
s_waitcnt_vmcnt(sdst=NULL, simm16=0),
# Store 0xBEEF at byte offset 1 (bytes 1-2)
s_mov_b32(s[4], 0xBEEF),
v_mov_b32_e32(v[2], s[4]),
global_store_b16(addr=v[0], data=v[2], saddr=s[2:3], offset=TEST_OFFSET+1),
s_waitcnt(vmcnt=0),
s_waitcnt_vmcnt(sdst=NULL, simm16=0),
GLOBAL(GLOBALOp.GLOBAL_LOAD_B32, addr=v[0], vdst=v[3], data=v[3], saddr=s[2:3], offset=TEST_OFFSET),
s_waitcnt(vmcnt=0),
s_waitcnt_vmcnt(sdst=NULL, simm16=0),
v_mov_b32_e32(v[0], v[3]),
s_mov_b32(s[2], 0),
s_mov_b32(s[3], 0),
@ -259,7 +259,7 @@ class TestGlobalStore(unittest.TestCase):
TEST_OFFSET = 256
instructions = [
s_load_b64(s[2:3], s[80:81], 0, soffset=SrcEnum.NULL),
s_waitcnt(lgkmcnt=0),
s_waitcnt_lgkmcnt(sdst=NULL, simm16=0),
# Initialize two consecutive words
s_mov_b32(s[4], 0xDDCCBBAA),
v_mov_b32_e32(v[2], s[4]),
@ -268,18 +268,18 @@ class TestGlobalStore(unittest.TestCase):
s_mov_b32(s[4], 0x44332211),
v_mov_b32_e32(v[2], s[4]),
global_store_b32(addr=v[0], data=v[2], saddr=s[2:3], offset=TEST_OFFSET+4),
s_waitcnt(vmcnt=0),
s_waitcnt_vmcnt(sdst=NULL, simm16=0),
# Store 0xBEEF at byte offset 3 (crosses word boundary)
# Low byte (0xEF) goes to byte 3 of first word
# High byte (0xBE) goes to byte 0 of second word
s_mov_b32(s[4], 0xBEEF),
v_mov_b32_e32(v[2], s[4]),
global_store_b16(addr=v[0], data=v[2], saddr=s[2:3], offset=TEST_OFFSET+3),
s_waitcnt(vmcnt=0),
s_waitcnt_vmcnt(sdst=NULL, simm16=0),
# Load back both words
GLOBAL(GLOBALOp.GLOBAL_LOAD_B32, addr=v[0], vdst=v[3], data=v[3], saddr=s[2:3], offset=TEST_OFFSET),
GLOBAL(GLOBALOp.GLOBAL_LOAD_B32, addr=v[0], vdst=v[4], data=v[4], saddr=s[2:3], offset=TEST_OFFSET+4),
s_waitcnt(vmcnt=0),
s_waitcnt_vmcnt(sdst=NULL, simm16=0),
v_mov_b32_e32(v[0], v[3]),
v_mov_b32_e32(v[1], v[4]),
s_mov_b32(s[2], 0),
@ -296,16 +296,16 @@ class TestGlobalStore(unittest.TestCase):
TEST_OFFSET = 256
instructions = [
s_load_b64(s[2:3], s[80:81], 0, soffset=SrcEnum.NULL),
s_waitcnt(lgkmcnt=0),
s_waitcnt_lgkmcnt(sdst=NULL, simm16=0),
s_mov_b32(s[4], 0xDEADBEEF),
s_mov_b32(s[5], 0xCAFEBABE),
v_mov_b32_e32(v[2], s[4]),
v_mov_b32_e32(v[3], s[5]),
v_mov_b32_e32(v[0], 0),
global_store_b64(addr=v[0], data=v[2:3], saddr=s[2:3], offset=TEST_OFFSET),
s_waitcnt(vmcnt=0),
s_waitcnt_vmcnt(sdst=NULL, simm16=0),
GLOBAL(GLOBALOp.GLOBAL_LOAD_B64, addr=v[0], vdst=v[4:5], data=v[4:5], saddr=s[2:3], offset=TEST_OFFSET),
s_waitcnt(vmcnt=0),
s_waitcnt_vmcnt(sdst=NULL, simm16=0),
v_mov_b32_e32(v[0], v[4]),
v_mov_b32_e32(v[1], v[5]),
s_mov_b32(s[2], 0),
@ -324,17 +324,17 @@ class TestD16HiLoads(unittest.TestCase):
TEST_OFFSET = 256
instructions = [
s_load_b64(s[2:3], s[80:81], 0, soffset=SrcEnum.NULL),
s_waitcnt(lgkmcnt=0),
s_waitcnt_lgkmcnt(sdst=NULL, simm16=0),
v_mov_b32_e32(v[0], s[2]),
v_mov_b32_e32(v[1], s[3]),
s_mov_b32(s[4], 0xCAFE),
v_mov_b32_e32(v[2], s[4]),
global_store_b16(addr=v[0:1], data=v[2], saddr=SrcEnum.NULL, offset=TEST_OFFSET),
s_waitcnt(vmcnt=0),
s_waitcnt_vmcnt(sdst=NULL, simm16=0),
s_mov_b32(s[4], 0x0000BEEF),
v_mov_b32_e32(v[3], s[4]),
GLOBAL(GLOBALOp.GLOBAL_LOAD_D16_HI_B16, addr=v[0:1], vdst=v[3], data=v[3], saddr=SrcEnum.NULL, offset=TEST_OFFSET),
s_waitcnt(vmcnt=0),
s_waitcnt_vmcnt(sdst=NULL, simm16=0),
v_mov_b32_e32(v[0], v[3]),
v_mov_b32_e32(v[1], 0),
s_mov_b32(s[2], 0),
@ -349,17 +349,17 @@ class TestD16HiLoads(unittest.TestCase):
TEST_OFFSET = 256
instructions = [
s_load_b64(s[2:3], s[80:81], 0, soffset=SrcEnum.NULL),
s_waitcnt(lgkmcnt=0),
s_waitcnt_lgkmcnt(sdst=NULL, simm16=0),
s_mov_b32(s[4], 0xCAFE),
v_mov_b32_e32(v[2], s[4]),
v_mov_b32_e32(v[3], 0),
global_store_b16(addr=v[3], data=v[2], saddr=s[2:3], offset=TEST_OFFSET),
s_waitcnt(vmcnt=0),
s_waitcnt_vmcnt(sdst=NULL, simm16=0),
s_mov_b32(s[4], 0x0000DEAD),
v_mov_b32_e32(v[0], s[4]), # data field - should NOT affect result
v_mov_b32_e32(v[1], 0), # vdst - low bits should be preserved
GLOBAL(GLOBALOp.GLOBAL_LOAD_D16_HI_B16, addr=v[1], vdst=v[1], data=v[0], saddr=s[2:3], offset=TEST_OFFSET),
s_waitcnt(vmcnt=0),
s_waitcnt_vmcnt(sdst=NULL, simm16=0),
v_mov_b32_e32(v[0], v[1]),
s_mov_b32(s[2], 0),
s_mov_b32(s[3], 0),
@ -373,19 +373,19 @@ class TestD16HiLoads(unittest.TestCase):
TEST_OFFSET = 256
instructions = [
s_load_b64(s[2:3], s[80:81], 0, soffset=SrcEnum.NULL),
s_waitcnt(lgkmcnt=0),
s_waitcnt_lgkmcnt(sdst=NULL, simm16=0),
s_mov_b32(s[4], 0xAB),
v_mov_b32_e32(v[2], s[4]),
v_mov_b32_e32(v[3], 0),
global_store_b8(addr=v[3], data=v[2], saddr=s[2:3], offset=TEST_OFFSET),
s_waitcnt(vmcnt=0),
s_waitcnt_vmcnt(sdst=NULL, simm16=0),
s_mov_b32(s[4], 0x0000DEAD),
v_mov_b32_e32(v[4], s[4]), # data field
s_mov_b32(s[4], 0x0000BEEF),
v_mov_b32_e32(v[5], s[4]), # vdst
v_mov_b32_e32(v[3], 0),
GLOBAL(GLOBALOp.GLOBAL_LOAD_D16_HI_U8, addr=v[3], vdst=v[5], data=v[4], saddr=s[2:3], offset=TEST_OFFSET),
s_waitcnt(vmcnt=0),
s_waitcnt_vmcnt(sdst=NULL, simm16=0),
v_mov_b32_e32(v[0], v[5]),
s_mov_b32(s[2], 0),
s_mov_b32(s[3], 0),
@ -399,15 +399,15 @@ class TestD16HiLoads(unittest.TestCase):
TEST_OFFSET = 256
instructions = [
s_load_b64(s[2:3], s[80:81], 0, soffset=SrcEnum.NULL),
s_waitcnt(lgkmcnt=0),
s_waitcnt_lgkmcnt(sdst=NULL, simm16=0),
s_mov_b32(s[4], 0xCAFE),
v_mov_b32_e32(v[2], s[4]),
v_mov_b32_e32(v[3], 0),
global_store_b16(addr=v[3], data=v[2], saddr=s[2:3], offset=TEST_OFFSET),
s_waitcnt(vmcnt=0),
s_waitcnt_vmcnt(sdst=NULL, simm16=0),
v_mov_b32_e32(v[1], 0),
GLOBAL(GLOBALOp.GLOBAL_LOAD_D16_HI_B16, addr=v[1], vdst=v[1], data=v[1], saddr=s[2:3], offset=TEST_OFFSET),
s_waitcnt(vmcnt=0),
s_waitcnt_vmcnt(sdst=NULL, simm16=0),
v_mov_b32_e32(v[0], v[1]),
s_mov_b32(s[2], 0),
s_mov_b32(s[3], 0),
@ -421,13 +421,13 @@ class TestD16HiLoads(unittest.TestCase):
TEST_OFFSET = 256
instructions = [
s_load_b64(s[2:3], s[80:81], 0, soffset=SrcEnum.NULL),
s_waitcnt(lgkmcnt=0),
s_waitcnt_lgkmcnt(sdst=NULL, simm16=0),
s_mov_b32(s[4], 0x01010101),
v_mov_b32_e32(v[10], s[4]),
v_mov_b32_e32(v[3], 0),
global_store_b32(addr=v[3], data=v[10], saddr=s[2:3], offset=TEST_OFFSET),
global_store_b32(addr=v[3], data=v[10], saddr=s[2:3], offset=TEST_OFFSET+4),
s_waitcnt(vmcnt=0),
s_waitcnt_vmcnt(sdst=NULL, simm16=0),
# Set v[0] to 0x0101 (simulating prior u16 load result)
s_mov_b32(s[4], 0x0101),
v_mov_b32_e32(v[0], s[4]),
@ -435,7 +435,7 @@ class TestD16HiLoads(unittest.TestCase):
v_mov_b32_e32(v[1], 0),
# Load using v[1] as addr AND vdst, but v[0] as data
GLOBAL(GLOBALOp.GLOBAL_LOAD_D16_HI_B16, addr=v[1], vdst=v[1], data=v[0], saddr=s[2:3], offset=TEST_OFFSET+6),
s_waitcnt(vmcnt=0),
s_waitcnt_vmcnt(sdst=NULL, simm16=0),
v_mov_b32_e32(v[0], v[1]),
s_mov_b32(s[2], 0),
s_mov_b32(s[3], 0),
@ -450,19 +450,19 @@ class TestD16HiLoads(unittest.TestCase):
TEST_OFFSET = 256
instructions = [
s_load_b64(s[2:3], s[80:81], 0, soffset=SrcEnum.NULL),
s_waitcnt(lgkmcnt=0),
s_waitcnt_lgkmcnt(sdst=NULL, simm16=0),
s_mov_b32(s[4], 0x80), # negative signed byte = -128
v_mov_b32_e32(v[2], s[4]),
v_mov_b32_e32(v[3], 0),
global_store_b8(addr=v[3], data=v[2], saddr=s[2:3], offset=TEST_OFFSET),
s_waitcnt(vmcnt=0),
s_waitcnt_vmcnt(sdst=NULL, simm16=0),
s_mov_b32(s[4], 0x0000DEAD),
v_mov_b32_e32(v[4], s[4]), # data field
s_mov_b32(s[4], 0x0000BEEF),
v_mov_b32_e32(v[5], s[4]), # vdst
v_mov_b32_e32(v[3], 0),
GLOBAL(GLOBALOp.GLOBAL_LOAD_D16_HI_I8, addr=v[3], vdst=v[5], data=v[4], saddr=s[2:3], offset=TEST_OFFSET),
s_waitcnt(vmcnt=0),
s_waitcnt_vmcnt(sdst=NULL, simm16=0),
v_mov_b32_e32(v[0], v[5]),
s_mov_b32(s[2], 0),
s_mov_b32(s[3], 0),
@ -477,7 +477,7 @@ class TestD16HiLoads(unittest.TestCase):
TEST_OFFSET = 256
instructions = [
s_load_b64(s[2:3], s[80:81], 0, soffset=SrcEnum.NULL),
s_waitcnt(lgkmcnt=0),
s_waitcnt_lgkmcnt(sdst=NULL, simm16=0),
s_mov_b32(s[4], 0x01010101),
v_mov_b32_e32(v[10], s[4]),
v_mov_b32_e32(v[11], s[4]),
@ -486,7 +486,7 @@ class TestD16HiLoads(unittest.TestCase):
v_mov_b32_e32(v[0], 0),
global_store_b64(addr=v[0], data=v[10:11], saddr=s[2:3], offset=TEST_OFFSET),
global_store_b8(addr=v[0], data=v[12], saddr=s[2:3], offset=TEST_OFFSET+8),
s_waitcnt(vmcnt=0),
s_waitcnt_vmcnt(sdst=NULL, simm16=0),
v_mov_b32_e32(v[2], 0),
v_mov_b32_e32(v[1], 0),
@ -494,7 +494,7 @@ class TestD16HiLoads(unittest.TestCase):
GLOBAL(GLOBALOp.GLOBAL_LOAD_D16_HI_B16, addr=v[1], vdst=v[1], data=v[1], saddr=s[2:3], offset=TEST_OFFSET+6),
GLOBAL(GLOBALOp.GLOBAL_LOAD_U8, addr=v[2], vdst=v[3], data=v[3], saddr=s[2:3], offset=TEST_OFFSET),
GLOBAL(GLOBALOp.GLOBAL_LOAD_U8, addr=v[2], vdst=v[4], data=v[4], saddr=s[2:3], offset=TEST_OFFSET+8),
s_waitcnt(vmcnt=0),
s_waitcnt_vmcnt(sdst=NULL, simm16=0),
v_and_b32_e32(v[5], 0xffff, v[0]),
v_lshlrev_b32_e32(v[0], 24, v[0]),
@ -503,10 +503,10 @@ class TestD16HiLoads(unittest.TestCase):
v_or_b32_e32(v[1], v[5], v[1]),
global_store_b64(addr=v[2], data=v[0:1], saddr=s[2:3], offset=TEST_OFFSET+16),
s_waitcnt(vmcnt=0),
s_waitcnt_vmcnt(sdst=NULL, simm16=0),
GLOBAL(GLOBALOp.GLOBAL_LOAD_B64, addr=v[2], vdst=v[6:7], data=v[6:7], saddr=s[2:3], offset=TEST_OFFSET+16),
s_waitcnt(vmcnt=0),
s_waitcnt_vmcnt(sdst=NULL, simm16=0),
v_mov_b32_e32(v[0], v[6]),
v_mov_b32_e32(v[1], v[7]),
s_mov_b32(s[2], 0),
@ -534,7 +534,7 @@ class TestGlobalOffset(unittest.TestCase):
"""Load from two different offsets and verify correct values."""
instructions = [
s_load_b64(s[2:3], s[80:81], 0, soffset=SrcEnum.NULL),
s_waitcnt(lgkmcnt=0),
s_waitcnt_lgkmcnt(sdst=NULL, simm16=0),
v_mov_b32_e32(v[0], s[2]),
v_mov_b32_e32(v[1], s[3]),
# Store 0xAAAAAAAA at offset 100
@ -545,12 +545,12 @@ class TestGlobalOffset(unittest.TestCase):
s_mov_b32(s[0], 0xBBBBBBBB),
v_mov_b32_e32(v[2], s[0]),
global_store_b32(addr=v[0:1], data=v[2], saddr=SrcEnum.NULL, offset=200),
s_waitcnt(vmcnt=0),
s_waitcnt_vmcnt(sdst=NULL, simm16=0),
# Load from offset 100 -> should get 0xAAAAAAAA
GLOBAL(GLOBALOp.GLOBAL_LOAD_B32, addr=v[0:1], vdst=v[3], saddr=SrcEnum.NULL, offset=100),
# Load from offset 200 -> should get 0xBBBBBBBB
GLOBAL(GLOBALOp.GLOBAL_LOAD_B32, addr=v[0:1], vdst=v[4], saddr=SrcEnum.NULL, offset=200),
s_waitcnt(vmcnt=0),
s_waitcnt_vmcnt(sdst=NULL, simm16=0),
v_mov_b32_e32(v[0], v[3]),
v_mov_b32_e32(v[1], v[4]),
s_mov_b32(s[2], 0),
@ -564,7 +564,7 @@ class TestGlobalOffset(unittest.TestCase):
"""Store to two different offsets and verify correct values."""
instructions = [
s_load_b64(s[2:3], s[80:81], 0, soffset=SrcEnum.NULL),
s_waitcnt(lgkmcnt=0),
s_waitcnt_lgkmcnt(sdst=NULL, simm16=0),
v_mov_b32_e32(v[0], s[2]),
v_mov_b32_e32(v[1], s[3]),
# Store 0x11111111 at offset 300
@ -575,11 +575,11 @@ class TestGlobalOffset(unittest.TestCase):
s_mov_b32(s[0], 0x22222222),
v_mov_b32_e32(v[3], s[0]),
global_store_b32(addr=v[0:1], data=v[3], saddr=SrcEnum.NULL, offset=400),
s_waitcnt(vmcnt=0),
s_waitcnt_vmcnt(sdst=NULL, simm16=0),
# Load back to verify
GLOBAL(GLOBALOp.GLOBAL_LOAD_B32, addr=v[0:1], vdst=v[4], saddr=SrcEnum.NULL, offset=300),
GLOBAL(GLOBALOp.GLOBAL_LOAD_B32, addr=v[0:1], vdst=v[5], saddr=SrcEnum.NULL, offset=400),
s_waitcnt(vmcnt=0),
s_waitcnt_vmcnt(sdst=NULL, simm16=0),
v_mov_b32_e32(v[0], v[4]),
v_mov_b32_e32(v[1], v[5]),
s_mov_b32(s[2], 0),
@ -596,7 +596,7 @@ class TestGlobalOffset(unittest.TestCase):
Load with offset -100 from vaddr pointing to base+300 -> should get 0xBBBB (at 200)."""
instructions = [
s_load_b64(s[2:3], s[80:81], 0, soffset=SrcEnum.NULL),
s_waitcnt(lgkmcnt=0),
s_waitcnt_lgkmcnt(sdst=NULL, simm16=0),
v_mov_b32_e32(v[0], s[2]),
v_mov_b32_e32(v[1], s[3]),
# Store 0xAAAAAAAA at offset 100, 0xBBBBBBBB at offset 200
@ -606,7 +606,7 @@ class TestGlobalOffset(unittest.TestCase):
s_mov_b32(s[0], 0xBBBBBBBB),
v_mov_b32_e32(v[2], s[0]),
global_store_b32(addr=v[0:1], data=v[2], saddr=SrcEnum.NULL, offset=200),
s_waitcnt(vmcnt=0),
s_waitcnt_vmcnt(sdst=NULL, simm16=0),
# vaddr = base+200, load with offset -100 -> should get value at 100
s_add_u32(s[4], s[2], 200),
s_addc_u32(s[5], s[3], 0),
@ -619,7 +619,7 @@ class TestGlobalOffset(unittest.TestCase):
v_mov_b32_e32(v[4], s[4]),
v_mov_b32_e32(v[5], s[5]),
GLOBAL(GLOBALOp.GLOBAL_LOAD_B32, addr=v[4:5], vdst=v[7], saddr=SrcEnum.NULL, offset=-100),
s_waitcnt(vmcnt=0),
s_waitcnt_vmcnt(sdst=NULL, simm16=0),
v_mov_b32_e32(v[0], v[6]),
v_mov_b32_e32(v[1], v[7]),
v_mov_b32_e32(v[4], 0),
@ -642,7 +642,7 @@ class TestGlobalOffset(unittest.TestCase):
Load with offset -100 from saddr pointing to base+300 -> should get 0xBBBB (at 200)."""
instructions = [
s_load_b64(s[2:3], s[80:81], 0, soffset=SrcEnum.NULL),
s_waitcnt(lgkmcnt=0),
s_waitcnt_lgkmcnt(sdst=NULL, simm16=0),
v_mov_b32_e32(v[0], 0),
# Store 0xAAAAAAAA at offset 100, 0xBBBBBBBB at offset 200
s_mov_b32(s[0], 0xAAAAAAAA),
@ -651,7 +651,7 @@ class TestGlobalOffset(unittest.TestCase):
s_mov_b32(s[0], 0xBBBBBBBB),
v_mov_b32_e32(v[2], s[0]),
global_store_b32(addr=v[0], data=v[2], saddr=s[2:3], offset=200),
s_waitcnt(vmcnt=0),
s_waitcnt_vmcnt(sdst=NULL, simm16=0),
# saddr = base+200, load with offset -100 -> should get value at 100
s_add_u32(s[4], s[2], 200),
s_addc_u32(s[5], s[3], 0),
@ -660,7 +660,7 @@ class TestGlobalOffset(unittest.TestCase):
s_add_u32(s[4], s[2], 300),
s_addc_u32(s[5], s[3], 0),
GLOBAL(GLOBALOp.GLOBAL_LOAD_B32, addr=v[0], vdst=v[7], saddr=s[4:5], offset=-100),
s_waitcnt(vmcnt=0),
s_waitcnt_vmcnt(sdst=NULL, simm16=0),
v_mov_b32_e32(v[0], v[6]),
v_mov_b32_e32(v[1], v[7]),
v_mov_b32_e32(v[6], 0),

View file

@ -13,16 +13,16 @@ class TestScratchStore(unittest.TestCase):
TEST_OFFSET = 256
instructions = [
s_load_b64(s[2:3], s[80:81], 0, soffset=SrcEnum.NULL),
s_waitcnt(lgkmcnt=0),
s_waitcnt_lgkmcnt(sdst=NULL, simm16=0),
s_mov_b32(s[4], 0xDEADBEEF),
v_mov_b32_e32(v[2], s[4]),
v_mov_b32_e32(v[0], 0),
# Store via scratch
scratch_store_b32(addr=v[0], data=v[2], saddr=SrcEnum.NULL, offset=TEST_OFFSET),
s_waitcnt(vmcnt=0),
s_waitcnt_vmcnt(sdst=NULL, simm16=0),
# Load back via scratch
scratch_load_b32(addr=v[0], vdst=v[3], saddr=SrcEnum.NULL, offset=TEST_OFFSET),
s_waitcnt(vmcnt=0),
s_waitcnt_vmcnt(sdst=NULL, simm16=0),
v_mov_b32_e32(v[0], v[3]),
s_mov_b32(s[2], 0),
s_mov_b32(s[3], 0),
@ -35,16 +35,16 @@ class TestScratchStore(unittest.TestCase):
TEST_OFFSET = 256
instructions = [
s_load_b64(s[2:3], s[80:81], 0, soffset=SrcEnum.NULL),
s_waitcnt(lgkmcnt=0),
s_waitcnt_lgkmcnt(sdst=NULL, simm16=0),
s_mov_b32(s[4], 0xDEADBEEF),
s_mov_b32(s[5], 0xCAFEBABE),
v_mov_b32_e32(v[2], s[4]),
v_mov_b32_e32(v[3], s[5]),
v_mov_b32_e32(v[0], 0),
scratch_store_b64(addr=v[0], data=v[2:3], saddr=SrcEnum.NULL, offset=TEST_OFFSET),
s_waitcnt(vmcnt=0),
s_waitcnt_vmcnt(sdst=NULL, simm16=0),
scratch_load_b64(addr=v[0], vdst=v[4:5], saddr=SrcEnum.NULL, offset=TEST_OFFSET),
s_waitcnt(vmcnt=0),
s_waitcnt_vmcnt(sdst=NULL, simm16=0),
v_mov_b32_e32(v[0], v[4]),
v_mov_b32_e32(v[1], v[5]),
s_mov_b32(s[2], 0),
@ -59,20 +59,20 @@ class TestScratchStore(unittest.TestCase):
TEST_OFFSET = 256
instructions = [
s_load_b64(s[2:3], s[80:81], 0, soffset=SrcEnum.NULL),
s_waitcnt(lgkmcnt=0),
s_waitcnt_lgkmcnt(sdst=NULL, simm16=0),
# First store full word
s_mov_b32(s[4], 0xDEADBEEF),
v_mov_b32_e32(v[2], s[4]),
v_mov_b32_e32(v[0], 0),
scratch_store_b32(addr=v[0], data=v[2], saddr=SrcEnum.NULL, offset=TEST_OFFSET),
s_waitcnt(vmcnt=0),
s_waitcnt_vmcnt(sdst=NULL, simm16=0),
# Store single byte
v_mov_b32_e32(v[2], 0x42),
scratch_store_b8(addr=v[0], data=v[2], saddr=SrcEnum.NULL, offset=TEST_OFFSET),
s_waitcnt(vmcnt=0),
s_waitcnt_vmcnt(sdst=NULL, simm16=0),
# Load back
scratch_load_b32(addr=v[0], vdst=v[3], saddr=SrcEnum.NULL, offset=TEST_OFFSET),
s_waitcnt(vmcnt=0),
s_waitcnt_vmcnt(sdst=NULL, simm16=0),
v_mov_b32_e32(v[0], v[3]),
s_mov_b32(s[2], 0),
s_mov_b32(s[3], 0),
@ -86,18 +86,18 @@ class TestScratchStore(unittest.TestCase):
TEST_OFFSET = 256
instructions = [
s_load_b64(s[2:3], s[80:81], 0, soffset=SrcEnum.NULL),
s_waitcnt(lgkmcnt=0),
s_waitcnt_lgkmcnt(sdst=NULL, simm16=0),
s_mov_b32(s[4], 0xDEADBEEF),
v_mov_b32_e32(v[2], s[4]),
v_mov_b32_e32(v[0], 0),
scratch_store_b32(addr=v[0], data=v[2], saddr=SrcEnum.NULL, offset=TEST_OFFSET),
s_waitcnt(vmcnt=0),
s_waitcnt_vmcnt(sdst=NULL, simm16=0),
s_mov_b32(s[4], 0xCAFE),
v_mov_b32_e32(v[2], s[4]),
scratch_store_b16(addr=v[0], data=v[2], saddr=SrcEnum.NULL, offset=TEST_OFFSET),
s_waitcnt(vmcnt=0),
s_waitcnt_vmcnt(sdst=NULL, simm16=0),
scratch_load_b32(addr=v[0], vdst=v[3], saddr=SrcEnum.NULL, offset=TEST_OFFSET),
s_waitcnt(vmcnt=0),
s_waitcnt_vmcnt(sdst=NULL, simm16=0),
v_mov_b32_e32(v[0], v[3]),
s_mov_b32(s[2], 0),
s_mov_b32(s[3], 0),
@ -114,7 +114,7 @@ class TestScratchLoad(unittest.TestCase):
TEST_OFFSET = 256
instructions = [
s_load_b64(s[2:3], s[80:81], 0, soffset=SrcEnum.NULL),
s_waitcnt(lgkmcnt=0),
s_waitcnt_lgkmcnt(sdst=NULL, simm16=0),
v_mov_b32_e32(v[0], 0),
s_mov_b32(s[4], 0xAAAAAAAA),
v_mov_b32_e32(v[2], s[4]),
@ -123,9 +123,9 @@ class TestScratchLoad(unittest.TestCase):
s_mov_b32(s[4], 0xCCCCCCCC),
v_mov_b32_e32(v[4], s[4]),
scratch_store_b96(addr=v[0], data=v[2:4], saddr=SrcEnum.NULL, offset=TEST_OFFSET),
s_waitcnt(vmcnt=0),
s_waitcnt_vmcnt(sdst=NULL, simm16=0),
scratch_load_b96(addr=v[0], vdst=v[5:7], saddr=SrcEnum.NULL, offset=TEST_OFFSET),
s_waitcnt(vmcnt=0),
s_waitcnt_vmcnt(sdst=NULL, simm16=0),
v_mov_b32_e32(v[0], v[5]),
v_mov_b32_e32(v[1], v[6]),
v_mov_b32_e32(v[2], v[7]),
@ -142,7 +142,7 @@ class TestScratchLoad(unittest.TestCase):
TEST_OFFSET = 256
instructions = [
s_load_b64(s[2:3], s[80:81], 0, soffset=SrcEnum.NULL),
s_waitcnt(lgkmcnt=0),
s_waitcnt_lgkmcnt(sdst=NULL, simm16=0),
v_mov_b32_e32(v[0], 0),
s_mov_b32(s[4], 0xDEADBEEF),
v_mov_b32_e32(v[2], s[4]),
@ -153,9 +153,9 @@ class TestScratchLoad(unittest.TestCase):
s_mov_b32(s[4], 0x9ABCDEF0),
v_mov_b32_e32(v[5], s[4]),
scratch_store_b128(addr=v[0], data=v[2:5], saddr=SrcEnum.NULL, offset=TEST_OFFSET),
s_waitcnt(vmcnt=0),
s_waitcnt_vmcnt(sdst=NULL, simm16=0),
scratch_load_b128(addr=v[0], vdst=v[6:9], saddr=SrcEnum.NULL, offset=TEST_OFFSET),
s_waitcnt(vmcnt=0),
s_waitcnt_vmcnt(sdst=NULL, simm16=0),
v_mov_b32_e32(v[0], v[6]),
v_mov_b32_e32(v[1], v[7]),
v_mov_b32_e32(v[2], v[8]),
@ -174,14 +174,14 @@ class TestScratchLoad(unittest.TestCase):
TEST_OFFSET = 256
instructions = [
s_load_b64(s[2:3], s[80:81], 0, soffset=SrcEnum.NULL),
s_waitcnt(lgkmcnt=0),
s_waitcnt_lgkmcnt(sdst=NULL, simm16=0),
v_mov_b32_e32(v[0], 0),
s_mov_b32(s[4], 0xDEADBEAB),
v_mov_b32_e32(v[2], s[4]),
scratch_store_b32(addr=v[0], data=v[2], saddr=SrcEnum.NULL, offset=TEST_OFFSET),
s_waitcnt(vmcnt=0),
s_waitcnt_vmcnt(sdst=NULL, simm16=0),
scratch_load_u8(addr=v[0], vdst=v[3], saddr=SrcEnum.NULL, offset=TEST_OFFSET),
s_waitcnt(vmcnt=0),
s_waitcnt_vmcnt(sdst=NULL, simm16=0),
v_mov_b32_e32(v[0], v[3]),
s_mov_b32(s[2], 0),
s_mov_b32(s[3], 0),
@ -194,14 +194,14 @@ class TestScratchLoad(unittest.TestCase):
TEST_OFFSET = 256
instructions = [
s_load_b64(s[2:3], s[80:81], 0, soffset=SrcEnum.NULL),
s_waitcnt(lgkmcnt=0),
s_waitcnt_lgkmcnt(sdst=NULL, simm16=0),
v_mov_b32_e32(v[0], 0),
s_mov_b32(s[4], 0x80), # -128 as signed byte
v_mov_b32_e32(v[2], s[4]),
scratch_store_b8(addr=v[0], data=v[2], saddr=SrcEnum.NULL, offset=TEST_OFFSET),
s_waitcnt(vmcnt=0),
s_waitcnt_vmcnt(sdst=NULL, simm16=0),
scratch_load_i8(addr=v[0], vdst=v[3], saddr=SrcEnum.NULL, offset=TEST_OFFSET),
s_waitcnt(vmcnt=0),
s_waitcnt_vmcnt(sdst=NULL, simm16=0),
v_mov_b32_e32(v[0], v[3]),
s_mov_b32(s[2], 0),
s_mov_b32(s[3], 0),
@ -214,14 +214,14 @@ class TestScratchLoad(unittest.TestCase):
TEST_OFFSET = 256
instructions = [
s_load_b64(s[2:3], s[80:81], 0, soffset=SrcEnum.NULL),
s_waitcnt(lgkmcnt=0),
s_waitcnt_lgkmcnt(sdst=NULL, simm16=0),
v_mov_b32_e32(v[0], 0),
s_mov_b32(s[4], 0xDEADCAFE),
v_mov_b32_e32(v[2], s[4]),
scratch_store_b32(addr=v[0], data=v[2], saddr=SrcEnum.NULL, offset=TEST_OFFSET),
s_waitcnt(vmcnt=0),
s_waitcnt_vmcnt(sdst=NULL, simm16=0),
scratch_load_u16(addr=v[0], vdst=v[3], saddr=SrcEnum.NULL, offset=TEST_OFFSET),
s_waitcnt(vmcnt=0),
s_waitcnt_vmcnt(sdst=NULL, simm16=0),
v_mov_b32_e32(v[0], v[3]),
s_mov_b32(s[2], 0),
s_mov_b32(s[3], 0),
@ -234,14 +234,14 @@ class TestScratchLoad(unittest.TestCase):
TEST_OFFSET = 256
instructions = [
s_load_b64(s[2:3], s[80:81], 0, soffset=SrcEnum.NULL),
s_waitcnt(lgkmcnt=0),
s_waitcnt_lgkmcnt(sdst=NULL, simm16=0),
v_mov_b32_e32(v[0], 0),
s_mov_b32(s[4], 0x8000), # -32768 as signed 16-bit
v_mov_b32_e32(v[2], s[4]),
scratch_store_b16(addr=v[0], data=v[2], saddr=SrcEnum.NULL, offset=TEST_OFFSET),
s_waitcnt(vmcnt=0),
s_waitcnt_vmcnt(sdst=NULL, simm16=0),
scratch_load_i16(addr=v[0], vdst=v[3], saddr=SrcEnum.NULL, offset=TEST_OFFSET),
s_waitcnt(vmcnt=0),
s_waitcnt_vmcnt(sdst=NULL, simm16=0),
v_mov_b32_e32(v[0], v[3]),
s_mov_b32(s[2], 0),
s_mov_b32(s[3], 0),
@ -262,24 +262,24 @@ class TestScratchSVE(unittest.TestCase):
# and the store should go to offset 256, not 256+100=356
instructions = [
s_load_b64(s[2:3], s[80:81], 0, soffset=SrcEnum.NULL),
s_waitcnt(lgkmcnt=0),
s_waitcnt_lgkmcnt(sdst=NULL, simm16=0),
# First, store 0xAAAAAAAA at offset 256 with v[0]=0
v_mov_b32_e32(v[0], 0),
s_mov_b32(s[4], 0xAAAAAAAA),
v_mov_b32_e32(v[2], s[4]),
scratch_store_b32(addr=v[0], data=v[2], saddr=SrcEnum.NULL, offset=TEST_OFFSET, sve=0),
s_waitcnt(vmcnt=0),
s_waitcnt_vmcnt(sdst=NULL, simm16=0),
# Now set v[0] to 100 (non-zero) and store 0xBBBBBBBB with SVE=0
# With SVE=0, v[0] should be IGNORED, so this should overwrite offset 256
v_mov_b32_e32(v[0], 100),
s_mov_b32(s[4], 0xBBBBBBBB),
v_mov_b32_e32(v[2], s[4]),
scratch_store_b32(addr=v[0], data=v[2], saddr=SrcEnum.NULL, offset=TEST_OFFSET, sve=0),
s_waitcnt(vmcnt=0),
s_waitcnt_vmcnt(sdst=NULL, simm16=0),
# Load back from offset 256 (with v[0]=0) - should get 0xBBBBBBBB
v_mov_b32_e32(v[0], 0),
scratch_load_b32(addr=v[0], vdst=v[3], saddr=SrcEnum.NULL, offset=TEST_OFFSET, sve=0),
s_waitcnt(vmcnt=0),
s_waitcnt_vmcnt(sdst=NULL, simm16=0),
v_mov_b32_e32(v[0], v[3]),
s_mov_b32(s[2], 0),
s_mov_b32(s[3], 0),
@ -296,24 +296,24 @@ class TestScratchSVE(unittest.TestCase):
# With SVE=1, the second store should go to 256+100=356, not 256
instructions = [
s_load_b64(s[2:3], s[80:81], 0, soffset=SrcEnum.NULL),
s_waitcnt(lgkmcnt=0),
s_waitcnt_lgkmcnt(sdst=NULL, simm16=0),
# First, store 0xAAAAAAAA at offset 256 with v[0]=0
v_mov_b32_e32(v[0], 0),
s_mov_b32(s[4], 0xAAAAAAAA),
v_mov_b32_e32(v[2], s[4]),
scratch_store_b32(addr=v[0], data=v[2], saddr=SrcEnum.NULL, offset=TEST_OFFSET, sve=1),
s_waitcnt(vmcnt=0),
s_waitcnt_vmcnt(sdst=NULL, simm16=0),
# Now set v[0] to 100 and store 0xBBBBBBBB with SVE=1
# With SVE=1, v[0] IS used, so this should go to offset 256+100=356
v_mov_b32_e32(v[0], 100),
s_mov_b32(s[4], 0xBBBBBBBB),
v_mov_b32_e32(v[2], s[4]),
scratch_store_b32(addr=v[0], data=v[2], saddr=SrcEnum.NULL, offset=TEST_OFFSET, sve=1),
s_waitcnt(vmcnt=0),
s_waitcnt_vmcnt(sdst=NULL, simm16=0),
# Load back from offset 256 (with v[0]=0) - should still be 0xAAAAAAAA
v_mov_b32_e32(v[0], 0),
scratch_load_b32(addr=v[0], vdst=v[3], saddr=SrcEnum.NULL, offset=TEST_OFFSET, sve=1),
s_waitcnt(vmcnt=0),
s_waitcnt_vmcnt(sdst=NULL, simm16=0),
v_mov_b32_e32(v[0], v[3]),
s_mov_b32(s[2], 0),
s_mov_b32(s[3], 0),
@ -331,16 +331,16 @@ class TestScratchMultiLane(unittest.TestCase):
TEST_OFFSET = 256
instructions = [
s_load_b64(s[2:3], s[80:81], 0, soffset=SrcEnum.NULL),
s_waitcnt(lgkmcnt=0),
s_waitcnt_lgkmcnt(sdst=NULL, simm16=0),
# Each lane stores its lane ID
v_mov_b32_e32(v[0], 0),
v_mov_b32_e32(v[2], v[255]), # v[255] has packed workitem IDs, low 10 bits = x
v_and_b32_e32(v[2], 0x3FF, v[2]), # extract lane ID
scratch_store_b32(addr=v[0], data=v[2], saddr=SrcEnum.NULL, offset=TEST_OFFSET),
s_waitcnt(vmcnt=0),
s_waitcnt_vmcnt(sdst=NULL, simm16=0),
# Load back
scratch_load_b32(addr=v[0], vdst=v[3], saddr=SrcEnum.NULL, offset=TEST_OFFSET),
s_waitcnt(vmcnt=0),
s_waitcnt_vmcnt(sdst=NULL, simm16=0),
v_mov_b32_e32(v[0], v[3]),
s_mov_b32(s[2], 0),
s_mov_b32(s[3], 0),

View file

@ -26,7 +26,7 @@ class TestSLoadRegisterOffset(unittest.TestCase):
instructions = [
# Load output buffer pointer from args
s_load_b64(s[2:3], s[80:81], 0, soffset=NULL),
s_waitcnt(lgkmcnt=0),
s_waitcnt_lgkmcnt(sdst=NULL, simm16=0),
# Store test values to output buffer: 0xAAAAAAAA at offset, 0xBBBBBBBB at offset+4
s_mov_b32(s[4], 0xAAAAAAAA),
s_mov_b32(s[5], 0xBBBBBBBB),
@ -35,7 +35,7 @@ class TestSLoadRegisterOffset(unittest.TestCase):
v_mov_b32_e32(v[0], 0),
global_store_b32(addr=v[0], data=v[2], saddr=s[2:3], offset=TEST_OFFSET),
global_store_b32(addr=v[0], data=v[3], saddr=s[2:3], offset=TEST_OFFSET+4),
s_waitcnt(vmcnt=0),
s_waitcnt_vmcnt(sdst=NULL, simm16=0),
*CACHE_INV,
# Now test s_load with register offset
# Put offset value in s[4]: offset = 4 bytes (1 dword)
@ -55,7 +55,7 @@ class TestSLoadRegisterOffset(unittest.TestCase):
instructions = [
# Load output buffer pointer from args
s_load_b64(s[2:3], s[80:81], 0, soffset=NULL),
s_waitcnt(lgkmcnt=0),
s_waitcnt_lgkmcnt(sdst=NULL, simm16=0),
# Store test values: 0xAAAAAAAA at offset, 0xBBBBBBBB at offset+4
s_mov_b32(s[4], 0xAAAAAAAA),
s_mov_b32(s[5], 0xBBBBBBBB),
@ -64,7 +64,7 @@ class TestSLoadRegisterOffset(unittest.TestCase):
v_mov_b32_e32(v[0], 0),
global_store_b32(addr=v[0], data=v[2], saddr=s[2:3], offset=TEST_OFFSET),
global_store_b32(addr=v[0], data=v[3], saddr=s[2:3], offset=TEST_OFFSET+4),
s_waitcnt(vmcnt=0),
s_waitcnt_vmcnt(sdst=NULL, simm16=0),
*CACHE_INV,
# Load with immediate offset 0
s_load_b32(s[5], s[2:3], NULL, offset=TEST_OFFSET),
@ -94,7 +94,7 @@ class TestSLoadRegisterOffset(unittest.TestCase):
instructions = [
# Load output buffer pointer from args
s_load_b64(s[2:3], s[80:81], 0, soffset=NULL),
s_waitcnt(lgkmcnt=0),
s_waitcnt_lgkmcnt(sdst=NULL, simm16=0),
# Store test values: 0xAAAAAAAA at offset, 0xBBBBBBBB at offset+4
s_mov_b32(s[6], 0xAAAAAAAA),
s_mov_b32(s[7], 0xBBBBBBBB),
@ -103,7 +103,7 @@ class TestSLoadRegisterOffset(unittest.TestCase):
v_mov_b32_e32(v[0], 0),
global_store_b32(addr=v[0], data=v[2], saddr=s[2:3], offset=TEST_OFFSET),
global_store_b32(addr=v[0], data=v[3], saddr=s[2:3], offset=TEST_OFFSET+4),
s_waitcnt(vmcnt=0),
s_waitcnt_vmcnt(sdst=NULL, simm16=0),
*CACHE_INV,
# Set up s[4] = 4 (offset in bytes)
s_mov_b32(s[4], 4),
@ -129,13 +129,13 @@ class TestSLoadRegisterOffset(unittest.TestCase):
instructions = [
# Load output buffer pointer from args
s_load_b64(s[2:3], s[80:81], 0, soffset=NULL),
s_waitcnt(lgkmcnt=0),
s_waitcnt_lgkmcnt(sdst=NULL, simm16=0),
# Store test value: 0xDEADBEEF at offset
s_mov_b32(s[7], 0xDEADBEEF),
v_mov_b32_e32(v[2], s[7]),
v_mov_b32_e32(v[0], 0),
global_store_b32(addr=v[0], data=v[2], saddr=s[2:3], offset=TEST_OFFSET),
s_waitcnt(vmcnt=0),
s_waitcnt_vmcnt(sdst=NULL, simm16=0),
*CACHE_INV,
# Load with register offset 0
s_mov_b32(s[4], 0),
@ -158,7 +158,7 @@ class TestSLoadRegisterOffset(unittest.TestCase):
instructions = [
# Load output buffer pointer from args
s_load_b64(s[2:3], s[80:81], 0, soffset=NULL),
s_waitcnt(lgkmcnt=0),
s_waitcnt_lgkmcnt(sdst=NULL, simm16=0),
# Store test values: 0xAAAAAAAA at offset, 0xBBBBBBBB at offset+4
s_mov_b32(s[8], 0xAAAAAAAA),
s_mov_b32(s[9], 0xBBBBBBBB),
@ -167,7 +167,7 @@ class TestSLoadRegisterOffset(unittest.TestCase):
v_mov_b32_e32(v[0], 0),
global_store_b32(addr=v[0], data=v[2], saddr=s[2:3], offset=TEST_OFFSET),
global_store_b32(addr=v[0], data=v[3], saddr=s[2:3], offset=TEST_OFFSET+4),
s_waitcnt(vmcnt=0),
s_waitcnt_vmcnt(sdst=NULL, simm16=0),
*CACHE_INV,
# reg offset = 4, imm offset = 0 -> total offset = 4
s_mov_b32(s[4], 4),
@ -197,7 +197,7 @@ class TestSLoadMultiDword(unittest.TestCase):
instructions = [
# Load output buffer pointer from args
s_load_b64(s[2:3], s[80:81], 0, soffset=NULL),
s_waitcnt(lgkmcnt=0),
s_waitcnt_lgkmcnt(sdst=NULL, simm16=0),
# Store test values: 0xAAAAAAAA, 0xBBBBBBBB at offset
s_mov_b32(s[10], 0xAAAAAAAA),
s_mov_b32(s[11], 0xBBBBBBBB),
@ -206,7 +206,7 @@ class TestSLoadMultiDword(unittest.TestCase):
v_mov_b32_e32(v[0], 0),
global_store_b32(addr=v[0], data=v[2], saddr=s[2:3], offset=TEST_OFFSET),
global_store_b32(addr=v[0], data=v[3], saddr=s[2:3], offset=TEST_OFFSET+4),
s_waitcnt(vmcnt=0),
s_waitcnt_vmcnt(sdst=NULL, simm16=0),
*CACHE_INV,
# Load with register offset 0
s_mov_b32(s[4], 0),
@ -229,7 +229,7 @@ class TestSLoadMultiDword(unittest.TestCase):
instructions = [
# Load output buffer pointer from args
s_load_b64(s[2:3], s[80:81], 0, soffset=NULL),
s_waitcnt(lgkmcnt=0),
s_waitcnt_lgkmcnt(sdst=NULL, simm16=0),
# Store test values: 0xAAAAAAAA, 0xBBBBBBBB, 0xCCCCCCCC, 0xDDDDDDDD at offset
v_mov_b32_e32(v[0], 0),
s_mov_b32(s[14], 0xAAAAAAAA),
@ -244,7 +244,7 @@ class TestSLoadMultiDword(unittest.TestCase):
s_mov_b32(s[14], 0xDDDDDDDD),
v_mov_b32_e32(v[2], s[14]),
global_store_b32(addr=v[0], data=v[2], saddr=s[2:3], offset=TEST_OFFSET+12),
s_waitcnt(vmcnt=0),
s_waitcnt_vmcnt(sdst=NULL, simm16=0),
*CACHE_INV,
# Load with register offset 0 (s_load_b128 requires 4-aligned dest: s[4], s[8], s[12], ...)
s_mov_b32(s[15], 0),
@ -272,7 +272,7 @@ class TestSLoadLarge(unittest.TestCase):
"""s_load_b256 loads 8 consecutive dwords."""
instructions = [
s_load_b64(s[2:3], s[80:81], 0, soffset=NULL),
s_waitcnt(lgkmcnt=0),
s_waitcnt_lgkmcnt(sdst=NULL, simm16=0),
v_mov_b32_e32(v[0], 0),
# Store 8 test values
s_mov_b32(s[20], 0x11111111),
@ -299,11 +299,11 @@ class TestSLoadLarge(unittest.TestCase):
s_mov_b32(s[20], 0x88888888),
v_mov_b32_e32(v[2], s[20]),
global_store_b32(addr=v[0], data=v[2], saddr=s[2:3], offset=TEST_OFFSET+28),
s_waitcnt(vmcnt=0),
s_waitcnt_vmcnt(sdst=NULL, simm16=0),
*CACHE_INV,
# Load all 8 dwords with s_load_b256
s_load_b256(s[4:11], s[2:3], NULL, offset=TEST_OFFSET),
s_waitcnt(lgkmcnt=0),
s_waitcnt_lgkmcnt(sdst=NULL, simm16=0),
s_mov_b32(s[2], 0), s_mov_b32(s[3], 0),
]
st = run_program(instructions, n_lanes=1)
@ -320,7 +320,7 @@ class TestSLoadLarge(unittest.TestCase):
"""s_load_b512 loads 16 consecutive dwords."""
instructions = [
s_load_b64(s[2:3], s[80:81], 0, soffset=NULL),
s_waitcnt(lgkmcnt=0),
s_waitcnt_lgkmcnt(sdst=NULL, simm16=0),
v_mov_b32_e32(v[0], 0),
# Store 16 test values (use a pattern: 0x10, 0x20, ..., 0x100)
*[instr for i in range(16) for instr in [
@ -328,11 +328,11 @@ class TestSLoadLarge(unittest.TestCase):
v_mov_b32_e32(v[2], s[20]),
global_store_b32(addr=v[0], data=v[2], saddr=s[2:3], offset=TEST_OFFSET + i * 4),
]],
s_waitcnt(vmcnt=0),
s_waitcnt_vmcnt(sdst=NULL, simm16=0),
*CACHE_INV,
# Load all 16 dwords with s_load_b512
s_load_b512(s[64:79], s[2:3], NULL, offset=TEST_OFFSET),
s_waitcnt(lgkmcnt=0),
s_waitcnt_lgkmcnt(sdst=NULL, simm16=0),
# Copy results to lower regs for verification (since st.sgpr only has 16 regs in test)
s_mov_b32(s[4], s[64]),
s_mov_b32(s[5], s[65]),
@ -350,7 +350,7 @@ class TestSLoadLarge(unittest.TestCase):
"""s_load_b256 with register offset should add reg offset to address."""
instructions = [
s_load_b64(s[2:3], s[80:81], 0, soffset=NULL),
s_waitcnt(lgkmcnt=0),
s_waitcnt_lgkmcnt(sdst=NULL, simm16=0),
v_mov_b32_e32(v[0], 0),
# Store pattern at TEST_OFFSET+8: skip first 2 dwords
*[instr for i in range(8) for instr in [
@ -358,12 +358,12 @@ class TestSLoadLarge(unittest.TestCase):
v_mov_b32_e32(v[2], s[20]),
global_store_b32(addr=v[0], data=v[2], saddr=s[2:3], offset=TEST_OFFSET + 8 + i * 4),
]],
s_waitcnt(vmcnt=0),
s_waitcnt_vmcnt(sdst=NULL, simm16=0),
*CACHE_INV,
# Load with register offset 8
s_mov_b32(s[20], 8),
s_load_b256(s[4:11], s[2:3], s[20], offset=TEST_OFFSET),
s_waitcnt(lgkmcnt=0),
s_waitcnt_lgkmcnt(sdst=NULL, simm16=0),
s_mov_b32(s[2], 0), s_mov_b32(s[3], 0),
]
st = run_program(instructions, n_lanes=1)
@ -383,7 +383,7 @@ class TestSLoadOffset(unittest.TestCase):
"""Load from two different offsets and verify correct values."""
instructions = [
s_load_b64(s[2:3], s[80:81], 0, soffset=NULL),
s_waitcnt(lgkmcnt=0),
s_waitcnt_lgkmcnt(sdst=NULL, simm16=0),
v_mov_b32_e32(v[0], 0),
# Store 0xAAAAAAAA at offset 100
s_mov_b32(s[4], 0xAAAAAAAA),
@ -393,13 +393,13 @@ class TestSLoadOffset(unittest.TestCase):
s_mov_b32(s[4], 0xBBBBBBBB),
v_mov_b32_e32(v[2], s[4]),
global_store_b32(addr=v[0], data=v[2], saddr=s[2:3], offset=200),
s_waitcnt(vmcnt=0),
s_waitcnt_vmcnt(sdst=NULL, simm16=0),
*CACHE_INV,
# Load from offset 100 -> should get 0xAAAAAAAA
s_load_b32(s[4], s[2:3], NULL, offset=100),
# Load from offset 200 -> should get 0xBBBBBBBB
s_load_b32(s[5], s[2:3], NULL, offset=200),
s_waitcnt(lgkmcnt=0),
s_waitcnt_lgkmcnt(sdst=NULL, simm16=0),
s_mov_b32(s[2], 0), s_mov_b32(s[3], 0),
]
st = run_program(instructions, n_lanes=1)
@ -413,7 +413,7 @@ class TestSLoadOffset(unittest.TestCase):
Load with offset -100 from base+300 -> should get 0xBBBB."""
instructions = [
s_load_b64(s[2:3], s[80:81], 0, soffset=NULL),
s_waitcnt(lgkmcnt=0),
s_waitcnt_lgkmcnt(sdst=NULL, simm16=0),
v_mov_b32_e32(v[0], 0),
# Store 0xAAAAAAAA at offset 100, 0xBBBBBBBB at offset 200
s_mov_b32(s[8], 0xAAAAAAAA),
@ -422,7 +422,7 @@ class TestSLoadOffset(unittest.TestCase):
s_mov_b32(s[8], 0xBBBBBBBB),
v_mov_b32_e32(v[2], s[8]),
global_store_b32(addr=v[0], data=v[2], saddr=s[2:3], offset=200),
s_waitcnt(vmcnt=0),
s_waitcnt_vmcnt(sdst=NULL, simm16=0),
*CACHE_INV,
# base+200, load with offset -100 -> should get value at 100
s_add_u32(s[6], s[2], 200),
@ -432,7 +432,7 @@ class TestSLoadOffset(unittest.TestCase):
s_add_u32(s[6], s[2], 300),
s_addc_u32(s[7], s[3], 0),
s_load_b32(s[5], s[6:7], NULL, offset=-100),
s_waitcnt(lgkmcnt=0),
s_waitcnt_lgkmcnt(sdst=NULL, simm16=0),
s_mov_b32(s[2], 0),
s_mov_b32(s[3], 0),
s_mov_b32(s[6], 0),

View file

@ -951,13 +951,13 @@ class TestBarrier(unittest.TestCase):
v_lshlrev_b32_e32(v[3], 2, v[1]),
# Store (tid+1) to LDS[tid*4]
ds_store_b32(addr=v[3], data0=v[2]),
s_waitcnt(lgkmcnt=0),
s_waitcnt_lgkmcnt(sdst=NULL, simm16=0),
s_barrier(),
# Read from the other wave's slot: LDS[(tid^32)*4]
v_xor_b32_e32(v[4], 32, v[1]),
v_lshlrev_b32_e32(v[5], 2, v[4]),
ds_load_b32(addr=v[5], vdst=v[0]),
s_waitcnt(lgkmcnt=0),
s_waitcnt_lgkmcnt(sdst=NULL, simm16=0),
]
st = run_program(instructions, n_lanes=64)
for tid in range(64):
@ -979,24 +979,24 @@ class TestBarrier(unittest.TestCase):
v_lshlrev_b32_e32(v[3], 2, v[1]),
# Phase 1: write (tid+100) to LDS[tid*4]
ds_store_b32(addr=v[3], data0=v[2]),
s_waitcnt(lgkmcnt=0),
s_waitcnt_lgkmcnt(sdst=NULL, simm16=0),
s_barrier(),
# Phase 2: read from other wave, add 1000, write to separate LDS region
v_xor_b32_e32(v[4], 32, v[1]),
v_lshlrev_b32_e32(v[5], 2, v[4]),
ds_load_b32(addr=v[5], vdst=v[6]),
s_waitcnt(lgkmcnt=0),
s_waitcnt_lgkmcnt(sdst=NULL, simm16=0),
v_add_nc_u32_e32(v[7], 0x3e8, v[6]),
v_add_nc_u32_e32(v[8], 64, v[1]),
v_lshlrev_b32_e32(v[9], 2, v[8]),
ds_store_b32(addr=v[9], data0=v[7]),
s_waitcnt(lgkmcnt=0),
s_waitcnt_lgkmcnt(sdst=NULL, simm16=0),
s_barrier(),
# Phase 3: read other wave's phase-2 output into v[0]
v_add_nc_u32_e32(v[10], 64, v[4]),
v_lshlrev_b32_e32(v[11], 2, v[10]),
ds_load_b32(addr=v[11], vdst=v[0]),
s_waitcnt(lgkmcnt=0),
s_waitcnt_lgkmcnt(sdst=NULL, simm16=0),
]
st = run_program(instructions, n_lanes=64)
for tid in range(64):

View file

@ -1598,7 +1598,7 @@ class TestModifierInteractions(unittest.TestCase):
instructions = [
s_mov_b32(s[0], quiet_nan),
v_mov_b32_e32(v[0], s[0]),
VOP3(VOP3Op.V_ADD_F32, vdst=v[1], src0=v[0], src1=0.0, clamp=1),
VOP3(VOP3Op.V_ADD_F32, vdst=v[1], src0=v[0], src1=0.0, clmp=1),
]
st = run_program(instructions, n_lanes=1)
self.assertTrue(math.isnan(i2f(st.vgpr[0][1])))
@ -2760,7 +2760,7 @@ class TestVOP3VOPC(unittest.TestCase):
s_mov_b32(s[1], 0x00000000), # 0.0
v_mov_b32_e32(v[5], s[0]),
v_mov_b32_e32(v[3], s[1]),
VOP3_SDST(VOP3Op.V_CMP_GE_F32, vdst=s[5], src0=v[5], src1=v[3], abs_=3),
VOP3_SDST(VOP3Op.V_CMP_GE_F32, vdst=s[5], src0=v[5], src1=v[3], abs=3),
]
st = run_program(instructions, n_lanes=1)
self.assertEqual(st.sgpr[5], 0) # NaN comparison is always FALSE

View file

@ -18,10 +18,10 @@ def custom_add_one(A:UOp) -> UOp:
threads = UOp.special(A.size, "lidx0")
insts = [
s_load_b64(s[0:1], s[0:1], soffset=NULL),
s_waitcnt(lgkmcnt=0),
s_waitcnt_lgkmcnt(sdst=NULL, simm16=0),
v_lshlrev_b32_e32(v[0], 2, v[0]), # element offset
global_load_b32(v[1], v[0], saddr=s[0:1]),
s_waitcnt(vmcnt=0),
s_waitcnt_vmcnt(sdst=NULL, simm16=0),
v_mov_b32_e32(v[2], 1.0),
v_add_f32_e32(v[1], v[1], v[2]),
global_store_b32(addr=v[0], data=v[1], saddr=s[0:1]),
@ -38,10 +38,10 @@ def custom_add_var(A:UOp, B:UOp) -> UOp:
insts = [
s_load_b128(s[4:7], s[0:1]),
s_load_b32(s[8], s[0:1], offset=0x10), # all threads load the same variable
s_waitcnt(lgkmcnt=0),
s_waitcnt_lgkmcnt(sdst=NULL, simm16=0),
v_lshlrev_b32_e32(v[0], 2, v[0]), # element offset, different per thread
global_load_b32(v[1], v[0], saddr=s[6:7]),
s_waitcnt(vmcnt=0),
s_waitcnt_vmcnt(sdst=NULL, simm16=0),
v_add_nc_u32_e32(v[1], s[8], v[1]),
global_store_b32(addr=v[0], data=v[1], saddr=s[4:5]),
s_endpgm(),
@ -70,8 +70,8 @@ def custom_lds_sync(A:UOp, arch:str) -> UOp:
wg = UOp.special(1, "gidx0")
lds = UOp(Ops.DEFINE_LOCAL, dtypes.uint8.ptr(size=512, addrspace=AddrSpace.LOCAL), (), 'lds') # 128 * 4 bytes
isa = r4 if arch == "rdna4" else r3
wait_kmcnt = [isa.s_wait_kmcnt(simm16=0)] if arch == "rdna4" else [isa.s_waitcnt(lgkmcnt=0)]
wait_dscnt = [isa.s_wait_dscnt(simm16=0)] if arch == "rdna4" else [isa.s_waitcnt(lgkmcnt=0)]
wait_kmcnt = [isa.s_wait_kmcnt(simm16=0)] if arch == "rdna4" else [isa.s_waitcnt_lgkmcnt(sdst=NULL, simm16=0)]
wait_dscnt = [isa.s_wait_dscnt(simm16=0)] if arch == "rdna4" else [isa.s_waitcnt_lgkmcnt(sdst=NULL, simm16=0)]
barrier = [isa.s_barrier_signal(ssrc0=-1), isa.s_barrier_wait(simm16=-1)] if arch == "rdna4" else [isa.s_barrier()]
global_store = [isa.global_store_b32(vaddr=v[6:7], saddr=s[0:1], vsrc=v[5])] if arch == "rdna4" \
else [isa.global_store_b32(addr=v[6], data=v[5], saddr=s[0:1])]

View file

@ -88,6 +88,10 @@ class TestIntegration(IntegrationTestBase):
with self.assertRaises(TypeError):
self.inst = v_mov_b32_e32(1, v[0])
def test_invalid_field(self):
  """Constructing an instruction with an unknown keyword argument must raise TypeError."""
  # `ioffset` is the deliberately invalid keyword here. Match the message as well as the
  # exception type so an unrelated TypeError (for example a bad operand) cannot make this
  # test pass by accident.
  with self.assertRaisesRegex(TypeError, "unexpected keyword argument 'ioffset'"):
    self.inst = s_load_b128(s[4:7], s[0:1], NULL, ioffset=0x8)
def test_simple_int_to_v(self):
self.inst = v_mov_b32_e32(v[0], 1)

View file

@ -306,6 +306,9 @@ class Inst:
elif name in kwargs: vals[name] = kwargs[name]
else: vals[name] = next(args_iter, None)
assert not (remaining := list(args_iter)), f"too many positional args: {remaining}"
known_field_names = [name for name,field in self._fields if not isinstance(field, FixedBitField)]
for name in kwargs:
if name not in known_field_names: raise TypeError(f"{self.__class__.__name__}() got an unexpected keyword argument {name!r}")
# Extract modifiers from Reg objects and merge into neg/abs/opsel
neg_bits, abs_bits, opsel_bits = 0, 0, 0
for name, bit in [('src0', 0), ('src1', 1), ('src2', 2)]: