fix amd exec_update for locals (#5966)

This commit is contained in:
nimlgen 2024-08-07 21:02:56 +03:00 committed by GitHub
commit 8d8704af2d
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 35 additions and 2 deletions

View file

@ -3,7 +3,8 @@ from tinygrad import Device, Tensor, dtypes, TinyJit
from tinygrad.helpers import CI, getenv, Context
from tinygrad.device import Buffer, BufferOptions, HCQCompiled
from tinygrad.engine.schedule import create_schedule
from tinygrad.engine.realize import get_runner
from tinygrad.engine.realize import get_runner, CompiledRunner
from tinygrad.codegen.kernel import Kernel, Opt, OptOps
MOCKGPU = getenv("MOCKGPU")
@ -134,6 +135,38 @@ class TestHCQ(unittest.TestCase):
assert (val:=TestHCQ.b.lazydata.buffer.as_buffer().cast("f")[0]) == 1.0, f"got val {val}"
assert (val:=TestHCQ.b.lazydata.buffer.as_buffer().cast("f")[1]) == 0.0, f"got val {val}, should not be updated"
def test_exec_update_fuzz(self):
  """Fuzz `update_exec` over every local size from (1,1,1) to (3,3,3).

  A kernel for `a + 1` is compiled with three LOCAL opts and queued once with a
  (1,1,1) global and (1,1,1) local size. The same queue is then patched with
  `update_exec` / `update_signal` and re-submitted for each local size. `zb` is
  zero-filled once, `zt` is re-zeroed before every submit, and the sum of `zt`
  after the run must equal x * y * z.
  """
  a = Tensor.rand((3, 3, 3), dtype=dtypes.int, device=Device.DEFAULT).realize()
  b = a + 1
  si = create_schedule([b.lazydata])[-1]
  k = Kernel(si.ast, opts=TestHCQ.d0.renderer)
  for _ in range(3): k.apply_opt(Opt(op=OptOps.LOCAL, axis=0, amt=3))

  runner = CompiledRunner(k.to_program())

  zb = Buffer(Device.DEFAULT, 3 * 3 * 3, dtypes.int, options=BufferOptions(cpu_access=True, nolru=True)).ensure_allocated()
  zt = Buffer(Device.DEFAULT, 3 * 3 * 3, dtypes.int, options=BufferOptions(cpu_access=True, nolru=True)).ensure_allocated()
  ctypes.memset(zb._buf.va_addr, 0, zb.nbytes)
  kernargs = runner.clprg.fill_kernargs([zt._buf, zb._buf])

  # Commands are queued in this order: 0 = memory_barrier, 1 = exec, 2 = signal.
  # The indices passed to update_exec / update_signal below refer to this order.
  q = TestHCQ.d0.hw_compute_queue_t()
  q.memory_barrier() \
   .exec(runner.clprg, kernargs, (1,1,1), (1,1,1)) \
   .signal(TestHCQ.d0.timeline_signal, TestHCQ.d0.timeline_value)

  for x in range(1, 4):
    for y in range(1, 4):
      for z in range(1, 4):
        # Clear the checked buffer (with its own size) so results of the previous launch do not leak in.
        ctypes.memset(zt._buf.va_addr, 0, zt.nbytes)

        # Patch only the local size and the signal value, then resubmit the same queue.
        q.update_exec(1, local_size=(x,y,z)) \
         .update_signal(2, value=TestHCQ.d0.timeline_value).submit(TestHCQ.d0)
        TestHCQ.d0.timeline_signal.wait(TestHCQ.d0.timeline_value)
        TestHCQ.d0.timeline_value += 1

        # Presumably each launched work-item writes a single 1 (0 + 1), so the sum counts launched threads.
        res_sum = sum(zt.as_buffer().cast("I"))
        assert x * y * z == res_sum, f"want {x * y * z}, got {res_sum}"
# Test copy
def test_copy(self):
if TestHCQ.d0.hw_copy_queue_t is None: self.skipTest("device does not support copy queue")

View file

@ -124,7 +124,7 @@ class AMDComputeQueue(HWComputeQueue):
self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 4), gfxreg(amd_gpu.regCOMPUTE_STATIC_THREAD_MGMT_SE4)] + [0xFFFFFFFF] * 4
self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, len(user_regs)), gfxreg(amd_gpu.regCOMPUTE_USER_DATA_0)] + user_regs
self.cmd_idx_to_local_offset[cmd_idx] = len(self.q) - self.cmds_offset[cmd_idx] + 4 # +1 to skip PACKET3_SET_SH_REG + 3 zeros.
self.cmd_idx_to_local_offset[cmd_idx] = len(self.q) - self.cmds_offset[cmd_idx] + 5 # +1 to skip PACKET3_SET_SH_REG + reg + 3 zeros.
self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 8), gfxreg(amd_gpu.regCOMPUTE_START_X), 0, 0, 0, *local_size, 0, 0]
self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 1), gfxreg(amd_gpu.regCOMPUTE_RESOURCE_LIMITS), 0]