hcq: hcqbuf in copy (#15595)

This commit is contained in:
nimlgen 2026-04-03 22:47:36 +03:00 committed by GitHub
commit 902edc3781
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
7 changed files with 38 additions and 38 deletions

View file

@ -194,7 +194,7 @@ class TestHCQ(unittest.TestCase):
if TestHCQ.d0.hw_copy_queue_t is None: self.skipTest("device does not support copy queue")
TestHCQ.d0.hw_copy_queue_t().wait(TestHCQ.d0.timeline_signal, TestHCQ.d0.timeline_value - 1) \
.copy(TestHCQ.b.uop.buffer._buf.va_addr, TestHCQ.a.uop.buffer._buf.va_addr, 8) \
.copy(TestHCQ.b.uop.buffer._buf, TestHCQ.a.uop.buffer._buf, 8) \
.signal(TestHCQ.d0.timeline_signal, TestHCQ.d0.timeline_value).submit(TestHCQ.d0)
TestHCQ.d0.timeline_signal.wait(TestHCQ.d0.timeline_value)
@ -212,7 +212,7 @@ class TestHCQ(unittest.TestCase):
ctypes.memset(buf2._buf.va_addr, 1, sz)
TestHCQ.d0.hw_copy_queue_t().wait(TestHCQ.d0.timeline_signal, TestHCQ.d0.timeline_value - 1) \
.copy(buf1._buf.va_addr, buf2._buf.va_addr, sz) \
.copy(buf1._buf, buf2._buf, sz) \
.signal(TestHCQ.d0.timeline_signal, TestHCQ.d0.timeline_value).submit(TestHCQ.d0)
TestHCQ.d0.timeline_signal.wait(TestHCQ.d0.timeline_value)
@ -236,7 +236,7 @@ class TestHCQ(unittest.TestCase):
for j in range(32): buf2_q_view[min(max(i + j - 16, 0), (sz // 8) - 1)] = random.randint(0, 0xffffffffffffffff)
TestHCQ.d0.hw_copy_queue_t().wait(TestHCQ.d0.timeline_signal, TestHCQ.d0.timeline_value - 1) \
.copy(buf1._buf.va_addr, buf2._buf.va_addr, sz) \
.copy(buf1._buf, buf2._buf, sz) \
.signal(TestHCQ.d0.timeline_signal, TestHCQ.d0.timeline_value).submit(TestHCQ.d0)
TestHCQ.d0.timeline_signal.wait(TestHCQ.d0.timeline_value)
@ -252,7 +252,7 @@ class TestHCQ(unittest.TestCase):
virt_dest_addr = Variable("virt_dest_addr", 0, 0xffffffffffffffff, dtypes.uint64)
q = TestHCQ.d0.hw_copy_queue_t().wait(TestHCQ.d0.timeline_signal, TestHCQ.d0.timeline_value - 1) \
.copy(virt_dest_addr, virt_src_addr, 8) \
.copy(HCQBuffer(virt_dest_addr, 8), HCQBuffer(virt_src_addr, 8), 8) \
.signal(TestHCQ.d0.timeline_signal, TestHCQ.d0.timeline_value)
q.submit(TestHCQ.d0, {virt_src_addr.expr: TestHCQ.a.uop.buffer._buf.va_addr, virt_dest_addr.expr: TestHCQ.b.uop.buffer._buf.va_addr})
@ -275,7 +275,7 @@ class TestHCQ(unittest.TestCase):
ctypes.memset(buf2._buf.va_addr, 1, sz)
q = TestHCQ.d0.hw_copy_queue_t().wait(TestHCQ.d0.timeline_signal, TestHCQ.d0.timeline_value - 1) \
.copy(virt_dest_addr, virt_src_addr, sz) \
.copy(HCQBuffer(virt_dest_addr, sz), HCQBuffer(virt_src_addr, sz), sz) \
.signal(TestHCQ.d0.timeline_signal, TestHCQ.d0.timeline_value)
q.submit(TestHCQ.d0, {virt_src_addr.expr: buf2._buf.va_addr, virt_dest_addr.expr: buf1._buf.va_addr})
@ -350,7 +350,7 @@ class TestHCQ(unittest.TestCase):
sig_st, sig_en = TestHCQ.d0.new_signal(), TestHCQ.d0.new_signal()
TestHCQ.d0.hw_copy_queue_t().timestamp(sig_st) \
.copy(a._buf.va_addr, b._buf.va_addr, SZ) \
.copy(a._buf, b._buf, SZ) \
.timestamp(sig_en) \
.signal(TestHCQ.d0.timeline_signal, TestHCQ.d0.timeline_value).submit(TestHCQ.d0)
@ -377,7 +377,7 @@ class TestHCQ(unittest.TestCase):
sig_st, sig_en = TestHCQ.d0.new_signal(), TestHCQ.d0.new_signal()
TestHCQ.d0.hw_copy_queue_t().timestamp(sig_st) \
.copy(a._buf.va_addr, b._buf.va_addr, SZ) \
.copy(a._buf, b._buf, SZ) \
.timestamp(sig_en) \
.signal(TestHCQ.d0.timeline_signal, TestHCQ.d0.timeline_value).submit(TestHCQ.d0)
@ -416,7 +416,7 @@ class TestHCQ(unittest.TestCase):
ctypes.memset(buf2._buf.va_addr, i, 1)
TestHCQ.d0.hw_copy_queue_t().wait(TestHCQ.d0.timeline_signal, TestHCQ.d0.timeline_value - 1) \
.copy(buf1._buf.va_addr, buf2._buf.va_addr, 1) \
.copy(buf1._buf, buf2._buf, 1) \
.signal(TestHCQ.d0.timeline_signal, TestHCQ.d0.timeline_value).submit(TestHCQ.d0)
TestHCQ.d0.timeline_signal.wait(TestHCQ.d0.timeline_value)
TestHCQ.d0.timeline_value += 1
@ -434,8 +434,8 @@ class TestHCQ(unittest.TestCase):
ctypes.memset(buf3._buf.va_addr, i, 1)
TestHCQ.d0.hw_copy_queue_t().wait(TestHCQ.d0.timeline_signal, TestHCQ.d0.timeline_value - 1) \
.copy(buf1._buf.va_addr, buf3._buf.va_addr, 1) \
.copy(buf2._buf.va_addr, buf1._buf.va_addr, 1) \
.copy(buf1._buf, buf3._buf, 1) \
.copy(buf2._buf, buf1._buf, 1) \
.signal(TestHCQ.d0.timeline_signal, TestHCQ.d0.timeline_value).submit(TestHCQ.d0)
TestHCQ.d0.timeline_signal.wait(TestHCQ.d0.timeline_value)
TestHCQ.d0.timeline_value += 1
@ -457,8 +457,8 @@ class TestHCQ(unittest.TestCase):
ctypes.memset(buf3._buf.va_addr, i, 1)
TestHCQ.d0.hw_copy_queue_t().wait(TestHCQ.d0.timeline_signal, TestHCQ.d0.timeline_value - 1) \
.copy(buf1._buf.va_addr, buf3._buf.va_addr, 1) \
.copy(buf2._buf.va_addr, buf1._buf.va_addr, 1) \
.copy(buf1._buf, buf3._buf, 1) \
.copy(buf2._buf, buf1._buf, 1) \
.signal(TestHCQ.d0.timeline_signal, TestHCQ.d0.timeline_value).submit(TestHCQ.d0)
TestHCQ.d0.timeline_signal.wait(TestHCQ.d0.timeline_value)
TestHCQ.d0.timeline_value += 1
@ -505,8 +505,8 @@ class TestHCQ(unittest.TestCase):
TestHCQ.d0.timeline_value += 1
TestHCQ.d0.hw_copy_queue_t().wait(TestHCQ.d0.timeline_signal, TestHCQ.d0.timeline_value - 1) \
.copy(buf1._buf.va_addr, buf3._buf.va_addr, 1) \
.copy(buf2._buf.va_addr, buf1._buf.va_addr, 1) \
.copy(buf1._buf, buf3._buf, 1) \
.copy(buf2._buf, buf1._buf, 1) \
.signal(TestHCQ.d0.timeline_signal, TestHCQ.d0.timeline_value).submit(TestHCQ.d0)
TestHCQ.d0.timeline_signal.wait(TestHCQ.d0.timeline_value)
TestHCQ.d0.timeline_value += 1
@ -531,7 +531,7 @@ class TestHCQ(unittest.TestCase):
d.allocator.map(cpu_buffer._buf)
d.hw_copy_queue_t().wait(d.timeline_signal, d.timeline_value - 1) \
.copy(local_buf._buf.va_addr, cpu_buffer._buf.va_addr, sz) \
.copy(local_buf._buf, cpu_buffer._buf, sz) \
.signal(d.timeline_signal, d.timeline_value).submit(d)
d.timeline_signal.wait(d.timeline_value)
d.timeline_value += 1

View file

@ -24,8 +24,8 @@ class TestHCQ(unittest.TestCase):
TestHCQ.runner = get_runner(TestHCQ.d0.device, si.ast)
TestHCQ.b.uop.buffer.allocate()
# wow that's a lot of abstraction layers
TestHCQ.addr = struct.pack("QQ", TestHCQ.b.uop.buffer._buf.va_addr, TestHCQ.a.uop.buffer._buf.va_addr)
TestHCQ.addr2 = struct.pack("QQ", TestHCQ.a.uop.buffer._buf.va_addr, TestHCQ.b.uop.buffer._buf.va_addr)
TestHCQ.addr = struct.pack("QQ", TestHCQ.b.uop.buffer._buf.va_addr, TestHCQ.a.uop.buffer._buf.va_addr)
TestHCQ.addr2 = struct.pack("QQ", TestHCQ.a.uop.buffer._buf.va_addr, TestHCQ.b.uop.buffer._buf.va_addr)
TestHCQ.kernargs_off = TestHCQ.runner._prg.kernargs_offset
TestHCQ.kernargs_size = TestHCQ.runner._prg.kernargs_alloc_size
ctypes.memmove(TestHCQ.d0.kernargs_ptr+TestHCQ.kernargs_off, TestHCQ.addr, len(TestHCQ.addr))
@ -211,8 +211,8 @@ class TestHCQ(unittest.TestCase):
def test_copy_1000_times(self):
q = TestHCQ.copy_queue()
q.copy(TestHCQ.a.uop.buffer._buf.va_addr, TestHCQ.b.uop.buffer._buf.va_addr, 8)
q.copy(TestHCQ.b.uop.buffer._buf.va_addr, TestHCQ.a.uop.buffer._buf.va_addr, 8)
q.copy(TestHCQ.a.uop.buffer._buf, TestHCQ.b.uop.buffer._buf, 8)
q.copy(TestHCQ.b.uop.buffer._buf, TestHCQ.a.uop.buffer._buf, 8)
for _ in range(1000):
q.submit(TestHCQ.d0)
TestHCQ.copy_queue().signal(TestHCQ.d0.timeline_signal, TestHCQ.d0.timeline_value).submit(TestHCQ.d0)
@ -226,7 +226,7 @@ class TestHCQ(unittest.TestCase):
def test_copy(self):
q = TestHCQ.copy_queue()
q.copy(TestHCQ.b.uop.buffer._buf.va_addr, TestHCQ.a.uop.buffer._buf.va_addr, 8)
q.copy(TestHCQ.b.uop.buffer._buf, TestHCQ.a.uop.buffer._buf, 8)
q.signal(TestHCQ.d0.timeline_signal, TestHCQ.d0.timeline_value)
q.submit(TestHCQ.d0)
TestHCQ.d0._wait_signal(TestHCQ.d0.timeline_signal, TestHCQ.d0.timeline_value)
@ -237,8 +237,8 @@ class TestHCQ(unittest.TestCase):
@unittest.skipUnless(Device.DEFAULT == "NV", "Only NV supports bind")
def test_bind_copy(self):
q = TestHCQ.copy_queue()
q.copy(TestHCQ.a.uop.buffer._buf.va_addr, TestHCQ.b.uop.buffer._buf.va_addr, 8)
q.copy(TestHCQ.b.uop.buffer._buf.va_addr, TestHCQ.a.uop.buffer._buf.va_addr, 8)
q.copy(TestHCQ.a.uop.buffer._buf, TestHCQ.b.uop.buffer._buf, 8)
q.copy(TestHCQ.b.uop.buffer._buf, TestHCQ.a.uop.buffer._buf, 8)
q.bind(TestHCQ.d0)
for _ in range(1000):
q.submit(TestHCQ.d0)
@ -257,7 +257,7 @@ class TestHCQ(unittest.TestCase):
a = Buffer(Device.DEFAULT, SZ, dtypes.uint8, options=BufferSpec(nolru=True)).allocate()
b = Buffer(Device.DEFAULT, SZ, dtypes.uint8, options=BufferSpec(nolru=True)).allocate()
q = TestHCQ.copy_queue()
q.copy(a._buf.va_addr, b._buf.va_addr, SZ)
q.copy(a._buf, b._buf, SZ)
et = _time_queue(q, TestHCQ.d0)
gb_s = (SZ/1e9)/et
print(f"same device copy: {et*1e3:.2f} ms, {gb_s:.2f} GB/s")
@ -269,7 +269,7 @@ class TestHCQ(unittest.TestCase):
a = Buffer(Device.DEFAULT, SZ, dtypes.uint8, options=BufferSpec(nolru=True)).allocate()
TestHCQ.d0._gpu_map(b._buf)
q = TestHCQ.copy_queue()
q.copy(a._buf.va_addr, b._buf.va_addr, SZ)
q.copy(a._buf, b._buf, SZ)
et = _time_queue(q, TestHCQ.d0)
gb_s = (SZ/1e9)/et
print(f"cross device copy: {et*1e3:.2f} ms, {gb_s:.2f} GB/s")
@ -281,7 +281,7 @@ class TestHCQ(unittest.TestCase):
q.exec(TestHCQ.runner._prg, TestHCQ.d0.kernargs_ptr, TestHCQ.runner.p.global_size, TestHCQ.runner.p.local_size) # b = [1, 2]
q.signal(sig:=TestHCQ.d0._alloc_signal(value=0), value=1)
qc.wait(sig, value=1)
qc.copy(TestHCQ.a.uop.buffer._buf.va_addr, TestHCQ.b.uop.buffer._buf.va_addr, 8)
qc.copy(TestHCQ.a.uop.buffer._buf, TestHCQ.b.uop.buffer._buf, 8)
qc.signal(TestHCQ.d0.timeline_signal, TestHCQ.d0.timeline_value)
qc.submit(TestHCQ.d0)
time.sleep(0.02) # give it time for the wait to fail

View file

@ -19,7 +19,7 @@ if __name__ == "__main__":
q.signal(sig:=AMDDevice._alloc_signal(10))
qc = HWQueue()
qc.wait(sig)
qc.copy(b1.va_addr, b2.va_addr, min(b1.size, b2.size))
qc.copy(b1, b2, min(b1.size, b2.size))
d1.completion_signal.value = 1
qc.signal(d1.completion_signal)
qc.submit(d1)

View file

@ -170,7 +170,7 @@ class HCQGraph(MultiGraphRunner):
for bufid, src in enumerate(cast(list[Buffer], ji.bufs)):
if (inprep_idx:=self.input_replace.get((j, bufid))) is not None: self.input_replace_map[enqueue_dev].add(inprep_idx)
else: cast(HCQAllocator, enqueue_dev.allocator).map(self.hcq_bufs[j][bufid])
enqueue_queue.copy(self.hcq_bufs[j][0].va_addr, self.hcq_bufs[j][1].va_addr, dest.nbytes)
enqueue_queue.copy(self.hcq_bufs[j][0], self.hcq_bufs[j][1], dest.nbytes)
self.copy_to_devs[cast(HCQCompiled, Device[dest.device])].add(cast(HCQCompiled, Device[src.device]))
# Encode finish profile timestamp (if needed).

View file

@ -467,14 +467,14 @@ class AMDCopyQueue(HWQueue):
super().q(*arr)
self.internal_cmd_sizes.append(len(arr))
def copy(self, dest:sint, src:sint, copy_size:int):
def copy(self, dest:HCQBuffer, src:HCQBuffer, copy_size:int):
copied, copy_commands = 0, (copy_size + self.max_copy_size - 1) // self.max_copy_size
for _ in range(copy_commands):
step_copy_size = min(copy_size - copied, self.max_copy_size)
self.q(self.sdma.SDMA_OP_COPY | self.sdma.SDMA_PKT_COPY_LINEAR_HEADER_SUB_OP(self.sdma.SDMA_SUBOP_COPY_LINEAR),
self.sdma.SDMA_PKT_COPY_LINEAR_COUNT_COUNT(step_copy_size - 1), 0, *data64_le(src + copied), *data64_le(dest + copied))
self.sdma.SDMA_PKT_COPY_LINEAR_COUNT_COUNT(step_copy_size - 1), 0, *data64_le(src.va_addr + copied), *data64_le(dest.va_addr + copied))
copied += step_copy_size
return self

View file

@ -185,9 +185,9 @@ class NVCopyQueue(NVCommandQueue):
self.queue_idx = queue_idx
super().__init__()
def copy(self, dest:sint, src:sint, copy_size:int):
def copy(self, dest:HCQBuffer, src:HCQBuffer, copy_size:int):
for off in range(0, copy_size, step:=(1 << 31)):
self.nvm(4, nv_gpu.NVC6B5_OFFSET_IN_UPPER, *data64(src+off), *data64(dest+off))
self.nvm(4, nv_gpu.NVC6B5_OFFSET_IN_UPPER, *data64(src.va_addr+off), *data64(dest.va_addr+off))
self.nvm(4, nv_gpu.NVC6B5_LINE_LENGTH_IN, min(copy_size-off, step))
self.nvm(4, nv_gpu.NVC6B5_LAUNCH_DMA,
nv_flags("NVC6B5_LAUNCH_DMA", data_transfer_type="non_pipelined", src_memory_layout="pitch", dst_memory_layout="pitch"))

View file

@ -149,13 +149,13 @@ class HWQueue(Generic[SignalType, HCQDeviceType, ProgramType, ArgsStateType]):
# *** commands for copy queues ***
def copy(self, dest:sint, src:sint, copy_size:int):
def copy(self, dest:HCQBuffer, src:HCQBuffer, copy_size:int):
"""
Enqueues a copy command to transfer data. Only on copy queues.
Args:
dest: The destination of the copy
src: The source of the copy
dest: The destination buffer of the copy
src: The source buffer of the copy
copy_size: The size of data to copy
"""
@ -534,7 +534,7 @@ class HCQAllocator(HCQAllocatorBase, Generic[HCQDeviceType]):
lsize = min(self.b[self.b_next].size, src.nbytes - i)
self.b[self.b_next].cpu_view().view(size=lsize, fmt='B')[:] = src.cast('B')[i:i+lsize]
self.dev.hw_copy_queue_t().wait(self.dev.timeline_signal, self.dev.timeline_value - 1) \
.copy(dest.va_addr+i, self.b[self.b_next].va_addr, lsize) \
.copy(dest.offset(i), self.b[self.b_next], lsize) \
.signal(self.dev.timeline_signal, self.dev.next_timeline()).submit(self.dev)
self.b_timeline[self.b_next] = self.dev.timeline_value - 1
@ -552,7 +552,7 @@ class HCQAllocator(HCQAllocatorBase, Generic[HCQDeviceType]):
for (batch_info, dst_off, src_off, copy_size) in src.device.allocator._copyout_sharded(src, size, _get_temp_buf, seg_len=self.b[0].size,
use_ioring=type(self.b[0].cpu_view()) is MMIOInterface):
self.dev.hw_copy_queue_t().wait(self.dev.timeline_signal, self.dev.timeline_value - 1) \
.copy(dest.va_addr + dst_off, self.b[batch_info[1]].va_addr + src_off, copy_size) \
.copy(dest.offset(dst_off), self.b[batch_info[1]].offset(src_off), copy_size) \
.signal(self.dev.timeline_signal, self.dev.next_timeline()).submit(self.dev)
self.b_timeline[batch_info[1]] = self.dev.timeline_value - 1
@ -566,7 +566,7 @@ class HCQAllocator(HCQAllocatorBase, Generic[HCQDeviceType]):
dev_suff="SDMA:0"):
for i in range(0, dest.nbytes, cp_size:=(self.max_copyout_size or self.b[0].size)):
self.dev.hw_copy_queue_t().wait(self.dev.timeline_signal, self.dev.timeline_value - 1) \
.copy(self.b[0].va_addr, src.va_addr+i, lsize:=min(cp_size, dest.nbytes-i)) \
.copy(self.b[0], src.offset(i), lsize:=min(cp_size, dest.nbytes-i)) \
.signal(self.dev.timeline_signal, self.dev.next_timeline()).submit(self.dev)
self.dev.timeline_signal.wait(self.dev.timeline_value - 1)
dest.cast('B')[i:i+lsize] = self.b[0].cpu_view().view(size=lsize, fmt='B')[:]
@ -579,7 +579,7 @@ class HCQAllocator(HCQAllocatorBase, Generic[HCQDeviceType]):
dev_suff="SDMA:0"):
src_dev.hw_copy_queue_t().wait(src_dev.timeline_signal, src_dev.timeline_value - 1) \
.wait(dest_dev.timeline_signal, dest_dev.timeline_value - 1) \
.copy(dest.va_addr, src.va_addr, sz) \
.copy(dest, src, sz) \
.signal(src_dev.timeline_signal, src_dev.next_timeline()).submit(src_dev)
if src_dev != dest_dev: