mirror of
https://github.com/tinygrad/tinygrad.git
synced 2026-06-24 02:14:17 +00:00
hcq: hcqbuf in copy (#15595)
This commit is contained in:
parent
2c4271209e
commit
902edc3781
7 changed files with 38 additions and 38 deletions
|
|
@ -194,7 +194,7 @@ class TestHCQ(unittest.TestCase):
|
|||
if TestHCQ.d0.hw_copy_queue_t is None: self.skipTest("device does not support copy queue")
|
||||
|
||||
TestHCQ.d0.hw_copy_queue_t().wait(TestHCQ.d0.timeline_signal, TestHCQ.d0.timeline_value - 1) \
|
||||
.copy(TestHCQ.b.uop.buffer._buf.va_addr, TestHCQ.a.uop.buffer._buf.va_addr, 8) \
|
||||
.copy(TestHCQ.b.uop.buffer._buf, TestHCQ.a.uop.buffer._buf, 8) \
|
||||
.signal(TestHCQ.d0.timeline_signal, TestHCQ.d0.timeline_value).submit(TestHCQ.d0)
|
||||
|
||||
TestHCQ.d0.timeline_signal.wait(TestHCQ.d0.timeline_value)
|
||||
|
|
@ -212,7 +212,7 @@ class TestHCQ(unittest.TestCase):
|
|||
ctypes.memset(buf2._buf.va_addr, 1, sz)
|
||||
|
||||
TestHCQ.d0.hw_copy_queue_t().wait(TestHCQ.d0.timeline_signal, TestHCQ.d0.timeline_value - 1) \
|
||||
.copy(buf1._buf.va_addr, buf2._buf.va_addr, sz) \
|
||||
.copy(buf1._buf, buf2._buf, sz) \
|
||||
.signal(TestHCQ.d0.timeline_signal, TestHCQ.d0.timeline_value).submit(TestHCQ.d0)
|
||||
|
||||
TestHCQ.d0.timeline_signal.wait(TestHCQ.d0.timeline_value)
|
||||
|
|
@ -236,7 +236,7 @@ class TestHCQ(unittest.TestCase):
|
|||
for j in range(32): buf2_q_view[min(max(i + j - 16, 0), (sz // 8) - 1)] = random.randint(0, 0xffffffffffffffff)
|
||||
|
||||
TestHCQ.d0.hw_copy_queue_t().wait(TestHCQ.d0.timeline_signal, TestHCQ.d0.timeline_value - 1) \
|
||||
.copy(buf1._buf.va_addr, buf2._buf.va_addr, sz) \
|
||||
.copy(buf1._buf, buf2._buf, sz) \
|
||||
.signal(TestHCQ.d0.timeline_signal, TestHCQ.d0.timeline_value).submit(TestHCQ.d0)
|
||||
|
||||
TestHCQ.d0.timeline_signal.wait(TestHCQ.d0.timeline_value)
|
||||
|
|
@ -252,7 +252,7 @@ class TestHCQ(unittest.TestCase):
|
|||
virt_dest_addr = Variable("virt_dest_addr", 0, 0xffffffffffffffff, dtypes.uint64)
|
||||
|
||||
q = TestHCQ.d0.hw_copy_queue_t().wait(TestHCQ.d0.timeline_signal, TestHCQ.d0.timeline_value - 1) \
|
||||
.copy(virt_dest_addr, virt_src_addr, 8) \
|
||||
.copy(HCQBuffer(virt_dest_addr, 8), HCQBuffer(virt_src_addr, 8), 8) \
|
||||
.signal(TestHCQ.d0.timeline_signal, TestHCQ.d0.timeline_value)
|
||||
|
||||
q.submit(TestHCQ.d0, {virt_src_addr.expr: TestHCQ.a.uop.buffer._buf.va_addr, virt_dest_addr.expr: TestHCQ.b.uop.buffer._buf.va_addr})
|
||||
|
|
@ -275,7 +275,7 @@ class TestHCQ(unittest.TestCase):
|
|||
ctypes.memset(buf2._buf.va_addr, 1, sz)
|
||||
|
||||
q = TestHCQ.d0.hw_copy_queue_t().wait(TestHCQ.d0.timeline_signal, TestHCQ.d0.timeline_value - 1) \
|
||||
.copy(virt_dest_addr, virt_src_addr, sz) \
|
||||
.copy(HCQBuffer(virt_dest_addr, sz), HCQBuffer(virt_src_addr, sz), sz) \
|
||||
.signal(TestHCQ.d0.timeline_signal, TestHCQ.d0.timeline_value)
|
||||
|
||||
q.submit(TestHCQ.d0, {virt_src_addr.expr: buf2._buf.va_addr, virt_dest_addr.expr: buf1._buf.va_addr})
|
||||
|
|
@ -350,7 +350,7 @@ class TestHCQ(unittest.TestCase):
|
|||
|
||||
sig_st, sig_en = TestHCQ.d0.new_signal(), TestHCQ.d0.new_signal()
|
||||
TestHCQ.d0.hw_copy_queue_t().timestamp(sig_st) \
|
||||
.copy(a._buf.va_addr, b._buf.va_addr, SZ) \
|
||||
.copy(a._buf, b._buf, SZ) \
|
||||
.timestamp(sig_en) \
|
||||
.signal(TestHCQ.d0.timeline_signal, TestHCQ.d0.timeline_value).submit(TestHCQ.d0)
|
||||
|
||||
|
|
@ -377,7 +377,7 @@ class TestHCQ(unittest.TestCase):
|
|||
|
||||
sig_st, sig_en = TestHCQ.d0.new_signal(), TestHCQ.d0.new_signal()
|
||||
TestHCQ.d0.hw_copy_queue_t().timestamp(sig_st) \
|
||||
.copy(a._buf.va_addr, b._buf.va_addr, SZ) \
|
||||
.copy(a._buf, b._buf, SZ) \
|
||||
.timestamp(sig_en) \
|
||||
.signal(TestHCQ.d0.timeline_signal, TestHCQ.d0.timeline_value).submit(TestHCQ.d0)
|
||||
|
||||
|
|
@ -416,7 +416,7 @@ class TestHCQ(unittest.TestCase):
|
|||
ctypes.memset(buf2._buf.va_addr, i, 1)
|
||||
|
||||
TestHCQ.d0.hw_copy_queue_t().wait(TestHCQ.d0.timeline_signal, TestHCQ.d0.timeline_value - 1) \
|
||||
.copy(buf1._buf.va_addr, buf2._buf.va_addr, 1) \
|
||||
.copy(buf1._buf, buf2._buf, 1) \
|
||||
.signal(TestHCQ.d0.timeline_signal, TestHCQ.d0.timeline_value).submit(TestHCQ.d0)
|
||||
TestHCQ.d0.timeline_signal.wait(TestHCQ.d0.timeline_value)
|
||||
TestHCQ.d0.timeline_value += 1
|
||||
|
|
@ -434,8 +434,8 @@ class TestHCQ(unittest.TestCase):
|
|||
ctypes.memset(buf3._buf.va_addr, i, 1)
|
||||
|
||||
TestHCQ.d0.hw_copy_queue_t().wait(TestHCQ.d0.timeline_signal, TestHCQ.d0.timeline_value - 1) \
|
||||
.copy(buf1._buf.va_addr, buf3._buf.va_addr, 1) \
|
||||
.copy(buf2._buf.va_addr, buf1._buf.va_addr, 1) \
|
||||
.copy(buf1._buf, buf3._buf, 1) \
|
||||
.copy(buf2._buf, buf1._buf, 1) \
|
||||
.signal(TestHCQ.d0.timeline_signal, TestHCQ.d0.timeline_value).submit(TestHCQ.d0)
|
||||
TestHCQ.d0.timeline_signal.wait(TestHCQ.d0.timeline_value)
|
||||
TestHCQ.d0.timeline_value += 1
|
||||
|
|
@ -457,8 +457,8 @@ class TestHCQ(unittest.TestCase):
|
|||
ctypes.memset(buf3._buf.va_addr, i, 1)
|
||||
|
||||
TestHCQ.d0.hw_copy_queue_t().wait(TestHCQ.d0.timeline_signal, TestHCQ.d0.timeline_value - 1) \
|
||||
.copy(buf1._buf.va_addr, buf3._buf.va_addr, 1) \
|
||||
.copy(buf2._buf.va_addr, buf1._buf.va_addr, 1) \
|
||||
.copy(buf1._buf, buf3._buf, 1) \
|
||||
.copy(buf2._buf, buf1._buf, 1) \
|
||||
.signal(TestHCQ.d0.timeline_signal, TestHCQ.d0.timeline_value).submit(TestHCQ.d0)
|
||||
TestHCQ.d0.timeline_signal.wait(TestHCQ.d0.timeline_value)
|
||||
TestHCQ.d0.timeline_value += 1
|
||||
|
|
@ -505,8 +505,8 @@ class TestHCQ(unittest.TestCase):
|
|||
TestHCQ.d0.timeline_value += 1
|
||||
|
||||
TestHCQ.d0.hw_copy_queue_t().wait(TestHCQ.d0.timeline_signal, TestHCQ.d0.timeline_value - 1) \
|
||||
.copy(buf1._buf.va_addr, buf3._buf.va_addr, 1) \
|
||||
.copy(buf2._buf.va_addr, buf1._buf.va_addr, 1) \
|
||||
.copy(buf1._buf, buf3._buf, 1) \
|
||||
.copy(buf2._buf, buf1._buf, 1) \
|
||||
.signal(TestHCQ.d0.timeline_signal, TestHCQ.d0.timeline_value).submit(TestHCQ.d0)
|
||||
TestHCQ.d0.timeline_signal.wait(TestHCQ.d0.timeline_value)
|
||||
TestHCQ.d0.timeline_value += 1
|
||||
|
|
@ -531,7 +531,7 @@ class TestHCQ(unittest.TestCase):
|
|||
d.allocator.map(cpu_buffer._buf)
|
||||
|
||||
d.hw_copy_queue_t().wait(d.timeline_signal, d.timeline_value - 1) \
|
||||
.copy(local_buf._buf.va_addr, cpu_buffer._buf.va_addr, sz) \
|
||||
.copy(local_buf._buf, cpu_buffer._buf, sz) \
|
||||
.signal(d.timeline_signal, d.timeline_value).submit(d)
|
||||
d.timeline_signal.wait(d.timeline_value)
|
||||
d.timeline_value += 1
|
||||
|
|
|
|||
20
test/external/external_test_hcq.py
vendored
|
|
@ -24,8 +24,8 @@ class TestHCQ(unittest.TestCase):
|
|||
TestHCQ.runner = get_runner(TestHCQ.d0.device, si.ast)
|
||||
TestHCQ.b.uop.buffer.allocate()
|
||||
# wow that's a lot of abstraction layers
|
||||
TestHCQ.addr = struct.pack("QQ", TestHCQ.b.uop.buffer._buf.va_addr, TestHCQ.a.uop.buffer._buf.va_addr)
|
||||
TestHCQ.addr2 = struct.pack("QQ", TestHCQ.a.uop.buffer._buf.va_addr, TestHCQ.b.uop.buffer._buf.va_addr)
|
||||
TestHCQ.addr = struct.pack("QQ", TestHCQ.b.uop.buffer._buf, TestHCQ.a.uop.buffer._buf)
|
||||
TestHCQ.addr2 = struct.pack("QQ", TestHCQ.a.uop.buffer._buf, TestHCQ.b.uop.buffer._buf)
|
||||
TestHCQ.kernargs_off = TestHCQ.runner._prg.kernargs_offset
|
||||
TestHCQ.kernargs_size = TestHCQ.runner._prg.kernargs_alloc_size
|
||||
ctypes.memmove(TestHCQ.d0.kernargs_ptr+TestHCQ.kernargs_off, TestHCQ.addr, len(TestHCQ.addr))
|
||||
|
|
@ -211,8 +211,8 @@ class TestHCQ(unittest.TestCase):
|
|||
|
||||
def test_copy_1000_times(self):
|
||||
q = TestHCQ.copy_queue()
|
||||
q.copy(TestHCQ.a.uop.buffer._buf.va_addr, TestHCQ.b.uop.buffer._buf.va_addr, 8)
|
||||
q.copy(TestHCQ.b.uop.buffer._buf.va_addr, TestHCQ.a.uop.buffer._buf.va_addr, 8)
|
||||
q.copy(TestHCQ.a.uop.buffer._buf, TestHCQ.b.uop.buffer._buf, 8)
|
||||
q.copy(TestHCQ.b.uop.buffer._buf, TestHCQ.a.uop.buffer._buf, 8)
|
||||
for _ in range(1000):
|
||||
q.submit(TestHCQ.d0)
|
||||
TestHCQ.copy_queue().signal(TestHCQ.d0.timeline_signal, TestHCQ.d0.timeline_value).submit(TestHCQ.d0)
|
||||
|
|
@ -226,7 +226,7 @@ class TestHCQ(unittest.TestCase):
|
|||
|
||||
def test_copy(self):
|
||||
q = TestHCQ.copy_queue()
|
||||
q.copy(TestHCQ.b.uop.buffer._buf.va_addr, TestHCQ.a.uop.buffer._buf.va_addr, 8)
|
||||
q.copy(TestHCQ.b.uop.buffer._buf, TestHCQ.a.uop.buffer._buf, 8)
|
||||
q.signal(TestHCQ.d0.timeline_signal, TestHCQ.d0.timeline_value)
|
||||
q.submit(TestHCQ.d0)
|
||||
TestHCQ.d0._wait_signal(TestHCQ.d0.timeline_signal, TestHCQ.d0.timeline_value)
|
||||
|
|
@ -237,8 +237,8 @@ class TestHCQ(unittest.TestCase):
|
|||
@unittest.skipUnless(Device.DEFAULT == "NV", "Only NV supports bind")
|
||||
def test_bind_copy(self):
|
||||
q = TestHCQ.copy_queue()
|
||||
q.copy(TestHCQ.a.uop.buffer._buf.va_addr, TestHCQ.b.uop.buffer._buf.va_addr, 8)
|
||||
q.copy(TestHCQ.b.uop.buffer._buf.va_addr, TestHCQ.a.uop.buffer._buf.va_addr, 8)
|
||||
q.copy(TestHCQ.a.uop.buffer._buf, TestHCQ.b.uop.buffer._buf, 8)
|
||||
q.copy(TestHCQ.b.uop.buffer._buf, TestHCQ.a.uop.buffer._buf, 8)
|
||||
q.bind(TestHCQ.d0)
|
||||
for _ in range(1000):
|
||||
q.submit(TestHCQ.d0)
|
||||
|
|
@ -257,7 +257,7 @@ class TestHCQ(unittest.TestCase):
|
|||
a = Buffer(Device.DEFAULT, SZ, dtypes.uint8, options=BufferSpec(nolru=True)).allocate()
|
||||
b = Buffer(Device.DEFAULT, SZ, dtypes.uint8, options=BufferSpec(nolru=True)).allocate()
|
||||
q = TestHCQ.copy_queue()
|
||||
q.copy(a._buf.va_addr, b._buf.va_addr, SZ)
|
||||
q.copy(a._buf, b._buf, SZ)
|
||||
et = _time_queue(q, TestHCQ.d0)
|
||||
gb_s = (SZ/1e9)/et
|
||||
print(f"same device copy: {et*1e3:.2f} ms, {gb_s:.2f} GB/s")
|
||||
|
|
@ -269,7 +269,7 @@ class TestHCQ(unittest.TestCase):
|
|||
a = Buffer(Device.DEFAULT, SZ, dtypes.uint8, options=BufferSpec(nolru=True)).allocate()
|
||||
TestHCQ.d0._gpu_map(b._buf)
|
||||
q = TestHCQ.copy_queue()
|
||||
q.copy(a._buf.va_addr, b._buf.va_addr, SZ)
|
||||
q.copy(a._buf, b._buf, SZ)
|
||||
et = _time_queue(q, TestHCQ.d0)
|
||||
gb_s = (SZ/1e9)/et
|
||||
print(f"cross device copy: {et*1e3:.2f} ms, {gb_s:.2f} GB/s")
|
||||
|
|
@ -281,7 +281,7 @@ class TestHCQ(unittest.TestCase):
|
|||
q.exec(TestHCQ.runner._prg, TestHCQ.d0.kernargs_ptr, TestHCQ.runner.p.global_size, TestHCQ.runner.p.local_size) # b = [1, 2]
|
||||
q.signal(sig:=TestHCQ.d0._alloc_signal(value=0), value=1)
|
||||
qc.wait(sig, value=1)
|
||||
qc.copy(TestHCQ.a.uop.buffer._buf.va_addr, TestHCQ.b.uop.buffer._buf.va_addr, 8)
|
||||
qc.copy(TestHCQ.a.uop.buffer._buf, TestHCQ.b.uop.buffer._buf, 8)
|
||||
qc.signal(TestHCQ.d0.timeline_signal, TestHCQ.d0.timeline_value)
|
||||
qc.submit(TestHCQ.d0)
|
||||
time.sleep(0.02) # give it time for the wait to fail
|
||||
|
|
|
|||
2
test/external/fuzz_kfd.py
vendored
|
|
@ -19,7 +19,7 @@ if __name__ == "__main__":
|
|||
q.signal(sig:=AMDDevice._alloc_signal(10))
|
||||
qc = HWQueue()
|
||||
qc.wait(sig)
|
||||
qc.copy(b1.va_addr, b2.va_addr, min(b1.size, b2.size))
|
||||
qc.copy(b1, b2, min(b1.size, b2.size))
|
||||
d1.completion_signal.value = 1
|
||||
qc.signal(d1.completion_signal)
|
||||
qc.submit(d1)
|
||||
|
|
|
|||
|
|
@ -170,7 +170,7 @@ class HCQGraph(MultiGraphRunner):
|
|||
for bufid, src in enumerate(cast(list[Buffer], ji.bufs)):
|
||||
if (inprep_idx:=self.input_replace.get((j, bufid))) is not None: self.input_replace_map[enqueue_dev].add(inprep_idx)
|
||||
else: cast(HCQAllocator, enqueue_dev.allocator).map(self.hcq_bufs[j][bufid])
|
||||
enqueue_queue.copy(self.hcq_bufs[j][0].va_addr, self.hcq_bufs[j][1].va_addr, dest.nbytes)
|
||||
enqueue_queue.copy(self.hcq_bufs[j][0], self.hcq_bufs[j][1], dest.nbytes)
|
||||
self.copy_to_devs[cast(HCQCompiled, Device[dest.device])].add(cast(HCQCompiled, Device[src.device]))
|
||||
|
||||
# Encode finish profile timestamp (if needed).
|
||||
|
|
|
|||
|
|
@ -467,14 +467,14 @@ class AMDCopyQueue(HWQueue):
|
|||
super().q(*arr)
|
||||
self.internal_cmd_sizes.append(len(arr))
|
||||
|
||||
def copy(self, dest:sint, src:sint, copy_size:int):
|
||||
def copy(self, dest:HCQBuffer, src:HCQBuffer, copy_size:int):
|
||||
copied, copy_commands = 0, (copy_size + self.max_copy_size - 1) // self.max_copy_size
|
||||
|
||||
for _ in range(copy_commands):
|
||||
step_copy_size = min(copy_size - copied, self.max_copy_size)
|
||||
|
||||
self.q(self.sdma.SDMA_OP_COPY | self.sdma.SDMA_PKT_COPY_LINEAR_HEADER_SUB_OP(self.sdma.SDMA_SUBOP_COPY_LINEAR),
|
||||
self.sdma.SDMA_PKT_COPY_LINEAR_COUNT_COUNT(step_copy_size - 1), 0, *data64_le(src + copied), *data64_le(dest + copied))
|
||||
self.sdma.SDMA_PKT_COPY_LINEAR_COUNT_COUNT(step_copy_size - 1), 0, *data64_le(src.va_addr + copied), *data64_le(dest.va_addr + copied))
|
||||
|
||||
copied += step_copy_size
|
||||
return self
|
||||
|
|
|
|||
|
|
@ -185,9 +185,9 @@ class NVCopyQueue(NVCommandQueue):
|
|||
self.queue_idx = queue_idx
|
||||
super().__init__()
|
||||
|
||||
def copy(self, dest:sint, src:sint, copy_size:int):
|
||||
def copy(self, dest:HCQBuffer, src:HCQBuffer, copy_size:int):
|
||||
for off in range(0, copy_size, step:=(1 << 31)):
|
||||
self.nvm(4, nv_gpu.NVC6B5_OFFSET_IN_UPPER, *data64(src+off), *data64(dest+off))
|
||||
self.nvm(4, nv_gpu.NVC6B5_OFFSET_IN_UPPER, *data64(src.va_addr+off), *data64(dest.va_addr+off))
|
||||
self.nvm(4, nv_gpu.NVC6B5_LINE_LENGTH_IN, min(copy_size-off, step))
|
||||
self.nvm(4, nv_gpu.NVC6B5_LAUNCH_DMA,
|
||||
nv_flags("NVC6B5_LAUNCH_DMA", data_transfer_type="non_pipelined", src_memory_layout="pitch", dst_memory_layout="pitch"))
|
||||
|
|
|
|||
|
|
@ -149,13 +149,13 @@ class HWQueue(Generic[SignalType, HCQDeviceType, ProgramType, ArgsStateType]):
|
|||
|
||||
# *** commands for copy queues ***
|
||||
|
||||
def copy(self, dest:sint, src:sint, copy_size:int):
|
||||
def copy(self, dest:HCQBuffer, src:HCQBuffer, copy_size:int):
|
||||
"""
|
||||
Enqueues a copy command to transfer data. Only on copy queues.
|
||||
|
||||
Args:
|
||||
dest: The destination of the copy
|
||||
src: The source of the copy
|
||||
dest: The destination buffer of the copy
|
||||
src: The source buffer of the copy
|
||||
copy_size: The size of data to copy
|
||||
"""
|
||||
|
||||
|
|
@ -534,7 +534,7 @@ class HCQAllocator(HCQAllocatorBase, Generic[HCQDeviceType]):
|
|||
lsize = min(self.b[self.b_next].size, src.nbytes - i)
|
||||
self.b[self.b_next].cpu_view().view(size=lsize, fmt='B')[:] = src.cast('B')[i:i+lsize]
|
||||
self.dev.hw_copy_queue_t().wait(self.dev.timeline_signal, self.dev.timeline_value - 1) \
|
||||
.copy(dest.va_addr+i, self.b[self.b_next].va_addr, lsize) \
|
||||
.copy(dest.offset(i), self.b[self.b_next], lsize) \
|
||||
.signal(self.dev.timeline_signal, self.dev.next_timeline()).submit(self.dev)
|
||||
self.b_timeline[self.b_next] = self.dev.timeline_value - 1
|
||||
|
||||
|
|
@ -552,7 +552,7 @@ class HCQAllocator(HCQAllocatorBase, Generic[HCQDeviceType]):
|
|||
for (batch_info, dst_off, src_off, copy_size) in src.device.allocator._copyout_sharded(src, size, _get_temp_buf, seg_len=self.b[0].size,
|
||||
use_ioring=type(self.b[0].cpu_view()) is MMIOInterface):
|
||||
self.dev.hw_copy_queue_t().wait(self.dev.timeline_signal, self.dev.timeline_value - 1) \
|
||||
.copy(dest.va_addr + dst_off, self.b[batch_info[1]].va_addr + src_off, copy_size) \
|
||||
.copy(dest.offset(dst_off), self.b[batch_info[1]].offset(src_off), copy_size) \
|
||||
.signal(self.dev.timeline_signal, self.dev.next_timeline()).submit(self.dev)
|
||||
self.b_timeline[batch_info[1]] = self.dev.timeline_value - 1
|
||||
|
||||
|
|
@ -566,7 +566,7 @@ class HCQAllocator(HCQAllocatorBase, Generic[HCQDeviceType]):
|
|||
dev_suff="SDMA:0"):
|
||||
for i in range(0, dest.nbytes, cp_size:=(self.max_copyout_size or self.b[0].size)):
|
||||
self.dev.hw_copy_queue_t().wait(self.dev.timeline_signal, self.dev.timeline_value - 1) \
|
||||
.copy(self.b[0].va_addr, src.va_addr+i, lsize:=min(cp_size, dest.nbytes-i)) \
|
||||
.copy(self.b[0], src.offset(i), lsize:=min(cp_size, dest.nbytes-i)) \
|
||||
.signal(self.dev.timeline_signal, self.dev.next_timeline()).submit(self.dev)
|
||||
self.dev.timeline_signal.wait(self.dev.timeline_value - 1)
|
||||
dest.cast('B')[i:i+lsize] = self.b[0].cpu_view().view(size=lsize, fmt='B')[:]
|
||||
|
|
@ -579,7 +579,7 @@ class HCQAllocator(HCQAllocatorBase, Generic[HCQDeviceType]):
|
|||
dev_suff="SDMA:0"):
|
||||
src_dev.hw_copy_queue_t().wait(src_dev.timeline_signal, src_dev.timeline_value - 1) \
|
||||
.wait(dest_dev.timeline_signal, dest_dev.timeline_value - 1) \
|
||||
.copy(dest.va_addr, src.va_addr, sz) \
|
||||
.copy(dest, src, sz) \
|
||||
.signal(src_dev.timeline_signal, src_dev.next_timeline()).submit(src_dev)
|
||||
|
||||
if src_dev != dest_dev:
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue