mirror of
https://github.com/tinygrad/tinygrad.git
synced 2026-06-24 02:14:17 +00:00
hcq2: add kfd (#16537)
This commit is contained in:
parent
03943cd1a0
commit
8baca185d5
2 changed files with 179 additions and 28 deletions
|
|
@ -3,7 +3,7 @@ from typing import cast, Callable, TypeVar, Generic, Any, TYPE_CHECKING
|
|||
import struct, functools, time, collections, importlib, itertools, weakref
|
||||
from dataclasses import replace
|
||||
if TYPE_CHECKING: from tinygrad.engine.realize import ExecContext
|
||||
from tinygrad.helpers import DEV, getenv, select_first_inited, select_by_name, suppress_finalizing, mv_address, round_up, DEBUG, dedup, pluralize
|
||||
from tinygrad.helpers import DEV, getenv, select_first_inited, select_by_name, suppress_finalizing, mv_address, DEBUG, dedup, pluralize
|
||||
from tinygrad.device import Device, Buffer, BufferSpec, Compiled, LRUAllocator, MultiBuffer
|
||||
from tinygrad.uop.ops import Ops, sint, UOp, UPat, PatternMatcher, KernelInfo, graph_rewrite, track_rewrites, GroupOp
|
||||
from tinygrad.uop.symbolic import symbolic_simple, symbolic
|
||||
|
|
@ -109,7 +109,7 @@ class HCQAllocator(LRUAllocator[HCQDeviceType], Generic[HCQDeviceType]):
|
|||
|
||||
def _unmap(self, mb):
|
||||
self.dev.synchronize()
|
||||
self.dev.iface.dev_impl.mm.unmap_range(int(mb.va_addr), round_up(mb.size, 0x1000))
|
||||
self.dev.iface.free(mb)
|
||||
|
||||
def _offset(self, buf, size:int, offset:int) -> HCQ2Buffer: return buf.offset(offset=offset, size=size)
|
||||
|
||||
|
|
@ -393,7 +393,7 @@ def encode_cmdbuf(submit:UOp, lin:UOp) -> UOp|None:
|
|||
pm_encode_cmdbufs = PatternMatcher([(UPat(Ops.CUSTOM_FUNCTION, arg="submit", src=(UPat(Ops.LINEAR, name="lin"),), name="submit"), encode_cmdbuf)])
|
||||
|
||||
# *****************
|
||||
# 4.2. lift patches to the command buffer (root)
|
||||
# 5.2. lift patches to the command buffer (root)
|
||||
|
||||
def lift_patches_to_cmdbuf(cmdbuf:UOp) -> UOp|None:
|
||||
if not (patches:=dedup(u for store in cmdbuf.src[1:] for u in store.toposort() if u.op is Ops.AFTER)): return None
|
||||
|
|
@ -404,7 +404,7 @@ pm_lift_patches_to_cmdbuf = PatternMatcher([
|
|||
])
|
||||
|
||||
# *****************
|
||||
# 5. bufferize placeholders: replace placeholders with real buffers.
|
||||
# 6. bufferize placeholders: replace placeholders with real buffers.
|
||||
|
||||
def bufferize_buf(buf:UOp) -> UOp|None:
|
||||
if buf.tag is None: return None
|
||||
|
|
@ -413,7 +413,7 @@ def bufferize_buf(buf:UOp) -> UOp|None:
|
|||
pm_bufferize = PatternMatcher([(UPat(Ops.BUFFER, name="buf"), bufferize_buf)])
|
||||
|
||||
# *****************
|
||||
# 6.1. capture buffers reachable from each hcq call as BIND, so we don't drop their refs
|
||||
# 7.1. capture buffers reachable from each hcq call as BIND, so we don't drop their refs
|
||||
|
||||
def hold_call_buffers(call:UOp) -> UOp|None:
|
||||
if not (bufs:=tuple(dedup(u for u in call.src[0].toposort() if u.op is Ops.BUFFER and u not in call.src))): return None
|
||||
|
|
@ -421,7 +421,7 @@ def hold_call_buffers(call:UOp) -> UOp|None:
|
|||
pm_hold_call_buffers = PatternMatcher([(UPat(Ops.CALL, tag="hcq", name="call"), hold_call_buffers)])
|
||||
|
||||
# *****************
|
||||
# 6.2. resolve patches
|
||||
# 7.2. resolve patches
|
||||
|
||||
def push_stack(op, s): return UOp(Ops.STACK, op.dtype.scalar().vec(len(s.src)),
|
||||
tuple(op.replace(dtype=op.dtype.scalar(), src=tuple(x if y is s else y for y in op.src)) for x in s.src))
|
||||
|
|
@ -456,7 +456,7 @@ pm_resolve_patches = PatternMatcher([
|
|||
]) + symbolic_simple
|
||||
|
||||
# *****************
|
||||
# 7. callify hcq programs
|
||||
# 8. callify hcq programs
|
||||
|
||||
def to_param(bufs:list[UOp], ref:UOp) -> UOp:
|
||||
bufs.append(ref)
|
||||
|
|
|
|||
|
|
@ -14,6 +14,7 @@ from tinygrad.renderer.llvmir import AMDLLVMRenderer
|
|||
from tinygrad.runtime.autogen import kfd, hsa, sqtt, amdgpu_kd, amdgpu_drm
|
||||
from tinygrad.runtime.autogen.am import am
|
||||
from tinygrad.runtime.support.elf import elf_loader
|
||||
from tinygrad.runtime.support.hcq import FileIOInterface, HCQBuffer, MMIOInterface, hcq_filter_visible_devices
|
||||
from tinygrad.runtime.support.am.amdev import AMDev, AMMemoryManager
|
||||
from tinygrad.runtime.support.amd import AMDReg, AMDIP, import_module, import_soc, import_pmc
|
||||
from tinygrad.runtime.support.system import PCIIfaceBase, PCIAllocationMeta, USBPCIDevice, MAP_FIXED, MAP_NORESERVE
|
||||
|
|
@ -294,12 +295,163 @@ class AMDAllocator(HCQAllocator['AMDDevice']):
|
|||
|
||||
@dataclass
|
||||
class AMDQueueDesc:
|
||||
ring: Buffer # uint32[ring_size//4]
|
||||
read_ptr: Buffer # uint64[1]
|
||||
write_ptr: Buffer # uint64[1]
|
||||
doorbell: Buffer # uint64[1]
|
||||
put_value: Buffer # uint64[1]
|
||||
params: tuple|None = None # setup_ring params for recovery
|
||||
ring: Buffer; read_ptr: Buffer; write_ptr: Buffer; doorbell: Buffer; put_value: Buffer # noqa: E702
|
||||
eop_buffer: Buffer|None = None; cwsr_buffer: Buffer|None = None; params: tuple|None = None # noqa: E702
|
||||
|
||||
class KFDIface:
  """Device interface that talks to an AMD GPU through `/dev/kfd` and the matching DRM render node.

  The class-level attributes below are shared by every instance: the `/dev/kfd` handle is opened once, the list of
  usable topology nodes is enumerated once, and a single event page is allocated by the first device and then mapped
  by each subsequent one.
  """
  kfd:FileIOInterface|None = None
  event_page:HCQBuffer|None = None
  # NOTE(review): entries are used below as topology node directory names (they are `.split('/')` and interpolated
  # into sysfs paths), so this looks like it should be list[str] -- confirm against hcq_filter_visible_devices.
  gpus:list[FileIOInterface] = []
  count:int = 0

  def _is_usable_gpu(self, gpu_id):
    """Return True if the `gpu_id` file of a topology node reads as a non-zero integer.

    An OSError while reading is treated as "not usable" and yields False.
    """
    with contextlib.suppress(OSError): return int(gpu_id.read()) != 0
    return False

  def __init__(self, dev, device_id):
    """Bind `dev` to the `device_id`-th visible KFD node and create the events used to wait on it.

    Raises RuntimeError if `device_id` is not below the number of visible nodes.
    """
    self.dev = dev

    kfd_topo_path = "/sys/devices/virtual/kfd/kfd/topology/nodes"

    # Initialize KFD interface during first run
    if KFDIface.kfd is None:
      KFDIface.kfd = FileIOInterface("/dev/kfd", os.O_RDWR)
      gpus = [g for g in FileIOInterface(kfd_topo_path).listdir() if self._is_usable_gpu(FileIOInterface(f"{kfd_topo_path}/{g}/gpu_id"))]
      KFDIface.gpus = hcq_filter_visible_devices(sorted(gpus, key=lambda x: int(x.split('/')[-1])), "AMD")
      KFDIface.count = len(KFDIface.gpus)

    if device_id >= len(KFDIface.gpus): raise RuntimeError(f"No device found for {device_id}. Requesting more devices than the system has?")

    self.gpu_id = int(FileIOInterface(f"{kfd_topo_path}/{KFDIface.gpus[device_id]}/gpu_id").read())
    # The properties file is parsed as whitespace-separated "<name> <integer>" lines.
    self.props = {(p:=l.split())[0]: int(p[1]) for l in FileIOInterface(f"{kfd_topo_path}/{KFDIface.gpus[device_id]}/properties").read().splitlines()}
    self.dev_sysfs_path = f"/sys/class/drm/renderD{self.props['drm_render_minor']}/device"
    ip_base = f"{self.dev_sysfs_path}/ip_discovery/die/0"
    id2ip = {am.GC_HWID: am.GC_HWIP, am.SDMA0_HWID: am.SDMA0_HWIP, am.NBIF_HWID: am.NBIF_HWIP}
    ip_hw = [(id2ip[int(hwid)], int(hwid)) for hwid in FileIOInterface(ip_base).listdir() if hwid.isnumeric() and int(hwid) in id2ip]
    # (major, minor, revision) per IP block, read from instance 0 of each hardware id directory.
    self.ip_versions = {ip:tuple(int(FileIOInterface(f'{ip_base}/{hw}/0/{part}').read()) for part in ['major','minor','revision']) for ip,hw in ip_hw}
    self.drm_fd = FileIOInterface(f"/dev/dri/renderD{self.props['drm_render_minor']}", os.O_RDWR)

    self.kfd_ver = ((ver_st:=kfd.AMDKFD_IOC_GET_VERSION(KFDIface.kfd)).major_version, ver_st.minor_version)
    kfd.AMDKFD_IOC_ACQUIRE_VM(KFDIface.kfd, drm_fd=self.drm_fd.fd, gpu_id=self.gpu_id)
    if self.kfd_ver >= (1,14): kfd.AMDKFD_IOC_RUNTIME_ENABLE(KFDIface.kfd, mode_mask=0)

    # Set these for our device.
    if KFDIface.event_page is None:
      KFDIface.event_page = self.alloc(0x8000, uncached=True)
      kfd.AMDKFD_IOC_CREATE_EVENT(KFDIface.kfd, event_page_offset=KFDIface.event_page.meta.handle)
    else: self.map(KFDIface.event_page)

    # Event to wait for queues completion
    self.dev.queue_event = kfd.AMDKFD_IOC_CREATE_EVENT(KFDIface.kfd, event_type=kfd.KFD_IOC_EVENT_SIGNAL, auto_reset=1)
    self.dev.queue_event_mailbox_ptr = KFDIface.event_page.va_addr + self.dev.queue_event.event_slot_index * 8

    # OS events to collect memory and hardware faults
    self.mem_fault_event = kfd.AMDKFD_IOC_CREATE_EVENT(KFDIface.kfd, event_type=kfd.KFD_IOC_EVENT_MEMORY)
    self.hw_fault_event = kfd.AMDKFD_IOC_CREATE_EVENT(KFDIface.kfd, event_type=kfd.KFD_IOC_EVENT_HW_EXCEPTION)

    # Index 0 is the queue event, 1 the memory fault event, 2 the hw fault event; sleep() and on_device_hang() rely on this order.
    self.queue_event_arr = (kfd.struct_kfd_event_data * 3)(kfd.struct_kfd_event_data(event_id=self.dev.queue_event.event_id),
      kfd.struct_kfd_event_data(event_id=self.mem_fault_event.event_id), kfd.struct_kfd_event_data(event_id=self.hw_fault_event.event_id))
    self.queue_event_arr_ptr = ctypes.addressof(self.queue_event_arr)

  def alloc(self, size:int, host=False, uncached=False, cpu_access=False, contiguous=False, cpu_addr=None) -> HCQBuffer:
    """Allocate `size` bytes through KFD, map them to this GPU and return the resulting HCQBuffer.

    `uncached` selects a coherent, uncached GTT allocation; otherwise `host` selects a userptr allocation and the
    default is VRAM. `cpu_addr`, when given, is used as the backing address of a userptr allocation. `contiguous`
    is accepted but not read by this implementation.

    Raises MemoryError when the allocation ioctl fails with ENOMEM, or with EINVAL for cpu-accessible VRAM; any other
    OSError is re-raised unchanged.
    """
    flags = kfd.KFD_IOC_ALLOC_MEM_FLAGS_WRITABLE | kfd.KFD_IOC_ALLOC_MEM_FLAGS_EXECUTABLE | kfd.KFD_IOC_ALLOC_MEM_FLAGS_NO_SUBSTITUTE

    if uncached: flags |= kfd.KFD_IOC_ALLOC_MEM_FLAGS_COHERENT | kfd.KFD_IOC_ALLOC_MEM_FLAGS_UNCACHED | kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT
    else: flags |= (kfd.KFD_IOC_ALLOC_MEM_FLAGS_USERPTR if host else kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM)

    # Make the mapping of a caller-provided cpu address uncached
    if cpu_addr is not None: flags |= kfd.KFD_IOC_ALLOC_MEM_FLAGS_COHERENT | kfd.KFD_IOC_ALLOC_MEM_FLAGS_UNCACHED

    if cpu_access or host: flags |= kfd.KFD_IOC_ALLOC_MEM_FLAGS_PUBLIC

    if flags & kfd.KFD_IOC_ALLOC_MEM_FLAGS_USERPTR:
      # Userptr: the cpu mapping itself is the backing store, so it is passed as both va_addr and mmap_offset.
      buf = addr = cpu_addr or FileIOInterface.anon_mmap(0, size, mmap.PROT_READ | mmap.PROT_WRITE, mmap.MAP_SHARED | mmap.MAP_ANONYMOUS, 0)
    else: buf, addr = 0, FileIOInterface.anon_mmap(0, size, 0, mmap.MAP_PRIVATE | mmap.MAP_ANONYMOUS | MAP_NORESERVE, 0)

    try: mem = kfd.AMDKFD_IOC_ALLOC_MEMORY_OF_GPU(self.kfd, va_addr=addr, size=size, gpu_id=self.gpu_id, flags=flags, mmap_offset=buf)
    except OSError as e:
      if e.errno == errno.EINVAL and (flags & kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM) and cpu_access:
        raise MemoryError("Cannot allocate host-visible VRAM. Ensure the resizable BAR option is enabled on your system.") from e
      if e.errno == errno.ENOMEM: raise MemoryError(f"Cannot allocate {size} bytes: no memory is available.") from e
      raise

    if not (flags & kfd.KFD_IOC_ALLOC_MEM_FLAGS_USERPTR):
      # Replace the reserved anonymous range with the real mapping at the same address (MAP_FIXED).
      buf = self.drm_fd.mmap(mem.va_addr, mem.size, mmap.PROT_READ | mmap.PROT_WRITE, mmap.MAP_SHARED | MAP_FIXED, mem.mmap_offset)
      assert addr == buf == mem.va_addr

    view = MMIOInterface(mem.va_addr, mem.size, fmt='B') if cpu_access or host else None
    self.map(hcqbuf:=HCQBuffer(mem.va_addr, mem.size, meta=mem, view=view, owner=self.dev))
    return hcqbuf

  def free(self, mem):
    """Unmap `mem` from this GPU and, if this device owns it, release the cpu mapping and the KFD allocation."""
    gpus = (ctypes.c_int32 * 1)(self.gpu_id)
    stm = kfd.AMDKFD_IOC_UNMAP_MEMORY_FROM_GPU(self.kfd, handle=mem.meta.handle, device_ids_array_ptr=ctypes.addressof(gpus), n_devices=1)
    assert stm.n_success == 1
    if mem.owner == self.dev:
      if mem.va_addr: FileIOInterface.munmap(mem.va_addr, mem.size)
      kfd.AMDKFD_IOC_FREE_MEMORY_OF_GPU(self.kfd, handle=mem.meta.handle)

  def map(self, mem):
    """Make `mem` accessible from this GPU and return an HCQBuffer describing the mapping.

    Memory whose owner reports `_is_cpu()` is wrapped in a new userptr allocation over the same cpu address; any other
    memory is mapped to this GPU through its existing KFD handle.
    """
    if mem.owner is not None and mem.owner._is_cpu(): return self.alloc(mem.size, host=True, cpu_addr=mem.va_addr)

    c_gpus = (ctypes.c_int32 * 1)(self.gpu_id)
    stm = kfd.AMDKFD_IOC_MAP_MEMORY_TO_GPU(self.kfd, handle=mem.meta.handle, device_ids_array_ptr=ctypes.addressof(c_gpus), n_devices=1)
    assert stm.n_success == 1
    return HCQBuffer(mem.va_addr, mem.size, meta=mem.meta, owner=mem.owner)

  def create_queue(self, queue_type, ring, gart, rptr, wptr, eop_buffer=None, cwsr_buffer=None, ctl_stack_size=0, ctx_save_restore_size=0,
                   xcc_id=0, idx=0):
    """Create a KFD queue over `ring` and return an AMDQueueDesc for it.

    `rptr` and `wptr` are byte offsets into `gart` for the read and write pointers; the read pointer is further offset
    by `8*xcc_id`. `idx` is accepted but not read by this implementation.
    """
    queue = kfd.AMDKFD_IOC_CREATE_QUEUE(KFDIface.kfd, ring_base_address=ring._buf.va_addr, ring_size=ring._buf.size, gpu_id=self.gpu_id,
      queue_type=queue_type, queue_percentage=kfd.KFD_MAX_QUEUE_PERCENTAGE|(xcc_id<<8), queue_priority=getenv("AMD_KFD_QUEUE_PRIORITY", 7),
      eop_buffer_address=eop_buffer._buf.va_addr if eop_buffer else 0, eop_buffer_size=eop_buffer._buf.size if eop_buffer else 0,
      ctl_stack_size=ctl_stack_size, ctx_save_restore_address=cwsr_buffer._buf.va_addr if cwsr_buffer else 0, ctx_save_restore_size=ctx_save_restore_size,
      write_pointer_address=gart._buf.va_addr+wptr, read_pointer_address=gart._buf.va_addr+rptr+8*xcc_id)

    # The doorbell range is mapped once, on the first queue, and reused by later queues of this interface.
    if not hasattr(self, 'doorbells'):
      self.doorbells_base = queue.doorbell_offset & (~0x1fff) # doorbell is two pages
      self.doorbells = cast(FileIOInterface, KFDIface.kfd).mmap(0, 0x2000, mmap.PROT_READ|mmap.PROT_WRITE, mmap.MAP_SHARED, self.doorbells_base)

    (put_value := Buffer("CPU", 1, dtypes.uint64, preallocate=True))._buf.view.view(fmt='Q')[0] = 0
    doorbell = Buffer("CPU", 1, dtypes.uint64,
      options=BufferSpec(external_ptr=self.doorbells + queue.doorbell_offset - self.doorbells_base), preallocate=True)
    return AMDQueueDesc(ring=ring, doorbell=doorbell, read_ptr=gart.view(1, dtypes.uint64, rptr+8*xcc_id).ensure_allocated(),
      write_ptr=gart.view(1, dtypes.uint64, wptr).ensure_allocated(), put_value=put_value, eop_buffer=eop_buffer, cwsr_buffer=cwsr_buffer)

  def sleep(self, tm:int):
    """Wait up to `tm` for any of the queue, memory-fault or hw-fault events; escalate to on_device_hang() on a fault."""
    kfd.AMDKFD_IOC_WAIT_EVENTS(KFDIface.kfd, events_ptr=self.queue_event_arr_ptr, num_events=3, wait_for_all=0, timeout=tm)
    if self.queue_event_arr[1].memory_exception_data.gpu_id or self.queue_event_arr[2].hw_exception_data.gpu_id: self.on_device_hang()

  def on_device_hang(self):
    """Raise RuntimeError describing the memory and/or hardware fault recorded in the event array.

    When no fault data is present even after one more short wait, the error carries a generic message instead of
    being empty.
    """
    def _str(st): return ' '.join(f'{k[0]}={getattr(st, k[0])}' for k in st._real_fields_)

    # try to collect fault info if not already set from sleep().
    if not self.queue_event_arr[1].memory_exception_data.gpu_id and not self.queue_event_arr[2].hw_exception_data.gpu_id:
      # sleep() re-enters this method and raises when it sees a fault; that inner error is dropped and rebuilt below.
      with contextlib.suppress(RuntimeError): self.sleep(tm=1)

    report = []
    if self.queue_event_arr[1].memory_exception_data.gpu_id:
      report += [f"MMU fault: 0x{self.queue_event_arr[1].memory_exception_data.va:X} | {_str(self.queue_event_arr[1].memory_exception_data.failure)}"]
    if self.queue_event_arr[2].hw_exception_data.gpu_id: report += [f"HW fault: {_str(self.queue_event_arr[2].hw_exception_data)}"]

    # Never surface a blank error: a hang without fault details is still a hang.
    if not report: report = ["Device hang detected, but no memory or hardware fault was reported by KFD."]

    raise RuntimeError("\n".join(report))

  def require_profile_mode(self, can_set_mode=True):
    """Ensure the device's `power_dpm_force_performance_level` is 'profile_standard'.

    Returns immediately when `self.dev.target[0] == 9`. Otherwise, if the level differs and `can_set_mode` is true, the
    level is switched via `sudo tee`, an atexit hook is registered to restore the previous value, and the check is
    repeated once with `can_set_mode=False`. Raises RuntimeError if the level is still wrong and may not be changed.
    """
    if self.dev.target[0] == 9: return
    fn = f'{self.dev_sysfs_path}/power_dpm_force_performance_level'
    if (perflevel:=FileIOInterface(fn).read().strip()) != 'profile_standard':
      if can_set_mode:
        atexit.register(lambda: os.system(f"echo '{perflevel}' | sudo tee {fn} > /dev/null"))
        os.system(f"echo 'profile_standard' | sudo tee {fn} > /dev/null")
        self.require_profile_mode(can_set_mode=False)
      else:
        raise RuntimeError("PMC/SQTT requires stable power state: run `amd-smi set -l stable_std` for KFD iface")

  @functools.cached_property
  def drm_dev_info(self) -> amdgpu_drm.struct_drm_amdgpu_info_device:
    """Device info structure filled by the AMDGPU_INFO_DEV_INFO query on the DRM fd; queried once and cached."""
    amdgpu_drm.DRM_IOCTL_AMDGPU_INFO(self.drm_fd, query=amdgpu_drm.AMDGPU_INFO_DEV_INFO,
      return_pointer=ctypes.addressof(inf:=amdgpu_drm.struct_drm_amdgpu_info_device()), return_size=ctypes.sizeof(inf))
    return inf

  # True when both bits of the 2-bit field for `wgp` are set in the cu_bitmap entry selected by (se, sa). `xcc` is not read.
  def is_wgp_active(self, xcc, se, sa, wgp) -> bool: return ((self.drm_dev_info.cu_bitmap[se % 4][sa + (se // 4) * 2] >> (2 * wgp)) & 0x3) == 0x3
|
||||
|
||||
class PCIIface(PCIIfaceBase):
|
||||
def __init__(self, dev, dev_id):
|
||||
|
|
@ -336,17 +488,16 @@ class PCIIface(PCIIfaceBase):
|
|||
|
||||
rcvr_params: tuple
|
||||
if queue_type == kfd.KFD_IOC_QUEUE_TYPE_SDMA:
|
||||
doorbell_index = self.dev_impl.sdma.setup_ring(*(rcvr_params:=(ring.va_addr, ring.size, gart.va_addr+rptr, gart.va_addr+wptr, idx)))
|
||||
doorbell_index = self.dev_impl.sdma.setup_ring(*(rcvr_params:=(ring._buf.va_addr, ring._buf.size, gart._buf.va_addr+rptr,
|
||||
gart._buf.va_addr+wptr, idx)))
|
||||
else:
|
||||
doorbell_index = self.dev_impl.gfx.setup_ring(*(rcvr_params:=(ring.va_addr, ring.size, gart.va_addr+rptr, gart.va_addr+wptr,
|
||||
eop_buffer.va_addr, eop_buffer.size, is_aql:=(queue_type==kfd.KFD_IOC_QUEUE_TYPE_COMPUTE_AQL), is_aql)))
|
||||
doorbell_index = self.dev_impl.gfx.setup_ring(*(rcvr_params:=(ring._buf.va_addr, ring._buf.size, gart._buf.va_addr+rptr,
|
||||
gart._buf.va_addr+wptr, eop_buffer._buf.va_addr, eop_buffer._buf.size, is_aql:=(queue_type==kfd.KFD_IOC_QUEUE_TYPE_COMPUTE_AQL), is_aql)))
|
||||
|
||||
ext = lambda addr,n,dt: Buffer("CPU", n, dt, options=BufferSpec(external_ptr=addr), preallocate=True)
|
||||
(put_value := Buffer("CPU", 1, dtypes.uint64, preallocate=True))._buf.view.view(fmt='Q')[0] = 0
|
||||
return AMDQueueDesc(ring=ext(ring.va_addr, ring.size//4, dtypes.uint32),
|
||||
doorbell=ext(self.dev_impl.doorbell64.addr + doorbell_index*8, 1, dtypes.uint64),
|
||||
read_ptr=ext(gart.va_addr+rptr, 1, dtypes.uint64), write_ptr=ext(gart.va_addr+wptr, 1, dtypes.uint64),
|
||||
put_value=put_value, params=rcvr_params)
|
||||
doorbell = Buffer("CPU", 1, dtypes.uint64, options=BufferSpec(external_ptr=self.dev_impl.doorbell64.addr + doorbell_index*8), preallocate=True)
|
||||
return AMDQueueDesc(ring=ring, doorbell=doorbell, read_ptr=gart.view(1, dtypes.uint64, rptr).ensure_allocated(),
|
||||
write_ptr=gart.view(1, dtypes.uint64, wptr).ensure_allocated(), put_value=put_value, eop_buffer=eop_buffer, params=rcvr_params)
|
||||
|
||||
def _collect_interrupts(self, reset=False, drain_only=False):
|
||||
d = self.dev
|
||||
|
|
@ -385,7 +536,7 @@ pm_lower = PatternMatcher([
|
|||
class AMDDevice(HCQ2Compiled):
|
||||
timestamp_divider = 100.0 # AMD GPU clock: ticks/us
|
||||
|
||||
ifaces = [PCIIface]
|
||||
ifaces = [KFDIface, PCIIface]
|
||||
|
||||
def is_am(self) -> bool: return isinstance(self.iface, (PCIIface,))
|
||||
def is_usb(self) -> bool: return False
|
||||
|
|
@ -457,19 +608,19 @@ class AMDDevice(HCQ2Compiled):
|
|||
self.sqtt_next_cmd_id = itertools.count(0)
|
||||
|
||||
def create_queue(self, queue_type, ring_size, ctx_save_restore_size=0, eop_buffer_size=0, ctl_stack_size=0, debug_memory_size=0, idx=0):
|
||||
ring = self.iface.alloc(ring_size, uncached=True, cpu_access=True)
|
||||
gart = self.iface.alloc(0x100, uncached=True, cpu_access=True)
|
||||
ring = Buffer(self.device, ring_size // 4, dtypes.uint32, options=BufferSpec(uncached=True, cpu_access=True), preallocate=True)
|
||||
gart = Buffer(self.device, 0x100, dtypes.uint8, options=BufferSpec(uncached=True, cpu_access=True), preallocate=True)
|
||||
|
||||
if queue_type == kfd.KFD_IOC_QUEUE_TYPE_COMPUTE_AQL:
|
||||
self.aql_gart = gart
|
||||
self.aql_desc = hsa.amd_queue_t(queue_properties=hsa.AMD_QUEUE_PROPERTIES_IS_PTR64 | hsa.AMD_QUEUE_PROPERTIES_ENABLE_PROFILING,
|
||||
read_dispatch_id_field_base_byte_offset=getattr(hsa.amd_queue_t, 'read_dispatch_id').offset,
|
||||
max_cu_id=(self.cu_cnt * self.xccs) - 1, max_wave_id=self.waves_per_cu - 1)
|
||||
self.aql_gart.cpu_view().view(fmt='B')[:ctypes.sizeof(self.aql_desc)] = bytes(self.aql_desc)
|
||||
self.aql_gart._buf.cpu_view().view(fmt='B')[:ctypes.sizeof(self.aql_desc)] = bytes(self.aql_desc)
|
||||
|
||||
cwsr_buffer_size = round_up((ctx_save_restore_size + debug_memory_size) * self.xccs, mmap.PAGESIZE)
|
||||
cwsr_buffer = self.iface.alloc(cwsr_buffer_size) if ctx_save_restore_size else None
|
||||
eop_buffer = self.iface.alloc(eop_buffer_size) if eop_buffer_size else None
|
||||
cwsr_buffer = Buffer(self.device, cwsr_buffer_size, dtypes.uint8, preallocate=True) if ctx_save_restore_size else None
|
||||
eop_buffer = Buffer(self.device, eop_buffer_size, dtypes.uint8, preallocate=True) if eop_buffer_size else None
|
||||
|
||||
queue = (self.iface.create_queue(queue_type, ring, gart, rptr=getattr(hsa.amd_queue_t, 'read_dispatch_id').offset,
|
||||
wptr=getattr(hsa.amd_queue_t, 'write_dispatch_id').offset, eop_buffer=eop_buffer, cwsr_buffer=cwsr_buffer,
|
||||
|
|
@ -534,7 +685,7 @@ class AMDDevice(HCQ2Compiled):
|
|||
int.from_bytes(rsrc1_t(BASE_ADDRESS_HI=hi32(self.scratch.get_buf().va_addr), SWIZZLE_ENABLE=1), 'little'),
|
||||
lo32(size_per_xcc), int.from_bytes(bytes(rsrc3_t(**rsrc)), 'little')]
|
||||
self.aql_desc.compute_tmpring_size = tmpring
|
||||
self.aql_gart.cpu_view()[:ctypes.sizeof(self.aql_desc)] = bytes(self.aql_desc)
|
||||
self.aql_gart._buf.cpu_view()[:ctypes.sizeof(self.aql_desc)] = bytes(self.aql_desc)
|
||||
|
||||
return tmpring
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue