mirror of
https://github.com/tinygrad/tinygrad.git
synced 2026-06-24 02:14:17 +00:00
amd: support relocatable elf (#9380)
Co-authored-by: b1tg <b1tg@users.noreply.github.com>
This commit is contained in:
parent
243078dda9
commit
bde0347618
1 changed files with 9 additions and 9 deletions
|
|
@ -242,26 +242,26 @@ class AMDProgram(HCQProgram):
|
|||
image, sections, _ = elf_loader(self.lib)
|
||||
self.lib_gpu = self.dev.allocator.alloc(round_up(image.nbytes, 0x1000), BufferSpec(cpu_access=True, nolru=True))
|
||||
ctypes.memmove(self.lib_gpu.va_addr, mv_address(image), image.nbytes)
|
||||
|
||||
entry_point = min(sh.header.sh_addr for sh in sections if sh.header.sh_type == libc.SHT_PROGBITS and sh.header.sh_flags & libc.SHF_ALLOC)
|
||||
self.group_segment_size = image[entry_point:entry_point+4].cast("I")[0]
|
||||
self.private_segment_size = image[entry_point+4:entry_point+8].cast("I")[0]
|
||||
self.kernargs_segment_size = image[entry_point+8:entry_point+12].cast("I")[0]
|
||||
|
||||
rodata_entry = next((sh.header.sh_addr for sh in sections if sh.name == ".rodata"), -1)
|
||||
text_entry = next((sh.header.sh_addr for sh in sections if sh.name == ".text"), -1)
|
||||
assert rodata_entry >= 0 and text_entry >= 0, ".text or .rodata section not found"
|
||||
self.group_segment_size = image[rodata_entry:rodata_entry+4].cast("I")[0]
|
||||
self.private_segment_size = image[rodata_entry+4:rodata_entry+8].cast("I")[0]
|
||||
self.kernargs_segment_size = image[rodata_entry+8:rodata_entry+12].cast("I")[0]
|
||||
lds_size = ((self.group_segment_size + 511) // 512) & 0x1FF
|
||||
if lds_size > (self.dev.dev_iface.props['lds_size_in_kb'] * 1024) // 512: raise RuntimeError("Too many resources requested: group_segment_size")
|
||||
|
||||
# Ensure scratch size
|
||||
self.dev._ensure_has_local_memory(self.private_segment_size)
|
||||
|
||||
code = hsa.amd_kernel_code_t.from_address(self.lib_gpu.va_addr + entry_point) # NOTE: this is wrong, it's not this object
|
||||
code = hsa.amd_kernel_code_t.from_address(self.lib_gpu.va_addr + rodata_entry) # NOTE: this is wrong, it's not this object
|
||||
assert code.kernel_code_properties & 0x400 == 0x400 # ENABLE_WAVEFRONT_SIZE32
|
||||
|
||||
# Set rsrc1.priv=1 on gfx11 to workaround cwsr.
|
||||
self.rsrc1: int = code.compute_pgm_rsrc1 | ((1 << 20) if 110000 <= self.dev.target < 120000 else 0)
|
||||
self.rsrc2: int = code.compute_pgm_rsrc2 | (lds_size << 15)
|
||||
self.prog_addr: int = self.lib_gpu.va_addr + entry_point + code.kernel_code_entry_byte_offset
|
||||
|
||||
self.prog_addr: int = self.lib_gpu.va_addr + rodata_entry + code.kernel_code_entry_byte_offset
|
||||
if code.kernel_code_entry_byte_offset == 0: self.prog_addr = self.lib_gpu.va_addr + text_entry
|
||||
# Some programs use hsa_kernel_dispatch_packet_t to read workgroup sizes during execution.
|
||||
# The packet is represented as a pointer and set up in SGPRs. Space for the packet is allocated as part of the kernel arguments.
|
||||
self.enable_dispatch_ptr: int = code.kernel_code_properties & hsa.AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_DISPATCH_PTR
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue