amd: support relocatable elf (#9380)

Co-authored-by: b1tg <b1tg@users.noreply.github.com>
This commit is contained in:
b1tg 2025-03-08 02:21:49 +08:00 committed by GitHub
commit bde0347618
No known key found for this signature in database
GPG key ID: B5690EEEBB952194

View file

@ -242,26 +242,26 @@ class AMDProgram(HCQProgram):
image, sections, _ = elf_loader(self.lib)
self.lib_gpu = self.dev.allocator.alloc(round_up(image.nbytes, 0x1000), BufferSpec(cpu_access=True, nolru=True))
ctypes.memmove(self.lib_gpu.va_addr, mv_address(image), image.nbytes)
entry_point = min(sh.header.sh_addr for sh in sections if sh.header.sh_type == libc.SHT_PROGBITS and sh.header.sh_flags & libc.SHF_ALLOC)
self.group_segment_size = image[entry_point:entry_point+4].cast("I")[0]
self.private_segment_size = image[entry_point+4:entry_point+8].cast("I")[0]
self.kernargs_segment_size = image[entry_point+8:entry_point+12].cast("I")[0]
rodata_entry = next((sh.header.sh_addr for sh in sections if sh.name == ".rodata"), -1)
text_entry = next((sh.header.sh_addr for sh in sections if sh.name == ".text"), -1)
assert rodata_entry >= 0 and text_entry >= 0, ".text or .rodata section not found"
self.group_segment_size = image[rodata_entry:rodata_entry+4].cast("I")[0]
self.private_segment_size = image[rodata_entry+4:rodata_entry+8].cast("I")[0]
self.kernargs_segment_size = image[rodata_entry+8:rodata_entry+12].cast("I")[0]
lds_size = ((self.group_segment_size + 511) // 512) & 0x1FF
if lds_size > (self.dev.dev_iface.props['lds_size_in_kb'] * 1024) // 512: raise RuntimeError("Too many resources requested: group_segment_size")
# Ensure scratch size
self.dev._ensure_has_local_memory(self.private_segment_size)
code = hsa.amd_kernel_code_t.from_address(self.lib_gpu.va_addr + entry_point) # NOTE: this is wrong, it's not this object
code = hsa.amd_kernel_code_t.from_address(self.lib_gpu.va_addr + rodata_entry) # NOTE: this is wrong, it's not this object
assert code.kernel_code_properties & 0x400 == 0x400 # ENABLE_WAVEFRONT_SIZE32
# Set rsrc1.priv=1 on gfx11 to workaround cwsr.
self.rsrc1: int = code.compute_pgm_rsrc1 | ((1 << 20) if 110000 <= self.dev.target < 120000 else 0)
self.rsrc2: int = code.compute_pgm_rsrc2 | (lds_size << 15)
self.prog_addr: int = self.lib_gpu.va_addr + entry_point + code.kernel_code_entry_byte_offset
self.prog_addr: int = self.lib_gpu.va_addr + rodata_entry + code.kernel_code_entry_byte_offset
if code.kernel_code_entry_byte_offset == 0: self.prog_addr = self.lib_gpu.va_addr + text_entry
# Some programs use hsa_kernel_dispatch_packet_t to read workgroup sizes during execution.
# The packet is represented as a pointer and set up in SGPRs. Space for the packet is allocated as part of the kernel arguments.
self.enable_dispatch_ptr: int = code.kernel_code_properties & hsa.AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_DISPATCH_PTR