amd: support relocatable elf (#9380)

Co-authored-by: b1tg <b1tg@users.noreply.github.com>
2026-06-24 02:14:17 +00:00 · 2025-03-08 02:21:49 +08:00 · 2025-03-08 02:21:49 +08:00 · bde0347618
commit bde0347618
parent 243078dda9
1 changed files with 9 additions and 9 deletions
--- a/tinygrad/runtime/ops_amd.py
+++ b/tinygrad/runtime/ops_amd.py
@ -242,26 +242,26 @@ class AMDProgram(HCQProgram):
    image, sections, _ = elf_loader(self.lib)
    self.lib_gpu = self.dev.allocator.alloc(round_up(image.nbytes, 0x1000), BufferSpec(cpu_access=True, nolru=True))
    ctypes.memmove(self.lib_gpu.va_addr, mv_address(image), image.nbytes)
-
-    entry_point = min(sh.header.sh_addr for sh in sections if sh.header.sh_type == libc.SHT_PROGBITS and sh.header.sh_flags & libc.SHF_ALLOC)
-    self.group_segment_size = image[entry_point:entry_point+4].cast("I")[0]
-    self.private_segment_size = image[entry_point+4:entry_point+8].cast("I")[0]
-    self.kernargs_segment_size = image[entry_point+8:entry_point+12].cast("I")[0]
-
+    rodata_entry = next((sh.header.sh_addr for sh in sections if sh.name == ".rodata"), -1)
+    text_entry = next((sh.header.sh_addr for sh in sections if sh.name == ".text"), -1)
+    assert rodata_entry >= 0 and text_entry >= 0, ".text or .rodata section not found"
+    self.group_segment_size = image[rodata_entry:rodata_entry+4].cast("I")[0]
+    self.private_segment_size = image[rodata_entry+4:rodata_entry+8].cast("I")[0]
+    self.kernargs_segment_size = image[rodata_entry+8:rodata_entry+12].cast("I")[0]
    lds_size = ((self.group_segment_size + 511) // 512) & 0x1FF
    if lds_size > (self.dev.dev_iface.props['lds_size_in_kb'] * 1024) // 512: raise RuntimeError("Too many resources requested: group_segment_size")

    # Ensure scratch size
    self.dev._ensure_has_local_memory(self.private_segment_size)

-    code = hsa.amd_kernel_code_t.from_address(self.lib_gpu.va_addr + entry_point) # NOTE: this is wrong, it's not this object
+    code = hsa.amd_kernel_code_t.from_address(self.lib_gpu.va_addr + rodata_entry) # NOTE: this is wrong, it's not this object
    assert code.kernel_code_properties & 0x400 == 0x400 # ENABLE_WAVEFRONT_SIZE32

    # Set rsrc1.priv=1 on gfx11 to workaround cwsr.
    self.rsrc1: int = code.compute_pgm_rsrc1 | ((1 << 20) if 110000 <= self.dev.target < 120000 else 0)
    self.rsrc2: int = code.compute_pgm_rsrc2 | (lds_size << 15)
-    self.prog_addr: int = self.lib_gpu.va_addr + entry_point + code.kernel_code_entry_byte_offset
-
+    self.prog_addr: int = self.lib_gpu.va_addr + rodata_entry + code.kernel_code_entry_byte_offset
+    if code.kernel_code_entry_byte_offset == 0: self.prog_addr = self.lib_gpu.va_addr + text_entry
    # Some programs use hsa_kernel_dispatch_packet_t to read workgroup sizes during execution.
    # The packet is represented as a pointer and set up in SGPRs. Space for the packet is allocated as part of the kernel arguments.
    self.enable_dispatch_ptr: int = code.kernel_code_properties & hsa.AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_DISPATCH_PTR