no early

2026-06-24 02:14:17 +00:00 · 2026-02-13 06:48:01 +00:00 · 2026-02-13 06:48:01 +00:00 · 10dce913bb
commit 10dce913bb
parent d89cb880b2
4 changed files with 35 additions and 301 deletions
--- a/test/amd/bench_emu.py
+++ b/test/amd/bench_emu.py
@@ -1,267 +0,0 @@
-#!/usr/bin/env python3
-"""Benchmark comparing Python vs Rust RDNA3 emulators on real tinygrad kernels."""
-import ctypes, time, os
-from pathlib import Path
-
-from tinygrad.renderer.amd.emu import run_asm as python_run_asm, decode_program
-from tinygrad.renderer.amd import decode_inst
-from tinygrad.runtime.autogen.amd.rdna3.ins import SOPP, SOPPOp
-
-import tinygrad
-EXTRA_DIR = Path(tinygrad.__file__).parent.parent / "extra"
-REMU_PATH = EXTRA_DIR / "remu/target/release/libremu.so"
-if not REMU_PATH.exists():
-  REMU_PATH = EXTRA_DIR / "remu/target/release/libremu.dylib"
-
-def get_rust_remu():
-  """Load the Rust libremu shared library."""
-  if not REMU_PATH.exists(): return None
-  remu = ctypes.CDLL(str(REMU_PATH))
-  remu.run_asm.restype = ctypes.c_int32
-  remu.run_asm.argtypes = [ctypes.c_void_p, ctypes.c_uint32, ctypes.c_uint32, ctypes.c_uint32, ctypes.c_uint32,
-                           ctypes.c_uint32, ctypes.c_uint32, ctypes.c_uint32, ctypes.c_void_p]
-  return remu
-
-def count_instructions(kernel: bytes) -> int:
-  """Count instructions in a kernel."""
-  return len(decode_program(kernel))
-
-def setup_buffers(buf_sizes: list[int], init_data: dict[int, bytes] | None = None):
-  """Allocate buffers and return args pointer + valid ranges."""
-  if init_data is None: init_data = {}
-  buffers = []
-  for i, size in enumerate(buf_sizes):
-    padded = ((size + 15) // 16) * 16 + 16
-    data = init_data.get(i, b'\x00' * padded)
-    data_list = list(data) + [0] * (padded - len(data))
-    buf = (ctypes.c_uint8 * padded)(*data_list[:padded])
-    buffers.append(buf)
-  args = (ctypes.c_uint64 * len(buffers))(*[ctypes.addressof(b) for b in buffers])
-  args_ptr = ctypes.addressof(args)
-  ranges = {(ctypes.addressof(b), len(b)) for b in buffers}
-  ranges.add((args_ptr, ctypes.sizeof(args)))
-  return buffers, args, args_ptr, ranges
-
-def benchmark_emulator(name: str, run_fn, kernel: bytes, global_size, local_size, args_ptr, rsrc2: int, iterations: int = 5):
-  """Benchmark an emulator and return average time."""
-  gx, gy, gz = global_size
-  lx, ly, lz = local_size
-  kernel_buf = (ctypes.c_char * len(kernel)).from_buffer_copy(kernel)
-  lib_ptr = ctypes.addressof(kernel_buf)
-
-  # Warmup
-  run_fn(lib_ptr, len(kernel), gx, gy, gz, lx, ly, lz, args_ptr, rsrc2)
-
-  # Timed runs
-  times = []
-  for _ in range(iterations):
-    start = time.perf_counter()
-    result = run_fn(lib_ptr, len(kernel), gx, gy, gz, lx, ly, lz, args_ptr, rsrc2)
-    end = time.perf_counter()
-    if result != 0:
-      print(f"  {name} returned error: {result}")
-      return None
-    times.append(end - start)
-
-  return sum(times) / len(times)
-
-def profile_instructions(kernel: bytes):
-  """Profile individual instruction compile times."""
-  from tinygrad.renderer.amd.emu import _get_runner, _canonical_runner_cache
-  from tinygrad.helpers import Context
-  _get_runner.cache_clear()
-  _canonical_runner_cache.clear()
-
-  results = []
-  i = 0
-  while i < len(kernel):
-    inst = decode_inst(kernel[i:])
-    if isinstance(inst, SOPP) and inst.op == SOPPOp.S_CODE_END: break
-    inst_bytes = bytes(kernel[i:i + inst.size() + 4])
-    try: inst_str = repr(inst)
-    except Exception: inst_str = f"<{type(inst).__name__}>"
-
-    # Time the full compile (sink + render + compile)
-    start = time.perf_counter()
-    with Context(CCACHE=0):
-      runner, is_new = _get_runner(inst_bytes)
-    compile_time = time.perf_counter() - start
-
-    results.append({
-      'inst_str': inst_str + ('' if is_new else ' [CACHED]'),
-      'compile_ms': compile_time * 1000 if is_new else 0,
-    })
-    i += inst.size()
-
-  return sorted(results, key=lambda x: x['compile_ms'], reverse=True)
-
-def benchmark_python_split(kernel: bytes, global_size, local_size, args_ptr, rsrc2: int, iterations: int = 5):
-  """Benchmark Python emulator with compile and execution times."""
-  from tinygrad.renderer.amd.emu import _get_runner, _canonical_runner_cache
-  from tinygrad.helpers import Context
-  _get_runner.cache_clear()
-  _canonical_runner_cache.clear()
-  decode_program.cache_clear()
-
-  # Measure compile time (decode_program builds sinks, renders, and compiles)
-  compile_start = time.perf_counter()
-  with Context(CCACHE=0):
-    program = decode_program(kernel)
-  compile_time = time.perf_counter() - compile_start
-  n_compiled = len(_canonical_runner_cache)
-
-  # Execution time
-  exec_time = benchmark_emulator("Python", python_run_asm, kernel, global_size, local_size, args_ptr, rsrc2, iterations)
-  return compile_time, exec_time, len(program), n_compiled
-
-def get_tinygrad_kernel(op_name: str) -> tuple[bytes, tuple, tuple, list[int], dict[int, bytes], int] | None:
-  """Get a real tinygrad kernel by operation name. Returns (code, global_size, local_size, buf_sizes, buf_data, rsrc2)."""
-  try:
-    from tinygrad import Tensor
-    from tinygrad.runtime.support.elf import elf_loader
-    from tinygrad.runtime.autogen import hsa
-    import numpy as np
-    np.random.seed(42)
-
-    ops = {
-      "add": lambda: Tensor.empty(1024) + Tensor.empty(1024),
-      "mul": lambda: Tensor.empty(1024) * Tensor.empty(1024),
-      "matmul_small": lambda: Tensor.empty(16, 16) @ Tensor.empty(16, 16),
-      "matmul_medium": lambda: Tensor.empty(64, 64) @ Tensor.empty(64, 64),
-      "reduce_sum": lambda: Tensor.empty(4096).sum(),
-      "reduce_max": lambda: Tensor.empty(4096).max(),
-      "softmax": lambda: Tensor.empty(256).softmax(),
-      "layernorm": lambda: Tensor.empty(32, 64).layernorm(),
-      "conv2d": lambda: Tensor.empty(1, 4, 16, 16).conv2d(Tensor.empty(4, 4, 3, 3)),
-      "gelu": lambda: Tensor.empty(1024).gelu(),
-      "exp": lambda: Tensor.empty(1024).exp(),
-      "sin": lambda: Tensor.empty(1024).sin(),
-    }
-
-    if op_name not in ops: return None
-    out = ops[op_name]()
-    sched = out.schedule()
-
-    for ei in sched:
-      lowered = ei.lower()
-      if ei.ast.op.name == 'SINK' and lowered.prg and lowered.prg.p.lib:
-        lib = bytes(lowered.prg.p.lib)
-        image = memoryview(bytearray(lib))
-        _, sections, _ = elf_loader(lib)
-        rodata_entry = next((sh.header.sh_addr for sh in sections if sh.name == ".rodata"), -1)
-        for sec in sections:
-          if sec.name == '.text':
-            buf_sizes = [b.nbytes for b in lowered.bufs]
-            # Get initial data from numpy arrays if available
-            buf_data = {}
-            for i, buf in enumerate(lowered.bufs):
-              if hasattr(buf, 'base') and buf.base is not None and hasattr(buf.base, '_buf'):
-                try: buf_data[i] = bytes(buf.base._buf)
-                except Exception: pass
-            # Extract rsrc2 from ELF (same as ops_amd.py)
-            group_segment_size = image[rodata_entry:rodata_entry+4].cast("I")[0]
-            lds_size = ((group_segment_size + 511) // 512) & 0x1FF
-            code = hsa.amd_kernel_code_t.from_buffer_copy(bytes(image[rodata_entry:rodata_entry+256]) + b'\x00'*256)
-            rsrc2 = code.compute_pgm_rsrc2 | (lds_size << 15)
-            return (bytes(sec.content), tuple(lowered.prg.p.global_size), tuple(lowered.prg.p.local_size), buf_sizes, buf_data, rsrc2)
-    return None
-  except Exception as e:
-    print(f"  Error getting kernel: {e}")
-    return None
-
-TINYGRAD_TESTS = ["add", "mul", "reduce_sum", "softmax", "exp", "sin", "gelu", "matmul_small"]
-
-def main():
-  import argparse
-  parser = argparse.ArgumentParser(description="Benchmark RDNA3 emulators")
-  parser.add_argument("--iterations", type=int, default=3, help="Number of iterations per benchmark")
-  parser.add_argument("--profile", type=str, default=None, help="Profile instructions for a specific kernel (e.g. 'sin')")
-  parser.add_argument("--top", type=int, default=20, help="Number of top instructions to show in profile")
-  args = parser.parse_args()
-
-  # Profile mode: show individual instruction timing
-  if args.profile:
-    kernel_info = get_tinygrad_kernel(args.profile)
-    if kernel_info is None:
-      print(f"Failed to get kernel for '{args.profile}'")
-      return
-    kernel = kernel_info[0]
-    print(f"Profiling instructions for '{args.profile}' kernel...")
-    print("=" * 110)
-    results = profile_instructions(kernel)
-    print(f"{'Instruction':<90} {'Compile(ms)':>12}")
-    print("-" * 110)
-    for r in results[:args.top]:
-      inst = r['inst_str'][:87] + "..." if len(r['inst_str']) > 90 else r['inst_str']
-      print(f"{inst:<90} {r['compile_ms']:>12.3f}")
-    print("-" * 110)
-    total = sum(r['compile_ms'] for r in results)
-    print(f"{'TOTAL':<90} {total:>12.3f}")
-    return
-
-  rust_remu = get_rust_remu()
-  if rust_remu is None:
-    print("Rust libremu not found. Build with: cargo build --release --manifest-path extra/remu/Cargo.toml")
-    print("Running Python-only benchmarks...\n")
-
-  print("=" * 90)
-  print("RDNA3 Emulator Benchmark: Python vs Rust")
-  print("=" * 90)
-
-  results = []
-
-  print("\n[TINYGRAD KERNELS]")
-  print("-" * 90)
-
-  for op_name in TINYGRAD_TESTS:
-    print(f"\n{op_name}:", end=" ", flush=True)
-    kernel_info = get_tinygrad_kernel(op_name)
-    if kernel_info is None:
-      print("failed to compile")
-      continue
-
-    kernel, global_size, local_size, buf_sizes, buf_data, rsrc2 = kernel_info
-    buffers, args_arr, args_ptr, ranges = setup_buffers(buf_sizes, buf_data)
-
-    # Benchmark Python emulator (must be first to measure compile time before cache is populated)
-    py_compile, py_exec, n_insts, n_compiled = benchmark_python_split(kernel, global_size, local_size, args_ptr, rsrc2, args.iterations)
-
-    n_workgroups = global_size[0] * global_size[1] * global_size[2]
-    n_threads = local_size[0] * local_size[1] * local_size[2]
-    total_work = n_insts * n_workgroups * n_threads
-
-    print(f"{n_insts} insts ({n_compiled} unique) × {n_workgroups} WGs × {n_threads} threads = {total_work:,} ops")
-    rust_time = benchmark_emulator("Rust", rust_remu.run_asm, kernel, global_size, local_size,
-                                   args_ptr, rsrc2, args.iterations) if rust_remu else None
-
-    if py_compile is not None:
-      py_exec_rate = total_work / py_exec / 1e6
-      print(f"  Compile:        {py_compile*1000:8.3f} ms  ({n_compiled} unique)")
-      print(f"  Exec:           {py_exec*1000:8.3f} ms  ({py_exec_rate:7.2f} M ops/s)")
-    if rust_time:
-      rust_rate = total_work / rust_time / 1e6
-      speedup = py_exec / rust_time if py_exec else 0
-      print(f"  Rust:           {rust_time*1000:8.3f} ms  ({rust_rate:7.2f} M ops/s)  [{speedup:.1f}x faster]")
-
-    results.append((op_name, n_insts, n_compiled, n_workgroups, py_compile, py_exec, rust_time))
-
-  # Summary table
-  print("\n" + "=" * 110)
-  print("SUMMARY")
-  print("=" * 110)
-  print(f"{'Name':<16} {'Insts':<6} {'Unique':<6} {'WGs':<5} {'Compile (ms)':<14} {'Exec (ms)':<12} {'Rust (ms)':<12} {'Speedup':<10}")
-  print("-" * 110)
-
-  for name, n_insts, n_compiled, n_wgs, py_compile, py_exec, rust_time in results:
-    compile_ms = f"{py_compile*1000:.3f}" if py_compile else "error"
-    exec_ms = f"{py_exec*1000:.3f}" if py_exec else "error"
-    if rust_time:
-      rust_ms = f"{rust_time*1000:.3f}"
-      speedup = f"{py_exec/rust_time:.1f}x" if py_exec else "N/A"
-    else:
-      rust_ms, speedup = "N/A", "N/A"
-    print(f"{name:<16} {n_insts:<6} {n_compiled:<6} {n_wgs:<5} {compile_ms:<14} {exec_ms:<12} {rust_ms:<12} {speedup:<10}")
-
-if __name__ == "__main__":
-  os.environ["AMD"] = "1"
-  main()
--- a/test/amd/test_compare_emulators.py
+++ b/test/amd/test_compare_emulators.py
@@ -3,7 +3,7 @@ import unittest, ctypes
 from dataclasses import dataclass
 from tinygrad import Device

-from tinygrad.renderer.amd.emu import WaveState, decode_program, WAVE_SIZE, VCC_LO, EXEC_LO, SCC
+from tinygrad.renderer.amd.emu import WaveState, _decode_at, WAVE_SIZE, VCC_LO, EXEC_LO, SCC
 from tinygrad.renderer.amd import decode_inst
 from test.amd.helpers import KernelInfo
 from test.amd.bench_emu import REMU_PATH
@@ -89,7 +89,7 @@ class RustEmulator:
 class PythonEmulator:
  def __init__(self):
    self.state: WaveState | None = None
-    self.program: dict | None = None
+    self.program: dict[int, tuple] = {}  # lazily populated: pc -> (name, fxn, globals)
    self.vmem_buf = None
    self.lds_buf = None
    self.kernel_buf = None  # Keep kernel bytes alive
@@ -99,27 +99,29 @@ class PythonEmulator:
    import ctypes
    from tinygrad.device import Buffer, BufferSpec
    from tinygrad.dtype import dtypes
-    # Store kernel in a ctypes buffer so generic instructions can read from vmem at actual PC address
+    # Store kernel in a ctypes buffer so _decode_at can read from memory at actual PC address
    self.kernel_buf = (ctypes.c_char * len(kernel)).from_buffer_copy(kernel)
    self.lib_addr = ctypes.addressof(self.kernel_buf)
-    # Remap program dict to use actual addresses (like run_asm does)
-    program_raw = decode_program(kernel)
-    self.program = {self.lib_addr + offset: val for offset, val in program_raw.items()}
+    self.program = {}
    self.state = WaveState(n_lanes)
    self.state.pc = self.lib_addr  # Set PC to code base address
    self.vmem_buf = Buffer('CPU', 1 << 40, dtypes.uint32, options=BufferSpec(external_ptr=0)).ensure_allocated()
    self.lds_buf = Buffer('CPU', 65536 // 4, dtypes.uint32).ensure_allocated()

+  def _ensure_decoded(self, pc: int):
+    if pc not in self.program:
+      runner = _decode_at(pc, "rdna3")
+      self.program[pc] = (runner.p.function_name, runner._prg.fxn, runner.p.globals)
+
  def step(self) -> int:
    import ctypes
-    assert self.program is not None and self.state is not None
+    assert self.state is not None
    pc = self.state.pc
-    if pc == 0xFFFFFFFFFFFFFFFF or pc not in self.program: return -1
-    name, fxn, globals_list, _runner = self.program[pc]
-    if fxn is None: return 1  # unsupported instruction
+    if pc == 0xFFFFFFFFFFFFFFFF: return -1
+    self._ensure_decoded(pc)
+    name, fxn, globals_list = self.program[pc]
    buf_addrs = {0: self.state.sgpr_buf._buf.va_addr, 1: self.state.vgpr_buf._buf.va_addr,  # type: ignore[union-attr]
                 2: self.vmem_buf._buf.va_addr, 3: self.lds_buf._buf.va_addr}  # type: ignore[union-attr]
-    # Direct ctypes call - bypasses HCQ overhead
    fxn(*[ctypes.c_uint64(buf_addrs[g]) for g in globals_list], ctypes.c_int32(0))
    return -1 if self.state.pc == 0xFFFFFFFFFFFFFFFF else 0

@@ -140,7 +142,7 @@ class PythonEmulator:
                         exec_mask=sgpr[EXEC_LO.offset], sgpr=sgpr, vgpr=vgpr)

 def run_single_kernel(kernel: bytes, n_lanes: int, args_ptr: int, global_size: tuple[int, int, int],
-                      local_size: tuple[int, int, int], program, max_steps: int, debug: bool, trace_len: int,
+                      local_size: tuple[int, int, int], max_steps: int, debug: bool, trace_len: int,
                      kernel_idx: int = 0, max_workgroups: int = 8) -> tuple[bool, str, int]:
  """Run a single kernel through both emulators. Returns (success, message, total_steps)."""
  gx, gy, gz = global_size
@@ -181,9 +183,9 @@ def run_single_kernel(kernel: bytes, n_lanes: int, args_ptr: int, global_size: t
            rust_before = rust.get_snapshot()
            python_before = python.get_snapshot()

-            assert python.program is not None
-            inst_info = python.program.get(python.lib_addr + python_before.pc * 4)  # Convert word offset to actual address
-            inst_hex_name = inst_info[0] if inst_info else f"unknown at PC={python_before.pc}"
+            pc_addr = python.lib_addr + python_before.pc * 4  # Convert word offset to actual address
+            python._ensure_decoded(pc_addr)
+            inst_hex_name = python.program[pc_addr][0]
            # Decode the instruction to get mnemonic for sync_after checks
            try:
              # Format is mnemonic_hexbytes, e.g. v_exp_f32_e32_014b027e -> hex is 014b027e
@@ -310,12 +312,11 @@ def compare_emulators_multi_kernel(kernels: list[KernelInfo], buf_pool: dict[int
    kernel_ranges = ranges | {(args_ptr, ctypes.sizeof(args))}
    set_valid_mem_ranges(kernel_ranges)

-    program = decode_program(kernel.code)
    n_lanes = kernel.local_size[0] * kernel.local_size[1] * kernel.local_size[2]

    ok, msg, steps = run_single_kernel(
      kernel.code, min(n_lanes, 32), args_ptr, kernel.global_size,
-      kernel.local_size, program, max_steps, debug, trace_len, ki
+      kernel.local_size, max_steps, debug, trace_len, ki
    )
    total_steps += steps
    if not ok:
@@ -341,9 +342,8 @@ def compare_emulators_with_memory(kernel: bytes, n_lanes: int, buf_sizes: list,
  ranges.add((args_ptr, ctypes.sizeof(args)))
  set_valid_mem_ranges(ranges)

-  program = decode_program(kernel)
  # Legacy wrapper assumes local_size = (n_lanes, 1, 1)
-  ok, msg, _ = run_single_kernel(kernel, n_lanes, args_ptr, global_size, (n_lanes, 1, 1), program, max_steps, debug, trace_len)
+  ok, msg, _ = run_single_kernel(kernel, n_lanes, args_ptr, global_size, (n_lanes, 1, 1), max_steps, debug, trace_len)
  return ok, msg

 def get_kernels_from_tinygrad(op_fn) -> tuple[list[KernelInfo], dict[int, int], dict[int, bytes]]:
--- a/test/amd/test_rdna4_emu.py
+++ b/test/amd/test_rdna4_emu.py
@@ -1,7 +1,7 @@
 import unittest, ctypes
 from tinygrad.runtime.autogen.amd.rdna4 import ins as ir4
 from tinygrad.renderer.amd.dsl import v, s
-from tinygrad.renderer.amd.emu import WaveState, decode_program
+from tinygrad.renderer.amd.emu import WaveState, _decode_at
 from tinygrad.device import Buffer, BufferSpec
 from tinygrad.dtype import dtypes

@@ -12,12 +12,10 @@ class TestRDNA4Emu(unittest.TestCase):
    if not any(isinstance(i, ir4.SOPP) and i.op == ir4.SOPPOp.S_ENDPGM for i in insts):
      insts = list(insts) + [ir4.SOPP(ir4.SOPPOp.S_ENDPGM, simm=0)]

-    # Assemble and decode
+    # Assemble into ctypes buffer (must stay alive for _decode_at to read from memory)
    code = b''.join(i.to_bytes() for i in insts)
    code_buf = (ctypes.c_uint8 * len(code)).from_buffer_copy(code)
    code_addr = ctypes.addressof(code_buf)
-    program_raw = decode_program(code, "rdna4")
-    program = {code_addr + offset: val for offset, val in program_raw.items()}

    # Setup wave state
    st = WaveState(n_lanes=1)
@@ -28,12 +26,16 @@ class TestRDNA4Emu(unittest.TestCase):
    # Setup vmem buffer with external_ptr=0 (maps to address 0, allows any pointer access)
    vmem_buf = Buffer('CPU', 1 << 40, dtypes.uint32, options=BufferSpec(external_ptr=0)).ensure_allocated()

-    # Execute
+    # Execute with lazy decoding (same pattern as run_asm)
+    program: dict[int, tuple] = {}
    c_bufs = [ctypes.c_uint64(st.sgpr_buf._buf.va_addr), ctypes.c_uint64(st.vgpr_buf._buf.va_addr),
              ctypes.c_uint64(vmem_buf._buf.va_addr), ctypes.c_uint64(0), ctypes.c_uint64(0)]
    for _ in range(100):
-      if (pc := st.pc) == 0xFFFFFFFFFFFFFFFF or pc not in program: break
-      _, fxn, globals_list, _ = program[pc]
+      if (pc := st.pc) == 0xFFFFFFFFFFFFFFFF: break
+      if pc not in program:
+        runner = _decode_at(pc, "rdna4")
+        program[pc] = (runner._prg.fxn, runner.p.globals)
+      fxn, globals_list = program[pc]
      fxn(*[c_bufs[g] for g in globals_list])
    return st

--- a/tinygrad/renderer/amd/emu.py
+++ b/tinygrad/renderer/amd/emu.py
@@ -7,7 +7,7 @@
 #   arg=4: scratch - per-lane scratch memory
 from __future__ import annotations
 import ctypes, functools, re, platform, subprocess, tempfile
-from typing import Any, Callable
+from typing import Callable

 # Set/restore DAZ+FTZ (denormals-are-zero + flush-to-zero) to match RDNA3 default float mode
 # x86: MXCSR bits DAZ(6)+FTZ(15), ARM64: FPCR bit FZ(24)
@@ -1183,17 +1183,15 @@ def _get_runner(inst_bytes: bytes, arch: str = "rdna3"):
  _canonical_runner_cache.append((base, mask, size, runner))
  return runner

-def _decode_at(pc: int, arch: str) -> tuple[Callable, list[int]]:
-  """Decode and compile instruction at absolute address pc. Returns (fxn, globals)."""
+def _decode_at(pc: int, arch: str):
+  """Decode and compile instruction at absolute address pc. Returns CompiledRunner."""
  inst_bytes = bytes((ctypes.c_char * 16).from_address(pc).raw)
  inst = decode_inst(inst_bytes, arch)
-  try:
-    runner = _get_runner(bytes(inst_bytes[:inst.size() + 4]), arch)
+  try: return _get_runner(bytes(inst_bytes[:inst.size() + 4]), arch)
  except Exception as e:
    try: inst_str = repr(inst)
    except Exception: inst_str = f"<{type(inst).__name__}>"
    raise RuntimeError(f"[emu] Failed to compile {inst_str}: {type(e).__name__}: {e}") from e
-  return runner._prg.fxn, runner.p.globals

 # ═══════════════════════════════════════════════════════════════════════════════
 # WAVE STATE
@@ -1242,7 +1240,7 @@ class WaveState:
 def run_asm(lib: int, lib_sz: int, gx: int, gy: int, gz: int, lx: int, ly: int, lz: int, args_ptr: int, rsrc2: int = 0x19c,
            scratch_size: int = 0, arch: str = "rdna3", user_data: list[int]|None = None) -> int:
  """Execute AMD assembly program. scratch_size is private_segment_fixed_size from kernel descriptor (per-lane)."""
-  program: dict[int, tuple[Callable, list[int]]] = {}  # lazily populated: pc -> (fxn, globals)
+  program: dict[int, tuple[Callable, list[int]]] = {}  # lazily populated: pc -> (fxn, globals) extracted from runner
  lds_size = ((rsrc2 & hsa.AMD_COMPUTE_PGM_RSRC_TWO_GRANULATED_LDS_SIZE) >> hsa.AMD_COMPUTE_PGM_RSRC_TWO_GRANULATED_LDS_SIZE_SHIFT) * 512
  total_threads = lx * ly * lz

@@ -1295,7 +1293,8 @@ def run_asm(lib: int, lib_sz: int, gx: int, gy: int, gz: int, lx: int, ly: int,
              if (pc := st.pc) == 0xFFFFFFFFFFFFFFFF: break
              if pc not in program:
                prev_len = len(_canonical_runner_cache)
-                program[pc] = _decode_at(pc, arch)
+                runner = _decode_at(pc, arch)
+                program[pc] = (runner._prg.fxn, runner.p.globals)
                if DEBUG >= 3:
                  inst = decode_inst(bytes((ctypes.c_char * 16).from_address(pc).raw), arch)
                  msg = f"[emu] PC={pc - lib}: {inst!r}"