This commit is contained in:
George Hotz 2026-02-13 06:48:01 +00:00
commit 10dce913bb
4 changed files with 35 additions and 301 deletions

View file

@ -1,267 +0,0 @@
#!/usr/bin/env python3
"""Benchmark comparing Python vs Rust RDNA3 emulators on real tinygrad kernels."""
import ctypes, time, os
from pathlib import Path
from tinygrad.renderer.amd.emu import run_asm as python_run_asm, decode_program
from tinygrad.renderer.amd import decode_inst
from tinygrad.runtime.autogen.amd.rdna3.ins import SOPP, SOPPOp
import tinygrad
# "extra" directory located two levels above tinygrad's module file.
EXTRA_DIR = Path(tinygrad.__file__).parent.parent / "extra"
# Use the .so build of libremu when it exists on disk, otherwise point at the .dylib build.
_remu_so = EXTRA_DIR / "remu/target/release/libremu.so"
REMU_PATH = _remu_so if _remu_so.exists() else EXTRA_DIR / "remu/target/release/libremu.dylib"
def get_rust_remu():
  """Return a ctypes handle to libremu with `run_asm` typed, or None if REMU_PATH does not exist."""
  if REMU_PATH.exists():
    lib = ctypes.CDLL(str(REMU_PATH))
    run_asm = lib.run_asm
    run_asm.restype = ctypes.c_int32
    # Declared signature: one pointer, seven 32-bit unsigned ints, one pointer.
    run_asm.argtypes = [ctypes.c_void_p] + [ctypes.c_uint32] * 7 + [ctypes.c_void_p]
    return lib
  return None
def count_instructions(kernel: bytes) -> int:
  """Return the number of entries `decode_program` produces for `kernel`."""
  decoded = decode_program(kernel)
  return len(decoded)
def setup_buffers(buf_sizes: list[int], init_data: dict[int, bytes] | None = None):
"""Allocate buffers and return args pointer + valid ranges."""
if init_data is None: init_data = {}
buffers = []
for i, size in enumerate(buf_sizes):
padded = ((size + 15) // 16) * 16 + 16
data = init_data.get(i, b'\x00' * padded)
data_list = list(data) + [0] * (padded - len(data))
buf = (ctypes.c_uint8 * padded)(*data_list[:padded])
buffers.append(buf)
args = (ctypes.c_uint64 * len(buffers))(*[ctypes.addressof(b) for b in buffers])
args_ptr = ctypes.addressof(args)
ranges = {(ctypes.addressof(b), len(b)) for b in buffers}
ranges.add((args_ptr, ctypes.sizeof(args)))
return buffers, args, args_ptr, ranges
def benchmark_emulator(name: str, run_fn, kernel: bytes, global_size, local_size, args_ptr, rsrc2: int, iterations: int = 5):
  """Run `run_fn` on `kernel` once as warmup, then `iterations` timed runs, and return the mean time.

  Args:
    name: label used in the error message.
    run_fn: callable invoked as run_fn(lib_ptr, lib_size, gx, gy, gz, lx, ly, lz, args_ptr, rsrc2);
      a non-zero return value is treated as an error.
    kernel: raw kernel bytes; copied into a ctypes buffer whose address is passed as lib_ptr.
    global_size, local_size: 3-element sequences unpacked into gx, gy, gz and lx, ly, lz.
    args_ptr: passed through to `run_fn`.
    rsrc2: passed through to `run_fn`.
    iterations: number of timed runs; must be at least 1.

  Returns:
    Mean wall-clock seconds per timed run, or None if a timed run returned non-zero.

  Raises:
    ValueError: if `iterations` is less than 1 (previously surfaced as ZeroDivisionError after the warmup).
  """
  if iterations < 1: raise ValueError(f"iterations must be >= 1, got {iterations}")
  gx, gy, gz = global_size
  lx, ly, lz = local_size
  # Keep the buffer in a local so the address stays valid for every call below.
  kernel_buf = (ctypes.c_char * len(kernel)).from_buffer_copy(kernel)
  lib_ptr = ctypes.addressof(kernel_buf)
  # Warmup (its return value is not checked)
  run_fn(lib_ptr, len(kernel), gx, gy, gz, lx, ly, lz, args_ptr, rsrc2)
  # Timed runs
  times = []
  for _ in range(iterations):
    start = time.perf_counter()
    result = run_fn(lib_ptr, len(kernel), gx, gy, gz, lx, ly, lz, args_ptr, rsrc2)
    end = time.perf_counter()
    if result != 0:
      print(f" {name} returned error: {result}")
      return None
    times.append(end - start)
  return sum(times) / len(times)
def profile_instructions(kernel: bytes):
  """Time the compile of each instruction in `kernel` and return the timings, slowest first.

  Both runner caches are cleared before the walk. Decoding proceeds from offset 0 and stops at the
  end of `kernel` or at the first S_CODE_END. Each result is a dict with 'inst_str' (the instruction
  repr, suffixed with ' [CACHED]' when the runner was not new) and 'compile_ms' (0 for cached entries).
  """
  from tinygrad.renderer.amd.emu import _get_runner, _canonical_runner_cache
  from tinygrad.helpers import Context
  _get_runner.cache_clear()
  _canonical_runner_cache.clear()
  timings = []
  offset = 0
  while offset < len(kernel):
    inst = decode_inst(kernel[offset:])
    if isinstance(inst, SOPP) and inst.op == SOPPOp.S_CODE_END: break
    # The slice handed to the runner is the instruction plus the 4 bytes that follow it.
    inst_bytes = bytes(kernel[offset:offset + inst.size() + 4])
    try: label = repr(inst)
    except Exception: label = f"<{type(inst).__name__}>"
    # Time the full compile (sink + render + compile)
    t0 = time.perf_counter()
    with Context(CCACHE=0):
      _, is_new = _get_runner(inst_bytes)
    elapsed = time.perf_counter() - t0
    if is_new: entry = {'inst_str': label, 'compile_ms': elapsed * 1000}
    else: entry = {'inst_str': label + ' [CACHED]', 'compile_ms': 0}
    timings.append(entry)
    offset += inst.size()
  timings.sort(key=lambda x: x['compile_ms'], reverse=True)
  return timings
def benchmark_python_split(kernel: bytes, global_size, local_size, args_ptr, rsrc2: int, iterations: int = 5):
  """Measure the Python emulator's compile time and execution time separately.

  Returns (compile_seconds, exec_seconds_or_None, number_of_decoded_entries, number_of_cached_runners).
  """
  from tinygrad.renderer.amd.emu import _get_runner, _canonical_runner_cache
  from tinygrad.helpers import Context
  # Reset every cache first so the decode below is measured from a cold start.
  for reset in (_get_runner.cache_clear, _canonical_runner_cache.clear, decode_program.cache_clear): reset()
  # Measure compile time (decode_program builds sinks, renders, and compiles)
  t0 = time.perf_counter()
  with Context(CCACHE=0):
    program = decode_program(kernel)
  compile_time = time.perf_counter() - t0
  # Sampled right after decoding, before the execution benchmark runs.
  n_compiled = len(_canonical_runner_cache)
  # Execution time
  exec_time = benchmark_emulator("Python", python_run_asm, kernel, global_size, local_size, args_ptr, rsrc2, iterations)
  n_insts = len(program)
  return compile_time, exec_time, n_insts, n_compiled
def get_tinygrad_kernel(op_name: str) -> tuple[bytes, tuple, tuple, list[int], dict[int, bytes], int] | None:
"""Get a real tinygrad kernel by operation name.

Returns (code, global_size, local_size, buf_sizes, buf_data, rsrc2), where `code` is the content of
the '.text' section of the compiled lib. Returns None when `op_name` is not a known operation, when
the schedule scan yields no result, or when any exception is raised (its message is printed).
"""
try:
from tinygrad import Tensor
from tinygrad.runtime.support.elf import elf_loader
from tinygrad.runtime.autogen import hsa
import numpy as np
np.random.seed(42)
# Each entry builds its tensor expression only when called; just the requested one is evaluated below.
ops = {
"add": lambda: Tensor.empty(1024) + Tensor.empty(1024),
"mul": lambda: Tensor.empty(1024) * Tensor.empty(1024),
"matmul_small": lambda: Tensor.empty(16, 16) @ Tensor.empty(16, 16),
"matmul_medium": lambda: Tensor.empty(64, 64) @ Tensor.empty(64, 64),
"reduce_sum": lambda: Tensor.empty(4096).sum(),
"reduce_max": lambda: Tensor.empty(4096).max(),
"softmax": lambda: Tensor.empty(256).softmax(),
"layernorm": lambda: Tensor.empty(32, 64).layernorm(),
"conv2d": lambda: Tensor.empty(1, 4, 16, 16).conv2d(Tensor.empty(4, 4, 3, 3)),
"gelu": lambda: Tensor.empty(1024).gelu(),
"exp": lambda: Tensor.empty(1024).exp(),
"sin": lambda: Tensor.empty(1024).sin(),
}
if op_name not in ops: return None
out = ops[op_name]()
sched = out.schedule()
for ei in sched:
lowered = ei.lower()
# Only items whose AST op is named 'SINK' and whose lowered program has a non-empty lib are inspected.
if ei.ast.op.name == 'SINK' and lowered.prg and lowered.prg.p.lib:
lib = bytes(lowered.prg.p.lib)
image = memoryview(bytearray(lib))
_, sections, _ = elf_loader(lib)
# sh_addr of the section named ".rodata", or -1 when no section has that name.
rodata_entry = next((sh.header.sh_addr for sh in sections if sh.name == ".rodata"), -1)
for sec in sections:
if sec.name == '.text':
buf_sizes = [b.nbytes for b in lowered.bufs]
# Get initial data from numpy arrays if available
buf_data = {}
for i, buf in enumerate(lowered.bufs):
if hasattr(buf, 'base') and buf.base is not None and hasattr(buf.base, '_buf'):
# Best effort: a buffer whose contents cannot be converted to bytes gets no initial data.
try: buf_data[i] = bytes(buf.base._buf)
except Exception: pass
# Extract rsrc2 from ELF (same as ops_amd.py)
# The first 4 bytes at rodata_entry are read as an unsigned int.
group_segment_size = image[rodata_entry:rodata_entry+4].cast("I")[0]
# Converted to 512-byte units (rounded up) and masked to 9 bits.
lds_size = ((group_segment_size + 511) // 512) & 0x1FF
# 256 bytes from rodata_entry followed by 256 zero bytes are parsed as amd_kernel_code_t
# (presumably the padding guarantees enough bytes for the struct -- TODO confirm).
code = hsa.amd_kernel_code_t.from_buffer_copy(bytes(image[rodata_entry:rodata_entry+256]) + b'\x00'*256)
# lds_size is OR-ed into compute_pgm_rsrc2 starting at bit 15.
rsrc2 = code.compute_pgm_rsrc2 | (lds_size << 15)
return (bytes(sec.content), tuple(lowered.prg.p.global_size), tuple(lowered.prg.p.local_size), buf_sizes, buf_data, rsrc2)
return None
# Any failure above is reported on stdout and converted into a None result.
except Exception as e:
print(f" Error getting kernel: {e}")
return None
# Operation names benchmarked by main(); each one is passed to get_tinygrad_kernel.
TINYGRAD_TESTS = [
  "add", "mul", "reduce_sum", "softmax",
  "exp", "sin", "gelu", "matmul_small",
]
def main():
  """Command-line entry point.

  With --profile NAME, prints per-instruction compile timings for that kernel and returns.
  Otherwise benchmarks every kernel in TINYGRAD_TESTS on the Python emulator (and on the Rust
  emulator when libremu could be loaded), printing per-kernel results followed by a summary table.
  """
  import argparse
  parser = argparse.ArgumentParser(description="Benchmark RDNA3 emulators")
  parser.add_argument("--iterations", type=int, default=3, help="Number of iterations per benchmark")
  parser.add_argument("--profile", type=str, default=None, help="Profile instructions for a specific kernel (e.g. 'sin')")
  parser.add_argument("--top", type=int, default=20, help="Number of top instructions to show in profile")
  args = parser.parse_args()

  # Profile mode: show individual instruction timing
  if args.profile:
    kernel_info = get_tinygrad_kernel(args.profile)
    if kernel_info is None:
      print(f"Failed to get kernel for '{args.profile}'")
      return
    kernel = kernel_info[0]
    print(f"Profiling instructions for '{args.profile}' kernel...")
    print("=" * 110)
    results = profile_instructions(kernel)
    print(f"{'Instruction':<90} {'Compile(ms)':>12}")
    print("-" * 110)
    for r in results[:args.top]:
      inst = r['inst_str'][:87] + "..." if len(r['inst_str']) > 90 else r['inst_str']
      print(f"{inst:<90} {r['compile_ms']:>12.3f}")
    print("-" * 110)
    total = sum(r['compile_ms'] for r in results)
    print(f"{'TOTAL':<90} {total:>12.3f}")
    return

  # Zero timed runs cannot produce an average; reject it here rather than failing mid-benchmark.
  if args.iterations < 1: parser.error("--iterations must be at least 1")

  rust_remu = get_rust_remu()
  if rust_remu is None:
    print("Rust libremu not found. Build with: cargo build --release --manifest-path extra/remu/Cargo.toml")
    print("Running Python-only benchmarks...\n")
  print("=" * 90)
  print("RDNA3 Emulator Benchmark: Python vs Rust")
  print("=" * 90)
  results = []
  print("\n[TINYGRAD KERNELS]")
  print("-" * 90)
  for op_name in TINYGRAD_TESTS:
    print(f"\n{op_name}:", end=" ", flush=True)
    kernel_info = get_tinygrad_kernel(op_name)
    if kernel_info is None:
      print("failed to compile")
      continue
    kernel, global_size, local_size, buf_sizes, buf_data, rsrc2 = kernel_info
    # buffers and args_arr are held in locals so the memory behind args_ptr stays alive during the runs.
    buffers, args_arr, args_ptr, ranges = setup_buffers(buf_sizes, buf_data)
    # Benchmark Python emulator (must be first to measure compile time before cache is populated)
    py_compile, py_exec, n_insts, n_compiled = benchmark_python_split(kernel, global_size, local_size, args_ptr, rsrc2, args.iterations)
    n_workgroups = global_size[0] * global_size[1] * global_size[2]
    n_threads = local_size[0] * local_size[1] * local_size[2]
    total_work = n_insts * n_workgroups * n_threads
    print(f"{n_insts} insts ({n_compiled} unique) × {n_workgroups} WGs × {n_threads} threads = {total_work:,} ops")
    rust_time = benchmark_emulator("Rust", rust_remu.run_asm, kernel, global_size, local_size,
                                   args_ptr, rsrc2, args.iterations) if rust_remu else None
    if py_compile is not None:
      print(f" Compile: {py_compile*1000:8.3f} ms ({n_compiled} unique)")
    # py_exec is None when the emulator reported an error; only compute a rate from a usable time.
    if py_exec:
      py_exec_rate = total_work / py_exec / 1e6
      print(f" Exec: {py_exec*1000:8.3f} ms ({py_exec_rate:7.2f} M ops/s)")
    if rust_time:
      rust_rate = total_work / rust_time / 1e6
      speedup = py_exec / rust_time if py_exec else 0
      print(f" Rust: {rust_time*1000:8.3f} ms ({rust_rate:7.2f} M ops/s) [{speedup:.1f}x faster]")
    results.append((op_name, n_insts, n_compiled, n_workgroups, py_compile, py_exec, rust_time))

  # Summary table
  print("\n" + "=" * 110)
  print("SUMMARY")
  print("=" * 110)
  print(f"{'Name':<16} {'Insts':<6} {'Unique':<6} {'WGs':<5} {'Compile (ms)':<14} {'Exec (ms)':<12} {'Rust (ms)':<12} {'Speedup':<10}")
  print("-" * 110)
  for name, n_insts, n_compiled, n_wgs, py_compile, py_exec, rust_time in results:
    compile_ms = f"{py_compile*1000:.3f}" if py_compile else "error"
    exec_ms = f"{py_exec*1000:.3f}" if py_exec else "error"
    if rust_time:
      rust_ms = f"{rust_time*1000:.3f}"
      speedup = f"{py_exec/rust_time:.1f}x" if py_exec else "N/A"
    else:
      rust_ms, speedup = "N/A", "N/A"
    print(f"{name:<16} {n_insts:<6} {n_compiled:<6} {n_wgs:<5} {compile_ms:<14} {exec_ms:<12} {rust_ms:<12} {speedup:<10}")

if __name__ == "__main__":
  # AMD=1 is placed in the environment before main() runs (presumably to select tinygrad's AMD backend -- confirm).
  os.environ["AMD"] = "1"
  main()

View file

@ -3,7 +3,7 @@ import unittest, ctypes
from dataclasses import dataclass
from tinygrad import Device
from tinygrad.renderer.amd.emu import WaveState, decode_program, WAVE_SIZE, VCC_LO, EXEC_LO, SCC
from tinygrad.renderer.amd.emu import WaveState, _decode_at, WAVE_SIZE, VCC_LO, EXEC_LO, SCC
from tinygrad.renderer.amd import decode_inst
from test.amd.helpers import KernelInfo
from test.amd.bench_emu import REMU_PATH
@ -89,7 +89,7 @@ class RustEmulator:
class PythonEmulator:
def __init__(self):
self.state: WaveState | None = None
self.program: dict | None = None
self.program: dict[int, tuple] = {} # lazily populated: pc -> (name, fxn, globals)
self.vmem_buf = None
self.lds_buf = None
self.kernel_buf = None # Keep kernel bytes alive
@ -99,27 +99,29 @@ class PythonEmulator:
import ctypes
from tinygrad.device import Buffer, BufferSpec
from tinygrad.dtype import dtypes
# Store kernel in a ctypes buffer so generic instructions can read from vmem at actual PC address
# Store kernel in a ctypes buffer so _decode_at can read from memory at actual PC address
self.kernel_buf = (ctypes.c_char * len(kernel)).from_buffer_copy(kernel)
self.lib_addr = ctypes.addressof(self.kernel_buf)
# Remap program dict to use actual addresses (like run_asm does)
program_raw = decode_program(kernel)
self.program = {self.lib_addr + offset: val for offset, val in program_raw.items()}
self.program = {}
self.state = WaveState(n_lanes)
self.state.pc = self.lib_addr # Set PC to code base address
self.vmem_buf = Buffer('CPU', 1 << 40, dtypes.uint32, options=BufferSpec(external_ptr=0)).ensure_allocated()
self.lds_buf = Buffer('CPU', 65536 // 4, dtypes.uint32).ensure_allocated()
def _ensure_decoded(self, pc: int):
if pc not in self.program:
runner = _decode_at(pc, "rdna3")
self.program[pc] = (runner.p.function_name, runner._prg.fxn, runner.p.globals)
def step(self) -> int:
import ctypes
assert self.program is not None and self.state is not None
assert self.state is not None
pc = self.state.pc
if pc == 0xFFFFFFFFFFFFFFFF or pc not in self.program: return -1
name, fxn, globals_list, _runner = self.program[pc]
if fxn is None: return 1 # unsupported instruction
if pc == 0xFFFFFFFFFFFFFFFF: return -1
self._ensure_decoded(pc)
name, fxn, globals_list = self.program[pc]
buf_addrs = {0: self.state.sgpr_buf._buf.va_addr, 1: self.state.vgpr_buf._buf.va_addr, # type: ignore[union-attr]
2: self.vmem_buf._buf.va_addr, 3: self.lds_buf._buf.va_addr} # type: ignore[union-attr]
# Direct ctypes call - bypasses HCQ overhead
fxn(*[ctypes.c_uint64(buf_addrs[g]) for g in globals_list], ctypes.c_int32(0))
return -1 if self.state.pc == 0xFFFFFFFFFFFFFFFF else 0
@ -140,7 +142,7 @@ class PythonEmulator:
exec_mask=sgpr[EXEC_LO.offset], sgpr=sgpr, vgpr=vgpr)
def run_single_kernel(kernel: bytes, n_lanes: int, args_ptr: int, global_size: tuple[int, int, int],
local_size: tuple[int, int, int], program, max_steps: int, debug: bool, trace_len: int,
local_size: tuple[int, int, int], max_steps: int, debug: bool, trace_len: int,
kernel_idx: int = 0, max_workgroups: int = 8) -> tuple[bool, str, int]:
"""Run a single kernel through both emulators. Returns (success, message, total_steps)."""
gx, gy, gz = global_size
@ -181,9 +183,9 @@ def run_single_kernel(kernel: bytes, n_lanes: int, args_ptr: int, global_size: t
rust_before = rust.get_snapshot()
python_before = python.get_snapshot()
assert python.program is not None
inst_info = python.program.get(python.lib_addr + python_before.pc * 4) # Convert word offset to actual address
inst_hex_name = inst_info[0] if inst_info else f"unknown at PC={python_before.pc}"
pc_addr = python.lib_addr + python_before.pc * 4 # Convert word offset to actual address
python._ensure_decoded(pc_addr)
inst_hex_name = python.program[pc_addr][0]
# Decode the instruction to get mnemonic for sync_after checks
try:
# Format is mnemonic_hexbytes, e.g. v_exp_f32_e32_014b027e -> hex is 014b027e
@ -310,12 +312,11 @@ def compare_emulators_multi_kernel(kernels: list[KernelInfo], buf_pool: dict[int
kernel_ranges = ranges | {(args_ptr, ctypes.sizeof(args))}
set_valid_mem_ranges(kernel_ranges)
program = decode_program(kernel.code)
n_lanes = kernel.local_size[0] * kernel.local_size[1] * kernel.local_size[2]
ok, msg, steps = run_single_kernel(
kernel.code, min(n_lanes, 32), args_ptr, kernel.global_size,
kernel.local_size, program, max_steps, debug, trace_len, ki
kernel.local_size, max_steps, debug, trace_len, ki
)
total_steps += steps
if not ok:
@ -341,9 +342,8 @@ def compare_emulators_with_memory(kernel: bytes, n_lanes: int, buf_sizes: list,
ranges.add((args_ptr, ctypes.sizeof(args)))
set_valid_mem_ranges(ranges)
program = decode_program(kernel)
# Legacy wrapper assumes local_size = (n_lanes, 1, 1)
ok, msg, _ = run_single_kernel(kernel, n_lanes, args_ptr, global_size, (n_lanes, 1, 1), program, max_steps, debug, trace_len)
ok, msg, _ = run_single_kernel(kernel, n_lanes, args_ptr, global_size, (n_lanes, 1, 1), max_steps, debug, trace_len)
return ok, msg
def get_kernels_from_tinygrad(op_fn) -> tuple[list[KernelInfo], dict[int, int], dict[int, bytes]]:

View file

@ -1,7 +1,7 @@
import unittest, ctypes
from tinygrad.runtime.autogen.amd.rdna4 import ins as ir4
from tinygrad.renderer.amd.dsl import v, s
from tinygrad.renderer.amd.emu import WaveState, decode_program
from tinygrad.renderer.amd.emu import WaveState, _decode_at
from tinygrad.device import Buffer, BufferSpec
from tinygrad.dtype import dtypes
@ -12,12 +12,10 @@ class TestRDNA4Emu(unittest.TestCase):
if not any(isinstance(i, ir4.SOPP) and i.op == ir4.SOPPOp.S_ENDPGM for i in insts):
insts = list(insts) + [ir4.SOPP(ir4.SOPPOp.S_ENDPGM, simm=0)]
# Assemble and decode
# Assemble into ctypes buffer (must stay alive for _decode_at to read from memory)
code = b''.join(i.to_bytes() for i in insts)
code_buf = (ctypes.c_uint8 * len(code)).from_buffer_copy(code)
code_addr = ctypes.addressof(code_buf)
program_raw = decode_program(code, "rdna4")
program = {code_addr + offset: val for offset, val in program_raw.items()}
# Setup wave state
st = WaveState(n_lanes=1)
@ -28,12 +26,16 @@ class TestRDNA4Emu(unittest.TestCase):
# Setup vmem buffer with external_ptr=0 (maps to address 0, allows any pointer access)
vmem_buf = Buffer('CPU', 1 << 40, dtypes.uint32, options=BufferSpec(external_ptr=0)).ensure_allocated()
# Execute
# Execute with lazy decoding (same pattern as run_asm)
program: dict[int, tuple] = {}
c_bufs = [ctypes.c_uint64(st.sgpr_buf._buf.va_addr), ctypes.c_uint64(st.vgpr_buf._buf.va_addr),
ctypes.c_uint64(vmem_buf._buf.va_addr), ctypes.c_uint64(0), ctypes.c_uint64(0)]
for _ in range(100):
if (pc := st.pc) == 0xFFFFFFFFFFFFFFFF or pc not in program: break
_, fxn, globals_list, _ = program[pc]
if (pc := st.pc) == 0xFFFFFFFFFFFFFFFF: break
if pc not in program:
runner = _decode_at(pc, "rdna4")
program[pc] = (runner._prg.fxn, runner.p.globals)
fxn, globals_list = program[pc]
fxn(*[c_bufs[g] for g in globals_list])
return st

View file

@ -7,7 +7,7 @@
# arg=4: scratch - per-lane scratch memory
from __future__ import annotations
import ctypes, functools, re, platform, subprocess, tempfile
from typing import Any, Callable
from typing import Callable
# Set/restore DAZ+FTZ (denormals-are-zero + flush-to-zero) to match RDNA3 default float mode
# x86: MXCSR bits DAZ(6)+FTZ(15), ARM64: FPCR bit FZ(24)
@ -1183,17 +1183,15 @@ def _get_runner(inst_bytes: bytes, arch: str = "rdna3"):
_canonical_runner_cache.append((base, mask, size, runner))
return runner
def _decode_at(pc: int, arch: str) -> tuple[Callable, list[int]]:
"""Decode and compile instruction at absolute address pc. Returns (fxn, globals)."""
def _decode_at(pc: int, arch: str):
"""Decode and compile instruction at absolute address pc. Returns CompiledRunner."""
inst_bytes = bytes((ctypes.c_char * 16).from_address(pc).raw)
inst = decode_inst(inst_bytes, arch)
try:
runner = _get_runner(bytes(inst_bytes[:inst.size() + 4]), arch)
try: return _get_runner(bytes(inst_bytes[:inst.size() + 4]), arch)
except Exception as e:
try: inst_str = repr(inst)
except Exception: inst_str = f"<{type(inst).__name__}>"
raise RuntimeError(f"[emu] Failed to compile {inst_str}: {type(e).__name__}: {e}") from e
return runner._prg.fxn, runner.p.globals
# ═══════════════════════════════════════════════════════════════════════════════
# WAVE STATE
@ -1242,7 +1240,7 @@ class WaveState:
def run_asm(lib: int, lib_sz: int, gx: int, gy: int, gz: int, lx: int, ly: int, lz: int, args_ptr: int, rsrc2: int = 0x19c,
scratch_size: int = 0, arch: str = "rdna3", user_data: list[int]|None = None) -> int:
"""Execute AMD assembly program. scratch_size is private_segment_fixed_size from kernel descriptor (per-lane)."""
program: dict[int, tuple[Callable, list[int]]] = {} # lazily populated: pc -> (fxn, globals)
program: dict[int, tuple[Callable, list[int]]] = {} # lazily populated: pc -> (fxn, globals) extracted from runner
lds_size = ((rsrc2 & hsa.AMD_COMPUTE_PGM_RSRC_TWO_GRANULATED_LDS_SIZE) >> hsa.AMD_COMPUTE_PGM_RSRC_TWO_GRANULATED_LDS_SIZE_SHIFT) * 512
total_threads = lx * ly * lz
@ -1295,7 +1293,8 @@ def run_asm(lib: int, lib_sz: int, gx: int, gy: int, gz: int, lx: int, ly: int,
if (pc := st.pc) == 0xFFFFFFFFFFFFFFFF: break
if pc not in program:
prev_len = len(_canonical_runner_cache)
program[pc] = _decode_at(pc, arch)
runner = _decode_at(pc, arch)
program[pc] = (runner._prg.fxn, runner.p.globals)
if DEBUG >= 3:
inst = decode_inst(bytes((ctypes.c_char * 16).from_address(pc).raw), arch)
msg = f"[emu] PC={pc - lib}: {inst!r}"