Compare commits

...

3 commits

Author SHA1 Message Date
Comma Device
6d082d46ce fp32 paeak 2026-05-16 03:03:59 +00:00
Comma Device
9169a9b674 thread128 support 2026-05-16 03:00:57 +00:00
Comma Device
a4b9f67153 add qcom_fp16_mad_peak.py 2026-05-16 02:41:31 +00:00
2 changed files with 120 additions and 3 deletions

View file

@ -0,0 +1,110 @@
#!/usr/bin/env python3
"""FP16/FP32 MAD peak repro for comparing DEV=CL and DEV=QCOM.
Example:
DEV=CL python3 extra/mmapeak/qcom_fp16_mad_peak.py
DEV=QCOM python3 extra/mmapeak/qcom_fp16_mad_peak.py --dtype fp32
"""
from __future__ import annotations
import argparse
from tinygrad import Device, dtypes
from tinygrad.device import Buffer
MAD_OPS_PER_LOOP = 16
VEC = 16
def kernel_name(dtype:str) -> str:
return f"{dtype}_mad_peak"
def make_kernel(loops:int, dtype:str="fp16") -> str:
assert dtype in {"fp16", "fp32"}
scalar = "half" if dtype == "fp16" else "float"
vec_type = f"{scalar}{VEC}"
prefix = "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n" if dtype == "fp16" else ""
cast = "(half)" if dtype == "fp16" else ""
suffix = "f"
mad_block = "\n".join([
" x = mad(y, x, y);",
" y = mad(x, y, x);",
] * (MAD_OPS_PER_LOOP // 2))
x_init = ",\n ".join(f"bx + {cast}{(i + 1) * 0.001:.3f}{suffix}" for i in range(VEC))
y_init = ",\n ".join(f"by + {cast}{(i + 17) * 0.001:.3f}{suffix}" for i in range(VEC))
sum_terms = " + ".join([f"x.s{'0123456789abcdef'[i]}" for i in range(VEC)] +
[f"y.s{'0123456789abcdef'[i]}" for i in range(VEC)])
return f"""{prefix}__kernel void {kernel_name(dtype)}(__global {scalar} *out) {{
int lid = get_local_id(0);
int gid = get_group_id(0);
{scalar} bx = {cast}1.0f + {cast}(lid & 15) * {cast}0.001f;
{scalar} by = {cast}1.0f + {cast}(gid & 15) * {cast}0.001f;
{vec_type} x = ({vec_type})(
{x_init});
{vec_type} y = ({vec_type})(
{y_init});
for (int i = 0; i < {loops}; i++) {{
{mad_block}
}}
out[get_global_id(0)] = {sum_terms};
}}"""
def run(args:argparse.Namespace) -> None:
dev = Device[Device.DEFAULT]
renderer = type(dev.renderer).__name__
if renderer == "IR3Renderer":
raise SystemExit("This repro uses OpenCL source. Use DEV=QCOM or DEV=CL, not DEV=QCOM:IR3.")
dtype = args.dtype
dt = dtypes.half if dtype == "fp16" else dtypes.float
src = make_kernel(args.loops, dtype)
if args.print_source: print(src)
lib = dev.compiler.compile_cached(src)
if args.disasm: dev.compiler.disassemble(lib)
# Runtime aux mirrors OpenCLRenderer.aux: one __global output pointer at kernel arg 0.
global_size = (args.groups, 1, 1)
local_size = (args.local, 1, 1)
workitems = args.groups * args.local
flops = workitems * args.loops * MAD_OPS_PER_LOOP * VEC * 2
prg = dev.runtime(kernel_name(dtype), lib, (((0, dt.ptr()),),))
out = Buffer(dev.device, workitems, dt, preallocate=True)
for _ in range(args.warmup):
prg(out._buf, global_size=global_size, local_size=local_size, wait=True)
times = [prg(out._buf, global_size=global_size, local_size=local_size, wait=True) for _ in range(args.iters)]
best = min(t for t in times if t is not None)
out_bits = out.copyout(memoryview(bytearray(out.nbytes))).cast("H" if dtype == "fp16" else "I")[0]
out_fmt = "04x" if dtype == "fp16" else "08x"
print(f"device={dev.device} renderer={renderer} arch={dev.arch}")
print(f"dtype={dtype} groups={args.groups} local={args.local} workitems={workitems} loops={args.loops} flops={flops}")
print(f"best={best*1e6:.2f} us {dtype}_mad_peak={flops / best * 1e-9:.2f} GFLOPS out0=0x{out_bits:{out_fmt}}")
if args.show_times:
print("times_us=" + ",".join(f"{t*1e6:.2f}" for t in times if t is not None))
def main() -> None:
parser = argparse.ArgumentParser(description="FP16/FP32 MAD peak repro for DEV=CL vs DEV=QCOM")
parser.add_argument("--dtype", choices=("fp16", "fp32"), default="fp16", help="MAD datatype")
parser.add_argument("--groups", type=int, default=2048, help="number of workgroups")
parser.add_argument("--local", type=int, default=256, help="workitems per workgroup")
parser.add_argument("--loops", type=int, default=8, help="inner loop count; default matches clpeak vec16")
parser.add_argument("--warmup", type=int, default=2, help="warmup launches")
parser.add_argument("--iters", type=int, default=10, help="timed launches")
parser.add_argument("--show-times", action="store_true", help="print every timed launch")
parser.add_argument("--print-source", action="store_true", help="print generated OpenCL source")
parser.add_argument("--disasm", action="store_true", help="call the tinygrad compiler disassembler after compile")
run(parser.parse_args())
if __name__ == "__main__":
main()

View file

@ -137,13 +137,15 @@ class QCOMComputeQueue(HWQueue):
self.reg(mesa.REG_A6XX_TPL1_DBG_ECO_CNTL, 0)
self.cmd(mesa.CP_WAIT_FOR_IDLE)
threadsize = prg.threadsize
self.reg(mesa.REG_A6XX_SP_CS_NDRANGE_0,
qreg.a6xx_sp_cs_ndrange_0(kerneldim=3, localsizex=local_size[0] - 1, localsizey=local_size[1] - 1, localsizez=local_size[2] - 1),
global_size_mp[0], 0, global_size_mp[1], 0, global_size_mp[2], 0, 0xccc0cf, 0xfc | qreg.a6xx_sp_cs_wge_cntl(threadsize=mesa.THREAD64),
global_size_mp[0], 0, global_size_mp[1], 0, global_size_mp[2], 0, 0xccc0cf, 0xfc | qreg.a6xx_sp_cs_wge_cntl(threadsize=threadsize),
cast_int(global_size[0], ceil=True), cast_int(global_size[1], ceil=True), cast_int(global_size[2], ceil=True))
self.reg(mesa.REG_A6XX_SP_CS_CNTL_0,
qreg.a6xx_sp_cs_cntl_0(threadsize=mesa.THREAD64, halfregfootprint=prg.hregs, fullregfootprint=prg.fregs, branchstack=prg.brnchstck),
qreg.a6xx_sp_cs_cntl_0(threadsize=threadsize, halfregfootprint=prg.hregs, fullregfootprint=prg.fregs, branchstack=prg.brnchstck),
qreg.a6xx_sp_cs_cntl_1(constantrammode=mesa.CONSTLEN_256, shared_size=prg.shared_size), # should this be CONSTLEN_512?
0, prg.prg_offset, *data64_le(prg.lib_gpu.va_addr),
qreg.a6xx_sp_cs_pvt_mem_param(memsizeperitem=prg.pvtmem_size_per_item), *data64_le(prg.dev._stack.va_addr),
@ -187,7 +189,7 @@ class QCOMComputeQueue(HWQueue):
if prg.NIR:
self.reg(mesa.REG_A6XX_SP_CS_CONST_CONFIG_0,
qreg.a6xx_sp_cs_const_config_0(wgidconstid=prg.wgid, wgsizeconstid=prg.wgsz, wgoffsetconstid=0xfc, localidregid=prg.lid),
qreg.a6xx_sp_cs_wge_cntl(linearlocalidregid=0xfc, threadsize=mesa.THREAD64))
qreg.a6xx_sp_cs_wge_cntl(linearlocalidregid=0xfc, threadsize=threadsize))
self.cmd(mesa.CP_EXEC_CS, 0,
qreg.cp_exec_cs_1(ngroups_x=global_size[0]), qreg.cp_exec_cs_2(ngroups_y=global_size[1]), qreg.cp_exec_cs_3(_ngroups_z=global_size[2]))
else: self.cmd(mesa.CP_RUN_OPENCL, 0)
@ -251,6 +253,7 @@ class QCOMProgram(HCQProgram):
self.tex_off, self.ibo_off, self.samp_off = 2048, 2048 + 0x40 * self.tex_cnt, 2048 + 0x40 * (self.tex_cnt + self.ibo_cnt)
self.fregs, self.hregs = v.info.max_reg + 1, v.info.max_half_reg + 1
self.threadsize = mesa.THREAD128 if v.info.double_threadsize else mesa.THREAD64
else: self._parse_lib(lib)
self.lib_gpu: HCQBuffer = self.dev.allocator.alloc(self.image_size, buf_spec:=BufferSpec(cpu_access=True, nolru=True))
@ -320,6 +323,10 @@ class QCOMProgram(HCQProgram):
reg_desc_off = _read_lib(lib, 0x34)
self.fregs, self.hregs = _read_lib(lib, reg_desc_off + 0x14), _read_lib(lib, reg_desc_off + 0x18)
# The Qualcomm OpenCL stack dispatches these binaries with 128-thread waves.
# THREAD64 leaves half-rate ALU throughput for the same shader image.
self.threadsize = mesa.THREAD128 if getenv("THREAD128") else mesa.THREAD64
class QCOMAllocator(HCQAllocatorBase):
def _alloc(self, size:int, opts:BufferSpec) -> HCQBuffer:
return self.dev._gpu_map(opts.external_ptr, size) if opts.external_ptr else self.dev._gpu_alloc(size)