mirror of
https://github.com/tinygrad/tinygrad.git
synced 2026-06-24 02:14:17 +00:00
Compare commits
3 commits
master
...
qcom_mmape
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
6d082d46ce | ||
|
|
9169a9b674 | ||
|
|
a4b9f67153 |
2 changed files with 120 additions and 3 deletions
110
extra/mmapeak/qcom_fp16_mad_peak.py
Normal file
110
extra/mmapeak/qcom_fp16_mad_peak.py
Normal file
|
|
@ -0,0 +1,110 @@
|
|||
#!/usr/bin/env python3
|
||||
"""FP16/FP32 MAD peak repro for comparing DEV=CL and DEV=QCOM.
|
||||
|
||||
Example:
|
||||
DEV=CL python3 extra/mmapeak/qcom_fp16_mad_peak.py
|
||||
DEV=QCOM python3 extra/mmapeak/qcom_fp16_mad_peak.py --dtype fp32
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
|
||||
from tinygrad import Device, dtypes
|
||||
from tinygrad.device import Buffer
|
||||
|
||||
|
||||
MAD_OPS_PER_LOOP = 16
|
||||
VEC = 16
|
||||
|
||||
|
||||
def kernel_name(dtype:str) -> str:
|
||||
return f"{dtype}_mad_peak"
|
||||
|
||||
|
||||
def make_kernel(loops:int, dtype:str="fp16") -> str:
|
||||
assert dtype in {"fp16", "fp32"}
|
||||
scalar = "half" if dtype == "fp16" else "float"
|
||||
vec_type = f"{scalar}{VEC}"
|
||||
prefix = "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n" if dtype == "fp16" else ""
|
||||
cast = "(half)" if dtype == "fp16" else ""
|
||||
suffix = "f"
|
||||
mad_block = "\n".join([
|
||||
" x = mad(y, x, y);",
|
||||
" y = mad(x, y, x);",
|
||||
] * (MAD_OPS_PER_LOOP // 2))
|
||||
|
||||
x_init = ",\n ".join(f"bx + {cast}{(i + 1) * 0.001:.3f}{suffix}" for i in range(VEC))
|
||||
y_init = ",\n ".join(f"by + {cast}{(i + 17) * 0.001:.3f}{suffix}" for i in range(VEC))
|
||||
sum_terms = " + ".join([f"x.s{'0123456789abcdef'[i]}" for i in range(VEC)] +
|
||||
[f"y.s{'0123456789abcdef'[i]}" for i in range(VEC)])
|
||||
return f"""{prefix}__kernel void {kernel_name(dtype)}(__global {scalar} *out) {{
|
||||
int lid = get_local_id(0);
|
||||
int gid = get_group_id(0);
|
||||
{scalar} bx = {cast}1.0f + {cast}(lid & 15) * {cast}0.001f;
|
||||
{scalar} by = {cast}1.0f + {cast}(gid & 15) * {cast}0.001f;
|
||||
{vec_type} x = ({vec_type})(
|
||||
{x_init});
|
||||
{vec_type} y = ({vec_type})(
|
||||
{y_init});
|
||||
|
||||
for (int i = 0; i < {loops}; i++) {{
|
||||
{mad_block}
|
||||
}}
|
||||
|
||||
out[get_global_id(0)] = {sum_terms};
|
||||
}}"""
|
||||
|
||||
|
||||
def run(args:argparse.Namespace) -> None:
|
||||
dev = Device[Device.DEFAULT]
|
||||
renderer = type(dev.renderer).__name__
|
||||
if renderer == "IR3Renderer":
|
||||
raise SystemExit("This repro uses OpenCL source. Use DEV=QCOM or DEV=CL, not DEV=QCOM:IR3.")
|
||||
|
||||
dtype = args.dtype
|
||||
dt = dtypes.half if dtype == "fp16" else dtypes.float
|
||||
src = make_kernel(args.loops, dtype)
|
||||
if args.print_source: print(src)
|
||||
lib = dev.compiler.compile_cached(src)
|
||||
if args.disasm: dev.compiler.disassemble(lib)
|
||||
|
||||
# Runtime aux mirrors OpenCLRenderer.aux: one __global output pointer at kernel arg 0.
|
||||
global_size = (args.groups, 1, 1)
|
||||
local_size = (args.local, 1, 1)
|
||||
workitems = args.groups * args.local
|
||||
flops = workitems * args.loops * MAD_OPS_PER_LOOP * VEC * 2
|
||||
|
||||
prg = dev.runtime(kernel_name(dtype), lib, (((0, dt.ptr()),),))
|
||||
out = Buffer(dev.device, workitems, dt, preallocate=True)
|
||||
|
||||
for _ in range(args.warmup):
|
||||
prg(out._buf, global_size=global_size, local_size=local_size, wait=True)
|
||||
|
||||
times = [prg(out._buf, global_size=global_size, local_size=local_size, wait=True) for _ in range(args.iters)]
|
||||
best = min(t for t in times if t is not None)
|
||||
out_bits = out.copyout(memoryview(bytearray(out.nbytes))).cast("H" if dtype == "fp16" else "I")[0]
|
||||
out_fmt = "04x" if dtype == "fp16" else "08x"
|
||||
|
||||
print(f"device={dev.device} renderer={renderer} arch={dev.arch}")
|
||||
print(f"dtype={dtype} groups={args.groups} local={args.local} workitems={workitems} loops={args.loops} flops={flops}")
|
||||
print(f"best={best*1e6:.2f} us {dtype}_mad_peak={flops / best * 1e-9:.2f} GFLOPS out0=0x{out_bits:{out_fmt}}")
|
||||
if args.show_times:
|
||||
print("times_us=" + ",".join(f"{t*1e6:.2f}" for t in times if t is not None))
|
||||
|
||||
|
||||
def main() -> None:
|
||||
parser = argparse.ArgumentParser(description="FP16/FP32 MAD peak repro for DEV=CL vs DEV=QCOM")
|
||||
parser.add_argument("--dtype", choices=("fp16", "fp32"), default="fp16", help="MAD datatype")
|
||||
parser.add_argument("--groups", type=int, default=2048, help="number of workgroups")
|
||||
parser.add_argument("--local", type=int, default=256, help="workitems per workgroup")
|
||||
parser.add_argument("--loops", type=int, default=8, help="inner loop count; default matches clpeak vec16")
|
||||
parser.add_argument("--warmup", type=int, default=2, help="warmup launches")
|
||||
parser.add_argument("--iters", type=int, default=10, help="timed launches")
|
||||
parser.add_argument("--show-times", action="store_true", help="print every timed launch")
|
||||
parser.add_argument("--print-source", action="store_true", help="print generated OpenCL source")
|
||||
parser.add_argument("--disasm", action="store_true", help="call the tinygrad compiler disassembler after compile")
|
||||
run(parser.parse_args())
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
|
@ -137,13 +137,15 @@ class QCOMComputeQueue(HWQueue):
|
|||
self.reg(mesa.REG_A6XX_TPL1_DBG_ECO_CNTL, 0)
|
||||
self.cmd(mesa.CP_WAIT_FOR_IDLE)
|
||||
|
||||
threadsize = prg.threadsize
|
||||
|
||||
self.reg(mesa.REG_A6XX_SP_CS_NDRANGE_0,
|
||||
qreg.a6xx_sp_cs_ndrange_0(kerneldim=3, localsizex=local_size[0] - 1, localsizey=local_size[1] - 1, localsizez=local_size[2] - 1),
|
||||
global_size_mp[0], 0, global_size_mp[1], 0, global_size_mp[2], 0, 0xccc0cf, 0xfc | qreg.a6xx_sp_cs_wge_cntl(threadsize=mesa.THREAD64),
|
||||
global_size_mp[0], 0, global_size_mp[1], 0, global_size_mp[2], 0, 0xccc0cf, 0xfc | qreg.a6xx_sp_cs_wge_cntl(threadsize=threadsize),
|
||||
cast_int(global_size[0], ceil=True), cast_int(global_size[1], ceil=True), cast_int(global_size[2], ceil=True))
|
||||
|
||||
self.reg(mesa.REG_A6XX_SP_CS_CNTL_0,
|
||||
qreg.a6xx_sp_cs_cntl_0(threadsize=mesa.THREAD64, halfregfootprint=prg.hregs, fullregfootprint=prg.fregs, branchstack=prg.brnchstck),
|
||||
qreg.a6xx_sp_cs_cntl_0(threadsize=threadsize, halfregfootprint=prg.hregs, fullregfootprint=prg.fregs, branchstack=prg.brnchstck),
|
||||
qreg.a6xx_sp_cs_cntl_1(constantrammode=mesa.CONSTLEN_256, shared_size=prg.shared_size), # should this be CONSTLEN_512?
|
||||
0, prg.prg_offset, *data64_le(prg.lib_gpu.va_addr),
|
||||
qreg.a6xx_sp_cs_pvt_mem_param(memsizeperitem=prg.pvtmem_size_per_item), *data64_le(prg.dev._stack.va_addr),
|
||||
|
|
@ -187,7 +189,7 @@ class QCOMComputeQueue(HWQueue):
|
|||
if prg.NIR:
|
||||
self.reg(mesa.REG_A6XX_SP_CS_CONST_CONFIG_0,
|
||||
qreg.a6xx_sp_cs_const_config_0(wgidconstid=prg.wgid, wgsizeconstid=prg.wgsz, wgoffsetconstid=0xfc, localidregid=prg.lid),
|
||||
qreg.a6xx_sp_cs_wge_cntl(linearlocalidregid=0xfc, threadsize=mesa.THREAD64))
|
||||
qreg.a6xx_sp_cs_wge_cntl(linearlocalidregid=0xfc, threadsize=threadsize))
|
||||
self.cmd(mesa.CP_EXEC_CS, 0,
|
||||
qreg.cp_exec_cs_1(ngroups_x=global_size[0]), qreg.cp_exec_cs_2(ngroups_y=global_size[1]), qreg.cp_exec_cs_3(_ngroups_z=global_size[2]))
|
||||
else: self.cmd(mesa.CP_RUN_OPENCL, 0)
|
||||
|
|
@ -251,6 +253,7 @@ class QCOMProgram(HCQProgram):
|
|||
|
||||
self.tex_off, self.ibo_off, self.samp_off = 2048, 2048 + 0x40 * self.tex_cnt, 2048 + 0x40 * (self.tex_cnt + self.ibo_cnt)
|
||||
self.fregs, self.hregs = v.info.max_reg + 1, v.info.max_half_reg + 1
|
||||
self.threadsize = mesa.THREAD128 if v.info.double_threadsize else mesa.THREAD64
|
||||
else: self._parse_lib(lib)
|
||||
|
||||
self.lib_gpu: HCQBuffer = self.dev.allocator.alloc(self.image_size, buf_spec:=BufferSpec(cpu_access=True, nolru=True))
|
||||
|
|
@ -320,6 +323,10 @@ class QCOMProgram(HCQProgram):
|
|||
reg_desc_off = _read_lib(lib, 0x34)
|
||||
self.fregs, self.hregs = _read_lib(lib, reg_desc_off + 0x14), _read_lib(lib, reg_desc_off + 0x18)
|
||||
|
||||
# The Qualcomm OpenCL stack dispatches these binaries with 128-thread waves.
|
||||
# THREAD64 leaves half-rate ALU throughput for the same shader image.
|
||||
self.threadsize = mesa.THREAD128 if getenv("THREAD128") else mesa.THREAD64
|
||||
|
||||
class QCOMAllocator(HCQAllocatorBase):
|
||||
def _alloc(self, size:int, opts:BufferSpec) -> HCQBuffer:
|
||||
return self.dev._gpu_map(opts.external_ptr, size) if opts.external_ptr else self.dev._gpu_alloc(size)
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue