remove CompiledRunner (#15970)

* rm usage of CompiledRunner

* more tests

* last

* linter

* sink

* remove

* linter
This commit is contained in:
nimlgen 2026-04-29 22:45:48 +03:00 committed by GitHub
commit dfd2d07005
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
8 changed files with 77 additions and 107 deletions

View file

@ -4,7 +4,8 @@ import triton.language as tl
from triton.compiler import AttrsDescriptor, ASTSource, compile as triton_compile
import numpy as np
from tinygrad import Tensor, dtypes, Device
from tinygrad.engine.realize import CompiledRunner
from tinygrad.engine.realize import get_runtime
from tinygrad.codegen import to_program
from tinygrad.uop.ops import Ops, UOp, KernelInfo, ProgramInfo
from tinygrad.helpers import getenv
np.set_printoptions(suppress=True)
@ -92,13 +93,15 @@ if __name__ == "__main__":
info = ProgramInfo(name="matmul_kernel",
global_size=(M//BLOCK_SIZE_M, N//BLOCK_SIZE_N, 1), local_size=(32*compiled.metadata.num_warps, 1, 1))
sink = UOp.sink(arg=KernelInfo(name="matmul_kernel"))
prg_uop = UOp(Ops.PROGRAM, src=(sink, UOp(Ops.DEVICE, arg=Device.DEFAULT), UOp(Ops.LINEAR), UOp(Ops.SOURCE, arg=src)), arg=info)
runner = CompiledRunner(prg_uop, Device.DEFAULT)
prg_uop = to_program(UOp(Ops.PROGRAM, src=(sink, UOp(Ops.DEVICE, arg=Device.DEFAULT), UOp(Ops.LINEAR), UOp(Ops.SOURCE, arg=src)), arg=info),
Device.default.renderer)
rt = get_runtime(Device.DEFAULT, prg_uop)
all_bufs = [x.ensure_allocated() for x in bufs]
prg_bufs = [all_bufs[i] for i in runner.p.globals]
prg_bufs = [all_bufs[i] for i in info.globals]
gsize, lsize = info.launch_dims({})
tflops = []
for i in range(5):
tm = runner(prg_bufs, {}, wait=True)
tm = rt(*[b._buf for b in prg_bufs], global_size=gsize, local_size=lsize, vals=info.vals({}), wait=True)
tflops.append((2*M*K*N/tm)*1e-12)
print(f"TFLOPS: {max(tflops):.2f}")

View file

@ -2,10 +2,10 @@ import numpy as np
import unittest
from tinygrad.codegen.opt import Opt, OptOps
from tinygrad.uop.ops import UOp, Ops, GroupOp, AxisType
from tinygrad.uop.ops import UOp, Ops, GroupOp, AxisType, buffers
from tinygrad.device import Device, Buffer, is_dtype_supported
from tinygrad.tensor import Tensor, _to_np_dtype
from tinygrad.engine.realize import run_linear, CompiledRunner
from tinygrad.engine.realize import run_linear
from tinygrad.codegen import to_program
from tinygrad.helpers import Context, flatten, dedup, TC_SELECT, TC_OPT, DEV
from tinygrad.dtype import DType, dtypes, PtrDType, AddrSpace
@ -424,30 +424,28 @@ def reset_bufs(bufs:list[Buffer]):
def _helper_linearizer_opt_ast(realized_ast:UOp, real_bufs:list[Buffer], opts=[],
apply_tc=False, atol=1e-4, rtol=1e-4, color_sizes=[], wanna_output=[]):
outbufs = real_bufs[:len(realized_ast.src)]
device = real_bufs[0].device
wanna_output = [np.array(x).flatten() for x in wanna_output]
buf_uops = [UOp.new_buffer(b.device, b.size, b.dtype) for b in real_bufs]
for u,b in zip(buf_uops, real_bufs): buffers[u] = b
def get_prg(opts):
def run_prg(opts):
ast = realized_ast if opts is None else replace_opts(realized_ast, list(opts))
return CompiledRunner(to_program(ast, renderer=Device[Device.DEFAULT].renderer), device)
run_linear(UOp(Ops.LINEAR, src=(ast.call(*buf_uops),)))
def check_opt(opts):
prg = get_prg(opts=opts)
reset_bufs(outbufs)
prg.exec(real_bufs)
run_prg(opts)
for x,want in zip(copyout_outputs(outbufs), wanna_output): np.testing.assert_allclose(x, want, atol=atol, rtol=rtol)
# Get baseline if it is not provided, which is not optimized at all.
prg = get_prg(opts=())
prg.exec(real_bufs)
run_prg(opts=())
if len(wanna_output) == 0: wanna_output = copyout_outputs(outbufs)
else:
for buf,want in zip(copyout_outputs(outbufs), wanna_output): np.testing.assert_allclose(buf, want, atol=atol, rtol=rtol)
# Check correctness of handcoded optimiztions.
prg = get_prg(opts=None)
reset_bufs(outbufs)
prg.exec(real_bufs)
run_prg(opts=None)
for buf,want in zip(copyout_outputs(outbufs), wanna_output): np.testing.assert_allclose(buf, want, atol=atol, rtol=rtol)
for x in opts: # Check custom transformations if any.
check_opt(([Opt(OptOps.TC, 0, (TC_SELECT.value, TC_OPT.value, 1))] if apply_tc else [])+x)

View file

@ -1,9 +1,8 @@
import unittest
import numpy as np
from dataclasses import replace
from tinygrad.device import Buffer, Device, is_dtype_supported
from tinygrad.device import Device, is_dtype_supported
from tinygrad.dtype import dtypes, ConstType
from tinygrad.engine.realize import CompiledRunner
from tinygrad.engine.realize import run_linear
from tinygrad.codegen import to_program
from tinygrad.helpers import prod
from tinygrad.renderer.cstyle import CStyleLanguage
@ -13,17 +12,13 @@ from tinygrad.runtime.ops_python import PythonRenderer
from tinygrad.uop.ops import UOp, Ops, KernelInfo, python_alu
from tinygrad.tensor import Tensor, _to_np_dtype
def _test_uop_result(inputs:list[Tensor], prg:UOp, local_size=None):
def _test_uop_result(inputs:list[Tensor], sink:UOp, local_size=None):
for x in inputs: x.realize()
uops = prg.src[2].src
outbufs = [Buffer(Device.DEFAULT, sz:=(1 if local_size is None else prod(local_size)), (dtype:=u.src[1].dtype), \
initial_value=np.zeros(sz, dtype=_to_np_dtype(dtype)).data) for u in uops if u.op is Ops.STORE]
inbufs = [x.uop.base.buffer for x in inputs]
info = prg.arg
if local_size is not None: info = replace(info, local_size=tuple(local_size))
ei = CompiledRunner(prg.replace(arg=info), Device.DEFAULT)
ei.exec(outbufs+inbufs)
return [np.frombuffer(x.as_memoryview(), _to_np_dtype(x.dtype)) for x in outbufs]
sz = 1 if local_size is None else prod(local_size)
outs = [UOp.new_buffer(Device.DEFAULT, sz, u.src[1].dtype) for u in sink.src if u.op is Ops.STORE]
for u in outs: u.buffer.allocate().copyin(np.zeros(sz, dtype=_to_np_dtype(u.dtype)).data)
run_linear(UOp(Ops.LINEAR, src=(sink.call(*outs, *(x.uop.base for x in inputs)),)))
return [u.buffer.numpy() for u in outs]
def _setup_and_test_alu(alu_op:Ops, input_val:ConstType, *alu_src_uops:UOp):
dtype = alu_src_uops[0].dtype
@ -33,9 +28,7 @@ def _setup_and_test_alu(alu_op:Ops, input_val:ConstType, *alu_src_uops:UOp):
ld = b.index(idx)
alu = ld.alu(alu_op, *alu_src_uops)
store = UOp.store(a.index(idx), alu)
sink = UOp(Ops.SINK, dtypes.void, (store,), arg=KernelInfo())
prg = to_program(sink, Device[Device.DEFAULT].renderer)
return _test_uop_result([Tensor([input_val])], prg)[0]
return _test_uop_result([Tensor([input_val])], UOp(Ops.SINK, dtypes.void, (store,), arg=KernelInfo()))[0]
class TestRendererFailures(unittest.TestCase):
@unittest.skipIf(not isinstance(Device[Device.DEFAULT].renderer, (PTXRenderer, PythonRenderer)), "test is for ptx or python renderer")
@ -44,8 +37,7 @@ class TestRendererFailures(unittest.TestCase):
gate_alu = (lidx0:=UOp(Ops.SPECIAL, dtypes.int, (UOp.const(dtypes.int, 4),), 'lidx0')).ne(0)
gated_alu_store = UOp(Ops.STORE, dtypes.void, (a.index(lidx0.valid(gate_alu)), UOp.const(dtypes.int, 1)))
sink = UOp(Ops.SINK, dtypes.void, (gated_alu_store,), arg=KernelInfo())
prg = to_program(sink, Device[Device.DEFAULT].renderer)
ret = _test_uop_result([], prg, local_size=[4, 1, 1])[0]
ret = _test_uop_result([], sink, local_size=[4, 1, 1])[0]
np.testing.assert_equal(ret, [0, 1, 1, 1])
@unittest.skipIf(not isinstance(Device[Device.DEFAULT].renderer, (PTXRenderer, PythonRenderer)), "test is for ptx or python renderer")
@ -55,8 +47,7 @@ class TestRendererFailures(unittest.TestCase):
gate_alu_1 = (lidx1:=UOp(Ops.SPECIAL, dtypes.int, (UOp.const(dtypes.int, 2),), 'lidx1')).ne(0)
gated_alu_store = UOp(Ops.STORE, dtypes.void, (a.index((lidx0+lidx1*4).valid(gate_alu_0&gate_alu_1)), UOp.const(dtypes.int, 1)))
sink = UOp(Ops.SINK, dtypes.void, (gated_alu_store,), arg=KernelInfo())
prg = to_program(sink, Device[Device.DEFAULT].renderer)
ret = _test_uop_result([], prg, local_size=[4, 2, 1])[0]
ret = _test_uop_result([], sink, local_size=[4, 2, 1])[0]
np.testing.assert_equal(ret, [0, 0, 0, 0, 0, 1, 1, 1])
@unittest.skipIf(not isinstance(Device[Device.DEFAULT].renderer, CStyleLanguage), "uops are for cstyle")
@ -102,8 +93,7 @@ class TestPTXFailures(unittest.TestCase):
if_uop = UOp(Ops.IF, dtypes.void, (gate_alu,))
gated_alu_store = UOp(Ops.STORE, dtypes.void, (a.index(lidx0, if_uop), val))
sink = UOp(Ops.SINK, dtypes.void, (gated_alu_store,), arg=KernelInfo())
prg = to_program(sink, Device[Device.DEFAULT].renderer)
ret = _test_uop_result([], prg, local_size=[4, 1, 1])[0]
ret = _test_uop_result([], sink, local_size=[4, 1, 1])[0]
np.testing.assert_equal(ret, [0, 1, 1, 1])
@unittest.skipUnless(is_dtype_supported(dtypes.half), "need half")

View file

@ -5,18 +5,19 @@ from tinygrad.tensor import Tensor, _to_np_dtype
from tinygrad.helpers import CI, Context
from tinygrad.dtype import dtypes, DType, AddrSpace, ConstFloat # noqa: F401
from tinygrad.device import Buffer, Device
from tinygrad.uop.ops import Ops, UOp, KernelInfo, AxisType
from tinygrad.uop.ops import Ops, UOp, KernelInfo, AxisType, buffers
from tinygrad.renderer.cstyle import CStyleLanguage
from tinygrad.engine.realize import CompiledRunner, run_linear
from tinygrad.engine.realize import run_linear
from tinygrad.codegen import to_program
from tinygrad.device import is_dtype_supported
from tinygrad.codegen.opt import Opt, OptOps
from tinygrad.renderer.ptx import PTXRenderer
from test.helpers import to_uops_list
def _uops_to_prg(uops_list):
prg = to_program(UOp.sink(*uops_list, arg=KernelInfo()), Device[Device.DEFAULT].renderer)
return CompiledRunner(prg, Device.DEFAULT)
def run_uops(uops_list:list[UOp], bufs:list[Buffer]):
buf_uops = [UOp.new_buffer(b.device, b.size, b.dtype) for b in bufs]
for u,b in zip(buf_uops, bufs): buffers[u] = b
run_linear(UOp(Ops.LINEAR, src=(UOp.sink(*uops_list, arg=KernelInfo()).call(*buf_uops),)))
def uop(uops:list[UOp], op:Ops, dtype:Optional[DType], src:tuple[UOp, ...], arg:Any=None) -> UOp:
if op is Ops.CONST: uops.append(UOp.const(dtype, arg))
@ -33,8 +34,7 @@ def _test_single_value(vals, op, dts):
out = uop(uops, Ops.STORE, dtypes.void, (buf_store.index(uop(uops, Ops.CONST, dtypes.int32, (), 0), ptr=True), alu))
buf = Buffer(Device.DEFAULT, 1, output_dtype).allocate()
buf2 = [Buffer(Device.DEFAULT, 1, dtype).allocate().copyin(np.array([a], dtype=_to_np_dtype(dtype)).data) for a,dtype in zip(vals, dts)]
prg = _uops_to_prg([out])
prg.exec([buf]+buf2)
run_uops([out], [buf]+buf2)
ret = np.empty(1, _to_np_dtype(output_dtype))
buf.copyout(ret.data)
return ret[0]
@ -47,8 +47,7 @@ def _test_single_value_const(vals, op, dts):
alu = uop(uops, op, output_dtype, loads)
out = buf_store[UOp.const(dtypes.int32, 0)].store(alu)
buf = Buffer(Device.DEFAULT, 1, output_dtype).allocate()
prg = _uops_to_prg([out])
prg.exec([buf])
run_uops([out], [buf])
ret = np.empty(1, _to_np_dtype(output_dtype))
buf.copyout(ret.data)
return ret[0]
@ -59,8 +58,7 @@ def _test_uops_result(output_dtype, uops, res):
# res = output_fn(uops)
out = uop(uops, Ops.STORE, dtypes.void, (buf_store.index(uop(uops, Ops.CONST, dtypes.int32, (), 0)), res))
buf = Buffer(Device.DEFAULT, 1, output_dtype).allocate()
prg = _uops_to_prg([out])
prg.exec([buf])
run_uops([out], [buf])
ret = np.empty(1, _to_np_dtype(output_dtype))
buf.copyout(ret.data)
return ret[0]

View file

@ -3,7 +3,7 @@ from dataclasses import replace
from tinygrad import dtypes, Device
from tinygrad.uop.ops import UOp, AxisType, Ops, KernelInfo
from tinygrad.codegen.opt import Opt, OptOps # pylint: disable=unused-import
from tinygrad.engine.realize import CompiledRunner
from tinygrad.engine.realize import get_runtime
from tinygrad.codegen import to_program
from tinygrad.helpers import dedup, getenv
from tinygrad.device import Buffer
@ -90,12 +90,13 @@ renderer = Device.default.renderer
allocator = Device.default.allocator
ps = to_program(ast, renderer)
cr = CompiledRunner(ps, Device.DEFAULT)
rt = get_runtime(Device.DEFAULT, ps)
gs = sorted(dedup([u for u in ast.toposort() if u.op is Ops.PARAM]), key=lambda u: u.arg)
# print(len(gs))
# print([g.dtype for g in gs])
bufs = [Buffer(ps.arg.device, g.size, g.dtype if isinstance(g.dtype, ImageDType) else g.dtype._base).ensure_allocated() for g in gs]
t = cr(bufs, wait=True)
gsize, lsize = ps.arg.launch_dims({})
t = rt(*[b._buf for b in bufs], global_size=gsize, local_size=lsize, vals=ps.arg.vals({}), wait=True)
print(f"{t*1e6:.2f} us")

View file

@ -3,12 +3,12 @@ import unittest
from tinygrad import Device, Tensor, dtypes
from tinygrad.tensor import _to_np_dtype
from tinygrad.uop.ops import Ops
from tinygrad.uop.ops import Ops, UOp, buffers
from tinygrad.dtype import DType
from tinygrad.device import is_dtype_supported
from tinygrad.device import Buffer, is_dtype_supported
from tinygrad.helpers import DEV, Context
from test.helpers import slow, replace_opts
from tinygrad.engine.realize import CompiledRunner
from tinygrad.engine.realize import run_linear
from tinygrad.codegen import to_program
from tinygrad.codegen.opt import Opt, OptOps, KernelOptError
from tinygrad.codegen.opt.tc import amd_cdna_1616128
@ -20,6 +20,11 @@ from test.backend.test_linearizer import helper_realized_ast, helper_linearizer_
AMX = "AMX" in DEV.arch
def run_program(prg:UOp, bufs:list[Buffer]):
buf_uops = [UOp.new_buffer(b.device, b.size, b.dtype) for b in bufs]
for u,b in zip(buf_uops, bufs): buffers[u] = b
run_linear(UOp(Ops.LINEAR, src=(prg.call(*buf_uops),)))
def helper_tc_ensure_uops_and_opts_count(N: int, M:int, K:int, dtype_in:DType, dtype_out:DType, axis:int=0, tc_select:int=-1, tc_opt:int=0,
ensure_triggered:bool=True):
a, b = Tensor.rand(M, K, dtype=dtype_in), Tensor.rand(K, N, dtype=dtype_in)
@ -47,11 +52,11 @@ def helper_tc_allclose(N:int, M:int, K:int, dtype_in:DType, dtype_out:DType, axi
if dtype_in == dtypes.bfloat16: r = r.float()
realized_ast, bufs = helper_realized_ast(r)
opts = [Opt(op=OptOps.TC, axis=axis, arg=(tc_select, tc_opt, use_tensor_cores))]
pu = to_program(replace_opts(realized_ast, opts), Device[Device.DEFAULT].renderer)
ast = replace_opts(realized_ast, opts)
pu = to_program(ast, Device[Device.DEFAULT].renderer)
if use_tensor_cores == 1: assert len([uop for uop in pu.src[2].src if uop.op is Ops.WMMA]) > 0, "wmma not triggered"
assert len([x for x in pu.src[0].arg.applied_opts if x.op is OptOps.TC]) == 1, "tensor core opt not included"
prg = CompiledRunner(pu, Device.DEFAULT)
prg.exec(bufs)
run_program(ast, bufs)
if dtype_in == dtypes.half: tc_atol, tc_rtol = 1e-2, 1e-3
elif dtype_in == dtypes.bfloat16: tc_atol, tc_rtol = (1e-1, 2e-2) if dtype_out == dtypes.bfloat16 else (1e-2, 1e-2)
else: tc_atol, tc_rtol = 5e-3, 1e-4
@ -145,15 +150,15 @@ class TestTensorCores(unittest.TestCase):
c = a.conv2d(b, padding=1, dtype=tc.dtype_out)
realized_ast, real_bufs = helper_realized_ast(c)
program = to_program(replace_opts(realized_ast, [Opt(OptOps.TC, axis, (-1, 2, 1))]), Device[Device.DEFAULT].renderer)
ast = replace_opts(realized_ast, [Opt(OptOps.TC, axis, (-1, 2, 1))])
program = to_program(ast, Device[Device.DEFAULT].renderer)
assert len([uop for uop in tuple(program.src[2].src) if uop.op is Ops.WMMA]) > 0, "tensor core not triggered"
assert len([x for x in program.src[0].arg.applied_opts if x.op is OptOps.TC]) == 1, "tensor core opt not included"
prg = CompiledRunner(program, Device.DEFAULT)
# TODO: support this even if numpy doesn't
if _to_np_dtype(real_bufs[0].dtype) is None: continue
real_bufs[0].copyin(np.zeros((real_bufs[0].size, ), dtype=_to_np_dtype(real_bufs[0].dtype)).data) # Zero to check that all values are filled
prg.exec(real_bufs)
run_program(ast, real_bufs)
result = np.frombuffer(real_bufs[0].as_memoryview(), _to_np_dtype(real_bufs[0].dtype))
# ensure the results for each choice of axis matches

View file

@ -1,12 +1,12 @@
import functools, math, time, multiprocessing, traceback, signal, atexit
import math, time, multiprocessing, traceback, signal, atexit
from dataclasses import replace
from tinygrad.uop.ops import sym_infer, AxisType, pyrender, UOp, Ops
from tinygrad.device import Device, Buffer, Compiler
from tinygrad.uop.ops import sym_infer, AxisType, pyrender, UOp
from tinygrad.device import Device, Buffer
from tinygrad.helpers import prod, flatten, DEBUG, CACHELEVEL, diskcache_get, diskcache_put, getenv, Context, colored, time_to_str, unwrap
from tinygrad.helpers import IGNORE_BEAM_CACHE
from tinygrad.codegen.opt import Opt, OptOps, KernelOptError
from tinygrad.tensor import Tensor
from tinygrad.engine.realize import CompiledRunner
from tinygrad.engine.realize import get_runtime
from tinygrad.codegen import to_program
from tinygrad.codegen.opt.postrange import Scheduler
@ -34,25 +34,24 @@ def get_test_global_size(global_size, max_global_size, var_vals):
break
return test_global_size, input_size / prod(test_global_size)
def _time_program(prg:UOp, lib:bytes, var_vals:dict[str, int], rawbufs:list[Buffer], early_stop:float|None=None,
def _time_program(prg:UOp, var_vals:dict[str, int], rawbufs:list[Buffer], early_stop:float|None=None,
allow_test_size:int=True, max_global_size:int|None=65536, clear_l2=False, cnt=3, name="test", dev_timeout=False) -> list[float]:
timeout = int(early_stop * 1e3) if dev_timeout and early_stop is not None and early_stop < math.inf else None
factor = 1
info = prg.arg
if allow_test_size and max_global_size is not None:
global_size, factor = get_test_global_size(info.global_size, max_global_size, var_vals)
prg = prg.replace(arg=replace(info, global_size=tuple(global_size)))
if len(prg.src) <= 4 or prg.src[4].op is not Ops.BINARY: prg = prg.replace(src=prg.src + (UOp(Ops.BINARY, arg=lib),))
try: car = CompiledRunner(prg, prg.src[1].arg)
global_size, factor = get_test_global_size(prg.arg.global_size, max_global_size, var_vals)
prg = prg.replace(arg=replace(prg.arg, global_size=tuple(global_size)))
try: rt = get_runtime(prg.src[1].arg, prg)
except AssertionError: return [math.inf] * cnt
global_size, local_size = prg.arg.launch_dims(var_vals)
bufs = [rawbufs[i]._buf for i in prg.arg.globals]
tms = []
input_bufs = [rawbufs[i] for i in car.p.globals]
for _ in range(cnt):
if clear_l2:
if hasattr(dev:=Device[prg.src[1].arg], 'invalidate_caches'): dev.invalidate_caches()
else:
with Context(DEBUG=0, BEAM=0, CAPTURING=0, TRACK_MATCH_STATS=0): Tensor.ones(1024,1024).contiguous().realize(do_update_stats=False)
tms.append(unwrap(car(input_bufs, var_vals, wait=True, timeout=timeout))*factor)
tms.append(unwrap(rt(*bufs, global_size=global_size, local_size=local_size, vals=prg.arg.vals(var_vals), wait=True, timeout=timeout))*factor)
if early_stop is not None and early_stop < min(tms): break
return tms
@ -61,22 +60,21 @@ def timeout_handler(signum, frame):
if DEBUG >= 2: print("*** BEAM COMPILE TIMEOUT")
raise TimeoutException()
def _try_compile(x:tuple[int,Scheduler], compiler:Compiler) -> tuple[int, tuple[UOp, bytes, float]|None]:
def _try_compile(x:tuple[int,Scheduler]) -> tuple[int, tuple[UOp, float]|None]:
if hasattr(signal, "alarm"):
signal.signal(getattr(signal, 'SIGALRM'), timeout_handler)
# set timeout
signal.alarm(getenv("BEAM_TIMEOUT_SEC", 10))
ret = None
try:
st = time.perf_counter()
prg = to_program(x[1].copy().get_optimized_ast(name_override="test"), x[1].ren)
et = time.perf_counter() - st
uops = prg.src[2].src
if len(uops) >= (uops_max:=getenv("BEAM_UOPS_MAX", 3000)) > 0:
if getenv("BEAM_LOG_SURPASS_MAX"): print(f"too many uops. {len(uops)=}, {uops_max=}")
raise RuntimeError("too many uops")
st = time.perf_counter()
prog = prg.src[4].arg if len(prg.src) > 4 and prg.src[4].op is Ops.BINARY else compiler.compile(prg.src[3].arg)
et = time.perf_counter() - st
ret = (prg, prog, et)
ret = (prg, et)
except RuntimeError:
if DEBUG >= 4: traceback.print_exc()
except Exception as e:
@ -150,12 +148,11 @@ def beam_search(s:Scheduler, rawbufs:list[Buffer], amt:int, allow_test_size=True
while not exiting:
candidates: list[Scheduler] = flatten([get_kernel_actions(si, include_0=False).values() for si,_ in beam])
timed: list[tuple[Scheduler, float]] = []
_compile_fn = functools.partial(_try_compile, compiler=dev.compiler)
least_compute_ops = math.inf
for i,proc in (map(_compile_fn, enumerate(candidates)) if beam_pool is None else beam_pool.imap_unordered(_compile_fn, enumerate(candidates))):
for i, proc in ((map if beam_pool is None else beam_pool.imap_unordered)(_try_compile, enumerate(candidates))):
if proc is None: continue
prg, lib, compile_et = proc
if lib in seen_libs: continue
prg, compile_et = proc
if (lib:=prg.src[4].arg) in seen_libs: continue
# filter out kernels that use 1000x more compute than the smallest
estimates = prg.src[0].arg.estimates
least_compute_ops = min(this_compute_ops:=sym_infer(estimates.ops if estimates is not None else 0, var_vals), least_compute_ops)
@ -163,7 +160,7 @@ def beam_search(s:Scheduler, rawbufs:list[Buffer], amt:int, allow_test_size=True
if getenv("BEAM_LOG_SURPASS_MAX"): print(f"too much compute. {this_compute_ops} when least is {least_compute_ops}")
continue
seen_libs.add(lib)
try: tms = _time_program(prg, lib, var_vals, rawbufs, early_stop=beam[0][1]*3 if len(beam) else 1.0,
try: tms = _time_program(prg, var_vals, rawbufs, early_stop=beam[0][1]*3 if len(beam) else 1.0,
allow_test_size=allow_test_size, clear_l2=hasattr(dev, 'invalidate_caches'),
dev_timeout=getenv("BEAM_DEV_TIMEOUT", 1))
except Exception as e:

View file

@ -1,8 +1,8 @@
from typing import cast, Iterator, Any
import time, random, itertools, math, contextlib, weakref
from dataclasses import dataclass, replace, field
from tinygrad.helpers import colored, DEBUG, GlobalCounters, ansilen, all_int, Metadata, TRACEMETA, TracingKey, prod, flatten
from tinygrad.helpers import BEAM, size_to_str, time_to_str, VALIDATE_WITH_CPU, cpu_profile, PROFILE, ProfilePointEvent, cpu_events
from tinygrad.helpers import colored, DEBUG, GlobalCounters, ansilen, all_int, Metadata, TRACEMETA, prod, flatten
from tinygrad.helpers import BEAM, size_to_str, time_to_str, VALIDATE_WITH_CPU, PROFILE, ProfilePointEvent, cpu_events
from tinygrad.dtype import dtypes
from tinygrad.uop.ops import Ops, PatternMatcher, UOp, UPat, sym_infer, buffers, graph_rewrite, ProgramInfo
from tinygrad.device import Device, Buffer, MultiBuffer
@ -93,28 +93,6 @@ def optimize_local_size(call:UOp, prg:UOp) -> UOp|None:
new_global = tuple(g//l if g%l == 0 else g/l for g,l in zip(prg.arg.global_size, local_size))
return call.replace(src=(prg.replace(arg=replace(prg.arg, global_size=new_global, local_size=local_size)), *call.src[1:]))
class CompiledRunner(Runner):
def __init__(self, prg:UOp, device:str):
info: ProgramInfo = prg.arg
sink = prg.src[0]
if DEBUG >= 3 and sink.arg.applied_opts: print(sink.arg.applied_opts)
if DEBUG >= 4: print(prg.src[3].arg)
if len(prg.src) <= 4 or prg.src[4].op is not Ops.BINARY:
with cpu_profile(TracingKey(f"compile {info.name}", (info.function_name,)), "TINY"):
lib = Device[device].compiler.compile_cached(prg.src[3].arg)
prg = prg.replace(src=prg.src + (UOp(Ops.BINARY, arg=lib),))
self.prg:UOp = prg
self.p:ProgramInfo = info
if DEBUG >= 7: Device[device].compiler.disassemble(prg.src[4].arg)
self._prg = Device[device].runtime(info.function_name, prg.src[4].arg, *info.aux, runtimevars=info.runtimevars)
super().__init__(info.name, device, sink.arg.estimates or Estimates())
def __call__(self, rawbufs:list[Buffer], var_vals:dict[str, int]|None=None, wait=False, timeout:int|None=None) -> float|None:
if var_vals is None: var_vals = {}
global_size, local_size = self.p.launch_dims(var_vals)
return self._prg(*[x._buf for x in rawbufs], global_size=tuple(global_size), local_size=tuple(local_size) if local_size else None,
vals=tuple(var_vals[k.expr] if k.expr not in self.p.runtimevars else None for k in self.p.vars), wait=wait, timeout=timeout)
# **************** method cache ****************
runtime_cache: dict[tuple[bytes, str], Any] = {}