mirror of
https://github.com/tinygrad/tinygrad.git
synced 2026-06-24 02:14:17 +00:00
remove CompiledRunner (#15970)
* rm usage of CompiledRunner * more tests * last * linter * sink * remove * linter
This commit is contained in:
parent
0080489abe
commit
dfd2d07005
8 changed files with 77 additions and 107 deletions
|
|
@ -4,7 +4,8 @@ import triton.language as tl
|
|||
from triton.compiler import AttrsDescriptor, ASTSource, compile as triton_compile
|
||||
import numpy as np
|
||||
from tinygrad import Tensor, dtypes, Device
|
||||
from tinygrad.engine.realize import CompiledRunner
|
||||
from tinygrad.engine.realize import get_runtime
|
||||
from tinygrad.codegen import to_program
|
||||
from tinygrad.uop.ops import Ops, UOp, KernelInfo, ProgramInfo
|
||||
from tinygrad.helpers import getenv
|
||||
np.set_printoptions(suppress=True)
|
||||
|
|
@ -92,13 +93,15 @@ if __name__ == "__main__":
|
|||
info = ProgramInfo(name="matmul_kernel",
|
||||
global_size=(M//BLOCK_SIZE_M, N//BLOCK_SIZE_N, 1), local_size=(32*compiled.metadata.num_warps, 1, 1))
|
||||
sink = UOp.sink(arg=KernelInfo(name="matmul_kernel"))
|
||||
prg_uop = UOp(Ops.PROGRAM, src=(sink, UOp(Ops.DEVICE, arg=Device.DEFAULT), UOp(Ops.LINEAR), UOp(Ops.SOURCE, arg=src)), arg=info)
|
||||
runner = CompiledRunner(prg_uop, Device.DEFAULT)
|
||||
prg_uop = to_program(UOp(Ops.PROGRAM, src=(sink, UOp(Ops.DEVICE, arg=Device.DEFAULT), UOp(Ops.LINEAR), UOp(Ops.SOURCE, arg=src)), arg=info),
|
||||
Device.default.renderer)
|
||||
rt = get_runtime(Device.DEFAULT, prg_uop)
|
||||
all_bufs = [x.ensure_allocated() for x in bufs]
|
||||
prg_bufs = [all_bufs[i] for i in runner.p.globals]
|
||||
prg_bufs = [all_bufs[i] for i in info.globals]
|
||||
gsize, lsize = info.launch_dims({})
|
||||
tflops = []
|
||||
for i in range(5):
|
||||
tm = runner(prg_bufs, {}, wait=True)
|
||||
tm = rt(*[b._buf for b in prg_bufs], global_size=gsize, local_size=lsize, vals=info.vals({}), wait=True)
|
||||
tflops.append((2*M*K*N/tm)*1e-12)
|
||||
print(f"TFLOPS: {max(tflops):.2f}")
|
||||
|
||||
|
|
|
|||
|
|
@ -2,10 +2,10 @@ import numpy as np
|
|||
import unittest
|
||||
|
||||
from tinygrad.codegen.opt import Opt, OptOps
|
||||
from tinygrad.uop.ops import UOp, Ops, GroupOp, AxisType
|
||||
from tinygrad.uop.ops import UOp, Ops, GroupOp, AxisType, buffers
|
||||
from tinygrad.device import Device, Buffer, is_dtype_supported
|
||||
from tinygrad.tensor import Tensor, _to_np_dtype
|
||||
from tinygrad.engine.realize import run_linear, CompiledRunner
|
||||
from tinygrad.engine.realize import run_linear
|
||||
from tinygrad.codegen import to_program
|
||||
from tinygrad.helpers import Context, flatten, dedup, TC_SELECT, TC_OPT, DEV
|
||||
from tinygrad.dtype import DType, dtypes, PtrDType, AddrSpace
|
||||
|
|
@ -424,30 +424,28 @@ def reset_bufs(bufs:list[Buffer]):
|
|||
def _helper_linearizer_opt_ast(realized_ast:UOp, real_bufs:list[Buffer], opts=[],
|
||||
apply_tc=False, atol=1e-4, rtol=1e-4, color_sizes=[], wanna_output=[]):
|
||||
outbufs = real_bufs[:len(realized_ast.src)]
|
||||
device = real_bufs[0].device
|
||||
wanna_output = [np.array(x).flatten() for x in wanna_output]
|
||||
buf_uops = [UOp.new_buffer(b.device, b.size, b.dtype) for b in real_bufs]
|
||||
for u,b in zip(buf_uops, real_bufs): buffers[u] = b
|
||||
|
||||
def get_prg(opts):
|
||||
def run_prg(opts):
|
||||
ast = realized_ast if opts is None else replace_opts(realized_ast, list(opts))
|
||||
return CompiledRunner(to_program(ast, renderer=Device[Device.DEFAULT].renderer), device)
|
||||
run_linear(UOp(Ops.LINEAR, src=(ast.call(*buf_uops),)))
|
||||
|
||||
def check_opt(opts):
|
||||
prg = get_prg(opts=opts)
|
||||
reset_bufs(outbufs)
|
||||
prg.exec(real_bufs)
|
||||
run_prg(opts)
|
||||
for x,want in zip(copyout_outputs(outbufs), wanna_output): np.testing.assert_allclose(x, want, atol=atol, rtol=rtol)
|
||||
|
||||
# Get baseline if it is not provided, which is not optimized at all.
|
||||
prg = get_prg(opts=())
|
||||
prg.exec(real_bufs)
|
||||
run_prg(opts=())
|
||||
if len(wanna_output) == 0: wanna_output = copyout_outputs(outbufs)
|
||||
else:
|
||||
for buf,want in zip(copyout_outputs(outbufs), wanna_output): np.testing.assert_allclose(buf, want, atol=atol, rtol=rtol)
|
||||
|
||||
# Check correctness of handcoded optimiztions.
|
||||
prg = get_prg(opts=None)
|
||||
reset_bufs(outbufs)
|
||||
prg.exec(real_bufs)
|
||||
run_prg(opts=None)
|
||||
for buf,want in zip(copyout_outputs(outbufs), wanna_output): np.testing.assert_allclose(buf, want, atol=atol, rtol=rtol)
|
||||
for x in opts: # Check custom transformations if any.
|
||||
check_opt(([Opt(OptOps.TC, 0, (TC_SELECT.value, TC_OPT.value, 1))] if apply_tc else [])+x)
|
||||
|
|
|
|||
|
|
@ -1,9 +1,8 @@
|
|||
import unittest
|
||||
import numpy as np
|
||||
from dataclasses import replace
|
||||
from tinygrad.device import Buffer, Device, is_dtype_supported
|
||||
from tinygrad.device import Device, is_dtype_supported
|
||||
from tinygrad.dtype import dtypes, ConstType
|
||||
from tinygrad.engine.realize import CompiledRunner
|
||||
from tinygrad.engine.realize import run_linear
|
||||
from tinygrad.codegen import to_program
|
||||
from tinygrad.helpers import prod
|
||||
from tinygrad.renderer.cstyle import CStyleLanguage
|
||||
|
|
@ -13,17 +12,13 @@ from tinygrad.runtime.ops_python import PythonRenderer
|
|||
from tinygrad.uop.ops import UOp, Ops, KernelInfo, python_alu
|
||||
from tinygrad.tensor import Tensor, _to_np_dtype
|
||||
|
||||
def _test_uop_result(inputs:list[Tensor], prg:UOp, local_size=None):
|
||||
def _test_uop_result(inputs:list[Tensor], sink:UOp, local_size=None):
|
||||
for x in inputs: x.realize()
|
||||
uops = prg.src[2].src
|
||||
outbufs = [Buffer(Device.DEFAULT, sz:=(1 if local_size is None else prod(local_size)), (dtype:=u.src[1].dtype), \
|
||||
initial_value=np.zeros(sz, dtype=_to_np_dtype(dtype)).data) for u in uops if u.op is Ops.STORE]
|
||||
inbufs = [x.uop.base.buffer for x in inputs]
|
||||
info = prg.arg
|
||||
if local_size is not None: info = replace(info, local_size=tuple(local_size))
|
||||
ei = CompiledRunner(prg.replace(arg=info), Device.DEFAULT)
|
||||
ei.exec(outbufs+inbufs)
|
||||
return [np.frombuffer(x.as_memoryview(), _to_np_dtype(x.dtype)) for x in outbufs]
|
||||
sz = 1 if local_size is None else prod(local_size)
|
||||
outs = [UOp.new_buffer(Device.DEFAULT, sz, u.src[1].dtype) for u in sink.src if u.op is Ops.STORE]
|
||||
for u in outs: u.buffer.allocate().copyin(np.zeros(sz, dtype=_to_np_dtype(u.dtype)).data)
|
||||
run_linear(UOp(Ops.LINEAR, src=(sink.call(*outs, *(x.uop.base for x in inputs)),)))
|
||||
return [u.buffer.numpy() for u in outs]
|
||||
|
||||
def _setup_and_test_alu(alu_op:Ops, input_val:ConstType, *alu_src_uops:UOp):
|
||||
dtype = alu_src_uops[0].dtype
|
||||
|
|
@ -33,9 +28,7 @@ def _setup_and_test_alu(alu_op:Ops, input_val:ConstType, *alu_src_uops:UOp):
|
|||
ld = b.index(idx)
|
||||
alu = ld.alu(alu_op, *alu_src_uops)
|
||||
store = UOp.store(a.index(idx), alu)
|
||||
sink = UOp(Ops.SINK, dtypes.void, (store,), arg=KernelInfo())
|
||||
prg = to_program(sink, Device[Device.DEFAULT].renderer)
|
||||
return _test_uop_result([Tensor([input_val])], prg)[0]
|
||||
return _test_uop_result([Tensor([input_val])], UOp(Ops.SINK, dtypes.void, (store,), arg=KernelInfo()))[0]
|
||||
|
||||
class TestRendererFailures(unittest.TestCase):
|
||||
@unittest.skipIf(not isinstance(Device[Device.DEFAULT].renderer, (PTXRenderer, PythonRenderer)), "test is for ptx or python renderer")
|
||||
|
|
@ -44,8 +37,7 @@ class TestRendererFailures(unittest.TestCase):
|
|||
gate_alu = (lidx0:=UOp(Ops.SPECIAL, dtypes.int, (UOp.const(dtypes.int, 4),), 'lidx0')).ne(0)
|
||||
gated_alu_store = UOp(Ops.STORE, dtypes.void, (a.index(lidx0.valid(gate_alu)), UOp.const(dtypes.int, 1)))
|
||||
sink = UOp(Ops.SINK, dtypes.void, (gated_alu_store,), arg=KernelInfo())
|
||||
prg = to_program(sink, Device[Device.DEFAULT].renderer)
|
||||
ret = _test_uop_result([], prg, local_size=[4, 1, 1])[0]
|
||||
ret = _test_uop_result([], sink, local_size=[4, 1, 1])[0]
|
||||
np.testing.assert_equal(ret, [0, 1, 1, 1])
|
||||
|
||||
@unittest.skipIf(not isinstance(Device[Device.DEFAULT].renderer, (PTXRenderer, PythonRenderer)), "test is for ptx or python renderer")
|
||||
|
|
@ -55,8 +47,7 @@ class TestRendererFailures(unittest.TestCase):
|
|||
gate_alu_1 = (lidx1:=UOp(Ops.SPECIAL, dtypes.int, (UOp.const(dtypes.int, 2),), 'lidx1')).ne(0)
|
||||
gated_alu_store = UOp(Ops.STORE, dtypes.void, (a.index((lidx0+lidx1*4).valid(gate_alu_0&gate_alu_1)), UOp.const(dtypes.int, 1)))
|
||||
sink = UOp(Ops.SINK, dtypes.void, (gated_alu_store,), arg=KernelInfo())
|
||||
prg = to_program(sink, Device[Device.DEFAULT].renderer)
|
||||
ret = _test_uop_result([], prg, local_size=[4, 2, 1])[0]
|
||||
ret = _test_uop_result([], sink, local_size=[4, 2, 1])[0]
|
||||
np.testing.assert_equal(ret, [0, 0, 0, 0, 0, 1, 1, 1])
|
||||
|
||||
@unittest.skipIf(not isinstance(Device[Device.DEFAULT].renderer, CStyleLanguage), "uops are for cstyle")
|
||||
|
|
@ -102,8 +93,7 @@ class TestPTXFailures(unittest.TestCase):
|
|||
if_uop = UOp(Ops.IF, dtypes.void, (gate_alu,))
|
||||
gated_alu_store = UOp(Ops.STORE, dtypes.void, (a.index(lidx0, if_uop), val))
|
||||
sink = UOp(Ops.SINK, dtypes.void, (gated_alu_store,), arg=KernelInfo())
|
||||
prg = to_program(sink, Device[Device.DEFAULT].renderer)
|
||||
ret = _test_uop_result([], prg, local_size=[4, 1, 1])[0]
|
||||
ret = _test_uop_result([], sink, local_size=[4, 1, 1])[0]
|
||||
np.testing.assert_equal(ret, [0, 1, 1, 1])
|
||||
|
||||
@unittest.skipUnless(is_dtype_supported(dtypes.half), "need half")
|
||||
|
|
|
|||
|
|
@ -5,18 +5,19 @@ from tinygrad.tensor import Tensor, _to_np_dtype
|
|||
from tinygrad.helpers import CI, Context
|
||||
from tinygrad.dtype import dtypes, DType, AddrSpace, ConstFloat # noqa: F401
|
||||
from tinygrad.device import Buffer, Device
|
||||
from tinygrad.uop.ops import Ops, UOp, KernelInfo, AxisType
|
||||
from tinygrad.uop.ops import Ops, UOp, KernelInfo, AxisType, buffers
|
||||
from tinygrad.renderer.cstyle import CStyleLanguage
|
||||
from tinygrad.engine.realize import CompiledRunner, run_linear
|
||||
from tinygrad.engine.realize import run_linear
|
||||
from tinygrad.codegen import to_program
|
||||
from tinygrad.device import is_dtype_supported
|
||||
from tinygrad.codegen.opt import Opt, OptOps
|
||||
from tinygrad.renderer.ptx import PTXRenderer
|
||||
from test.helpers import to_uops_list
|
||||
|
||||
def _uops_to_prg(uops_list):
|
||||
prg = to_program(UOp.sink(*uops_list, arg=KernelInfo()), Device[Device.DEFAULT].renderer)
|
||||
return CompiledRunner(prg, Device.DEFAULT)
|
||||
def run_uops(uops_list:list[UOp], bufs:list[Buffer]):
|
||||
buf_uops = [UOp.new_buffer(b.device, b.size, b.dtype) for b in bufs]
|
||||
for u,b in zip(buf_uops, bufs): buffers[u] = b
|
||||
run_linear(UOp(Ops.LINEAR, src=(UOp.sink(*uops_list, arg=KernelInfo()).call(*buf_uops),)))
|
||||
|
||||
def uop(uops:list[UOp], op:Ops, dtype:Optional[DType], src:tuple[UOp, ...], arg:Any=None) -> UOp:
|
||||
if op is Ops.CONST: uops.append(UOp.const(dtype, arg))
|
||||
|
|
@ -33,8 +34,7 @@ def _test_single_value(vals, op, dts):
|
|||
out = uop(uops, Ops.STORE, dtypes.void, (buf_store.index(uop(uops, Ops.CONST, dtypes.int32, (), 0), ptr=True), alu))
|
||||
buf = Buffer(Device.DEFAULT, 1, output_dtype).allocate()
|
||||
buf2 = [Buffer(Device.DEFAULT, 1, dtype).allocate().copyin(np.array([a], dtype=_to_np_dtype(dtype)).data) for a,dtype in zip(vals, dts)]
|
||||
prg = _uops_to_prg([out])
|
||||
prg.exec([buf]+buf2)
|
||||
run_uops([out], [buf]+buf2)
|
||||
ret = np.empty(1, _to_np_dtype(output_dtype))
|
||||
buf.copyout(ret.data)
|
||||
return ret[0]
|
||||
|
|
@ -47,8 +47,7 @@ def _test_single_value_const(vals, op, dts):
|
|||
alu = uop(uops, op, output_dtype, loads)
|
||||
out = buf_store[UOp.const(dtypes.int32, 0)].store(alu)
|
||||
buf = Buffer(Device.DEFAULT, 1, output_dtype).allocate()
|
||||
prg = _uops_to_prg([out])
|
||||
prg.exec([buf])
|
||||
run_uops([out], [buf])
|
||||
ret = np.empty(1, _to_np_dtype(output_dtype))
|
||||
buf.copyout(ret.data)
|
||||
return ret[0]
|
||||
|
|
@ -59,8 +58,7 @@ def _test_uops_result(output_dtype, uops, res):
|
|||
# res = output_fn(uops)
|
||||
out = uop(uops, Ops.STORE, dtypes.void, (buf_store.index(uop(uops, Ops.CONST, dtypes.int32, (), 0)), res))
|
||||
buf = Buffer(Device.DEFAULT, 1, output_dtype).allocate()
|
||||
prg = _uops_to_prg([out])
|
||||
prg.exec([buf])
|
||||
run_uops([out], [buf])
|
||||
ret = np.empty(1, _to_np_dtype(output_dtype))
|
||||
buf.copyout(ret.data)
|
||||
return ret[0]
|
||||
|
|
|
|||
7
test/external/external_benchmark_op_conv.py
vendored
7
test/external/external_benchmark_op_conv.py
vendored
|
|
@ -3,7 +3,7 @@ from dataclasses import replace
|
|||
from tinygrad import dtypes, Device
|
||||
from tinygrad.uop.ops import UOp, AxisType, Ops, KernelInfo
|
||||
from tinygrad.codegen.opt import Opt, OptOps # pylint: disable=unused-import
|
||||
from tinygrad.engine.realize import CompiledRunner
|
||||
from tinygrad.engine.realize import get_runtime
|
||||
from tinygrad.codegen import to_program
|
||||
from tinygrad.helpers import dedup, getenv
|
||||
from tinygrad.device import Buffer
|
||||
|
|
@ -90,12 +90,13 @@ renderer = Device.default.renderer
|
|||
allocator = Device.default.allocator
|
||||
|
||||
ps = to_program(ast, renderer)
|
||||
cr = CompiledRunner(ps, Device.DEFAULT)
|
||||
rt = get_runtime(Device.DEFAULT, ps)
|
||||
|
||||
gs = sorted(dedup([u for u in ast.toposort() if u.op is Ops.PARAM]), key=lambda u: u.arg)
|
||||
# print(len(gs))
|
||||
# print([g.dtype for g in gs])
|
||||
bufs = [Buffer(ps.arg.device, g.size, g.dtype if isinstance(g.dtype, ImageDType) else g.dtype._base).ensure_allocated() for g in gs]
|
||||
|
||||
t = cr(bufs, wait=True)
|
||||
gsize, lsize = ps.arg.launch_dims({})
|
||||
t = rt(*[b._buf for b in bufs], global_size=gsize, local_size=lsize, vals=ps.arg.vals({}), wait=True)
|
||||
print(f"{t*1e6:.2f} us")
|
||||
|
|
@ -3,12 +3,12 @@ import unittest
|
|||
|
||||
from tinygrad import Device, Tensor, dtypes
|
||||
from tinygrad.tensor import _to_np_dtype
|
||||
from tinygrad.uop.ops import Ops
|
||||
from tinygrad.uop.ops import Ops, UOp, buffers
|
||||
from tinygrad.dtype import DType
|
||||
from tinygrad.device import is_dtype_supported
|
||||
from tinygrad.device import Buffer, is_dtype_supported
|
||||
from tinygrad.helpers import DEV, Context
|
||||
from test.helpers import slow, replace_opts
|
||||
from tinygrad.engine.realize import CompiledRunner
|
||||
from tinygrad.engine.realize import run_linear
|
||||
from tinygrad.codegen import to_program
|
||||
from tinygrad.codegen.opt import Opt, OptOps, KernelOptError
|
||||
from tinygrad.codegen.opt.tc import amd_cdna_1616128
|
||||
|
|
@ -20,6 +20,11 @@ from test.backend.test_linearizer import helper_realized_ast, helper_linearizer_
|
|||
|
||||
AMX = "AMX" in DEV.arch
|
||||
|
||||
def run_program(prg:UOp, bufs:list[Buffer]):
|
||||
buf_uops = [UOp.new_buffer(b.device, b.size, b.dtype) for b in bufs]
|
||||
for u,b in zip(buf_uops, bufs): buffers[u] = b
|
||||
run_linear(UOp(Ops.LINEAR, src=(prg.call(*buf_uops),)))
|
||||
|
||||
def helper_tc_ensure_uops_and_opts_count(N: int, M:int, K:int, dtype_in:DType, dtype_out:DType, axis:int=0, tc_select:int=-1, tc_opt:int=0,
|
||||
ensure_triggered:bool=True):
|
||||
a, b = Tensor.rand(M, K, dtype=dtype_in), Tensor.rand(K, N, dtype=dtype_in)
|
||||
|
|
@ -47,11 +52,11 @@ def helper_tc_allclose(N:int, M:int, K:int, dtype_in:DType, dtype_out:DType, axi
|
|||
if dtype_in == dtypes.bfloat16: r = r.float()
|
||||
realized_ast, bufs = helper_realized_ast(r)
|
||||
opts = [Opt(op=OptOps.TC, axis=axis, arg=(tc_select, tc_opt, use_tensor_cores))]
|
||||
pu = to_program(replace_opts(realized_ast, opts), Device[Device.DEFAULT].renderer)
|
||||
ast = replace_opts(realized_ast, opts)
|
||||
pu = to_program(ast, Device[Device.DEFAULT].renderer)
|
||||
if use_tensor_cores == 1: assert len([uop for uop in pu.src[2].src if uop.op is Ops.WMMA]) > 0, "wmma not triggered"
|
||||
assert len([x for x in pu.src[0].arg.applied_opts if x.op is OptOps.TC]) == 1, "tensor core opt not included"
|
||||
prg = CompiledRunner(pu, Device.DEFAULT)
|
||||
prg.exec(bufs)
|
||||
run_program(ast, bufs)
|
||||
if dtype_in == dtypes.half: tc_atol, tc_rtol = 1e-2, 1e-3
|
||||
elif dtype_in == dtypes.bfloat16: tc_atol, tc_rtol = (1e-1, 2e-2) if dtype_out == dtypes.bfloat16 else (1e-2, 1e-2)
|
||||
else: tc_atol, tc_rtol = 5e-3, 1e-4
|
||||
|
|
@ -145,15 +150,15 @@ class TestTensorCores(unittest.TestCase):
|
|||
c = a.conv2d(b, padding=1, dtype=tc.dtype_out)
|
||||
realized_ast, real_bufs = helper_realized_ast(c)
|
||||
|
||||
program = to_program(replace_opts(realized_ast, [Opt(OptOps.TC, axis, (-1, 2, 1))]), Device[Device.DEFAULT].renderer)
|
||||
ast = replace_opts(realized_ast, [Opt(OptOps.TC, axis, (-1, 2, 1))])
|
||||
program = to_program(ast, Device[Device.DEFAULT].renderer)
|
||||
assert len([uop for uop in tuple(program.src[2].src) if uop.op is Ops.WMMA]) > 0, "tensor core not triggered"
|
||||
assert len([x for x in program.src[0].arg.applied_opts if x.op is OptOps.TC]) == 1, "tensor core opt not included"
|
||||
|
||||
prg = CompiledRunner(program, Device.DEFAULT)
|
||||
# TODO: support this even if numpy doesn't
|
||||
if _to_np_dtype(real_bufs[0].dtype) is None: continue
|
||||
real_bufs[0].copyin(np.zeros((real_bufs[0].size, ), dtype=_to_np_dtype(real_bufs[0].dtype)).data) # Zero to check that all values are filled
|
||||
prg.exec(real_bufs)
|
||||
run_program(ast, real_bufs)
|
||||
result = np.frombuffer(real_bufs[0].as_memoryview(), _to_np_dtype(real_bufs[0].dtype))
|
||||
|
||||
# ensure the results for each choice of axis matches
|
||||
|
|
|
|||
|
|
@ -1,12 +1,12 @@
|
|||
import functools, math, time, multiprocessing, traceback, signal, atexit
|
||||
import math, time, multiprocessing, traceback, signal, atexit
|
||||
from dataclasses import replace
|
||||
from tinygrad.uop.ops import sym_infer, AxisType, pyrender, UOp, Ops
|
||||
from tinygrad.device import Device, Buffer, Compiler
|
||||
from tinygrad.uop.ops import sym_infer, AxisType, pyrender, UOp
|
||||
from tinygrad.device import Device, Buffer
|
||||
from tinygrad.helpers import prod, flatten, DEBUG, CACHELEVEL, diskcache_get, diskcache_put, getenv, Context, colored, time_to_str, unwrap
|
||||
from tinygrad.helpers import IGNORE_BEAM_CACHE
|
||||
from tinygrad.codegen.opt import Opt, OptOps, KernelOptError
|
||||
from tinygrad.tensor import Tensor
|
||||
from tinygrad.engine.realize import CompiledRunner
|
||||
from tinygrad.engine.realize import get_runtime
|
||||
from tinygrad.codegen import to_program
|
||||
from tinygrad.codegen.opt.postrange import Scheduler
|
||||
|
||||
|
|
@ -34,25 +34,24 @@ def get_test_global_size(global_size, max_global_size, var_vals):
|
|||
break
|
||||
return test_global_size, input_size / prod(test_global_size)
|
||||
|
||||
def _time_program(prg:UOp, lib:bytes, var_vals:dict[str, int], rawbufs:list[Buffer], early_stop:float|None=None,
|
||||
def _time_program(prg:UOp, var_vals:dict[str, int], rawbufs:list[Buffer], early_stop:float|None=None,
|
||||
allow_test_size:int=True, max_global_size:int|None=65536, clear_l2=False, cnt=3, name="test", dev_timeout=False) -> list[float]:
|
||||
timeout = int(early_stop * 1e3) if dev_timeout and early_stop is not None and early_stop < math.inf else None
|
||||
factor = 1
|
||||
info = prg.arg
|
||||
if allow_test_size and max_global_size is not None:
|
||||
global_size, factor = get_test_global_size(info.global_size, max_global_size, var_vals)
|
||||
prg = prg.replace(arg=replace(info, global_size=tuple(global_size)))
|
||||
if len(prg.src) <= 4 or prg.src[4].op is not Ops.BINARY: prg = prg.replace(src=prg.src + (UOp(Ops.BINARY, arg=lib),))
|
||||
try: car = CompiledRunner(prg, prg.src[1].arg)
|
||||
global_size, factor = get_test_global_size(prg.arg.global_size, max_global_size, var_vals)
|
||||
prg = prg.replace(arg=replace(prg.arg, global_size=tuple(global_size)))
|
||||
try: rt = get_runtime(prg.src[1].arg, prg)
|
||||
except AssertionError: return [math.inf] * cnt
|
||||
global_size, local_size = prg.arg.launch_dims(var_vals)
|
||||
bufs = [rawbufs[i]._buf for i in prg.arg.globals]
|
||||
tms = []
|
||||
input_bufs = [rawbufs[i] for i in car.p.globals]
|
||||
for _ in range(cnt):
|
||||
if clear_l2:
|
||||
if hasattr(dev:=Device[prg.src[1].arg], 'invalidate_caches'): dev.invalidate_caches()
|
||||
else:
|
||||
with Context(DEBUG=0, BEAM=0, CAPTURING=0, TRACK_MATCH_STATS=0): Tensor.ones(1024,1024).contiguous().realize(do_update_stats=False)
|
||||
tms.append(unwrap(car(input_bufs, var_vals, wait=True, timeout=timeout))*factor)
|
||||
tms.append(unwrap(rt(*bufs, global_size=global_size, local_size=local_size, vals=prg.arg.vals(var_vals), wait=True, timeout=timeout))*factor)
|
||||
if early_stop is not None and early_stop < min(tms): break
|
||||
return tms
|
||||
|
||||
|
|
@ -61,22 +60,21 @@ def timeout_handler(signum, frame):
|
|||
if DEBUG >= 2: print("*** BEAM COMPILE TIMEOUT")
|
||||
raise TimeoutException()
|
||||
|
||||
def _try_compile(x:tuple[int,Scheduler], compiler:Compiler) -> tuple[int, tuple[UOp, bytes, float]|None]:
|
||||
def _try_compile(x:tuple[int,Scheduler]) -> tuple[int, tuple[UOp, float]|None]:
|
||||
if hasattr(signal, "alarm"):
|
||||
signal.signal(getattr(signal, 'SIGALRM'), timeout_handler)
|
||||
# set timeout
|
||||
signal.alarm(getenv("BEAM_TIMEOUT_SEC", 10))
|
||||
ret = None
|
||||
try:
|
||||
st = time.perf_counter()
|
||||
prg = to_program(x[1].copy().get_optimized_ast(name_override="test"), x[1].ren)
|
||||
et = time.perf_counter() - st
|
||||
uops = prg.src[2].src
|
||||
if len(uops) >= (uops_max:=getenv("BEAM_UOPS_MAX", 3000)) > 0:
|
||||
if getenv("BEAM_LOG_SURPASS_MAX"): print(f"too many uops. {len(uops)=}, {uops_max=}")
|
||||
raise RuntimeError("too many uops")
|
||||
st = time.perf_counter()
|
||||
prog = prg.src[4].arg if len(prg.src) > 4 and prg.src[4].op is Ops.BINARY else compiler.compile(prg.src[3].arg)
|
||||
et = time.perf_counter() - st
|
||||
ret = (prg, prog, et)
|
||||
ret = (prg, et)
|
||||
except RuntimeError:
|
||||
if DEBUG >= 4: traceback.print_exc()
|
||||
except Exception as e:
|
||||
|
|
@ -150,12 +148,11 @@ def beam_search(s:Scheduler, rawbufs:list[Buffer], amt:int, allow_test_size=True
|
|||
while not exiting:
|
||||
candidates: list[Scheduler] = flatten([get_kernel_actions(si, include_0=False).values() for si,_ in beam])
|
||||
timed: list[tuple[Scheduler, float]] = []
|
||||
_compile_fn = functools.partial(_try_compile, compiler=dev.compiler)
|
||||
least_compute_ops = math.inf
|
||||
for i,proc in (map(_compile_fn, enumerate(candidates)) if beam_pool is None else beam_pool.imap_unordered(_compile_fn, enumerate(candidates))):
|
||||
for i, proc in ((map if beam_pool is None else beam_pool.imap_unordered)(_try_compile, enumerate(candidates))):
|
||||
if proc is None: continue
|
||||
prg, lib, compile_et = proc
|
||||
if lib in seen_libs: continue
|
||||
prg, compile_et = proc
|
||||
if (lib:=prg.src[4].arg) in seen_libs: continue
|
||||
# filter out kernels that use 1000x more compute than the smallest
|
||||
estimates = prg.src[0].arg.estimates
|
||||
least_compute_ops = min(this_compute_ops:=sym_infer(estimates.ops if estimates is not None else 0, var_vals), least_compute_ops)
|
||||
|
|
@ -163,7 +160,7 @@ def beam_search(s:Scheduler, rawbufs:list[Buffer], amt:int, allow_test_size=True
|
|||
if getenv("BEAM_LOG_SURPASS_MAX"): print(f"too much compute. {this_compute_ops} when least is {least_compute_ops}")
|
||||
continue
|
||||
seen_libs.add(lib)
|
||||
try: tms = _time_program(prg, lib, var_vals, rawbufs, early_stop=beam[0][1]*3 if len(beam) else 1.0,
|
||||
try: tms = _time_program(prg, var_vals, rawbufs, early_stop=beam[0][1]*3 if len(beam) else 1.0,
|
||||
allow_test_size=allow_test_size, clear_l2=hasattr(dev, 'invalidate_caches'),
|
||||
dev_timeout=getenv("BEAM_DEV_TIMEOUT", 1))
|
||||
except Exception as e:
|
||||
|
|
|
|||
|
|
@ -1,8 +1,8 @@
|
|||
from typing import cast, Iterator, Any
|
||||
import time, random, itertools, math, contextlib, weakref
|
||||
from dataclasses import dataclass, replace, field
|
||||
from tinygrad.helpers import colored, DEBUG, GlobalCounters, ansilen, all_int, Metadata, TRACEMETA, TracingKey, prod, flatten
|
||||
from tinygrad.helpers import BEAM, size_to_str, time_to_str, VALIDATE_WITH_CPU, cpu_profile, PROFILE, ProfilePointEvent, cpu_events
|
||||
from tinygrad.helpers import colored, DEBUG, GlobalCounters, ansilen, all_int, Metadata, TRACEMETA, prod, flatten
|
||||
from tinygrad.helpers import BEAM, size_to_str, time_to_str, VALIDATE_WITH_CPU, PROFILE, ProfilePointEvent, cpu_events
|
||||
from tinygrad.dtype import dtypes
|
||||
from tinygrad.uop.ops import Ops, PatternMatcher, UOp, UPat, sym_infer, buffers, graph_rewrite, ProgramInfo
|
||||
from tinygrad.device import Device, Buffer, MultiBuffer
|
||||
|
|
@ -93,28 +93,6 @@ def optimize_local_size(call:UOp, prg:UOp) -> UOp|None:
|
|||
new_global = tuple(g//l if g%l == 0 else g/l for g,l in zip(prg.arg.global_size, local_size))
|
||||
return call.replace(src=(prg.replace(arg=replace(prg.arg, global_size=new_global, local_size=local_size)), *call.src[1:]))
|
||||
|
||||
class CompiledRunner(Runner):
|
||||
def __init__(self, prg:UOp, device:str):
|
||||
info: ProgramInfo = prg.arg
|
||||
sink = prg.src[0]
|
||||
if DEBUG >= 3 and sink.arg.applied_opts: print(sink.arg.applied_opts)
|
||||
if DEBUG >= 4: print(prg.src[3].arg)
|
||||
if len(prg.src) <= 4 or prg.src[4].op is not Ops.BINARY:
|
||||
with cpu_profile(TracingKey(f"compile {info.name}", (info.function_name,)), "TINY"):
|
||||
lib = Device[device].compiler.compile_cached(prg.src[3].arg)
|
||||
prg = prg.replace(src=prg.src + (UOp(Ops.BINARY, arg=lib),))
|
||||
self.prg:UOp = prg
|
||||
self.p:ProgramInfo = info
|
||||
if DEBUG >= 7: Device[device].compiler.disassemble(prg.src[4].arg)
|
||||
self._prg = Device[device].runtime(info.function_name, prg.src[4].arg, *info.aux, runtimevars=info.runtimevars)
|
||||
super().__init__(info.name, device, sink.arg.estimates or Estimates())
|
||||
|
||||
def __call__(self, rawbufs:list[Buffer], var_vals:dict[str, int]|None=None, wait=False, timeout:int|None=None) -> float|None:
|
||||
if var_vals is None: var_vals = {}
|
||||
global_size, local_size = self.p.launch_dims(var_vals)
|
||||
return self._prg(*[x._buf for x in rawbufs], global_size=tuple(global_size), local_size=tuple(local_size) if local_size else None,
|
||||
vals=tuple(var_vals[k.expr] if k.expr not in self.p.runtimevars else None for k in self.p.vars), wait=wait, timeout=timeout)
|
||||
|
||||
# **************** method cache ****************
|
||||
|
||||
runtime_cache: dict[tuple[bytes, str], Any] = {}
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue