mirror of
https://github.com/tinygrad/tinygrad.git
synced 2026-06-24 02:14:17 +00:00
parent
9a23de7d27
commit
e0ff6cc15c
7 changed files with 54 additions and 86 deletions
|
|
@ -19,7 +19,7 @@ The `UOp` graph specifies the compute in terms of low level tinygrad ops. Not al
|
|||
|
||||
The [scheduler](https://github.com/tinygrad/tinygrad/tree/master/tinygrad/schedule/__init__.py) converts the graph of UOps into a list of `ExecItem`. One `ExecItem` is one kernel on the GPU, and the scheduler is responsible for breaking the large compute graph into subgraphs that can fit in a kernel. `ast` specifies what compute to run, and `bufs` specifies what buffers to run it on.
|
||||
|
||||
::: tinygrad.schedule.ExecItem
|
||||
::: tinygrad.engine.realize.ExecItem
|
||||
|
||||
## Lowering
|
||||
|
||||
|
|
|
|||
|
|
@ -19,8 +19,8 @@
|
|||
|
||||
## tinygrad ops
|
||||
|
||||
::: tinygrad.Tensor.schedule_with_vars
|
||||
::: tinygrad.Tensor.schedule
|
||||
::: tinygrad.Tensor.linear_with_vars
|
||||
::: tinygrad.Tensor.schedule_linear
|
||||
::: tinygrad.Tensor.realize
|
||||
::: tinygrad.Tensor.replace
|
||||
::: tinygrad.Tensor.assign
|
||||
|
|
|
|||
|
|
@ -7,8 +7,7 @@ from tinygrad.dtype import dtypes, DType, AddrSpace, ConstFloat # noqa: F401
|
|||
from tinygrad.device import Buffer, Device
|
||||
from tinygrad.uop.ops import Ops, UOp, KernelInfo, AxisType
|
||||
from tinygrad.renderer.cstyle import CStyleLanguage
|
||||
from tinygrad.engine.realize import CompiledRunner, get_program, get_runner
|
||||
from tinygrad.schedule import ExecItem
|
||||
from tinygrad.engine.realize import CompiledRunner, get_program, run_linear
|
||||
from tinygrad.device import is_dtype_supported
|
||||
from tinygrad.codegen.opt import Opt, OptOps
|
||||
from tinygrad.renderer.ptx import PTXRenderer
|
||||
|
|
@ -281,7 +280,7 @@ class TestZeroRange(unittest.TestCase):
|
|||
|
||||
class TestUOpPrograms(unittest.TestCase):
|
||||
def _run(self, prog:UOp, *tensors:Tensor):
|
||||
ExecItem(prog, [t.uop.buffer for t in tensors], prg=get_runner(Device.DEFAULT, prog)).run(wait=True)
|
||||
run_linear(UOp(Ops.LINEAR, src=(prog.call(*[t.uop.buf_uop for t in tensors]),)), do_update_stats=False)
|
||||
|
||||
def test_simple(self):
|
||||
out = Tensor.empty(10,10,dtype=dtypes.int)
|
||||
|
|
|
|||
3
test/external/fuzz_graph.py
vendored
3
test/external/fuzz_graph.py
vendored
|
|
@ -4,8 +4,7 @@ from tinygrad.device import Buffer, Device
|
|||
from tinygrad.helpers import Context, getenv, from_mv
|
||||
from tinygrad.dtype import dtypes
|
||||
from tinygrad.tensor import Tensor, _to_np_dtype
|
||||
from tinygrad.engine.realize import BufferXfer, get_runner
|
||||
from tinygrad.schedule import ExecItem
|
||||
from tinygrad.engine.realize import BufferXfer, get_runner, ExecItem
|
||||
from tinygrad.uop.ops import UOp, Ops
|
||||
from tinygrad.engine.jit import apply_graph_to_jit
|
||||
|
||||
|
|
|
|||
|
|
@ -1,9 +1,8 @@
|
|||
import unittest, math, time
|
||||
|
||||
from tinygrad import Tensor, Device, dtypes, Context
|
||||
from tinygrad import Tensor, Device, dtypes, Context, GlobalCounters
|
||||
from tinygrad.uop.ops import UOp, Ops
|
||||
from tinygrad.engine.realize import get_runner
|
||||
from tinygrad.schedule import ExecItem
|
||||
from tinygrad.engine.realize import run_linear
|
||||
from tinygrad.engine.jit import TinyJit
|
||||
import numpy as np
|
||||
|
||||
|
|
@ -67,8 +66,9 @@ class TestTK(unittest.TestCase):
|
|||
c = Tensor.empty(1, 1, N, N, dtype="float32")
|
||||
Tensor.realize(a, b, c)
|
||||
|
||||
ei = ExecItem(sink, [t.uop.buffer for t in (c, a, b)], prg=get_runner(Device.DEFAULT, sink))
|
||||
for _ in range(5): ei.run(wait=True)
|
||||
linear = UOp(Ops.LINEAR, src=(sink.call(*[t.uop.buf_uop for t in (c, a, b)]),))
|
||||
|
||||
for _ in range(5): run_linear(linear, do_update_stats=False)
|
||||
c = c.float()
|
||||
|
||||
ref = a.matmul(b, dtype=dtypes.float32).float()
|
||||
|
|
@ -115,8 +115,9 @@ class TestTK(unittest.TestCase):
|
|||
c = Tensor.empty(1, 1, N, N, dtype="float32")
|
||||
Tensor.realize(a, b, c)
|
||||
|
||||
ei = ExecItem(sink, [t.uop.buffer for t in (c, a, b)], prg=get_runner(Device.DEFAULT, sink))
|
||||
for _ in range(5): ei.run(wait=True)
|
||||
linear = UOp(Ops.LINEAR, src=(sink.call(*[t.uop.buf_uop for t in (c, a, b)]),))
|
||||
|
||||
for _ in range(5): run_linear(linear, do_update_stats=False)
|
||||
c = c.float()
|
||||
|
||||
ref = a.matmul(b.transpose(2, 3), dtype=dtypes.float32).float()
|
||||
|
|
@ -151,8 +152,9 @@ class TestTK(unittest.TestCase):
|
|||
b = Tensor.empty(1, 1, N, N, dtype="float32")
|
||||
Tensor.realize(a, b)
|
||||
|
||||
ei = ExecItem(sink, [t.uop.buffer for t in (b, a)], prg=get_runner(Device.DEFAULT, sink))
|
||||
for _ in range(5): ei.run(wait=True)
|
||||
linear = UOp(Ops.LINEAR, src=(sink.call(*[t.uop.buf_uop for t in (b, a)]),))
|
||||
|
||||
for _ in range(5): run_linear(linear, do_update_stats=False)
|
||||
b = b.float()
|
||||
|
||||
ref = a.float()
|
||||
|
|
@ -190,8 +192,9 @@ class TestTK(unittest.TestCase):
|
|||
b = Tensor.empty(1, 1, N, N, dtype="float32")
|
||||
Tensor.realize(a, b)
|
||||
|
||||
ei = ExecItem(sink, [t.uop.buffer for t in (b, a)], prg=get_runner(Device.DEFAULT, sink))
|
||||
for _ in range(5): ei.run(wait=True)
|
||||
linear = UOp(Ops.LINEAR, src=(sink.call(*[t.uop.buf_uop for t in (b, a)]),))
|
||||
|
||||
for _ in range(5): run_linear(linear, do_update_stats=False)
|
||||
b = b.float()
|
||||
|
||||
ref = a.float()
|
||||
|
|
@ -232,8 +235,9 @@ class TestTK(unittest.TestCase):
|
|||
c = Tensor.empty(1, 1, N, N, dtype="float32")
|
||||
Tensor.realize(a, b, c)
|
||||
|
||||
ei = ExecItem(sink, [t.uop.buffer for t in (b, c, a)], prg=get_runner(Device.DEFAULT, sink))
|
||||
for _ in range(5): ei.run(wait=True)
|
||||
linear = UOp(Ops.LINEAR, src=(sink.call(*[t.uop.buf_uop for t in (b, c, a)]),))
|
||||
|
||||
for _ in range(5): run_linear(linear, do_update_stats=False)
|
||||
b = b.float()
|
||||
c = c.float()
|
||||
|
||||
|
|
@ -272,8 +276,9 @@ class TestTK(unittest.TestCase):
|
|||
b = Tensor.empty(1, 1, N, N, dtype="float32")
|
||||
Tensor.realize(a, b)
|
||||
|
||||
ei = ExecItem(sink, [t.uop.buffer for t in (b, a)], prg=get_runner(Device.DEFAULT, sink))
|
||||
for _ in range(5): ei.run(wait=True)
|
||||
linear = UOp(Ops.LINEAR, src=(sink.call(*[t.uop.buf_uop for t in (b, a)]),))
|
||||
|
||||
for _ in range(5): run_linear(linear, do_update_stats=False)
|
||||
b = b.float()
|
||||
|
||||
ref = a.float()
|
||||
|
|
@ -309,8 +314,9 @@ class TestTK(unittest.TestCase):
|
|||
b = Tensor.empty(1, 1, N, N, dtype="float32")
|
||||
Tensor.realize(a, b)
|
||||
|
||||
ei = ExecItem(sink, [t.uop.buffer for t in (b, a)], prg=get_runner(Device.DEFAULT, sink))
|
||||
for _ in range(5): ei.run(wait=True)
|
||||
linear = UOp(Ops.LINEAR, src=(sink.call(*[t.uop.buf_uop for t in (b, a)]),))
|
||||
|
||||
for _ in range(5): run_linear(linear, do_update_stats=False)
|
||||
b = b.float()
|
||||
|
||||
ref = a.float() + 1
|
||||
|
|
@ -354,8 +360,9 @@ class TestTK(unittest.TestCase):
|
|||
b = Tensor.empty(1, 1, N, N, dtype="float32")
|
||||
Tensor.realize(a, b)
|
||||
|
||||
ei = ExecItem(sink, [t.uop.buffer for t in (b, a)], prg=get_runner(Device.DEFAULT, sink))
|
||||
for _ in range(5): ei.run(wait=True)
|
||||
linear = UOp(Ops.LINEAR, src=(sink.call(*[t.uop.buf_uop for t in (b, a)]),))
|
||||
|
||||
for _ in range(5): run_linear(linear, do_update_stats=False)
|
||||
b = b.float()
|
||||
|
||||
ref = a.float().max(axis=2, keepdim=True).expand(a.shape)
|
||||
|
|
@ -399,8 +406,9 @@ class TestTK(unittest.TestCase):
|
|||
b = Tensor.empty(1, 1, N, M, dtype="float32")
|
||||
Tensor.realize(a, b)
|
||||
|
||||
ei = ExecItem(sink, [t.uop.buffer for t in (b, a)], prg=get_runner(Device.DEFAULT, sink))
|
||||
for _ in range(5): ei.run(wait=True)
|
||||
linear = UOp(Ops.LINEAR, src=(sink.call(*[t.uop.buf_uop for t in (b, a)]),))
|
||||
|
||||
for _ in range(5): run_linear(linear, do_update_stats=False)
|
||||
b = b.float()
|
||||
|
||||
ref = a.float().max(axis=2, keepdim=True).expand(a.shape)
|
||||
|
|
@ -444,8 +452,9 @@ class TestTK(unittest.TestCase):
|
|||
b = Tensor.empty(1, 1, N, N, dtype="float32")
|
||||
Tensor.realize(a, b)
|
||||
|
||||
ei = ExecItem(sink, [t.uop.buffer for t in (b, a)], prg=get_runner(Device.DEFAULT, sink))
|
||||
for _ in range(5): ei.run(wait=True)
|
||||
linear = UOp(Ops.LINEAR, src=(sink.call(*[t.uop.buf_uop for t in (b, a)]),))
|
||||
|
||||
for _ in range(5): run_linear(linear, do_update_stats=False)
|
||||
b = b.float()
|
||||
|
||||
ref = a.float().sum(axis=2, keepdim=True).expand(a.shape)
|
||||
|
|
@ -489,8 +498,9 @@ class TestTK(unittest.TestCase):
|
|||
b = Tensor.empty(1, 1, N, M, dtype="float32")
|
||||
Tensor.realize(a, b)
|
||||
|
||||
ei = ExecItem(sink, [t.uop.buffer for t in (b, a)], prg=get_runner(Device.DEFAULT, sink))
|
||||
for _ in range(5): ei.run(wait=True)
|
||||
linear = UOp(Ops.LINEAR, src=(sink.call(*[t.uop.buf_uop for t in (b, a)]),))
|
||||
|
||||
for _ in range(5): run_linear(linear, do_update_stats=False)
|
||||
b = b.float()
|
||||
|
||||
ref = a.float().sum(axis=2, keepdim=True).expand(a.shape)
|
||||
|
|
@ -549,8 +559,9 @@ class TestTK(unittest.TestCase):
|
|||
b = Tensor.empty(1, 1, BLOCK_SIZE, N, dtype="float32")
|
||||
Tensor.realize(a, b)
|
||||
|
||||
ei = ExecItem(sink, [t.uop.buffer for t in (b, a)], prg=get_runner(Device.DEFAULT, sink))
|
||||
for _ in range(5): ei.run(wait=True)
|
||||
linear = UOp(Ops.LINEAR, src=(sink.call(*[t.uop.buf_uop for t in (b, a)]),))
|
||||
|
||||
for _ in range(5): run_linear(linear, do_update_stats=False)
|
||||
b = b.float()
|
||||
|
||||
ref = a.float().softmax(axis=3)
|
||||
|
|
@ -609,8 +620,9 @@ class TestTK(unittest.TestCase):
|
|||
b = Tensor.empty(1, 1, N, BLOCK_SIZE, dtype="float32")
|
||||
Tensor.realize(a, b)
|
||||
|
||||
ei = ExecItem(sink, [t.uop.buffer for t in (b, a)], prg=get_runner(Device.DEFAULT, sink))
|
||||
for _ in range(5): ei.run(wait=True)
|
||||
linear = UOp(Ops.LINEAR, src=(sink.call(*[t.uop.buf_uop for t in (b, a)]),))
|
||||
|
||||
for _ in range(5): run_linear(linear, do_update_stats=False)
|
||||
b = b.float()
|
||||
|
||||
ref = a.float().softmax(axis=2)
|
||||
|
|
@ -719,9 +731,11 @@ class TestTK(unittest.TestCase):
|
|||
out = Tensor.empty(B, N, H, D, dtype=dtypes.bfloat16)
|
||||
Tensor.realize(q, k, v, out)
|
||||
|
||||
ei = ExecItem(sink, [t.uop.buffer for t in (out, q, k, v)], prg=get_runner(Device.DEFAULT, sink))
|
||||
linear = UOp(Ops.LINEAR, src=(sink.call(*[t.uop.buf_uop for t in (out, q, k, v)]),))
|
||||
for _ in range(5):
|
||||
et = ei.run(wait=True)
|
||||
GlobalCounters.reset()
|
||||
with Context(DEBUG=2): run_linear(linear)
|
||||
et = GlobalCounters.time_sum_s
|
||||
attn_flops = 2 * B * H * N * N * D + \
|
||||
4 * B * H * N * N + \
|
||||
2 * B * H * N * N * D
|
||||
|
|
|
|||
|
|
@ -1,12 +1,8 @@
|
|||
import time, inspect
|
||||
from typing import cast
|
||||
from collections import deque
|
||||
from dataclasses import replace
|
||||
from tinygrad.uop.ops import UOp, Ops, buffers, UOpMetaClass, track_rewrites, graph_rewrite, gate_kernel_sink, KernelInfo
|
||||
from tinygrad.uop.ops import UOp, Ops, UOpMetaClass, track_rewrites, graph_rewrite, gate_kernel_sink, KernelInfo
|
||||
from tinygrad.uop.spec import type_verify, tensor_spec
|
||||
from tinygrad.device import Buffer, MultiBuffer
|
||||
from tinygrad.helpers import DEBUG, cpu_profile, TracingKey, SPEC, pluralize, SCACHE, BASEDIR, flatten, BEAM, partition
|
||||
from tinygrad.engine.realize import ExecItem
|
||||
from tinygrad.helpers import DEBUG, cpu_profile, TracingKey, SPEC, pluralize, SCACHE, BASEDIR, flatten, partition
|
||||
|
||||
# **** schedule linearizer
|
||||
|
||||
|
|
@ -70,31 +66,6 @@ def create_schedule(sched_sink:UOp) -> UOp:
|
|||
if in_degree[x] == 0: queue.append(x)
|
||||
return UOp(Ops.LINEAR, src=tuple(linearized))
|
||||
|
||||
def linear_to_schedule(linear:UOp) -> list[ExecItem]:
|
||||
"""Convert a LINEAR UOp to a list of ExecItems."""
|
||||
schedule: list[ExecItem] = []
|
||||
for si in linear.src:
|
||||
ast, buf_uops = si.src[0], si.src[1:]
|
||||
# create subbuffers if needed
|
||||
if ast.op is Ops.BUFFER_VIEW:
|
||||
base = buf_uops[1].buffer
|
||||
assert isinstance(base, Buffer), "base can't be MultiBuffer"
|
||||
buffers[buf_uops[0]] = base.view(buf_uops[0].arg, ast.dtype, ast.arg[1]*base.dtype.itemsize)
|
||||
# set beam on KernelInfo when beam search is enabled
|
||||
if ast.op is Ops.SINK and BEAM >= 1 and ast.arg.beam == 0: ast = ast.replace(arg=replace(ast.arg, beam=BEAM.value))
|
||||
ubufs = [b.buffer for b in buf_uops if b.op is not Ops.BIND]
|
||||
metadata = si.arg.metadata
|
||||
if ast.op is Ops.CUSTOM_FUNCTION and ast.arg == "graph":
|
||||
schedule.append(ExecItem(ast, flatten([b.bufs if isinstance(b, MultiBuffer) else [b] for b in ubufs]), metadata))
|
||||
elif any(isinstance(x, MultiBuffer) for x in ubufs):
|
||||
assert all(isinstance(x, MultiBuffer) for x in ubufs), "kernel must all be multibuffer"
|
||||
dnums = [x for x in ast.variables() if x.expr == '_device_num']
|
||||
for j, bufs in enumerate(zip(*[x.bufs for x in cast(tuple[MultiBuffer, ...], ubufs)])):
|
||||
schedule.append(ExecItem(ast, list(bufs), metadata, {dnums[0].expr:j} if len(dnums) else {}))
|
||||
else:
|
||||
schedule.append(ExecItem(ast, cast(list[Buffer|None], ubufs), metadata))
|
||||
return schedule
|
||||
|
||||
from tinygrad.schedule.memory import memory_plan_rewrite
|
||||
from tinygrad.engine.realize import capturing
|
||||
from tinygrad.schedule.rangeify import get_kernel_graph
|
||||
|
|
|
|||
|
|
@ -12,7 +12,7 @@ from tinygrad.helpers import suppress_finalizing, disable_gc
|
|||
from tinygrad.gradient import compute_gradient
|
||||
from tinygrad.mixin import OpMixin
|
||||
from tinygrad.uop.ops import smax, UOp, Ops, sint, all_metadata, _index_to_concrete_int, Variable, _broadcast_shape
|
||||
from tinygrad.schedule import ExecItem, create_linear_with_vars, linear_to_schedule
|
||||
from tinygrad.schedule import create_linear_with_vars
|
||||
from tinygrad.device import Buffer, canonicalize_device
|
||||
from tinygrad.engine.realize import run_linear
|
||||
from tinygrad.callify import transform_to_call
|
||||
|
|
@ -232,21 +232,6 @@ class Tensor(OpMixin):
|
|||
_apply_map_to_tensors(becomes_map, name="buffers")
|
||||
return create_linear_with_vars(big_sink)
|
||||
|
||||
def schedule_with_vars(self, *lst:Tensor) -> tuple[list[ExecItem], dict[str, int]]:
|
||||
"""
|
||||
Creates the schedule needed to realize these Tensor(s), with Variables.
|
||||
|
||||
NOTE: A Tensor can only be scheduled once.
|
||||
"""
|
||||
linear, var_vals = self.linear_with_vars(*lst)
|
||||
return linear_to_schedule(linear), var_vals
|
||||
|
||||
def schedule(self, *lst:Tensor) -> list[ExecItem]:
|
||||
"""Creates the schedule needed to realize these Tensor(s)."""
|
||||
schedule, var_vals = self.schedule_with_vars(*lst)
|
||||
assert len(var_vals) == 0
|
||||
return schedule
|
||||
|
||||
def schedule_linear(self, *lst:Tensor) -> UOp:
|
||||
"""Creates the schedule needed to realize these Tensor(s)."""
|
||||
linear, var_vals = self.linear_with_vars(*lst)
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue