mirror of
https://github.com/tinygrad/tinygrad.git
synced 2026-06-24 02:14:17 +00:00
remove schedule from extra/docs/examples (#15929)
* remove schedule from extra/docs/examples
* f
This commit is contained in:
parent
a5e9ea7a60
commit
768106a542
10 changed files with 52 additions and 43 deletions
|
|
@@ -1,6 +1,4 @@
|
|||
# abstractions2 goes from back to front, here we will go from front to back
|
||||
from typing import List
|
||||
from tinygrad.helpers import tqdm
|
||||
|
||||
# *****
|
||||
# 0. Load mnist on the device
|
||||
|
|
@@ -33,21 +31,21 @@ model(X).sparse_categorical_crossentropy(Y).backward()
|
|||
optim.schedule_step() # this will step the optimizer without running realize
|
||||
|
||||
# *****
|
||||
# 3. Create a schedule.
|
||||
# 3. Create a schedule (linear uop).
|
||||
|
||||
# The weight Tensors have been assigned to, but not yet realized. Everything is still lazy at this point
|
||||
# l1.uop and l2.uop define a computation graph
|
||||
|
||||
from tinygrad.schedule import ExecItem
|
||||
schedule: List[ExecItem] = Tensor.schedule(l1, l2)
|
||||
from tinygrad.engine.realize import run_linear
|
||||
linear = Tensor.schedule_linear(l1, l2)
|
||||
|
||||
print(f"The schedule contains {len(schedule)} items.")
|
||||
for si in schedule: print(str(si)[:80])
|
||||
print(f"The schedule contains {len(linear.src)} items.")
|
||||
for call in linear.src: print(str(call)[:80])
|
||||
|
||||
# *****
|
||||
# 4. Lower and run the schedule.
|
||||
# 4. Lower and run the schedule (linear uop).
|
||||
|
||||
for si in tqdm(schedule): si.run()
|
||||
run_linear(linear)
|
||||
|
||||
# *****
|
||||
# 5. Print the weight change
|
||||
|
|
|
|||
|
|
@@ -176,7 +176,7 @@ if __name__ == "__main__":
|
|||
from tinygrad.codegen import get_program
|
||||
with Context(PCONTIG=2, DEVECTORIZE=2, SPEC=0):
|
||||
out = tree_traversal(forest_t, val_t, height, rounds)
|
||||
sink = out.schedule()[-1].ast
|
||||
sink = out.schedule_linear().src[-1].src[0]
|
||||
prg = get_program(sink, VLIWRenderer())
|
||||
|
||||
# *** run on Machine and compare ***
|
||||
|
|
|
|||
|
|
@@ -13,7 +13,7 @@ from tinygrad import Tensor, Device, Context, GlobalCounters
|
|||
from tinygrad.uop.ops import UOp, Ops, KernelInfo
|
||||
from tinygrad.helpers import getenv, colored
|
||||
from tinygrad.dtype import dtypes, AddrSpace
|
||||
from tinygrad.engine.realize import Estimates
|
||||
from tinygrad.engine.realize import Estimates, run_linear
|
||||
from tinygrad.renderer.amd.dsl import s, v, VCC_LO, NULL
|
||||
from tinygrad.runtime.autogen.amd.rdna3.ins import *
|
||||
|
||||
|
|
@@ -463,11 +463,14 @@ def test_matmul():
|
|||
estimates=Estimates(ops=N*N*N*2, mem=N*N*4*3)))
|
||||
return UOp(Ops.PROGRAM, src=(sink, UOp(Ops.DEVICE, arg=dname), UOp(Ops.LINEAR, src=tuple([UOp(Ops.INS, arg=x) for x in insts]))))
|
||||
c = Tensor.custom_kernel(a, b, c, fxn=asm_kernel)[2]
|
||||
ei = c.schedule()[0].lower()
|
||||
linear = c.schedule_linear()
|
||||
|
||||
ets = []
|
||||
with Context(DEBUG=2):
|
||||
for _ in range(getenv("CNT", 5)): ets.append(ei.run(wait=True))
|
||||
for _ in range(getenv("CNT", 5)):
|
||||
start = GlobalCounters.time_sum_s
|
||||
run_linear(linear)
|
||||
ets.append(GlobalCounters.time_sum_s - start)
|
||||
print(f"REAL TFLOPS {N * N * N * 2 / min(ets) * 1e-12:.2f}")
|
||||
|
||||
if getenv("VERIFY", 1):
|
||||
|
|
|
|||
|
|
@@ -10,7 +10,7 @@ N = 4096
|
|||
run_count = 5
|
||||
|
||||
if __name__ == "__main__":
|
||||
ast = (Tensor.empty(N, N)@Tensor.empty(N, N)).schedule()[-1].ast
|
||||
ast = (Tensor.empty(N, N)@Tensor.empty(N, N)).schedule_linear().src[-1].src[0]
|
||||
prg = get_program(ast, Device.default.renderer)
|
||||
|
||||
if getenv("ASM") == 1:
|
||||
|
|
|
|||
|
|
@@ -4,7 +4,7 @@ from tinygrad import Tensor, Device, Context, GlobalCounters
|
|||
from tinygrad.uop.ops import UOp, Ops, KernelInfo
|
||||
from tinygrad.helpers import getenv, colored
|
||||
from tinygrad.dtype import dtypes, AddrSpace
|
||||
from tinygrad.engine.realize import Estimates
|
||||
from tinygrad.engine.realize import Estimates, run_linear
|
||||
from tinygrad.renderer.amd.dsl import s, v, VCC_LO, NULL, src, ttmp
|
||||
from tinygrad.runtime.autogen.amd.rdna4.ins import *
|
||||
|
||||
|
|
@@ -225,11 +225,14 @@ def test_matmul():
|
|||
return UOp(Ops.PROGRAM, src=(sink, UOp(Ops.DEVICE, arg=dname), UOp(Ops.LINEAR, src=tuple([UOp(Ops.INS, arg=x) for x in insts]))))
|
||||
|
||||
c = Tensor.custom_kernel(a, b, c, fxn=asm_kernel)[2]
|
||||
ei = c.schedule()[0].lower()
|
||||
linear = c.schedule_linear()
|
||||
|
||||
ets = []
|
||||
with Context(DEBUG=2):
|
||||
for _ in range(getenv("CNT", 5)): ets.append(ei.run(wait=True))
|
||||
for _ in range(getenv("CNT", 5)):
|
||||
start = GlobalCounters.time_sum_s
|
||||
run_linear(linear)
|
||||
ets.append(GlobalCounters.time_sum_s - start)
|
||||
print(f"REAL TFLOPS {N*N*N*2 / min(ets) * 1e-12:.2f}")
|
||||
|
||||
if getenv("VERIFY", 1):
|
||||
|
|
|
|||
|
|
@@ -2,6 +2,7 @@ import numpy as np
|
|||
from tinygrad import dtypes, Tensor
|
||||
from tinygrad.helpers import getenv, get_single_element
|
||||
from tinygrad.dtype import _to_np_dtype
|
||||
from tinygrad.engine.realize import compile_linear
|
||||
from tinygrad.codegen.opt import OptOps
|
||||
|
||||
dtype_in = (dtypes.half if getenv("HALF") else dtypes.bfloat16 if getenv("BFLOAT16") else
|
||||
|
|
@@ -38,10 +39,10 @@ if __name__ == "__main__":
|
|||
c = a.matmul(b, dtype=acc_dtype).realize()
|
||||
|
||||
if getenv("SHOULD_USE_TC"):
|
||||
sched = a.matmul(b, dtype=acc_dtype).schedule()
|
||||
ei = get_single_element(sched)
|
||||
ei.lower()
|
||||
assert any(opt.op is OptOps.TC for opt in ei.prg.p.applied_opts), f"TC not triggered, {ei.prg.p.applied_opts}"
|
||||
linear = compile_linear(a.matmul(b, dtype=acc_dtype).schedule_linear())
|
||||
call = get_single_element(list(linear.src))
|
||||
applied_opts = call.src[0].src[0].arg.applied_opts
|
||||
assert any(opt.op is OptOps.TC for opt in applied_opts), f"TC not triggered, {applied_opts}"
|
||||
|
||||
ref = a.numpy().astype(np.float32) @ b.numpy().astype(np.float32)
|
||||
res = c.numpy()
|
||||
|
|
|
|||
|
|
@@ -1,7 +1,7 @@
|
|||
from tinygrad import Tensor, dtypes, Device
|
||||
from tinygrad.helpers import getenv, DEBUG
|
||||
from tinygrad.codegen.opt.kernel import Kernel, Opt, OptOps
|
||||
from tinygrad.engine.realize import CompiledRunner, ExecItem, get_program
|
||||
from tinygrad import Tensor, dtypes, Context
|
||||
from tinygrad.helpers import getenv
|
||||
from tinygrad.codegen.opt import Opt, OptOps
|
||||
from tinygrad.engine.realize import run_linear
|
||||
from dataclasses import replace
|
||||
|
||||
N = 4096
|
||||
|
|
@@ -11,9 +11,6 @@ if __name__ == "__main__":
|
|||
else:
|
||||
A, B = Tensor.empty(N, N, dtype=dtypes.float16), Tensor.empty(N, N, dtype=dtypes.float16)
|
||||
C = A.matmul(B)
|
||||
si = C.schedule()[-1]
|
||||
ast = si.ast
|
||||
k = Kernel(ast, opts=Device[Device.DEFAULT].renderer)
|
||||
if getenv("GEMV"):
|
||||
opts = [
|
||||
Opt(op=OptOps.UNROLL, axis=0, amt=8),
|
||||
|
|
@@ -28,10 +25,10 @@ if __name__ == "__main__":
|
|||
Opt(op=OptOps.LOCAL, axis=1, amt=2),
|
||||
Opt(op=OptOps.LOCAL, axis=0, amt=2),
|
||||
]
|
||||
k.apply_opts(opts)
|
||||
prg = get_program(k.ast.replace(arg=replace(k.ast.arg, opts_to_apply=tuple(k.applied_opts))), k.opts)
|
||||
new_src = prg.src
|
||||
# can mod source here
|
||||
prg = replace(prg, src=new_src)
|
||||
ei = ExecItem(si.ast, [x.ensure_allocated() for x in si.bufs], si.metadata, prg=CompiledRunner(prg))
|
||||
for i in range(5): ei.run(wait=True)
|
||||
linear = C.schedule_linear()
|
||||
call = linear.src[-1]
|
||||
new_ast = call.src[0].replace(arg=replace(call.src[0].arg, opts_to_apply=tuple(opts)))
|
||||
new_call = call.replace(src=(new_ast, *call.src[1:]))
|
||||
linear = linear.replace(src=tuple(new_call if c is call else c for c in linear.src))
|
||||
with Context(DEBUG=2):
|
||||
for i in range(5): run_linear(linear)
|
||||
|
|
|
|||
|
|
@@ -36,10 +36,10 @@ A = Tensor.rand(M, K, device="CPU")
|
|||
B = Tensor.rand(K, N, device="CPU")
|
||||
C = (A.reshape(M, 1, K) * B.permute(1,0).reshape(1, N, K)).sum(axis=2)
|
||||
|
||||
sched = C.schedule()
|
||||
linear = C.schedule_linear()
|
||||
from tinygrad.codegen.opt.kernel import Kernel
|
||||
from tinygrad.device import CompilerOptions
|
||||
lin = Kernel(sched[-1].ast, CompilerOptions(has_local=False, supports_float4=False))
|
||||
lin = Kernel(linear.src[-1].src[0], CompilerOptions(has_local=False, supports_float4=False))
|
||||
lin.to_program()
|
||||
from tinygrad.runtime.ops_cpu import renderer
|
||||
src = renderer("mmult", lin.uops)
|
||||
|
|
|
|||
|
|
@@ -3,11 +3,12 @@ import os
|
|||
# TODO: there is a timing bug without this
|
||||
os.environ["AMD_AQL"] = "1"
|
||||
|
||||
from tinygrad import Tensor, Device
|
||||
from tinygrad import Tensor, Device, GlobalCounters, Context
|
||||
from tinygrad.helpers import getenv, DEV
|
||||
from tinygrad.uop.ops import UOp, Ops, KernelInfo
|
||||
from tinygrad.renderer import Estimates
|
||||
from tinygrad.renderer.amd.dsl import Reg, Inst, s, v
|
||||
from tinygrad.engine.realize import run_linear
|
||||
|
||||
NUM_WORKGROUPS = 96
|
||||
WAVE_SIZE = 32
|
||||
|
|
@@ -36,11 +37,17 @@ def launchBenchmark(instruction, vgprIndices, dense=True, accum=False, **kwargs)
|
|||
gidx = UOp.special(NUM_WORKGROUPS, "gidx0")
|
||||
FLOPs = FLOPS_PER_MATMUL * NUM_WAVES * NUM_WORKGROUPS * INTERNAL_LOOP * INSTRUCTIONS_PER_LOOP
|
||||
sink = UOp.sink(A.base, threads, gidx, arg=KernelInfo(inst.op.name.lower(), estimates=Estimates(ops=FLOPs, mem=0)))
|
||||
return UOp(Ops.PROGRAM, src=(sink, UOp(Ops.DEVICE, arg="AMD"), UOp(Ops.LINEAR, src=tuple([UOp(Ops.INS, arg=x) for x in insts]))))
|
||||
return UOp(Ops.PROGRAM, src=(sink, UOp(Ops.DEVICE, arg=Device.DEFAULT), UOp(Ops.LINEAR, src=tuple([UOp(Ops.INS, arg=x) for x in insts]))))
|
||||
dummy = Tensor.zeros(1).contiguous().realize()
|
||||
out = Tensor.custom_kernel(dummy, fxn=fxn)[0]
|
||||
ei = out.schedule()[-1].lower()
|
||||
elapsed = min([ei.run(wait=True) for _ in range(2)])
|
||||
linear = out.schedule_linear()
|
||||
ets = []
|
||||
with Context(DEBUG=2):
|
||||
for _ in range(2):
|
||||
start = GlobalCounters.time_sum_s
|
||||
run_linear(linear)
|
||||
ets.append(GlobalCounters.time_sum_s - start)
|
||||
elapsed = min(ets)
|
||||
FLOPs = FLOPS_PER_MATMUL * NUM_WAVES * NUM_WORKGROUPS * INTERNAL_LOOP * INSTRUCTIONS_PER_LOOP
|
||||
print(f"{inst.op_name.lower():<29} : {FLOPs/elapsed/10**12:.2f} T(FL)OPS")
|
||||
|
||||
|
|
|
|||
|
|
@@ -84,7 +84,7 @@ class TestBeamSearch(unittest.TestCase):
|
|||
tc = Device[Device.DEFAULT].renderer.tensor_cores[0]
|
||||
size = max(tc.dims[0], tc.dims[1]) * 8
|
||||
a, b = Tensor.rand(size, size, dtype=tc.dtype_in), Tensor.rand(size, size, dtype=tc.dtype_in)
|
||||
ast = a.matmul(b, dtype=tc.dtype_out).schedule()[-1].ast
|
||||
ast = a.matmul(b, dtype=tc.dtype_out).schedule_linear().src[-1].src[0]
|
||||
s = Scheduler(ast, Device[Device.DEFAULT].renderer)
|
||||
s.apply_opt(Opt(OptOps.TC, 0, (-1, 0, 1)))
|
||||
up = prod([x for x, t in zip(s.full_shape, s.axis_types) if t in (AxisType.UPCAST, AxisType.UNROLL)])
|
||||
|
|
@@ -94,7 +94,7 @@ class TestBeamSearch(unittest.TestCase):
|
|||
|
||||
def test_max_up(self):
|
||||
a = Tensor.rand(16, 16)
|
||||
ast = a.schedule()[-1].ast
|
||||
ast = a.schedule_linear().src[-1].src[0]
|
||||
s = Scheduler(ast, Device[Device.DEFAULT].renderer)
|
||||
for max_up in (2, 4):
|
||||
actions = get_kernel_actions(s, include_0=False, max_up=max_up)
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue