remove schedule from extra/docs/examples (#15929)

* remove schedule from extra/docs/examples

* f
This commit is contained in:
nimlgen 2026-04-25 14:09:12 +03:00 committed by GitHub
commit 768106a542
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
10 changed files with 52 additions and 43 deletions

View file

@ -1,6 +1,4 @@
# abstractions2 goes from back to front, here we will go from front to back
from typing import List
from tinygrad.helpers import tqdm
# *****
# 0. Load mnist on the device
@ -33,21 +31,21 @@ model(X).sparse_categorical_crossentropy(Y).backward()
optim.schedule_step() # this will step the optimizer without running realize
# *****
# 3. Create a schedule.
# 3. Create a schedule (linear uop).
# The weight Tensors have been assigned to, but not yet realized. Everything is still lazy at this point
# l1.uop and l2.uop define a computation graph
from tinygrad.schedule import ExecItem
schedule: List[ExecItem] = Tensor.schedule(l1, l2)
from tinygrad.engine.realize import run_linear
linear = Tensor.schedule_linear(l1, l2)
print(f"The schedule contains {len(schedule)} items.")
for si in schedule: print(str(si)[:80])
print(f"The schedule contains {len(linear.src)} items.")
for call in linear.src: print(str(call)[:80])
# *****
# 4. Lower and run the schedule.
# 4. Lower and run the schedule (linear uop).
for si in tqdm(schedule): si.run()
run_linear(linear)
# *****
# 5. Print the weight change

View file

@ -176,7 +176,7 @@ if __name__ == "__main__":
from tinygrad.codegen import get_program
with Context(PCONTIG=2, DEVECTORIZE=2, SPEC=0):
out = tree_traversal(forest_t, val_t, height, rounds)
sink = out.schedule()[-1].ast
sink = out.schedule_linear().src[-1].src[0]
prg = get_program(sink, VLIWRenderer())
# *** run on Machine and compare ***

View file

@ -13,7 +13,7 @@ from tinygrad import Tensor, Device, Context, GlobalCounters
from tinygrad.uop.ops import UOp, Ops, KernelInfo
from tinygrad.helpers import getenv, colored
from tinygrad.dtype import dtypes, AddrSpace
from tinygrad.engine.realize import Estimates
from tinygrad.engine.realize import Estimates, run_linear
from tinygrad.renderer.amd.dsl import s, v, VCC_LO, NULL
from tinygrad.runtime.autogen.amd.rdna3.ins import *
@ -463,11 +463,14 @@ def test_matmul():
estimates=Estimates(ops=N*N*N*2, mem=N*N*4*3)))
return UOp(Ops.PROGRAM, src=(sink, UOp(Ops.DEVICE, arg=dname), UOp(Ops.LINEAR, src=tuple([UOp(Ops.INS, arg=x) for x in insts]))))
c = Tensor.custom_kernel(a, b, c, fxn=asm_kernel)[2]
ei = c.schedule()[0].lower()
linear = c.schedule_linear()
ets = []
with Context(DEBUG=2):
for _ in range(getenv("CNT", 5)): ets.append(ei.run(wait=True))
for _ in range(getenv("CNT", 5)):
start = GlobalCounters.time_sum_s
run_linear(linear)
ets.append(GlobalCounters.time_sum_s - start)
print(f"REAL TFLOPS {N * N * N * 2 / min(ets) * 1e-12:.2f}")
if getenv("VERIFY", 1):

View file

@ -10,7 +10,7 @@ N = 4096
run_count = 5
if __name__ == "__main__":
ast = (Tensor.empty(N, N)@Tensor.empty(N, N)).schedule()[-1].ast
ast = (Tensor.empty(N, N)@Tensor.empty(N, N)).schedule_linear().src[-1].src[0]
prg = get_program(ast, Device.default.renderer)
if getenv("ASM") == 1:

View file

@ -4,7 +4,7 @@ from tinygrad import Tensor, Device, Context, GlobalCounters
from tinygrad.uop.ops import UOp, Ops, KernelInfo
from tinygrad.helpers import getenv, colored
from tinygrad.dtype import dtypes, AddrSpace
from tinygrad.engine.realize import Estimates
from tinygrad.engine.realize import Estimates, run_linear
from tinygrad.renderer.amd.dsl import s, v, VCC_LO, NULL, src, ttmp
from tinygrad.runtime.autogen.amd.rdna4.ins import *
@ -225,11 +225,14 @@ def test_matmul():
return UOp(Ops.PROGRAM, src=(sink, UOp(Ops.DEVICE, arg=dname), UOp(Ops.LINEAR, src=tuple([UOp(Ops.INS, arg=x) for x in insts]))))
c = Tensor.custom_kernel(a, b, c, fxn=asm_kernel)[2]
ei = c.schedule()[0].lower()
linear = c.schedule_linear()
ets = []
with Context(DEBUG=2):
for _ in range(getenv("CNT", 5)): ets.append(ei.run(wait=True))
for _ in range(getenv("CNT", 5)):
start = GlobalCounters.time_sum_s
run_linear(linear)
ets.append(GlobalCounters.time_sum_s - start)
print(f"REAL TFLOPS {N*N*N*2 / min(ets) * 1e-12:.2f}")
if getenv("VERIFY", 1):

View file

@ -2,6 +2,7 @@ import numpy as np
from tinygrad import dtypes, Tensor
from tinygrad.helpers import getenv, get_single_element
from tinygrad.dtype import _to_np_dtype
from tinygrad.engine.realize import compile_linear
from tinygrad.codegen.opt import OptOps
dtype_in = (dtypes.half if getenv("HALF") else dtypes.bfloat16 if getenv("BFLOAT16") else
@ -38,10 +39,10 @@ if __name__ == "__main__":
c = a.matmul(b, dtype=acc_dtype).realize()
if getenv("SHOULD_USE_TC"):
sched = a.matmul(b, dtype=acc_dtype).schedule()
ei = get_single_element(sched)
ei.lower()
assert any(opt.op is OptOps.TC for opt in ei.prg.p.applied_opts), f"TC not triggered, {ei.prg.p.applied_opts}"
linear = compile_linear(a.matmul(b, dtype=acc_dtype).schedule_linear())
call = get_single_element(list(linear.src))
applied_opts = call.src[0].src[0].arg.applied_opts
assert any(opt.op is OptOps.TC for opt in applied_opts), f"TC not triggered, {applied_opts}"
ref = a.numpy().astype(np.float32) @ b.numpy().astype(np.float32)
res = c.numpy()

View file

@ -1,7 +1,7 @@
from tinygrad import Tensor, dtypes, Device
from tinygrad.helpers import getenv, DEBUG
from tinygrad.codegen.opt.kernel import Kernel, Opt, OptOps
from tinygrad.engine.realize import CompiledRunner, ExecItem, get_program
from tinygrad import Tensor, dtypes, Context
from tinygrad.helpers import getenv
from tinygrad.codegen.opt import Opt, OptOps
from tinygrad.engine.realize import run_linear
from dataclasses import replace
N = 4096
@ -11,9 +11,6 @@ if __name__ == "__main__":
else:
A, B = Tensor.empty(N, N, dtype=dtypes.float16), Tensor.empty(N, N, dtype=dtypes.float16)
C = A.matmul(B)
si = C.schedule()[-1]
ast = si.ast
k = Kernel(ast, opts=Device[Device.DEFAULT].renderer)
if getenv("GEMV"):
opts = [
Opt(op=OptOps.UNROLL, axis=0, amt=8),
@ -28,10 +25,10 @@ if __name__ == "__main__":
Opt(op=OptOps.LOCAL, axis=1, amt=2),
Opt(op=OptOps.LOCAL, axis=0, amt=2),
]
k.apply_opts(opts)
prg = get_program(k.ast.replace(arg=replace(k.ast.arg, opts_to_apply=tuple(k.applied_opts))), k.opts)
new_src = prg.src
# can mod source here
prg = replace(prg, src=new_src)
ei = ExecItem(si.ast, [x.ensure_allocated() for x in si.bufs], si.metadata, prg=CompiledRunner(prg))
for i in range(5): ei.run(wait=True)
linear = C.schedule_linear()
call = linear.src[-1]
new_ast = call.src[0].replace(arg=replace(call.src[0].arg, opts_to_apply=tuple(opts)))
new_call = call.replace(src=(new_ast, *call.src[1:]))
linear = linear.replace(src=tuple(new_call if c is call else c for c in linear.src))
with Context(DEBUG=2):
for i in range(5): run_linear(linear)

View file

@ -36,10 +36,10 @@ A = Tensor.rand(M, K, device="CPU")
B = Tensor.rand(K, N, device="CPU")
C = (A.reshape(M, 1, K) * B.permute(1,0).reshape(1, N, K)).sum(axis=2)
sched = C.schedule()
linear = C.schedule_linear()
from tinygrad.codegen.opt.kernel import Kernel
from tinygrad.device import CompilerOptions
lin = Kernel(sched[-1].ast, CompilerOptions(has_local=False, supports_float4=False))
lin = Kernel(linear.src[-1].src[0], CompilerOptions(has_local=False, supports_float4=False))
lin.to_program()
from tinygrad.runtime.ops_cpu import renderer
src = renderer("mmult", lin.uops)

View file

@ -3,11 +3,12 @@ import os
# TODO: there is a timing bug without this
os.environ["AMD_AQL"] = "1"
from tinygrad import Tensor, Device
from tinygrad import Tensor, Device, GlobalCounters, Context
from tinygrad.helpers import getenv, DEV
from tinygrad.uop.ops import UOp, Ops, KernelInfo
from tinygrad.renderer import Estimates
from tinygrad.renderer.amd.dsl import Reg, Inst, s, v
from tinygrad.engine.realize import run_linear
NUM_WORKGROUPS = 96
WAVE_SIZE = 32
@ -36,11 +37,17 @@ def launchBenchmark(instruction, vgprIndices, dense=True, accum=False, **kwargs)
gidx = UOp.special(NUM_WORKGROUPS, "gidx0")
FLOPs = FLOPS_PER_MATMUL * NUM_WAVES * NUM_WORKGROUPS * INTERNAL_LOOP * INSTRUCTIONS_PER_LOOP
sink = UOp.sink(A.base, threads, gidx, arg=KernelInfo(inst.op.name.lower(), estimates=Estimates(ops=FLOPs, mem=0)))
return UOp(Ops.PROGRAM, src=(sink, UOp(Ops.DEVICE, arg="AMD"), UOp(Ops.LINEAR, src=tuple([UOp(Ops.INS, arg=x) for x in insts]))))
return UOp(Ops.PROGRAM, src=(sink, UOp(Ops.DEVICE, arg=Device.DEFAULT), UOp(Ops.LINEAR, src=tuple([UOp(Ops.INS, arg=x) for x in insts]))))
dummy = Tensor.zeros(1).contiguous().realize()
out = Tensor.custom_kernel(dummy, fxn=fxn)[0]
ei = out.schedule()[-1].lower()
elapsed = min([ei.run(wait=True) for _ in range(2)])
linear = out.schedule_linear()
ets = []
with Context(DEBUG=2):
for _ in range(2):
start = GlobalCounters.time_sum_s
run_linear(linear)
ets.append(GlobalCounters.time_sum_s - start)
elapsed = min(ets)
FLOPs = FLOPS_PER_MATMUL * NUM_WAVES * NUM_WORKGROUPS * INTERNAL_LOOP * INSTRUCTIONS_PER_LOOP
print(f"{inst.op_name.lower():<29} : {FLOPs/elapsed/10**12:.2f} T(FL)OPS")

View file

@ -84,7 +84,7 @@ class TestBeamSearch(unittest.TestCase):
tc = Device[Device.DEFAULT].renderer.tensor_cores[0]
size = max(tc.dims[0], tc.dims[1]) * 8
a, b = Tensor.rand(size, size, dtype=tc.dtype_in), Tensor.rand(size, size, dtype=tc.dtype_in)
ast = a.matmul(b, dtype=tc.dtype_out).schedule()[-1].ast
ast = a.matmul(b, dtype=tc.dtype_out).schedule_linear().src[-1].src[0]
s = Scheduler(ast, Device[Device.DEFAULT].renderer)
s.apply_opt(Opt(OptOps.TC, 0, (-1, 0, 1)))
up = prod([x for x, t in zip(s.full_shape, s.axis_types) if t in (AxisType.UPCAST, AxisType.UNROLL)])
@ -94,7 +94,7 @@ class TestBeamSearch(unittest.TestCase):
def test_max_up(self):
a = Tensor.rand(16, 16)
ast = a.schedule()[-1].ast
ast = a.schedule_linear().src[-1].src[0]
s = Scheduler(ast, Device[Device.DEFAULT].renderer)
for max_up in (2, 4):
actions = get_kernel_actions(s, include_0=False, max_up=max_up)