Compare commits

...

7 commits

Author SHA1 Message Date
George Hotz
383c0bf05e notes 2025-07-01 10:36:05 -07:00
George Hotz
fd2c0e2626 uc warp 2025-06-30 20:54:29 -07:00
George Hotz
5907f2d443
Merge branch 'master' into warp_fun 2025-06-30 18:18:52 -07:00
George Hotz
fc59db0aaa
Merge branch 'master' into warp_fun 2025-06-30 15:54:57 -07:00
George Hotz
256d4403c5 what does order change 2025-06-29 09:26:32 -07:00
George Hotz
ad823a5199
Merge branch 'master' into warp_fun 2025-06-29 09:13:16 -07:00
George Hotz
0df4355cd8 upcasted warp experiments 2025-06-29 09:04:37 -07:00
5 changed files with 95 additions and 4 deletions

73
extra/upcasted_warps.py Normal file
View file

@ -0,0 +1,73 @@
# play with upcasted warps
from tinygrad import Tensor, Device
from tinygrad.uop.ops import KernelInfo
from tinygrad.opt import get_optimized_ast
from tinygrad.opt.kernel import OptOps, Opt
from tinygrad.engine.realize import get_program
if __name__ == "__main__":
  # Experiment driver: build a small kernel, force a specific list of opts onto its AST,
  # and print the rendered program source so the effect of each opt can be inspected.
  renderer = Device.default.renderer
  N = 64
  # `a` is used by every experiment below, including the live gemm at the bottom, so it has to be
  # created outside the disabled (string-quoted) experiments or the gemm fails with a NameError.
  a = Tensor.empty(N,N)

  # disabled experiment: elementwise kernel with a 32-wide upcast
  """
  out = (a + 1) #.sum(axis=2)
  ast = out.schedule()[-1].ast
  opts = tuple()
  opts += (Opt(OptOps.UPCAST, 0, 32),)
  ast = ast.replace(arg=KernelInfo(opts_to_apply=opts))
  ast = get_optimized_ast(ast, renderer)
  prg = get_program(ast, renderer)
  print(prg.src)
  """

  # how you split the store determines everything if you don't allow cross warp comms.
  # actually not everything, there's also the split before the horizontal (unrolled) reduces

  # new flow
  # - pull out any dimensions from the store that you want to upcast.
  # - decide how you want to assign them to registers. GPUs have a 512-byte memory LOAD/STORE which loads into 4 regs. see BUFFER_LOAD_B128
  #   - the loads and stores can be shuffled, but only in restrictive ways. in kernels without reduces, the store determines everything
  #   - it loads 16 bytes from up to 32 different places = 512 bytes
  # - in kernels with reduces, you now have more flexibility. the final target of the reduce must be what is stored
  #   - warp dimensions can be in the reduce (this is GROUP)

  # every dimension can be assigned to <global, local, loop, upcast, warp>

  # disabled experiment: the same reduce with UPCAST/UNROLL applied in both orders
  """
  out = a.sum(axis=1)
  ast = out.schedule()[-1].ast
  opts = tuple()
  opts += (Opt(OptOps.UPCAST, 0, 8),)
  opts += (Opt(OptOps.UNROLL, 0, 8),)
  ast = ast.replace(arg=KernelInfo(opts_to_apply=opts))
  ast = get_optimized_ast(ast, renderer)
  prg = get_program(ast, renderer)
  print(prg.src)

  out = a.sum(axis=1)
  ast = out.schedule()[-1].ast
  opts = tuple()
  opts += (Opt(OptOps.UNROLL, 0, 8),)
  opts += (Opt(OptOps.UPCAST, 0, 8),)
  ast = ast.replace(arg=KernelInfo(opts_to_apply=opts))
  ast = get_optimized_ast(ast, renderer)
  prg = get_program(ast, renderer)
  print(prg.src)
  """

  # gemm
  b = Tensor.empty(N,N)

  # metal TC
  #opts = (Opt(OptOps.UPCAST, 0, 2), # not the warp
  #        Opt(OptOps.UPCAST, 0, 2), Opt(OptOps.UPCAST, 1, 2), Opt(OptOps.UPCAST, 1, 2),
  #        Opt(OptOps.UPCAST, 0, 2), Opt(OptOps.UPCAST, 1, 2))

  # new TC should just be able to extract from this and swizzle as needed
  opts = (Opt(OptOps.UPCAST, 0, 8), Opt(OptOps.UPCAST, 1, 8), Opt(OptOps.UNROLL, 0, 8))

  c = (a@b)
  # take the AST of the last scheduled kernel and attach the opts to apply to it
  ast = c.schedule()[-1].ast
  ast = ast.replace(arg=KernelInfo(opts_to_apply=opts))
  ast = get_optimized_ast(ast, renderer)
  prg = get_program(ast, renderer)
  print(prg.src)

View file

@ -37,6 +37,17 @@ def get_rewrites_for_renderer(opts:Renderer, linearizer:bool=True) -> list[Rewri
# cache with the values of the context vars
return _get_rewrites_for_renderer(opts, linearizer, QUANTIZE.value, DEVECTORIZE.value, TRANSCENDENTAL.value)
# tensor cores
from tinygrad.uop.ops import PatternMatcher, UPat, UOp
def tensor_cores(a:UOp, b:UOp, r:UOp):
  """Placeholder callback for the tensor-core pattern: announce the match and do nothing else.

  Args:
    a: UOp bound to the name 'a' by the pattern that dispatches to this function.
    b: UOp bound to the name 'b' by that pattern.
    r: UOp bound to the name 'r' by that pattern.

  Returns:
    Always None; none of the arguments are inspected or modified.
  """
  print("use tensor cores")
  return None
# Pattern table with a single rule. The pattern is a reduce (captured as 'r') whose input is the
# product of two GEP'd values (captured as 'a' and 'b'); the paired callback is tensor_cores.
# NOTE(review): allow_any_len=True presumably lets the reduce carry additional sources beyond the
# product -- confirm against UPat before relying on it.
pm_tensor_cores = PatternMatcher([
((UPat.var().gep(name='a') * UPat.var().gep(name='b')).reduce(name='r', allow_any_len=True), tensor_cores),
])
@functools.cache
def _get_rewrites_for_renderer(opts:Renderer, linearizer:bool, _QUANTIZE, _DEVECTORIZE, _TRANSCENDENTAL) -> list[RewriteStep]:
# ** lowerer (rewrite_shapetracker_with_index) **
@ -50,10 +61,16 @@ def _get_rewrites_for_renderer(opts:Renderer, linearizer:bool, _QUANTIZE, _DEVEC
# expand
ret.append(RewriteStep(sym+expander, name="expander"))
# use tensor cores
ret.append(RewriteStep(pm_tensor_cores, name="tensor cores"))
# ** devectorizer (full_graph_rewrite) **
# remove reduce
ret.append(RewriteStep(pm_reduce+gep_pushing, lambda _: ReduceContext(), name="remove_reduce"))
# factorize warp (before gpu dims)
#ret.append(RewriteStep(pm_warp, name="warpcast"))
# add gpu dims (late)
ret.append(RewriteStep(pm_add_gpudims, lambda _: opts, name="add gpudims"))

View file

@ -393,7 +393,7 @@ class Kernel:
elif opt.op is OptOps.UPCAST: # yellow
check(axis < self.first_reduce, "upcast is for non-reduce")
check(not (self.tensor_core and self.global_dims <= axis < self.global_dims+len(self.tensor_core.get_local_axes())), "can't upcast TC locals")
check((self.opts is not None and self.opts.device == "DSP") or amt <= 16, "don't upcast more than 16")
#check((self.opts is not None and self.opts.device == "DSP") or amt <= 16, "don't upcast more than 16")
self.shift_to(axis, amt, insert_before=None)
self.upcast()
elif opt.op is OptOps.NOLOCALS:
@ -455,7 +455,8 @@ class Kernel:
return ret.replace(src=(ret.src[0].replace(arg=st),)+ret.src[1:])
if op.op is Ops.SINK:
# NOTE: should group_for_reduces be added to the local_dims?
return ret.replace(arg = KernelInfo(ret.arg.name if ret.arg is not None else self.name if name_override is None else name_override,
return ret.replace(arg = KernelInfo((ret.arg.name if ret.arg is not None and ret.arg.name is not None else self.name) \
if name_override is None else name_override,
self.global_dims if self.opts.has_local else 0, self.local_dims+self.group_for_reduces,
self.upcasted, self.dont_use_locals, tuple(self.applied_opts)))
if op.op is Ops.REDUCE_AXIS:

View file

@ -135,7 +135,7 @@ class CStyleLanguage(Renderer):
name = "test"
for u in uops:
if u.op is Ops.SINK:
if u.arg is not None: name = u.arg.function_name
if u.arg is not None and u.arg.name is not None: name = u.arg.function_name
continue
if u.op in (Ops.DEFINE_GLOBAL, Ops.DEFINE_VAR):
r[u] = f"data{u.arg}" if u.op is Ops.DEFINE_GLOBAL else u.arg[0]

View file

@ -527,7 +527,7 @@ class UOp(MathTrait, metaclass=UOpMetaClass):
@dataclass(frozen=True)
class KernelInfo:
name: str = "test" # name of the kernel
name: str|None = None # name of the kernel
global_dims: int = 0 # number of global dimensions (this is remapping RANGE to SPECIAL)
local_dims: int = 0 # number of local dimensions (this is remapping RANGE to SPECIAL)
upcasted: int = 0 # count that are upcasted (this is remapping RANGE to UNROLL)