mirror of
https://github.com/tinygrad/tinygrad.git
synced 2026-06-24 02:14:17 +00:00
Compare commits
7 commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
383c0bf05e | ||
|
|
fd2c0e2626 | ||
|
|
5907f2d443 |
||
|
|
fc59db0aaa |
||
|
|
256d4403c5 | ||
|
|
ad823a5199 |
||
|
|
0df4355cd8 |
5 changed files with 95 additions and 4 deletions
73
extra/upcasted_warps.py
Normal file
73
extra/upcasted_warps.py
Normal file
|
|
@ -0,0 +1,73 @@
|
|||
# play with upcasted warps
|
||||
from tinygrad import Tensor, Device
|
||||
from tinygrad.uop.ops import KernelInfo
|
||||
from tinygrad.opt import get_optimized_ast
|
||||
from tinygrad.opt.kernel import OptOps, Opt
|
||||
from tinygrad.engine.realize import get_program
|
||||
|
||||
if __name__ == "__main__":
|
||||
renderer = Device.default.renderer
|
||||
N = 64
|
||||
|
||||
"""
|
||||
a = Tensor.empty(N,N)
|
||||
|
||||
out = (a + 1) #.sum(axis=2)
|
||||
ast = out.schedule()[-1].ast
|
||||
opts = tuple()
|
||||
opts += (Opt(OptOps.UPCAST, 0, 32),)
|
||||
ast = ast.replace(arg=KernelInfo(opts_to_apply=opts))
|
||||
ast = get_optimized_ast(ast, renderer)
|
||||
prg = get_program(ast, renderer)
|
||||
print(prg.src)
|
||||
"""
|
||||
|
||||
# how you split the store determines everything if you don't allow cross-warp communication.
|
||||
# actually not everything, there's also the split before the horizontal (unrolled) reduces
|
||||
|
||||
# new flow
|
||||
# - pull out any dimensions from the store that you want to upcast.
|
||||
# - decide how you want to assign them to registers. GPUs have a 512-byte memory LOAD/STORE that loads into 4 regs; see BUFFER_LOAD_B128
|
||||
# - the loads and stores can be shuffled, but only in restrictive ways. in kernels without reduces, the store determines everything
|
||||
# - it loads 16 bytes from up to 32 different places = 512 bytes
|
||||
# - in kernels with reduces, you now have more flexibility. the final target of the reduce must be what is stored
|
||||
# - warp dimensions can be in the reduce (this is GROUP)
|
||||
|
||||
# every dimension can be assigned to <global, local, loop, upcast, warp>
|
||||
|
||||
"""
|
||||
out = a.sum(axis=1)
|
||||
ast = out.schedule()[-1].ast
|
||||
opts = tuple()
|
||||
opts += (Opt(OptOps.UPCAST, 0, 8),)
|
||||
opts += (Opt(OptOps.UNROLL, 0, 8),)
|
||||
ast = ast.replace(arg=KernelInfo(opts_to_apply=opts))
|
||||
ast = get_optimized_ast(ast, renderer)
|
||||
prg = get_program(ast, renderer)
|
||||
print(prg.src)
|
||||
|
||||
out = a.sum(axis=1)
|
||||
ast = out.schedule()[-1].ast
|
||||
opts = tuple()
|
||||
opts += (Opt(OptOps.UNROLL, 0, 8),)
|
||||
opts += (Opt(OptOps.UPCAST, 0, 8),)
|
||||
ast = ast.replace(arg=KernelInfo(opts_to_apply=opts))
|
||||
ast = get_optimized_ast(ast, renderer)
|
||||
prg = get_program(ast, renderer)
|
||||
print(prg.src)
|
||||
"""
|
||||
|
||||
# gemm
|
||||
b = Tensor.empty(N,N)
|
||||
# metal TC
|
||||
#opts = (Opt(OptOps.UPCAST, 0, 2), # not the warp
|
||||
# Opt(OptOps.UPCAST, 0, 2), Opt(OptOps.UPCAST, 1, 2), Opt(OptOps.UPCAST, 1, 2),
|
||||
# Opt(OptOps.UPCAST, 0, 2), Opt(OptOps.UPCAST, 1, 2))
|
||||
# new TC should just be able to extract from this and swizzle as needed
|
||||
opts = (Opt(OptOps.UPCAST, 0, 8), Opt(OptOps.UPCAST, 1, 8), Opt(OptOps.UNROLL, 0, 8))
|
||||
c = (a@b)
|
||||
ast = c.schedule()[-1].ast
|
||||
ast = ast.replace(arg=KernelInfo(opts_to_apply=opts))
|
||||
ast = get_optimized_ast(ast, renderer)
|
||||
prg = get_program(ast, renderer)
|
||||
print(prg.src)
|
||||
|
|
@ -37,6 +37,17 @@ def get_rewrites_for_renderer(opts:Renderer, linearizer:bool=True) -> list[Rewri
|
|||
# cache with the values of the context vars
|
||||
return _get_rewrites_for_renderer(opts, linearizer, QUANTIZE.value, DEVECTORIZE.value, TRANSCENDENTAL.value)
|
||||
|
||||
# tensor cores
|
||||
|
||||
from tinygrad.uop.ops import PatternMatcher, UPat, UOp
|
||||
|
||||
def tensor_cores(a:UOp, b:UOp, r:UOp):
|
||||
print("use tensor cores")
|
||||
|
||||
pm_tensor_cores = PatternMatcher([
|
||||
((UPat.var().gep(name='a') * UPat.var().gep(name='b')).reduce(name='r', allow_any_len=True), tensor_cores),
|
||||
])
|
||||
|
||||
@functools.cache
|
||||
def _get_rewrites_for_renderer(opts:Renderer, linearizer:bool, _QUANTIZE, _DEVECTORIZE, _TRANSCENDENTAL) -> list[RewriteStep]:
|
||||
# ** lowerer (rewrite_shapetracker_with_index) **
|
||||
|
|
@ -50,10 +61,16 @@ def _get_rewrites_for_renderer(opts:Renderer, linearizer:bool, _QUANTIZE, _DEVEC
|
|||
# expand
|
||||
ret.append(RewriteStep(sym+expander, name="expander"))
|
||||
|
||||
# use tensor cores
|
||||
ret.append(RewriteStep(pm_tensor_cores, name="tensor cores"))
|
||||
|
||||
# ** devectorizer (full_graph_rewrite) **
|
||||
# remove reduce
|
||||
ret.append(RewriteStep(pm_reduce+gep_pushing, lambda _: ReduceContext(), name="remove_reduce"))
|
||||
|
||||
# factorize warp (before gpu dims)
|
||||
#ret.append(RewriteStep(pm_warp, name="warpcast"))
|
||||
|
||||
# add gpu dims (late)
|
||||
ret.append(RewriteStep(pm_add_gpudims, lambda _: opts, name="add gpudims"))
|
||||
|
||||
|
|
|
|||
|
|
@ -393,7 +393,7 @@ class Kernel:
|
|||
elif opt.op is OptOps.UPCAST: # yellow
|
||||
check(axis < self.first_reduce, "upcast is for non-reduce")
|
||||
check(not (self.tensor_core and self.global_dims <= axis < self.global_dims+len(self.tensor_core.get_local_axes())), "can't upcast TC locals")
|
||||
check((self.opts is not None and self.opts.device == "DSP") or amt <= 16, "don't upcast more than 16")
|
||||
#check((self.opts is not None and self.opts.device == "DSP") or amt <= 16, "don't upcast more than 16")
|
||||
self.shift_to(axis, amt, insert_before=None)
|
||||
self.upcast()
|
||||
elif opt.op is OptOps.NOLOCALS:
|
||||
|
|
@ -455,7 +455,8 @@ class Kernel:
|
|||
return ret.replace(src=(ret.src[0].replace(arg=st),)+ret.src[1:])
|
||||
if op.op is Ops.SINK:
|
||||
# NOTE: should group_for_reduces be added to the local_dims?
|
||||
return ret.replace(arg = KernelInfo(ret.arg.name if ret.arg is not None else self.name if name_override is None else name_override,
|
||||
return ret.replace(arg = KernelInfo((ret.arg.name if ret.arg is not None and ret.arg.name is not None else self.name) \
|
||||
if name_override is None else name_override,
|
||||
self.global_dims if self.opts.has_local else 0, self.local_dims+self.group_for_reduces,
|
||||
self.upcasted, self.dont_use_locals, tuple(self.applied_opts)))
|
||||
if op.op is Ops.REDUCE_AXIS:
|
||||
|
|
|
|||
|
|
@ -135,7 +135,7 @@ class CStyleLanguage(Renderer):
|
|||
name = "test"
|
||||
for u in uops:
|
||||
if u.op is Ops.SINK:
|
||||
if u.arg is not None: name = u.arg.function_name
|
||||
if u.arg is not None and u.arg.name is not None: name = u.arg.function_name
|
||||
continue
|
||||
if u.op in (Ops.DEFINE_GLOBAL, Ops.DEFINE_VAR):
|
||||
r[u] = f"data{u.arg}" if u.op is Ops.DEFINE_GLOBAL else u.arg[0]
|
||||
|
|
|
|||
|
|
@ -527,7 +527,7 @@ class UOp(MathTrait, metaclass=UOpMetaClass):
|
|||
|
||||
@dataclass(frozen=True)
|
||||
class KernelInfo:
|
||||
name: str = "test" # name of the kernel
|
||||
name: str|None = None # name of the kernel
|
||||
global_dims: int = 0 # number of global dimensions (this is remapping RANGE to SPECIAL)
|
||||
local_dims: int = 0 # number of local dimensions (this is remapping RANGE to SPECIAL)
|
||||
upcasted: int = 0 # count that are upcasted (this is remapping RANGE to UNROLL)
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue