notes

uc warp
Merge branch 'master' into warp_fun
2026-06-24 02:14:17 +00:00 · 2025-07-01 10:36:05 -07:00 · 2025-06-30 20:54:29 -07:00 · 2025-06-30 18:18:52 -07:00 · 2025-06-30 15:54:57 -07:00 · 2025-06-29 09:26:32 -07:00
5 changed files with 95 additions and 4 deletions
--- a/extra/upcasted_warps.py
+++ b/extra/upcasted_warps.py
@@ -0,0 +1,73 @@
+# play with upcasted warps
+from tinygrad import Tensor, Device
+from tinygrad.uop.ops import KernelInfo
+from tinygrad.opt import get_optimized_ast
+from tinygrad.opt.kernel import OptOps, Opt
+from tinygrad.engine.realize import get_program
+
+if __name__ == "__main__":
+  renderer = Device.default.renderer
+  N = 64
+
+  """
+  a = Tensor.empty(N,N)
+
+  out = (a + 1) #.sum(axis=2)
+  ast = out.schedule()[-1].ast
+  opts = tuple()
+  opts += (Opt(OptOps.UPCAST, 0, 32),)
+  ast = ast.replace(arg=KernelInfo(opts_to_apply=opts))
+  ast = get_optimized_ast(ast, renderer)
+  prg = get_program(ast, renderer)
+  print(prg.src)
+  """
+
+  # how you split the store determines everything if you don't allow cross warp comms.
+  # actually not everything, there's also the split before the horizontal (unrolled) reduces
+
+  # new flow
+  #  - pull out any dimensions from the store that you want to upcast.
+  #  - decide how you want to assign them to registers. GPUs have a 512-byte memory LOAD/STORE which loads into 4 regs. see BUFFER_LOAD_B128
+  #    - the loads and stores can be shuffled, but only in restrictive ways. in kernels without reduces, the store determines everything
+  #    - it loads 16 bytes from up to 32 different places = 512 bytes
+  #  - in kernels with reduces, you now have more flexibility. the final target of the reduce must be what is stored
+  #    - warp dimensions can be in the reduce (this is GROUP)
+
+  # every dimension can be assigned to <global, local, loop, upcast, warp>
+
+  """
+  out = a.sum(axis=1)
+  ast = out.schedule()[-1].ast
+  opts = tuple()
+  opts += (Opt(OptOps.UPCAST, 0, 8),)
+  opts += (Opt(OptOps.UNROLL, 0, 8),)
+  ast = ast.replace(arg=KernelInfo(opts_to_apply=opts))
+  ast = get_optimized_ast(ast, renderer)
+  prg = get_program(ast, renderer)
+  print(prg.src)
+
+  out = a.sum(axis=1)
+  ast = out.schedule()[-1].ast
+  opts = tuple()
+  opts += (Opt(OptOps.UNROLL, 0, 8),)
+  opts += (Opt(OptOps.UPCAST, 0, 8),)
+  ast = ast.replace(arg=KernelInfo(opts_to_apply=opts))
+  ast = get_optimized_ast(ast, renderer)
+  prg = get_program(ast, renderer)
+  print(prg.src)
+  """
+
+  # gemm
+  b = Tensor.empty(N,N)
+  # metal TC
+  #opts = (Opt(OptOps.UPCAST, 0, 2), # not the warp
+  #        Opt(OptOps.UPCAST, 0, 2), Opt(OptOps.UPCAST, 1, 2), Opt(OptOps.UPCAST, 1, 2),
+  #        Opt(OptOps.UPCAST, 0, 2), Opt(OptOps.UPCAST, 1, 2))
+  # new TC should just be able to extract from this and swizzle as needed
+  opts = (Opt(OptOps.UPCAST, 0, 8), Opt(OptOps.UPCAST, 1, 8), Opt(OptOps.UNROLL, 0, 8))
+  c = (a@b)
+  ast = c.schedule()[-1].ast
+  ast = ast.replace(arg=KernelInfo(opts_to_apply=opts))
+  ast = get_optimized_ast(ast, renderer)
+  prg = get_program(ast, renderer)
+  print(prg.src)
--- a/tinygrad/codegen/__init__.py
+++ b/tinygrad/codegen/__init__.py
@@ -37,6 +37,17 @@ def get_rewrites_for_renderer(opts:Renderer, linearizer:bool=True) -> list[Rewri
  # cache with the values of the context vars
  return _get_rewrites_for_renderer(opts, linearizer, QUANTIZE.value, DEVECTORIZE.value, TRANSCENDENTAL.value)

+# tensor cores
+
+from tinygrad.uop.ops import PatternMatcher, UPat, UOp
+
+def tensor_cores(a:UOp, b:UOp, r:UOp):
+  print("use tensor cores")
+
+pm_tensor_cores = PatternMatcher([
+  ((UPat.var().gep(name='a') * UPat.var().gep(name='b')).reduce(name='r', allow_any_len=True), tensor_cores),
+])
+
@functools.cache
 def _get_rewrites_for_renderer(opts:Renderer, linearizer:bool, _QUANTIZE, _DEVECTORIZE, _TRANSCENDENTAL) -> list[RewriteStep]:
  # ** lowerer (rewrite_shapetracker_with_index) **
@@ -50,10 +61,16 @@ def _get_rewrites_for_renderer(opts:Renderer, linearizer:bool, _QUANTIZE, _DEVEC
  # expand
  ret.append(RewriteStep(sym+expander, name="expander"))

+  # use tensor cores
+  ret.append(RewriteStep(pm_tensor_cores, name="tensor cores"))
+
  # ** devectorizer (full_graph_rewrite) **
  # remove reduce
  ret.append(RewriteStep(pm_reduce+gep_pushing, lambda _: ReduceContext(), name="remove_reduce"))

+  # factorize warp (before gpu dims)
+  #ret.append(RewriteStep(pm_warp, name="warpcast"))
+
  # add gpu dims (late)
  ret.append(RewriteStep(pm_add_gpudims, lambda _: opts, name="add gpudims"))

--- a/tinygrad/opt/kernel.py
+++ b/tinygrad/opt/kernel.py
@@ -393,7 +393,7 @@ class Kernel:
    elif opt.op is OptOps.UPCAST:                     # yellow
      check(axis < self.first_reduce, "upcast is for non-reduce")
      check(not (self.tensor_core and self.global_dims <= axis < self.global_dims+len(self.tensor_core.get_local_axes())), "can't upcast TC locals")
-      check((self.opts is not None and self.opts.device == "DSP") or amt <= 16, "don't upcast more than 16")
+      #check((self.opts is not None and self.opts.device == "DSP") or amt <= 16, "don't upcast more than 16")
      self.shift_to(axis, amt, insert_before=None)
      self.upcast()
    elif opt.op is OptOps.NOLOCALS:
@@ -455,7 +455,8 @@ class Kernel:
        return ret.replace(src=(ret.src[0].replace(arg=st),)+ret.src[1:])
      if op.op is Ops.SINK:
        # NOTE: should group_for_reduces be added to the local_dims?
-        return ret.replace(arg = KernelInfo(ret.arg.name if ret.arg is not None else self.name if name_override is None else name_override,
+        return ret.replace(arg = KernelInfo((ret.arg.name if ret.arg is not None and ret.arg.name is not None else self.name) \
+                                            if name_override is None else name_override,
                                            self.global_dims if self.opts.has_local else 0, self.local_dims+self.group_for_reduces,
                                            self.upcasted, self.dont_use_locals, tuple(self.applied_opts)))
      if op.op is Ops.REDUCE_AXIS:
--- a/tinygrad/renderer/cstyle.py
+++ b/tinygrad/renderer/cstyle.py
@@ -135,7 +135,7 @@ class CStyleLanguage(Renderer):
    name = "test"
    for u in uops:
      if u.op is Ops.SINK:
-        if u.arg is not None: name = u.arg.function_name
+        if u.arg is not None and u.arg.name is not None: name = u.arg.function_name
        continue
      if u.op in (Ops.DEFINE_GLOBAL, Ops.DEFINE_VAR):
        r[u] = f"data{u.arg}" if u.op is Ops.DEFINE_GLOBAL else u.arg[0]
--- a/tinygrad/uop/ops.py
+++ b/tinygrad/uop/ops.py
@@ -527,7 +527,7 @@ class UOp(MathTrait, metaclass=UOpMetaClass):

@dataclass(frozen=True)
 class KernelInfo:
-  name: str = "test"            # name of the kernel
+  name: str|None = None         # name of the kernel
  global_dims: int = 0          # number of global dimensions (this is remapping RANGE to SPECIAL)
  local_dims: int = 0           # number of local dimensions  (this is remapping RANGE to SPECIAL)
  upcasted: int = 0             # count that are upcasted     (this is remapping RANGE to UNROLL)
Author	SHA1	Message	Date
George Hotz	383c0bf05e	notes	2025-07-01 10:36:05 -07:00
George Hotz	fd2c0e2626	uc warp	2025-06-30 20:54:29 -07:00
George Hotz	5907f2d443	Merge branch 'master' into warp_fun	2025-06-30 18:18:52 -07:00
George Hotz	fc59db0aaa	Merge branch 'master' into warp_fun	2025-06-30 15:54:57 -07:00
George Hotz	256d4403c5	what does order change	2025-06-29 09:26:32 -07:00
George Hotz	ad823a5199	Merge branch 'master' into warp_fun	2025-06-29 09:13:16 -07:00
George Hotz	0df4355cd8	upcasted warp experiments	2025-06-29 09:04:37 -07:00