mirror of
https://github.com/tinygrad/tinygrad.git
synced 2026-06-24 02:14:17 +00:00
parent
c9dc1d63cc
commit
7c1d0b6d9a
3 changed files with 17 additions and 17 deletions
|
|
@ -143,14 +143,17 @@ def make_getaddr(u, device=None):
|
|||
def make_ins(op, *srcs):
|
||||
return UOp(Ops.INS, dtypes.void, tuple(UOp.const(dtypes.uint32, s) if isinstance(s, int) else s.cast(dtypes.uint32) for s in srcs), op)
|
||||
|
||||
def make_patch(buf:UOp, off:sint, val:UOp, dtype=None) -> UOp:
|
||||
dt = dtype or val.dtype
|
||||
return UOp(Ops.SHRINK, buf.dtype.base, (buf, UOp.const(dtypes.int, off), UOp.const(dtypes.int, dt.itemsize))).bitcast(dt).store(val.cast(dt))
|
||||
|
||||
def make_cmdbuf(lin, devs, tag):
|
||||
blob, patches = b'', []
|
||||
for s in (s for ins in lin.src for s in ins.src):
|
||||
if s.op is not Ops.CONST: patches.append((len(blob), s))
|
||||
blob += struct.pack(f'<{s.dtype.fmt}', s.arg if s.op is Ops.CONST else 0x0)
|
||||
buf = UOp.new_buffer(devs, len(blob), dtypes.uint8).rtag(tag)
|
||||
stores = [buf.index(UOp.const(dtypes.int, off), dtype=buf.dtype.ptr()).cast(s.dtype.ptr()).store(s) for off, s in patches]
|
||||
return buf.after(buf.store(UOp(Ops.BINARY, dtypes.void, src=(), arg=blob)), *stores)
|
||||
return buf.after(buf.store(UOp(Ops.BINARY, dtypes.void, src=(), arg=blob)), *[make_patch(buf, off, s) for off, s in patches])
|
||||
|
||||
def make_mstack(uops): return uops[0] if len(uops) == 1 else UOp(Ops.MSTACK, uops[0].dtype, tuple(uops))
|
||||
|
||||
|
|
@ -211,15 +214,11 @@ def prep_program(call:UOp, prg:UOp) -> UOp|None:
|
|||
return prg.replace(src=(buf.after(buf.store(blob)),), arg=(data, prg.arg)).call(*call.src[1:], aux=HCQInfo.from_call(call))
|
||||
|
||||
def prep_kernargs(call:UOp, prg:UOp) -> UOp:
|
||||
data, info = prg.arg
|
||||
patches = [(i*dtypes.uint64.itemsize, UOp(Ops.GETADDR, dtypes.uint64, src=(call.src[1+gi], UOp(Ops.DEVICE, arg=call.src[1+gi].device))),
|
||||
dtypes.uint64) for i,gi in enumerate(info.globals)] \
|
||||
+ [(len(info.globals)*dtypes.uint64.itemsize + i*dtypes.uint32.itemsize, v, dtypes.uint32) for i,v in enumerate(info.vars)]
|
||||
|
||||
buf = UOp.new_buffer(call.src[1].device, data.kernargs_alloc_size, dtypes.uint8).rtag("kernargs")
|
||||
kernargs = buf.after(*tuple(buf.index(UOp.const(dtypes.int, o), dtype=buf.dtype.ptr()).cast(dt.ptr()).store(val.cast(dt)) for o, val, dt in patches))
|
||||
|
||||
return call.replace(src=(prg.replace(src=prg.src + (kernargs,), arg=(data, info)),) + call.src[1:])
|
||||
(data, info), dev_uop = prg.arg, UOp(Ops.DEVICE, arg=call.src[1].device)
|
||||
buf = UOp.new_buffer(dev_uop.arg, data.kernargs_alloc_size, dtypes.uint8).rtag("kernargs")
|
||||
patches = [make_patch(buf, i*8, UOp(Ops.GETADDR, dtypes.uint64, src=(call.src[1+gi], dev_uop))) for i,gi in enumerate(info.globals)] \
|
||||
+ [make_patch(buf, len(info.globals)*8 + i*4, v, dtypes.uint32) for i,v in enumerate(info.vars)]
|
||||
return call.replace(src=(prg.replace(src=prg.src + (buf.after(*patches),), arg=(data, info)),) + call.src[1:])
|
||||
|
||||
pm_prep_runtime = PatternMatcher([
|
||||
# bind generic PROGRAM device to the call's actual dev(s), then run device-specific lowering
|
||||
|
|
@ -532,9 +531,9 @@ pm_resolve_patches = PatternMatcher([
|
|||
(UPat(GroupOp.ALU, src=[UPat(Ops.STACK, name="s"), UPat(Ops.CONST)], name="op"), push_stack),
|
||||
(UPat(Ops.CAST, src=(UPat(Ops.STACK, name="s"),), name="op"), push_stack),
|
||||
|
||||
# index on slice is index
|
||||
(UPat(Ops.INDEX, src=(UPat(Ops.SLICE, name="bv"), UPat()), name="idx", allow_any_len=True),
|
||||
lambda idx, bv: idx.replace(src=(bv.src[0], idx.src[1] + bv.src[1].cast(idx.src[1].dtype), *idx.src[2:]))),
|
||||
# shrink on slice is shrink on base at offset
|
||||
(UPat(Ops.SHRINK, src=(UPat(Ops.SLICE, name="bv"), UPat(), UPat()), name="shr"),
|
||||
lambda shr, bv: shr.replace(src=(bv.src[0], shr.src[1] + bv.src[1].cast(shr.src[1].dtype), shr.src[2]))),
|
||||
|
||||
# getaddr
|
||||
(UPat(Ops.GETADDR, src=(UPat(Ops.SLICE, name="bv"), UPat(Ops.DEVICE, name="dev"))), resolve_getaddr_slice), # getaddr(slice(x)) -> offset+getaddr(x)
|
||||
|
|
@ -542,8 +541,8 @@ pm_resolve_patches = PatternMatcher([
|
|||
|
||||
# folders
|
||||
(UPat({Ops.BUFFER, Ops.SLICE, Ops.MSTACK}, name="buf").store(UPat(Ops.BINARY, name="blob")), fold_blob_store),
|
||||
(UPat({Ops.BUFFER, Ops.SLICE, Ops.MSTACK}, name="buf").index(UPat.cvar("off")).or_casted().store(UPat.any(UPat.cvar("val"), UPat(Ops.STACK, name="val"))),
|
||||
fold_const_store),
|
||||
(UPat(Ops.SHRINK, src=(UPat({Ops.BUFFER, Ops.SLICE, Ops.MSTACK}, name="buf"), UPat.cvar("off"), UPat(Ops.CONST))).bitcast()
|
||||
.store(UPat.any(UPat.cvar("val"), UPat(Ops.STACK, name="val"))), fold_const_store),
|
||||
]) + symbolic_simple
|
||||
|
||||
# *****************
|
||||
|
|
|
|||
|
|
@ -22,6 +22,7 @@ base_rewrite = PatternMatcher([
|
|||
(UPat(Ops.CAST, name="x"), lambda ctx,x: f"__builtin_convertvector({ctx[x.src[0]]}, {ctx.render_type(x)})" \
|
||||
if x.max_numel() > 1 and x.addrspace is AddrSpace.REG else None),
|
||||
(UPat(Ops.CAST, name="x"), lambda ctx,x: f"({ctx.render_cast(x, ctx[x.src[0]])})"),
|
||||
(UPat(Ops.BITCAST, name="x"), lambda ctx,x: ctx[x.src[0]] if x.addrspace in (AddrSpace.GLOBAL, AddrSpace.LOCAL) else None),
|
||||
(UPat(Ops.BITCAST, name="x"), lambda ctx,x: f"__builtin_bit_cast({ctx.render_type(x)}, ({ctx.render_type(x.src[0])})({ctx[x.src[0]]}))"),
|
||||
|
||||
# GPU stuff
|
||||
|
|
|
|||
|
|
@ -18,7 +18,7 @@ import sys
|
|||
sys.setrecursionlimit(10000)
|
||||
|
||||
def add_ranges_to_store(ctx, x):
|
||||
if x.src[0]._shape is None or x.src[1]._shape is None or x.src[0].shape == (): return None
|
||||
if x.src[0]._shape is None or x.src[1]._shape is None or x.src[0].shape == () or x.src[0].max_numel() == x.src[1].max_numel() == 1: return None
|
||||
assert x.src[0].shape == x.src[1].shape, "bad store shape"
|
||||
idxs = [UOp.range(r, next(ctx), AxisType.LOOP) for r in x.src[0].shape]
|
||||
return UOp.store(x.src[0].index(*idxs), x.src[1].index(*idxs)).end(*idxs)
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue