MULTICORE=1 PYTHONPATH=. QUANTIZE=1 DEBUG=2 DEVECTORIZE=0 python3 extra/replay_pkl.py /tmp/im.pkl

This commit is contained in:
George Hotz 2025-03-31 15:37:07 +08:00
commit eb606d7230
3 changed files with 17 additions and 2 deletions

View file

@ -27,5 +27,9 @@ if __name__ == "__main__":
#if knum == 13: k.apply_opt(Opt(OptOps.UPCAST, 0, 4))
p2 = k.to_program()
new_ei = replace(ei, prg=CompiledRunner(p2), bufs=dsp_bufs)
new_ei.run()
if getenv("MULTICORE", 0) == 1:
new_ei.run({p2.vars[0]: 0})
new_ei.run({p2.vars[0]: 1})
else:
new_ei.run()
knum += 1

View file

@ -482,7 +482,7 @@ class Kernel:
k.apply_opt(Opt(OptOps.UPCAST, 0, 128))
# make all non first dimensions local
if getenv("MULTICORE", 0) and len(k.full_shape) >= 1 and k.full_shape[0] > 1:
if getenv("MULTICORE", 0) == 2 and len(k.full_shape) >= 1 and k.full_shape[0] > 1:
if k.full_shape[0]%2 == 1: k.apply_opt(Opt(OptOps.PADTO, 0, 2))
if k.full_shape[0] > 2: k.apply_opt(Opt(OptOps.LOCAL, 0, k.full_shape[0]//2))
for i in range(1, k.first_reduce-1): k.apply_opt(Opt(OptOps.LOCAL, 1, 0))

View file

@ -270,6 +270,14 @@ def vectorize_shuffle(vec:UOp):
#src = "__builtin_shufflevector({0}, {1})"
return None
def multicore_range(r:UOp):
    """Rewrite a range so that each of two cores iterates over one half of it.

    The bounds of ``r`` are read from ``r.src[0]`` (start) and ``r.src[1]``
    (end). A ``DEFINE_VAR`` called ``"core"`` is created with
    ``arg=("core", 0, 1)`` -- presumably (name, min, max); confirm against the
    UOp definition -- and the bounds are made conditional on it:

    * ``core == 0``: iterate ``[r.src[0], r.src[1]//2)``
    * otherwise:     iterate ``[r.src[1]//2, r.src[1])``

    Returns:
        ``r`` with its ``src`` replaced by the new ``(start, end)`` pair, or
        ``None`` when the rewrite does not apply: ``MULTICORE`` is not ``1``,
        or a ``DEFINE_VAR`` is already reachable from ``r``.
    """
    # This rewrite is opt-in; it only runs when MULTICORE is exactly 1.
    if getenv("MULTICORE", 0) != 1:
        return None

    # Leave the range alone if any DEFINE_VAR already appears in its graph.
    # NOTE(review): this looks like it also stops the rule from matching its
    # own output again, since the rewritten bounds contain "core" -- confirm.
    for node in r.toposort:
        if node.op is Ops.DEFINE_VAR:
            return None

    core_var = UOp(Ops.DEFINE_VAR, dtypes.int, arg=("core", 0, 1))
    on_first_core = core_var.eq(0)
    lower, upper = r.src[0], r.src[1]
    midpoint = upper // 2

    # Core 0 takes the first half of the range, the other core takes the rest.
    split_bounds = (
        on_first_core.where(lower, midpoint),
        on_first_core.where(midpoint, upper),
    )
    return r.replace(src=split_bounds)
dsp_pm_late = PatternMatcher([
# prefetch L1
(UPat(Ops.LOAD, dtype=(dtypes.uchar.vec(4), dtypes.uchar.vec(8)), src=(UPat(Ops.INDEX, name="idx").cast(),), name="ld"), prefetch_l1),
@ -307,6 +315,9 @@ dsp_pm_late = PatternMatcher([
lambda x,y: x//UOp(Ops.CUSTOMI,x.dtype,(y,),arg="{0}") if x.op is not Ops.CUSTOMI or x.arg != "{0}" else None),
(UPat(Ops.DEFINE_ACC, src=(UPat(Ops.VECTORIZE, src=UPat(Ops.CONST, arg=0)),), dtype=dtypes.uchar.vec(128), name="d", allow_any_len=True),
lambda d: d.replace(src=(UOp(Ops.CUSTOMI, d.dtype, arg="__builtin_HEXAGON_V6_vd0_128B()"),)+d.src[1:])),
# multicore
(UPat(Ops.RANGE, name="r", arg=0), multicore_range),
])
pretty_render = PatternMatcher([