mirror of
https://github.com/tinygrad/tinygrad.git
synced 2026-06-24 02:14:17 +00:00
switch to the new memory coalescer [pr] (#16716)
* switch to the new memory coalescer
* move that stuff
* copy in allowed length logic
* multiple buffers
* new coalescer is better
* fine
* earlier
* fixes
* work
* work
* valid
* stack on index const
This commit is contained in:
parent
dfea9e7994
commit
0a8e61d0c5
7 changed files with 90 additions and 18 deletions
|
|
@ -42,8 +42,8 @@ def _custom_quantize_fp8_with_amax(fp8_out:UOp, amax_partial:UOp, x:UOp, amax_st
|
|||
step = THREADS_PER_WG // 2
|
||||
while step:
|
||||
active = tid < step
|
||||
other = lds[tid + step].load(UOp.const(dtypes.float, 0.0), active)
|
||||
lds = lds.after(lds[tid].store(lds[tid].maximum(other), gate=active).barrier())
|
||||
other = lds[(tid + step).valid(active)].load()
|
||||
lds = lds.after(lds[tid.valid(active)].store(lds[tid].maximum(other)).barrier())
|
||||
step //= 2
|
||||
|
||||
amax_store = amax_partial[tid.eq(0).where(wg, UOp.invalid())].store(lds[0])
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue