mirror of
https://github.com/tinygrad/tinygrad.git
synced 2026-06-24 02:14:17 +00:00
Compare commits
2 commits
master
...
update_ben
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
04a8eca8e3 | ||
|
|
210b847b44 |
3 changed files with 17 additions and 9 deletions
10
.github/workflows/benchmark.yml
vendored
10
.github/workflows/benchmark.yml
vendored
|
|
@ -67,7 +67,7 @@ jobs:
|
||||||
- name: Test speed vs torch
|
- name: Test speed vs torch
|
||||||
run: BIG=2 MPS=1 python3.11 test/test_speed_v_torch.py | tee torch_speed.txt
|
run: BIG=2 MPS=1 python3.11 test/test_speed_v_torch.py | tee torch_speed.txt
|
||||||
- name: Test tensor cores
|
- name: Test tensor cores
|
||||||
run: METAL=1 python3.11 test/test_linearizer.py TestLinearizer.test_tensor_cores TestLinearizer.test_tensor_cores_emulation TestLinearizer.test_tensor_cores_padded TestLinearizer.test_tensor_cores_padded_uops
|
run: METAL=1 python3.11 test/test_linearizer.py TestLinearizer.test_tensor_cores TestLinearizer.test_tensor_cores_emulation TestLinearizer.test_tensor_cores_padded TestLinearizer.test_tensor_cores_padded_uops TestKernelOpts.test_tensor_core_opts
|
||||||
- name: Test AMX tensor cores
|
- name: Test AMX tensor cores
|
||||||
run: |
|
run: |
|
||||||
DEBUG=2 CPU=1 AMX=1 python3.11 test/test_linearizer.py TestLinearizer.test_tensor_cores TestLinearizer.test_tensor_cores_emulation TestLinearizer.test_tensor_cores_padded TestLinearizer.test_tensor_cores_padded_uops
|
DEBUG=2 CPU=1 AMX=1 python3.11 test/test_linearizer.py TestLinearizer.test_tensor_cores TestLinearizer.test_tensor_cores_emulation TestLinearizer.test_tensor_cores_padded TestLinearizer.test_tensor_cores_padded_uops
|
||||||
|
|
@ -196,8 +196,8 @@ jobs:
|
||||||
run: NV=1 python test/external/external_benchmark_multitensor_allreduce.py
|
run: NV=1 python test/external/external_benchmark_multitensor_allreduce.py
|
||||||
- name: Test tensor cores
|
- name: Test tensor cores
|
||||||
run: |
|
run: |
|
||||||
NV=1 ALLOW_TF32=1 python3 test/test_linearizer.py TestLinearizer.test_tensor_cores TestLinearizer.test_tensor_cores_emulation TestLinearizer.test_tensor_cores_padded TestLinearizer.test_tensor_cores_padded_uops
|
NV=1 ALLOW_TF32=1 python3 test/test_linearizer.py TestLinearizer.test_tensor_cores TestLinearizer.test_tensor_cores_emulation TestLinearizer.test_tensor_cores_padded TestLinearizer.test_tensor_cores_padded_uops TestKernelOpts.test_tensor_core_opts
|
||||||
PTX=1 ALLOW_TF32=1 NV=1 python3 test/test_linearizer.py TestLinearizer.test_tensor_cores TestLinearizer.test_tensor_cores_emulation TestLinearizer.test_tensor_cores_padded TestLinearizer.test_tensor_cores_padded_uops
|
PTX=1 ALLOW_TF32=1 NV=1 python3 test/test_linearizer.py TestLinearizer.test_tensor_cores TestLinearizer.test_tensor_cores_emulation TestLinearizer.test_tensor_cores_padded TestLinearizer.test_tensor_cores_padded_uops TestKernelOpts.test_tensor_core_opts
|
||||||
- name: Run Tensor Core GEMM (CUDA)
|
- name: Run Tensor Core GEMM (CUDA)
|
||||||
run: |
|
run: |
|
||||||
CUDA=1 SHOULD_USE_TC=1 HALF=1 DEBUG=2 python3 extra/gemm/simple_matmul.py | tee matmul.txt
|
CUDA=1 SHOULD_USE_TC=1 HALF=1 DEBUG=2 python3 extra/gemm/simple_matmul.py | tee matmul.txt
|
||||||
|
|
@ -396,8 +396,8 @@ jobs:
|
||||||
run: AMD=1 IGNORE_BEAM_CACHE=1 BEAM_DEBUG=1 DEBUG=1 python -m pytest -rA test/external/speed_v_theoretical.py --durations=20
|
run: AMD=1 IGNORE_BEAM_CACHE=1 BEAM_DEBUG=1 DEBUG=1 python -m pytest -rA test/external/speed_v_theoretical.py --durations=20
|
||||||
- name: Test tensor cores
|
- name: Test tensor cores
|
||||||
run: |
|
run: |
|
||||||
AMD=1 python3 test/test_linearizer.py TestLinearizer.test_tensor_cores TestLinearizer.test_tensor_cores_emulation TestLinearizer.test_tensor_cores_padded_amd TestLinearizer.test_tensor_cores_padded_uops
|
AMD=1 python3 test/test_linearizer.py TestLinearizer.test_tensor_cores TestLinearizer.test_tensor_cores_emulation TestLinearizer.test_tensor_cores_padded_amd TestLinearizer.test_tensor_cores_padded_uops TestKernelOpts.test_tensor_core_opts
|
||||||
AMD=1 AMD_LLVM=1 python3 test/test_linearizer.py TestLinearizer.test_tensor_cores TestLinearizer.test_tensor_cores_emulation TestLinearizer.test_tensor_cores_padded_amd TestLinearizer.test_tensor_cores_padded_uops
|
AMD=1 AMD_LLVM=1 python3 test/test_linearizer.py TestLinearizer.test_tensor_cores TestLinearizer.test_tensor_cores_emulation TestLinearizer.test_tensor_cores_padded_amd TestLinearizer.test_tensor_cores_padded_uops TestKernelOpts.test_tensor_core_opts
|
||||||
AMD=1 SHOULD_USE_TC=1 BFLOAT16=1 DEBUG=2 python3 extra/gemm/simple_matmul.py
|
AMD=1 SHOULD_USE_TC=1 BFLOAT16=1 DEBUG=2 python3 extra/gemm/simple_matmul.py
|
||||||
- name: Run Tensor Core GEMM (AMD)
|
- name: Run Tensor Core GEMM (AMD)
|
||||||
run: AMD=1 SHOULD_USE_TC=1 HALF=1 DEBUG=2 ATOL=2e-2 python3 extra/gemm/simple_matmul.py | tee matmul_amd.txt
|
run: AMD=1 SHOULD_USE_TC=1 HALF=1 DEBUG=2 ATOL=2e-2 python3 extra/gemm/simple_matmul.py | tee matmul_amd.txt
|
||||||
|
|
|
||||||
|
|
@ -2027,7 +2027,13 @@ class TestKernelOpts(unittest.TestCase):
|
||||||
[Opt(OptOps.UNROLL, 0, 2), Opt(OptOps.UPCAST, 0, 4)],
|
[Opt(OptOps.UNROLL, 0, 2), Opt(OptOps.UPCAST, 0, 4)],
|
||||||
[Opt(OptOps.UPCAST, 0, 4), Opt(OptOps.UNROLL, 0, 2), Opt(OptOps.UPCAST, 1, 4)],
|
[Opt(OptOps.UPCAST, 0, 4), Opt(OptOps.UNROLL, 0, 2), Opt(OptOps.UPCAST, 1, 4)],
|
||||||
[Opt(OptOps.UNROLL, 0, 2), Opt(OptOps.UPCAST, 1, 4), Opt(OptOps.UPCAST, 0, 4), Opt(OptOps.UNROLL, 0, 4)],
|
[Opt(OptOps.UNROLL, 0, 2), Opt(OptOps.UPCAST, 1, 4), Opt(OptOps.UPCAST, 0, 4), Opt(OptOps.UNROLL, 0, 4)],
|
||||||
# [Opt(OptOps.GROUP, 0, 2)] # doesn't work because group_for_reduce dims become early locals (conflicting with TC)
|
[Opt(OptOps.GROUP, 0, 2)],
|
||||||
|
[Opt(OptOps.GROUPTOP, 0, 4)],
|
||||||
|
[Opt(OptOps.UPCAST, 0, 4), Opt(OptOps.GROUP, 0, 2)],
|
||||||
|
[Opt(OptOps.LOCAL, 0, 4), Opt(OptOps.GROUP, 0, 2)],
|
||||||
|
[Opt(OptOps.UNROLL, 0, 4), Opt(OptOps.GROUP, 0, 2)],
|
||||||
|
[Opt(OptOps.UPCAST, 0, 2), Opt(OptOps.LOCAL, 0, 2), Opt(OptOps.GROUP, 0, 2)],
|
||||||
|
[Opt(OptOps.LOCAL, 0, 2), Opt(OptOps.GROUPTOP, 0, 8), Opt(OptOps.UNROLL, 0, 2), Opt(OptOps.UPCAST, 1, 2)],
|
||||||
], apply_tc=True, atol=atol, rtol=rtol)
|
], apply_tc=True, atol=atol, rtol=rtol)
|
||||||
|
|
||||||
@unittest.skipUnless(Device[Device.DEFAULT].renderer.tensor_cores, "test requires tensor cores")
|
@unittest.skipUnless(Device[Device.DEFAULT].renderer.tensor_cores, "test requires tensor cores")
|
||||||
|
|
|
||||||
|
|
@ -380,7 +380,7 @@ class Kernel:
|
||||||
elif opt.op in {OptOps.GROUP, OptOps.GROUPTOP}: # green
|
elif opt.op in {OptOps.GROUP, OptOps.GROUPTOP}: # green
|
||||||
check(self.opts.has_local and self.opts.has_shared, "target does not support local or shared mem")
|
check(self.opts.has_local and self.opts.has_shared, "target does not support local or shared mem")
|
||||||
check(self.first_reduce + self.group_for_reduces <= axis < self.first_upcast, "must be reduce axis to group")
|
check(self.first_reduce + self.group_for_reduces <= axis < self.first_upcast, "must be reduce axis to group")
|
||||||
check(not self.tensor_core, "can't group with tensor cores")
|
check(self.use_tensor_cores != 3, "can't group with tensor cores emulation")
|
||||||
check(len(reduce_axes:=[i for r in self.reduceops for i in r.axis_arg]) == len(set(reduce_axes)), "can't group with parallel reduces")
|
check(len(reduce_axes:=[i for r in self.reduceops for i in r.axis_arg]) == len(set(reduce_axes)), "can't group with parallel reduces")
|
||||||
self.shift_to(axis, amt, top=(opt.op is OptOps.GROUPTOP), insert_before=self.first_reduce + self.group_for_reduces)
|
self.shift_to(axis, amt, top=(opt.op is OptOps.GROUPTOP), insert_before=self.first_reduce + self.group_for_reduces)
|
||||||
self.group_for_reduces += 1
|
self.group_for_reduces += 1
|
||||||
|
|
@ -507,9 +507,11 @@ class Kernel:
|
||||||
else: # for TC=3 MUL/SUM instead of WMMA
|
else: # for TC=3 MUL/SUM instead of WMMA
|
||||||
tc_uop = UOp(Ops.REDUCE_AXIS, tc.dtype_out, ((srcs[0] * srcs[1]).cast(tc.dtype_out),), (Ops.ADD, tc_reduce_axes))
|
tc_uop = UOp(Ops.REDUCE_AXIS, tc.dtype_out, ((srcs[0] * srcs[1]).cast(tc.dtype_out),), (Ops.ADD, tc_reduce_axes))
|
||||||
|
|
||||||
return ret.replace(src=(tc_uop,), arg=(Ops.ADD, new_axes)) if (new_axes := tuple(i for i in axes if i not in tc_reduce_axes)) else tc_uop
|
ret = ret.replace(src=(tc_uop,), arg=(Ops.ADD, new_axes)) if (new_axes := tuple(i for i in axes if i not in tc_reduce_axes)) else tc_uop
|
||||||
|
|
||||||
|
else:
|
||||||
|
ret = ret.replace(arg = (op.arg[0], axes))
|
||||||
|
|
||||||
ret = ret.replace(arg = (op.arg[0], axes))
|
|
||||||
if self.group_for_reduces and grouped_axes:
|
if self.group_for_reduces and grouped_axes:
|
||||||
local_shape = (1,) * self.global_dims + self.full_shape[self.global_dims:self.global_dims+self.local_dims] + \
|
local_shape = (1,) * self.global_dims + self.full_shape[self.global_dims:self.global_dims+self.local_dims] + \
|
||||||
tuple([self.full_shape[i] if self.sts[reduce_idx].shape[i] != self.sts[reduce_idx+1].shape[i] else 1 \
|
tuple([self.full_shape[i] if self.sts[reduce_idx].shape[i] != self.sts[reduce_idx+1].shape[i] else 1 \
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue