mirror of
https://github.com/tinygrad/tinygrad.git
synced 2026-06-24 02:14:17 +00:00
Merge branch 'master' into shrink_in_render
This commit is contained in:
commit
c64a37fa7d
9 changed files with 23 additions and 39 deletions
35
.github/workflows/test.yml
vendored
35
.github/workflows/test.yml
vendored
|
|
@ -431,20 +431,12 @@ jobs:
|
|||
python-version: '3.12'
|
||||
llvm: 'true'
|
||||
- name: Test ONNX (CPU)
|
||||
run: DEV=CPU python -m pytest -n=auto test/external/external_test_onnx_backend.py --durations=20
|
||||
- name: Test ONNX (LLVM)
|
||||
run: DEV=CPU:LLVM python -m pytest -n=auto test/external/external_test_onnx_backend.py --durations=20
|
||||
- name: Test ONNX Runner (CPU)
|
||||
run: DEV=CPU python3 test/external/external_test_onnx_runner.py
|
||||
- name: Test Additional ONNX Ops (CPU)
|
||||
run: DEV=CPU python3 test/external/external_test_onnx_ops.py
|
||||
- name: Test Quantize ONNX
|
||||
run: DEV=CPU python3 test/backend/test_quantize_onnx.py
|
||||
run: DEV=CPU python -m pytest -n=auto test/external/external_test_onnx_backend.py test/external/external_test_onnx_runner.py test/external/external_test_onnx_ops.py test/backend/test_quantize_onnx.py --durations=20
|
||||
- name: Run process replay tests
|
||||
uses: ./.github/actions/process-replay
|
||||
|
||||
testopencl:
|
||||
name: ONNX (CL)+Optimization Tests
|
||||
testoptim:
|
||||
name: Optimization Tests
|
||||
runs-on: *linux
|
||||
timeout-minutes: 20
|
||||
steps:
|
||||
|
|
@ -453,13 +445,11 @@ jobs:
|
|||
- name: Setup Environment
|
||||
uses: ./.github/actions/setup-tinygrad
|
||||
with:
|
||||
key: onnxoptl
|
||||
key: optim
|
||||
deps: testing
|
||||
pydeps: "tensorflow==2.19"
|
||||
python-version: '3.12'
|
||||
opencl: 'true'
|
||||
- name: Test ONNX (CL)
|
||||
run: DEV=CL python -m pytest -n=auto test/external/external_test_onnx_backend.py --durations=20
|
||||
#- name: Test Optimization Helpers
|
||||
# run: DEBUG=1 python3 extra/optimization/test_helpers.py
|
||||
#- name: Test Action Space
|
||||
|
|
@ -757,7 +747,7 @@ jobs:
|
|||
|
||||
# ****** OSX Tests ******
|
||||
|
||||
testmetal:
|
||||
unittestmacos:
|
||||
name: MacOS (unit)
|
||||
runs-on: macos-14
|
||||
timeout-minutes: 20
|
||||
|
|
@ -767,18 +757,15 @@ jobs:
|
|||
- name: Setup Environment
|
||||
uses: ./.github/actions/setup-tinygrad
|
||||
with:
|
||||
key: metal
|
||||
deps: testing
|
||||
key: unittest-macos
|
||||
deps: testing_unit
|
||||
python-version: '3.12'
|
||||
amd: 'true'
|
||||
ocelot: 'true'
|
||||
llvm: 'true'
|
||||
- name: Run unit tests
|
||||
run: DEV=METAL python -m pytest -n=auto test/unit/ --durations=20
|
||||
- name: Run NULL backend tests
|
||||
run: DEV=NULL python -m pytest -n=auto test/null/ --durations=20
|
||||
- name: Run ONNX
|
||||
run: DEV=METAL python -m pytest -n=auto test/external/external_test_onnx_backend.py --durations=20
|
||||
- name: Test tensor core ops (fake)
|
||||
run: DEV=METAL DEBUG=3 TC=2 python test/backend/test_ops.py TestOps.test_gemm
|
||||
- name: Test tensor core ops (real)
|
||||
|
|
@ -789,20 +776,12 @@ jobs:
|
|||
run: DEV=METAL python3 -m pytest test/device/test_metal.py
|
||||
#- name: Fuzz Test linearizer
|
||||
# run: DEV=METAL DEPTH=4 FUZZ_N=50 FUZZ_MAX_SIZE=1000000 python test/external/fuzz_linearizer.py
|
||||
- name: Run TRANSCENDENTAL math
|
||||
run: DEV=METAL TRANSCENDENTAL=2 python -m pytest -n=auto test/backend/test_ops.py::TestOps::test_sin test/backend/test_ops.py::TestOps::test_cos test/backend/test_ops.py::TestOps::test_tan test/backend/test_ops.py::TestOps::test_exp test/backend/test_ops.py::TestOps::test_log --durations=20
|
||||
- name: Run pytest (amd)
|
||||
env:
|
||||
DEV: MOCKKFD+AMD
|
||||
FORWARD_ONLY: 1
|
||||
run: |
|
||||
python3 -m pytest -n=auto test/device/test_hcq.py test/test_tiny.py --durations=20
|
||||
- name: Run pytest (amd with llvm backend)
|
||||
env:
|
||||
DEV: "MOCKKFD+AMD:LLVM"
|
||||
FORWARD_ONLY: 1
|
||||
run: |
|
||||
python -m pytest -n=auto test/device/test_hcq.py test/test_tiny.py test/device/test_amd_llvm.py --durations=20
|
||||
- name: Run pytest (ptx)
|
||||
env:
|
||||
DEV: "MOCK+NV:PTX"
|
||||
|
|
|
|||
|
|
@ -11,6 +11,7 @@ export DEVICE_IN_FUNCTION_BUG=1
|
|||
export DEBUG=${DEBUG:-2}
|
||||
export HK_FLASH_ATTENTION=${HK_FLASH_ATTENTION:-1}
|
||||
export ALL2ALL=${ALL2ALL:-1}
|
||||
export LATE_ALLREDUCE=${LATE_ALLREDUCE:-1}
|
||||
export USE_ATOMICS=${USE_ATOMICS:-1}
|
||||
export ASM_GEMM=${ASM_GEMM:-1}
|
||||
export WQKV=${WQKV:-1}
|
||||
|
|
@ -22,7 +23,6 @@ export FUSED_INPUT_QUANTIZE=${FUSED_INPUT_QUANTIZE:-0}
|
|||
export FUSED_GRAD_QUANTIZE=${FUSED_GRAD_QUANTIZE:-0}
|
||||
export FUSED_ADD_NORM_MUL_QUANTIZE=${FUSED_ADD_NORM_MUL_QUANTIZE:-0}
|
||||
export FUSED_SILU_W13=${FUSED_SILU_W13:-0}
|
||||
export FUSED_PAD_GRAD_ACCUM=${FUSED_PAD_GRAD_ACCUM:-0}
|
||||
export SPLIT_W13=${SPLIT_W13:-1}
|
||||
export OFFLOAD_OPTIM=${OFFLOAD_OPTIM:-1}
|
||||
|
||||
|
|
@ -43,7 +43,7 @@ export BEAM_UOPS_MAX=6000 BEAM_UPCAST_MAX=256 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGR
|
|||
|
||||
export FAKEDATA=${FAKEDATA:-1} BENCHMARK=${BENCHMARK:-10}
|
||||
if [ -z "$FULL_LAYERS" ]; then
|
||||
export LLAMA_LAYERS=2
|
||||
export LLAMA_LAYERS=${LLAMA_LAYERS:-2}
|
||||
fi
|
||||
|
||||
python3 examples/mlperf/model_train.py
|
||||
|
|
|
|||
|
|
@ -11,6 +11,7 @@ export DEVICE_IN_FUNCTION_BUG=1
|
|||
export DEBUG=${DEBUG:-0}
|
||||
export HK_FLASH_ATTENTION=${HK_FLASH_ATTENTION:-1}
|
||||
export ALL2ALL=${ALL2ALL:-1}
|
||||
export LATE_ALLREDUCE=${LATE_ALLREDUCE:-1}
|
||||
export USE_ATOMICS=${USE_ATOMICS:-1}
|
||||
export ASM_GEMM=${ASM_GEMM:-1}
|
||||
export WQKV=${WQKV:-1}
|
||||
|
|
@ -22,7 +23,6 @@ export FUSED_INPUT_QUANTIZE=${FUSED_INPUT_QUANTIZE:-0}
|
|||
export FUSED_GRAD_QUANTIZE=${FUSED_GRAD_QUANTIZE:-0}
|
||||
export FUSED_ADD_NORM_MUL_QUANTIZE=${FUSED_ADD_NORM_MUL_QUANTIZE:-0}
|
||||
export FUSED_SILU_W13=${FUSED_SILU_W13:-0}
|
||||
export FUSED_PAD_GRAD_ACCUM=${FUSED_PAD_GRAD_ACCUM:-0}
|
||||
export SPLIT_W13=${SPLIT_W13:-1}
|
||||
export OFFLOAD_OPTIM=${OFFLOAD_OPTIM:-1}
|
||||
|
||||
|
|
|
|||
|
|
@ -24,6 +24,7 @@ export FUSED_GRAD_QUANTIZE=${FUSED_GRAD_QUANTIZE:-1}
|
|||
export FUSED_ADD_NORM_MUL_QUANTIZE=${FUSED_ADD_NORM_MUL_QUANTIZE:-1}
|
||||
export FUSED_SILU_W13=${FUSED_SILU_W13:-1}
|
||||
export SPLIT_W13=${SPLIT_W13:-0}
|
||||
export OFFLOAD_OPTIM=${OFFLOAD_OPTIM:-0}
|
||||
|
||||
export DEFAULT_FLOAT="bfloat16" OPTIM_DTYPE="bfloat16"
|
||||
export DP=${DP:-8} MP=${MP:-1} BS=${BS:-16} EVAL_BS=${EVAL_BS:-8} GRADIENT_ACC_STEPS=${GRADIENT_ACC_STEPS:-2}
|
||||
|
|
@ -47,7 +48,7 @@ export BEAM_UOPS_MAX=6000 BEAM_UPCAST_MAX=256 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGR
|
|||
|
||||
export FAKEDATA=${FAKEDATA:-1} BENCHMARK=${BENCHMARK:-10}
|
||||
if [ -z "$FULL_LAYERS" ]; then
|
||||
export LLAMA_LAYERS=2
|
||||
export LLAMA_LAYERS=${LLAMA_LAYERS:-2}
|
||||
fi
|
||||
|
||||
python3 examples/mlperf/model_train.py
|
||||
|
|
|
|||
|
|
@ -11,6 +11,7 @@ export DEVICE_IN_FUNCTION_BUG=1
|
|||
export DEBUG=${DEBUG:-2}
|
||||
export HK_FLASH_ATTENTION=${HK_FLASH_ATTENTION:-1}
|
||||
export ALL2ALL=${ALL2ALL:-1}
|
||||
export LATE_ALLREDUCE=${LATE_ALLREDUCE:-1}
|
||||
export USE_ATOMICS=${USE_ATOMICS:-1}
|
||||
export ASM_GEMM=${ASM_GEMM:-1}
|
||||
export WQKV=${WQKV:-1}
|
||||
|
|
@ -47,7 +48,7 @@ export BEAM_UOPS_MAX=6000 BEAM_UPCAST_MAX=256 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGR
|
|||
|
||||
export FAKEDATA=${FAKEDATA:-1} BENCHMARK=${BENCHMARK:-10}
|
||||
if [ -z "$FULL_LAYERS" ]; then
|
||||
export LLAMA_LAYERS=2
|
||||
export LLAMA_LAYERS=${LLAMA_LAYERS:-2}
|
||||
fi
|
||||
|
||||
python3 examples/mlperf/model_train.py
|
||||
|
|
|
|||
|
|
@ -24,6 +24,7 @@ export FUSED_GRAD_QUANTIZE=${FUSED_GRAD_QUANTIZE:-1}
|
|||
export FUSED_ADD_NORM_MUL_QUANTIZE=${FUSED_ADD_NORM_MUL_QUANTIZE:-1}
|
||||
export FUSED_SILU_W13=${FUSED_SILU_W13:-1}
|
||||
export SPLIT_W13=${SPLIT_W13:-0}
|
||||
export OFFLOAD_OPTIM=${OFFLOAD_OPTIM:-0}
|
||||
|
||||
export DEFAULT_FLOAT="bfloat16" OPTIM_DTYPE="bfloat16"
|
||||
export DP=${DP:-8} MP=${MP:-1} BS=${BS:-16} EVAL_BS=${EVAL_BS:-8} GRADIENT_ACC_STEPS=${GRADIENT_ACC_STEPS:-2}
|
||||
|
|
|
|||
|
|
@ -11,6 +11,7 @@ export DEVICE_IN_FUNCTION_BUG=1
|
|||
export DEBUG=${DEBUG:-0}
|
||||
export HK_FLASH_ATTENTION=${HK_FLASH_ATTENTION:-1}
|
||||
export ALL2ALL=${ALL2ALL:-1}
|
||||
export LATE_ALLREDUCE=${LATE_ALLREDUCE:-1}
|
||||
export USE_ATOMICS=${USE_ATOMICS:-1}
|
||||
export ASM_GEMM=${ASM_GEMM:-1}
|
||||
export WQKV=${WQKV:-1}
|
||||
|
|
|
|||
10
test/external/external_test_onnx_ops.py
vendored
10
test/external/external_test_onnx_ops.py
vendored
|
|
@ -285,7 +285,7 @@ class TestMainOnnxOps(TestOnnxOps):
|
|||
for dtype, zero_point in [(np.uint8, 128), (np.int8, 0)]:
|
||||
for b in (np.ones([32], dtype=np.int32), np.zeros([32], dtype=np.int32)):
|
||||
for channel_shape in [(), (32,)]:
|
||||
with self.subTest(dtype=dtype, zero_point=zero_point, channel_shape=channel_shape):
|
||||
with self.subTest(dtype=dtype.__name__, zero_point=zero_point, channel_shape=channel_shape):
|
||||
dtype_min, dtype_max = np.iinfo(dtype).min, np.iinfo(dtype).max
|
||||
inputs = {
|
||||
"x": np.random.randint(dtype_min, dtype_max + 1, [1, 3, 224, 224], dtype=dtype),
|
||||
|
|
@ -304,7 +304,7 @@ class TestMainOnnxOps(TestOnnxOps):
|
|||
|
||||
def test_qlinear_matmul(self):
|
||||
for dtype, zero_point in [(np.uint8, 128), (np.int8, 0)]:
|
||||
with self.subTest(dtype=dtype, zero_point=zero_point):
|
||||
with self.subTest(dtype=dtype.__name__, zero_point=zero_point):
|
||||
dtype_min, dtype_max = np.iinfo(dtype).min, np.iinfo(dtype).max
|
||||
inputs = {
|
||||
"A": np.random.randint(dtype_min, dtype_max + 1, [10, 10], dtype=dtype),
|
||||
|
|
@ -512,7 +512,7 @@ class TestContribOnnxOps(TestOnnxOps):
|
|||
|
||||
def test_qlinear_add(self):
|
||||
for dtype, zero_point in [(np.uint8, 128), (np.int8, 0)]:
|
||||
with self.subTest(dtype=dtype, zero_point=zero_point):
|
||||
with self.subTest(dtype=dtype.__name__, zero_point=zero_point):
|
||||
dtype_min, dtype_max = np.iinfo(dtype).min, np.iinfo(dtype).max
|
||||
inputs = {
|
||||
"A": np.random.randint(dtype_min, dtype_max + 1, [10, 10], dtype=dtype),
|
||||
|
|
@ -546,7 +546,7 @@ class TestContribOnnxOps(TestOnnxOps):
|
|||
|
||||
def test_qlinear_mul(self):
|
||||
for dtype, zero_point in [(np.uint8, 128), (np.int8, 0)]:
|
||||
with self.subTest(dtype=dtype, zero_point=zero_point):
|
||||
with self.subTest(dtype=dtype.__name__, zero_point=zero_point):
|
||||
dtype_min, dtype_max = np.iinfo(dtype).min, np.iinfo(dtype).max
|
||||
inputs = {
|
||||
"A": np.random.randint(dtype_min, dtype_max + 1, [10, 10], dtype=dtype),
|
||||
|
|
@ -580,7 +580,7 @@ class TestContribOnnxOps(TestOnnxOps):
|
|||
def test_qlinear_global_average_pool(self):
|
||||
for dtype, zero_point in [(np.uint8, 128), (np.int8, 0)]:
|
||||
for channels_last in [0, 1]:
|
||||
with self.subTest(dtype=dtype, zero_point=zero_point, channels_last=channels_last):
|
||||
with self.subTest(dtype=dtype.__name__, zero_point=zero_point, channels_last=channels_last):
|
||||
dtype_min, dtype_max = np.iinfo(dtype).min, np.iinfo(dtype).max
|
||||
# NCHW for channels_last=0, NHWC for channels_last=1
|
||||
shape = [1, 3, 32, 32] if channels_last == 0 else [1, 32, 32, 3]
|
||||
|
|
|
|||
|
|
@ -96,10 +96,11 @@ def hand_coded_optimizations(k:Scheduler) -> Scheduler:
|
|||
|
||||
# if there are small dims with lots of valid masks, upcast them (they might be from Tensor.stack)
|
||||
to_upcast: list[int] = []
|
||||
where_gate_rngs = {r for u in k.ast.backward_slice if u.op is Ops.WHERE for r in u.src[0].ranges}
|
||||
# upcast leading axes first (hack-ish for winograd; we actually want to upcast masked axes with low stride first)
|
||||
for axis in k.upcastable_dims:
|
||||
# for Schedule, we check if the range is used in INDEX gates or WHERE gates
|
||||
is_masked = any(any(o is k.rngs[axis] for o in u.src[0].backward_slice) for u in k.ast.backward_slice if u.op is Ops.WHERE)
|
||||
is_masked = k.rngs[axis] in where_gate_rngs
|
||||
if k.full_shape[axis] <= 7 and is_masked and prod(k.full_shape[j] for j in to_upcast) * k.full_shape[axis] <= 7 * 7:
|
||||
if DEBUG >= 4: print(f"upcasting masked axis : {axis}")
|
||||
to_upcast.append(axis)
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue