Merge branch 'master' into shrink_in_render

This commit is contained in:
George Hotz 2026-06-01 16:48:22 -07:00 committed by GitHub
commit c64a37fa7d
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
9 changed files with 23 additions and 39 deletions

View file

@ -431,20 +431,12 @@ jobs:
python-version: '3.12'
llvm: 'true'
- name: Test ONNX (CPU)
run: DEV=CPU python -m pytest -n=auto test/external/external_test_onnx_backend.py --durations=20
- name: Test ONNX (LLVM)
run: DEV=CPU:LLVM python -m pytest -n=auto test/external/external_test_onnx_backend.py --durations=20
- name: Test ONNX Runner (CPU)
run: DEV=CPU python3 test/external/external_test_onnx_runner.py
- name: Test Additional ONNX Ops (CPU)
run: DEV=CPU python3 test/external/external_test_onnx_ops.py
- name: Test Quantize ONNX
run: DEV=CPU python3 test/backend/test_quantize_onnx.py
run: DEV=CPU python -m pytest -n=auto test/external/external_test_onnx_backend.py test/external/external_test_onnx_runner.py test/external/external_test_onnx_ops.py test/backend/test_quantize_onnx.py --durations=20
- name: Run process replay tests
uses: ./.github/actions/process-replay
testopencl:
name: ONNX (CL)+Optimization Tests
testoptim:
name: Optimization Tests
runs-on: *linux
timeout-minutes: 20
steps:
@ -453,13 +445,11 @@ jobs:
- name: Setup Environment
uses: ./.github/actions/setup-tinygrad
with:
key: onnxoptl
key: optim
deps: testing
pydeps: "tensorflow==2.19"
python-version: '3.12'
opencl: 'true'
- name: Test ONNX (CL)
run: DEV=CL python -m pytest -n=auto test/external/external_test_onnx_backend.py --durations=20
#- name: Test Optimization Helpers
# run: DEBUG=1 python3 extra/optimization/test_helpers.py
#- name: Test Action Space
@ -757,7 +747,7 @@ jobs:
# ****** OSX Tests ******
testmetal:
unittestmacos:
name: MacOS (unit)
runs-on: macos-14
timeout-minutes: 20
@ -767,18 +757,15 @@ jobs:
- name: Setup Environment
uses: ./.github/actions/setup-tinygrad
with:
key: metal
deps: testing
key: unittest-macos
deps: testing_unit
python-version: '3.12'
amd: 'true'
ocelot: 'true'
llvm: 'true'
- name: Run unit tests
run: DEV=METAL python -m pytest -n=auto test/unit/ --durations=20
- name: Run NULL backend tests
run: DEV=NULL python -m pytest -n=auto test/null/ --durations=20
- name: Run ONNX
run: DEV=METAL python -m pytest -n=auto test/external/external_test_onnx_backend.py --durations=20
- name: Test tensor core ops (fake)
run: DEV=METAL DEBUG=3 TC=2 python test/backend/test_ops.py TestOps.test_gemm
- name: Test tensor core ops (real)
@ -789,20 +776,12 @@ jobs:
run: DEV=METAL python3 -m pytest test/device/test_metal.py
#- name: Fuzz Test linearizer
# run: DEV=METAL DEPTH=4 FUZZ_N=50 FUZZ_MAX_SIZE=1000000 python test/external/fuzz_linearizer.py
- name: Run TRANSCENDENTAL math
run: DEV=METAL TRANSCENDENTAL=2 python -m pytest -n=auto test/backend/test_ops.py::TestOps::test_sin test/backend/test_ops.py::TestOps::test_cos test/backend/test_ops.py::TestOps::test_tan test/backend/test_ops.py::TestOps::test_exp test/backend/test_ops.py::TestOps::test_log --durations=20
- name: Run pytest (amd)
env:
DEV: MOCKKFD+AMD
FORWARD_ONLY: 1
run: |
python3 -m pytest -n=auto test/device/test_hcq.py test/test_tiny.py --durations=20
- name: Run pytest (amd with llvm backend)
env:
DEV: "MOCKKFD+AMD:LLVM"
FORWARD_ONLY: 1
run: |
python -m pytest -n=auto test/device/test_hcq.py test/test_tiny.py test/device/test_amd_llvm.py --durations=20
- name: Run pytest (ptx)
env:
DEV: "MOCK+NV:PTX"

View file

@ -11,6 +11,7 @@ export DEVICE_IN_FUNCTION_BUG=1
export DEBUG=${DEBUG:-2}
export HK_FLASH_ATTENTION=${HK_FLASH_ATTENTION:-1}
export ALL2ALL=${ALL2ALL:-1}
export LATE_ALLREDUCE=${LATE_ALLREDUCE:-1}
export USE_ATOMICS=${USE_ATOMICS:-1}
export ASM_GEMM=${ASM_GEMM:-1}
export WQKV=${WQKV:-1}
@ -22,7 +23,6 @@ export FUSED_INPUT_QUANTIZE=${FUSED_INPUT_QUANTIZE:-0}
export FUSED_GRAD_QUANTIZE=${FUSED_GRAD_QUANTIZE:-0}
export FUSED_ADD_NORM_MUL_QUANTIZE=${FUSED_ADD_NORM_MUL_QUANTIZE:-0}
export FUSED_SILU_W13=${FUSED_SILU_W13:-0}
export FUSED_PAD_GRAD_ACCUM=${FUSED_PAD_GRAD_ACCUM:-0}
export SPLIT_W13=${SPLIT_W13:-1}
export OFFLOAD_OPTIM=${OFFLOAD_OPTIM:-1}
@ -43,7 +43,7 @@ export BEAM_UOPS_MAX=6000 BEAM_UPCAST_MAX=256 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGR
export FAKEDATA=${FAKEDATA:-1} BENCHMARK=${BENCHMARK:-10}
if [ -z "$FULL_LAYERS" ]; then
export LLAMA_LAYERS=2
export LLAMA_LAYERS=${LLAMA_LAYERS:-2}
fi
python3 examples/mlperf/model_train.py

View file

@ -11,6 +11,7 @@ export DEVICE_IN_FUNCTION_BUG=1
export DEBUG=${DEBUG:-0}
export HK_FLASH_ATTENTION=${HK_FLASH_ATTENTION:-1}
export ALL2ALL=${ALL2ALL:-1}
export LATE_ALLREDUCE=${LATE_ALLREDUCE:-1}
export USE_ATOMICS=${USE_ATOMICS:-1}
export ASM_GEMM=${ASM_GEMM:-1}
export WQKV=${WQKV:-1}
@ -22,7 +23,6 @@ export FUSED_INPUT_QUANTIZE=${FUSED_INPUT_QUANTIZE:-0}
export FUSED_GRAD_QUANTIZE=${FUSED_GRAD_QUANTIZE:-0}
export FUSED_ADD_NORM_MUL_QUANTIZE=${FUSED_ADD_NORM_MUL_QUANTIZE:-0}
export FUSED_SILU_W13=${FUSED_SILU_W13:-0}
export FUSED_PAD_GRAD_ACCUM=${FUSED_PAD_GRAD_ACCUM:-0}
export SPLIT_W13=${SPLIT_W13:-1}
export OFFLOAD_OPTIM=${OFFLOAD_OPTIM:-1}

View file

@ -24,6 +24,7 @@ export FUSED_GRAD_QUANTIZE=${FUSED_GRAD_QUANTIZE:-1}
export FUSED_ADD_NORM_MUL_QUANTIZE=${FUSED_ADD_NORM_MUL_QUANTIZE:-1}
export FUSED_SILU_W13=${FUSED_SILU_W13:-1}
export SPLIT_W13=${SPLIT_W13:-0}
export OFFLOAD_OPTIM=${OFFLOAD_OPTIM:-0}
export DEFAULT_FLOAT="bfloat16" OPTIM_DTYPE="bfloat16"
export DP=${DP:-8} MP=${MP:-1} BS=${BS:-16} EVAL_BS=${EVAL_BS:-8} GRADIENT_ACC_STEPS=${GRADIENT_ACC_STEPS:-2}
@ -47,7 +48,7 @@ export BEAM_UOPS_MAX=6000 BEAM_UPCAST_MAX=256 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGR
export FAKEDATA=${FAKEDATA:-1} BENCHMARK=${BENCHMARK:-10}
if [ -z "$FULL_LAYERS" ]; then
export LLAMA_LAYERS=2
export LLAMA_LAYERS=${LLAMA_LAYERS:-2}
fi
python3 examples/mlperf/model_train.py

View file

@ -11,6 +11,7 @@ export DEVICE_IN_FUNCTION_BUG=1
export DEBUG=${DEBUG:-2}
export HK_FLASH_ATTENTION=${HK_FLASH_ATTENTION:-1}
export ALL2ALL=${ALL2ALL:-1}
export LATE_ALLREDUCE=${LATE_ALLREDUCE:-1}
export USE_ATOMICS=${USE_ATOMICS:-1}
export ASM_GEMM=${ASM_GEMM:-1}
export WQKV=${WQKV:-1}
@ -47,7 +48,7 @@ export BEAM_UOPS_MAX=6000 BEAM_UPCAST_MAX=256 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGR
export FAKEDATA=${FAKEDATA:-1} BENCHMARK=${BENCHMARK:-10}
if [ -z "$FULL_LAYERS" ]; then
export LLAMA_LAYERS=2
export LLAMA_LAYERS=${LLAMA_LAYERS:-2}
fi
python3 examples/mlperf/model_train.py

View file

@ -24,6 +24,7 @@ export FUSED_GRAD_QUANTIZE=${FUSED_GRAD_QUANTIZE:-1}
export FUSED_ADD_NORM_MUL_QUANTIZE=${FUSED_ADD_NORM_MUL_QUANTIZE:-1}
export FUSED_SILU_W13=${FUSED_SILU_W13:-1}
export SPLIT_W13=${SPLIT_W13:-0}
export OFFLOAD_OPTIM=${OFFLOAD_OPTIM:-0}
export DEFAULT_FLOAT="bfloat16" OPTIM_DTYPE="bfloat16"
export DP=${DP:-8} MP=${MP:-1} BS=${BS:-16} EVAL_BS=${EVAL_BS:-8} GRADIENT_ACC_STEPS=${GRADIENT_ACC_STEPS:-2}

View file

@ -11,6 +11,7 @@ export DEVICE_IN_FUNCTION_BUG=1
export DEBUG=${DEBUG:-0}
export HK_FLASH_ATTENTION=${HK_FLASH_ATTENTION:-1}
export ALL2ALL=${ALL2ALL:-1}
export LATE_ALLREDUCE=${LATE_ALLREDUCE:-1}
export USE_ATOMICS=${USE_ATOMICS:-1}
export ASM_GEMM=${ASM_GEMM:-1}
export WQKV=${WQKV:-1}

View file

@ -285,7 +285,7 @@ class TestMainOnnxOps(TestOnnxOps):
for dtype, zero_point in [(np.uint8, 128), (np.int8, 0)]:
for b in (np.ones([32], dtype=np.int32), np.zeros([32], dtype=np.int32)):
for channel_shape in [(), (32,)]:
with self.subTest(dtype=dtype, zero_point=zero_point, channel_shape=channel_shape):
with self.subTest(dtype=dtype.__name__, zero_point=zero_point, channel_shape=channel_shape):
dtype_min, dtype_max = np.iinfo(dtype).min, np.iinfo(dtype).max
inputs = {
"x": np.random.randint(dtype_min, dtype_max + 1, [1, 3, 224, 224], dtype=dtype),
@ -304,7 +304,7 @@ class TestMainOnnxOps(TestOnnxOps):
def test_qlinear_matmul(self):
for dtype, zero_point in [(np.uint8, 128), (np.int8, 0)]:
with self.subTest(dtype=dtype, zero_point=zero_point):
with self.subTest(dtype=dtype.__name__, zero_point=zero_point):
dtype_min, dtype_max = np.iinfo(dtype).min, np.iinfo(dtype).max
inputs = {
"A": np.random.randint(dtype_min, dtype_max + 1, [10, 10], dtype=dtype),
@ -512,7 +512,7 @@ class TestContribOnnxOps(TestOnnxOps):
def test_qlinear_add(self):
for dtype, zero_point in [(np.uint8, 128), (np.int8, 0)]:
with self.subTest(dtype=dtype, zero_point=zero_point):
with self.subTest(dtype=dtype.__name__, zero_point=zero_point):
dtype_min, dtype_max = np.iinfo(dtype).min, np.iinfo(dtype).max
inputs = {
"A": np.random.randint(dtype_min, dtype_max + 1, [10, 10], dtype=dtype),
@ -546,7 +546,7 @@ class TestContribOnnxOps(TestOnnxOps):
def test_qlinear_mul(self):
for dtype, zero_point in [(np.uint8, 128), (np.int8, 0)]:
with self.subTest(dtype=dtype, zero_point=zero_point):
with self.subTest(dtype=dtype.__name__, zero_point=zero_point):
dtype_min, dtype_max = np.iinfo(dtype).min, np.iinfo(dtype).max
inputs = {
"A": np.random.randint(dtype_min, dtype_max + 1, [10, 10], dtype=dtype),
@ -580,7 +580,7 @@ class TestContribOnnxOps(TestOnnxOps):
def test_qlinear_global_average_pool(self):
for dtype, zero_point in [(np.uint8, 128), (np.int8, 0)]:
for channels_last in [0, 1]:
with self.subTest(dtype=dtype, zero_point=zero_point, channels_last=channels_last):
with self.subTest(dtype=dtype.__name__, zero_point=zero_point, channels_last=channels_last):
dtype_min, dtype_max = np.iinfo(dtype).min, np.iinfo(dtype).max
# NCHW for channels_last=0, NHWC for channels_last=1
shape = [1, 3, 32, 32] if channels_last == 0 else [1, 32, 32, 3]

View file

@ -96,10 +96,11 @@ def hand_coded_optimizations(k:Scheduler) -> Scheduler:
# if there are small dims with lots of valid masks, upcast them (they might be from Tensor.stack)
to_upcast: list[int] = []
where_gate_rngs = {r for u in k.ast.backward_slice if u.op is Ops.WHERE for r in u.src[0].ranges}
# upcast leading axes first (hack-ish for winograd; we actually want to upcast masked axes with low stride first)
for axis in k.upcastable_dims:
# for Schedule, we check if the range is used in INDEX gates or WHERE gates
is_masked = any(any(o is k.rngs[axis] for o in u.src[0].backward_slice) for u in k.ast.backward_slice if u.op is Ops.WHERE)
is_masked = k.rngs[axis] in where_gate_rngs
if k.full_shape[axis] <= 7 and is_masked and prod(k.full_shape[j] for j in to_upcast) * k.full_shape[axis] <= 7 * 7:
if DEBUG >= 4: print(f"upcasting masked axis : {axis}")
to_upcast.append(axis)