Merge branch 'master' into shrink_in_render

2026-06-24 02:14:17 +00:00 · 2026-06-01 16:48:22 -07:00 · 2026-06-01 16:48:22 -07:00 · c64a37fa7d
commit c64a37fa7d
parent d6f1aadeb7 2d0f132a3b
9 changed files with 23 additions and 39 deletions
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@ -431,20 +431,12 @@ jobs:
          python-version: '3.12'
          llvm: 'true'
      - name: Test ONNX (CPU)
-        run: DEV=CPU python -m pytest -n=auto test/external/external_test_onnx_backend.py --durations=20
-      - name: Test ONNX (LLVM)
-        run: DEV=CPU:LLVM python -m pytest -n=auto test/external/external_test_onnx_backend.py --durations=20
-      - name: Test ONNX Runner (CPU)
-        run: DEV=CPU python3 test/external/external_test_onnx_runner.py
-      - name: Test Additional ONNX Ops (CPU)
-        run: DEV=CPU python3 test/external/external_test_onnx_ops.py
-      - name: Test Quantize ONNX
-        run: DEV=CPU python3 test/backend/test_quantize_onnx.py
+        run: DEV=CPU python -m pytest -n=auto test/external/external_test_onnx_backend.py test/external/external_test_onnx_runner.py test/external/external_test_onnx_ops.py test/backend/test_quantize_onnx.py --durations=20
      - name: Run process replay tests
        uses: ./.github/actions/process-replay

-  testopencl:
-    name: ONNX (CL)+Optimization Tests
+  testoptim:
+    name: Optimization Tests
    runs-on: *linux
    timeout-minutes: 20
    steps:
@ -453,13 +445,11 @@ jobs:
      - name: Setup Environment
        uses: ./.github/actions/setup-tinygrad
        with:
-          key: onnxoptl
+          key: optim
          deps: testing
          pydeps: "tensorflow==2.19"
          python-version: '3.12'
          opencl: 'true'
-      - name: Test ONNX (CL)
-        run: DEV=CL python -m pytest -n=auto test/external/external_test_onnx_backend.py --durations=20
      #- name: Test Optimization Helpers
      #  run: DEBUG=1 python3 extra/optimization/test_helpers.py
      #- name: Test Action Space
@ -757,7 +747,7 @@ jobs:

 # ****** OSX Tests ******

-  testmetal:
+  unittestmacos:
    name: MacOS (unit)
    runs-on: macos-14
    timeout-minutes: 20
@ -767,18 +757,15 @@ jobs:
    - name: Setup Environment
      uses: ./.github/actions/setup-tinygrad
      with:
-        key: metal
-        deps: testing
+        key: unittest-macos
+        deps: testing_unit
        python-version: '3.12'
        amd: 'true'
        ocelot: 'true'
-        llvm: 'true'
    - name: Run unit tests
      run: DEV=METAL python -m pytest -n=auto test/unit/ --durations=20
    - name: Run NULL backend tests
      run: DEV=NULL python -m pytest -n=auto test/null/ --durations=20
-    - name: Run ONNX
-      run: DEV=METAL python -m pytest -n=auto test/external/external_test_onnx_backend.py --durations=20
    - name: Test tensor core ops (fake)
      run: DEV=METAL DEBUG=3 TC=2 python test/backend/test_ops.py TestOps.test_gemm
    - name: Test tensor core ops (real)
@ -789,20 +776,12 @@ jobs:
      run: DEV=METAL python3 -m pytest test/device/test_metal.py
    #- name: Fuzz Test linearizer
    #  run: DEV=METAL DEPTH=4 FUZZ_N=50 FUZZ_MAX_SIZE=1000000 python test/external/fuzz_linearizer.py
-    - name: Run TRANSCENDENTAL math
-      run: DEV=METAL TRANSCENDENTAL=2 python -m pytest -n=auto test/backend/test_ops.py::TestOps::test_sin test/backend/test_ops.py::TestOps::test_cos test/backend/test_ops.py::TestOps::test_tan test/backend/test_ops.py::TestOps::test_exp test/backend/test_ops.py::TestOps::test_log --durations=20
    - name: Run pytest (amd)
      env:
        DEV: MOCKKFD+AMD
        FORWARD_ONLY: 1
      run: |
        python3 -m pytest -n=auto test/device/test_hcq.py test/test_tiny.py --durations=20
-    - name: Run pytest (amd with llvm backend)
-      env:
-        DEV: "MOCKKFD+AMD:LLVM"
-        FORWARD_ONLY: 1
-      run: |
-        python -m pytest -n=auto test/device/test_hcq.py test/test_tiny.py test/device/test_amd_llvm.py --durations=20
    - name: Run pytest (ptx)
      env:
        DEV: "MOCK+NV:PTX"
--- a/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/llama31_405b/implementations/tinybox_8xMI350X/dev_beam.sh
+++ b/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/llama31_405b/implementations/tinybox_8xMI350X/dev_beam.sh
@ -11,6 +11,7 @@ export DEVICE_IN_FUNCTION_BUG=1
 export DEBUG=${DEBUG:-2}
 export HK_FLASH_ATTENTION=${HK_FLASH_ATTENTION:-1}
 export ALL2ALL=${ALL2ALL:-1}
+export LATE_ALLREDUCE=${LATE_ALLREDUCE:-1}
 export USE_ATOMICS=${USE_ATOMICS:-1}
 export ASM_GEMM=${ASM_GEMM:-1}
 export WQKV=${WQKV:-1}
@ -22,7 +23,6 @@ export FUSED_INPUT_QUANTIZE=${FUSED_INPUT_QUANTIZE:-0}
 export FUSED_GRAD_QUANTIZE=${FUSED_GRAD_QUANTIZE:-0}
 export FUSED_ADD_NORM_MUL_QUANTIZE=${FUSED_ADD_NORM_MUL_QUANTIZE:-0}
 export FUSED_SILU_W13=${FUSED_SILU_W13:-0}
-export FUSED_PAD_GRAD_ACCUM=${FUSED_PAD_GRAD_ACCUM:-0}
 export SPLIT_W13=${SPLIT_W13:-1}
 export OFFLOAD_OPTIM=${OFFLOAD_OPTIM:-1}

@ -43,7 +43,7 @@ export BEAM_UOPS_MAX=6000 BEAM_UPCAST_MAX=256 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGR

 export FAKEDATA=${FAKEDATA:-1} BENCHMARK=${BENCHMARK:-10}
 if [ -z "$FULL_LAYERS" ]; then
-  export LLAMA_LAYERS=2
+  export LLAMA_LAYERS=${LLAMA_LAYERS:-2}
 fi

 python3 examples/mlperf/model_train.py
--- a/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/llama31_405b/implementations/tinybox_8xMI350X/dev_run.sh
+++ b/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/llama31_405b/implementations/tinybox_8xMI350X/dev_run.sh
@ -11,6 +11,7 @@ export DEVICE_IN_FUNCTION_BUG=1
 export DEBUG=${DEBUG:-0}
 export HK_FLASH_ATTENTION=${HK_FLASH_ATTENTION:-1}
 export ALL2ALL=${ALL2ALL:-1}
+export LATE_ALLREDUCE=${LATE_ALLREDUCE:-1}
 export USE_ATOMICS=${USE_ATOMICS:-1}
 export ASM_GEMM=${ASM_GEMM:-1}
 export WQKV=${WQKV:-1}
@ -22,7 +23,6 @@ export FUSED_INPUT_QUANTIZE=${FUSED_INPUT_QUANTIZE:-0}
 export FUSED_GRAD_QUANTIZE=${FUSED_GRAD_QUANTIZE:-0}
 export FUSED_ADD_NORM_MUL_QUANTIZE=${FUSED_ADD_NORM_MUL_QUANTIZE:-0}
 export FUSED_SILU_W13=${FUSED_SILU_W13:-0}
-export FUSED_PAD_GRAD_ACCUM=${FUSED_PAD_GRAD_ACCUM:-0}
 export SPLIT_W13=${SPLIT_W13:-1}
 export OFFLOAD_OPTIM=${OFFLOAD_OPTIM:-1}

--- a/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/llama31_8b/implementations/tinybox_8xMI350X/dev_beam.sh
+++ b/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/llama31_8b/implementations/tinybox_8xMI350X/dev_beam.sh
@ -24,6 +24,7 @@ export FUSED_GRAD_QUANTIZE=${FUSED_GRAD_QUANTIZE:-1}
 export FUSED_ADD_NORM_MUL_QUANTIZE=${FUSED_ADD_NORM_MUL_QUANTIZE:-1}
 export FUSED_SILU_W13=${FUSED_SILU_W13:-1}
 export SPLIT_W13=${SPLIT_W13:-0}
+export OFFLOAD_OPTIM=${OFFLOAD_OPTIM:-0}

 export DEFAULT_FLOAT="bfloat16" OPTIM_DTYPE="bfloat16"
 export DP=${DP:-8} MP=${MP:-1} BS=${BS:-16} EVAL_BS=${EVAL_BS:-8} GRADIENT_ACC_STEPS=${GRADIENT_ACC_STEPS:-2}
@ -47,7 +48,7 @@ export BEAM_UOPS_MAX=6000 BEAM_UPCAST_MAX=256 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGR

 export FAKEDATA=${FAKEDATA:-1} BENCHMARK=${BENCHMARK:-10}
 if [ -z "$FULL_LAYERS" ]; then
-  export LLAMA_LAYERS=2
+  export LLAMA_LAYERS=${LLAMA_LAYERS:-2}
 fi

 python3 examples/mlperf/model_train.py
--- a/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/llama31_8b/implementations/tinybox_8xMI350X/dev_beam_mp.sh
+++ b/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/llama31_8b/implementations/tinybox_8xMI350X/dev_beam_mp.sh
@ -11,6 +11,7 @@ export DEVICE_IN_FUNCTION_BUG=1
 export DEBUG=${DEBUG:-2}
 export HK_FLASH_ATTENTION=${HK_FLASH_ATTENTION:-1}
 export ALL2ALL=${ALL2ALL:-1}
+export LATE_ALLREDUCE=${LATE_ALLREDUCE:-1}
 export USE_ATOMICS=${USE_ATOMICS:-1}
 export ASM_GEMM=${ASM_GEMM:-1}
 export WQKV=${WQKV:-1}
@ -47,7 +48,7 @@ export BEAM_UOPS_MAX=6000 BEAM_UPCAST_MAX=256 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGR

 export FAKEDATA=${FAKEDATA:-1} BENCHMARK=${BENCHMARK:-10}
 if [ -z "$FULL_LAYERS" ]; then
-  export LLAMA_LAYERS=2
+  export LLAMA_LAYERS=${LLAMA_LAYERS:-2}
 fi

 python3 examples/mlperf/model_train.py
--- a/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/llama31_8b/implementations/tinybox_8xMI350X/dev_run.sh
+++ b/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/llama31_8b/implementations/tinybox_8xMI350X/dev_run.sh
@ -24,6 +24,7 @@ export FUSED_GRAD_QUANTIZE=${FUSED_GRAD_QUANTIZE:-1}
 export FUSED_ADD_NORM_MUL_QUANTIZE=${FUSED_ADD_NORM_MUL_QUANTIZE:-1}
 export FUSED_SILU_W13=${FUSED_SILU_W13:-1}
 export SPLIT_W13=${SPLIT_W13:-0}
+export OFFLOAD_OPTIM=${OFFLOAD_OPTIM:-0}

 export DEFAULT_FLOAT="bfloat16" OPTIM_DTYPE="bfloat16"
 export DP=${DP:-8} MP=${MP:-1} BS=${BS:-16} EVAL_BS=${EVAL_BS:-8} GRADIENT_ACC_STEPS=${GRADIENT_ACC_STEPS:-2}
--- a/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/llama31_8b/implementations/tinybox_8xMI350X/dev_run_mp.sh
+++ b/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/llama31_8b/implementations/tinybox_8xMI350X/dev_run_mp.sh
@ -11,6 +11,7 @@ export DEVICE_IN_FUNCTION_BUG=1
 export DEBUG=${DEBUG:-0}
 export HK_FLASH_ATTENTION=${HK_FLASH_ATTENTION:-1}
 export ALL2ALL=${ALL2ALL:-1}
+export LATE_ALLREDUCE=${LATE_ALLREDUCE:-1}
 export USE_ATOMICS=${USE_ATOMICS:-1}
 export ASM_GEMM=${ASM_GEMM:-1}
 export WQKV=${WQKV:-1}
--- a/test/external/external_test_onnx_ops.py
+++ b/test/external/external_test_onnx_ops.py
@ -285,7 +285,7 @@ class TestMainOnnxOps(TestOnnxOps):
    for dtype, zero_point in [(np.uint8, 128), (np.int8, 0)]:
      for b in (np.ones([32], dtype=np.int32), np.zeros([32], dtype=np.int32)):
        for channel_shape in [(), (32,)]:
-          with self.subTest(dtype=dtype, zero_point=zero_point, channel_shape=channel_shape):
+          with self.subTest(dtype=dtype.__name__, zero_point=zero_point, channel_shape=channel_shape):
            dtype_min, dtype_max = np.iinfo(dtype).min, np.iinfo(dtype).max
            inputs = {
              "x": np.random.randint(dtype_min, dtype_max + 1, [1, 3, 224, 224], dtype=dtype),
@ -304,7 +304,7 @@ class TestMainOnnxOps(TestOnnxOps):

  def test_qlinear_matmul(self):
    for dtype, zero_point in [(np.uint8, 128), (np.int8, 0)]:
-      with self.subTest(dtype=dtype, zero_point=zero_point):
+      with self.subTest(dtype=dtype.__name__, zero_point=zero_point):
        dtype_min, dtype_max = np.iinfo(dtype).min, np.iinfo(dtype).max
        inputs = {
          "A": np.random.randint(dtype_min, dtype_max + 1, [10, 10], dtype=dtype),
@ -512,7 +512,7 @@ class TestContribOnnxOps(TestOnnxOps):

  def test_qlinear_add(self):
    for dtype, zero_point in [(np.uint8, 128), (np.int8, 0)]:
-      with self.subTest(dtype=dtype, zero_point=zero_point):
+      with self.subTest(dtype=dtype.__name__, zero_point=zero_point):
        dtype_min, dtype_max = np.iinfo(dtype).min, np.iinfo(dtype).max
        inputs = {
          "A": np.random.randint(dtype_min, dtype_max + 1, [10, 10], dtype=dtype),
@ -546,7 +546,7 @@ class TestContribOnnxOps(TestOnnxOps):

  def test_qlinear_mul(self):
    for dtype, zero_point in [(np.uint8, 128), (np.int8, 0)]:
-      with self.subTest(dtype=dtype, zero_point=zero_point):
+      with self.subTest(dtype=dtype.__name__, zero_point=zero_point):
        dtype_min, dtype_max = np.iinfo(dtype).min, np.iinfo(dtype).max
        inputs = {
          "A": np.random.randint(dtype_min, dtype_max + 1, [10, 10], dtype=dtype),
@ -580,7 +580,7 @@ class TestContribOnnxOps(TestOnnxOps):
  def test_qlinear_global_average_pool(self):
    for dtype, zero_point in [(np.uint8, 128), (np.int8, 0)]:
      for channels_last in [0, 1]:
-        with self.subTest(dtype=dtype, zero_point=zero_point, channels_last=channels_last):
+        with self.subTest(dtype=dtype.__name__, zero_point=zero_point, channels_last=channels_last):
          dtype_min, dtype_max = np.iinfo(dtype).min, np.iinfo(dtype).max
          # NCHW for channels_last=0, NHWC for channels_last=1
          shape = [1, 3, 32, 32] if channels_last == 0 else [1, 32, 32, 3]
--- a/tinygrad/codegen/opt/heuristic.py
+++ b/tinygrad/codegen/opt/heuristic.py
@ -96,10 +96,11 @@ def hand_coded_optimizations(k:Scheduler) -> Scheduler:

  # if there are small dims with lots of valid masks, upcast them (they might be from Tensor.stack)
  to_upcast: list[int] = []
+  where_gate_rngs = {r for u in k.ast.backward_slice if u.op is Ops.WHERE for r in u.src[0].ranges}
  # upcast leading axes first (hack-ish for winograd; we actually want to upcast masked axes with low stride first)
  for axis in k.upcastable_dims:
    # for Schedule, we check if the range is used in INDEX gates or WHERE gates
-    is_masked = any(any(o is k.rngs[axis] for o in u.src[0].backward_slice) for u in k.ast.backward_slice if u.op is Ops.WHERE)
+    is_masked = k.rngs[axis] in where_gate_rngs
    if k.full_shape[axis] <= 7 and is_masked and prod(k.full_shape[j] for j in to_upcast) * k.full_shape[axis] <= 7 * 7:
      if DEBUG >= 4: print(f"upcasting masked axis : {axis}")
      to_upcast.append(axis)