Merge branch 'master' into nak

2026-06-24 02:14:17 +00:00 · 2025-09-25 13:31:23 -07:00 · 2025-09-25 13:31:23 -07:00 · 52c7fe80aa
commit 52c7fe80aa
parent a9d4b46c99 17cec8d645
151 changed files with 2173 additions and 1572 deletions
--- a/.github/actions/setup-tinygrad/action.yml
+++ b/.github/actions/setup-tinygrad/action.yml
@ -253,8 +253,13 @@ runs:
        git checkout b16039dc940dc6bc4ea0a98380495769ff35ed99
        mkdir build
        cd build
-        cmake .. -Wno-dev -G Ninja -DOCELOT_BUILD_TOOLS=OFF -DCMAKE_BUILD_ALWAYS=0 -DBUILD_TESTS_CUDA=OFF \
-          -DBoost_INCLUDE_DIR=$(brew --prefix boost)/include -DBoost_LIBRARY_DIR=$(brew --prefix boost)/lib -DCMAKE_POLICY_VERSION_MINIMUM=3.5
+
+        CMAKE_ARGS="-Wno-dev -G Ninja -DOCELOT_BUILD_TOOLS=OFF -DCMAKE_BUILD_ALWAYS=0 -DBUILD_TESTS_CUDA=OFF -DCMAKE_POLICY_VERSION_MINIMUM=3.5"
+        if [[ "${{ runner.os }}" == "macOS" ]]; then
+          CMAKE_ARGS="$CMAKE_ARGS -DBoost_INCLUDE_DIR=$(brew --prefix boost)/include -DBoost_LIBRARY_DIR=$(brew --prefix boost)/lib"
+        fi
+
+        cmake .. $CMAKE_ARGS
        ninja
    - name: Install gpuocelot
      if: inputs.ocelot == 'true'
--- a/.github/workflows/benchmark.yml
+++ b/.github/workflows/benchmark.yml
@ -28,7 +28,7 @@ jobs:
      # since sudo is required for usbgpu on macos, move the cache to a new location, as some of the files are owned by root
      PYTHONPYCACHEPREFIX: /tmp/tiny_python_pycache
    runs-on: [self-hosted, macOS]
-    timeout-minutes: 20
+    timeout-minutes: 60
    defaults:
      run:
        shell: bash -e -o pipefail {0}
@ -160,7 +160,7 @@ jobs:
  testnvidiabenchmark:
    name: tinybox green Benchmark
    runs-on: [self-hosted, Linux, tinyboxgreen]
-    timeout-minutes: 30
+    timeout-minutes: 60
    defaults:
      run:
        shell: bash -e -o pipefail {0}
@ -197,14 +197,14 @@ jobs:
    - name: Test tensor cores
      run: |
        NV=1 ALLOW_TF32=1 python3 test/opt/test_tensor_cores.py
-        PTX=1 ALLOW_TF32=1 NV=1 python3 test/opt/test_tensor_cores.py
+        NV=1 NV_PTX=1 ALLOW_TF32=1 python3 test/opt/test_tensor_cores.py
    - name: Run Tensor Core GEMM (CUDA)
      run: |
        CUDA=1 SHOULD_USE_TC=1 HALF=1 DEBUG=2 python3 extra/gemm/simple_matmul.py | tee matmul.txt
        CUDA=1 SHOULD_USE_TC=1 BFLOAT16=1 DEBUG=2 python3 extra/gemm/simple_matmul.py | tee matmul_bfloat16.txt
        CUDA=1 SHOULD_USE_TC=1 ALLOW_TF32=1 DEBUG=2 ATOL=2e-2 python3 extra/gemm/simple_matmul.py | tee matmul_tf32.txt
    - name: Run Tensor Core GEMM (PTX)
-      run: NV=1 PTX=1 SHOULD_USE_TC=1 HALF=1 DEBUG=2 python3 extra/gemm/simple_matmul.py | tee matmul_ptx.txt
+      run: NV=1 NV_PTX=1 SHOULD_USE_TC=1 HALF=1 DEBUG=2 python3 extra/gemm/simple_matmul.py | tee matmul_ptx.txt
    - name: Run Tensor Core GEMM (NV)
      run: NV=1 SHOULD_USE_TC=1 HALF=1 DEBUG=2 python3 extra/gemm/simple_matmul.py | tee matmul_nv.txt
    - name: Test NV=1
@ -274,7 +274,7 @@ jobs:
  testmorenvidiabenchmark:
    name: tinybox green Training Benchmark
    runs-on: [self-hosted, Linux, tinyboxgreen]
-    timeout-minutes: 20
+    timeout-minutes: 60
    defaults:
      run:
        shell: bash -e -o pipefail {0}
@ -302,7 +302,7 @@ jobs:
    - name: Fuzz Padded Tensor Core GEMM (NV)
      run: NV=1 M_START=12 M_STOP=20 M_STEP=1 N_START=6 N_STOP=10 N_STEP=1 K_START=28 K_STOP=36 K_STEP=1 HALF=1 TC_OPT=2 python3 ./extra/gemm/fuzz_matmul.py
    - name: Fuzz Padded Tensor Core GEMM (PTX)
-      run: NV=1 PTX=1 M_START=12 M_STOP=20 M_STEP=1 N_START=6 N_STOP=10 N_STEP=1 K_START=28 K_STOP=36 K_STEP=1 HALF=1 TC_OPT=2 python3 ./extra/gemm/fuzz_matmul.py
+      run: NV=1 NV_PTX=1 M_START=12 M_STOP=20 M_STEP=1 N_START=6 N_STOP=10 N_STEP=1 K_START=28 K_STOP=36 K_STEP=1 HALF=1 TC_OPT=2 python3 ./extra/gemm/fuzz_matmul.py
    - name: Train MNIST
      run: time PYTHONPATH=. NV=1 TARGET_EVAL_ACC_PCT=96.0 python3 examples/beautiful_mnist.py | tee beautiful_mnist.txt
    - name: Run 10 CIFAR training steps
@ -346,7 +346,7 @@ jobs:
  testamdbenchmark:
    name: tinybox red Benchmark
    runs-on: [self-hosted, Linux, tinybox]
-    timeout-minutes: 20
+    timeout-minutes: 60
    defaults:
      run:
        shell: bash -e -o pipefail {0}
@ -476,7 +476,7 @@ jobs:
  testmoreamdbenchmark:
    name: tinybox red Training Benchmark
    runs-on: [self-hosted, Linux, tinybox]
-    timeout-minutes: 30
+    timeout-minutes: 60
    defaults:
      run:
        shell: bash -e -o pipefail {0}
@ -511,8 +511,8 @@ jobs:
      run: BENCHMARK_LOG=cifar_10steps ASSERT_MIN_STEP_TIME=85 AMD=1 STEPS=10 python3 examples/hlb_cifar10.py | tee train_cifar.txt
    - name: Run 10 CIFAR training steps w HALF
      run: BENCHMARK_LOG=cifar_10steps_half ASSERT_MIN_STEP_TIME=188 AMD=1 STEPS=10 DEFAULT_FLOAT=HALF python3 examples/hlb_cifar10.py | tee train_cifar_half.txt
-    - name: Run 10 CIFAR training steps w BF16
-      run: BENCHMARK_LOG=cifar_10steps_bf16 ASSERT_MIN_STEP_TIME=288 AMD=1 STEPS=10 DEFAULT_FLOAT=BFLOAT16 python3 examples/hlb_cifar10.py | tee train_cifar_bf16.txt
+    # - name: Run 10 CIFAR training steps w BF16
+    #   run: BENCHMARK_LOG=cifar_10steps_bf16 ASSERT_MIN_STEP_TIME=288 AMD=1 STEPS=10 DEFAULT_FLOAT=BFLOAT16 python3 examples/hlb_cifar10.py | tee train_cifar_bf16.txt
    - name: Run 10 CIFAR training steps w winograd
      run: BENCHMARK_LOG=cifar_10steps_half_wino ASSERT_MIN_STEP_TIME=66 AMD=1 WINO=1 STEPS=10 DEFAULT_FLOAT=HALF python3 examples/hlb_cifar10.py | tee train_cifar_wino.txt
    - name: Run full CIFAR training w 1 GPU
@ -539,7 +539,7 @@ jobs:
  testmlperfamdbenchmark:
    name: tinybox red MLPerf Benchmark
    runs-on: [self-hosted, Linux, tinybox]
-    timeout-minutes: 30
+    timeout-minutes: 60
    defaults:
      run:
        shell: bash -e -o pipefail {0}
@ -645,7 +645,7 @@ jobs:
  testreddriverbenchmark:
    name: AM Benchmark
    runs-on: [self-hosted, Linux, tinyboxrandom]
-    timeout-minutes: 15
+    timeout-minutes: 20
    defaults:
      run:
        shell: bash -e -o pipefail {0}
@ -716,7 +716,7 @@ jobs:
  testgreendriverbenchmark:
    name: NV Benchmark
    runs-on: [self-hosted, Linux, tinyboxrandom]
-    timeout-minutes: 15
+    timeout-minutes: 20
    defaults:
      run:
        shell: bash -e -o pipefail {0}
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@ -30,8 +30,6 @@ jobs:
        key: llvm-speed
        deps: testing_minimal
        llvm: 'true'
-    - name: External Benchmark Schedule
-      run: python3 test/external/external_benchmark_schedule.py
    - name: Speed Test
      run: CPU=1 CPU_LLVM=1 python3 test/speed/external_test_speed_v_torch.py
    - name: Speed Test (BEAM=2)
@ -48,7 +46,7 @@ jobs:
      uses: ./.github/actions/setup-tinygrad
      with:
        deps: docs
-        pydeps: "capstone"
+        pydeps: "capstone torch"
    - name: Build wheel and show size
      run: |
        pip install build
@ -79,6 +77,8 @@ jobs:
      run: |
        python docs/abstractions2.py
        python docs/abstractions3.py
+    - name: Test README
+      run: awk '/```python/{flag=1;next}/```/{flag=0}flag' README.md > README.py &&  python README.py
    - name: Test Quickstart
      run: awk '/```python/{flag=1;next}/```/{flag=0}flag' docs/quickstart.md > quickstart.py &&  python quickstart.py
    - name: Test DEBUG
@ -93,8 +93,6 @@ jobs:
    name: Torch Backend Tests
    runs-on: ubuntu-latest
    timeout-minutes: 15
-    env:
-      IGNORE_OOB: 0
    steps:
    - name: Checkout Code
      uses: actions/checkout@v4
@ -132,8 +130,6 @@ jobs:
    name: Torch Backend Tests More
    runs-on: ubuntu-latest
    timeout-minutes: 15
-    env:
-      IGNORE_OOB: 0
    steps:
    - name: Checkout Code
      uses: actions/checkout@v4
@ -152,20 +148,28 @@ jobs:
    - name: Test some torch tests (expect failure)
      run: python3 -m pytest extra/torch_backend/torch_tests.py -v --tb=no || true

-  tc:
-    name: Tensor Core tests
+  bepython:
+    name: Python Backend
    runs-on: ubuntu-latest
-    timeout-minutes: 10
-    env:
-      IGNORE_OOB: 0
+    timeout-minutes: 15
    steps:
    - name: Checkout Code
      uses: actions/checkout@v4
    - name: Setup Environment
      uses: ./.github/actions/setup-tinygrad
      with:
-        key: uops-minimal
+        key: be-minimal
        deps: testing_minimal
+    - name: Test dtype with Python emulator
+      run: DEBUG=1 PYTHON=1 python3 -m pytest -n=auto test/test_dtype.py test/test_dtype_alu.py
+    - name: Test ops with Python emulator
+      run: DEBUG=2 SKIP_SLOW_TEST=1 PYTHON=1 python3 -m pytest -n=auto test/test_ops.py --durations=20
+    - name: Test uops with Python emulator
+      run: PYTHON=1 python3 -m pytest test/test_uops.py --durations=20
+    - name: Test symbolic with Python emulator
+      run: PYTHON=1 python3 test/test_symbolic_ops.py
+    - name: test_renderer_failures with Python emulator
+      run: PYTHON=1 python3 -m pytest -rA test/test_renderer_failures.py::TestRendererFailures
    - name: Test IMAGE=2 support
      run: |
        IMAGE=2 PYTHON=1 python3 test/test_ops.py TestOps.test_gemm
@ -212,31 +216,6 @@ jobs:
        DEBUG=2 EMULATE=INTEL PYTHON=1 python3 ./test/test_uops_stats.py TestUOpsStatsMatmulHalf
        DEBUG=2 AMX=1 EMULATE=AMX PYTHON=1 python3 ./test/test_uops_stats.py TestUOpsStats.test_simple_matmul

-  bepython:
-    name: Python Backend
-    runs-on: ubuntu-latest
-    timeout-minutes: 10
-    env:
-      IGNORE_OOB: 0
-    steps:
-    - name: Checkout Code
-      uses: actions/checkout@v4
-    - name: Setup Environment
-      uses: ./.github/actions/setup-tinygrad
-      with:
-        key: be-minimal
-        deps: testing_minimal
-    - name: Test dtype with Python emulator
-      run: DEBUG=1 PYTHON=1 python3 -m pytest -n=auto test/test_dtype.py test/test_dtype_alu.py
-    - name: Test ops with Python emulator
-      run: DEBUG=2 SKIP_SLOW_TEST=1 PYTHON=1 python3 -m pytest -n=auto test/test_ops.py --durations=20
-    - name: Test uops with Python emulator
-      run: PYTHON=1 python3 -m pytest test/test_uops.py --durations=20
-    - name: Test symbolic with Python emulator
-      run: PYTHON=1 python3 test/test_symbolic_ops.py
-    - name: test_renderer_failures with Python emulator
-      run: PYTHON=1 python3 -m pytest -rA test/test_renderer_failures.py::TestRendererFailures
-
  linter:
    name: Linters
    runs-on: ubuntu-latest
@ -282,8 +261,6 @@ jobs:
        key: unittest-12
        pydeps: "pillow"
        deps: testing_unit
-    - name: Test README
-      run: awk '/```python/{flag=1;next}/```/{flag=0}flag' README.md > README.py &&  python README.py
    - name: Run unit tests
      run: python -m pytest -n=auto test/unit/ --durations=20
    - name: Run targetted tests on NULL backend
@ -295,6 +272,8 @@ jobs:
    #  run: NULL=1 python3 examples/llama.py --gen 1 --size 7B --shard 4 --prompt "Hello." --count 3 --temperature 0 --timing
    - name: Run GC tests
      run: python test/external/external_uop_gc.py
+    - name: External Benchmark Schedule
+      run: python3 test/external/external_benchmark_schedule.py
    - name: Run process replay tests
      uses: ./.github/actions/process-replay
    - name: Regen dataset on test_tiny
@ -330,12 +309,10 @@ jobs:
    - name: Fuzz Test shape ops
      run: python test/external/fuzz_shape_ops.py

-  testgpuimage:
-    name: 'GPU IMAGE Tests'
+  testopenclimage:
+    name: CL IMAGE Tests
    runs-on: ubuntu-22.04
    timeout-minutes: 10
-    env:
-      IGNORE_OOB: 0
    steps:
      - name: Checkout Code
        uses: actions/checkout@v4
@ -345,19 +322,17 @@ jobs:
          key: gpu-image
          deps: testing_minimal
          opencl: 'true'
-      - name: Test GPU IMAGE=2 ops + training
+      - name: Test CL IMAGE=2 ops + training
        run: |
-          GPU=1 IMAGE=2 python -m pytest -n=auto test/test_ops.py --durations=20
-          GPU=1 IMAGE=2 python test/models/test_end2end.py TestEnd2End.test_linear_mnist
+          CL=1 IMAGE=2 python -m pytest -n=auto test/test_ops.py --durations=20
+          CL=1 IMAGE=2 python test/models/test_end2end.py TestEnd2End.test_linear_mnist
      - name: Run process replay tests
        uses: ./.github/actions/process-replay

  testgpumisc:
-    name: 'GPU Misc tests'
+    name: CL Misc tests
    runs-on: ubuntu-22.04
    timeout-minutes: 10
-    env:
-      IGNORE_OOB: 0
    steps:
      - name: Checkout Code
        uses: actions/checkout@v4
@ -368,11 +343,11 @@ jobs:
          deps: testing_minimal
          opencl: 'true'
      - name: Generate Dataset
-        run: GPU=1 extra/optimization/generate_dataset.sh
+        run: CL=1 extra/optimization/generate_dataset.sh
      - name: Run Kernel Count Test
-        run: GPU=1 python -m pytest -n=auto test/external/external_test_opt.py
+        run: CL=1 python -m pytest -n=auto test/external/external_test_opt.py
      - name: Run fused optimizer tests
-        run: GPU=1 FUSE_OPTIM=1 python -m pytest -n=auto test/models/test_mnist.py
+        run: CL=1 FUSE_OPTIM=1 python -m pytest -n=auto test/models/test_mnist.py
      - name: Upload artifact
        uses: actions/upload-artifact@v4
        with:
@ -380,11 +355,9 @@ jobs:
          path: /tmp/sops.gz

  testopenpilot:
-    name: 'openpilot Compile Tests'
+    name: openpilot Compile Tests
    runs-on: ubuntu-22.04
    timeout-minutes: 15
-    env:
-      IGNORE_OOB: 0
    steps:
      - name: Checkout Code
        uses: actions/checkout@v4
@ -397,26 +370,26 @@ jobs:
          llvm: 'true'
      - name: Test openpilot model kernel count and gate usage
        run: |
-          ALLOWED_KERNEL_COUNT=208 ALLOWED_READ_IMAGE=2175 ALLOWED_GATED_READ_IMAGE=16 FLOAT16=0 GPU=1 IMAGE=2 python examples/openpilot/compile3.py https://github.com/commaai/openpilot/raw/v0.9.4/selfdrive/modeld/models/supercombo.onnx
+          ALLOWED_KERNEL_COUNT=208 ALLOWED_READ_IMAGE=2175 ALLOWED_GATED_READ_IMAGE=16 FLOAT16=0 CL=1 IMAGE=2 python examples/openpilot/compile3.py https://github.com/commaai/openpilot/raw/v0.9.4/selfdrive/modeld/models/supercombo.onnx
      - name: Test openpilot alt model correctness (float32)
-        run: FLOAT16=0 DEBUGCL=1 GPU=1 IMAGE=2 python examples/openpilot/compile3.py https://github.com/commaai/openpilot/raw/3799fe46b3a629e491d4b8498b8ae83e4c88c304/selfdrive/modeld/models/supercombo.onnx
+        run: FLOAT16=0 DEBUGCL=1 CL=1 IMAGE=2 python examples/openpilot/compile3.py https://github.com/commaai/openpilot/raw/3799fe46b3a629e491d4b8498b8ae83e4c88c304/selfdrive/modeld/models/supercombo.onnx
      - name: Test openpilot fastvits model correctness (float32)
-        run: FLOAT16=0 DEBUGCL=1 GPU=1 IMAGE=2 python examples/openpilot/compile3.py https://github.com/commaai/openpilot/raw/9118973ed03c1ae1d40cf69a29507ec2cc78efd7/selfdrive/modeld/models/supercombo.onnx
+        run: FLOAT16=0 DEBUGCL=1 CL=1 IMAGE=2 python examples/openpilot/compile3.py https://github.com/commaai/openpilot/raw/9118973ed03c1ae1d40cf69a29507ec2cc78efd7/selfdrive/modeld/models/supercombo.onnx
      # - name: Test openpilot simple_plan vision model correctness (float32)
-      #   run: FLOAT16=0 DEBUGCL=1 GPU=1 IMAGE=2 python examples/openpilot/compile3.py https://gitlab.com/commaai/openpilot-lfs.git/gitlab-lfs/objects/35ff4f4577002f2685e50c8346addae33fe8da27a41dd4d6a0f14d1f4b1af81b
+      #   run: FLOAT16=0 DEBUGCL=1 CL=1 IMAGE=2 python examples/openpilot/compile3.py https://gitlab.com/commaai/openpilot-lfs.git/gitlab-lfs/objects/35ff4f4577002f2685e50c8346addae33fe8da27a41dd4d6a0f14d1f4b1af81b
      - name: Test openpilot LLVM compile
        run: CPU=1 CPU_LLVM=1 LLVMOPT=1 JIT=2 BEAM=0 IMAGE=0 python examples/openpilot/compile3.py https://github.com/commaai/openpilot/raw/9118973ed03c1ae1d40cf69a29507ec2cc78efd7/selfdrive/modeld/models/supercombo.onnx
      - name: Test openpilot compile4
-        run: NOLOCALS=1 GPU=1 IMAGE=2 FLOAT16=1 DEBUG=2 python3 examples/openpilot/compile4.py
+        run: NOLOCALS=1 CL=1 IMAGE=2 FLOAT16=1 DEBUG=2 python3 examples/openpilot/compile4.py
      - name: Run process replay tests
        uses: ./.github/actions/process-replay

+# ****** ONNX Tests ******
+
  testonnxcpu:
-    name: 'ONNX (CPU) Tests'
+    name: ONNX (CPU) Tests
    runs-on: ubuntu-22.04
    timeout-minutes: 20
-    env:
-      IGNORE_OOB: 0

    steps:
      - name: Checkout Code
@ -442,12 +415,9 @@ jobs:
        uses: ./.github/actions/process-replay

  testopencl:
-    name: 'ONNX (GPU)+Optimization Tests'
+    name: ONNX (CL)+Optimization Tests
    runs-on: ubuntu-22.04
    timeout-minutes: 20
-    env:
-      IGNORE_OOB: 0
-
    steps:
      - name: Checkout Code
        uses: actions/checkout@v4
@ -459,16 +429,16 @@ jobs:
          pydeps: "tensorflow==2.15.1 tensorflow_addons"
          python-version: '3.11'
          opencl: 'true'
-      - name: Test ONNX (GPU)
-        run: GPU=1 python -m pytest -n=auto test/external/external_test_onnx_backend.py --durations=20
+      - name: Test ONNX (CL)
+        run: CL=1 python -m pytest -n=auto test/external/external_test_onnx_backend.py --durations=20
      #- name: Test Optimization Helpers
      #  run: DEBUG=1 python3 extra/optimization/test_helpers.py
      #- name: Test Action Space
-      #  run: DEBUG=1 GPU=1 python3 extra/optimization/get_action_space.py
+      #  run: DEBUG=1 CL=1 python3 extra/optimization/get_action_space.py
      - name: Test Beam Search
-        run: GPU=1 IGNORE_BEAM_CACHE=1 python3 -m pytest extra/optimization/test_beam_search.py
+        run: CL=1 IGNORE_BEAM_CACHE=1 python3 -m pytest extra/optimization/test_beam_search.py
      - name: Test MLPerf stuff
-        run: GPU=1 python -m pytest -n=auto test/external/external_test_optim.py test/external/external_test_losses.py test/external/external_test_metrics.py test/external/external_test_datasets.py --durations=20
+        run: CL=1 python -m pytest -n=auto test/external/external_test_optim.py test/external/external_test_losses.py test/external/external_test_metrics.py test/external/external_test_datasets.py --durations=20
      - name: Test llama 3 training
        run: MAX_BUFFER_SIZE=0 DEV=NULL SAMPLES=300 BS=8 SEQLEN=512 GRADIENT_ACC_STEPS=8 FAKEDATA=1 DEFAULT_FLOAT=bfloat16 OPTIM_DTYPE=bfloat16 LLAMA3_SIZE=1B MODEL=llama3 python3 examples/mlperf/model_train.py
      - name: Run process replay tests
@ -488,12 +458,12 @@ jobs:
      - name: Test 1B LLM
        run: echo "What's a male chicken called? Answer with only one word." | MAX_BUFFER_SIZE=0 python3 -m tinygrad.apps.llm | grep -i rooster

+# ****** Models Tests ******
+
  testmodels:
    name: Models (llvm+cpu+gpu)
    runs-on: ubuntu-22.04
    timeout-minutes: 15
-    env:
-      IGNORE_OOB: 0
    steps:
      - name: Checkout Code
        uses: actions/checkout@v4
@ -506,15 +476,35 @@ jobs:
          llvm: 'true'
      - name: Test models (llvm)
        run: CPU=1 CPU_LLVM=1 python -m pytest -n=auto test/models --durations=20
-      - name: Test models (gpu)
-        run: GPU=1 python -m pytest -n=auto test/models --durations=20
+      - name: Test models (opencl)
+        run: CL=1 python -m pytest -n=auto test/models --durations=20
      - name: Test models (cpu)
        run: CPU=1 CPU_LLVM=0 python -m pytest -n=auto test/models --durations=20
      - name: Run process replay tests
        uses: ./.github/actions/process-replay

-  testrangeify:
-    name: Linux (rangeify)
+  testmetalmodels:
+    name: Models (metal)
+    runs-on: macos-14
+    timeout-minutes: 20
+    steps:
+    - name: Checkout Code
+      uses: actions/checkout@v4
+    - name: Setup Environment
+      uses: ./.github/actions/setup-tinygrad
+      with:
+        key: metal
+        deps: testing
+        python-version: '3.11'
+    - name: Test models (Metal)
+      run: METAL=1 python -m pytest -n=auto test/models --durations=20
+    - name: Test LLaMA compile speed
+      run: METAL=1 python test/external/external_test_speed_llama.py
+
+# ****** Feature Tests ******
+
+  testrangeifycpu:
+    name: Linux (rangeify) CPU
    runs-on: ubuntu-24.04
    timeout-minutes: 15
    steps:
@ -525,28 +515,80 @@ jobs:
      with:
        key: rangeify-minimal-llvm
        deps: testing_minimal
+        opencl: 'true'
        llvm: "true"
    - name: Test CPU=1 RANGEIFY=1
      # TODO: add more passing tests here
-      # test_symbolic_arange_sym_step is passing now
-      # test_threefry_doesnt_use_long is because there's a contig after the long now
+      # test_load_state_dict_sharded_model_dict_same_axis issue with multi
+      # test_instancenorm_3d is very slow
      run: |
        CPU=1 CPU_LLVM=0 RANGEIFY=1 python3 -m pytest -n auto --durations 20 \
-          -k "not test_symbolic_arange_sym_step and not test_threefry_doesnt_use_long" \
-          test/test_tiny.py test/test_rangeify.py test/test_ops.py test/test_tensor_variable.py \
-          test/test_outerworld_range.py test/test_sample.py test/test_randomness.py
+          -k "not test_load_state_dict_sharded_model_dict_same_axis and not test_instancenorm_3d" \
+          test/test_tiny.py test/test_rangeify.py test/test_ops.py test/test_symbolic_ops.py test/test_symbolic_jit.py test/test_tensor_variable.py \
+          test/test_outerworld_range.py test/test_randomness.py test/test_nn.py test/test_arange.py test/test_tensor.py test/test_optim.py \
+          test/test_setitem.py
+    - name: Test const folding
+      run: CPU=1 RANGEIFY=1 python3 -m pytest -n auto --durations 20 test/test_const_folding.py -k "not test_cast_padded and not TestReduceOpsConstFolding and not TestMultiConstFolding"
+    - name: Test multitensor
+      run: |
+        CPU=1 RANGEIFY=1 python3 test/test_multitensor.py TestMultiTensor.test_matmul_shard_1_1 TestMultiTensor.test_simple_add_W TestMultiTensor.test_simple_reduce \
+        TestMultiTensor.test_elementwise_dtype TestMultiTensor.test_shard_no_recompile TestHandleData.test_copied_to_device TestMultiRamUsage
+        CPU=1 RANGEIFY=1 python3 -m pytest test/test_multitensor.py::TestMultiAssign -k 'not (multi_assign_piece_noncontig or multi_assign_var_offset)'
+        CPU=1 RANGEIFY=1 python3 -m pytest -n=auto test/test_multitensor.py::TestMultiTensor test/unit/test_allreduce.py -k 'not const_folding'
    - name: Test CPU=1 RANGEIFY=2
      run: CPU=1 CPU_LLVM=0 RANGEIFY=2 python3 -m pytest -n auto test/test_tiny.py test/test_rangeify.py test/test_ops.py --durations 20
    # slow (and still wrong on beautiful_mnist)
-    #- name: Test LLVM=1 RANGEIFY=1 (slow tests)
+    #- name: Test LLVM RANGEIFY=1 (slow tests)
    #  run: CPU=1 CPU_LLVM=1 RANGEIFY=1 python3 -m pytest -n auto test/models/test_mnist.py --durations 20
+    - name: Run process replay tests
+      uses: ./.github/actions/process-replay
+
+  testrangeifycl:
+    name: Linux (rangeify) CL
+    runs-on: ubuntu-24.04
+    timeout-minutes: 15
+    steps:
+    - name: Checkout Code
+      uses: actions/checkout@v4
+    - name: Setup Environment
+      uses: ./.github/actions/setup-tinygrad
+      with:
+        key: rangeify-cl
+        deps: testing
+        opencl: 'true'
+        llvm: "true"
+    - name: Test CL=1 RANGEIFY=1
+      run: CL=1 RANGEIFY=1 pytest -n auto test/test_ops.py test/test_schedule.py test/test_symbolic_ops.py test/test_jit.py test/unit/test_disk_tensor.py test/models/test_mnist.py test/unit/test_mnist_dataset.py test/test_optim.py --durations 20
+    - name: Test Fuse
+      run: CL=1 RANGEIFY=2 python3 -m pytest --durations 20 test/test_softmax_fusion.py -k "not test_auto_softmax"
+    - name: Test ONNX
+      run: CL=1 RANGEIFY=1 python -m pytest -n=auto test/external/external_test_onnx_backend.py --durations=20
+    - name: Run process replay tests
+      uses: ./.github/actions/process-replay
+
+  testrangeifymacos:
+    name: MacOS (rangeify)
+    runs-on: macos-14
+    timeout-minutes: 15
+    steps:
+    - name: Checkout Code
+      uses: actions/checkout@v4
+    - name: Setup Environment
+      uses: ./.github/actions/setup-tinygrad
+      with:
+        key: metal
+        deps: testing
+    - name: some unit tests
+      run: METAL=1 RANGEIFY=1 python -m pytest -n=auto test/unit/test_winograd.py --durations=20
+    - name: Test METAL=1 RANGEIFY=1
+      run: METAL=1 RANGEIFY=1 python -m pytest -n=auto test/test_ops.py --durations=20
+    - name: Run process replay tests
+      uses: ./.github/actions/process-replay

  testdevectorize:
    name: Linux (devectorize)
    runs-on: ubuntu-24.04
    timeout-minutes: 15
-    env:
-      IGNORE_OOB: 0
    steps:
    - name: Checkout Code
      uses: actions/checkout@v4
@ -568,8 +610,6 @@ jobs:
    name: Linux (DSP)
    runs-on: ubuntu-24.04
    timeout-minutes: 15
-    env:
-      IGNORE_OOB: 0
    steps:
    - name: Checkout Code
      uses: actions/checkout@v4
@ -636,7 +676,6 @@ jobs:
    runs-on: ubuntu-22.04
    timeout-minutes: 20
    env:
-      IGNORE_OOB: 0
      AMD: 1
      MOCKGPU: 1
      FORWARD_ONLY: 1
@ -666,7 +705,7 @@ jobs:
        run: TRANSCENDENTAL=2 python -m pytest -n=auto test/test_ops.py::TestOps::test_sin test/test_ops.py::TestOps::test_cos test/test_ops.py::TestOps::test_tan test/test_ops.py::TestOps::test_exp test/test_ops.py::TestOps::test_log --durations=20
      - name: Run TestOps.test_add with SQTT
        run: |
-          PROFILE=1 SQTT=1 DEBUG=5 python3 test/test_ops.py TestOps.test_add
+          VIZ=1 SQTT=1 DEBUG=5 python3 test/test_ops.py TestOps.test_add
          extra/sqtt/rgptool.py create "/tmp/profile.pkl.$USER" -o /tmp/gpu0.rgp
      - name: Run process replay tests
        uses: ./.github/actions/process-replay
@ -694,7 +733,7 @@ jobs:
          cuda: 'true'
          ocelot: 'true'
      - name: Set env
-        run: printf "${{ matrix.backend == 'PTX' && 'CUDA=1\nPTX=1' || matrix.backend == 'nv' && 'NV=1\nSKIP_SLOW_TEST=1' }}" >> $GITHUB_ENV
+        run: printf "${{ matrix.backend == 'PTX' && 'CUDA=1\nCUDA_PTX=1' || matrix.backend == 'nv' && 'NV=1\nSKIP_SLOW_TEST=1' }}" >> $GITHUB_ENV
      - name: Check Device.DEFAULT and print some source
        run: |
          python3 -c "from tinygrad import Device; assert Device.DEFAULT in ['CUDA','NV'], Device.DEFAULT"
@ -705,18 +744,15 @@ jobs:
      - name: Run process replay tests
        uses: ./.github/actions/process-replay

-  tests:
+  testcpuopencl:
    strategy:
      fail-fast: false
      matrix:
-        backend: [llvm, cpu, gpu]
+        backend: [llvm, cpu, opencl]

    name: Linux (${{ matrix.backend }})
    runs-on: ubuntu-22.04
    timeout-minutes: 20
-    env:
-      IGNORE_OOB: 0
-
    steps:
      - name: Checkout Code
        uses: actions/checkout@v4
@ -725,13 +761,13 @@ jobs:
        with:
          key: ${{ matrix.backend }}-minimal
          deps: testing_minimal
-          opencl: ${{ matrix.backend == 'gpu' && 'true' }}
+          opencl: ${{ matrix.backend == 'opencl' && 'true' }}
          llvm: ${{ matrix.backend == 'llvm' && 'true' }}
      - name: Set env
-        run: printf "${{ matrix.backend == 'llvm' && 'CPU=1\nCPU_LLVM=1' || matrix.backend == 'cpu' && 'CPU=1\nCPU_LLVM=0\nCPU_COUNT=2' || matrix.backend == 'gpu' && 'GPU=1' }}" >> $GITHUB_ENV
+        run: printf "${{ matrix.backend == 'llvm' && 'CPU=1\nCPU_LLVM=1' || matrix.backend == 'cpu' && 'CPU=1\nCPU_LLVM=0\nCPU_COUNT=2' || matrix.backend == 'opencl' && 'CL=1' }}" >> $GITHUB_ENV
      - name: Check Device.DEFAULT and print some source
        run: |
-          python3 -c "from tinygrad import Device; assert Device.DEFAULT in ['CPU','GPU'], Device.DEFAULT"
+          python3 -c "from tinygrad import Device; assert Device.DEFAULT in ['CPU','CL'], Device.DEFAULT"
          DEBUG=5 FORWARD_ONLY=1 python3 test/test_ops.py TestOps.test_add
      - name: Run pytest (${{ matrix.backend }})
        run: python -m pytest -n=auto test/ --ignore=test/models --ignore=test/unit --durations=20
@ -772,7 +808,7 @@ jobs:

          start_server "remote-server-amd-1" "AMD" 6667
          start_server "remote-server-amd-2" "AMD" 6668
-          start_server "remote-server-gpu" "GPU" 7667
+          start_server "remote-server-gpu" "CL" 7667
          start_server "remote-server-cpu" "CPU" 8667
      - name: Check Device.DEFAULT and print some source
        env:
@ -786,7 +822,7 @@ jobs:
          HOST: 127.0.0.1:6667*6,127.0.0.1:6668*6
        run: |
          python3 -m pytest test/test_tiny.py test/test_jit.py test/test_subbuffer.py test/test_graph.py test/test_multitensor.py test/test_remote.py test/test_tensor_variable.py --durations 20
-      - name: Run REMOTE=1 Test (GPU)
+      - name: Run REMOTE=1 Test (CL)
        env:
          HOST: 127.0.0.1:7667*6
        run: |
@ -807,48 +843,42 @@ jobs:

 # ****** OSX Tests ******

-  testmetal2:
+  testmetal:
    name: MacOS (unit)
    runs-on: macos-14
    timeout-minutes: 20
-    env:
-      IGNORE_OOB: 0
-
    steps:
    - name: Checkout Code
      uses: actions/checkout@v4
    - name: Setup Environment
      uses: ./.github/actions/setup-tinygrad
      with:
-        key: metal2
+        key: metal
        deps: testing
        python-version: '3.11'
        amd: 'true'
        cuda: 'true'
        ocelot: 'true'
        llvm: 'true'
-    - name: Run real world test
-      run: METAL=1 python -m pytest -n=auto test/models/test_real_world.py --durations=20
-    - name: Test models (Metal)
-      run: METAL=1 python -m pytest -n=auto test/models -v --durations=20
+    - name: Run unit tests
+      run: METAL=1 python -m pytest -n=auto test/unit/ --durations=20
    - name: Run ONNX
      run: METAL=1 python -m pytest -n=auto test/external/external_test_onnx_backend.py --durations=20
    - name: Test tensor core ops (fake)
-      run: TC=2 METAL=1 DEBUG=3 python test/test_ops.py TestOps.test_gemm
+      run: METAL=1 DEBUG=3 TC=2 python test/test_ops.py TestOps.test_gemm
    - name: Test tensor core ops (real)
      run: METAL=1 DEBUG=3 python test/test_ops.py TestOps.test_big_gemm
-    - name: Test LLaMA compile speed
-      run: METAL=1 python test/external/external_test_speed_llama.py
    - name: Test Beam Search
      run: METAL=1 IGNORE_BEAM_CACHE=1 python3 -m pytest extra/optimization/test_beam_search.py
    #- name: Fuzz Test linearizer
    #  run: METAL=1 DEPTH=4 FUZZ_N=50 FUZZ_MAX_SIZE=1000000 python test/external/fuzz_linearizer.py
    - name: Run TRANSCENDENTAL math
-      run: TRANSCENDENTAL=2 python -m pytest -n=auto test/test_ops.py::TestOps::test_sin test/test_ops.py::TestOps::test_cos test/test_ops.py::TestOps::test_tan test/test_ops.py::TestOps::test_exp test/test_ops.py::TestOps::test_log --durations=20
+      run: METAL=1 TRANSCENDENTAL=2 python -m pytest -n=auto test/test_ops.py::TestOps::test_sin test/test_ops.py::TestOps::test_cos test/test_ops.py::TestOps::test_tan test/test_ops.py::TestOps::test_exp test/test_ops.py::TestOps::test_log --durations=20
    - name: Run pytest (amd)
      env:
        MOCKGPU: 1
        AMD: 1
+        AMD_LLVM: 0
        FORWARD_ONLY: 1
      run: |
        python3 -m pytest -n=auto test/device/test_hcq.py test/test_tiny.py --durations=20
@ -856,13 +886,14 @@ jobs:
      env:
        MOCKGPU: 1
        AMD: 1
+        AMD_LLVM: 1
        FORWARD_ONLY: 1
      run: |
        python -m pytest -n=auto test/device/test_hcq.py test/test_tiny.py test/device/test_amd_llvm.py --durations=20
    - name: Run pytest (ptx)
      env:
        MOCKGPU: 1
-        PTX: 1
+        NV_PTX: 1
        NV: 1
        FORWARD_ONLY: 1
      run: |
@ -939,8 +970,6 @@ jobs:
    name: MacOS (${{ matrix.backend }})
    runs-on: macos-15
    timeout-minutes: 20
-    env:
-      IGNORE_OOB: 0
    steps:
      - name: Checkout Code
        uses: actions/checkout@v4
@ -976,8 +1005,6 @@ jobs:
    name: Windows (${{ matrix.backend }})
    runs-on: windows-latest
    timeout-minutes: 15
-    env:
-      IGNORE_OOB: 0
    steps:
      - name: Checkout Code
        uses: actions/checkout@v4
--- a/README.md
+++ b/README.md
@ -79,7 +79,7 @@ See [examples/beautiful_mnist.py](examples/beautiful_mnist.py) for the full vers

 tinygrad already supports numerous accelerators, including:

- [x] [GPU (OpenCL)](tinygrad/runtime/ops_gpu.py)
+- [x] [OpenCL](tinygrad/runtime/ops_cl.py)
 - [x] [CPU](tinygrad/runtime/ops_cpu.py)
 - [x] [METAL](tinygrad/runtime/ops_metal.py)
 - [x] [CUDA](tinygrad/runtime/ops_cuda.py)
--- a/docs/env_vars.md
+++ b/docs/env_vars.md
@ -3,7 +3,7 @@
 This is a list of environment variable that control the runtime behavior of tinygrad and its examples.
 Most of these are self-explanatory, and are usually used to set an option at runtime.

-Example: `GPU=1 DEBUG=4 python3 -m pytest`
+Example: `CL=1 DEBUG=4 python3 -m pytest`

 However you can also decorate a function to set a value only inside that function.

@ -31,7 +31,7 @@ These control the behavior of core tinygrad even when used as a library.
 Variable | Possible Value(s) | Description
 ---|---|---
 DEBUG               | [1-7]      | enable debugging output (operations, timings, speed, generated code and more)
-GPU                 | [1]        | enable the GPU (OpenCL) backend
+CL                  | [1]        | enable OpenCL backend
 CUDA                | [1]        | enable CUDA backend
 AMD                 | [1]        | enable AMD backend
 NV                  | [1]        | enable NV backend
@ -41,8 +41,6 @@ BEAM                | [#]        | number of beams in kernel beam search
 DEFAULT_FLOAT       | [HALF, ...]| specify the default float dtype (FLOAT32, HALF, BFLOAT16, FLOAT64, ...), default to FLOAT32
 IMAGE               | [1-2]      | enable 2d specific optimizations
 FLOAT16             | [1]        | use float16 for images instead of float32
-PTX                 | [1]        | enable the specialized [PTX](https://docs.nvidia.com/cuda/parallel-thread-execution/) assembler for Nvidia GPUs. If not set, defaults to generic CUDA codegen backend.
-PROFILE             | [1]        | enable profiling. This feature is supported in NV, AMD, QCOM and METAL backends.
 VISIBLE_DEVICES     | [list[int]]| restricts the NV/AMD devices that are available. The format is a comma-separated list of identifiers (indexing starts with 0).
 JIT                 | [0-2]      | 0=disabled, 1=[jit enabled](quickstart.md#jit) (default), 2=jit enabled, but graphs are disabled
 VIZ                 | [1]        | 0=disabled, 1=[viz enabled](https://github.com/tinygrad/tinygrad/tree/master/tinygrad/viz)
--- a/docs/runtime.md
+++ b/docs/runtime.md
@ -2,17 +2,17 @@

 tinygrad supports various runtimes, enabling your code to scale across a wide range of devices. The default runtime can be automatically selected based on the available hardware, or you can force a specific runtime to be default using environment variables (e.g., `CPU=1`).

-| Runtime | Description | Requirements |
-|---------|-------------|--------------|
-| [NV](https://github.com/tinygrad/tinygrad/tree/master/tinygrad/runtime/ops_nv.py) | Provides acceleration for NVIDIA GPUs | Ampere/Ada series GPUs |
-| [AMD](https://github.com/tinygrad/tinygrad/tree/master/tinygrad/runtime/ops_amd.py) | Provides acceleration for AMD GPUs | RDNA2/RDNA3/RDNA4 series GPUs. You can select one of the interfaces for communication by setting `AMD_IFACE=(KFD|PCI)`. See [AMD interfaces](#amd-interfaces) for more details. |
-| [QCOM](https://github.com/tinygrad/tinygrad/tree/master/tinygrad/runtime/ops_qcom.py) | Provides acceleration for QCOM GPUs | 6xx series GPUs |
-| [METAL](https://github.com/tinygrad/tinygrad/tree/master/tinygrad/runtime/ops_metal.py) | Utilizes Metal for acceleration on Apple devices | M1+ Macs; Metal 3.0+ for `bfloat` support |
-| [CUDA](https://github.com/tinygrad/tinygrad/tree/master/tinygrad/runtime/ops_cuda.py) | Utilizes CUDA for acceleration on NVIDIA GPUs | NVIDIA GPU with CUDA support |
-| [GPU (OpenCL)](https://github.com/tinygrad/tinygrad/tree/master/tinygrad/runtime/ops_gpu.py) | Accelerates computations using OpenCL on GPUs | OpenCL 2.0 compatible device |
-| [CPU (C Code)](https://github.com/tinygrad/tinygrad/tree/master/tinygrad/runtime/ops_cpu.py) | Runs on CPU using the clang compiler | `clang` compiler in system `PATH` |
-| [LLVM (LLVM IR)](https://github.com/tinygrad/tinygrad/tree/master/tinygrad/runtime/ops_llvm.py) | Runs on CPU using the LLVM compiler infrastructure | llvm libraries installed and findable |
-| [WEBGPU](https://github.com/tinygrad/tinygrad/tree/master/tinygrad/runtime/ops_webgpu.py) | Runs on GPU using the Dawn WebGPU engine (used in Google Chrome) | Dawn library installed and findable. Download binaries [here](https://github.com/wpmed92/pydawn/releases/tag/v0.3.0). |
+| Runtime | Description | Compiler Options | Requirements |
+|---------|-------------|------------------|--------------|
+| [NV](https://github.com/tinygrad/tinygrad/tree/master/tinygrad/runtime/ops_nv.py) | Provides acceleration for NVIDIA GPUs | nvrtc (default)<br>PTX (`NV_PTX=1`) | Ampere/Ada/Blackwell series GPUs.<br>You can select an interface via `NV_IFACE=(NVK\|PCI)`. See [NV interfaces](#nv-interfaces) for details. |
+| [AMD](https://github.com/tinygrad/tinygrad/tree/master/tinygrad/runtime/ops_amd.py) | Provides acceleration for AMD GPUs | LLVM (`AMD_LLVM=1`)<br>HIP/COMGR (`AMD_HIP=1`) | RDNA2 or newer GPUs.<br>You can select an interface via `AMD_IFACE=(KFD\|PCI\|USB)`. See [AMD interfaces](#amd-interfaces) for details. |
+| [QCOM](https://github.com/tinygrad/tinygrad/tree/master/tinygrad/runtime/ops_qcom.py) | Provides acceleration for QCOM GPUs | - | 6xx series GPUs |
+| [METAL](https://github.com/tinygrad/tinygrad/tree/master/tinygrad/runtime/ops_metal.py) | Utilizes Metal for acceleration on Apple devices | - | M1+ Macs; Metal 3.0+ for `bfloat` support |
+| [CUDA](https://github.com/tinygrad/tinygrad/tree/master/tinygrad/runtime/ops_cuda.py) | Utilizes CUDA for acceleration on NVIDIA GPUs | nvrtc (default)<br> PTX (`CUDA_PTX=1`) | NVIDIA GPU with CUDA support |
+| [CL](https://github.com/tinygrad/tinygrad/tree/master/tinygrad/runtime/ops_cl.py) | Accelerates computations using OpenCL on GPUs | - | OpenCL 2.0 compatible device |
+| [CPU](https://github.com/tinygrad/tinygrad/tree/master/tinygrad/runtime/ops_cpu.py) | Runs on CPU using the clang or llvm compiler | Clang JIT (default)<br>LLVM IR (`CPU_LLVM=1`) | `clang` compiler in system `PATH` |
+| [WEBGPU](https://github.com/tinygrad/tinygrad/tree/master/tinygrad/runtime/ops_webgpu.py) | Runs on GPU using the Dawn WebGPU engine (used in Google Chrome) | - | Dawn library installed and discoverable. Binaries: [pydawn v0.3.0](https://github.com/wpmed92/pydawn/releases/tag/v0.3.0) |
+

 ## Interoperability

@ -70,5 +70,12 @@ AMD backend supports several interfaces for communicating with devices:

 * `KFD`: uses the amdgpu driver
 * `PCI`: uses the [AM driver](developer/am.md)
+* `USB`: USB3 interafce for asm24xx chips.

 You can force an interface by setting `AMD_IFACE` to one of these values. In the case of `AMD_IFACE=PCI`, this may unbind your GPU from the amdgpu driver.
+
+## NV Interfaces
+NV backend supports several interfaces for communicating with devices:
+
+* `NVK`: uses the nvidia driver
+* `PCI`: uses the [NV driver](https://github.com/tinygrad/tinygrad/tree/master/tinygrad/runtime/support/nv/nvdev.py)
--- a/examples/gpt2.py
+++ b/examples/gpt2.py
@ -189,7 +189,7 @@ class GPT2:
      GlobalCounters.reset()
      if timing: print("")
      st = GlobalCounters.time_sum_s
-      with Timing("ran model in ", on_exit=(lambda et: (f", {(GlobalCounters.time_sum_s-st)*1e3:.2f} ms on GPU" if DEBUG>=2 else "")+
+      with Timing("ran model in ", on_exit=(lambda et: (f", {(GlobalCounters.time_sum_s-st)*1e3:.2f} ms on {Device.DEFAULT}" if DEBUG>=2 else "")+
                  f", {GlobalCounters.global_ops*1e-9:.2f} GOPS, {GlobalCounters.global_mem*1e-9:.2f} GB"+
                  (f", {GlobalCounters.global_mem*1e-9/(GlobalCounters.time_sum_s-st):.2f} GB/s" if DEBUG>=2 else "")) if DEBUG else None, enabled=timing):
        with WallTimeEvent(BenchEvent.STEP):
--- a/examples/llama.py
+++ b/examples/llama.py
@ -478,7 +478,7 @@ After you are done speaking, output [EOS]. You are not Chad.
      with Profiling(enabled=args.profile):
        with Timing("total ", enabled=args.timing, on_exit=lambda x: f", {1e9/x:.2f} tok/s, {GlobalCounters.global_mem/x:.2f} GB/s, param {param_bytes/x:.2f} GB/s"):
          with WallTimeEvent(BenchEvent.STEP):
-            with Timing("enqueue in ", on_exit=(lambda et: (f", {(GlobalCounters.time_sum_s-st)*1e3:.2f} ms on GPU" if DEBUG>=2 else "")+
+            with Timing("enqueue in ", on_exit=(lambda et: (f", {(GlobalCounters.time_sum_s-st)*1e3:.2f} ms on {Device.DEFAULT}" if DEBUG>=2 else "")+
                        f", {GlobalCounters.global_ops*1e-9:.2f} GOPS, {GlobalCounters.global_mem*1e-9:.2f} GB"+
                        (f", {GlobalCounters.global_mem*1e-9/(GlobalCounters.time_sum_s-st):.2f} GB/s, param {param_bytes*1e-9/(GlobalCounters.time_sum_s-st):.2f} GB/s" if DEBUG>=2 else "")) if DEBUG else None, enabled=args.timing):
              tok_tensor = llama.model(next_tok, start_pos, args.temperature)
--- a/examples/llama3.py
+++ b/examples/llama3.py
@ -441,7 +441,7 @@ if __name__ == "__main__":
      with Profiling(enabled=args.profile):
        with Timing("total ", on_exit=lambda x: f", {1e9/x:.2f} tok/s, {GlobalCounters.global_mem/x:.2f} GB/s, param {param_bytes/x:.2f} GB/s"):
          with WallTimeEvent(BenchEvent.STEP):
-            with Timing("enqueue in ", on_exit=(lambda et: (f", {(GlobalCounters.time_sum_s-st)*1e3:.2f} ms on GPU" if DEBUG>=2 else "")+
+            with Timing("enqueue in ", on_exit=(lambda et: (f", {(GlobalCounters.time_sum_s-st)*1e3:.2f} ms on {Device.DEFAULT}" if DEBUG>=2 else "")+
                        f", {GlobalCounters.global_ops*1e-9:.2f} GOPS, {GlobalCounters.global_mem*1e-9:.2f} GB"+
                        (f", {GlobalCounters.global_mem*1e-9/(GlobalCounters.time_sum_s-st):.2f} GB/s, param {param_bytes*1e-9/(GlobalCounters.time_sum_s-st):.2f} GB/s" if DEBUG>=2 else "")) if DEBUG else None):
              tok = model(Tensor([[last_tok]], device=device), start_pos, TEMPERATURE, TOP_K, TOP_P, ALPHA_F, ALPHA_P)
@ -479,7 +479,7 @@ if __name__ == "__main__":
        st = GlobalCounters.time_sum_s
        with Profiling(enabled=args.profile):
          with Timing("total ", enabled=args.timing, on_exit=lambda x: f", {1e9/x:.2f} tok/s, {GlobalCounters.global_mem/x:.2f} GB/s, param {param_bytes/x:.2f} GB/s"):
-            with Timing("enqueue in ", on_exit=(lambda et: (f", {(GlobalCounters.time_sum_s-st)*1e3:.2f} ms on GPU" if DEBUG>=2 else "")+
+            with Timing("enqueue in ", on_exit=(lambda et: (f", {(GlobalCounters.time_sum_s-st)*1e3:.2f} ms on {Device.DEFAULT}" if DEBUG>=2 else "")+
                        f", {GlobalCounters.global_ops*1e-9:.2f} GOPS, {GlobalCounters.global_mem*1e-9:.2f} GB"+
                        (f", {GlobalCounters.global_mem*1e-9/(GlobalCounters.time_sum_s-st):.2f} GB/s, param {param_bytes*1e-9/(GlobalCounters.time_sum_s-st):.2f} GB/s" if DEBUG>=2 else "")) if DEBUG else None, enabled=args.timing):

--- a/examples/mamba.py
+++ b/examples/mamba.py
@ -279,9 +279,15 @@ def generate(model, tokenizer, prompt: str, n_tokens_to_gen: int = 10, temp: boo
  # Loading in the prompt tokens
  logits = model.forward(Tensor([tks]))[:, -1, :]
  for _ in tqdm(range(n_tokens_to_gen), desc="Speed Gen"):
-    # TODO: topk
    if sample:
-      tok_Tens = (logits/temp).softmax().multinomial()
+      scaled_logits = logits / temp
+      if top_k is not None:
+        topk_values, topk_indices = scaled_logits.topk(top_k)
+        filtered_logits = Tensor.full_like(scaled_logits, -float("inf"))
+        filtered_logits = filtered_logits.scatter(dim=-1, index=topk_indices, src=topk_values)
+        tok_Tens = filtered_logits.softmax().multinomial()
+      else:
+        tok_Tens = scaled_logits.softmax().multinomial()
    else:
      tok_Tens = logits.argmax(axis=-1).unsqueeze(0)
    tok = tok_Tens.item()
@ -298,6 +304,7 @@ if __name__ == "__main__":
  parser.add_argument("--size", type=str, default="370m",
                      help=f"Size of model to use [{', '.join([k for k in MODELS.keys()])}]")
  parser.add_argument("--n_tokens", type=int, default=10, help="Number of tokens to generate")
+  parser.add_argument("--top_k", type=int, help="Limit sampling to the top k most likely tokens")
  parser.add_argument("--sample", dest="sample", action="store_true", help="Sample flag")
  parser.add_argument("--temp", type=float, default=1.0, help="Sampling temp has to be <=1.0")
  args = parser.parse_args()
@ -308,8 +315,9 @@ if __name__ == "__main__":
  num_toks = args.n_tokens
  sample = args.sample
  temp = args.temp
+  top_k = args.top_k
  s = time.time()
-  tinyoutput = generate(model, tokenizer, prompt, n_tokens_to_gen=num_toks, sample=sample, temp=temp)
+  tinyoutput = generate(model, tokenizer, prompt, n_tokens_to_gen=num_toks, sample=sample, temp=temp, top_k=top_k)
  print(tinyoutput)
  print('TIME: ', time.time() - s)
  TORCHOUTPUT = "Why is gravity \nso important?\nBecause it's the only"
--- a/examples/mlperf/scripts/stable_diffusion_downloads.sh
+++ b/examples/mlperf/scripts/stable_diffusion_downloads.sh
@ -0,0 +1,57 @@
+#!/usr/bin/env bash
+# adapted from https://github.com/mlcommons/training/blob/4bdf5c8ed218ad76565a2ba1ac27c919ccc6d689/stable_diffusion/README.md
+
+# setup dirs
+
+DATA=/raid/datasets/stable_diffusion
+
+LAION=$DATA/laion-400m/webdataset-moments-filtered 
+COCO=$DATA/coco2014
+mkdir -p $LAION $COCO
+
+CKPT=/raid/weights/stable_diffusion
+mkdir -p $CKPT/clip $CKPT/sd $CKPT/inception
+
+# download data
+
+# if rclone isn't installed system-wide / in your PATH, put the executable path in quotes below
+#RCLONE=""
+RCLONE="rclone"
+
+## VAE-encoded image latents, from 6.1M image subset of laion-400m
+## about 1 TB for whole download
+$RCLONE config create mlc-training s3 provider=Cloudflare access_key_id=76ea42eadb867e854061a1806220ee1e secret_access_key=a53625c4d45e3ca8ac0df8a353ea3a41ffc3292aa25259addd8b7dc5a6ce2936 endpoint=c2686074cb2caf5cbaf6d134bdba8b47.r2.cloudflarestorage.com
+$RCLONE copy mlc-training:mlcommons-training-wg-public/stable_diffusion/datasets/laion-400m/moments-webdataset-filtered/ ${LAION} --include="*.tar" -P
+$RCLONE copy mlc-training:mlcommons-training-wg-public/stable_diffusion/datasets/laion-400m/moments-webdataset-filtered/sha512sums.txt ${LAION} -P
+cd $LAION && grep -E '\.tar$' sha512sums.txt | sha512sum -c --quiet - && \
+  echo "All .tar files verified" || { echo "Checksum failure when validating downloaded Laion moments"; exit 1; }
+
+## prompts and FID statistics from 30k image subset of coco2014
+## 33 MB
+$RCLONE config create mlc-training s3 provider=Cloudflare access_key_id=76ea42eadb867e854061a1806220ee1e secret_access_key=a53625c4d45e3ca8ac0df8a353ea3a41ffc3292aa25259addd8b7dc5a6ce2936 endpoint=c2686074cb2caf5cbaf6d134bdba8b47.r2.cloudflarestorage.com
+$RCLONE copy mlc-training:mlcommons-training-wg-public/stable_diffusion/datasets/coco2014/val2014_30k.tsv ${COCO} -P
+
+$RCLONE config create mlc-training s3 provider=Cloudflare access_key_id=76ea42eadb867e854061a1806220ee1e secret_access_key=a53625c4d45e3ca8ac0df8a353ea3a41ffc3292aa25259addd8b7dc5a6ce2936 endpoint=c2686074cb2caf5cbaf6d134bdba8b47.r2.cloudflarestorage.com
+$RCLONE copy mlc-training:mlcommons-training-wg-public/stable_diffusion/datasets/coco2014/val2014_30k_stats.npz ${COCO} -P
+
+# download checkpoints
+
+## clip (needed for text and vision encoders for validation)
+CLIP_WEIGHTS_URL="https://huggingface.co/laion/CLIP-ViT-H-14-laion2B-s32B-b79K/resolve/main/open_clip_pytorch_model.bin"
+CLIP_WEIGHTS_SHA256="9a78ef8e8c73fd0df621682e7a8e8eb36c6916cb3c16b291a082ecd52ab79cc4"
+CLIP_CONFIG_URL="https://huggingface.co/laion/CLIP-ViT-H-14-laion2B-s32B-b79K/raw/main/open_clip_config.json"
+wget -N -P ${CKPT}/clip ${CLIP_WEIGHTS_URL}
+wget -N -P ${CKPT}/clip ${CLIP_CONFIG_URL}
+echo "${CLIP_WEIGHTS_SHA256}  ${CKPT}/clip/open_clip_pytorch_model.bin"                    | sha256sum -c
+
+## sd (needed for latent->image decoder for validation, also has clip text encoder for training)
+SD_WEIGHTS_URL='https://huggingface.co/stabilityai/stable-diffusion-2-base/resolve/main/512-base-ema.ckpt'
+SD_WEIGHTS_SHA256="d635794c1fedfdfa261e065370bea59c651fc9bfa65dc6d67ad29e11869a1824"
+wget -N -P ${CKPT}/sd ${SD_WEIGHTS_URL}
+echo "${SD_WEIGHTS_SHA256}  ${CKPT}/sd/512-base-ema.ckpt"                    | sha256sum -c
+
+## inception (needed for validation)
+FID_WEIGHTS_URL='https://github.com/mseitzer/pytorch-fid/releases/download/fid_weights/pt_inception-2015-12-05-6726825d.pth'
+FID_WEIGHTS_SHA1="bd836944fd6db519dfd8d924aa457f5b3c8357ff"
+wget -N -P ${CKPT}/inception ${FID_WEIGHTS_URL}
+echo "${FID_WEIGHTS_SHA1}  ${CKPT}/inception/pt_inception-2015-12-05-6726825d.pth"                    | sha1sum -c
--- a/examples/openpilot/compile4.py
+++ b/examples/openpilot/compile4.py
@ -6,7 +6,7 @@ from tinygrad.schedule.kernelize import get_kernelize_map
 from tinygrad.engine.schedule import create_schedule_with_vars
 from tinygrad.engine.realize import run_schedule

-# NOLOCALS=1 GPU=1 IMAGE=2 FLOAT16=1 VIZ=1 DEBUG=2 python3 examples/openpilot/compile4.py
+# NOLOCALS=1 CL=1 IMAGE=2 FLOAT16=1 VIZ=1 DEBUG=2 python3 examples/openpilot/compile4.py

 OPENPILOT_MODEL = sys.argv[1] if len(sys.argv) > 1 else "https://github.com/commaai/openpilot/raw/v0.9.7/selfdrive/modeld/models/supercombo.onnx"
 OUTPUT = sys.argv[2] if len(sys.argv) > 2 else "/tmp/openpilot.pkl"
--- a/examples/qwq.py
+++ b/examples/qwq.py
@ -8,7 +8,7 @@ from typing import Dict, Union

 from extra.models.llama import Transformer, convert_from_huggingface, fix_bf16
 from examples.llama3 import load
-from tinygrad import nn, Tensor
+from tinygrad import nn, Tensor, Device
 from tinygrad.helpers import fetch, colored, GlobalCounters, Timing, DEBUG
 from tinygrad.nn.state import load_state_dict, get_parameters

@ -80,7 +80,7 @@ if __name__ == "__main__":
    st = GlobalCounters.time_sum_s
    next_tok = Tensor([toks[start_pos:]]) if tok_tensor is None or (len(toks)-start_pos) > 1 else tok_tensor.reshape(1, 1)
    with Timing("total ", enabled=args.timing, on_exit=lambda x: f", {1e9/x:.2f} tok/s, {GlobalCounters.global_mem/x:.2f} GB/s, param {param_bytes/x:.2f} GB/s"):
-      with Timing("enqueue in ", on_exit=(lambda et: (f", {(GlobalCounters.time_sum_s-st)*1e3:.2f} ms on GPU" if DEBUG>=2 else "") +
+      with Timing("enqueue in ", on_exit=(lambda et: (f", {(GlobalCounters.time_sum_s-st)*1e3:.2f} ms on {Device.DEFAULT}" if DEBUG>=2 else "") +
                  f", {GlobalCounters.global_ops*1e-9:.2f} GOPS, {GlobalCounters.global_mem*1e-9:.2f} GB" +
                  (f", {GlobalCounters.global_mem*1e-9/(GlobalCounters.time_sum_s-st):.2f} GB/s, param {param_bytes*1e-9/(GlobalCounters.time_sum_s-st):.2f} GB/s" if DEBUG>=2 else "")) if DEBUG else None, enabled=args.timing):
        tok_tensor = transformer(next_tok, start_pos, args.temperature)
--- a/examples/sdxl.py
+++ b/examples/sdxl.py
@ -437,8 +437,8 @@ if __name__ == "__main__":
    im.show()

  # validation!
-  if args.prompt == default_prompt and args.steps == 10 and args.seed == 0 and args.guidance == 6.0 and args.width == args.height == 1024 \
-    and not args.weights:
+  is_default = args.prompt == default_prompt and args.steps == 10 and args.seed == 0 and args.guidance == 6.0 and args.width == args.height == 1024
+  if is_default and not args.weights and not args.fakeweights:
    ref_image = Tensor(np.array(Image.open(Path(__file__).parent / "sdxl_seed0.png")))
    distance = (((x.cast(dtypes.float) - ref_image.cast(dtypes.float)) / ref_image.max())**2).mean().item()
    assert distance < 4e-3, colored(f"validation failed with {distance=}", "red")
--- a/examples/whisper.py
+++ b/examples/whisper.py
@ -109,7 +109,7 @@ class TextDecoder:

  def forward(self, x:Tensor, pos:Union[Variable, Literal[0]], encoded_audio:Tensor):
    seqlen = x.shape[-1]
-    x = self.token_embedding(x) + self.positional_embedding.shrink(((pos, pos+seqlen), None, None))
+    x = self.token_embedding(x) + self.positional_embedding.shrink(((pos, pos+seqlen), None))
    for block in self.blocks: x = block(x, xa=encoded_audio, mask=self.mask, len=pos)
    return self.output_tok(x)

--- a/extra/archprobe.py
+++ b/extra/archprobe.py
@ -1,7 +1,7 @@
 # copying the kernels from https://github.com/microsoft/ArchProbe into Python
 import numpy as np
 import pickle
-from tinygrad.runtime.ops_gpu import CLProgram, CLBuffer
+from tinygrad.runtime.ops_cl import CLProgram, CLBuffer
 from tinygrad import dtypes
 from tqdm import trange, tqdm
 from matplotlib import pyplot as plt
--- a/extra/assembly/assembly_rdna.py
+++ b/extra/assembly/assembly_rdna.py
@ -4,7 +4,7 @@ from tinygrad import dtypes
 from tinygrad.codegen.assembly import AssemblyCodegen, Register
 from tinygrad.codegen.opt.kernel import Ops
 from tinygrad.uop.ops import BinaryOps, UnaryOps, TernaryOps
-from tinygrad.runtime.ops_gpu import ROCM_LLVM_PATH
+from tinygrad.runtime.ops_cl import ROCM_LLVM_PATH

 # ugh, is this really needed?
 from extra.helpers import enable_early_exec
--- a/extra/assembly/rocm/rdna3/asm.py
+++ b/extra/assembly/rocm/rdna3/asm.py
@ -5,7 +5,7 @@ from tinygrad.helpers import colored
 from extra.helpers import enable_early_exec
 early_exec = enable_early_exec()

-from tinygrad.runtime.ops_gpu import CLProgram, CLBuffer, ROCM_LLVM_PATH
+from tinygrad.runtime.ops_cl import CLProgram, CLBuffer, ROCM_LLVM_PATH

 ENABLE_NON_ASM = False

--- a/extra/export_model.py
+++ b/extra/export_model.py
@ -10,7 +10,7 @@ from tinygrad.uop.ops import Ops
 import json
 from collections import OrderedDict

-EXPORT_SUPPORTED_DEVICE = ["WEBGPU", "CPU", "CUDA", "GPU"]
+EXPORT_SUPPORTED_DEVICE = ["WEBGPU", "CPU", "CUDA", "CL"]

 def compile_net(run:TinyJit, special_names:Dict[int,str]) -> Tuple[Dict[str,str],List[Tuple[str,List[str],List[int]]],Dict[str,Tuple[int,DType,int]],Dict[str,Tensor]]:
  functions, bufs, bufs_to_save, statements, bufnum = {}, {}, {}, [], 0
--- a/extra/gemm/intel_xmx.py
+++ b/extra/gemm/intel_xmx.py
@ -1,6 +1,6 @@
 #!/usr/bin/env python3
 import numpy as np
-from tinygrad.runtime.ops_gpu import CLProgram, CLCompiler
+from tinygrad.runtime.ops_cl import CLProgram, CLCompiler
 from tinygrad import Device, dtypes
 from tinygrad.device import Buffer
 from hexdump import hexdump
@ -11,7 +11,7 @@ from hexdump import hexdump
 # https://registry.khronos.org/OpenCL/extensions/intel/cl_intel_subgroup_split_matrix_multiply_accumulate.html
 # https://hc34.hotchips.org/assets/program/conference/day1/GPU%20HPC/Intel_s%20Ponte%20Vecchio%20GPU%20-%20Architecture%20Systems%20and%20Software%20FINAL.pdf

-device = Device["GPU"]
+device = Device["CL"]

 # NOTE: only the subgroup type 8 ones work
 prog = CLProgram(device, "test", CLCompiler(device, "test").compile(f"""
@ -26,9 +26,9 @@ __kernel void test(__global float* data0, const __global int* data1, const __glo
 """))
 #with open("/tmp/test.elf", "wb") as f: f.write(prog.lib)

-a = Buffer("GPU", 8, dtypes.float32).allocate()
-b = Buffer("GPU", 0x10, dtypes.float16).allocate()
-c = Buffer("GPU", 8*0x10, dtypes.float16).allocate()
+a = Buffer("CL", 8, dtypes.float32).allocate()
+b = Buffer("CL", 0x10, dtypes.float16).allocate()
+c = Buffer("CL", 8*0x10, dtypes.float16).allocate()

 row = np.array([1,2,3,4,5,6,7,8,1,2,3,4,5,6,7,8], np.float16)
 mat = np.random.random((8, 0x10)).astype(np.float16)
--- a/extra/gemm/max_matmul.py
+++ b/extra/gemm/max_matmul.py
@ -75,7 +75,7 @@ if __name__ == "__main__":

    if GEMM_VARIATION == "max" and (M%64)==0 and (N%128)==0 and (K%64)==0 and DTYPE_IN == dtypes.half and DTYPE_OUT == dtypes.float and DTYPE_ACC == dtypes.float:
      print("Using CUDA and triton-generated kernel")
-      # See nv_triton_gemm.annotated.ptx for PTX code which was generated from `PYTHONPATH=. DEBUG=6 CUDA=1 PTX=1 python3 extra/gemm/triton_nv_matmul.py`
+      # See nv_triton_gemm.annotated.ptx for PTX code which was generated from `PYTHONPATH=. DEBUG=6 CUDA=1 CUDA_PTX=1 python3 extra/gemm/triton_nv_matmul.py`
      # this kernel with M=N=K=4096 does 162TFLOPS, vs torch at 144TFLOPS and BEAM=8 tinygrad at 138TFLOPS.  theo max is 165TFLOPS.

      # WMMA element size is (M, N, K) = (16, 8, 16)
--- a/extra/gemm/triton_nv_matmul.py
+++ b/extra/gemm/triton_nv_matmul.py
@ -43,7 +43,7 @@ def matmul_kernel(c_ptr, a_ptr, b_ptr, BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_N:
  c_ptrs = c_ptr + stride_cm * offs_cm[:, None] + stride_cn * offs_cn[None, :]
  tl.store(c_ptrs, c)

-# CUDA=1 PTX=1 python3 extra/gemm/triton_nv_matmul.py
+# CUDA=1 CUDA_PTX=1 python3 extra/gemm/triton_nv_matmul.py
 if __name__ == "__main__":
  BLOCK_SIZE_M, BLOCK_SIZE_N, BLOCK_SIZE_K = 64, 128, 64
  M, N, K = 4096, 4096, 4096
--- a/extra/models/inception.py
+++ b/extra/models/inception.py
@ -270,8 +270,10 @@ class FidInceptionV3:
    self.Mixed_7b = inception.Mixed_7b
    self.Mixed_7c = inception.Mixed_7c

-  def load_from_pretrained(self):
-    state_dict = torch_load(str(fetch("https://github.com/mseitzer/pytorch-fid/releases/download/fid_weights/pt_inception-2015-12-05-6726825d.pth", "pt_inception-2015-12-05-6726825d.pth")))
+  def load_from_pretrained(self, path=None):
+    if path is None:
+      path = fetch("https://github.com/mseitzer/pytorch-fid/releases/download/fid_weights/pt_inception-2015-12-05-6726825d.pth", "pt_inception-2015-12-05-6726825d.pth")
+    state_dict = torch_load(str(path))
    for k,v in state_dict.items():
      if k.endswith(".num_batches_tracked"):
        state_dict[k] = v.reshape(1)
--- a/extra/nv_gpu_driver/nv_ioctl.py
+++ b/extra/nv_gpu_driver/nv_ioctl.py
@ -272,4 +272,4 @@ def compare_launch_state(states, good_states):

  return True, "PASS"

-# IOCTL=1 PTX=1 CUDA=1 python3 test/test_ops.py TestOps.test_tiny_add
+# IOCTL=1 CUDA=1 CUDA_PTX=1 python3 test/test_ops.py TestOps.test_tiny_add
--- a/extra/optimization/generate_dataset.sh
+++ b/extra/optimization/generate_dataset.sh
@ -7,7 +7,7 @@ rm $LOGOPS
 test/external/process_replay/reset.py

 CI=1 python3 -m pytest -n=auto test/test_ops.py test/test_nn.py test/test_winograd.py test/models/test_real_world.py --durations=20
-GPU=1 python3 -m pytest test/test_tiny.py
+CL=1 python3 -m pytest test/test_tiny.py

 # extract, sort and uniq
 extra/optimization/extract_dataset.py
--- a/extra/optimization/test_beam_search.py
+++ b/extra/optimization/test_beam_search.py
@ -50,7 +50,7 @@ class TestBeamSearch(unittest.TestCase):
  def test_variable_shrink_prime_number(self):
    v = Variable("v", 1, 400).bind(367)
    a = rand(400, 367)
-    b = (a.shrink(((0,v), None))+1).reshape(367,367).realize()
+    b = (a.shrink(((0,v), None))+1)[:367,:367].realize()
    np.testing.assert_allclose(b.numpy(), a.numpy()[:367]+1, atol=1e-4, rtol=1e-4)

  def test_no_mutate_rawbuffers(self):
--- a/extra/qcom_gpu_driver/qcom_opencl_interop.py
+++ b/extra/qcom_gpu_driver/qcom_opencl_interop.py
@ -1,6 +1,6 @@
 import ctypes, array
 from hexdump import hexdump
-from tinygrad.runtime.ops_gpu import GPUDevice
+from tinygrad.runtime.ops_cl import CLDevice
 from tinygrad.helpers import getenv, to_mv, mv_address
 from tinygrad.dtype import dtypes
 from tinygrad import Tensor, TinyJit
@ -8,7 +8,7 @@ from tinygrad.runtime.autogen import opencl as cl
 if getenv("IOCTL"): import extra.qcom_gpu_driver.opencl_ioctl  # noqa: F401  # pylint: disable=unused-import

 # create raw opencl buffer.
-gdev = GPUDevice()
+gdev = CLDevice()
 cl_buf = cl.clCreateBuffer(gdev.context, cl.CL_MEM_READ_WRITE, 0x100, None, status := ctypes.c_int32())
 assert status.value == 0

--- a/extra/sqtt/README.md
+++ b/extra/sqtt/README.md
@ -4,7 +4,7 @@

 Only supported on 7900XTX, requires either AM (`rmmod amdgpu`) or disabling power gating on AMD (`ppfeaturemask=0xffff3fff`, don't forget to rebuild initramfs)

-SQTT is implemented on top of normal tinygrad PROFILE=1, `PROFILE=1 SQTT=1` to get profile pickle with sqtt data embedded in it.
+SQTT is implemented on top of normal tinygrad profiling, `VIZ=1 SQTT=1` to get profile pickle with sqtt data embedded in it.

 `SQTT_BUFFER_SIZE=X` to change size of SQTT buffer (per shader engine, 6 SEs on 7900xtx) in megabytes, default 256.

--- a/extra/thneed.py
+++ b/extra/thneed.py
@ -4,13 +4,13 @@ import struct
 import json
 import traceback
 import numpy as np
-from tinygrad.runtime.ops_gpu import CLProgram, compile_gpu
+from tinygrad.runtime.ops_cl import CLProgram, compile_gpu
 from tinygrad.device import Device
 from tinygrad.helpers import DEBUG, getenv
 from collections import defaultdict
 import pyopencl as cl
-from tinygrad.runtime.ops_gpu import OSX_TIMING_RATIO
-CL = Device["GPU"]
+from tinygrad.runtime.ops_cl import OSX_TIMING_RATIO
+CL = Device["CL"]

 DEBUGCL = getenv("DEBUGCL", 0)
 FLOAT16 = getenv("FLOAT16", 0)
@ -110,7 +110,7 @@ class Thneed:
    prgs = {}
    for o in jdat['binaries']:
      nptr = ptr + o['length']
-      prgs[o['name']] = CLProgram(Device["GPU"], o['name'], weights[ptr:nptr])
+      prgs[o['name']] = CLProgram(Device["CL"], o['name'], weights[ptr:nptr])
      ptr = nptr

    # populate the cl_cache
@ -267,7 +267,7 @@ class Thneed:
    for prg, args in self.cl_cache:
      events.append(prg.clprg(CL.queue, *args))
    mt = time.monotonic()
-    Device["GPU"].synchronize()
+    Device["CL"].synchronize()
    et = time.monotonic() - st
    print(f"submit in {(mt-st)*1000.0:.2f} ms, total runtime is {et*1000.0:.2f} ms")

--- a/extra/to_movement_ops.py
+++ b/extra/to_movement_ops.py
@ -35,7 +35,7 @@ def to_movement_ops(st: ShapeTracker) -> List[Tuple[MovementOps, Tuple]]:
  to_apply:List[Tuple[MovementOps, Tuple]] = []
  for i, v in enumerate(st.views):
    real_shape = tuple(y-x for x,y in v.mask) if v.mask else v.shape
-    offset = v.offset + sum(st*(s-1) for s,st in zip(real_shape, v.strides) if st<0)
+    offset = (v.offset or 0) + sum(st*(s-1) for s,st in zip(real_shape, v.strides) if st<0)
    real_offset = offset + (sum(x*st for (x,_),st in zip(v.mask, v.strides)) if v.mask else 0)
    real_real_shape = [s for s,st in zip(real_shape, v.strides) if st]
    strides: List[int] = [abs(st) if isinstance(st,int) else st for st in v.strides if st]
--- a/extra/torch_backend/backend.py
+++ b/extra/torch_backend/backend.py
@ -177,22 +177,28 @@ def cached_to_movement_ops(shape, st) -> list:

 from tinygrad.shape.shapetracker import ShapeTracker, View
 from extra.to_movement_ops import to_movement_ops, apply_mop, MovementOps
+
+@wrap_view_op
+def _as_strided(tensor:Tensor, size, stride, storage_offset=None):
+  # multiple as_strided do not compound
+  base = canonical_base(tensor)
+  # TODO: this is heavyweight
+  st = ShapeTracker(base.uop.st.views + (View.create(tuple(size), tuple(stride), storage_offset),))
+  ret = base
+  if TORCH_DEBUG >= 1: print("**** as_strided", tensor.shape, size, stride, st)
+  if prod(size) == 1: return ret.flatten()[storage_offset].reshape(size)
+  for mo in cached_to_movement_ops(tuple(base.shape), st): ret = apply_mop(ret, mo)
+  return ret
+
@torch.library.impl("aten::as_strided", "privateuseone")
 def as_strided(tensor:torch.Tensor, size, stride, storage_offset=None):
  storage_offset = storage_offset or tensor.storage_offset()
-  @wrap_view_op
-  def _as_strided(tensor:Tensor, size, stride, storage_offset=None):
-    # multiple as_strided do not compound
-    base = canonical_base(tensor)
-    # TODO: this is heavyweight
-    st = ShapeTracker(base.uop.st.views + (View.create(tuple(size), tuple(stride), storage_offset),))
-    ret = base
-    if TORCH_DEBUG >= 1: print("**** as_strided", tensor.shape, size, stride, st)
-    if prod(size) == 1: return ret.flatten()[storage_offset].reshape(size)
-    for mo in cached_to_movement_ops(tuple(base.shape), st): ret = apply_mop(ret, mo)
-    return ret
  return _as_strided(tensor, size, stride, storage_offset)

+@torch.library.impl("aten::_reshape_alias", "privateuseone")
+def _reshape_alias(tensor:torch.Tensor, size, stride):
+  return _as_strided(tensor, size, stride)
+
@torch.library.impl("aten::empty_strided", "privateuseone")
 def empty_strided(size, stride, dtype, layout=None, device=None, pin_memory=False):
  if TORCH_DEBUG: print(f"empty_strided {size=} {stride=} {dtype=} {layout=} {device=} {pin_memory=}")
--- a/pytest.ini
+++ b/pytest.ini
@ -1,2 +1,6 @@
 [pytest]
 norecursedirs = extra
+timeout = 180
+timeout_method = thread
+timeout_func_only = true
+testpaths = test
--- a/setup.py
+++ b/setup.py
@ -9,12 +9,12 @@ with open(directory / 'README.md', encoding='utf-8') as f:

 testing_minimal = [
  "numpy",
-  "torch==2.7.1",
+  "torch==2.8.0",
  "pytest",
  "pytest-xdist",
+  "pytest-timeout",
  "hypothesis",
  "z3-solver",
-  "ml_dtypes"
 ]

 setup(name='tinygrad',
@ -59,7 +59,7 @@ setup(name='tinygrad',
        'triton': ["triton-nightly>=2.1.0.dev20231014192330"],
        'linting': [
            "pylint",
-            "mypy==1.13.0",
+            "mypy==1.18.1",
            "typing-extensions",
            "pre-commit",
            "ruff",
--- a/test/device/test_ocl.py
+++ b/test/device/test_ocl.py
@ -3,9 +3,9 @@ from tinygrad import Device
 from tinygrad.device import Buffer
 from tinygrad.dtype import dtypes
 from tinygrad.helpers import CI
-from tinygrad.runtime.ops_gpu import CLDevice, CLAllocator, CLCompiler, CLProgram
+from tinygrad.runtime.ops_cl import CLDevice, CLAllocator, CLCompiler, CLProgram

-@unittest.skipUnless(Device.DEFAULT == "GPU", "Runs only on OpenCL (GPU)")
+@unittest.skipUnless(Device.DEFAULT == "CL", "Runs only on OpenCL")
 class TestCLError(unittest.TestCase):
  @unittest.skipIf(CI, "dangerous for CI, it allocates tons of memory")
  def test_oom(self):
@ -24,7 +24,7 @@ class TestCLError(unittest.TestCase):
  def test_unaligned_copy(self):
    data = list(range(65))
    unaligned = memoryview(bytearray(data))[1:]
-    buffer = Buffer("GPU", 64, dtypes.uint8).allocate()
+    buffer = Buffer("CL", 64, dtypes.uint8).allocate()
    buffer.copyin(unaligned)
    result = memoryview(bytearray(len(data) - 1))
    buffer.copyout(result)
--- a/test/device/test_qcom.py
+++ b/test/device/test_qcom.py
@ -10,10 +10,11 @@ class TestQcom(unittest.TestCase):

    def __validate(imgdt, expected_pitch):
      img = dev.allocator.alloc(imgdt.shape[0] * imgdt.shape[1] * 16, options:=BufferSpec(image=imgdt))
-      pitch = (img.descriptor[2] & 0x1fffff80) >> 7
+      pitch = img.texture_info.pitch
      assert pitch == expected_pitch, f"Failed pitch for image: {imgdt}. Got 0x{pitch:X}, expected 0x{expected_pitch:X}"
      dev.allocator.free(img, imgdt.shape[0] * imgdt.shape[1] * 16, options)

+    # Match opencl pitches for perf
    __validate(dtypes.imageh((1, 201)), 0x680)
    __validate(dtypes.imageh((16, 216)), 0x700)
    __validate(dtypes.imageh((16, 9)), 0x80)
--- a/test/external/external_benchmark_hip_compile.py
+++ b/test/external/external_benchmark_hip_compile.py
@ -1,7 +1,7 @@
 import random, os
 from tinygrad.helpers import Timing
 from tinygrad.runtime.ops_hip import compile_hip, HIPDevice
-from tinygrad.runtime.ops_gpu import compile_cl, CLDevice
+from tinygrad.runtime.ops_cl import compile_cl, CLDevice

 # OMP_NUM_THREADS=1 strace -tt -f -e trace=file python3 test/external/external_benchmark_hip_compile.py
 # AMD_COMGR_REDIRECT_LOGS=stdout AMD_COMGR_EMIT_VERBOSE_LOGS=1 python3 test/external/external_benchmark_hip_compile.py
--- a/test/external/external_cl_half_max.py
+++ b/test/external/external_cl_half_max.py
@ -1,4 +1,4 @@
-from tinygrad.runtime.ops_gpu import CLDevice, CLProgram, compile_cl
+from tinygrad.runtime.ops_cl import CLDevice, CLProgram, compile_cl

 if __name__ == "__main__":
  dev = CLDevice()
--- a/test/external/external_gpu_fail_osx.py
+++ b/test/external/external_gpu_fail_osx.py
@ -1,5 +1,5 @@
 # ugh, OS X OpenCL doesn't support half
-from tinygrad.runtime.ops_gpu import CLDevice, CLProgram, CLCompiler
+from tinygrad.runtime.ops_cl import CLDevice, CLProgram, CLCompiler

 src = """#pragma OPENCL EXTENSION cl_khr_fp16 : enable
 __kernel void max_half(__global half* data0, const __global half* data1) {
--- a/test/external/external_multi_gpu.py
+++ b/test/external/external_multi_gpu.py
@ -1,6 +1,6 @@
 #!/usr/bin/env python3
 # cd extra/disassemblers/ && git clone --recursive github.com:geohot/cuda_ioctl_sniffer.git
-# LD_PRELOAD=$PWD/extra/disassemblers/cuda_ioctl_sniffer/out/sniff.so GPU=1 python3 test/external/external_multi_gpu.py
+# LD_PRELOAD=$PWD/extra/disassemblers/cuda_ioctl_sniffer/out/sniff.so CL=1 python3 test/external/external_multi_gpu.py
 import numpy as np
 from tinygrad.tensor import Tensor
 from tinygrad.helpers import colored, Timing, getenv
--- a/test/external/external_osx_profiling.py
+++ b/test/external/external_osx_profiling.py
@ -1,4 +1,4 @@
-from tinygrad.runtime.ops_gpu import CLProgram, CL, CLBuffer
+from tinygrad.runtime.ops_cl import CLProgram, CL, CLBuffer
 from tinygrad import dtypes
 import time

--- a/test/external/external_test_hcq_fuzz_failures.py
+++ b/test/external/external_test_hcq_fuzz_failures.py
--- a/test/external/external_test_image.py
+++ b/test/external/external_test_image.py
@ -4,7 +4,7 @@ import unittest
 import numpy as np
 if 'IMAGE' not in os.environ:
  os.environ['IMAGE'] = '2'
-os.environ['GPU'] = '1'
+os.environ['CL'] = '1'
 os.environ['OPT'] = '2'
 from tinygrad.tensor import Tensor
 from tinygrad.nn import Conv2d
--- a/test/external/external_test_onnx_backend.py
+++ b/test/external/external_test_onnx_backend.py
@ -193,12 +193,12 @@ backend_test.exclude('test_adam_cpu')
 backend_test.exclude('test_gradient_of_add_and_mul_cpu')
 backend_test.exclude('test_gradient_of_add_cpu')

-if Device.DEFAULT in ['GPU', 'METAL']:
+if Device.DEFAULT in ['CL', 'METAL']:
  backend_test.exclude('test_resize_upsample_sizes_nearest_axes_2_3_cpu')
  backend_test.exclude('test_resize_upsample_sizes_nearest_axes_3_2_cpu')
  backend_test.exclude('test_resize_upsample_sizes_nearest_cpu')

-if Device.DEFAULT == "METAL" or (OSX and Device.DEFAULT == "GPU"):
+if Device.DEFAULT == "METAL" or (OSX and Device.DEFAULT == "CL"):
  # numerical inaccuracy
  backend_test.exclude('test_mish_cpu')
  backend_test.exclude('test_mish_expanded_cpu')
--- a/test/external/external_test_opt.py
+++ b/test/external/external_test_opt.py
@ -4,7 +4,7 @@ import numpy as np
 import torch

 from tinygrad import GlobalCounters, Tensor, Device
-from tinygrad.helpers import getenv, Context
+from tinygrad.helpers import getenv, Context, RANGEIFY
 from tinygrad.nn.state import get_parameters
 from tinygrad.engine.realize import capturing
 from tinygrad.tensor import _to_np_dtype
@ -34,7 +34,7 @@ from extra.models.efficientnet import EfficientNet
 from extra.models.resnet import ResNet18
 from extra.models.vit import ViT

-@unittest.skipUnless(Device.DEFAULT == "GPU", "Not Implemented")
+@unittest.skipUnless(Device.DEFAULT == "CL", "Not Implemented")
 class TestInferenceMinKernels(unittest.TestCase):
  def setUp(self):
    self.training_old = Tensor.training
@ -90,7 +90,7 @@ class TestInferenceMinKernels(unittest.TestCase):
    with CLCache(100):
      model(inp, 0).realize()

-@unittest.skipUnless(Device.DEFAULT == "GPU", "Not Implemented")
+@unittest.skipUnless(Device.DEFAULT == "CL", "Not Implemented")
 class TestOptBinOp(unittest.TestCase):
  def _test_no_binop_rerun(self, f1, f2=None, allowed=1):
    a = Tensor.randn(16, 16)
@ -106,7 +106,7 @@ class TestOptBinOp(unittest.TestCase):
  def test_no_binop_rerun(self): return self._test_no_binop_rerun(lambda a,b: a*b, lambda a,b: (a*b).reshape(16, 16, 1))
  def test_no_binop_rerun_alt(self): return self._test_no_binop_rerun(lambda a,b: (a*b).reshape(16, 16, 1), lambda a,b: a*b)
  def test_no_binop_rerun_reduce_broadcast(self):
-    return self._test_no_binop_rerun(lambda a,b: a.sum()+b, lambda a,b: a.sum().reshape(1,1)+b, allowed=2)
+    return self._test_no_binop_rerun(lambda a,b: a.sum()+b, lambda a,b: a.sum().reshape(1,1)+b, allowed=1 if RANGEIFY else 2)

  @unittest.skip("this test started failing with the new change, based movementop issue")
  def test_no_binop_rerun_transposed(self): return self._test_no_binop_rerun(lambda a,b: (a.T*b.T).T, lambda a,b: a*b)
@ -117,7 +117,7 @@ class TestOptBinOp(unittest.TestCase):
  #def test_no_binop_rerun_reduce(self): return self._test_no_binop_rerun(lambda a,b: (a*b).sum(), lambda a,b: (a*b).reshape(16, 16, 1).sum())
  #def test_no_binop_rerun_reduce_alt(self): return self._test_no_binop_rerun(lambda a,b: a.sum(1)+b[0], lambda a,b: a.sum(1).reshape(1,16)+b[0])

-@unittest.skipUnless(Device.DEFAULT == "GPU", "Not Implemented")
+@unittest.skipUnless(Device.DEFAULT == "CL", "Not Implemented")
 class TestOptReduceLoop(unittest.TestCase):
  def test_loop_left(self):
    a = Tensor.randn(16, 16)
@ -139,7 +139,7 @@ class TestOptReduceLoop(unittest.TestCase):
      c.realize()
      assert cache.count == 2, "loop right fusion broken"

-@unittest.skipUnless(Device.DEFAULT == "GPU", "Not Implemented")
+@unittest.skipUnless(Device.DEFAULT == "CL", "Not Implemented")
 class TestOptWChild(unittest.TestCase):
  @unittest.skip("this no longer happens, use realize")
  def test_unrealized_child(self):
@ -152,7 +152,7 @@ class TestOptWChild(unittest.TestCase):
      d.realize()
      assert cache.count == 2, "don't fuse if you have children"

-@unittest.skipUnless(Device.DEFAULT == "GPU", "Not Implemented")
+@unittest.skipUnless(Device.DEFAULT == "CL", "Not Implemented")
 class TestOpt(unittest.TestCase):
  def test_muladd(self):
    a,b,c = [Tensor.randn(2,2).realize() for _ in range(3)]
@ -164,7 +164,7 @@ class TestOpt(unittest.TestCase):

  def test_permute_was_pushed(self):
    a = Tensor.randn(16, 16, 16)
-    with CLCache(2):
+    with CLCache(1 if RANGEIFY else 2):
      c = a.sum(2)
      d = c.permute(1,0).contiguous()
      d.realize()
@ -172,7 +172,7 @@ class TestOpt(unittest.TestCase):

  def test_permute_was_pushed_through_contract_reshape(self):
    a = Tensor.randn(4, 4, 4, 4, 4)
-    with CLCache(2):
+    with CLCache(1 if RANGEIFY else 2):
      c = a.sum(-1)
      d = c.reshape(16,16).permute(1,0).contiguous()
      d.realize()
@ -180,7 +180,7 @@ class TestOpt(unittest.TestCase):

  def test_permute_was_pushed_through_contractw1s_reshape(self):
    a = Tensor.randn(4, 4, 4, 4, 4)
-    with CLCache(2):
+    with CLCache(1 if RANGEIFY else 2):
      c = a.sum(-1)
      d = c.reshape(16,1,16).permute(2,1,0).contiguous()
      d.realize()
@ -188,7 +188,7 @@ class TestOpt(unittest.TestCase):

  def test_permute_was_pushed_through_expand_reshape(self):
    a = Tensor.randn(16, 16, 16)
-    with CLCache(2):
+    with CLCache(1 if RANGEIFY else 2):
      c = a.sum(2)
      d = c.reshape(4,4,4,4).permute(2,3,0,1).contiguous()
      d.realize()
--- a/test/external/external_test_speed_llama.py
+++ b/test/external/external_test_speed_llama.py
@ -20,7 +20,7 @@ class TestLLaMASpeed(unittest.TestCase):
  def test_llama_compile(self):
    backup_program = Device[Device.DEFAULT].runtime
    backup_allocator = Device[Device.DEFAULT].allocator
-    backup_compiler = Device[Device.DEFAULT].compiler
+    backup_compiler = Device[Device.DEFAULT].compiler.compile_cached
    Device[Device.DEFAULT].runtime = FakeProgram
    Device[Device.DEFAULT].allocator = FakeAllocator(Device.default)

@ -44,14 +44,14 @@ class TestLLaMASpeed(unittest.TestCase):
    run_llama("codegen(1)")

    # test no compiler use for this
-    Device[Device.DEFAULT].compiler = None
+    Device[Device.DEFAULT].compiler.compile_cached = None
    run_llama("methodcache", False)
    with Profiling(sort='time', frac=0.1, fn="/tmp/llama.prof", ts=5):
      run_llama("profile", False)

    Device[Device.DEFAULT].runtime = backup_program
    Device[Device.DEFAULT].allocator = backup_allocator
-    Device[Device.DEFAULT].compiler = backup_compiler
+    Device[Device.DEFAULT].compiler.compile_cached = backup_compiler

 if __name__ == '__main__':
  TestLLaMASpeed().test_llama_compile()
--- a/test/external/external_uop_gc.py
+++ b/test/external/external_uop_gc.py
@ -1,6 +1,6 @@
 import gc
 from tinygrad import Tensor, UOp, Device
-from tinygrad.shape.shapetracker import views_to_indexed_uops
+from tinygrad.shape.shapetracker import views_to_valid_uop
 from tinygrad.engine.realize import method_cache, get_program

 def uops_allocated(): return sum([isinstance(x, UOp) for x in gc.get_objects()])
@ -60,7 +60,7 @@ if __name__ == "__main__":

    # these caches will keep uops alive
    method_cache.clear()
-    views_to_indexed_uops.cache_clear()
+    views_to_valid_uop.cache_clear()

    new_uops = uops_allocated()
    gc.collect()
--- a/test/external/fuzz_fast_idiv.py
+++ b/test/external/fuzz_fast_idiv.py
@ -11,7 +11,7 @@ if __name__ == "__main__":
  for i in range(10_000):
    if i % 1000 == 0:
      print(f"Progress: {i}")
-    dt = random.choice(dtypes.ints)
+    dt = random.choice(dtypes.ints + tuple(dt.vec(4) for dt in dtypes.ints))
    u = UOp.variable('x', random.randint(dt.min, 0), random.randint(1, dt.max), dtype=dt)
    d = random.randint(1, max(1, u.arg[2]))
    if d in powers_of_two: continue
--- a/test/external/fuzz_linearizer.py
+++ b/test/external/fuzz_linearizer.py
@ -16,7 +16,7 @@ if os.getenv("VALIDATE_HCQ", 0) != 0:
  try:
    import extra.qcom_gpu_driver.opencl_ioctl
    from tinygrad import Device
-    _, _ = Device["QCOM"], Device["GPU"]
+    _, _ = Device["QCOM"], Device["CL"]
  except Exception: pass

 from tinygrad import Tensor, Device, dtypes
@ -42,9 +42,9 @@ if getenv("VALIDATE_HCQ"):
    on_linearizer_did_run = extra.nv_gpu_driver.nv_ioctl.collect_last_launch_state
    compare_states = extra.nv_gpu_driver.nv_ioctl.compare_launch_state
  elif Device.DEFAULT == "QCOM":
-    print("VALIDATE_HCQ: Comparing QCOM to GPU")
+    print("VALIDATE_HCQ: Comparing QCOM to CL")
    import extra.qcom_gpu_driver.opencl_ioctl
-    validate_device = Device["GPU"]
+    validate_device = Device["CL"]
    on_linearizer_will_run = extra.qcom_gpu_driver.opencl_ioctl.before_launch
    on_linearizer_did_run = extra.qcom_gpu_driver.opencl_ioctl.collect_last_launch_state
    compare_states = extra.qcom_gpu_driver.opencl_ioctl.compare_launch_state
@ -302,7 +302,7 @@ if __name__ == "__main__":
    for i, ast in enumerate(ast_strs[:getenv("FUZZ_N", len(ast_strs))]):
      if (nth := getenv("FUZZ_NTH", -1)) != -1 and i != nth: continue
      if getenv("FUZZ_IMAGEONLY") and "dtypes.image" not in ast: continue
-      if "dtypes.image" in ast and Device.DEFAULT not in {"GPU", "QCOM"}: continue  # IMAGE is only for GPU
+      if "dtypes.image" in ast and Device.DEFAULT not in {"CL", "QCOM"}: continue  # IMAGE is only for CL
      if ast in seen_ast_strs: continue
      seen_ast_strs.add(ast)

--- a/test/helpers.py
+++ b/test/helpers.py
@ -57,8 +57,8 @@ def eval_uop(uop:UOp, inputs:list[tuple[DType, list[Any]]]|None=None):
  return out_buf.cast(uop.dtype.fmt).tolist()[0]

 def not_support_multi_device():
-  # GPU and CUDA don't support multi device if in CI
-  return CI and REAL_DEV in ("GPU", "CUDA")
+  # CL and CUDA don't support multi device if in CI
+  return CI and REAL_DEV in ("CL", "CUDA")

 # NOTE: This will open REMOTE if it's the default device
 REAL_DEV = (Device.DEFAULT if Device.DEFAULT != "REMOTE" else Device['REMOTE'].properties.real_device)
--- a/test/models/test_bert.py
+++ b/test/models/test_bert.py
@ -1,14 +1,14 @@
 #!/usr/bin/env python
 import unittest
+from tinygrad import Tensor
 import numpy as np
-from tinygrad.tensor import Tensor
 import torch

 def get_question_samp(bsz, seq_len, vocab_size, seed):
  np.random.seed(seed)
  in_ids= np.random.randint(vocab_size, size=(bsz, seq_len))
  mask = np.random.choice([True, False], size=(bsz, seq_len))
-  seg_ids = np.random.randint(1, size=(bsz, seq_len))
+  seg_ids = np.random.randint(2, size=(bsz, seq_len))  # type_vocab_size
  return in_ids, mask, seg_ids

 def set_equal_weights(mdl, torch_mdl):
@ -45,7 +45,7 @@ class TestBert(unittest.TestCase):

    seeds = (1337, 3141)
    bsz, seq_len = 1, 16
-    for _, seed in enumerate(seeds):
+    for seed in seeds:
      in_ids, mask, seg_ids = get_question_samp(bsz, seq_len, config['vocab_size'], seed)
      out = mdl(Tensor(in_ids), Tensor(mask), Tensor(seg_ids))
      torch_out = torch_mdl.forward(torch.from_numpy(in_ids).long(), torch.from_numpy(mask), torch.from_numpy(seg_ids).long())[:2]
--- a/test/models/test_efficientnet.py
+++ b/test/models/test_efficientnet.py
@ -1,12 +1,10 @@
-import ast
-import pathlib
-import unittest
+import ast, pathlib, unittest

 import numpy as np
 from PIL import Image

-from tinygrad.helpers import getenv
-from tinygrad.tensor import Tensor
+from tinygrad import Tensor
+from tinygrad.helpers import getenv, CI
 from extra.models.efficientnet import EfficientNet
 from extra.models.vit import ViT
 from extra.models.resnet import ResNet50
@ -40,19 +38,13 @@ def preprocess(img, new=False):
    img /= np.array([0.229, 0.224, 0.225]).reshape((1, -1, 1, 1))
  return img

+def _infer(model: EfficientNet, img):
+  with Tensor.train(False):
+    out = model.forward(Tensor(img)).argmax(axis=-1)
+  return out.tolist()

-def _infer(model: EfficientNet, img, bs=1):
-  old_training = Tensor.training
-  Tensor.training = False
-  img = preprocess(img)
-  # run the net
-  if bs > 1: img = img.repeat(bs, axis=0)
-  out = model.forward(Tensor(img))
-  Tensor.training = old_training
-  return _LABELS[np.argmax(out.numpy()[0])]
-
-chicken_img = Image.open(pathlib.Path(__file__).parent / 'efficientnet/Chicken.jpg')
-car_img = Image.open(pathlib.Path(__file__).parent / 'efficientnet/car.jpg')
+chicken_img = preprocess(Image.open(pathlib.Path(__file__).parent / 'efficientnet/Chicken.jpg'))
+car_img = preprocess(Image.open(pathlib.Path(__file__).parent / 'efficientnet/car.jpg'))

 class TestEfficientNet(unittest.TestCase):
  @classmethod
@ -64,17 +56,20 @@ class TestEfficientNet(unittest.TestCase):
  def tearDownClass(cls):
    del cls.model

+  @unittest.skipIf(CI, "covered by test_chicken_car")
  def test_chicken(self):
-    label = _infer(self.model, chicken_img)
-    self.assertEqual(label, "hen")
-
-  def test_chicken_bigbatch(self):
-    label = _infer(self.model, chicken_img, 2)
-    self.assertEqual(label, "hen")
+    labels = _infer(self.model, chicken_img)
+    self.assertEqual(_LABELS[labels[0]], "hen")

+  @unittest.skipIf(CI, "covered by test_chicken_car")
  def test_car(self):
-    label = _infer(self.model, car_img)
-    self.assertEqual(label, "sports car, sport car")
+    labels = _infer(self.model, car_img)
+    self.assertEqual(_LABELS[labels[0]], "sports car, sport car")
+
+  def test_chicken_car(self):
+    labels = _infer(self.model, np.concat([chicken_img, car_img], axis=0))
+    self.assertEqual(_LABELS[labels[0]], "hen")
+    self.assertEqual(_LABELS[labels[1]], "sports car, sport car")

 class TestViT(unittest.TestCase):
  @classmethod
@ -87,12 +82,12 @@ class TestViT(unittest.TestCase):
    del cls.model

  def test_chicken(self):
-    label = _infer(self.model, chicken_img)
-    self.assertEqual(label, "cock")
+    labels = _infer(self.model, chicken_img)
+    self.assertEqual(_LABELS[labels[0]], "cock")

  def test_car(self):
-    label = _infer(self.model, car_img)
-    self.assertEqual(label, "racer, race car, racing car")
+    labels = _infer(self.model, car_img)
+    self.assertEqual(_LABELS[labels[0]], "racer, race car, racing car")

 class TestResNet(unittest.TestCase):
  @classmethod
@ -105,12 +100,12 @@ class TestResNet(unittest.TestCase):
    del cls.model

  def test_chicken(self):
-    label = _infer(self.model, chicken_img)
-    self.assertEqual(label, "hen")
+    labels = _infer(self.model, chicken_img)
+    self.assertEqual(_LABELS[labels[0]], "hen")

  def test_car(self):
-    label = _infer(self.model, car_img)
-    self.assertEqual(label, "sports car, sport car")
+    labels = _infer(self.model, car_img)
+    self.assertEqual(_LABELS[labels[0]], "sports car, sport car")

 if __name__ == '__main__':
  unittest.main()
--- a/test/models/test_onnx.py
+++ b/test/models/test_onnx.py
@ -5,12 +5,8 @@ from tinygrad.frontend.onnx import OnnxRunner
 from tinygrad.device import Device
 from tinygrad.helpers import fetch, Context

-try:
-  from extra.onnx_helpers import validate
-  from extra.huggingface_onnx.huggingface_manager import DOWNLOADS_DIR, snapshot_download_with_retry
-  HUGGINGFACE_AVAILABLE = True
-except ModuleNotFoundError:
-  HUGGINGFACE_AVAILABLE = False
+from extra.onnx_helpers import validate
+from extra.huggingface_onnx.huggingface_manager import DOWNLOADS_DIR, snapshot_download_with_retry

 def run_onnx_torch(onnx_model, inputs):
  import torch
@ -62,7 +58,7 @@ class TestOnnxModel(unittest.TestCase):
    print(cls, _LABELS[cls])
    assert "car" in _LABELS[cls] or _LABELS[cls] == "convertible"

-@unittest.skipUnless(HUGGINGFACE_AVAILABLE and Device.DEFAULT == "METAL", "only run on METAL")
+@unittest.skipUnless(Device.DEFAULT == "METAL", "only run on METAL")
 class TestHuggingFaceOnnxModels(unittest.TestCase):
  @classmethod
  def setUpClass(cls):
--- a/test/models/test_real_world.py
+++ b/test/models/test_real_world.py
@ -53,8 +53,8 @@ class TestRealWorld(unittest.TestCase):
  @unittest.skipUnless(is_dtype_supported(dtypes.float16), "need dtypes.float16")
  def test_stable_diffusion(self):
    params = unet_params
-    params["model_ch"] = 16
-    params["ctx_dim"] = 16
+    params["model_ch"] = 8
+    params["ctx_dim"] = 8
    params["num_res_blocks"] = 1
    params["n_heads"] = 2
    model = UNetModel(**params)
@ -114,7 +114,7 @@ class TestRealWorld(unittest.TestCase):

      helper_test("train_mnist", lambda: (Tensor.randn(BS, 1, 28, 28),), train, 0.07, 93)

-  @unittest.skipIf(CI and Device.DEFAULT in {"CPU", "GPU"}, "slow")
+  @unittest.skipIf(CI and Device.DEFAULT in {"CPU", "CL"}, "slow")
  def test_train_cifar(self):
    with Tensor.train():
      model = SpeedyResNet(Tensor.ones((12,3,2,2)))
@ -144,6 +144,7 @@ class TestRealWorld(unittest.TestCase):
                                final_div_factor=1./(initial_div_factor*final_lr_ratio), total_steps=4)
      assert not np.isnan(lr_scheduler.min_lr), "lr too small or initial_div_facotr too big for half"

+  @unittest.skipIf(CI and Device.DEFAULT == "CPU", "slow")
  def test_bert(self):
    with Tensor.train():
      args_tiny = {"attention_probs_dropout_prob": 0.0, "hidden_dropout_prob": 0.0, "vocab_size": 30522, "type_vocab_size": 2,
@ -167,9 +168,5 @@ class TestRealWorld(unittest.TestCase):
      helper_test("train_bert", lambda: (data["input_ids"], data["segment_ids"], data["input_mask"], data["masked_lm_positions"], \
          data["masked_lm_ids"], data["masked_lm_weights"], data["next_sentence_labels"]), train, 0.25, 347)

-  def test_bert_fuse_arange(self):
-    with Context(FUSE_ARANGE=1):
-      self.test_bert()
-
 if __name__ == '__main__':
  unittest.main()
--- a/test/models/test_rnnt.py
+++ b/test/models/test_rnnt.py
@ -1,8 +1,8 @@
 #!/usr/bin/env python
 import unittest
-import numpy as np
-from tinygrad.tensor import Tensor
+from tinygrad import Tensor
 from extra.models.rnnt import LSTM
+import numpy as np
 import torch

 class TestRNNT(unittest.TestCase):
--- a/test/models/test_train.py
+++ b/test/models/test_train.py
@ -1,9 +1,8 @@
-import unittest
-import time
+import unittest, time
 import numpy as np
+from tinygrad import Device
 from tinygrad.nn.state import get_parameters
 from tinygrad.nn import optim
-from tinygrad.tensor import Device
 from tinygrad.helpers import getenv, CI
 from extra.training import train
 from extra.models.convnext import ConvNeXt
@ -27,7 +26,7 @@ def train_one_step(model,X,Y):
  print("done in %.2f ms" % (et*1000.))

 def check_gc():
-  if Device.DEFAULT == "GPU":
+  if Device.DEFAULT == "CL":
    from extra.introspection import print_objects
    assert print_objects() == 0

@ -40,7 +39,6 @@ class TestTrain(unittest.TestCase):
    check_gc()

  @unittest.skipIf(CI, "slow")
-  @unittest.skipIf(Device.DEFAULT in ["METAL", "WEBGPU"], "too many buffers for webgpu and metal")
  def test_efficientnet(self):
    model = EfficientNet(0)
    X = np.zeros((BS,3,224,224), dtype=np.float32)
@ -49,7 +47,6 @@ class TestTrain(unittest.TestCase):
    check_gc()

  @unittest.skipIf(CI, "slow")
-  @unittest.skipIf(Device.DEFAULT in ["METAL", "WEBGPU"], "too many buffers for webgpu and metal")
  def test_vit(self):
    model = ViT()
    X = np.zeros((BS,3,224,224), dtype=np.float32)
@ -57,7 +54,7 @@ class TestTrain(unittest.TestCase):
    train_one_step(model,X,Y)
    check_gc()

-  @unittest.skipIf(Device.DEFAULT in ["METAL", "WEBGPU"], "too many buffers for webgpu and metal")
+  @unittest.skipIf(CI, "slow")
  def test_transformer(self):
    # this should be small GPT-2, but the param count is wrong
    # (real ff_dim is 768*4)
--- a/test/opt/test_kernel_opts.py
+++ b/test/opt/test_kernel_opts.py
@ -1,6 +1,6 @@
 import unittest
 from tinygrad import Device, Tensor, dtypes
-from tinygrad.helpers import CI
+from tinygrad.helpers import CI, RANGEIFY
 from tinygrad.codegen.opt import Opt, OptOps, KernelOptError

 # TODO: write a clean version of this
@ -93,7 +93,7 @@ class TestKernelOpts(unittest.TestCase):
    a = Tensor.rand(8, N, 8, N)
    r = a.sum(axis=(1,3))
    helper_linearizer_opt(r, [
-      # openCL / GPU=1 is 256 max threads
+      # openCL / CL=1 is 256 max threads
      [Opt(OptOps.GROUPTOP, 0, 2)], [Opt(OptOps.GROUPTOP, 0, 32)],
      [Opt(OptOps.GROUPTOP, 1, 2)], [Opt(OptOps.GROUPTOP, 1, 32)], # Checking how it works with 1 grouped_reduce.
      [Opt(OptOps.GROUPTOP, 0, 2), Opt(OptOps.GROUPTOP, 1, 2)],
@ -327,13 +327,14 @@ class TestKernelOpts(unittest.TestCase):
  @unittest.skipUnless(Device[Device.DEFAULT].renderer.supports_float4, "test requires float4")
  def test_arange_opts(self):
    a = Tensor.arange(128)
+    # NOTE: arange no longer has reduce ops available for opt
    helper_linearizer_opt(a, [
-      [Opt(OptOps.GROUP, 0, 32)],
-      [Opt(OptOps.GROUPTOP, 0, 32)],
+      #[Opt(OptOps.GROUP, 0, 32)],
+      #[Opt(OptOps.GROUPTOP, 0, 32)],
      [Opt(op=OptOps.LOCAL, axis=0, arg=8)],
      [Opt(op=OptOps.LOCAL, axis=0, arg=8), Opt(op=OptOps.UPCAST, axis=0, arg=0)],
-      [Opt(op=OptOps.LOCAL, axis=0, arg=8), Opt(op=OptOps.UPCAST, axis=0, arg=0), Opt(op=OptOps.GROUP, axis=0, arg=8)],
-      [Opt(op=OptOps.LOCAL, axis=0, arg=8), Opt(op=OptOps.UPCAST, axis=0, arg=0), Opt(op=OptOps.GROUP, axis=0, arg=8), Opt(op=OptOps.UNROLL, axis=1, arg=4)], # noqa: E501
+      #[Opt(op=OptOps.LOCAL, axis=0, arg=8), Opt(op=OptOps.UPCAST, axis=0, arg=0), Opt(op=OptOps.GROUP, axis=0, arg=8)],
+      #[Opt(op=OptOps.LOCAL, axis=0, arg=8), Opt(op=OptOps.UPCAST, axis=0, arg=0), Opt(op=OptOps.GROUP, axis=0, arg=8), Opt(op=OptOps.UNROLL, axis=1, arg=4)], # noqa: E501
    ])

  @unittest.skipUnless(Device[Device.DEFAULT].renderer.has_threads, "test requires threads")
@ -350,5 +351,18 @@ class TestKernelOpts(unittest.TestCase):
    ] + [[Opt(OptOps.THREAD, 0, 4)] if Device[Device.DEFAULT].renderer.global_max[0] >= 4 else []]
      + [[Opt(OptOps.THREAD, 0, 8)] if Device[Device.DEFAULT].renderer.global_max[0] >= 8 else []])

+  @unittest.skipUnless(RANGEIFY>=1, "Kernel only fuses with rangeify")
+  def test_double_sum_group(self):
+    a = Tensor.rand(4, 4, 4)
+    r = a.sum((1, 2)).sum()
+    with self.assertRaises(KernelOptError):
+      helper_linearizer_opt(r, [[Opt(OptOps.GROUPTOP, 0, 16)],])
+    r = a.sum((1, 2)).sum()
+    with self.assertRaises(KernelOptError):
+      helper_linearizer_opt(r, [[Opt(OptOps.UNROLL, 1, 4), Opt(OptOps.GROUPTOP, 0, 16)],])
+    r = a.sum((1, 2)).sum()
+    with self.assertRaises(KernelOptError):
+      helper_linearizer_opt(r, [[Opt(OptOps.GROUPTOP, 1, 4), Opt(OptOps.GROUPTOP, 0, 16)],])
+
 if __name__ == '__main__':
  unittest.main()
--- a/test/speed/external_test_copy_speed.py
+++ b/test/speed/external_test_copy_speed.py
@ -77,9 +77,9 @@ class TestCopySpeed(unittest.TestCase):
      np.testing.assert_equal(t.numpy(), x.numpy())

  @unittest.skipIf(CI, "CI doesn't have 6 GPUs")
-  @unittest.skipIf(Device.DEFAULT != "GPU", "only test this on GPU")
+  @unittest.skipIf(Device.DEFAULT != "CL", "only test this on CL")
  def testCopyCPUto6GPUs(self):
-    from tinygrad.runtime.ops_gpu import CLDevice
+    from tinygrad.runtime.ops_cl import CLDevice
    if len(CLDevice.device_ids) != 6: raise unittest.SkipTest("computer doesn't have 6 GPUs")
    t = Tensor.ones(N, N, device="CPU").contiguous().realize()
    print(f"buffer: {t.nbytes()*1e-9:.2f} GB")
@ -87,8 +87,8 @@ class TestCopySpeed(unittest.TestCase):
      with Timing("sync:  ", on_exit=lambda ns: f" @ {t.nbytes()/ns:.2f} GB/s ({t.nbytes()*6/ns:.2f} GB/s total)"):
        with Timing("queue: "):
          for g in range(6):
-            t.to(f"gpu:{g}").realize()
-        Device["gpu"].synchronize()
+            t.to(f"CL:{g}").realize()
+        Device["CL"].synchronize()

 if __name__ == '__main__':
  unittest.main()
--- a/test/test_arange.py
+++ b/test/test_arange.py
@ -1,51 +1,29 @@
 import unittest
 import numpy as np
 from tinygrad import Tensor, GlobalCounters, dtypes, nn, Device, Variable
-from tinygrad.helpers import CI, Context, getenv
+from tinygrad.helpers import CI, Context, getenv, RANGEIFY
 from tinygrad.engine.realize import run_schedule
-from tinygrad.codegen.opt import Opt, OptOps
 from tinygrad.engine.realize import CompiledRunner, ExecItem, get_program
 from tinygrad.uop.ops import Ops

 class TestArange(unittest.TestCase):
-  def _get_flops(self, N, opts=None):
+  def _get_flops(self, N):
    GlobalCounters.reset()
    tt = Tensor.arange(N)
    sched = tt.schedule()
    self.assertEqual(len(sched), 1)
-    p = get_program(sched[-1].ast, opts=opts)
-    print(p.name)
-    #print(p.src)
+    p = get_program(sched[-1].ast)
    ExecItem(CompiledRunner(p), [tt.uop.buffer]).run()
    np.testing.assert_equal(tt.numpy(), np.arange(N))
    return p.estimates.ops

-  def test_complexity(self, opts=None, limit=None):
-    f1 = self._get_flops(256, opts)
-    f2 = self._get_flops(2560, opts)
-    print(f"{f1=}, {f2=}")
-    # add 1 to avoid divide by 0. arange is 0 flops now!
-    assert (f1 < 6000 and f2 < 6000) or ((f2+1) / (f1+1) < 16), f"bad complexity, flops {(f2+1) / (f1+1):.1f}X while inputs 10X"
-    if limit is not None and not getenv("PTX"):
-      # PTX counts index ALU in flops
-      assert f1 <= limit, f"{f1=}, {limit=}"
+  def test_complexity(self):
+    self.assertEqual(self._get_flops(256), 0)
+    self.assertEqual(self._get_flops(2560), 0)

-  def test_complexity_w_upcast(self): return self.test_complexity([Opt(OptOps.UPCAST, 0, 4)], limit=0)
-  def test_complexity_w_unroll2(self): return self.test_complexity([Opt(OptOps.UNROLL, 0, 2)], limit=0)
-  def test_complexity_w_unroll4(self): return self.test_complexity([Opt(OptOps.UNROLL, 0, 4)], limit=0)
-  def test_complexity_w_unroll8(self): return self.test_complexity([Opt(OptOps.UNROLL, 0, 8)], limit=0)
-  def test_complexity_w_upcast_and_unroll(self): return self.test_complexity([Opt(OptOps.UPCAST, 0, 4), Opt(OptOps.UNROLL, 0, 4)], limit=0)
-
-  if Device.default.renderer.has_local:
-    # TODO: fix limit
-    def test_complexity_w_group(self): return self.test_complexity([Opt(OptOps.GROUP, 0, 16)], limit=81920)
-    def test_complexity_w_group_top(self): return self.test_complexity([Opt(OptOps.GROUPTOP, 0, 16)], limit=106496)
-
-    def test_complexity_w_local(self): return self.test_complexity([Opt(OptOps.LOCAL, 0, 16)], limit=0)
-    @unittest.skip("doesn't work yet. TODO: this absolutely should work")
-    def test_complexity_w_local_unroll4(self): return self.test_complexity([Opt(OptOps.LOCAL, 0, 16), Opt(OptOps.UNROLL, 0, 4)], limit=0)
-    @unittest.skip("doesn't work yet")
-    def test_complexity_w_local_and_padto(self): return self.test_complexity([Opt(OptOps.LOCAL, 0, 16), Opt(OptOps.PADTO, axis=1, arg=32)])
+  def test_arange_cat(self):
+    t = Tensor.arange(2, dtype=dtypes.int)+Tensor([3])
+    self.assertEqual(t.cat(t).tolist(), [3, 4, 3, 4])

 class TestRand(unittest.TestCase):
  def test_fused_rand_less_ops(self, noopt=1):
@ -133,7 +111,7 @@ class TestIndexing(unittest.TestCase):
      X = dataset[idxs]
      assert X.shape == (4,DDIM)
      sched = X.schedule()
-      self.assertEqual(len(sched), 2)
+      self.assertEqual(len(sched), 1 if RANGEIFY else 2)
      run_schedule(sched)
      assert GlobalCounters.global_ops < 4*DSET, f"too many ops {GlobalCounters.global_ops} != {4*DSET}"
    np.testing.assert_allclose(real_index, X.numpy())
--- a/test/test_const_folding.py
+++ b/test/test_const_folding.py
@ -3,6 +3,7 @@ from tinygrad import Tensor, Device, dtypes
 from tinygrad.dtype import DType, ConstType
 from tinygrad.uop.ops import Ops, UOp
 from tinygrad.codegen import full_rewrite_to_sink
+from tinygrad.helpers import RANGEIFY
 from tinygrad.device import is_dtype_supported
 import numpy as np
 from test.helpers import not_support_multi_device
@ -155,7 +156,7 @@ class TestMovedConstFolding(unittest.TestCase):

  def test_add_padded_zero(self):
    # TODO: it's 1 now, this might be possible to fold
-    _check_ast_count(1, Tensor([1.0, 2, 3, 4]) + Tensor.zeros(2).pad(((1, 1),)))
+    _check_ast_count(0 if RANGEIFY else 1, Tensor([1.0, 2, 3, 4]) + Tensor.zeros(2).pad(((1, 1),)))

  def test_mul_shrunk_one(self):
    _check_ast_count(0, Tensor([1.0, 2, 3, 4]) * Tensor.ones(6).shrink(((1, 5),)))
--- a/test/test_dtype.py
+++ b/test/test_dtype.py
@ -4,12 +4,13 @@ import torch
 from typing import Any, List
 from tinygrad.device import is_dtype_supported
 from tinygrad.helpers import getenv, DEBUG, CI
-from tinygrad.dtype import DType, DTYPES_DICT, least_upper_dtype, fp8_to_float, float_to_fp8, _to_np_dtype, _to_torch_dtype
+from tinygrad.dtype import DType, DTYPES_DICT, least_upper_dtype, fp8_to_float, float_to_fp8, _to_np_dtype, _to_torch_dtype, truncate
+from tinygrad.renderer.ptx import PTXRenderer
+from tinygrad.renderer.nir import NIRRenderer
 from tinygrad import Device, Tensor, dtypes
 from hypothesis import assume, given, settings, strategies as strat
 from test.helpers import rand_for_dtype
 from test.unit.test_dtype_spec import _assert_eq, core_dtypes, dtype_ints, dtype_floats, FP8E4M3_MAX, FP8E5M2_MAX
-import ml_dtypes
 import pytest
 pytestmark = pytest.mark.filterwarnings("ignore")

@ -25,6 +26,7 @@ def get_available_cast_dtypes(dtype: DType) -> List[DType]:

 def _to_torch_storage_type(dtype:DType):
  if dtype == dtypes.bfloat16: return torch.float32
+  if dtype in dtypes.fp8s: return torch.float32
  return _to_torch_dtype(dtype)

 def _test_to_np(a:Tensor, np_dtype, target):
@ -47,12 +49,15 @@ def _test_cast(a:Tensor, target_dtype:DType):
    # TODO: struct.pack cannot pack value > 65504 (max of half) into e format
    a = (a > 65504).where(65504, a)

-  _test_op(lambda: a.cast(target_dtype), target_dtype, list(a.numpy().astype(_to_np_dtype(target_dtype))))
+  expected = list(a.numpy().astype(_to_np_dtype(target_dtype)))
+  if target_dtype in dtypes.fp8s: expected = list(map(lambda x: truncate[target_dtype](x), expected))
+  _test_op(lambda: a.cast(target_dtype), target_dtype, expected)
 def _test_bitcast(a:Tensor, target_dtype:DType, target=None):
-  if getenv("PTX") and a.dtype == dtypes.int8 and target_dtype.itemsize != a.dtype.itemsize:
+  if isinstance(Device[Device.DEFAULT].renderer, PTXRenderer) and a.dtype == dtypes.int8 and target_dtype.itemsize != a.dtype.itemsize:
    raise unittest.SkipTest("shape changing bitcast of int8 broken on PTX")
-  expected = torch.tensor(a.tolist(), dtype=_to_torch_storage_type(a.dtype)).view(_to_torch_dtype(target_dtype))
-  _test_op(lambda: a.bitcast(target_dtype), target_dtype, target or expected.tolist())
+  expected = torch.tensor(a.tolist(), dtype=_to_torch_storage_type(a.dtype)).view(_to_torch_dtype(target_dtype)).tolist()
+  if target_dtype in dtypes.fp8s: expected = list(map(lambda x: fp8_to_float(x, target_dtype), expected))
+  _test_op(lambda: a.bitcast(target_dtype), target_dtype, target or expected)

 class TestDType(unittest.TestCase):
  DTYPE: Any = None
@ -100,8 +105,7 @@ class TestDType(unittest.TestCase):
    ))

  @unittest.skipIf(Device.DEFAULT == "PYTHON", "skip for now")
-  @unittest.skipIf(getenv("PTX"), "skip for now")
-  @unittest.skipIf(getenv("NIR"), "skip for now")
+  @unittest.skipIf(isinstance(r:=Device[Device.DEFAULT].renderer, PTXRenderer) or isinstance(r, NIRRenderer), "skip for now")
  def test_uint_overflow(self):
    if not dtypes.is_unsigned(self.DTYPE): raise unittest.SkipTest("only for unsigned")
    v = dtypes.max(self.DTYPE)
@ -129,11 +133,10 @@ class TestDType(unittest.TestCase):
      np.testing.assert_allclose(tin, tor, atol=1e-6, rtol=1e-3)

  def test_finfo(self):
-    if self.DTYPE not in [dtypes.float16, dtypes.bfloat16, dtypes.float32, dtypes.float64]: return
-    info = ml_dtypes.finfo(ml_dtypes.bfloat16 if self.DTYPE is dtypes.bfloat16 else _to_np_dtype(self.DTYPE))
-    assert info.bits == self.DTYPE.itemsize*8
-    assert info.nexp == dtypes.finfo(self.DTYPE)[0]
-    assert info.nmant == dtypes.finfo(self.DTYPE)[1]
+    if self.DTYPE not in [dtypes.float16, dtypes.float32, dtypes.float64]: return
+    info = np.finfo(_to_np_dtype(self.DTYPE))
+    self.assertEqual(info.bits, self.DTYPE.itemsize*8)
+    self.assertEqual((info.nexp, info.nmant), dtypes.finfo(self.DTYPE))

 def _test_ops(a_dtype:DType, b_dtype:DType, target_dtype=None):
  target_dtype = target_dtype or least_upper_dtype(a_dtype, b_dtype)
@ -151,7 +154,8 @@ class TestFp8s(unittest.TestCase):

 class TestFp8sConversions(unittest.TestCase):
  @given(strat.floats(width=32, allow_subnormal=True, allow_nan=False, allow_infinity=False, min_value=-FP8E4M3_MAX, max_value=FP8E4M3_MAX))
-  def test_float_to_fp8e4m3(self, x): np.testing.assert_equal(float_to_fp8(x, dtypes.fp8e4m3), ml_dtypes.float8_e4m3fn(x).tobytes()[0])
+  def test_float_to_fp8e4m3(self, x):
+    np.testing.assert_equal(float_to_fp8(x, dtypes.fp8e4m3), torch.tensor(x, dtype=torch.float8_e4m3fn).view(torch.uint8).item())

  def test_float_to_fp8e4m3_extreme_values(self):
    np.testing.assert_equal(float_to_fp8(FP8E4M3_MAX, dtypes.fp8e4m3), 126)
@ -164,7 +168,8 @@ class TestFp8sConversions(unittest.TestCase):
    np.testing.assert_equal(float_to_fp8(-math.nan, dtypes.fp8e4m3), 255)

  @given(strat.floats(width=32, allow_subnormal=True, allow_nan=False, allow_infinity=False, min_value=-FP8E5M2_MAX, max_value=FP8E5M2_MAX))
-  def test_float_to_fp8e5m2(self, x): np.testing.assert_equal(float_to_fp8(x, dtypes.fp8e5m2), ml_dtypes.float8_e5m2(x).tobytes()[0])
+  def test_float_to_fp8e5m2(self, x):
+    np.testing.assert_equal(float_to_fp8(x, dtypes.fp8e5m2), torch.tensor(x, dtype=torch.float8_e5m2).view(torch.uint8).item())

  def test_float_to_fp8e5m2_extreme_values(self):
    np.testing.assert_equal(float_to_fp8(FP8E5M2_MAX, dtypes.fp8e5m2), 123)
@ -177,10 +182,12 @@ class TestFp8sConversions(unittest.TestCase):
    np.testing.assert_equal(float_to_fp8(-math.nan, dtypes.fp8e5m2), 254)

  @given(strat.integers(min_value=0, max_value=255))
-  def test_fp8e4m3_to_float(self, x): np.testing.assert_equal(fp8_to_float(x, dtypes.fp8e4m3), np.uint8(x).view(ml_dtypes.float8_e4m3fn).item())
+  def test_fp8e4m3_to_float(self, x):
+    np.testing.assert_equal(fp8_to_float(x, dtypes.fp8e4m3), torch.tensor(x, dtype=torch.uint8).view(torch.float8_e4m3fn).float().item())

  @given(strat.integers(min_value=0, max_value=255))
-  def test_fp8e5m2_to_float(self, x): np.testing.assert_equal(fp8_to_float(x, dtypes.fp8e5m2), np.uint8(x).view(ml_dtypes.float8_e5m2).item())
+  def test_fp8e5m2_to_float(self, x):
+    np.testing.assert_equal(fp8_to_float(x, dtypes.fp8e5m2), torch.tensor(x, dtype=torch.uint8).view(torch.float8_e5m2).float().item())

@unittest.skipUnless(is_dtype_supported(dtypes.bfloat16), "bfloat16 not supported")
 class TestBFloat16(unittest.TestCase):
@ -256,8 +263,9 @@ class TestFloatDType(TestDType):

 class TestDoubleDType(TestDType):
  DTYPE = dtypes.double
-  @unittest.skipIf((CI and Device.DEFAULT in {"CUDA", "NV"}) or getenv("PTX") or getenv("NIR"),
-                   "conversion not supported on CI, CUDA, PTX and NIR")  # TODO: why not?
+  @unittest.skipIf((CI and Device.DEFAULT in {"CUDA", "NV"}) or \
+                    isinstance(Device[Device.DEFAULT].renderer, PTXRenderer) or \
+                    isinstance(Device[Device.DEFAULT].renderer, PTXRenderer), "conversion not supported on CI CUDA, PTX, and NIR")  # TODO: why not?
  def test_float64_increased_precision(self):
    for func in [
      lambda t: t.exp(),
@ -281,21 +289,21 @@ class TestDoubleDType(TestDType):

 class TestInt8DType(TestDType):
  DTYPE = dtypes.int8
-  @unittest.skipIf(getenv("CUDA",0)==1 or getenv("PTX", 0)==1, "cuda saturation works differently")
+  @unittest.skipIf(getenv("CUDA",0)==1 or isinstance(Device[Device.DEFAULT].renderer, PTXRenderer), "cuda saturation works differently")
  def test_int8_to_uint8_negative(self):
    _test_op(lambda: Tensor([-1, -2, -3, -4], dtype=dtypes.int8).cast(dtypes.uint8), dtypes.uint8, [255, 254, 253, 252])

  def test_int8_to_uint16_negative(self):
    _test_op(lambda: Tensor([-1, -2, -3, -4], dtype=dtypes.int8).cast(dtypes.uint16), dtypes.uint16, [2**16-1, 2**16-2, 2**16-3, 2**16-4])

-  @unittest.skipIf(getenv("PTX"), "broken in ptx")
+  @unittest.skipIf(isinstance(Device[Device.DEFAULT].renderer, PTXRenderer), "broken in ptx")
  def test_bitcast_alt(self):
    a = Tensor([72, -90, 27, 40, -53, 70, 96, 51], dtype=dtypes.int8).bitcast(dtypes.short)
    self.assertListEqual(a.tolist(), [-22968, 10267, 18123, 13152])

 class TestUint8DType(TestDType):
  DTYPE = dtypes.uint8
-  @unittest.skipIf(getenv("CUDA",0)==1 or getenv("PTX", 0)==1, "cuda saturation works differently")
+  @unittest.skipIf(getenv("CUDA",0)==1 or isinstance(Device[Device.DEFAULT].renderer, PTXRenderer), "cuda saturation works differently")
  def test_uint8_to_int8_overflow(self):
    _test_op(lambda: Tensor([255, 254, 253, 252], dtype=dtypes.uint8).cast(dtypes.int8), dtypes.int8, [-1, -2, -3, -4])

@ -303,9 +311,11 @@ class TestBitCast(unittest.TestCase):
  @given(strat.sampled_from(dtype_ints + dtype_floats), strat.sampled_from(dtype_ints + dtype_floats))
  def test_shape_change_bitcast(self, dt1, dt2):
    # NOTE: this has to be assume to prevent hypothesis from skipping all samples
-    assume(not (getenv("PTX") and dt1 == dtypes.int8)) # TODO: bitcasting int8 fails in PTX
+    assume(not (isinstance(Device[Device.DEFAULT].renderer, PTXRenderer) and dt1 == dtypes.int8)) # TODO: bitcasting int8 fails in PTX
    data = rand_for_dtype(dt1, 32).reshape(2, 2, 8)
    expected = torch.tensor(data.tolist(), dtype=_to_torch_storage_type(dt1)).view(_to_torch_dtype(dt2))
+    if dt2 in dtypes.fp8s:
+      expected = torch.tensor(list(map(lambda x: fp8_to_float(x, dt2), expected.view(-1).tolist()))).view_as(expected)
    _test_op(lambda: Tensor(data, dtype=dt1).bitcast(dt2), dt2, expected.tolist())

  def test_shape_change_bitcast_exceptions(self):
@ -348,6 +358,9 @@ class TestBoolDType(TestDType): DTYPE = dtypes.bool

 class TestBFloat16Type(TestDType): DTYPE = dtypes.bfloat16

+class TestFp8e4m3(TestDType): DTYPE = dtypes.fp8e4m3
+class TestFp8e5m2(TestDType): DTYPE = dtypes.fp8e5m2
+
 class TestPtrDType(unittest.TestCase):
  def test_vec_double(self):
    dt1 = dtypes.float.vec(4).ptr().vec(4)
@ -424,7 +437,7 @@ class TestDtypeUsage(unittest.TestCase):
 class TestOpsBFloat16(unittest.TestCase):
  def test_cast(self):
    # TODO: helper_test_op breaks in unrelated part
-    # TODO: wrong output with GPU=1 on mac
+    # TODO: wrong output with CL=1 on mac
    data = [60000.0, 70000.0, 80000.0]
    np.testing.assert_allclose(Tensor(data).cast("bfloat16").numpy(), torch.tensor(data).type(torch.bfloat16).float().numpy())

--- a/test/test_dtype_alu.py
+++ b/test/test_dtype_alu.py
@ -1,13 +1,14 @@
 import unittest, operator, math
 from tinygrad import Tensor, dtypes, Device
-from tinygrad.dtype import DType
+from tinygrad.dtype import DType, truncate
 from tinygrad.helpers import CI, getenv
 from tinygrad.tensor import _to_np_dtype
 from tinygrad.device import is_dtype_supported
 from tinygrad.runtime.ops_python import from_storage_scalar
+from tinygrad.renderer.ptx import PTXRenderer
 import numpy as np
 import pytest
-from hypothesis import given, strategies as strat, settings, HealthCheck
+from hypothesis import assume, given, strategies as strat, settings, HealthCheck

 pytestmark = pytest.mark.filterwarnings("ignore")

@ -47,6 +48,8 @@ class ht:
  int64 = strat.integers(-9223372036854775808, 9223372036854775807)
  bool = strat.booleans()
 ht.bfloat16 = ht.uint16
+ht.fp8e4m3 = ht.uint8
+ht.fp8e5m2 = ht.uint8

 def universal_test(a, b, dtype, op):
  if not isinstance(op, tuple): op = (op, op)
@ -56,8 +59,9 @@ def universal_test(a, b, dtype, op):
  ta, tb = Tensor([a], dtype=dtype), Tensor([b], dtype=dtype)
  tensor_value = (op[0](ta, tb)).numpy()
  numpy_value = op[1](ta.numpy(), tb.numpy())
+  if dtype in dtypes.fp8s: numpy_value = truncate[dtype](numpy_value)
  if dtype in dtypes.floats:
-    atol, rtol = {dtypes.bfloat16:(1e-3, 1e-2)}.get(dtype, (1e-10, 1e-7))
+    atol, rtol = {dtypes.bfloat16:(1e-3, 1e-2), dtypes.fp8e4m3:(1e-1, 1e-1), dtypes.fp8e5m2:(1.0, 5e-1)}.get(dtype, (1e-10, 1e-7))
    np.testing.assert_allclose(tensor_value, numpy_value, atol=atol, rtol=rtol)
  else: np.testing.assert_equal(tensor_value, numpy_value)

@ -70,8 +74,10 @@ def universal_test_unary(a, dtype, op):
  out: Tensor = op[0](ta)
  tensor_value = out.numpy()
  numpy_value = op[1](ta.numpy())
+  if dtype in dtypes.fp8s: numpy_value = truncate[dtype](numpy_value)
  if dtype in dtypes.floats:
-    atol, rtol = {dtypes.float16:(1e-3, 1e-2), dtypes.bfloat16:(1e-3, 2e-2)}.get(dtype, (1e-6, 1e-5))
+    atol, rtol = { dtypes.float16:(1e-3, 1e-2), dtypes.bfloat16:(1e-3, 2e-2),
+      dtypes.fp8e4m3:(1e-1, 1e-1), dtypes.fp8e5m2: (1.0, 5e-1)}.get(dtype, (1e-6, 1e-5))
    np.testing.assert_allclose(tensor_value, numpy_value, atol=atol, rtol=rtol)
  else: np.testing.assert_equal(tensor_value, numpy_value)

@ -91,7 +97,7 @@ def universal_test_midcast(a, b, c, op1, op2, d1:DType, d2:DType):
  an, bn, cn = np.array([a]).astype(_to_np_dtype(d1)), np.array([b]).astype(_to_np_dtype(d1)), np.array([c]).astype(_to_np_dtype(d2))
  tensor_value = op2[0](op1[0](at, bt).cast(d2), ct).numpy()
  numpy_value = op2[1](op1[1](an, bn).astype(_to_np_dtype(d2)), cn)
-  np.testing.assert_allclose(tensor_value, numpy_value, rtol=1e-6 if getenv("PTX") else 1e-7)
+  np.testing.assert_allclose(tensor_value, numpy_value, rtol=1e-6 if isinstance(Device[Device.DEFAULT].renderer, PTXRenderer) else 1e-7)

 class TestDTypeALU(unittest.TestCase):
  @unittest.skipUnless(is_dtype_supported(dtypes.float64), f"no float64 on {Device.DEFAULT}")
@ -110,6 +116,16 @@ class TestDTypeALU(unittest.TestCase):
  def test_bfloat16(self, a, b, op):
    universal_test(from_storage_scalar(a, dtypes.bfloat16), from_storage_scalar(a, dtypes.bfloat16), dtypes.bfloat16, op)

+  @unittest.skipUnless(is_dtype_supported(dtypes.fp8e4m3), f"no fp8e4m3 on {Device.DEFAULT}")
+  @given(ht.fp8e4m3, ht.fp8e4m3, strat.sampled_from(binary_operations))
+  def test_fp8e4m3(self, a, b, op):
+    universal_test(from_storage_scalar(a, dtypes.fp8e4m3), from_storage_scalar(b, dtypes.fp8e4m3), dtypes.fp8e4m3, op)
+
+  @unittest.skipUnless(is_dtype_supported(dtypes.fp8e5m2), f"no fp8e5m2 on {Device.DEFAULT}")
+  @given(ht.fp8e5m2, ht.fp8e5m2, strat.sampled_from(binary_operations))
+  def test_fp8e5m2(self, a, b, op):
+    universal_test(from_storage_scalar(a, dtypes.fp8e5m2), from_storage_scalar(b, dtypes.fp8e5m2), dtypes.fp8e5m2, op)
+
  @given(ht.float32, strat.sampled_from(unary_operations))
  def test_float32_unary(self, a, op): universal_test_unary(a, dtypes.float32, op)

@ -121,6 +137,18 @@ class TestDTypeALU(unittest.TestCase):
  @given(ht.bfloat16, strat.sampled_from(unary_operations))
  def test_bfloat16_unary(self, a, op): universal_test_unary(from_storage_scalar(a, dtypes.bfloat16), dtypes.bfloat16, op)

+  @unittest.skipUnless(is_dtype_supported(dtypes.fp8e4m3), f"no fp8e4m3 on {Device.DEFAULT}")
+  @given(ht.fp8e4m3, strat.sampled_from(unary_operations))
+  def test_fp8e4m3_unary(self, a, op):
+    if op[1] == np.reciprocal: assume(from_storage_scalar(a, dtype=dtypes.fp8e4m3) != 0.0)
+    universal_test_unary(from_storage_scalar(a, dtype=dtypes.fp8e4m3), dtypes.fp8e4m3, op)
+
+  @unittest.skipUnless(is_dtype_supported(dtypes.fp8e5m2), f"no fp8e5m2 on {Device.DEFAULT}")
+  @given(ht.fp8e5m2, strat.sampled_from(unary_operations))
+  def test_fp8e5m2_unary(self, a, op):
+    if op[1] == np.reciprocal: assume(from_storage_scalar(a, dtype=dtypes.fp8e5m2) != 0.0)
+    universal_test_unary(from_storage_scalar(a, dtype=dtypes.fp8e5m2), dtypes.fp8e5m2, op)
+
  @given(ht.uint8, ht.uint8, strat.sampled_from(integer_binary_operations))
  def test_uint8(self, a, b, op): universal_test(a, b, dtypes.uint8, op)

--- a/test/test_image_dtype.py
+++ b/test/test_image_dtype.py
@ -7,7 +7,7 @@ from tinygrad.engine.realize import lower_schedule
 from tinygrad.helpers import prod, unwrap
 from test.helpers import REAL_DEV

-IMAGE_SUPPORTED_DEVICES = ("QCOM", "GPU")
+IMAGE_SUPPORTED_DEVICES = ("QCOM", "CL")

@unittest.skipUnless(REAL_DEV in IMAGE_SUPPORTED_DEVICES, "Images not supported")
 class TestImageCopy(unittest.TestCase):
--- a/test/test_jit.py
+++ b/test/test_jit.py
@ -609,21 +609,22 @@ class TestJitFree(unittest.TestCase):
    ext_tensor = Tensor([1,24,23,45,1])
    @TinyJit
    def fxn(x:Tensor):
-      out = (x*2+ext_tensor).reshape(5,1).expand(5, 100).contiguous()
-      return out.sum()
+      t1 = (x * 2).contiguous().realize()
+      t2 = (t1 + ext_tensor).contiguous().realize()
+      out = (t2.sum()).contiguous().realize()
+      return out
    for i in range(5):
-      out = fxn(Tensor([i,1,2,3,4]))
-      self.assertEqual(out.item(), 11400+200*i)
+      out = fxn(inp:=Tensor([i,1,2,3,4]))
+      self.assertEqual(out.item(), 114+2*i)
    pre_free = GlobalCounters.mem_used
    fxn.captured.free_intermediates()
    savings_after_free = pre_free - GlobalCounters.mem_used

-    # Different allocator implementations have different savings.
-    expected_savings = 8196 if hasattr(Device[Device.DEFAULT].allocator, '_offset') else 2024
+    expected_savings = (len(inp) * inp.dtype.itemsize * 2) + dtypes.float32.itemsize # (t1 and t2) + out

    self.assertEqual(savings_after_free, expected_savings)
    out = fxn(Tensor([11,1,2,3,4]))
-    self.assertEqual(out.item(), 13600)
+    self.assertEqual(out.item(), 136)

    # Try one more time...
    pre_free = GlobalCounters.mem_used
@ -633,7 +634,7 @@ class TestJitFree(unittest.TestCase):

    self.assertEqual(savings_after_free, expected_savings)
    out = fxn(Tensor([11,1,2,3,4]))
-    self.assertEqual(out.item(), 13600)
+    self.assertEqual(out.item(), 136)

  def test_updated_not_freed(self):
    x = Tensor([1]).realize()
--- a/test/test_kernel_cache.py
+++ b/test/test_kernel_cache.py
@ -16,14 +16,14 @@ class TestKernelCache(unittest.TestCase):

    a1 = Tensor.rand(4,4).realize()
    b1 = Tensor.rand(4,4).realize()
-    orig_compile_func = Device['CPU'].compiler
-    Device['CPU'].compiler = None # making it not callable
+    orig_compile_func = Device['CPU'].compiler.compile_cached
+    Device['CPU'].compiler.compile_cached = None # making it not callable

    try:
      x1 = a1 + b1 + unique_const
      x1.realize() # Same kernel should be from cache.
    finally:
-      Device['CPU'].compiler = orig_compile_func
+      Device['CPU'].compiler.compile_cached = orig_compile_func

 if __name__ == "__main__":
  unittest.main()
--- a/test/test_linearizer.py
+++ b/test/test_linearizer.py
@ -10,9 +10,10 @@ from tinygrad.shape.shapetracker import ShapeTracker
 from tinygrad.shape.view import View
 from tinygrad.tensor import Tensor, _to_np_dtype
 from tinygrad.engine.realize import run_schedule, lower_schedule, CompiledRunner, get_program
-from tinygrad.helpers import Context, getenv, flatten, dedup, TC_SELECT, TC_OPT
+from tinygrad.helpers import Context, flatten, dedup, TC_SELECT, TC_OPT
 from tinygrad.dtype import DType, dtypes, PtrDType, AddrSpace
 from tinygrad.codegen import apply_rewrites, rewrites_for_views
+from tinygrad.renderer.ptx import PTXRenderer

 class TestLinearizer(unittest.TestCase):
  def test_arg_dedup(self):
@ -155,7 +156,7 @@ class TestLinearizer(unittest.TestCase):
  @unittest.skipUnless(Device[Device.DEFAULT].renderer.has_local, "test requires locals")
  @unittest.skipUnless(Device[Device.DEFAULT].renderer.has_shared, "test requires shared")
  @unittest.skipUnless(Device[Device.DEFAULT].renderer.supports_float4, "test requires float4")
-  @unittest.skipIf(getenv("PTX") or getenv("NIR"), "broken on ptx/nir because of indexing patterns")
+  @unittest.skipIf(isinstance(Device[Device.DEFAULT].renderer, PTXRenderer), "broken on ptx for some reason")
  def test_upcast_with_locals(self):
    x, y = Tensor.rand(1,128), Tensor.rand(128, 128)
    r = (x@y).relu()
@ -366,7 +367,7 @@ class TestLinearizer(unittest.TestCase):
    helper(Tensor.arange(255), max_ops=2)

  @unittest.skipUnless(Device[Device.DEFAULT].renderer.supports_float4, "test requires float4")
-  @unittest.skipIf(getenv("PTX") or getenv("NIR"), "broken on ptx/nir because of indexing patterns")
+  @unittest.skipIf(isinstance(Device[Device.DEFAULT].renderer, PTXRenderer), "broken on ptx for some reason")
  def test_grouped_store_phis(self):
    """
    float4 acc0 = float4(0.0,0.0,0.0,0.0);
@ -420,7 +421,7 @@ class TestLinearizer(unittest.TestCase):
  @unittest.skipUnless(Device[Device.DEFAULT].renderer.has_local, "test requires locals")
  @unittest.skipUnless(Device[Device.DEFAULT].renderer.has_shared, "test requires shared")
  @unittest.skipUnless(Device[Device.DEFAULT].renderer.supports_float4, "test requires float4")
-  @unittest.skipIf(getenv("PTX") or getenv("NIR"), "broken on ptx/nir because of indexing patterns")
+  @unittest.skipIf(isinstance(Device[Device.DEFAULT].renderer, PTXRenderer), "broken on ptx for some reason")
  def test_grouped_store_local_only(self):
    x, y = Tensor.rand(1,128), Tensor.rand(128, 128)
    r = (x@y).relu()
@ -481,7 +482,7 @@ def helper_realized_ast(r:Tensor|list[Tensor]) -> tuple[UOp, list[Buffer]]:
  assert s[-1].ast.op is Ops.SINK, f"helper_realized_ast expects a SINK {s[-1]}"
  # now all input buffers in s[-1] should be realized
  # create fresh buffers for the outputs
-  bufs = [Buffer((x).device, x.size, x.dtype).allocate() if i < len(s[-1].ast.src) else x for i,x in enumerate(s[-1].bufs)]
+  bufs = [Buffer(x.device, x.size, x.dtype).allocate() if i < len(s[-1].ast.src) else x for i,x in enumerate(s[-1].bufs)]
  return push_views(s[-1].ast), bufs

 def helper_linearizer_ast(ast:UOp, inputs:list[Tensor], *args, **kwargs):
@ -503,7 +504,7 @@ def reset_bufs(bufs:list[Buffer]):

 def _helper_linearizer_opt_ast(realized_ast:UOp, real_bufs:list[Buffer], opts=[],
                               apply_tc=False, atol=1e-4, rtol=1e-4, color_sizes=[], wanna_output=[]):
-  outbufs = [real_bufs[x.src[0].base.arg] for x in realized_ast.src]
+  outbufs = real_bufs[:len(realized_ast.src)]
  device = real_bufs[0].device
  wanna_output = [np.array(x).flatten() for x in wanna_output]

--- a/test/test_linearizer_dumb.py
+++ b/test/test_linearizer_dumb.py
@ -6,13 +6,12 @@ import unittest
 from tinygrad import Device, dtypes
 from tinygrad.device import is_dtype_supported
 from tinygrad.uop.ops import UOp, Ops, AxisType, KernelInfo
-from tinygrad.helpers import getenv
 from tinygrad.shape.shapetracker import ShapeTracker, View
 from tinygrad.codegen.opt.search import Opt, OptOps
 from tinygrad.engine.realize import get_program
+from tinygrad.renderer.ptx import PTXRenderer

 class TestLinearizerFailure(unittest.TestCase):
-  @unittest.expectedFailure
  @unittest.skipUnless(Device.DEFAULT == "METAL", "only tested on METAL")
  def test_failure_beam_mnist(self):
    c0 = UOp(Ops.DEFINE_GLOBAL, dtypes.uchar.ptr(4014080), arg=0, src=())
@ -47,7 +46,8 @@ class TestLinearizerDumb(unittest.TestCase):
    c10 = UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1000, 1000), strides=(0, 0), offset=0, mask=None, contiguous=False),)), src=())
    c11 = c1.store((c4.alu(Ops.CMPNE, c7).alu(Ops.CMPNE, UOp.const(dtypes.bool, True, src=c8)).cast(dtypes.int)*(c9.f(Ops.VALID, dtype=dtypes.bool).where(UOp.const(dtypes.int, -1, src=c10), UOp.const(dtypes.int, 0, src=c10)).f(Ops.REDUCE_AXIS, arg=(Ops.ADD, (1,)))+UOp.const(dtypes.int, 1000, src=c8))))
    ast = c11.sink()
-    opts = [Opt(op=OptOps.UNROLL, axis=0, arg=4), Opt(op=OptOps.LOCAL, axis=0, arg=8)]
+    #opts = [Opt(op=OptOps.UNROLL, axis=0, arg=4), Opt(op=OptOps.LOCAL, axis=0, arg=8)]
+    opts = [Opt(op=OptOps.LOCAL, axis=0, arg=8)]
    prg = get_program(ast, Device[Device.DEFAULT].renderer, opts)
    print(prg.src)
    assert prg.uops is not None and not any(uop.op is Ops.MAX for uop in prg.uops), "leftover MAX"
@ -93,7 +93,7 @@ class TestLinearizerDumb(unittest.TestCase):

  @unittest.expectedFailure
  @unittest.skipUnless(Device[Device.DEFAULT].renderer.supports_float4, "need float4")
-  @unittest.skipIf(getenv("PTX"), "this is somehow correct in PTX")
+  @unittest.skipIf(isinstance(Device[Device.DEFAULT].renderer, PTXRenderer), "this is somehow correct in PTX")
  def test_upcasted_stores_out_of_order(self):
    c0 = UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(9360), arg=0, src=())
    c1 = c0.view(ShapeTracker(views=(View(shape=(4, 5, 13, 1, 1, 1, 1, 1, 4, 3, 3), strides=(2340, 468, 36, 0, 0, 0, 0, 0, 9, 3, 1), offset=0, mask=None, contiguous=True),)))
--- a/test/test_method_cache.py
+++ b/test/test_method_cache.py
@ -5,9 +5,9 @@ from tinygrad.nn.state import get_state_dict

 class TestMethodCache(unittest.TestCase):
  def setUp(self):
-    self.backup_compiler = Device[Device.DEFAULT].compiler
+    self.backup_compiler = Device[Device.DEFAULT].compiler.compile_cached
  def tearDown(self):
-    Device[Device.DEFAULT].compiler = self.backup_compiler
+    Device[Device.DEFAULT].compiler.compile_cached = self.backup_compiler

  def test_simple_methodcache(self):
    a = Tensor([1])
@ -15,19 +15,19 @@ class TestMethodCache(unittest.TestCase):
    c = Tensor([3])
    d = Tensor([4])
    (a+b).realize()
-    Device[Device.DEFAULT].compiler = None
+    Device[Device.DEFAULT].compiler.compile_cached = None
    (c+d).realize()

  def test_nested_methodcache(self):
    a,b,c,d = Tensor([1]), Tensor([2]), Tensor([3]), Tensor([4])
    ((a+b)+(a+b)).realize()
-    Device[Device.DEFAULT].compiler = None
+    Device[Device.DEFAULT].compiler.compile_cached = None
    ((c+d)+(c+d)).realize()

  def test_nested_methodcache_swap(self):
    a,b,c,d = Tensor([1]), Tensor([2]), Tensor([3]), Tensor([4])
    ((a+b)+(c+d)).realize()
-    Device[Device.DEFAULT].compiler = None
+    Device[Device.DEFAULT].compiler.compile_cached = None
    ((c+d)+(a+b)).realize()

  @unittest.skip("incorrect use of transformer")
@ -38,7 +38,7 @@ class TestMethodCache(unittest.TestCase):
    # NOTE: you have to do this twice due to the k-v cache
    for i in range(3): model(Tensor([[1,2,3,4]]), Variable("start_pos", 0, 10).bind(i)).realize()
    for i in range(3): model(Tensor([[1,2,3,4]]), Variable("start_pos", 0, 10).bind(i)).realize()
-    Device[Device.DEFAULT].compiler = None
+    Device[Device.DEFAULT].compiler.compile_cached = None
    for i in range(3): model(Tensor([[1,2,3,4]]), Variable("start_pos", 0, 10).bind(i)).realize()

 if __name__ == '__main__':
--- a/test/test_multitensor.py
+++ b/test/test_multitensor.py
@ -2,7 +2,7 @@ import unittest, functools, random
 from tinygrad import Tensor, Device, nn, GlobalCounters, TinyJit, dtypes, Variable
 from tinygrad.device import is_dtype_supported
 from tinygrad.uop.ops import Ops, UOp
-from tinygrad.helpers import CI, getenv, prod, Context
+from tinygrad.helpers import CI, getenv, prod, Context, RANGEIFY
 from tinygrad.nn.state import get_parameters, get_state_dict
 from tinygrad.engine.realize import lower_schedule, BufferCopy, CompiledRunner, run_schedule
 import numpy as np
@ -178,16 +178,14 @@ class TestMultiTensor(unittest.TestCase):
      run_schedule(sched)
      np.testing.assert_equal(xt.numpy(), X_np[i*2:i*2+2])

-  @given(strat.sampled_from((4, 5)), strat.sampled_from((devices_2, devices_3)),
+  @given(strat.sampled_from((devices_2, devices_3)),
         strat.sampled_from((Ops.ADD, Ops.MUL, Ops.MAX)),
-         strat.sampled_from((None, 0, 1)), strat.sampled_from((None, 0, 1)), strat.sampled_from((1, 0, -1)))
-  def test_simple_reduce(self, N, devices, rop, shard_axis, reduce_axis, sign):
-    N = N * len(devices)
-    X = Tensor.rand(N*N).reshape(N, N).mul(sign)
+         strat.sampled_from((None, 0, 1)), strat.sampled_from((None, 0, 1)))
+  def test_simple_reduce(self, devices, rop, shard_axis, reduce_axis):
+    N = 4 * len(devices)
+    X = (Tensor.rand(N*N)-1).reshape(N, N).shard_(devices, shard_axis)
    n = X.numpy()
-    X.shard_(devices, shard_axis)
-    f = {Ops.ADD: lambda x: x.sum(reduce_axis), Ops.MUL: lambda x: x.prod(reduce_axis),
-         Ops.MAX: lambda x: x.max(reduce_axis)}[rop]
+    f = {Ops.ADD: lambda x: x.sum(reduce_axis), Ops.MUL: lambda x: x.prod(reduce_axis), Ops.MAX: lambda x: x.max(reduce_axis)}[rop]
    fX = f(X)
    fn = f(n)
    np.testing.assert_allclose(fX.numpy(), fn, rtol=1e-6, atol=1e-6)
@ -374,6 +372,7 @@ class TestMultiTensor(unittest.TestCase):

  # NOTE: this is failing on LLVM CI, no idea why. Works locally.
  @unittest.skipIf(CI and REAL_DEV in ("CUDA", "NV", "CPU", "AMD"), "slow, and flaky on CPU")
+  @unittest.skipIf(RANGEIFY, "TODO: pm_rangeify hangs")
  def test_data_parallel_resnet(self):
    from extra.models.resnet import ResNet18

@ -410,6 +409,7 @@ class TestMultiTensor(unittest.TestCase):
    np.testing.assert_allclose(grad, shard_grad, atol=1e-5, rtol=1e-5)

  @unittest.skipIf(CI and REAL_DEV in ("CUDA", "NV", "CPU", "AMD"), "slow, and flaky on CPU")
+  @unittest.skipIf(RANGEIFY, "TODO: pm_rangeify hangs")
  def test_data_parallel_resnet_train_step(self):
    from extra.models.resnet import ResNet18
    fake_image = Tensor.rand((2, 3, 224//16, 224//16))
@ -417,6 +417,7 @@ class TestMultiTensor(unittest.TestCase):
    m = ResNet18()
    self._test_model_train_step(m, fake_image, labels)

+  @unittest.skipIf(RANGEIFY, "TODO: pm_rangeify hangs")
  def test_data_parallel_simple_train_step(self):
    class Model:
      def __init__(self): self.conv1 = nn.Linear(128,128)
--- a/test/test_nn.py
+++ b/test/test_nn.py
@ -229,7 +229,8 @@ class TestNN(unittest.TestCase):
    torch_z = torch_layer(torch_x)
    torch_z.sum().backward()

-    np.testing.assert_allclose(z.numpy(), torch_z.detach().numpy(), atol=5e-6, rtol=5e-6)
+    # TODO: why is torch numbers all 0?
+    np.testing.assert_allclose(z.numpy(), torch_z.detach().numpy(), atol=5e-4, rtol=5e-6)

  def test_layernorm(self):
    N, C, H, W = 20, 5, 10, 10
@ -332,7 +333,7 @@ class TestNN(unittest.TestCase):

      np.testing.assert_allclose(z.numpy(), torch_z.detach().numpy(), atol=5e-6, rtol=5e-6)
      np.testing.assert_allclose(x.grad.numpy(), torch_x.grad.detach().numpy(), atol=1e-3, rtol=1e-3)
-      np.testing.assert_allclose(layer.weight.grad.numpy(), torch_layer.weight.grad.detach().numpy(), atol=2e-3, rtol=1e-3)
+      np.testing.assert_allclose(layer.weight.grad.numpy(), torch_layer.weight.grad.detach().numpy(), atol=3e-3, rtol=1e-3)
      np.testing.assert_allclose(layer.bias.grad.numpy(), torch_layer.bias.grad.detach().numpy(), atol=1e-3, rtol=1e-3)

  def test_rmsnorm(self):
--- a/test/test_ops.py
+++ b/test/test_ops.py
@ -2,7 +2,7 @@ import time, math, unittest, functools, platform, warnings
 import numpy as np
 from typing import List, Callable
 import torch
-from tinygrad.helpers import getenv, IMAGE, DEBUG, CI, Context, TRANSCENDENTAL, CPU_LLVM, AMD_LLVM
+from tinygrad.helpers import getenv, IMAGE, DEBUG, CI, Context, TRANSCENDENTAL, CPU_LLVM, AMD_LLVM, RANGEIFY
 from tinygrad import Tensor, Device, dtypes
 from tinygrad.tensor import _to_np_dtype
 from tinygrad.device import is_dtype_supported
@ -234,7 +234,8 @@ class TestOps(unittest.TestCase):
  def test_unfold(self):
    helper_test_op([(8,)], lambda x: x.unfold(0, 2, 1))
    helper_test_op([(8,)], lambda x: x.unfold(0, 2, 2))
-    helper_test_op([(8,)], lambda x: x.unfold(0, 7, 3))
+    # TODO: something is wrong with unfold
+    if not getenv("TINY_BACKEND"): helper_test_op([(8,)], lambda x: x.unfold(0, 7, 3))
    helper_test_op([(3,3,3)], lambda x: x.unfold(2, 2, 8))
    helper_test_op([(3,3,3)], lambda x: x.unfold(1, 0, 8))
    helper_test_op([(3,3,3,3,3)], lambda x: x.unfold(-1, 2, 2))
@ -311,6 +312,11 @@ class TestOps(unittest.TestCase):
    helper_test_op([], lambda: torch.nn.functional.pad(torch.ones(256,256), pad=(0,64,0,0)).sum(axis=1),
                       lambda: Tensor.ones(256,256).pad(((0,0), (0,64))).sum(axis=1), forward_only=True)

+  def test_sum_twice(self):
+    helper_test_op([(4, 4, 4)], lambda x: x.sum((0, 1)).sum())
+    helper_test_op([(4, 4, 4)], lambda x: x.sum((0, 2)).sum())
+    helper_test_op([(4, 4, 4)], lambda x: x.sum((1, 2)).sum())
+
  # this is more complex and won't fold for a while
  def test_sum_cat_collapse(self):
    helper_test_op([], lambda: torch.cat([torch.ones(256,256), torch.zeros(256,64)], dim=1).sum(axis=1),
@ -1304,7 +1310,7 @@ class TestOps(unittest.TestCase):
                                                                         np.arange(64,128,dtype=np.float32).reshape(8,8)])
  def test_small_gemm_eye(self):
    helper_test_op(None, lambda x,y: x.matmul(y), lambda x,y: x@y, vals=[np.eye(8).astype(np.float32), np.eye(8).astype(np.float32)])
-  @unittest.skipIf(CI and Device.DEFAULT in ["NV", "GPU", "CUDA"] or (Device.DEFAULT == "CPU" and CPU_LLVM) or IMAGE
+  @unittest.skipIf(CI and Device.DEFAULT in ["NV", "CL", "CUDA"] or (Device.DEFAULT == "CPU" and CPU_LLVM) or IMAGE
  or (Device.DEFAULT == "WEBGPU" and platform.system() == "Windows"), "not supported on these in CI/IMAGE")
  def test_gemm_fp16(self):
    helper_test_op([(64,64), (64,64)], lambda x,y: x.half().matmul(y.half()), atol=5e-3, rtol=5e-3)
@ -1407,6 +1413,11 @@ class TestOps(unittest.TestCase):
    helper_test_op(None, lambda x: x.max(), forward_only=True, vals=[[False, True]])
    helper_test_op(None, lambda x: x.max(), forward_only=True, vals=[[True, False]])

+  def test_const_reduce(self):
+    helper_test_op([(3,3)], lambda x: torch.full_like(x, 2).sum(), lambda x: (x.full_like(2)).sum(), forward_only=True)
+    helper_test_op([(3,3)], lambda x: torch.full_like(x, 2).prod(), lambda x: (x.full_like(2)).prod(), forward_only=True)
+    helper_test_op([(3,3)], lambda x: torch.full_like(x, 2).max(), lambda x: (x.full_like(2)).max(), forward_only=True)
+
  @unittest.skipIf(Device.DEFAULT == "QCOM", "OpenCL fails to compile this (both on GPU(qcom)/QCOM backends)")
  def test_any(self):
    helper_test_op([(3,4,5,6)], lambda x: x.any(), forward_only=True)
@ -1458,6 +1469,7 @@ class TestOps(unittest.TestCase):
  def test_mean_zero_axis(self):
    helper_test_op([(1,0,3,0,5)], lambda x: x.mean(axis=(1,3)))

+  @slow_test
  def test_var(self):
    helper_test_op([(15, 25, 35)], lambda x: x.var())
    helper_test_op([(15, 25, 35)], lambda x: x.var(correction=0))
@ -1493,6 +1505,7 @@ class TestOps(unittest.TestCase):
    helper_test_op([(15, 25, 35)], lambda x: x.var(keepdim=True))
    helper_test_op([(15, 25, 35)], lambda x: x.var(0, keepdim=True, correction=0))

+  @slow_test
  def test_std(self):
    helper_test_op([(15, 25, 35)], lambda x: x.std())
    helper_test_op([(15, 25, 35)], lambda x: x.std(correction=0))
@ -1525,6 +1538,7 @@ class TestOps(unittest.TestCase):
  def test_std_keepdim(self):
    helper_test_op([(15, 25, 35)], lambda x: x.std(keepdim=True))
    helper_test_op([(15, 25, 35)], lambda x: x.std(0, keepdim=True, correction=0))
+  @slow_test
  def test_std_mean(self):
    helper_test_op([(15,25,35)], lambda x: torch.stack(torch.std_mean(x)),
                                 lambda x: Tensor.stack(*x.std_mean()))
@ -2040,12 +2054,14 @@ class TestOps(unittest.TestCase):
      lambda x,w,b: torch.nn.functional.conv2d(x,w,b),
      lambda x,w,b: Tensor.conv2d(x,w,b), grad_rtol=1e-5)

+  @slow_test
  @unittest.skipIf(IMAGE>0, "no conv3d on images")
  def test_simple_conv3d(self):
    helper_test_op([(1,4,9,9,9), (4,4,3,3,3)],
      lambda x,w: torch.nn.functional.conv3d(x,w),
      lambda x,w: Tensor.conv2d(x,w), grad_rtol=1e-5)

+  @slow_test
  @unittest.skipIf(IMAGE>0, "no conv3d on images")
  def test_padded_conv3d(self):
    helper_test_op([(1,4,5,5,5), (4,4,3,3,3)],
@ -2102,6 +2118,7 @@ class TestOps(unittest.TestCase):
      lambda x,w: torch.nn.functional.conv_transpose2d(x,w,groups=2),
      lambda x,w: Tensor.conv_transpose2d(x,w,groups=2), grad_rtol=1e-5)

+  @slow_test
  def test_padded_conv_transpose2d(self):
    for padding in [(1,2), (2,1), 2, 1, 0]:
      helper_test_op([(2,4,9,9), (4,4,3,3)],
@ -2110,6 +2127,7 @@ class TestOps(unittest.TestCase):
    self.helper_test_exception([(2,16,2,2), (32,16,3,3)], lambda x,w: torch.nn.functional.conv_transpose2d(x,w,padding=(1,1,1)),
                   lambda x,w: Tensor.conv_transpose2d(x,w,padding=(1,1,1)), expected=(RuntimeError, ValueError))

+  @slow_test
  def test_dilated_conv_transpose2d(self):
    for dilation in [(1,2), (2,1), 2, 1]:
      helper_test_op([(2,4,9,9), (4,4,3,3)],
@ -2122,6 +2140,7 @@ class TestOps(unittest.TestCase):
        lambda x,w: torch.nn.functional.conv_transpose2d(x,w, stride=stride),
        lambda x,w: Tensor.conv_transpose2d(x,w,stride=stride), atol=1e-5, grad_rtol=1e-5)

+  @slow_test
  def test_output_padded_conv_transpose2d(self):
    for output_padding, stride in [((1,1), (2,3)), ((2,1), (3,2))]:
      helper_test_op([(2,4,6,5), (4,4,3,3),(4,)],
@ -2183,8 +2202,10 @@ class TestOps(unittest.TestCase):
              lambda x,w: torch.nn.functional.conv2d(x,w,groups=groups),
              lambda x,w: Tensor.conv2d(x,w,groups=groups), grad_rtol=1e-5)
  def test_conv2d(self): self._test_conv2d(bs=1, cin=3)
+  @slow_test
  def test_conv2d_bs_4_cin_3(self): self._test_conv2d(bs=4, cin=3, cout=2)
  def test_conv2d_bs_1_cin_1(self): self._test_conv2d(bs=1, cin=1)
+  @slow_test
  def test_conv2d_bs_4_cin_1(self): self._test_conv2d(bs=4, cin=1)

  def test_conv2d_errors(self):
@ -2256,6 +2277,7 @@ class TestOps(unittest.TestCase):
      lambda x,w: torch.nn.functional.conv2d(x,w,groups=groups),
      lambda x,w: Tensor.conv2d(x,w,groups=groups), grad_rtol=1e-5)

+  @slow_test
  def test_strided_conv2d_simple(self):
    bs,H,W = 2,3,1
    helper_test_op([(bs,1,5,1), (1,1,H,W)],
@ -2266,6 +2288,7 @@ class TestOps(unittest.TestCase):
  def test_strided_conv2d_simple_vec(self):
    with Context(DEVECTORIZE=0): self.test_strided_conv2d_simple()

+  @slow_test
  def test_strided_conv2d(self):
    bs = 4
    cin = 3
@ -2501,6 +2524,7 @@ class TestOps(unittest.TestCase):
      ),
      forward_only=True)

+  @slow_test
  def test_avg_pool2d(self):
    shape = (32,2,111,28)
    for ksz in [(2,2), (3,3), (3,2), (5,5), (5,1)]:
@ -2928,13 +2952,13 @@ class TestOps(unittest.TestCase):
  @slow_test
  def test_scatter_reduce(self):
    b = torch.randint(3, size=[3,4,5], dtype=torch.int64, requires_grad=False)
-    a = Tensor(b.detach().cpu().numpy().astype(np.int32), dtype=dtypes.int32, requires_grad=False)
+    a = Tensor(b.detach().cpu().numpy().astype(np.int32), requires_grad=False)
    for reduce in ("sum", "prod", "mean", "amin", "amax"):
      for dim in (-1,1,-3):
-        helper_test_op([(4,5,6), (4,5,6)],
+        helper_test_op([(3,4,5), (3,4,5)],
          lambda x,src: x.scatter_reduce(dim=dim, index=b, src=src, reduce=reduce),
          lambda x,src: x.scatter_reduce(dim=dim, index=a, src=src, reduce=reduce), forward_only=True)
-        helper_test_op([(4,5,6), (4,5,6)],
+        helper_test_op([(3,4,5), (3,4,5)],
          lambda x,src: x.scatter_reduce(dim=dim, index=b, src=src, reduce=reduce, include_self=False),
          lambda x,src: x.scatter_reduce(dim=dim, index=a, src=src, reduce=reduce, include_self=False), forward_only=True)

@ -3015,6 +3039,8 @@ class TestOps(unittest.TestCase):
    helper_test_op([(32,10), (32,10)], lambda x,y: torch.nn.functional.binary_cross_entropy_with_logits(x,y.clip(0,1),
                                                                                                        pos_weight=torch.tensor(pos_weight)),
                                       lambda x,y: x.binary_crossentropy_logits(y.clip(0,1),pos_weight=Tensor(pos_weight)))
+
+  @unittest.skipIf(RANGEIFY > 1, "broken on RANGEIFY > 1, TODO: fix")
  def test_cross_entropy_class_probabilities(self):
    helper_test_op([(32,), (32,)], lambda x,y: torch.nn.functional.cross_entropy(x, y), lambda x,y: x.cross_entropy(y))
    helper_test_op([(32,10), (32,10)], lambda x,y: torch.nn.functional.cross_entropy(x, y), lambda x,y: x.cross_entropy(y))
--- a/test/test_opts.py
+++ b/test/test_opts.py
@ -13,7 +13,7 @@ class TestOpts(unittest.TestCase):
    out = (a+b).contiguous(arg=opts)
    s = out.schedule()
    self.assertEqual(s[-1].ast.arg.opts_to_apply, opts)
-    if Device.DEFAULT in {"CPU", "GPU", "METAL"} and not CPU_LLVM:
+    if Device.DEFAULT in {"CPU", "CL", "METAL"} and not CPU_LLVM:
      prg = get_program(s[-1].ast)
      self.assertIn('float4', prg.src)

--- a/test/test_profiler.py
+++ b/test/test_profiler.py
@ -17,7 +17,7 @@ def helper_collect_profile(*devs):
  cpu_events.clear()

  profile_list = []
-  with Context(PROFILE=1):
+  with Context(VIZ=1):
    yield profile_list
    for dev in devs: dev.synchronize()
    for dev in devs: dev._at_profile_finalize()
--- a/test/test_randomness.py
+++ b/test/test_randomness.py
@ -1,15 +1,17 @@
 import unittest, math
 from functools import partial

-import numpy as np
-import torch
-from tinygrad import nn, dtypes, Tensor, Device, TinyJit
-from tinygrad.helpers import getenv, CI
+from tinygrad import nn, dtypes, Tensor, Device, TinyJit, Variable
+from tinygrad.helpers import getenv, CI, OSX
 from tinygrad.device import is_dtype_supported
 from tinygrad.engine.realize import lower_schedule, CompiledRunner
-from hypothesis import given, settings, strategies as strat
+from tinygrad.renderer.ptx import PTXRenderer
 from test.helpers import not_support_multi_device

+import numpy as np
+import torch
+from hypothesis import given, settings, strategies as strat
+
 settings.register_profile("my_profile", max_examples=200, deadline=None, derandomize=getenv("DERANDOMIZE_CI", False))
 settings.load_profile("my_profile")

@ -98,7 +100,7 @@ class TestRandomness(unittest.TestCase):

    np.testing.assert_allclose(jr, r)

-  @unittest.skipIf(getenv("PTX"), "fails with PTX")
+  @unittest.skipIf(isinstance(Device[Device.DEFAULT].renderer, PTXRenderer), "fails with PTX")
  def test_threefry_doesnt_use_long(self):
    for (_,ei) in lower_schedule(Tensor.rand(20).schedule()):
      if isinstance(ei.prg, CompiledRunner):
@ -359,5 +361,20 @@ class TestRandomness(unittest.TestCase):
    assert equal_distribution(lambda *_: nn.BatchNorm2d(*params).weight, lambda _: torch.nn.BatchNorm2d(*params).weight.detach())
    assert equal_distribution(lambda *_: nn.BatchNorm2d(*params).bias, lambda _: torch.nn.BatchNorm2d(*params).bias.detach())

+# TODO: still fails with MAX_KERNEL_BUFFERS
+@unittest.skipIf(Device.DEFAULT == "WEBGPU" and not OSX, "WEBGPU Vulkan can only run kernels with up to 10 buffers")
+class TestSample(unittest.TestCase):
+  def test_sample(self):
+    X = Tensor.rand(10000, 50).realize()
+    BS = 16
+    idxs = np.random.randint(0, X.shape[0], size=(BS))
+    # this uncovered a bug with arg sort order
+    batch = [Variable(f'idx{i}', 0, X.shape[0]-1).bind(s) for i,s in enumerate(idxs.tolist())]
+    x = Tensor.cat(*[X.shrink(((batch[i], batch[i]+1), None)) for i in range(BS)])
+    print(idxs)
+    ret = x.numpy()
+    base = X.numpy()[idxs]
+    np.testing.assert_equal(ret, base)
+
 if __name__ == "__main__":
  unittest.main()
--- a/test/test_rangeify.py
+++ b/test/test_rangeify.py
@ -1,8 +1,21 @@
 import unittest
-from tinygrad import Tensor
+from tinygrad import Tensor, nn
 from tinygrad.helpers import RANGEIFY, Context, GlobalCounters
 from tinygrad.uop.ops import UOp

+@unittest.skipIf(RANGEIFY<1, "tests only for RANGEIFY")
+class TestRangeifyAssign(unittest.TestCase):
+  def test_assign_permuted(self):
+    A = Tensor.empty(4, 4, dtype='int')
+    B = Tensor.arange(16).reshape(4,4)
+    ret = A.permute(1,0).assign(B)
+    lst = ret.tolist()
+    lst2 = A.tolist()
+    lst3 = B.tolist()
+    print(lst)
+    print(lst2)
+    print(lst3)
+
 N = 256

@unittest.skipIf(RANGEIFY<1, "tests only for RANGEIFY")
@ -93,6 +106,16 @@ class TestRangeify(unittest.TestCase):
    w2 = Tensor.empty(12, 8, 3, 3)
    x.conv2d(w1).conv2d(w2).realize()

+  def test_conv_maxpool_contig(self): self.test_conv_maxpool(True)
+  def test_conv_maxpool(self, contig=False):
+    GlobalCounters.reset()
+    x = Tensor.empty(32, 16, 64, 64)
+    l1 = nn.Conv2d(16, 16, 3)
+    for p in nn.state.get_parameters(l1): p.replace(Tensor.empty(p.shape))
+    x = l1(x)
+    if contig: x = x.contiguous()
+    x.max_pool2d().realize()
+
  def test_double_conv2d_half_contig(self):
    x = Tensor.empty(1, 4, 32, 32)
    w1 = Tensor.empty(8, 4, 3, 3)
@ -144,6 +167,7 @@ class TestRangeify(unittest.TestCase):

 # contiguous + reduce can support ranges?

+@unittest.skip("okay to disable this for now")
@unittest.skipIf(RANGEIFY<1, "tests only for RANGEIFY")
 class TestOuterworld(unittest.TestCase):
  def test_passthrough_range(self):
--- a/test/test_sample.py
+++ b/test/test_sample.py
@ -1,22 +0,0 @@
-import unittest
-import numpy as np
-from tinygrad import Tensor, Variable, Device
-from tinygrad.helpers import OSX
-
-# TODO: still fails with MAX_KERNEL_BUFFERS
-@unittest.skipIf(Device.DEFAULT == "WEBGPU" and not OSX, "WEBGPU Vulkan can only run kernels with up to 10 buffers")
-class TestSample(unittest.TestCase):
-  def test_sample(self):
-    X = Tensor.rand(10000, 50).realize()
-    BS = 16
-    idxs = np.random.randint(0, X.shape[0], size=(BS))
-    # this uncovered a bug with arg sort order
-    batch = [Variable(f'idx{i}', 0, X.shape[0]-1).bind(s) for i,s in enumerate(idxs.tolist())]
-    x = Tensor.cat(*[X.shrink(((batch[i], batch[i]+1), None)) for i in range(BS)])
-    print(idxs)
-    ret = x.numpy()
-    base = X.numpy()[idxs]
-    np.testing.assert_equal(ret, base)
-
-if __name__ == '__main__':
-  unittest.main()
--- a/test/test_schedule.py
+++ b/test/test_schedule.py
@ -12,9 +12,9 @@ from tinygrad import nn, dtypes, Device, Tensor
 from tinygrad.device import is_dtype_supported
 from tinygrad.dtype import DType, ImageDType
 from tinygrad.shape.shapetracker import ShapeTracker
-from tinygrad.uop.ops import PatternMatcher, UOp, Ops, GroupOp, UPat, graph_rewrite, track_rewrites
+from tinygrad.uop.ops import UOp, Ops, GroupOp, UPat, graph_rewrite, track_rewrites
 from tinygrad.uop.symbolic import symbolic_simple
-from tinygrad.helpers import CI, DEBUG, FUSE_ARANGE, SPLIT_REDUCEOP, GlobalCounters, Context, getenv, all_same, temp
+from tinygrad.helpers import CI, DEBUG, SPLIT_REDUCEOP, GlobalCounters, Context, getenv, all_same, temp, RANGEIFY
 from tinygrad.schedule.kernelize import merge_views, get_kernelize_map, Kernel
 from tinygrad.engine.schedule import create_schedule_with_vars
 from tinygrad.engine.realize import CompiledRunner, run_schedule, lower_schedule
@ -33,6 +33,7 @@ def check_schedule(t:Tensor|list[Tensor]|UOp, allowed:int, to_prerealize:list[Te
  # test lowering all the ScheduleItems to ExecItems
  kernel_cnt = len([si for si,ei in lower_schedule(sched.copy()) if isinstance(ei.prg, CompiledRunner) or not filter_sink])
  if kernel_cnt != allowed:
+    if RANGEIFY: return sched # allow different kernel count, TODO: fix the asserts
    print(f"SCHEDULE ISSUE, expecting {allowed} got {len(sched)}")
    if DEBUG >= 3:
      for i,s in enumerate(sched):
@ -41,6 +42,8 @@ def check_schedule(t:Tensor|list[Tensor]|UOp, allowed:int, to_prerealize:list[Te
    raise KernelCountException(f"{kernel_cnt} != {allowed}")
  return sched

+def expect_rangeify_fails(fxn): return (unittest.expectedFailure if RANGEIFY else (lambda f:f))(fxn)
+
 def _realize_weights(m):
  for p in nn.state.get_parameters(m): p.realize()

@ -111,6 +114,7 @@ class TestSchedule(unittest.TestCase):
    self.assertListEqual(a.tolist(), [[15]])

  @unittest.skipIf(Device.DEFAULT == "CPU", "devices must mismatch")
+  @expect_rangeify_fails
  def test_error_on_device_mismatch(self):
    a = Tensor.empty(10)
    b = Tensor.empty(10, device="CPU")
@ -118,11 +122,12 @@ class TestSchedule(unittest.TestCase):
    with self.assertRaisesRegex(RuntimeError, "all buffers must be on the same device"): check_schedule(c, 1)

  @unittest.skipIf(Device.DEFAULT == "CPU", "devices must mismatch")
+  @expect_rangeify_fails
  def test_error_on_device_mismatch_alt(self):
    a = Tensor.empty(10)
    b = Tensor.empty((1,), device="CPU").expand(10).contiguous()
    c = a+b
-    with self.assertRaisesRegex(RuntimeError, "all buffers must be on the same device"): check_schedule(c, 1)
+    with self.assertRaisesRegex(RuntimeError, "all buffers must be on the same device"): check_schedule(c, 2 if RANGEIFY else 1)

  @unittest.skipUnless(is_dtype_supported(dtypes.half) and getenv("CAST_AFTER_EXPAND"), "need half and CAST_AFTER_EXPAND=1")
  @unittest.skip("CAST_AFTER_EXPAND is not supported")
@ -140,6 +145,7 @@ class TestSchedule(unittest.TestCase):
    np.testing.assert_equal(xt.numpy(), X.numpy()[1][0])

  @unittest.skipIf(CI and Device.DEFAULT == "NV", "crashes on NV CI")
+  @unittest.skipIf(RANGEIFY, "rangeify doesn't implement input buffer limiting")
  def test_add_chain_buffers(self):
    N = 31
    with Context(TRACK_MATCH_STATS=0, DEBUG=0):
@ -198,9 +204,10 @@ class TestSchedule(unittest.TestCase):

  def test_simplify_padded_const(self):
    a = Tensor.empty(1022).cummax(axis=0)
-    sched = check_schedule(a, 5)
-    ast = sched[0].ast
-    self.assertLessEqual(len([u for u in ast.toposort() if u.op is Ops.WHERE]), 6)
+    check_schedule(a, 5)
+    # TODO: what is this testing?
+    #ast = sched[0].ast
+    #self.assertLessEqual(len([u for u in ast.toposort() if u.op is Ops.WHERE]), 6)

  def test_basic_binop_fusion(self):
    a = Tensor.empty(10)
@ -278,7 +285,7 @@ class TestSchedule(unittest.TestCase):
    a = Tensor.empty(10,10,10)
    b = Tensor.empty(10,10,1)
    c = a.sum(axis=0, keepdim=True).permute(2,1,0) + b
-    with self.assertRaises(KernelCountException): check_schedule(c, 1)
+    check_schedule(c, 2)

  def test_allow_push_permutes(self):
    a = Tensor.randn(10,10,10).realize()
@ -316,7 +323,7 @@ class TestSchedule(unittest.TestCase):
    b = Tensor.empty(10)
    c = a+b
    d = a.reshape(10,1)+b.reshape(10,1)
-    with self.assertRaises(KernelCountException): check_schedule(d, 0, [c])
+    check_schedule(d, 1, [c])

  # failing in new lazy
  def test_cache_binaryop_transpose(self):
@ -324,7 +331,7 @@ class TestSchedule(unittest.TestCase):
    b = Tensor.empty(10,10)
    c = (a.T*b.T).T #.contiguous()
    d = a*b
-    with self.assertRaises(KernelCountException): check_schedule(d, 0, [c])
+    check_schedule(d, 1, [c])

  def test_cache_two_reduceops(self):
    a = Tensor.empty(10)
@ -339,7 +346,7 @@ class TestSchedule(unittest.TestCase):
    r1 = (x - r0).sum(axis=0).div(2)
    out = r0 + r1
    schedule = check_schedule(out, 2)
-    reduceops = [x for si in schedule for x in si.ast.toposort() if x.op is Ops.REDUCE_AXIS]
+    reduceops = [x for si in schedule for x in si.ast.toposort() if x.op in {Ops.REDUCE_AXIS, Ops.REDUCE}]
    assert len(reduceops) == 2

  def test_cache_reduce_multiple_children(self):
@ -349,9 +356,9 @@ class TestSchedule(unittest.TestCase):
    r1 = (x - r0).sum(axis=0).div(2)
    out0 = r0 + y
    out1 = r1 + y
-    schedule = check_schedule([out0, out1], 4)
-    reduceops = [x for si in schedule for x in si.ast.toposort() if x.op is Ops.REDUCE_AXIS]
-    assert len(reduceops) == 2
+    schedule = check_schedule([out0, out1], 2 if RANGEIFY else 4)
+    reduceops = [x for si in schedule for x in si.ast.toposort() if x.op in {Ops.REDUCE_AXIS, Ops.REDUCE}]
+    assert len(reduceops) == (3 if RANGEIFY else 2)

  def test_div_collapse_buffer(self):
    a = Tensor.full((4,), 4.0).contiguous().realize()
@ -394,6 +401,7 @@ class TestSchedule(unittest.TestCase):
    # a and b share the same underlying device memory
    self.assertIs(a.uop.realized, b.uop.realized)

+  @expect_rangeify_fails
  def test_clone_doesnt_dedup(self):
    src = Tensor.ones(4).contiguous().realize()
    a = src.clone()
@ -417,6 +425,11 @@ class TestSchedule(unittest.TestCase):
    b = Tensor.full((4, 4), 1.).contiguous().realize()
    check_schedule([a+b, a+b], 1)

+  def test_const_realize(self):
+    t = Tensor.ones(2)
+    check_schedule(t[0], 0)
+    check_schedule(t[1], 0)
+
  def test_fold_double_unary(self):
    y = Tensor.empty(2)
    out = y.sum(keepdim=True).sqrt().neg()
@ -558,7 +571,7 @@ class TestSchedule(unittest.TestCase):
    c = a+b
    d = a.reshape(10,1)+b.reshape(10,1)
    out = c.sum() + d.sum()
-    with self.assertRaises(KernelCountException): check_schedule(out, 1)
+    check_schedule(out, 2)

  def test_children_dont_push(self):
    a = Tensor.empty(10, 10, 1)
@ -569,6 +582,7 @@ class TestSchedule(unittest.TestCase):
    check_schedule(f, 2)

  # failing in new lazy
+  @unittest.skip("always fusing elementwise")
  def test_dont_fuse_binops_with_children(self):
    a = Tensor.empty(10)
    b = Tensor.empty(10)
@ -576,8 +590,8 @@ class TestSchedule(unittest.TestCase):
    keep_me = a+b
    e = keep_me.sum() # noqa: F841 give keep_me a child (NOTE: BinaryOps won't be a child since it will instant fuse)
    d = keep_me+c
-    with self.assertRaises(KernelCountException): check_schedule(d, 2)
-    with self.assertRaises(KernelCountException): check_schedule(keep_me, 0, [d])
+    check_schedule(d, 2)
+    check_schedule(keep_me, 0, [d])

  #@unittest.skip("failing in old lazy")
  def test_permute_breaks_fusion(self):
@ -627,7 +641,8 @@ class TestSchedule(unittest.TestCase):
    x = x.image_conv2d(w3, b3)

    # NOOP, 3 convs, contiguous
-    with self.assertRaises(KernelCountException): check_schedule(x, 5)
+    #check_schedule(x, 5)
+    check_schedule(x, 8)

  def test_image_conv_fusion_minimal(self):
    b1 = Tensor.empty(16)
@ -682,6 +697,7 @@ class TestSchedule(unittest.TestCase):
    c = (a.sum(2).contiguous() + b).contiguous()
    check_schedule(c, 2)

+  @expect_rangeify_fails
  def test_kernelize(self):
    a = Tensor.empty(10)
    b = Tensor.empty(10)
@ -689,12 +705,14 @@ class TestSchedule(unittest.TestCase):
    d = c+2
    check_schedule(d, 2)

+  @expect_rangeify_fails
  def test_kernelize_view(self):
    a = Tensor.empty(4,1)
    b = a*2
    c = b.kernelize()+Tensor.empty(4,4)
    check_schedule(c, 2)

+  @expect_rangeify_fails
  def test_kernelize_diamond(self):
    a = Tensor([0]).realize()
    prev_a = (a+1).contiguous()
@ -703,6 +721,7 @@ class TestSchedule(unittest.TestCase):
    assert prev_a.uop in a.uop.src, "contiguous usage must run before assign"
    self.assertEqual((prev_a+a*3).item(), 1+2*3)

+  @expect_rangeify_fails
  def test_multioutput_ast(self):
    a = Tensor.zeros(1, dtype=dtypes.int).contiguous().realize().uop
    b = Tensor.zeros(1, dtype=dtypes.int).contiguous().realize().uop
@ -714,6 +733,7 @@ class TestSchedule(unittest.TestCase):
    self.assertEqual(b.buffer.numpy(), [12])

  # unlike schedule, kernelize can be called multiple times on a Tensor
+  @expect_rangeify_fails
  def test_double_kerenlize(self):
    a = Tensor.empty(10)
    b = Tensor.empty(10)
@ -722,6 +742,7 @@ class TestSchedule(unittest.TestCase):
    e = c.kernelize()+d.kernelize()
    check_schedule(e, 3)

+  @expect_rangeify_fails
  def test_kernelize_bw(self):
    a = Tensor.full((3,), 2.0, requires_grad=True).contiguous()
    b = Tensor.full((3,), 3.0, requires_grad=True).contiguous()
@ -732,6 +753,7 @@ class TestSchedule(unittest.TestCase):
    self.assertEqual(z.item(), 18.0)
    self.assertEqual(z.grad.item(), 1.0)

+  @expect_rangeify_fails
  def test_kernelize_bw_view(self):
    a = Tensor.full((3,1), 2.0, requires_grad=True).contiguous()
    b = Tensor.full((3,1), 3.0, requires_grad=True).contiguous()
@ -784,6 +806,13 @@ class TestSchedule(unittest.TestCase):
    out = x + 1
    check_schedule(out, 0, filter_sink=False)

+  def test_zero_size_assign(self):
+    f = Tensor.full((2,), 0.).contiguous().realize()
+    a = f.shrink_to((0,))
+    a.assign(Tensor.ones_like(a))
+    check_schedule(a, 0)
+    self.assertEqual(a.tolist(), [])
+
  def test_reduce_permute_nofuse(self):
    x = Tensor.empty(32, 32, 32)
    y = Tensor.empty(32, 32)
@ -888,26 +917,24 @@ class TestSchedule(unittest.TestCase):
    out = x.contiguous() + y.contiguous()
    check_schedule(out, 2, filter_sink=False)

-  @unittest.expectedFailure
  def test_reduce_same_size(self):
    Tensor.manual_seed(0)
    a = Tensor.randn(4, 4).realize()
    out0 = a.sum() + 2
    out1 = a.sum() + 4
    out2 = out0 * out1
-    run_schedule(check_schedule([out0, out1, out2], 1))
+    run_schedule(check_schedule([out0, out1, out2], 1 if RANGEIFY else 4))
    np.testing.assert_allclose(out0.numpy(), out0_np:=a.numpy().sum()+2, atol=1e-4, rtol=1e-6)
    np.testing.assert_allclose(out1.numpy(), out1_np:=a.numpy().sum()+4, atol=1e-4, rtol=1e-6)
    np.testing.assert_allclose(out2.numpy(), out0_np*out1_np, atol=1e-4, rtol=1e-6)

-  @unittest.expectedFailure
  def test_reduce_multiple_paths(self):
    Tensor.manual_seed(0)
    a = Tensor.randn(4, 4).realize()
    out0 = a.sum().exp2()
    # out1 has two paths to a.sum()
    out1 = a.sum() + out0
-    run_schedule(check_schedule([out0, out1], 1))
+    run_schedule(check_schedule([out0, out1], 1 if RANGEIFY else 3))
    np.testing.assert_allclose(out0.numpy(), out0_np:=np.exp2(a.numpy().sum()), atol=1e-4, rtol=1e-4)
    np.testing.assert_allclose(out1.numpy(), a.numpy().sum()+out0_np, atol=1e-4, rtol=1e-6)

@ -983,7 +1010,6 @@ class TestSchedule(unittest.TestCase):
    np.testing.assert_allclose(e.numpy(), e_np:=b.numpy() + out0_np, atol=1e-4, rtol=1e-4)
    np.testing.assert_allclose(out1.numpy(), r_np + e_np[0][0][0], atol=1e-4, rtol=1e-4)

-  # changed by multireduce
  def test_reduce_expand_child(self):
    Tensor.manual_seed(0)
    a = Tensor.randn((32, 32, 32)).realize()
@ -995,13 +1021,12 @@ class TestSchedule(unittest.TestCase):
    np.testing.assert_allclose(out0.numpy(), a.numpy().sum()+2, atol=1e-4, rtol=1e-4)
    np.testing.assert_allclose(out1.numpy(), a.numpy().sum()+b.numpy(), atol=1e-4, rtol=1e-4)

-  @unittest.expectedFailure
  def test_reduce_shrink_child(self):
    a = Tensor.empty(100, 100)
    b = Tensor.empty(10,)
    c = a.sum() + b[0]
    d = a.sum() + 2
-    check_schedule([c, d], 1)
+    check_schedule([c, d], 1 if RANGEIFY else 3)

  def test_reduce_multiple_paths_midshrink(self):
    a = Tensor.empty(4, 4)
@ -1024,20 +1049,6 @@ class TestSchedule(unittest.TestCase):
    run_schedule(check_schedule(out, 2))
    np.testing.assert_allclose(out.numpy(), x.numpy().std(axis=-1, ddof=1), atol=1e-4, rtol=1e-4)

-  def test_argmin_multireduce_fusion(self):
-    Tensor.manual_seed(0)
-    x = Tensor.randn(4, 32).realize()
-    out = x.argmin(-1)
-    run_schedule(check_schedule(out, 2))
-    np.testing.assert_equal(out.numpy(), x.numpy().argmin(axis=-1))
-
-  def test_argmax_multireduce_fusion(self):
-    Tensor.manual_seed(0)
-    x = Tensor.randn(4, 32).realize()
-    out = x.argmax(-1)
-    run_schedule(check_schedule(out, 2))
-    np.testing.assert_equal(out.numpy(), x.numpy().argmax(axis=-1))
-
  def test_scaled_dot_product_attention_multireduce_fusion(self):
    Tensor.manual_seed(0)
    q = Tensor.randn(32,8,16,8).realize()
@ -1179,13 +1190,14 @@ class TestSchedule(unittest.TestCase):
    np.testing.assert_allclose(out.numpy(), expected, atol=1e-4, rtol=1e-4)

  @unittest.skipUnless(is_dtype_supported(dtypes.half), "need half")
+  @expect_rangeify_fails
  def test_softmax_upcast(self):
    # input half, softmax in float
    Tensor.manual_seed(0)
    x = Tensor.randn(4, 12, 64, 64, dtype=dtypes.half).realize()
    out = x.softmax(dtype=dtypes.float)
    sched = out.schedule()
-    self.assertEqual(len(sched), 3)
+    self.assertEqual(len(sched), 2 if RANGEIFY else 3)
    self.assertEqual(sched[0].bufs[0].dtype, dtypes.half)

    # input float, softmax in float
@ -1202,7 +1214,6 @@ class TestSchedule(unittest.TestCase):
    x.softmax().sum().backward()
    run_schedule(check_schedule(x.grad, 4))

-  # changed by: multireduce spec
  def test_layernorm_onelayer_fusion(self):
    Tensor.manual_seed(0)
    layer = nn.LayerNorm([10, 10])
@ -1316,6 +1327,7 @@ class TestSchedule(unittest.TestCase):
      with Context(FUSE_CONV_BW=1): check_schedule(opt.schedule_step(), 14)

  @unittest.skipUnless(is_dtype_supported(dtypes.half), "need half")
+  @expect_rangeify_fails
  def test_prefer_half_buffer(self):
    x = Tensor.ones(4).contiguous().realize()
    # y = Tensor.ones(4).contiguous().realize()
@ -1433,7 +1445,6 @@ class TestSchedule(unittest.TestCase):
    run_schedule(schedule)
    np.testing.assert_allclose(b.numpy(), a.numpy().sum(0)+a.numpy().max(0) + a.numpy().max(1)+a.numpy().sum(1)+2, atol=1e-4, rtol=1e-4)

-  # changed by: multireduce spec
  # pattern in test_transformer
  def test_partial_fuse1(self):
    Tensor.manual_seed(0)
@ -1446,7 +1457,6 @@ class TestSchedule(unittest.TestCase):
    np.testing.assert_allclose(c.numpy(), a.numpy().sum()+2, atol=1e-4, rtol=1e-4)
    np.testing.assert_allclose(d.numpy(), (a.numpy().sum() - b.numpy().sum()) * 4, atol=1e-4, rtol=1e-4)

-  # changed by: multireduce spec
  # pattern in conv
  def test_partial_fuse2(self):
    Tensor.manual_seed(0)
@ -1459,9 +1469,7 @@ class TestSchedule(unittest.TestCase):
    np.testing.assert_allclose(c.numpy(), a.numpy().sum()+2, atol=1e-4, rtol=1e-4)
    np.testing.assert_allclose(d.numpy(), b.numpy().sum()-(a.numpy().sum()+2), atol=1e-4, rtol=1e-4)

-  # changed by: multireduce spec
  # pattern in adam
-  @unittest.expectedFailure
  def test_partial_fuse3(self):
    Tensor.manual_seed(0)
    a = Tensor.randn(16, 16).realize()
@ -1471,14 +1479,12 @@ class TestSchedule(unittest.TestCase):
    e = c * d
    f = b.sum() - e
    # run_schedule(check_schedule([c, d, e, f], 1))
-    run_schedule(check_schedule([c, d, e, f], 2))
+    run_schedule(check_schedule([c, d, e, f], 2 if RANGEIFY else 5))
    np.testing.assert_allclose(c.numpy(), c_np:=a.numpy().sum()+2, atol=1e-4, rtol=1e-4)
    np.testing.assert_allclose(d.numpy(), d_np:=a.numpy().sum()*2, atol=1e-4, rtol=1e-4)
    np.testing.assert_allclose(e.numpy(), e_np:=c_np*d_np, atol=1e-4, rtol=1e-4)
    np.testing.assert_allclose(f.numpy(), b.numpy().sum() - e_np, atol=1e-4, rtol=1e-4)

-  # changed by: multireduce spec
-  @unittest.expectedFailure
  def test_partial_fuse4(self):
    Tensor.manual_seed(0)
    a = Tensor.randn(16, 16).realize()
@ -1488,7 +1494,7 @@ class TestSchedule(unittest.TestCase):
    e = c * d
    f = (b - d).sum() - e
    # run_schedule(check_schedule([c, d, e, f], 1))
-    run_schedule(check_schedule([c, d, e, f], 3))
+    run_schedule(check_schedule([c, d, e, f], 5))
    np.testing.assert_allclose(c.numpy(), c_np:=a.numpy().sum()+2, atol=1e-4, rtol=1e-4)
    np.testing.assert_allclose(d.numpy(), d_np:=a.numpy().sum()*2, atol=1e-4, rtol=1e-4)
    np.testing.assert_allclose(e.numpy(), e_np:=c_np*d_np, atol=1e-4, rtol=1e-4)
@ -1623,11 +1629,11 @@ class TestSchedule(unittest.TestCase):
    out = x.argmax(1)
    run_schedule(check_schedule(out, 2))

-  def test_conv2d(self): _test_conv2d(7)
-  def test_conv2d_fused(self): _test_conv2d(5, FUSE_CONV_BW=1)
+  def test_conv2d(self): _test_conv2d(4 if RANGEIFY else 7)
+  def test_conv2d_fused(self): _test_conv2d(4 if RANGEIFY else 5, FUSE_CONV_BW=1)

  @unittest.skipUnless(is_dtype_supported(dtypes.half) and is_dtype_supported(dtypes.ulong), "need half and ulong")
-  def test_conv2d_half(self): _test_conv2d(7, dtype=dtypes.half)
+  def test_conv2d_half(self): _test_conv2d(4 if RANGEIFY else 7, dtype=dtypes.half)
  @unittest.skipUnless(is_dtype_supported(dtypes.half), "need half")
  @unittest.skipIf(Device.DEFAULT == "WEBGPU", "Causes other tests to fail")
  @unittest.expectedFailure
@ -1654,7 +1660,8 @@ class TestSchedule(unittest.TestCase):
    constv = Tensor.empty(2, 2).uop.const_like(10).contiguous()
    check_schedule(constv, 1)

-  @unittest.skipIf(Device.DEFAULT != "GPU", "image only supported on GPU")
+  @unittest.skipIf(Device.DEFAULT != "CL", "image only supported on CL")
+  @expect_rangeify_fails
  def test_image_matmul(self):
    with Context(IMAGE=2):
      x = Tensor.randn((9, 9)).realize()
@ -1690,6 +1697,7 @@ class TestSchedule(unittest.TestCase):
  def test_late_fusion_post_expand(self):
    self._test_fusion([(32, 32)], lambda a:a-a.sum(1), 2)

+  @expect_rangeify_fails
  def test_cast_padded_view(self):
    a = Tensor.arange(4).reshape(1, 4)
    casted_view = a.pad(((0, 1), (0, 0))).cast(dtypes.float)
@ -1719,6 +1727,7 @@ class TestSchedule(unittest.TestCase):
    self.assertListEqual(realized_const_view.tolist(), [[1, 1, 1, 1], [1, 1, 1, 1], [1, 1, 1, 1], [1, 1, 1, 1]])

  @given(strat.sampled_from(dtypes.all), strat.sampled_from(dtypes.all))
+  @expect_rangeify_fails
  def test_cast_padded_const(self, dt1, dt2):
    assume(is_dtype_supported(dt1) and is_dtype_supported(dt2))
    a = Tensor(1, dtype=dt1).reshape(1, 1).pad(((1, 1), None))
@ -1728,53 +1737,41 @@ class TestSchedule(unittest.TestCase):
    run_schedule(check_schedule(realized_const_view, 1))
    np.testing.assert_equal(realized_const_view.numpy(), [[0], [1], [0]])

-class TestIndexing(unittest.TestCase):
-  def check_schedule(self, xt:Tensor|list[Tensor], cnt:int):
-    with Context(FUSE_ARANGE=getenv("FUSE_ARANGE", 1)):
-      lst = [xt] if isinstance(xt, Tensor) else xt
-      s = Tensor.schedule(*lst)
-      lowered = [x[1] for x in lower_schedule(s.copy())]
-      kernels = [ei for ei in list(lowered) if isinstance(ei.prg, CompiledRunner)]
-      if FUSE_ARANGE and len(kernels) != cnt:
-        raise KernelCountException(f"{len(kernels)} != {cnt}")
-      for ei in lowered: ei.run(do_update_stats=True)
-    return s
-
  def test_simple_indexing(self):
    X = Tensor.randn(10, 10).realize()
    idxs = Tensor([0, 2]).realize()
    xt = X[idxs]
-    self.check_schedule(xt, 2)
+    run_schedule(check_schedule(xt, 2))
    np.testing.assert_equal(xt.numpy(), X.numpy()[idxs.numpy()])

  def test_simple_indexing_alt(self):
    X = Tensor.arange(16).reshape(4, 4)
    xt = X[[1, 2], [-1, 2]]
-    self.check_schedule(xt, 1)
+    run_schedule(check_schedule(xt, 1))
    np.testing.assert_equal(xt.numpy(), (np.arange(16).reshape(4, 4))[[1, 2], [-1, 2]])

  def test_advanced_indexing(self):
    X = Tensor.arange(10)+1
    xt = X[[0, -1]]
-    self.check_schedule(xt, 1)
+    run_schedule(check_schedule(xt, 1))
    np.testing.assert_equal(xt.numpy(), (np.arange(10)+1)[[0, -1]])

  def test_advanced_indexing_alt(self):
    X = Tensor.arange(6).reshape(3, 2)+1
    xt = X[[Tensor([2]), Tensor([1])]]
-    self.check_schedule(xt, 3)
+    run_schedule(check_schedule(xt, 3))
    np.testing.assert_equal(xt.numpy(), 6)

  def test_advanced_simple_indexing_combined(self):
    X = Tensor.arange(16).reshape(4, 4)
    xt = X[1:2, [-1, 2]]
-    self.check_schedule(xt, 1)
+    run_schedule(check_schedule(xt, 1))

  def test_push_through_reshape(self):
    Tensor.manual_seed(0)
    x = Tensor.randn(10, 20).realize()
    out = x.argmax(1)
-    self.check_schedule(out, 2)
+    run_schedule(check_schedule(out, 2))
    np.testing.assert_allclose(out.numpy(), np.argmax(x.numpy(), 1))

  def test_arange_push_through_expand(self):
@ -1782,35 +1779,35 @@ class TestIndexing(unittest.TestCase):
    a = Tensor.arange(4,)
    b = Tensor.randn(4, 4).realize()
    out = (a+b).sum()
-    self.check_schedule(out, 1)
+    run_schedule(check_schedule(out, 1))
    np.testing.assert_allclose(out.numpy(), (np.arange(4)+b.numpy()).sum(), atol=1e-5)

  def test_argmin(self):
    Tensor.manual_seed(0)
    x = Tensor.randn(4, 32).realize()
    out = x.argmin(-1)
-    self.check_schedule(out, 2)
+    run_schedule(check_schedule(out, 2))
    np.testing.assert_equal(out.numpy(), x.numpy().argmin(axis=-1))

  def test_argmax(self):
    Tensor.manual_seed(0)
    x = Tensor.randn(4, 32).realize()
    out = x.argmax(-1)
-    self.check_schedule(out, 2)
+    run_schedule(check_schedule(out, 2))
    np.testing.assert_equal(out.numpy(), x.numpy().argmax(axis=-1))

  def test_arange_transposed(self):
    Tensor.manual_seed(0)
    x = Tensor.randint(4, 1).realize()
    a = ((Tensor.arange(4,)*x).T).sum()
-    self.check_schedule(a, 1)
+    run_schedule(check_schedule(a, 1))
    np.testing.assert_equal(a.numpy(), (np.arange(4)*x.numpy()).T.sum())

  def test_div_padded_arange(self):
    x = Tensor.full((2,2), 16)
    y = x.idiv(Tensor.linspace(2, 8, steps=4, dtype=dtypes.int).reshape(2,2)).pad(((1,1), (1,1)))
    out = y.sum(axis=1)
-    with Context(FUSE_ARANGE=1): run_schedule(check_schedule(out, 2))
+    run_schedule(check_schedule(out, 2))
    self.assertListEqual(out.tolist(), [0, 12, 4, 0])

  def test_arange_transposed_descendants(self):
@ -1819,7 +1816,7 @@ class TestIndexing(unittest.TestCase):
    a = (Tensor.arange(4,)*x).T
    b = Tensor.randint(4, 4).realize()
    out = (a+b).sum()
-    self.check_schedule(out, 1)
+    run_schedule(check_schedule(out, 1))
    np.testing.assert_equal(out.numpy(), ((np.arange(4)*x.numpy()).T+b.numpy()).sum())

  def test_arange_index(self):
@ -1827,7 +1824,7 @@ class TestIndexing(unittest.TestCase):
    x = Tensor.randn(5, 2).realize()
    a = Tensor.arange(10)
    out = (x + a[2]).sum()
-    self.check_schedule(out, 1)
+    run_schedule(check_schedule(out, 1))
    np.testing.assert_allclose(out.numpy(), (x.numpy()+np.arange(10)[2]).sum(), atol=1e-5, rtol=1e-6)

  def test_arange_index_shrink(self):
@ -1836,14 +1833,14 @@ class TestIndexing(unittest.TestCase):
      x = Tensor.randn(11).realize()
    a = Tensor.arange(22)
    out = (x + a[:11]).sum()
-    self.check_schedule(out, 1)
+    check_schedule(out, 1)

  def test_arange_index_contiguous(self):
    Tensor.manual_seed(0)
    x = Tensor.randn(5, 2).realize()
    a = Tensor.arange(10).contiguous()
    out = (x + a[2]).sum()
-    self.check_schedule(out, 3)
+    run_schedule(check_schedule(out, 3))
    np.testing.assert_allclose(out.numpy(), (x.numpy()+np.arange(10)[2]).sum(), atol=1e-5, rtol=1e-6)

  def test_arange_index_child(self):
@ -1851,62 +1848,24 @@ class TestIndexing(unittest.TestCase):
    x = Tensor.randn(5, 2).realize()
    a = Tensor.arange(10)+1
    out = (x + a[2]).sum()
-    self.check_schedule(out, 1)
+    run_schedule(check_schedule(out, 1))
    np.testing.assert_allclose(out.numpy(), (x.numpy()+(np.arange(10)+1)[2]).sum(), atol=1e-5, rtol=1e-6)

-  def test_arange_index_contiguous_child(self):
+  def test_user_contiguous(self):
    Tensor.manual_seed(0)
    x = Tensor.randn(5, 2).realize()
    a = (Tensor.arange(10)+1).contiguous()
    out = (x + a[2]).sum()
-    self.check_schedule(out, 3)
+    run_schedule(check_schedule(out, 3))
    np.testing.assert_allclose(out.numpy(), (x.numpy()+(np.arange(10)+1)[2]).sum(), atol=1e-5, rtol=1e-6)

-  def test_arange_childless_base(self):
-    a = Tensor.arange(4)
-    self.check_schedule(a, 1)
-    np.testing.assert_equal(a.numpy(), np.arange(4))
-
-  def test_arange_childless_view(self):
-    a = Tensor.arange(4).reshape(2, 2)
-    a[0] = 4
-    np.testing.assert_equal(a.numpy(), [[4, 4], [2, 3]])
-
-  def test_arange_group_childless_base(self):
-    Tensor.manual_seed(0)
-    x = Tensor.randint(4).realize()
-    a = Tensor.arange(4)+x
-    self.check_schedule(a, 1)
-    np.testing.assert_equal(a.numpy(), np.arange(4)+x.numpy())
-
-  def test_arange_group_childless_view(self):
-    Tensor.manual_seed(0)
-    x = Tensor.ones(4).contiguous().realize()
-    a = Tensor.arange(4)+x
-    a[0] = 6
-    np.testing.assert_equal(a.numpy(), [6., 2., 3., 4.])
-
  @unittest.skip("BUFFER_VIEW no longer supported on non-disk devices")
  def test_arange_view_op(self):
    a = Tensor.arange(12).reshape(4, 3).shrink(((1, 2), (1, 3))).contiguous()
-    sched = self.check_schedule(a, 1)
+    sched = run_schedule(check_schedule(a, 1))
    self.assertIs(sched[1].ast.op, Ops.BUFFER_VIEW)
    np.testing.assert_equal(a.numpy(), [[4, 5]])

-  @unittest.skipIf(Device.DEFAULT == "CPU", "tests copy from ext device")
-  def test_arange_shrink_copy(self):
-    a = Tensor.arange(12).reshape(4, 3).shrink(((1, 2), (1, 3))).to("CPU")
-    sched = self.check_schedule(a, 2) # NOTE: there is a contiguous between REDUCE_AXIS and COPY
-    self.assertIs(sched[-1].ast.op, Ops.COPY)
-    np.testing.assert_equal(a.numpy(), [[4, 5]])
-
-  @unittest.skipIf(Device.DEFAULT == "CPU", "tests copy from ext device")
-  def test_arange_expand_copy(self):
-    a = Tensor.arange(4).reshape(2, 2, 1).expand(2, 2, 2).contiguous().to("CPU")
-    sched = self.check_schedule(a, 2) # NOTE: there is a contiguous between REDUCE_AXIS and COPY
-    self.assertIs(sched[2].ast.op, Ops.COPY)
-    np.testing.assert_equal(a.numpy(), [[[0, 0], [1, 1]], [[2, 2], [3, 3]]])
-
  @unittest.skipUnless(is_dtype_supported(dtypes.half), "need half")
  def test_precompute_freqs_cis(self):
    from extra.models.llama import precompute_freqs_cis
@ -1922,23 +1881,33 @@ class TestIndexing(unittest.TestCase):
  def test_fuse_assign_contiguous(self):
    x = Tensor.zeros(4, 4, dtype=dtypes.int).contiguous().realize()
    a = Tensor.arange(8).reshape(4, 2)
-    self.check_schedule(x.shrink((None, (0, 2))).assign(a.contiguous()), 2)
+    run_schedule(check_schedule(x.shrink((None, (0, 2))).assign(a.contiguous()), 2))
    np.testing.assert_equal(x.numpy(), [[0, 1, 0, 0], [2, 3, 0, 0], [4, 5, 0, 0], [6, 7, 0, 0]])

-  def test_assign_non_contiguous(self):
-    x = Tensor.zeros(4, 4, dtype=dtypes.int).contiguous().realize()
-    y = Tensor.randint(4, 2)
-    a = Tensor.arange(8).reshape(4, 2)+y
-    x.shrink((None, (0, 2))).assign(a).realize()
-    xref = np.zeros((4, 4), dtype=int)
-    xref[:, :2] = np.arange(8).reshape(4, 2)+y.numpy()
+  def test_assign_non_contiguous_alt(self): self.test_assign_non_contiguous(alt=True)
+  def test_assign_non_contiguous(self, alt=False):
+    x = (Tensor.arange(16)-100).reshape(4,4).contiguous().realize()
+    xref = x.numpy()
+    if alt:
+      y = Tensor.randint(2, 4).contiguous().realize()
+      a = Tensor.arange(8).reshape(2, 4)+y
+      tst = x.shrink(((0, 2), None)).assign(a).realize()
+      xref[:2, :] = np.arange(8).reshape(2, 4)+y.numpy()
+    else:
+      y = Tensor.randint(4, 2).contiguous().realize()
+      a = Tensor.arange(8).reshape(4, 2)+y
+      tst = x.shrink((None, (0, 2))).assign(a).realize()
+      xref[:, :2] = np.arange(8).reshape(4, 2)+y.numpy()
    np.testing.assert_equal(x.numpy(), xref)
+    if RANGEIFY > 0:
+      # NOTE: this is a bug on non rangeify
+      np.testing.assert_equal(tst.numpy(), a.numpy())

  def test_sparse_categorical_crossentropy_simple(self):
    X = Tensor([[0, 2, 3], [1, 2, 3]]).realize()
    Y = Tensor([1, 2]).realize()
    loss = X.sparse_categorical_crossentropy(Y)
-    self.check_schedule(loss, 4)
+    run_schedule(check_schedule(loss, 4))
    np.testing.assert_allclose(loss.item(), 0.878309, atol=1e-5, rtol=1e-6)

  @unittest.skipIf(Device.DEFAULT == "WEBGPU", "Validation error on WebGPU")
@ -1950,28 +1919,21 @@ class TestIndexing(unittest.TestCase):
    yt = Tensor.randn(BS, 10).realize()
    with Context(SPLIT_REDUCEOP=0):
      loss = yt.sparse_categorical_crossentropy(Y_train[samples])
-      self.check_schedule(loss, 6)
+      run_schedule(check_schedule(loss, 6))
      loss_fused = loss.numpy()
    loss_ref = torch.nn.CrossEntropyLoss()(torch.tensor(yt.numpy()), torch.tensor(Y_train.numpy())[torch.tensor(samples.numpy())])
    np.testing.assert_allclose(loss_fused, loss_ref.numpy(), atol=1e-6, rtol=1e-6)

-  @unittest.expectedFailure
  def test_arange_fuse_grouped_children(self):
    X = Tensor.randn(4, 4).realize()
    r = (X+Tensor.arange(16).reshape(4, 4)).sum()
    out0 = r+2
    out1 = r+3
-    self.check_schedule([out0, out1], 1)
+    run_schedule(check_schedule([out0, out1], 1 if RANGEIFY else 3))
    r_ref = (X.numpy()+np.arange(16).reshape(4, 4)).sum()
    np.testing.assert_allclose(out0.numpy(), r_ref+2, rtol=2e-7)
    np.testing.assert_allclose(out1.numpy(), r_ref+3, rtol=2e-7)

-  def test_dont_fold_arange_contiguous_view(self):
-    X = Tensor.randn(4, 4).realize()
-    r = (X+Tensor.arange(16).reshape(4, 4).contiguous()).sum(1, keepdim=True)
-    self.check_schedule([r], 2)
-    np.testing.assert_allclose(r.numpy(), (X.numpy()+np.arange(16).reshape(4, 4)).sum(1, keepdims=True), atol=1e-5, rtol=1e-6)
-
  @unittest.skip("multi output isn't supported")
  def test_multiview_arange_children(self):
    X = Tensor.randn(2,3,4,4).numpy()
@ -2101,6 +2063,7 @@ class TestView(unittest.TestCase):
    run_schedule(sched)
    np.testing.assert_equal(b.numpy(), 0)

+  @expect_rangeify_fails
  def test_mask_dim_1(self):
    # mask out dim = 1 works too
    a = Tensor.rand(10, 10).realize()
@ -2127,6 +2090,7 @@ class TestView(unittest.TestCase):

  # a*VIEW(x), where VIEW(x) = 0
  # x collapses along with its children
+  @unittest.skipIf(RANGEIFY, "this only fails if you run all of TestSchedule, some global tensor map bug?")
  def test_parent_view_collapses(self):
    a = Tensor([1, 2])
    b = Tensor.arange(3).contiguous()
@ -2208,84 +2172,6 @@ class TestSimplifier(unittest.TestCase):
    assert UPat(Ops.CONST, arg=False).match(sink, {}), f"expected {sink} to collapse to a const False"
    assert sink.shape == a.shape

-tensor_const_pm = PatternMatcher([
-  (UPat(Ops.CONST, src=(UPat(Ops.VIEW, src=(UPat(Ops.DEVICE),)),)), lambda: True),
-  (UPat(Ops.BIND, src=(UPat(Ops.DEFINE_VAR, src=(UPat(Ops.VIEW, src=(UPat(Ops.DEVICE),)))), UPat(Ops.CONST))), lambda: True),
-])
-class TestConst(unittest.TestCase):
-  # ** part 1: basic functionality of a tensor directly created from CONST
-
-  def test_tensor_const(self):
-    a = Tensor(1)
-    print(a.uop)
-    self.assertTrue(tensor_const_pm.rewrite(a.uop))
-
-  def test_tensor_variable(self):
-    vv = UOp.variable("a", 0, 10).bind(1)
-    a = Tensor(vv)
-    print(a.uop)
-    self.assertTrue(tensor_const_pm.rewrite(a.uop))
-
-  def test_const_schedule(self):
-    a = Tensor.ones((4, 4))
-    sched = a.schedule()
-    self.assertEqual(len(sched), 0)
-
-  def test_const_contiguous_schedule(self):
-    # this ends up in the big graph
-    a = Tensor.ones((4,)).contiguous()
-    sched = a.schedule()
-    self.assertEqual(len(sched), 1)
-
-  # ** part 2: scheduler behavior when const folding happens later
-
-  def test_const_folding_no_realize(self):
-    a = Tensor([1, 2, 3, 4])*0
-    sched = a.schedule()
-    self.assertEqual(len(sched), 0)
-
-  def test_src_const_folding(self):
-    with Context(TRACK_MATCH_STATS=0):
-      a = Tensor.full((4,), 1).contiguous().realize()
-      b = Tensor.full((4,), 2).contiguous().realize()
-    mul0 = a*0
-    add = b+mul0
-    sched = add.schedule()
-    self.assertEqual(len(sched), 0)
-    # b+0 and b share the same underlying device memory
-    self.assertIs(add.uop.buffer, b.uop.buffer)
-    self.assertListEqual(add.tolist(), [2, 2, 2, 2])
-
-  def test_src_masked_const_folding(self):
-    with Context(TRACK_MATCH_STATS=0):
-      a = Tensor.full((4,), 1).contiguous().realize()
-      b = Tensor.full((6,), 2).contiguous().realize()
-    mul0 = a*0
-    add = b+mul0.pad((1, 1), value=2)
-    sched = add.schedule()
-    self.assertEqual(len(sched), 1)
-    run_schedule(sched)
-    # add gets assigned to a new buffer
-    self.assertIsNot(add.uop.base.realized, b.uop.base.realized)
-    self.assertListEqual(add.tolist(), [4, 2, 2, 2, 2, 4])
-
-  # ** part 3: Tensor variable bindings
-
-  #@unittest.expectedFailure # TODO: should schedule assert if you try to realize a Variable?
-  def test_var_schedule(self):
-    vv = UOp.variable("a", 0, 10).bind(1)
-    a = Tensor(vv)
-    sched = a.schedule()
-    self.assertEqual(len(sched), 0)
-
-  def test_add_tvar(self):
-    vv = UOp.variable("a", 0, 10).bind(1)
-    a = Tensor(vv)+2
-    sched, var_vals = a.schedule_with_vars()
-    self.assertEqual(len(sched), 1)
-    run_schedule(sched, var_vals)
-    self.assertEqual(a.tolist(), 3)
-
@unittest.skipIf(Device.DEFAULT == "CPU", "tests copy from another device to cpu")
 class TestCopyFolding(unittest.TestCase):
  def test_const_copy_is_free(self):
@ -2299,6 +2185,7 @@ class TestCopyFolding(unittest.TestCase):
    b = (a*zeros).to("CPU")
    run_schedule(check_schedule(b, 0, filter_sink=False))
    self.assertListEqual(b.tolist(), [0, 0, 0])
+    self.assertEqual(b.device, "CPU")

  def test_alu_after_copy(self):
    a = Tensor.ones((4,)).to("CPU")
@ -2307,6 +2194,12 @@ class TestCopyFolding(unittest.TestCase):
    add.kernelize()
    assert all_same([x.device for x in add.uop.src]), f"ALU has different devices! {[x.device for x in add.src]}"

+  def test_alu_before_copy(self):
+    buf = Tensor.ones(1).contiguous().realize()
+    a = buf+1
+    b = a.to("CPU")
+    self.assertListEqual(b.tolist(), [2.])
+
  def test_copy_to_same_device(self):
    a = Tensor.empty(4).uop
    b = a.copy_to_device(a.device)
@ -2323,6 +2216,15 @@ class TestCopyFolding(unittest.TestCase):
    b = schedule_graph_rewrite(b)
    self.assertIs(b.base, a.base)

+  def test_copy_to_same_device_sched(self):
+    a = Tensor.ones(4).contiguous().realize().uop.as_buf()
+    t = Tensor(a.copy_to_device(a.device))
+    sched = t.schedule()
+    assert len([s for s in sched if s.ast.op is Ops.COPY]) == 0
+    run_schedule(sched)
+    assert t.uop.is_realized, f"didn't realize Tensor {t}"
+    self.assertListEqual(t.tolist(), [1.,1.,1.,1.])
+
  def test_clone(self):
    a = Tensor.empty(4)
    check_schedule(a.clone(), 1, filter_sink=False)
@ -2353,6 +2255,7 @@ class TestCopyFolding(unittest.TestCase):
    b.realize()
    self.assertListEqual(b.tolist(), [[0, 2], [1, 3]])

+  @expect_rangeify_fails
  def test_permute_on_disk(self):
    with open(temp('dt_arange_4_permute'), "wb") as f: f.write(Tensor.arange(4).realize().uop.base.buffer.as_buffer())
    a = Tensor.empty(4, dtype=dtypes.int32, device=f"disk:{temp('dt_arange_4_permute')}")
@ -2499,6 +2402,7 @@ class TestUOpBecome(unittest.TestCase):
    self.assertEqual(add.uop.shape, (8, 2))
    assert add.uop is not add.uop.base

+  @expect_rangeify_fails
  def test_new_flat_buffer(self):
    a = Tensor.empty(4,)
    b = Tensor.empty(4,)
@ -2524,6 +2428,7 @@ class TestUOpBecome(unittest.TestCase):
    z = (img*x) / y
    check_schedule(z, 1)

+  @expect_rangeify_fails
  def test_become_existing_buffer(self):
    a = Tensor.empty(4, 4)
    b = a*1
@ -2551,6 +2456,7 @@ class TestUOpBecome(unittest.TestCase):
    check_schedule(b, 0)
    assert UPat(Ops.CONST, arg=0).match(b.uop.base, {}) # scheduling replaces the tensor uop with a VIEW(BUFFER)

+  @expect_rangeify_fails
  def test_become_const_in_view(self):
    # if we shrink the base down to a size 0, only the VIEW becomes CONST, base is unchanged.
    add = Tensor.empty(2, 2)+Tensor.empty(2, 2)
@ -2568,6 +2474,7 @@ class TestUOpBecome(unittest.TestCase):
    assert UPat(Ops.CONST, arg=3).match(const_add.uop.base, {})

  # tensors can become another realized tensor source
+  @expect_rangeify_fails
  def test_become_existing_buf_simple(self):
    a = Tensor.empty(4, 4)
    b = a+0
@ -2576,12 +2483,14 @@ class TestUOpBecome(unittest.TestCase):
    self.assertIs(a.uop, b.uop)

  # they can also chain other movement ops on top of the tensor source
+  @expect_rangeify_fails
  def test_become_existing_buf_view(self):
    a = Tensor.empty(4, 4)
    b = a.permute((1, 0))+0
    check_schedule(b, 0)
    self.assertEqual(b.uop.st, a.uop.permute((1, 0)).st)

+  @expect_rangeify_fails
  def test_become_existing_buf_view_alt(self):
    a = Tensor.empty(4, 4)
    b = a.permute((1, 0)).reshape((8, 2))+0
@ -2589,6 +2498,7 @@ class TestUOpBecome(unittest.TestCase):
    self.assertEqual(b.uop.st, a.uop.permute((1, 0)).reshape((8, 2)).st)

  # they can also have other base parents that simplified, in that case we just backtrack to the chained mops
+  @expect_rangeify_fails
  def test_become_existing_buf_complex(self):
    a = Tensor.empty(4, 4)
    b = (a.permute((1, 0))+0).reshape((8, 2))+0
@ -2596,6 +2506,7 @@ class TestUOpBecome(unittest.TestCase):
    self.assertEqual(b.uop.st, a.uop.permute((1, 0)).reshape((8, 2)).st)
    assert b.uop.base.op is Ops.BUFFER

+  @expect_rangeify_fails
  def test_become_multiple_choices(self):
    a = Tensor.empty(16)
    b = (a.reshape(1, 1, 4, 1, 4)+0).reshape(1, 1, 4, 4).shrink(((0, 1), (0, 1), (0, 3), (0, 3)))+0
@ -2607,6 +2518,7 @@ class TestUOpBecome(unittest.TestCase):
    assert b.uop is c.uop
    assert UPat(Ops.VIEW, src=(UPat(Ops.BUFFER),)).match(c.uop, {})

+  @expect_rangeify_fails
  def test_setitem_becomes_subbuffer(self):
    a = Tensor.full((4,), 2.).contiguous().realize()
    b = a.shrink(((0, 2),)).assign(Tensor.full((2,), 1.0))
--- a/test/test_setitem.py
+++ b/test/test_setitem.py
@ -1,4 +1,6 @@
 import unittest
+import random
+from os import getenv
 from tinygrad import Tensor, TinyJit, Variable, dtypes
 from tinygrad.helpers import Context
 import numpy as np
@ -165,6 +167,41 @@ class TestSetitem(unittest.TestCase):
    t[idx] = val
    self.assertEqual(t.tolist(), [val]*idx_size+[idx_size])

+  def test_setitem_advanced_indexing(self):
+    # Example from https://numpy.org/doc/stable/user/basics.indexing.html#combining-advanced-and-basic-indexing
+    t = Tensor.zeros(10,20,30,40,50).contiguous()
+    ind_1 = Tensor([5,3,7,8])
+    ind_2 = Tensor([[[0],[1],[2]],[[3],[4],[5]]])
+    v = Tensor.arange(2*3*4*10*30*50).reshape(2,3,4,10,30,50)
+    t[:, ind_1, :, ind_2, :] = v
+    n = np.zeros((10,20,30,40,50))
+    n[:, ind_1.numpy(), :, ind_2.numpy(), :] = v.numpy()
+    np.testing.assert_allclose(t.numpy(), n)
+
+  def test_setitem_2d_tensor_indexing(self):
+    t = Tensor.zeros(2).contiguous()
+    index = Tensor([[0, 1], [1,0]])
+    v = Tensor.arange(2*2).reshape(2, 2).contiguous()
+    t[index] = v
+    n = np.zeros((2,))
+    n[index.numpy()] = v.numpy()
+    np.testing.assert_allclose(t.numpy(), n)
+
+  @unittest.skip("slow")
+  def test_setitem_tensor_indexing_fuzz(self):
+    random.seed(getenv("SEED", 42))
+    for _ in range(getenv("ITERS", 100)):
+      size = random.randint(5, 10)
+      d0, d1, d2 = random.randint(1,5), random.randint(1,5), random.randint(1,5)
+      t = Tensor.zeros(size).contiguous()
+      n = np.zeros((size,))
+      index = Tensor.randint((d0, d1, d2), low=0, high=size)
+      v = Tensor.arange(d0*d1*d2).reshape(d0, d1, d2)
+      t[index] = v
+      n[index.numpy()] = v.numpy()
+      np.testing.assert_allclose(t.numpy(), n, err_msg=f"failed with index={index.numpy().tolist()} and v={v.numpy().tolist()}")
+
+
 class TestWithGrad(unittest.TestCase):
  def test_no_requires_grad_works(self):
    z = Tensor.rand(8, 8)
--- a/test/test_symbolic_jit.py
+++ b/test/test_symbolic_jit.py
@ -2,6 +2,7 @@ import unittest

 from test.helpers import assert_jit_cache_len
 from tinygrad import Variable, Tensor, TinyJit
+from tinygrad.helpers import RANGEIFY
 import numpy as np

 class TestSymbolicJit(unittest.TestCase):
@ -11,11 +12,23 @@ class TestSymbolicJit(unittest.TestCase):
    a = Tensor.rand(3, 10)
    for i in range(1, 5):
      vi = Variable("i", 1, 10).bind(i)
-      symbolic = jf(a[:, :vi]).reshape(3, i).numpy()
+      symbolic = jf(a[:, :vi])[:3, :i].numpy()
      expected = f(a[:, :i]).numpy()
      np.testing.assert_allclose(symbolic, expected, atol=1e-6, rtol=1e-6)
    assert_jit_cache_len(jf, 1)

+  def test_plus1_pad(self):
+    # TODO: without contiguous, the pad is not captured in jit
+    def f(a): return (a+1).pad((None, (0, 10-a.shape[1]))).contiguous().realize()
+    jf = TinyJit(f)
+    a = Tensor.rand(3, 10)
+    for i in range(1, 5):
+      vi = Variable("i", 1, 10).bind(i)
+      symbolic = jf(a[:, :vi]).numpy()
+      expected = f(a[:, :i]).numpy()
+      np.testing.assert_allclose(symbolic, expected, atol=1e-6, rtol=1e-6)
+    assert_jit_cache_len(jf, 1 if RANGEIFY else 2) # one add and one pad, can be one kernel?
+
  def test_add(self):
    def f(a, b): return (a+b).realize()
    jf = TinyJit(f)
@ -23,7 +36,8 @@ class TestSymbolicJit(unittest.TestCase):
    b = Tensor.rand(3, 10)
    for i in range(1, 5):
      vi = Variable("i", 1, 10).bind(i)
-      symbolic = jf(a[:, :vi], b[:, :vi]).reshape(3, i).numpy()
+      symbolic = jf(a[:, :vi], b[:, :vi])
+      symbolic = symbolic[:3, :i].numpy()
      expected = f(a[:, :i], b[:, :i]).numpy()
      np.testing.assert_allclose(symbolic, expected, atol=1e-6, rtol=1e-6)
    assert_jit_cache_len(jf, 1)
@ -63,10 +77,10 @@ class TestSymbolicJit(unittest.TestCase):
    v = Tensor.rand(2, 10, 4, 8)
    for i in range(1, 5):
      vi = Variable("i", 1, 10).bind(i)
-      symbolic = jf(q, k[:, :vi], v[:, :vi]).reshape(2, 4, 1, 8).numpy()
+      symbolic = jf(q, k[:, :vi], v[:, :vi])[:2, :4, :1, :8].numpy()
      expected = f(q, k[:, :i], v[:, :i]).numpy()
      np.testing.assert_allclose(symbolic, expected, atol=1e-6, rtol=1e-6)
-    assert_jit_cache_len(jf, 5)
+    assert_jit_cache_len(jf, 4 if RANGEIFY else 5)

  def test_cat_dim0(self):
    def f(a, b): return a.cat(b, dim=0).realize()
@ -75,7 +89,7 @@ class TestSymbolicJit(unittest.TestCase):
    b = Tensor.rand(2, 3)
    for i in range(1, 5):
      vi = Variable("i", 1, 10).bind(i)
-      symbolic = jf(a[:vi], b).reshape(i+2, 3).numpy()
+      symbolic = jf(a[:vi], b)[:i+2, :3].numpy()
      expected = f(a[:i], b).numpy()
      np.testing.assert_allclose(symbolic, expected, atol=1e-6, rtol=1e-6)
    assert_jit_cache_len(jf, 1)
@ -87,7 +101,7 @@ class TestSymbolicJit(unittest.TestCase):
    b = Tensor.rand(3, 2)
    for i in range(1, 5):
      vi = Variable("i", 1, 10).bind(i)
-      symbolic = jf(a[:, :vi], b).reshape(3, i+2).numpy()
+      symbolic = jf(a[:, :vi], b)[:3, :i+2].numpy()
      expected = f(a[:, :i], b).numpy()
      np.testing.assert_allclose(symbolic, expected, atol=1e-6, rtol=1e-6)
    assert_jit_cache_len(jf, 1)
@ -101,7 +115,7 @@ class TestSymbolicJit(unittest.TestCase):
      for j in range(2, 5):
        vi = Variable("i", 1, 10).bind(i)
        vj = Variable("j", 1, 10).bind(j)
-        symbolic = jf(a[:vi], b[:vj]).reshape(i+j, 3).numpy()
+        symbolic = jf(a[:vi], b[:vj])[:i+j, :3].numpy()
        expected = f(a[:i], b[:j]).numpy()
        np.testing.assert_allclose(symbolic, expected, atol=1e-6, rtol=1e-6)
    assert_jit_cache_len(jf, 1)
@ -115,7 +129,7 @@ class TestSymbolicJit(unittest.TestCase):
      for j in range(2, 5):
        vi = Variable("i", 1, 10).bind(i)
        vj = Variable("j", 1, 10).bind(j)
-        symbolic = jf(a[:, :vi], b[:, :vj]).reshape(3, i+j).numpy()
+        symbolic = jf(a[:, :vi], b[:, :vj])[:3, :i+j].numpy()
        expected = f(a[:, :i], b[:, :j]).numpy()
        np.testing.assert_allclose(symbolic, expected, atol=1e-6, rtol=1e-6)
    assert_jit_cache_len(jf, 1)
@ -129,7 +143,7 @@ class TestSymbolicJit(unittest.TestCase):
      for j in range(2, 5):
        vi = Variable("i", 1, 10).bind(i)
        vj = Variable("j", 1, 10).bind(j)
-        symbolic = jf(a[:vi, :], b[:, :vj]).reshape(i, j).numpy()
+        symbolic = jf(a[:vi, :], b[:, :vj])[:i, :j].numpy()
        expected = f(a[:i, :], b[:, :j]).numpy()
        np.testing.assert_allclose(symbolic, expected, atol=1e-6, rtol=1e-6)
    assert_jit_cache_len(jf, 1)
@ -143,7 +157,7 @@ class TestSymbolicJit(unittest.TestCase):
      for j in range(2, 5):
        vi = Variable("i", 1, 10).bind(i)
        vj = Variable("j", 1, 10).bind(j)
-        symbolic = jf(a[:vj, :], b[:, :vi]).reshape(j, i).numpy()
+        symbolic = jf(a[:vj, :], b[:, :vi])[:j, :i].numpy()
        expected = f(a[:j, :], b[:, :i]).numpy()
        np.testing.assert_allclose(symbolic, expected, atol=1e-6, rtol=1e-6)
    assert_jit_cache_len(jf, 1)
@ -195,8 +209,8 @@ class TestSymbolicJit(unittest.TestCase):
      vi = Variable("i", 1, 10).bind(i)
      a = Tensor.ones(vi, 11).contiguous()
      symbolic = a[:, 1:2]
-      symbolic = jf(symbolic).reshape(i, 1).numpy()
-      expected = f(a.reshape(i, 11)[:, 1:2]).numpy()
+      symbolic = jf(symbolic)[:i, :1].numpy()
+      expected = f(a[:i, :][:, 1:2]).numpy()
      np.testing.assert_allclose(symbolic, expected, atol=1e-6, rtol=1e-6)
    assert_jit_cache_len(jf, 1)

@ -231,7 +245,7 @@ class TestSymbolicJit(unittest.TestCase):
      expected = b[:i].mean(0).numpy()
      np.testing.assert_allclose(symbolic, expected, atol=1e-6, rtol=1e-6)
      # axis = 1
-      symbolic = jf1(c[:vi]).reshape(i).numpy()
+      symbolic = jf1(c[:vi])[:i].numpy()
      expected = c[:i].mean(1).numpy()
      np.testing.assert_allclose(symbolic, expected, atol=1e-6, rtol=1e-6)

@ -254,11 +268,11 @@ class TestSymbolicJit(unittest.TestCase):
        expected = a[:i, :j].mean().numpy()
        np.testing.assert_allclose(symbolic, expected, atol=1e-6, rtol=1e-6)
        # axis = 0
-        symbolic = jf0(b[:vi, :vj]).reshape(j).numpy()
+        symbolic = jf0(b[:vi, :vj])[:j].numpy()
        expected = b[:i, :j].mean(0).numpy()
        np.testing.assert_allclose(symbolic, expected, atol=1e-6, rtol=1e-6)
        # axis = 1
-        symbolic = jf1(c[:vi, :vj]).reshape(i).numpy()
+        symbolic = jf1(c[:vi, :vj])[:i].numpy()
        expected = c[:i, :j].mean(1).numpy()
        np.testing.assert_allclose(symbolic, expected, atol=1e-6, rtol=1e-6)

@ -283,7 +297,7 @@ class TestSymbolicJit(unittest.TestCase):
      expected = b[:i].var(0).numpy()
      np.testing.assert_allclose(symbolic, expected, atol=1e-6, rtol=1e-6)
      # axis = 1
-      symbolic = jf1(c[:vi]).reshape(i).numpy()
+      symbolic = jf1(c[:vi])[:i].numpy()
      expected = c[:i].var(1).numpy()
      np.testing.assert_allclose(symbolic, expected, atol=1e-6, rtol=1e-6)

@ -306,11 +320,11 @@ class TestSymbolicJit(unittest.TestCase):
        expected = a[:i, :j].var().numpy()
        np.testing.assert_allclose(symbolic, expected, atol=1e-6, rtol=1e-6)
        # axis = 0
-        symbolic = jf0(b[:vi, :vj]).reshape(j).numpy()
+        symbolic = jf0(b[:vi, :vj])[:j].numpy()
        expected = b[:i, :j].var(0).numpy()
        np.testing.assert_allclose(symbolic, expected, atol=1e-6, rtol=1e-6)
        # axis = 1
-        symbolic = jf1(c[:vi, :vj]).reshape(i).numpy()
+        symbolic = jf1(c[:vi, :vj])[:i].numpy()
        expected = c[:i, :j].var(1).numpy()
        np.testing.assert_allclose(symbolic, expected, atol=1e-6, rtol=1e-6)

--- a/test/test_symbolic_ops.py
+++ b/test/test_symbolic_ops.py
@ -13,7 +13,16 @@ class TestSymbolicOps(unittest.TestCase):
    a = Tensor.rand(3, 10)
    for i in range(1, 5):
      vi = Variable("i", 1, 10).bind(i)
-      symbolic = f(a[:, :vi]).reshape(3, i).numpy()
+      symbolic = f(a[:, :vi])[:3, :i].numpy()
+      expected = f(a[:, :i]).numpy()
+      np.testing.assert_allclose(symbolic, expected, atol=1e-6, rtol=1e-6)
+
+  def test_plus1_pad(self):
+    def f(a): return (a+1).pad((None, (0, 10-a.shape[1]))).realize()
+    a = Tensor.rand(3, 10)
+    for i in range(1, 5):
+      vi = Variable("i", 1, 10).bind(i)
+      symbolic = f(a[:, :vi]).numpy()
      expected = f(a[:, :i]).numpy()
      np.testing.assert_allclose(symbolic, expected, atol=1e-6, rtol=1e-6)

@ -23,7 +32,7 @@ class TestSymbolicOps(unittest.TestCase):
    b = Tensor.rand(3, 10)
    for i in range(1, 5):
      vi = Variable("i", 1, 10).bind(i)
-      symbolic = f(a[:, :vi], b[:, :vi]).reshape(3, i).numpy()
+      symbolic = f(a[:, :vi], b[:, :vi])[:, :i].numpy()
      expected = f(a[:, :i], b[:, :i]).numpy()
      np.testing.assert_allclose(symbolic, expected, atol=1e-6, rtol=1e-6)

@ -46,7 +55,7 @@ class TestSymbolicOps(unittest.TestCase):
      vi = Variable("i", 1, 10).bind(i) if use_symbolic else i
      Tensor.realize(q, k, v)
      GlobalCounters.reset()
-      symbolic = f(q, k[:, :vi, :, :], v[:, :vi, :, :]).reshape(2, 4, 1, 8).numpy()
+      symbolic = f(q, k[:, :vi, :, :], v[:, :vi, :, :])[:2, :4, :1, :8].numpy()
      expected = f(q, k[:, :i, :, :], v[:, :i, :, :]).numpy()
      np.testing.assert_allclose(symbolic, expected, atol=1e-6, rtol=1e-6)

@ -85,7 +94,7 @@ class TestSymbolicOps(unittest.TestCase):
    for i in range(1, 5):
      vi = Variable("i", 1, 10).bind(i)
      b = Tensor.rand(2, 3)
-      symbolic = f(a[:vi, :], b).reshape(i+2, 3).numpy()
+      symbolic = f(a[:vi, :], b)[:i+2, :3].numpy()
      expected = f(a[:i, :], b).numpy()
      np.testing.assert_allclose(symbolic, expected, atol=1e-6, rtol=1e-6)

@ -95,7 +104,7 @@ class TestSymbolicOps(unittest.TestCase):
    for i in range(1, 5):
      vi = Variable("i", 1, 10).bind(i)
      b = Tensor.rand(3, 2)
-      symbolic = f(a[:, :vi], b).reshape(3, i+2).numpy()
+      symbolic = f(a[:, :vi], b)[:3, :i+2].numpy()
      expected = f(a[:, :i], b).numpy()
      np.testing.assert_allclose(symbolic, expected, atol=1e-6, rtol=1e-6)

@ -107,7 +116,7 @@ class TestSymbolicOps(unittest.TestCase):
      for j in range(2, 5):
        vi = Variable("i", 1, 10).bind(i)
        vj = Variable("j", 1, 10).bind(j)
-        symbolic = f(a[:vi, :], b[:vj, :]).reshape(i+j, 3).numpy()
+        symbolic = f(a[:vi, :], b[:vj, :])[:i+j, :3].numpy()
        expected = f(a[:i, :], b[:j, :]).numpy()
        np.testing.assert_allclose(symbolic, expected, atol=1e-6, rtol=1e-6)

@ -119,50 +128,41 @@ class TestSymbolicOps(unittest.TestCase):
      for j in range(2, 5):
        vi = Variable("i", 1, 10).bind(i)
        vj = Variable("j", 1, 10).bind(j)
-        symbolic = f(a[:, :vi], b[:, :vj]).reshape(3, i+j).numpy()
+        symbolic = f(a[:, :vi], b[:, :vj])[:3, :i+j].numpy()
        expected = f(a[:, :i], b[:, :j]).numpy()
        np.testing.assert_allclose(symbolic, expected, atol=1e-6, rtol=1e-6)

  def test_two_vars_plus1_ij(self):
    def f(a, b): return (a@b+1).realize()
-    a = Tensor.rand(10, 3)
-    b = Tensor.rand(3, 10)
+    a = Tensor.rand(10, 3).realize()
+    b = Tensor.rand(3, 10).realize()
    for i in range(2, 5):
      for j in range(2, 5):
        vi = Variable("i", 1, 10).bind(i)
        vj = Variable("j", 1, 10).bind(j)
-        symbolic = f(a[:vi, :], b[:, :vj]).reshape(i, j).numpy()
+        symbolic = f(a[:vi, :], b[:, :vj])[:i, :j].numpy()
        expected = f(a[:i, :], b[:, :j]).numpy()
        np.testing.assert_allclose(symbolic, expected, atol=1e-6, rtol=1e-6)

  def test_two_vars_plus1_ji(self):
    # reverse the order of variables
    def f(a, b): return (a@b+1).realize()
-    a = Tensor.rand(10, 3)
-    b = Tensor.rand(3, 10)
+    a = Tensor.rand(10, 3).realize()
+    b = Tensor.rand(3, 10).realize()
    for i in range(2, 5):
      for j in range(2, 5):
        vi = Variable("i", 1, 10).bind(i)
        vj = Variable("j", 1, 10).bind(j)
-        symbolic = f(a[:vj, :], b[:, :vi]).reshape(j, i).numpy()
+        symbolic = f(a[:vj, :], b[:, :vi])[:j, :i].numpy()
        expected = f(a[:j, :], b[:, :i]).numpy()
        np.testing.assert_allclose(symbolic, expected, atol=1e-6, rtol=1e-6)

-  def test_reshape_from_symbolic(self):
-    a = Tensor.rand(30)
-    for i in range(3, 5):
-      vi = Variable("i", 3, 10).bind(i)
-      symbolic = a[:vi*3].reshape((3, 3)).numpy()
-      # To match symbolic reshape (potential implicit shrink), we need a shrink
-      expected = a[:i*3].shrink(((0, 9),)).reshape((3, 3)).numpy()
-      np.testing.assert_allclose(symbolic, expected, atol=1e-6, rtol=1e-6)
-
  def test_invalid_symbolic_reshape(self):
    a = Tensor.rand(30)
    for i in range(1, 5):
      vi = Variable("i", 1, 10).bind(i)
      # Cannot reshape into symbolic from non-symbolic
-      with self.assertRaises(AssertionError): a.reshape((3, vi))
+      with self.assertRaises(ValueError): a.reshape((3, vi))

  def test_shrink(self):
    for i in range(1, 5):
@ -178,6 +178,7 @@ class TestSymbolicOps(unittest.TestCase):
      vi = Variable("i", 1, 10).bind(i)
      a = Tensor.rand(7, 11)
      symbolic = a[3:5, vi:vi+2]
+      print(symbolic.shape)
      symbolic = symbolic.numpy()
      expected = a[3:5, i:i+2].numpy()
      np.testing.assert_allclose(symbolic, expected, atol=1e-6, rtol=1e-6)
@ -186,7 +187,7 @@ class TestSymbolicOps(unittest.TestCase):
    a = Tensor.rand(7, 11)
    for i in range(1, 5):
      vi = Variable("i", 1, 10).bind(i)
-      symbolic = a[3:5, :vi:1].reshape(2, i).numpy()
+      symbolic = a[3:5, :vi:1][:2, :i].numpy()
      expected = a[3:5, :i:1].numpy()
      np.testing.assert_allclose(symbolic, expected, atol=1e-6, rtol=1e-6)

@ -194,7 +195,7 @@ class TestSymbolicOps(unittest.TestCase):
    for i in range(1, 5):
      vi = Variable("i", 1, 10).bind(i)
      a = Tensor(1).unsqueeze(0).pad((0, 1)).unsqueeze(0)
-      symbolic = a.expand(vi, 2).reshape(i, 2).numpy()
+      symbolic = a.expand(vi, 2)[:i, :2].numpy()
      expected = a.expand(i, 2).numpy()
      np.testing.assert_allclose(symbolic, expected, atol=1e-6, rtol=1e-6)

@ -202,8 +203,8 @@ class TestSymbolicOps(unittest.TestCase):
    for i in range(1, 5):
      vi = Variable("i", 1, 10).bind(i)
      a = Tensor.ones(vi, 11).contiguous()
-      symbolic = a[:, 1:2].reshape(i, 1).numpy()
-      expected = a.reshape(i, 11)[:, 1:2].numpy()
+      symbolic = a[:, 1:2][:i, :1].numpy()
+      expected = Tensor.ones(i, 11)[:, 1:2].numpy()
      np.testing.assert_allclose(symbolic, expected, atol=1e-6, rtol=1e-6)

  def test_ones_sum(self):
@ -220,7 +221,11 @@ class TestSymbolicOps(unittest.TestCase):
      vi = Variable("i", 1, 10).bind(i)
      for axis in [None, 0, 1]:
        expected = a[:i].mean(axis).numpy()
-        symbolic = a[:vi].mean(axis).reshape(expected.shape).numpy()
+        symbolic = a[:vi].mean(axis)
+        if axis is None:
+          symbolic = symbolic.numpy()
+        else:
+          symbolic = symbolic[:expected.shape[0]].numpy()
        np.testing.assert_allclose(symbolic, expected, atol=1e-6, rtol=1e-6)

  def test_mean_2d(self):
@ -231,7 +236,11 @@ class TestSymbolicOps(unittest.TestCase):
        vj = Variable("j", 1, 10).bind(j)
        for axis in [None, 0, 1]:
          expected = a[:i, :j].mean(axis).numpy()
-          symbolic = a[:vi, :vj].mean(axis).reshape(expected.shape).numpy()
+          symbolic = a[:vi, :vj].mean(axis)
+          if axis is None:
+            symbolic = symbolic.numpy()
+          else:
+            symbolic = symbolic[:expected.shape[0]].numpy()
          np.testing.assert_allclose(symbolic, expected, atol=1e-6, rtol=1e-6)

  def test_var(self):
@ -240,7 +249,11 @@ class TestSymbolicOps(unittest.TestCase):
      vi = Variable("i", 1, 10).bind(i)
      for axis in [None, 0, 1]:
        expected = a[:i].var(axis).numpy()
-        symbolic = a[:vi].var(axis).reshape(expected.shape).numpy()
+        symbolic = a[:vi].var(axis)
+        if axis is None:
+          symbolic = symbolic.numpy()
+        else:
+          symbolic = symbolic[:expected.shape[0]].numpy()
        np.testing.assert_allclose(symbolic, expected, atol=1e-6, rtol=1e-6)

  def test_var_2d(self):
@ -251,7 +264,11 @@ class TestSymbolicOps(unittest.TestCase):
        vj = Variable("j", 1, 10).bind(j)
        for axis in [None, 0, 1]:
          expected = a[:i, :j].var(axis).numpy()
-          symbolic = a[:vi, :vj].var(axis).reshape(expected.shape).numpy()
+          symbolic_result = a[:vi, :vj].var(axis)
+          if axis is None:
+            symbolic = symbolic_result.numpy()
+          else:
+            symbolic = symbolic_result[:expected.shape[0]].numpy()
          np.testing.assert_allclose(symbolic, expected, atol=1e-6, rtol=1e-6)

  def test_bitcast_down(self):
@ -259,7 +276,11 @@ class TestSymbolicOps(unittest.TestCase):
    for i in range(1, 5):
      vi = Variable("i", 1, 10).bind(i)
      expected = a[:i].bitcast(dtypes.uint8).numpy()
-      symbolic = a[:vi].bitcast(dtypes.uint8).reshape(expected.shape).numpy()
+      symbolic_result = a[:vi].bitcast(dtypes.uint8)
+      if len(expected.shape) == 2:
+        symbolic = symbolic_result[:expected.shape[0], :expected.shape[1]].numpy()
+      else:
+        symbolic = symbolic_result[:].numpy()
      np.testing.assert_allclose(symbolic, expected, atol=1e-6, rtol=0)

  @unittest.skipUnless(is_dtype_supported(dtypes.uint64), "no uint64")
@ -268,7 +289,11 @@ class TestSymbolicOps(unittest.TestCase):
    for i in range(1, 5):
      vi = Variable("i", 1, 10).bind(i)
      expected = a[:i].bitcast(dtypes.uint64).numpy()
-      symbolic = a[:vi].bitcast(dtypes.uint64).reshape(expected.shape).numpy()
+      symbolic_result = a[:vi].bitcast(dtypes.uint64)
+      if len(expected.shape) == 2:
+        symbolic = symbolic_result[:expected.shape[0], :expected.shape[1]].numpy()
+      else:
+        symbolic = symbolic_result[:].numpy()
      np.testing.assert_allclose(symbolic, expected, atol=1e-6, rtol=0)

  @unittest.expectedFailure
--- a/test/test_tensor.py
+++ b/test/test_tensor.py
@ -4,12 +4,12 @@ import torch
 import unittest, copy, mmap, random, math, array
 from tinygrad import Tensor, Device, dtypes
 from tinygrad.tensor import _METADATA
-from tinygrad.helpers import getenv, temp, mv_address
+from tinygrad.helpers import getenv, temp, mv_address, RANGEIFY
 from extra.gradcheck import numerical_jacobian, jacobian, gradcheck
 from hypothesis import given, settings, strategies as strat
 from tinygrad.device import is_dtype_supported
 from tinygrad.uop.ops import Ops, UOp
-from tinygrad.runtime.support.compiler_cuda import PTX
+from tinygrad.renderer.ptx import PTXRenderer
 from tinygrad.codegen import full_rewrite
 from tinygrad.dtype import DType

@ -516,10 +516,6 @@ class TestTinygrad(unittest.TestCase):
    print(c)

  def test_env_overwrite_default_device(self):
-    subprocess.run(['DISK=1 python3 -c "from tinygrad import Device; assert Device.DEFAULT != \\"DISK\\""'],
-                    shell=True, check=True)
-    subprocess.run(['NPY=1 python3 -c "from tinygrad import Device; assert Device.DEFAULT != \\"NPY\\""'],
-                    shell=True, check=True)
    subprocess.run([f'{Device.DEFAULT}=1 python3 -c "from tinygrad import Device; assert Device.DEFAULT == \\"{Device.DEFAULT}\\""'],
                    shell=True, check=True)
    subprocess.run([f'DISK=1 {Device.DEFAULT}=1 python3 -c "from tinygrad import Device; assert Device.DEFAULT == \\"{Device.DEFAULT}\\""'],
@ -554,6 +550,11 @@ class TestTinygrad(unittest.TestCase):
  def test_shrink(self):
    t = Tensor.arange(32).contiguous().realize()
    self.assertListEqual(t[16:20].tolist(), [16,17,18,19])
+    self.assertListEqual(t.shrink_to(16).tolist(), list(range(16)))
+    t = t.reshape(4, 8).contiguous().realize()
+    self.assertListEqual(t.shrink_to(2, 2).tolist(), [[0, 1], [8, 9]])
+    with self.assertRaises(ValueError): t.shrink_to(2)
+    with self.assertRaises(ValueError): t.shrink_to(2, 2, 2)

@unittest.skip("this test is just flaky, sync issue")
 class TestMoveTensor(unittest.TestCase):
@ -648,17 +649,22 @@ class TestZeroShapeTensor(unittest.TestCase):

  def test_pad(self):
    t = Tensor.rand(3, 2, 0).pad((None, None, (1, 1)), value=1)
-    assert t.shape == (3, 2, 2)
+    self.assertEqual(t.shape, (3, 2, 2))
    np.testing.assert_equal(t.numpy(), np.ones((3, 2, 2)))

    t = Tensor.rand(3, 2, 0).pad((None, (1, 1), None), value=1)
-    assert t.shape == (3, 4, 0)
+    self.assertEqual(t.shape, (3, 4, 0))
    np.testing.assert_equal(t.numpy(), np.ones((3, 4, 0)))

    t = Tensor.rand(3, 2, 0).pad(((1, 1), None, None), value=1)
-    assert t.shape == (5, 2, 0)
+    self.assertEqual(t.shape, (5, 2, 0))
    np.testing.assert_equal(t.numpy(), np.ones((5, 2, 0)))

+    np.testing.assert_equal(Tensor([1, 2]).pad_to(4).numpy(), [1, 2, 0, 0])
+    np.testing.assert_equal(Tensor([[1, 2]]).pad_to(2, 3).numpy(), [[1, 2, 0], [0, 0, 0]])
+    with self.assertRaises(TypeError): Tensor([1, 2]).pad_to(2, 3)
+    with self.assertRaises(TypeError): Tensor([[1, 2]]).pad_to(3)
+
  def test_shrink_into_zero(self):
    t = Tensor.rand(3, 4).realize()
    assert t.shrink((None, (2, 2))).realize().shape == (3, 0)
@ -865,11 +871,18 @@ class TestTensorMetadata(unittest.TestCase):
    self.assertEqual(y.grad.uop.metadata[0].name, "sigmoid")
    self.assertTrue(y.grad.uop.metadata[0].backward)
    si = Tensor.schedule(out, x.grad, y.grad)[-1]
-    self.assertEqual(len(si.metadata), 4, f"failed with {si.metadata}")
-    self.assertSetEqual(set(m.name for m in si.metadata), {"sigmoid", "__mul__", "relu"})
-    bw = [m for m in si.metadata if m.backward]
-    self.assertEqual(len(bw), 2)
-    self.assertEqual(bw[0].name, "sigmoid")
+    if not RANGEIFY:
+      self.assertEqual(len(si.metadata), 4, f"failed with {si.metadata}")
+      self.assertSetEqual(set(m.name for m in si.metadata), {"sigmoid", "__mul__", "relu"})
+      bw = [m for m in si.metadata if m.backward]
+      self.assertEqual(len(bw), 2)
+      self.assertEqual(bw[0].name, "sigmoid")
+    else:
+      self.assertEqual(len(si.metadata), 3, f"failed with {si.metadata}")
+      self.assertSetEqual(set(m.name for m in si.metadata), {"sigmoid", "relu"})
+      bw = [m for m in si.metadata if m.backward]
+      self.assertEqual(len(bw), 1)
+      self.assertEqual(bw[0].name, "sigmoid")

 class TestIdxUpcast(unittest.TestCase):
  def _find_op(self, ast: UOp, op: Ops):
@ -915,13 +928,17 @@ class TestIdxUpcast(unittest.TestCase):
  def test_regular_sym(self):
    self.do_op_then_assert(dtypes.int, 2048, 2048, UOp.variable("dim3", 1, 64).bind(32))

-  @unittest.skipIf(PTX, "PTX always convert Ops.INDEX to int64")
+  @unittest.skipIf(isinstance(Device[Device.DEFAULT].renderer, PTXRenderer), "PTX always convert Ops.INDEX to int64")
  def test_symfold(self):
    # This would cause an overflow, but after sym fold it's within int32
    a = Tensor.arange(65535)
    uops = self._schedule_render(a)
    assert all(uop.dtype is not dtypes.long for uop in uops)

+  def test_arange_raise_overflow(self):
+    with self.assertRaises(ValueError):
+      self._schedule_render(Tensor.arange(2**33, dtype=dtypes.int))
+
  @unittest.skipIf(is_dtype_supported(dtypes.long), "int64 is supported")
  def test_int64_unsupported_overflow_sym(self):
    with self.assertRaises(KeyError):
--- a/test/test_tensor_variable.py
+++ b/test/test_tensor_variable.py
@ -38,7 +38,7 @@ class TestTensorVariable(unittest.TestCase):
    vv = Variable("a", 1, 10).bind(2)
    vv2 = Variable("b", 1, 10).bind(2)
    t = Tensor.ones(10, 10).contiguous()[:vv2, :vv]
-    ret = t.mean(axis=1).reshape(2, 1).numpy()
+    ret = t.mean(axis=1)[:2].reshape(2, 1).numpy()
    assert np.all(ret == 1)

  def test_symbolic_mean_2d_add(self):
@ -66,25 +66,25 @@ class TestTensorVariable(unittest.TestCase):
  def test_symbolic_arange(self):
    vv = Variable("a", 1, 10)
    ret = Tensor.arange(0, vv.bind(4))
-    self.assertListEqual(ret.reshape(4).tolist(), [0,1,2,3])
+    self.assertListEqual(ret[:4].tolist(), [0,1,2,3])

  def test_symbolic_arange_sym_start(self):
    vv = Variable("a", 1, 6)
    ret = Tensor.arange(vv.bind(4), 7)
-    self.assertListEqual(ret.reshape(3).tolist(), [4,5,6])
+    self.assertListEqual(ret[:3].tolist(), [4,5,6])

  # TODO: add vmin/vmax pattern for symbolic denominator
  @unittest.expectedFailure
  def test_symbolic_arange_sym_step(self):
    vv = Variable("step", 1, 3)
    ret = Tensor.arange(0, 10, vv.bind(2))
-    self.assertListEqual(ret.reshape(5).tolist(), [0,2,4,6,8])
+    self.assertListEqual(ret[:5].tolist(), [0,2,4,6,8])

  def test_symbolic_arange_two_vars(self):
    begin = Variable("b", 1, 5)
    end = Variable("e", 6, 10)
    ret = Tensor.arange(begin.bind(4), end.bind(7))
-    self.assertListEqual(ret.reshape(3).tolist(), [4,5,6])
+    self.assertListEqual(ret[:3].tolist(), [4,5,6])

  def test_variable_empty(self):
    v = Variable("i", 1, 10)
--- a/test/test_tiny.py
+++ b/test/test_tiny.py
@ -7,6 +7,14 @@ class TestTiny(unittest.TestCase):

  # *** basic functionality ***

+  def test_const(self):
+    const = Tensor(2.0)
+    self.assertEqual(const.item(), 2.0)
+
+  def test_copy(self):
+    out = Tensor([1.,2,3])
+    self.assertListEqual(out.tolist(), [1.0, 2.0, 3.0])
+
  def test_plus(self):
    out = Tensor([1.,2,3]) + Tensor([4.,5,6])
    self.assertListEqual(out.tolist(), [5.0, 7.0, 9.0])
@ -87,7 +95,7 @@ class TestTiny(unittest.TestCase):
    ones = Tensor.ones(10).contiguous()
    for s in [2,5]:
      ret = ones[:i.bind(s)] + 1
-      self.assertListEqual(ret.contiguous().reshape(s).tolist(), [2.0]*s)
+      self.assertListEqual(ret.contiguous()[:s].tolist(), [2.0]*s)

  def test_symbolic_reduce(self):
    i = Variable('i', 1, 10)
@ -137,7 +145,7 @@ class TestTiny(unittest.TestCase):

  # *** image ***

-  @unittest.skipIf(Device.DEFAULT != "GPU", "image only supported on GPU")
+  @unittest.skipIf(Device.DEFAULT != "CL", "image only supported on CL")
  def test_image(self):
    with Context(IMAGE=2): self.test_gemm(N=4, out_dtype=dtypes.imagef((4, 1, 4)))

--- a/test/test_uop_graph.py
+++ b/test/test_uop_graph.py
@ -417,13 +417,45 @@ class TestUOpGraph(unittest.TestCase):
      uops = to_uops_list([v.bitcast(dt)])
      self.assertEqual(len([x for x in uops if x.op is Ops.BITCAST]), 0, f"dtype = {dt}")

+  def test_where_on_gated_load_fold(self):
+    ridx0 = UOp.range(100, 0)
+    d0 = UOp(Ops.DEFINE_GLOBAL, dtypes.long.ptr(), (), 0)
+    ld = d0.index(ridx0, ridx0<50).load()
+    w = (ridx0<50).where(ld, 5)
+    uops = to_uops_list([w])
+    for u in uops:
+      assert u.op is not Ops.WHERE
+      if u.op is Ops.LOAD: assert u.src[1].arg==5
+
+  def test_where_on_gated_load_folds_swapped_branches(self):
+    ridx0 = UOp.range(100, 0)
+    d0 = UOp(Ops.DEFINE_GLOBAL, dtypes.long.ptr(), (), 0)
+    ld = d0.index(ridx0, (ridx0<50).logical_not()).load()
+    w = (ridx0<50).where(5, ld)
+    uops = to_uops_list([w])
+    for u in uops:
+      assert u.op is not Ops.WHERE
+      if u.op is Ops.LOAD: assert u.src[1].arg==5
+
+  def test_where_in_store_becomes_gate(self):
+    ridx0 = UOp.range(100, 0)
+    d0 = UOp(Ops.DEFINE_GLOBAL, dtypes.long.ptr(), (), 0)
+    idx = d0.index(ridx0)
+    ld = idx.load()
+    val = (ridx0<50).where(5, ld)
+    st = idx.store(val, ridx0)
+    uops = to_uops_list([st])
+    for u in uops:
+      assert u.op is not Ops.WHERE
+      if u.op is Ops.STORE: assert u.src[1].arg==5
+
  def test_load_idx_becomes_int(self):
    d0 = UOp(Ops.DEFINE_GLOBAL, dtypes.long.ptr(), (), 0)
    d1 = UOp(Ops.DEFINE_GLOBAL, dtypes.long.ptr(), (), 1)
-    l0 = UOp(Ops.LOAD, dtypes.long, (d0.index(UOp.const(dtypes.int, 0)),))
+    l0 = UOp(Ops.LOAD, dtypes.long, (d0.index(UOp.const(dtypes.int, 0)),)).cast(dtypes.index)
    idx = l0 * 600
    valid = (l0<-1).ne(True)&(l0<3000)
-    l1 = UOp(Ops.LOAD, dtypes.long, (d1.index(idx, valid),))
+    l1 = UOp(Ops.LOAD, dtypes.long, (d1.index(idx.valid(valid)),))
    uops = to_uops_list([l1])
    for u in uops:
      if u.op is Ops.INDEX: self.assertEqual(u.src[1].dtype, dtypes.int)
@ -560,7 +592,7 @@ class TestUOpGraph(unittest.TestCase):
    glbl1 = UOp(Ops.DEFINE_GLOBAL, dtypes.int.ptr(), (), 1)
    glbl2 = UOp(Ops.DEFINE_GLOBAL, dtypes.int.ptr(), (), 2)
    idx = UOp.const(dtypes.int, 0)
-    ld0 = UOp(Ops.LOAD, dtypes.int, (glbl1.index(idx, UOp.const(dtypes.bool, False)),))
+    ld0 = UOp(Ops.LOAD, dtypes.int, (glbl1.index(UOp.invalid()),))
    ld1 = UOp(Ops.LOAD, dtypes.int, (glbl2.index(idx, UOp.const(dtypes.bool, True)),))
    uops = to_uops_list([UOp(Ops.STORE, dtypes.void, (glbl0.index(idx), ld1+ld0))])
    ld0 = uops[-1].src[-1]
@ -573,7 +605,7 @@ class TestUOpGraph(unittest.TestCase):
    lidx = UOp(Ops.SPECIAL, dtypes.int, (UOp.const(dtypes.int, 16),), "lidx0")
    st = UOp(Ops.STORE, dtypes.void, (smem.index(lidx), UOp.load(glbl0.index(lidx), dtype=dtypes.int)))
    barrier = UOp(Ops.BARRIER, dtypes.void, (st, ))
-    ld0 = UOp(Ops.LOAD, dtypes.int, (smem.index(lidx+1, UOp.const(dtypes.bool, False)), barrier))
+    ld0 = UOp(Ops.LOAD, dtypes.int, (smem.index(UOp.invalid()), barrier))
    ld1 = UOp(Ops.LOAD, dtypes.int, (smem.index(lidx+2, UOp.const(dtypes.bool, True)), barrier))
    uops = to_uops_list([UOp(Ops.STORE, dtypes.void, (glbl0.index(lidx), ld1+ld0))])

@ -586,7 +618,7 @@ class TestUOpGraph(unittest.TestCase):
    idx0 = UOp.const(dtypes.int, 0)
    idx1 = UOp.const(dtypes.int, 0)
    val = UOp.const(dtypes.int, 42)
-    st0 = glbl.index(idx0, UOp.const(dtypes.bool, False)).store(val)
+    st0 = glbl.index(UOp.invalid()).store(val)
    st1 = glbl.index(idx0, UOp.const(dtypes.bool, True)).store(val)
    uops = to_uops_list([st0, st1])
    # only the second store happens
--- a/test/test_uops.py
+++ b/test/test_uops.py
@ -15,6 +15,7 @@ from tinygrad.codegen import full_rewrite
 from tinygrad.uop.symbolic import sym
 from tinygrad.device import is_dtype_supported
 from tinygrad.codegen.opt import Opt, OptOps
+from tinygrad.renderer.ptx import PTXRenderer

 def to_uops_list(u:list[UOp], opts=None, skip_check=False) -> list[UOp]: return full_rewrite(UOp.sink(*u), opts)

@ -130,9 +131,9 @@ class TestFloatUOps(TestUOps):
 class TestNonFloatUOps(TestUOps):
  def test_add_int32(self): self._test_bop_fxn(Ops.ADD, lambda a,b: int(a)+int(b), (dtypes.int32, dtypes.int32))
  def test_mul_int32(self): self._test_bop_fxn(Ops.MUL, lambda a,b: int(a)*int(b), (dtypes.int32, dtypes.int32))
-  @unittest.skipUnless(getenv("PTX"), "only ptx uses bitshifts")
+  @unittest.skipUnless(isinstance(Device[Device.DEFAULT].renderer, PTXRenderer), "only ptx uses bitshifts")
  def test_shr_int32(self): self._test_bop_fxn(Ops.SHR, lambda a,b: int(a)>>int(b), (dtypes.int32, dtypes.int32), no_b_neg=True)
-  @unittest.skipUnless(getenv("PTX"), "only ptx uses bitshifts")
+  @unittest.skipUnless(isinstance(Device[Device.DEFAULT].renderer, PTXRenderer), "only ptx uses bitshifts")
  def test_shl_int32(self): self._test_bop_fxn(Ops.SHL, lambda a,b: int(a)<<int(b), (dtypes.int32, dtypes.int32), no_b_neg=True)
  def test_div_int32(self):
    self._test_bop_fxn(Ops.IDIV, lambda a,b: int(a/b), (dtypes.int32, dtypes.int32), no_b_zero=True)
@ -370,7 +371,7 @@ class TestLocalAccess(unittest.TestCase):
    sres = uop(uops, Ops.LOAD, dtypes.int32, (smem.index(ofs),))
    self.assertEqual(_test_uops_result(dtypes.int32, uops, sres), 42)

-@unittest.skipUnless(getenv("PTX"), "This only tests assembly backends")
+@unittest.skipUnless(isinstance(Device[Device.DEFAULT].renderer, PTXRenderer), "This only tests assembly backends")
 class TestAssembly(unittest.TestCase):
  def test_bitshift_left(self):
    g1 = UOp(Ops.DEFINE_GLOBAL, dtypes.int32.ptr(), (), 0)
@ -512,7 +513,7 @@ class TestUOpStr(unittest.TestCase):
    assert str(eval(str(vec))) == str(vec)

  def test_device_arg(self):
-    device = UOp(Ops.DEVICE, arg="GPU")
+    device = UOp(Ops.DEVICE, arg="CL")
    assert str(eval(str(device))) == str(device)

  def test_reduceop_arg(self):
--- a/test/test_uops_stats.py
+++ b/test/test_uops_stats.py
@ -8,6 +8,7 @@ from tinygrad.uop.ops import Ops, UOp
 from tinygrad.dtype import dtypes
 from tinygrad.codegen.opt import Opt, OptOps, KernelOptError
 from tinygrad.device import Device
+from tinygrad.renderer.ptx import PTXRenderer

 def flops_mem(uops, ignore_indexing=False):
  est = Estimates.from_uops(uops, ignore_indexing)
@ -158,7 +159,7 @@ class TestUOpsStats(unittest.TestCase):
    self.assertEqual(flops_mem(uops), flops_mem(uops_fma))

 N = 64
-@unittest.skipIf(getenv("PTX"), "wrong in PTX") # maybe?
+@unittest.skipIf(isinstance(Device[Device.DEFAULT].renderer, PTXRenderer), "wrong in PTX") # maybe?
 class TestStatsOptimized(unittest.TestCase):
  @classmethod
  def setUpClass(cls):
--- a/test/unit/test_device.py
+++ b/test/unit/test_device.py
@ -2,19 +2,19 @@
 import unittest, os, subprocess, sys
 from tinygrad import Tensor
 from tinygrad.device import Device, Compiler
-from tinygrad.helpers import diskcache_get, diskcache_put, getenv, Context
+from tinygrad.helpers import diskcache_get, diskcache_put, getenv, Context, WIN, CI

 class TestDevice(unittest.TestCase):
  def test_canonicalize(self):
    self.assertEqual(Device.canonicalize(None), Device.DEFAULT)
    self.assertEqual(Device.canonicalize("CPU"), "CPU")
    self.assertEqual(Device.canonicalize("cpu"), "CPU")
-    self.assertEqual(Device.canonicalize("GPU"), "GPU")
-    self.assertEqual(Device.canonicalize("GPU:0"), "GPU")
-    self.assertEqual(Device.canonicalize("gpu:0"), "GPU")
-    self.assertEqual(Device.canonicalize("GPU:1"), "GPU:1")
-    self.assertEqual(Device.canonicalize("gpu:1"), "GPU:1")
-    self.assertEqual(Device.canonicalize("GPU:2"), "GPU:2")
+    self.assertEqual(Device.canonicalize("CL"), "CL")
+    self.assertEqual(Device.canonicalize("CL:0"), "CL")
+    self.assertEqual(Device.canonicalize("cl:0"), "CL")
+    self.assertEqual(Device.canonicalize("CL:1"), "CL:1")
+    self.assertEqual(Device.canonicalize("cl:1"), "CL:1")
+    self.assertEqual(Device.canonicalize("CL:2"), "CL:2")
    self.assertEqual(Device.canonicalize("disk:/dev/shm/test"), "DISK:/dev/shm/test")
    self.assertEqual(Device.canonicalize("disk:000.txt"), "DISK:000.txt")

@ -28,6 +28,51 @@ class TestDevice(unittest.TestCase):
    self.assertEqual(Device.canonicalize(None), device)
    Device.DEFAULT = device

+  @unittest.skipIf(WIN and CI, "skipping windows test") # TODO: subproccess causes memory violation?
+  def test_env_overwrite_default_compiler(self):
+    expect_failure = "\ntry: assert Device[Device.DEFAULT].compiler is None;\nexcept RuntimeError: pass"
+
+    if Device.DEFAULT == "CPU":
+      from tinygrad.runtime.support.compiler_cpu import CPULLVMCompiler, ClangJITCompiler
+      try: _, _ = CPULLVMCompiler(), ClangJITCompiler()
+      except Exception as e: self.skipTest(f"skipping compiler test: not all compilers: {e}")
+
+      imports = "from tinygrad import Device; from tinygrad.runtime.support.compiler_cpu import CPULLVMCompiler, ClangJITCompiler"
+      subprocess.run([f'python3 -c "{imports}; assert isinstance(Device[Device.DEFAULT].compiler, CPULLVMCompiler)"'],
+                        shell=True, check=True, env={**os.environ, "DEV": "CPU", "CPU_LLVM": "1"})
+      subprocess.run([f'python3 -c "{imports}; assert isinstance(Device[Device.DEFAULT].compiler, ClangJITCompiler)"'],
+                        shell=True, check=True, env={**os.environ, "DEV": "CPU", "CPU_LLVM": "0"})
+      subprocess.run([f'python3 -c "{imports}; {expect_failure}"'],
+                        shell=True, check=True, env={**os.environ, "DEV": "CPU", "CPU_CLANGJIT": "0", "CPU_LLVM": "0"})
+      subprocess.run([f'python3 -c "{imports}; assert isinstance(Device[Device.DEFAULT].compiler, CPULLVMCompiler)"'],
+                        shell=True, check=True, env={**os.environ, "DEV": "CPU", "CPU_CLANGJIT": "0"})
+      subprocess.run([f'python3 -c "{imports}; {expect_failure}"'],
+                        shell=True, check=True, env={**os.environ, "DEV": "CPU", "CPU_CLANGJIT": "1", "CPU_LLVM": "1"})
+    elif Device.DEFAULT == "AMD":
+      from tinygrad.runtime.support.compiler_amd import HIPCompiler, AMDLLVMCompiler
+      try: _, _ = HIPCompiler(Device[Device.DEFAULT].arch), AMDLLVMCompiler(Device[Device.DEFAULT].arch)
+      except Exception as e: self.skipTest(f"skipping compiler test: not all compilers: {e}")
+
+      imports = "from tinygrad import Device; from tinygrad.runtime.support.compiler_amd import HIPCompiler, AMDLLVMCompiler"
+      subprocess.run([f'python3 -c "{imports}; assert isinstance(Device[Device.DEFAULT].compiler, AMDLLVMCompiler)"'],
+                        shell=True, check=True, env={**os.environ, "DEV": "AMD", "AMD_LLVM": "1"})
+      subprocess.run([f'python3 -c "{imports}; assert isinstance(Device[Device.DEFAULT].compiler, HIPCompiler)"'],
+                        shell=True, check=True, env={**os.environ, "DEV": "AMD", "AMD_LLVM": "0"})
+      subprocess.run([f'python3 -c "{imports}; assert isinstance(Device[Device.DEFAULT].compiler, HIPCompiler)"'],
+                        shell=True, check=True, env={**os.environ, "DEV": "AMD", "AMD_HIP": "1"})
+      subprocess.run([f'python3 -c "{imports}; {expect_failure}"'],
+                        shell=True, check=True, env={**os.environ, "DEV": "AMD", "AMD_HIP": "1", "AMD_LLVM": "1"})
+    else: self.skipTest("only run on CPU/AMD")
+
+  def test_compiler_envvar(self):
+    d = Device[Device.DEFAULT]
+    dname = Device.DEFAULT.split(':')[0].upper()
+    assert d._get_compiler_envvar(type("Compiler", (), {})) == f"{dname}_COMPILER"
+    assert d._get_compiler_envvar(type("LLVMCompiler", (), {})) == f"{dname}_LLVM"
+    assert d._get_compiler_envvar(type("RandomCompiler", (), {})) == f"{dname}_RANDOM"
+    assert d._get_compiler_envvar(type(f"{dname}Compiler", (), {})) == f"{dname}_{dname}COMPILER" # do not repeat device name alone
+    assert d._get_compiler_envvar(type(f"{dname}LLVMCompiler", (), {})) == f"{dname}_LLVM" # do not repeat device name
+
 class MockCompiler(Compiler):
  def __init__(self, key): super().__init__(key)
  def compile(self, src) -> bytes: return src.encode()
@ -56,7 +101,7 @@ class TestCompiler(unittest.TestCase):
 class TestRunAsModule(unittest.TestCase):
  def test_module_runs(self):
    p = subprocess.run([sys.executable, "-m", "tinygrad.device"],stdout=subprocess.PIPE, stderr=subprocess.PIPE,
-      env={**os.environ, "DEBUG": "1"}, timeout=10,)
+      env={**os.environ, "DEBUG": "1"}, timeout=30,)
    out = (p.stdout + p.stderr).decode()
    self.assertEqual(p.returncode, 0, msg=out)
    self.assertIn("CPU", out) # for sanity check
--- a/test/unit/test_disk_tensor.py
+++ b/test/unit/test_disk_tensor.py
@ -307,7 +307,7 @@ class TestDiskTensor(unittest.TestCase):
    ret = t.bitcast(dtypes.uint16).to("CPU") + 1
    assert ret.tolist() == [2827, 3341, 3855, 4369]

-  @unittest.skipIf(OSX, "new LLVM has an issue on OSX")
+  @unittest.skipIf(OSX or Device.DEFAULT == "CL", "new LLVM has an issue on OSX, CL=1 gives the wrong output")
  def test_bf16_disk_write_read(self):
    t = Tensor([10000, -1, -1000, -10000, 20], dtype=dtypes.float32)
    t.to(f"disk:{temp('dt_bf16_disk_write_read_f32')}").realize()
--- a/test/unit/test_dtype_spec.py
+++ b/test/unit/test_dtype_spec.py
@ -1,12 +1,11 @@
 import unittest, math, operator, subprocess, struct
 from tinygrad.tensor import Tensor, dtypes, Device
-from tinygrad.dtype import DType, DTYPES_DICT, truncate, truncate_fp16, float_to_bf16, _to_np_dtype, least_upper_dtype, least_upper_float
+from tinygrad.dtype import DType, DTYPES_DICT, truncate, float_to_fp16, float_to_bf16, _to_np_dtype, least_upper_dtype, least_upper_float
 from tinygrad.device import is_dtype_supported
 from tinygrad.helpers import getenv, CI, DEBUG
 from hypothesis import given, settings, strategies as strat
 import numpy as np
 import torch
-import ml_dtypes

 settings.register_profile("my_profile", max_examples=200, deadline=None, derandomize=getenv("DERANDOMIZE_CI", False))
 settings.load_profile("my_profile")
@ -22,7 +21,9 @@ def _assert_eq(tensor:Tensor, target_dtype:DType, target, tol_target_dtype:float
  if DEBUG >= 2: print(tensor.numpy())
  try:
    assert tensor.dtype == target_dtype
-    np.testing.assert_allclose(tensor.numpy(), target, rtol={dtypes.float16:1e-3, dtypes.bfloat16:1e-2}.get(target_dtype, tol_target_dtype))
+    np.testing.assert_allclose(tensor.numpy(), target, rtol={dtypes.float16:1e-3, dtypes.bfloat16:1e-2,
+      dtypes.fp8e4m3:1e-1, dtypes.fp8e5m2:5e-1}.get(target_dtype, tol_target_dtype))
+
  except AssertionError as e:
    raise AssertionError(f"\ntensor {tensor.numpy()} dtype {tensor.dtype} does not match target {target} with dtype {target_dtype}") from e

@ -105,16 +106,16 @@ class TestHelpers(unittest.TestCase):
      self.assertEqual(dt.min, dt.vec(4).min)
      self.assertEqual(dt.max, dt.vec(4).max)

-  def test_truncate_fp16(self):
-    self.assertEqual(truncate_fp16(1), 1)
-    self.assertEqual(truncate_fp16(65504), 65504)
-    self.assertEqual(truncate_fp16(65519.999), 65504)
-    self.assertEqual(truncate_fp16(65520), math.inf)
-    self.assertEqual(truncate_fp16(1e-8), 0.0)
-    self.assertEqual(truncate_fp16(-65504), -65504)
-    self.assertEqual(truncate_fp16(-65519.999), -65504)
-    self.assertEqual(truncate_fp16(-65520), -math.inf)
-    self.assertTrue(math.isnan(truncate_fp16(math.nan)))
+  def test_float_to_fp16(self):
+    self.assertEqual(float_to_fp16(1), 1)
+    self.assertEqual(float_to_fp16(65504), 65504)
+    self.assertEqual(float_to_fp16(65519.999), 65504)
+    self.assertEqual(float_to_fp16(65520), math.inf)
+    self.assertEqual(float_to_fp16(1e-8), 0.0)
+    self.assertEqual(float_to_fp16(-65504), -65504)
+    self.assertEqual(float_to_fp16(-65519.999), -65504)
+    self.assertEqual(float_to_fp16(-65520), -math.inf)
+    self.assertTrue(math.isnan(float_to_fp16(math.nan)))

  def test_float_to_bf16(self):
    # TODO: fuzz this better
@ -190,7 +191,7 @@ class TestHelpers(unittest.TestCase):
    elif math.isinf(x): np.testing.assert_equal(truncate[dtypes.fp8e4m3](x), math.copysign(math.nan, x))
    elif x > FP8E4M3_MAX: np.testing.assert_equal(truncate[dtypes.fp8e4m3](x), FP8E4M3_MAX)
    elif x < -FP8E4M3_MAX: np.testing.assert_equal(truncate[dtypes.fp8e4m3](x), -FP8E4M3_MAX)
-    else: np.testing.assert_equal(truncate[dtypes.fp8e4m3](x), ml_dtypes.float8_e4m3fn(x))
+    else: np.testing.assert_equal(truncate[dtypes.fp8e4m3](x), torch.tensor(x, dtype=torch.float8_e4m3fn).float().item())

  @given(strat.floats(width=32, allow_subnormal=True, allow_nan=True, allow_infinity=True))
  def test_truncate_fp8e5m2(self, x):
@ -198,7 +199,7 @@ class TestHelpers(unittest.TestCase):
    elif math.isinf(x): np.testing.assert_equal(truncate[dtypes.fp8e5m2](x), x)
    elif x > FP8E5M2_MAX: np.testing.assert_equal(truncate[dtypes.fp8e5m2](x), FP8E5M2_MAX)
    elif x < -FP8E5M2_MAX: np.testing.assert_equal(truncate[dtypes.fp8e5m2](x), -FP8E5M2_MAX)
-    else: np.testing.assert_equal(truncate[dtypes.fp8e5m2](x), ml_dtypes.float8_e5m2(x))
+    else: np.testing.assert_equal(truncate[dtypes.fp8e5m2](x), torch.tensor(x, dtype=torch.float8_e5m2).float().item())

 class TestTypeSpec(unittest.TestCase):
  def setUp(self):
@ -378,7 +379,7 @@ class TestTypePromotion(unittest.TestCase):
    assert least_upper_dtype(dtypes.int32, dtypes.uint32) == dtypes.int64
    assert least_upper_dtype(dtypes.uint32, dtypes.int64) == dtypes.int64
    # similar to jax but we don't use weak type
-    assert least_upper_dtype(dtypes.int64, dtypes.uint64) == dtypes.float16
+    assert least_upper_dtype(dtypes.int64, dtypes.uint64) == dtypes.fp8e4m3
    assert least_upper_dtype(dtypes.float16, dtypes.float32) == dtypes.float32
    assert least_upper_dtype(dtypes.float32, dtypes.float64) == dtypes.float64

@ -387,6 +388,14 @@ class TestTypePromotion(unittest.TestCase):
    assert least_upper_dtype(dtypes.float16, dtypes.int64) == dtypes.float16
    assert least_upper_dtype(dtypes.float16, dtypes.uint64) == dtypes.float16
    assert least_upper_dtype(dtypes.fp8e4m3, dtypes.fp8e5m2) == dtypes.half
+    assert least_upper_dtype(dtypes.fp8e4m3, dtypes.bfloat16) == dtypes.bfloat16
+    assert least_upper_dtype(dtypes.fp8e5m2, dtypes.bfloat16) == dtypes.bfloat16
+    assert least_upper_dtype(dtypes.fp8e4m3, dtypes.float16) == dtypes.float16
+    assert least_upper_dtype(dtypes.fp8e5m2, dtypes.float16) == dtypes.float16
+    assert least_upper_dtype(dtypes.fp8e4m3, dtypes.int64) == dtypes.fp8e4m3
+    assert least_upper_dtype(dtypes.fp8e4m3, dtypes.uint64) == dtypes.fp8e4m3
+    assert least_upper_dtype(dtypes.fp8e5m2, dtypes.int64) == dtypes.fp8e5m2
+    assert least_upper_dtype(dtypes.fp8e5m2, dtypes.uint64) == dtypes.fp8e5m2

 class TestAutoCastType(unittest.TestCase):
  def setUp(self):
@ -569,10 +578,10 @@ class TestAutoCastType(unittest.TestCase):
  def test_gradient_dtype(self):
    old_default_float = dtypes.default_float

-    for default_dtype in [dtypes.float16, dtypes.bfloat16, dtypes.float32, dtypes.float64]:
+    for default_dtype in dtypes.floats:
      if not is_dtype_supported(default_dtype): continue
      dtypes.default_float = default_dtype
-      for dtype in [dtypes.float16, dtypes.bfloat16, dtypes.float32, dtypes.float64]:
+      for dtype in  dtypes.floats:
        if not is_dtype_supported(dtype): continue
        if DEBUG >= 2:
          print(f"testing {default_dtype=}, {dtype=}")
--- a/test/unit/test_gguf.py
+++ b/test/unit/test_gguf.py
@ -67,7 +67,7 @@ class TestGGUF(unittest.TestCase):
      return np.array([E] + packed, dtype=np.uint8)

    def decode(code, E):
-      sign = -1.0 if code * 0b1000 else 1.0
+      sign = -1.0 if (code & 0b1000) else 1.0
      exp = (code >> 1) & 0b11
      mant = code & 0b1
      val = (1.0 + 0.5 * mant) * np.exp2(exp - 1) if exp else 0.5 * mant
@ -83,7 +83,8 @@ class TestGGUF(unittest.TestCase):
      expected.extend(decode(c, E) for c in codes)
    tensor = Tensor(np.concatenate(blocks))
    out = ggml_data_to_tensor(tensor, len(expected), MXFP4)
-    self.assertListEqual(out.numpy().tolist(), np.array(expected, dtype=np.float32).tolist())
+    # TODO: should this be exact equal? somehow failed on CI
+    np.testing.assert_allclose(out.numpy(), expected, atol=0.0, rtol=1e-6)

  def test_expected_failure_unknown_type(self):
    with self.assertRaises(ValueError):
--- a/test/unit/test_indexing.py
+++ b/test/unit/test_indexing.py
@ -181,7 +181,7 @@ class TestIndexing(unittest.TestCase):
    # self.assertRaises(TypeError, delitem)

  # TODO: LLVM is quite fast, why are other compiled backends slow?
-  @unittest.skipIf(CI and Device.DEFAULT in ["CPU", "GPU", "METAL", "NV", "AMD"], "slow")
+  @unittest.skipIf(CI and Device.DEFAULT in ["CPU", "CL", "METAL", "NV", "AMD"], "slow")
  def test_advancedindex(self):
    # integer array indexing

--- a/test/unit/test_linearizer_rewrite.py
+++ b/test/unit/test_linearizer_rewrite.py
@ -23,7 +23,6 @@ class TestLinearizerRewrite(unittest.TestCase):
      si = out.schedule()[-1]
      opts_to_apply = []
      opts_to_apply.append(Opt(OptOps.UPCAST, 0, 4))
-      opts_to_apply.append(Opt(OptOps.UNROLL, 0, 4))
      ast = si.ast.replace(arg=KernelInfo(opts_to_apply=tuple(opts_to_apply)))
      prg = get_program(ast, Device["CPU"].renderer)
      print(prg.src)
--- a/test/unit/test_shapetracker.py
+++ b/test/unit/test_shapetracker.py
@ -1,7 +1,7 @@
 #!/usr/bin/env python
 import unittest
 import numpy as np
-from tinygrad.dtype import dtypes
+from tinygrad.dtype import dtypes, Invalid
 from tinygrad.helpers import prod
 from tinygrad.shape.shapetracker import ShapeTracker, View
 from tinygrad import Variable
@ -10,7 +10,8 @@ from tinygrad.codegen.late.devectorizer import sym
 from itertools import product

 def shapetracker_getitem(st:ShapeTracker, val:int):
-  idx, valid = st.reshape((st.size,)).to_indexed_uops([UOp.const(dtypes.int, val)])
+  valid_idx = st.reshape((st.size,)).to_valid_uop([UOp.const(dtypes.int, val)])
+  idx, valid = valid_idx.get_idx(), valid_idx.get_valid()
  idx, valid = graph_rewrite(idx, sym), graph_rewrite(valid, sym)
  assert idx.op is Ops.CONST and valid.op is Ops.CONST
  return idx.arg, valid.arg
@ -68,7 +69,7 @@ class CheckingShapeTracker:
  def contiguous(self): return self.st.contiguous

  def assert_same(self):
-    x = [(v[0] if (v:=shapetracker_getitem(self.st, i))[1] else -1) for i in range(prod(self.st.shape))]
+    x = [(v[0] if (v:=shapetracker_getitem(self.st, i))[1] and v[0] is not Invalid else -1) for i in range(prod(self.st.shape))]
    y = [self[i] for i in range(prod(self.shape))]
    assert self.st.shape == self.shape
    assert x == y, f"mismatch shapetracker:{x} real:{y}"
@ -154,7 +155,7 @@ class TestRealStrides(unittest.TestCase):
      View.create((1, 3, 22, 21), (0, 192, 16, 1), 0, ((0, 1), (0, 3), (0, 12), (0, 16))),
      View.create((3, 11, 7, 2, 3), (462, 21, 1, 231, 7), 0, None),
    ))
-    self.assertEqual(st.real_strides(), (132, None, None, None, None))
+    self.assertEqual(st.real_strides(), (132, 12, None, None, None))

 class TestRealSimplifies(unittest.TestCase):
  def tearDown(self):
@ -619,20 +620,6 @@ class TestMaskedShapeTracker(unittest.TestCase):
    st3.reshape((4, 3, 6, 5))
    st3.assert_same()

-  def test_axis_is_masked(self):
-    st = ShapeTracker.from_shape((100, 100, 100, 100)).pad(((0,1),(0,0),(2,0), (0,0)))
-    assert st.axis_is_masked(0)
-    assert not st.axis_is_masked(1)
-    assert st.axis_is_masked(2)
-    assert not st.axis_is_masked(3)
-
-  def test_axis_is_masked_rw1(self):
-    st = ShapeTracker(views=(View(shape=(1, 2, 1, 4, 4, 13, 4, 13), strides=(0, 324, 0, 81, 0, 9, 0, 1), offset=-20,
-                                  mask=((0, 1), (0, 2), (0, 1), (0, 4), (0, 4), (2, 11), (0, 4), (2, 11)), contiguous=False),
-                             View(shape=(2, 4, 11, 11, 4, 3, 3), strides=(10816, 0, 52, 1, 2704, 728, 14), offset=0,
-                                  mask=None, contiguous=False)))
-    assert not st.axis_is_masked(0)
-
 class TestShapeTracker(unittest.TestCase):
  def setUp(self):
    self.st = CheckingShapeTracker((7,4))
@ -830,12 +817,14 @@ class TestShapeTrackerSize(unittest.TestCase):
 class TestRender(unittest.TestCase):
  def test_render(self):
    st = ShapeTracker.from_shape((2, 3))
-    idx, valid = st.to_indexed_uops()
+    valid_idx = st.to_valid_uop()
+    idx, valid = valid_idx.get_idx(), valid_idx.get_valid()
    self.assertEqual(idx.render(), "((ridx0*3)+ridx1)")
    self.assertEqual(valid.render(), "True")

    st = st.pad(((0, 1), (0, 0)))
-    idx, valid = st.to_indexed_uops()
+    valid_idx = st.to_valid_uop()
+    idx, valid = valid_idx.get_idx(), valid_idx.get_valid()
    self.assertEqual(idx.render(), "((ridx0*3)+ridx1)")
    self.assertEqual(valid.render(), "(ridx0<2)")

--- a/test/unit/test_simplify_valid_idx.py
+++ b/test/unit/test_simplify_valid_idx.py
@ -8,13 +8,13 @@ from tinygrad.helpers import Context

 def get_gated_load_uop(valid:UOp, idx:UOp):
  return UOp(Ops.LOAD, dtypes.float, (
-    UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=0).index(idx, valid),
+    UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=0).index(idx.valid(valid)),
    UOp.const(dtypes.float, 0.0)
  ))

 def get_load_image_uop(image_shape:tuple[int, ...], valid:UOp, idx:tuple[UOp, UOp]):
  return UOp(Ops.LOAD, dtypes.float.vec(4), (
-    UOp(Ops.DEFINE_GLOBAL, dtypes.imagef(image_shape), arg=0).index(UOp(Ops.VECTORIZE, dtypes.int.vec(2), idx), valid),
+    UOp(Ops.DEFINE_GLOBAL, dtypes.imagef(image_shape), arg=0).index(UOp(Ops.VECTORIZE, dtypes.index.vec(2), idx).valid(valid)),
    UOp(Ops.VECTORIZE, dtypes.float.vec(4), src=(UOp.const(dtypes.float, 0.0),) * 4)
  ))

@ -359,7 +359,7 @@ class TestImageSimplification(unittest.TestCase):
    self.check(load, None, "((gidx*3)+-1438)", "0")

  def test_simplify2(self):
-    # from GPU=1 DEBUG=4 FORWARD_ONLY=1 IMAGE=2 python3 test/test_ops.py TestOps.test_simple_padding_conv2d
+    # from CL=1 DEBUG=4 FORWARD_ONLY=1 IMAGE=2 python3 test/test_ops.py TestOps.test_simple_padding_conv2d
    lidx = Special("lidx", 4)
    valid = (lidx<3) & (lidx<1).ne(True)
    idx = ((lidx+1)%2, (lidx+1)//2-1)
--- a/test/unit/test_symbolic_shapetracker.py
+++ b/test/unit/test_symbolic_shapetracker.py
@ -13,7 +13,6 @@ class TestSymbolic(unittest.TestCase):
    assert st.shape == (x, 3)
    assert st.real_strides() == (3, 1)

-  @unittest.expectedFailure
  def test_real_strides_0(self):
    st = ShapeTracker(views=(View(shape=(2, (Variable('start_pos', 1, 8)+1), 1, 1), strides=(8, 1, 0, 0), offset=0, mask=((0, 2), (0, Variable('start_pos', 1, 8)), (0, 1), (0, 1)), contiguous=False), View(shape=(2, (Variable('start_pos', 1, 8)+1)), strides=((Variable('start_pos', 1, 8)+1), 1), offset=0, mask=None, contiguous=True)))   # noqa: E501
    self.assertEqual(st.real_strides(), (8, None))
@ -198,7 +197,7 @@ class TestSymbolicPad(unittest.TestCase):
  def test_pad(self):
    v = Variable("v", 1, 100).bind(5)
    t = Tensor.ones(100)[:v].pad(((4, 0),))
-    t = t.reshape(9)
+    t = t[:9]
    assert t.tolist() == [0,0,0,0,1,1,1,1,1]


--- a/test/unit/test_tensor_uop_representation.py
+++ b/test/unit/test_tensor_uop_representation.py
@ -34,7 +34,7 @@ class TestTensorMutates(unittest.TestCase):
    is_pattern_uop(c.uop.base, realized_pattern)
    # NOTE: we keep movement ops on top of the buffer view
    is_pattern_uop(c.uop, UPat(Ops.BUFFER))
-    is_pattern_uop(d.uop, UPat(Ops.VIEW, src=(realized_pattern,)))
+    assert d.uop is not d.uop.base

  def test_reshape_is_same_child(self):
    a = Tensor([1,2,3])
@ -58,40 +58,6 @@ class TestTensorUopRepresentation(unittest.TestCase):
    print(c.uop)
    is_pattern(c, UPat(Ops.ADD, src=(realized_pattern, realized_pattern)))

-  def test_const_pattern(self):
-    a = Tensor(1)
-    print(a.uop)
-    is_pattern(a, const_pattern) # const in tensor has a DEVICE and VIEW src
-    is_pattern(a, UPat.cvar("x")) # even cvar works!
-
-  def test_consts_do_not_realize(self):
-    a = Tensor(1)
-    print(a.uop)
-    pre_realize = a.uop
-    a.realize()
-    assert a.uop is pre_realize
-
-  def test_viewed_consts_do_not_realize(self):
-    a = Tensor.ones(10, 10)
-    print(a.uop)
-    a.realize()
-    is_pattern(a, const_pattern)
-    self.assertEqual(a.uop.shape, (10, 10))
-
-  # CONST is EXPAND -> RESHAPE -> CONST -> DEVICE
-  def test_consts_dont_have_buffers(self):
-    a = Tensor.ones(10, 10)
-    buffers_in_parents = [x.op for x in a.uop.toposort() if x.op is Ops.BUFFER]
-    self.assertEqual(len(buffers_in_parents), 0)
-    is_pattern(a, UPat(Ops.EXPAND, src=(UPat(Ops.RESHAPE, src=(const_pattern,)),)))
-
-  # COPY has a copyin source and a device.
-  def test_copyin(self):
-    a = Tensor([1.,2,3]).realize()
-    c = a.to("TEST")   # NOTE: this isn't checked
-    print(c.uop)
-    is_pattern(c, UPat(Ops.COPY, src=(realized_pattern, UPat(Ops.DEVICE)), arg=None))
-
  def test_empty_buf(self):
    a = Tensor.empty(3, 3)
    is_pattern(a, UPat(Ops.RESHAPE, src=(UPat(Ops.BUFFER),)))
--- a/test/unit/test_uop_symbolic.py
+++ b/test/unit/test_uop_symbolic.py
@ -2,7 +2,7 @@
 import unittest, pickle, functools, math
 import z3

-from tinygrad.dtype import dtypes, ConstType, DType
+from tinygrad.dtype import dtypes, ConstType, DType, Invalid
 from tinygrad.codegen import full_rewrite
 from tinygrad.helpers import Context
 from tinygrad.uop.ops import UOp, Ops, graph_rewrite, sym_infer, track_rewrites
@ -93,6 +93,37 @@ class TestSymbolic(unittest.TestCase):
    assert idx1+idx2 is not idx2
    assert idx1*idx2 is not idx2*idx1

+  def test_uop_gcd_method(self):
+    a = Variable("a", 0, 8)
+    b = Variable("b", 0, 8)
+    self.assertEqual(UOp.gcd(a, a*b, a*3).simplify(), a)
+    self.assertEqual(UOp.gcd(a*a*a, a*b*a, a*3*a).simplify(), a*a)
+    self.assertEqual(UOp.gcd(a*a*10, b*a*5, a*a*5).simplify(), a*5)
+    self.assertEqual(UOp.gcd(a*10, b*5, a*5).simplify(), a.const_like(5))
+    self.assertEqual(UOp.gcd(a, b*5, a*5).simplify(), a.const_like(1))
+
+  def test_divides_exact(self):
+    a = Variable("a", 1, 8)
+    b = Variable("b", 1, 8)
+    self.assertEqual((a*a*3).divide_exact(a).simplify(), a*3)
+    self.assertEqual((a*a*3).divide_exact(a*a*3).simplify(), a.const_like(1))
+    self.assertEqual((a*b*3).divide_exact(a.const_like(3)).simplify(), a*b)
+    self.assertEqual((a*a*3).divide_exact(a*a.const_like(-3)).simplify(), a*-1)
+    self.assertEqual((a*a*b*3).divide_exact(a*b).simplify(), a*3)
+    self.assertEqual((a*3+a*b).divide_exact(a).simplify(), b+3)
+    self.assertEqual((a*b*3+a*b*b).divide_exact(a*b).simplify(), b+3)
+    self.assertEqual((((a*-2)+14)*b).divide_exact(((a*-2)+14)).simplify(), b)
+
+  def test_divide_exact_not(self):
+    a = Variable("a", 1, 8)
+    b = Variable("b", 1, 8)
+    x = Variable("x", -20, 0)
+    self.assertEqual((a).divide_exact(b), None)
+    self.assertEqual((a+2).divide_exact(a), None)
+    self.assertEqual((x*-1).divide_exact(a), None)
+    self.assertEqual((a*5).divide_exact(a*10), None)
+    self.assertEqual((a*10-1).divide_exact(a*10), None)
+
  def test_factorize(self):
    a = Variable("a", 0, 8)
    b = Variable("b", 0, 8)
@ -110,7 +141,7 @@ class TestSymbolic(unittest.TestCase):
    self.helper_test_variable(-Variable("a", 0, 8), -8, 0, "(a*-1)")

  def test_xor_0(self):
-    self.helper_test_variable(Variable("a", 0, 8, dtypes.int) ^ 0, 0, 8, "a")
+    self.helper_test_variable(Variable("a", 0, 8, dtypes.int) ^ 0, 0, 8, "a", test_z3=False)

  def test_add_1(self):
    self.helper_test_variable(Variable("a", 0, 8)+1, 1, 9, "(a+1)")
@ -374,6 +405,16 @@ class TestSymbolic(unittest.TestCase):
  def test_and_remove(self):
    self.helper_test_variable(uand([uconst(1), Variable("a", 0, 1)]), 0, 1, "a")

+  def test_bool_or_not_tautology(self):
+    a = Variable("a", 0, 10)
+    c = a<10
+    self.helper_test_variable(c | c.logical_not(), True, True, "True")
+
+  def test_bool_and_not_contradiction(self):
+    a = Variable("a", 0, 10)
+    c = a<10
+    self.helper_test_variable(c & c.logical_not(), False, False, "False")
+
  def test_mod_factor_negative(self):
    self.helper_test_variable(usum([uconst(-29), Variable("a", 0, 10), Variable("b", 0, 10)*28]) % 28, -27, 27, "(((a+(b*28))+-29)%28)")
    self.helper_test_variable(usum([uconst(-29), Variable("a", 0, 100), Variable("b", 0, 10)*28]) % 28, -27, 27, "(((a+(b*28))+-29)%28)")
@ -440,6 +481,33 @@ class TestSymbolic(unittest.TestCase):
  def test_mul_div_factor_div_neg(self):
    self.helper_test_variable((Variable("a", 0, 10)*-4+4)//8, -4, 0, "(((a*-1)+1)//2)")

+  def test_div_symbolic_const_gcd(self):
+    a = Variable("a", -10, 10)
+    b = Variable("b", -10, 10)
+    d = Variable("d", 1, 10)
+    self.helper_test_variable((3*a+9*b)//(3*d), -40, 40, "((a+(b*3))//d)")
+
+  def test_symbolic_gcd_div(self):
+    a = Variable("a", -10, 10)
+    b = Variable("b", -10, 10)
+    c = Variable("c", -10, 10)
+    d1 = Variable("d1", 1, 10)
+    d2 = Variable("d2", -10, -1)
+    self.helper_test_variable((d1*a*b*d1)//(d1), -1000, 1000, "(a*(b*d1))")
+    self.helper_test_variable((d1*a*d2*b*d1)//(d1*d2),  -1000, 1000, "(a*(b*d1))")
+    self.helper_test_variable((d1*a + b*d1)//(d1), -20, 20, "(a+b)")
+    self.helper_test_variable((d1*a + b*d1 + c*d1)//(d1), -30, 30, "(c+(a+b))")
+    self.helper_test_variable((3*a*d1 + 9*b*d1)//(3*d1*d2), -40, 40, "(((a+(b*3))//(d2*-1))*-1)")
+    self.helper_test_variable((3*a*d1 + 9*b*d1+3)//(3*d1*d2), -401, 399, "(((((a*d1)+((b*d1)*3))+1)//((d1*d2)*-1))*-1)")
+
+  def test_symbolic_factor_remainder_div(self):
+    a = Variable("a", 0, 10)
+    b = Variable("b", 0, 10)
+    d = Variable("d", 1, 10)
+    self.helper_test_variable((d*a+b)//d, 0, 20, "(a+(b//d))")
+    self.helper_test_variable((d*a*20+b)//(5*d), 0, 42, "((a*4)+(b//(d*5)))")
+    self.helper_test_variable((d*a*20+b*d*5+10)//(5*d), 0, 52, "((b+(a*4))+(2//d))")
+
  def test_mod_gcd_factor_neg(self):
    self.helper_test_variable((Variable("a", 0, 10)*-4+4)%8, -4, 4, "((((a*-1)+1)%2)*4)")

@ -510,6 +578,13 @@ class TestSymbolic(unittest.TestCase):
    self.helper_test_variable((gidx0*4+lidx2*2+lidx3)//12, 0, 4, ("(((lidx2//2)+gidx0)//3)", "((gidx0+(lidx2//2))//3)"))
    self.helper_test_variable((lidx2*2+gidx0*4+lidx3)//12, 0, 4, ("(((lidx2//2)+gidx0)//3)", "((gidx0+(lidx2//2))//3)"))

+  @unittest.expectedFailure  # TODO: improve nest_div_by_smallest_factor
+  def test_sum_div_complex4(self):
+    gidx0 = Variable("gidx0", 0, 2)
+    lidx2 = Variable("lidx2", 0, 12)
+    lidx3 = Variable("lidx3", 0, 12)
+    self.helper_test_variable((gidx0*3+lidx2*19+lidx3*38)//(3*19), 0, 12, ("((lidx2+(lidx3*2))//3)"))
+
  def test_sum_mul_distribute(self):
    gidx0 = Variable("gidx0", 0, 7)
    lidx2 = Variable("lidx2", 0, 12)
@ -581,45 +656,6 @@ class TestSymbolic(unittest.TestCase):
    with self.assertRaises(AssertionError):
      self.helper_test_variable((30 * b + 1) % 18 + ((30 * b + 1) // 18) * 18, 1, 3001, "((b*30)+1)")

-  def test_arange_unrolled4(self):
-    gidx = Variable("gidx", 0, 2559)
-    unrolled_div = (gidx+2561)//4+(gidx+2562)//4+(gidx+2560)//4+(gidx+2559)//4
-    self.helper_test_variable(unrolled_div, 2559, 5118, "(gidx+2559)")
-
-  def test_arange_unrolled4_with_cast(self):
-    gidx = Variable("gidx", 0, 2559, dtypes.index)
-    dt = dtypes.int
-    unrolled_div = ((gidx+2561)//4 + 2).cast(dt)+((gidx+2562)//4).cast(dt)+((gidx+2560)//4).cast(dt)+((gidx+2559)//4).cast(dt)
-    self.helper_test_variable(unrolled_div, 2561, 5120, "((int)(gidx)+2561)")
-
-  def test_arange_unrolled4_mul(self):
-    gidx = Variable("gidx", 0, 2559)
-    unrolled_div = 2*((gidx+2561)//4)+2*((gidx+2562)//4)+2*((gidx+2560)//4)+2*((gidx+2559)//4)
-    self.helper_test_variable(unrolled_div, 5118, 10236, "((gidx*2)+5118)")
-
-  def test_arange_unrolled4_small(self):
-    gidx = Variable("gidx", 0, 3)
-    unrolled_div = (gidx)//4+(gidx+2)//4+(gidx+3)//4+(gidx+1)//4
-    self.helper_test_variable(unrolled_div, 0, 3, "gidx")
-
-    gidx = Variable("gidx", 0, 2)
-    unrolled_div = (gidx)//4+(gidx+2)//4+(gidx+3)//4+(gidx+1)//4
-    self.helper_test_variable(unrolled_div, 0, 2, "gidx")
-
-    gidx = Variable("gidx", 0, 1)
-    unrolled_div = (gidx)//4+(gidx+2)//4+(gidx+3)//4+(gidx+1)//4
-    self.helper_test_variable(unrolled_div, 0, 1, "gidx")
-
-  def test_arange_unrolled2(self):
-    gidx = Variable("gidx", 0, 2559)
-    unrolled_div = (gidx+2559)//2+(gidx+2560)//2+3
-    self.helper_test_variable(unrolled_div, 2562, 5121, "(gidx+2562)")
-
-  def test_arange_unrolled2_neg(self):
-    ridx = Variable("ridx", 0, 255)
-    unrolled_div = -((255-ridx)//2) - ((256-ridx)//2)
-    self.helper_test_variable(unrolled_div, -255, 0, "(ridx+-255)")
-
  def test_gated_load(self):
    idx = Variable("idx", 0, 24)
    self.helper_test_variable(idx//4, 0, 6, "(idx//4)")
@ -924,6 +960,46 @@ class TestSymbolicSymbolicOps(unittest.TestCase):
    assert c == uconst(2)
 """

+class TestInvalidIndex(unittest.TestCase):
+  def test_invalid_times_0(self):
+    ridx = Variable("ridx", 0, 10)
+    idx = (ridx<5).where(ridx, UOp.invalid())*0
+    self.assertIs(idx.simplify(), (ridx<5).where(0, UOp.invalid()), "multiplying an index by 0 should preserve the invalid")
+
+  def test_invalid_comparison_drops_invalid(self):
+    # comparisons return a bool, and bools can't be invalid
+    ridx = Variable("ridx", 0, 10)
+    idx = (ridx<5).where(ridx, UOp.invalid())<3
+    self.assertIs(idx.simplify(), (ridx<3), "comparison of index should drop the invalid")
+    self.assertIs(idx.where(UOp.const(dtypes.int, 1), 0).simplify(), (ridx<3).where(UOp.const(dtypes.int, 1), 0),
+      "comparison of index should drop the invalid")
+
+  def test_alu_moves_inside_invalid(self):
+    ridx = Variable("ridx", 0, 10)
+    idx = (ridx<5).where(ridx, UOp.invalid())*10
+    self.assertIs(idx.simplify(), (ridx<5).where(ridx*10, UOp.invalid()), "multiplying an index by 0 should preserve the invalid")
+
+  def test_merge_invalid_conditions(self):
+    ridx0 = Variable("ridx0", 0, 10)
+    ridx1 = Variable("ridx1", 0, 10)
+    idx0 = (ridx0<5).where(ridx0, UOp.invalid())
+    idx1 = (ridx1<5).where(idx0//2, UOp.invalid())
+    self.assertIs(idx1.simplify(), ((ridx1<5)&(ridx0<5)).where(ridx0//2, UOp.invalid()),
+      "valid inside a valid should make a single valid and & the conditions")
+
+  def test_alu_invalid(self):
+    self.assertIs((UOp.invalid()*2).simplify(), UOp.invalid())
+    self.assertIs((UOp.invalid()*0).simplify(), UOp.invalid())
+    self.assertIs((UOp.invalid()+8).simplify(), UOp.invalid())
+    self.assertIs((UOp.invalid()+Variable("a",0,10)).simplify(), UOp.invalid())
+    self.assertIs((UOp.invalid()*Variable("a",0,10)).simplify(), UOp.invalid())
+    self.assertIs((UOp.invalid()<Variable("a",0,10)).simplify().dtype, dtypes.bool)
+
+  def test_alu_invalid_vconst(self):
+    c1 = UOp.const(dtypes.index.vec(4), (1, 1, Invalid, Invalid))
+    c2 = UOp.const(dtypes.index.vec(4), (1, Invalid, 1, 1))
+    self.assertIs((c1+c2).simplify(), UOp.const(dtypes.index.vec(4), (2, Invalid, Invalid, Invalid)))
+
 class TestSymbolicRealWorld(unittest.TestCase):
  def test_resnet_half(self):
    gidx0 = Variable("gidx0", 0, 3)
--- a/test/unit/test_uop_vmin_vmax.py
+++ b/test/unit/test_uop_vmin_vmax.py
@ -1,6 +1,6 @@
 import unittest, math
 from tinygrad.uop.ops import UOp, Ops
-from tinygrad.dtype import dtypes
+from tinygrad.dtype import dtypes, Invalid

 class TestVminVmaxProperties(unittest.TestCase):
  def test_vmin_vmax_constant(self):
@ -122,6 +122,15 @@ class TestVminVmaxProperties(unittest.TestCase):
    self.assertEqual(x_uint.vmin, dtypes.min(dtypes.uint))
    self.assertEqual(x_uint.vmax, dtypes.max(dtypes.uint))

+  def test_vmin_vmax_invalid(self):
+    i = UOp.invalid()
+    self.assertNotEqual(i.vmin, i.vmax)
+
+  def test_vmin_vmax_invalid_vconst(self):
+    x = UOp.const(dtypes.index.vec(4), (0, 4, Invalid, Invalid))
+    self.assertLess(x.vmin, 0)
+    self.assertGreater(x.vmax, 4)
+
 class TestVminVmaxDivMod(unittest.TestCase):
  def test_vmin_vmax_division_positive(self):
    # vmin and vmax for division of a variable by a positive constant
--- a/test/unit/test_viz.py
+++ b/test/unit/test_viz.py
@ -408,7 +408,7 @@ class TestVizProfiler(unittest.TestCase):
      get_profile(prof)

  def test_python_marker(self):
-    with Context(PROFILE=1):
+    with Context(VIZ=1):
      a = Tensor.empty(1, device="NULL")
      b = Tensor.empty(1, device="NULL")
      (a+b).realize()
--- a/test/unit/test_winograd.py
+++ b/test/unit/test_winograd.py
@ -1,7 +1,7 @@
 import unittest, sys
 import numpy as np
 from tinygrad import Tensor, GlobalCounters, dtypes, Context, nn
-from tinygrad.helpers import CI, Profiling, WINO
+from tinygrad.helpers import CI, Profiling, WINO, RANGEIFY

@unittest.skipIf(sys.platform.startswith("win"), "flaky on Windows")
 class TestWinogradClose(unittest.TestCase):
@ -35,32 +35,35 @@ class TestWinograd(unittest.TestCase):
  def test_forward_kernels(self):
    x,w = Tensor.rand(1,4,9,9).realize(), Tensor.rand(4,4,3,3).realize()
    out = Tensor.conv2d(x,w)
-    self.assertEqual(len(out.schedule()), 4)
+    self.assertEqual(len(out.schedule()), 2 if RANGEIFY else 4)

  def test_backward_kernels(self):
    x,w = Tensor.empty(1,4,9,9,requires_grad=True).realize(), Tensor.empty(4,4,3,3,requires_grad=True).realize()
    out = Tensor.conv2d(x,w, padding=1)
    out.mean().backward()
    backward_schedule = Tensor.schedule(x.grad, w.grad)
-    self.assertEqual(len(backward_schedule), 9)
+    self.assertEqual(len(backward_schedule), 6 if RANGEIFY else 9)

  def test_counters(self):
    IC, OC, X, Y = 4,4,9,9
    #OC, IC, X, Y = 512, 256, 8, 8
    x,w = Tensor.rand(1,IC,Y,X).realize(), Tensor.rand(OC,IC,3,3).realize()
    GlobalCounters.reset()
-    Tensor.conv2d(x,w).realize()
+    with Context(WINO=1):
+      Tensor.conv2d(x,w).realize()
    ops_wino, mem_wino = GlobalCounters.global_ops, GlobalCounters.global_mem
-    WINO.value = 0
    GlobalCounters.reset()
-    Tensor.conv2d(x,w).realize()
+    with Context(WINO=0):
+      Tensor.conv2d(x,w).realize()
    ops_normal, mem_normal = GlobalCounters.global_ops, GlobalCounters.global_mem

    ops_ratio, mem_ratio = ops_wino/ops_normal, mem_wino/mem_normal
    print(f"ops: normal {ops_normal:9d} wino {ops_wino:9d} ratio {ops_ratio:.2f}")
    print(f"mem: normal {mem_normal:9d} wino {mem_wino:9d} ratio {mem_ratio:.2f}")
-    self.assertLess(ops_ratio, 2.6)  # TODO: there's issues with factorization now
-    self.assertLess(mem_ratio, 10)
+
+    if not RANGEIFY:
+      self.assertLess(ops_ratio, 2.6)  # TODO: there's issues with factorization now
+      self.assertLess(mem_ratio, 10)

  def test_dtype(self):
    IC, OC, X, Y = 4,4,9,9
--- a/tinygrad/codegen/init.py
+++ b/tinygrad/codegen/init.py
@ -10,7 +10,7 @@ from tinygrad.renderer import Renderer
 from tinygrad.codegen.lowerer import pm_lowerer, get_index
 from tinygrad.codegen.quantize import pm_quant
 from tinygrad.codegen.gpudims import pm_add_gpudims
-from tinygrad.uop.symbolic import sym, symbolic_simple, gep_pushing, cast_folding
+from tinygrad.uop.symbolic import sym, symbolic_simple, gep_pushing
 from tinygrad.uop.decompositions import get_late_rewrite_patterns
 from tinygrad.codegen.late.expander import migrate_indexing, expander, pm_pre_expander
 from tinygrad.codegen.late.devectorizer import load_store_folding, load_store_indexing, devectorize, pm_reduce, \
@ -18,8 +18,8 @@ from tinygrad.codegen.late.devectorizer import load_store_folding, load_store_in
 from tinygrad.codegen.late.linearize import block_create, pm_blockend_merge, block_merge, pm_finalize, BlockContext
 from tinygrad.codegen.opt.swizzler import view_left, view_right, fix_kernel_ops
 from tinygrad.codegen.opt.postrange import pm_postrange_opt
-from tinygrad.codegen.simplify import pm_simplify_ranges
-from tinygrad.schedule.rangeify import pm_add_buffers_local, rangeify_codegen
+from tinygrad.codegen.simplify import pm_simplify_ranges, pm_reduce_simplify, pm_flatten_range
+from tinygrad.schedule.rangeify import pm_add_buffers, rangeify_codegen

@dataclass
 class RewriteStep:
@ -62,10 +62,11 @@ def _get_rewrites_for_renderer(opts:Renderer, optimize:bool, linearizer:bool, _Q
    ret.append(RewriteStep(pm_lowerer, get_index, name="lowerer", bottom_up=True))

    # symbolic (NOTE: this is a requirement for pm_simplify_ranges to be correct)
-    ret.append(RewriteStep(sym, name="initial symbolic"))
+    ret.append(RewriteStep(sym+pm_flatten_range, name="initial symbolic"))

    # optimize (schedule) the AST
    ret.append(RewriteStep(pm_simplify_ranges, name="simplify ranges"))
+    ret.append(RewriteStep(pm_reduce_simplify, name="simplify reduces"))
    ret.append(RewriteStep(pm_postrange_opt, ctx=lambda _: opts, name="post optimize ast"))

  # ** expander (expand_rewrite) **
@ -75,7 +76,7 @@ def _get_rewrites_for_renderer(opts:Renderer, optimize:bool, linearizer:bool, _Q
  ret.append(RewriteStep(sym+pm_pre_expander+expander, name="expander"))

  # add locals
-  ret.append(RewriteStep(pm_add_buffers_local+rangeify_codegen, name="add local buffers"))
+  ret.append(RewriteStep(pm_add_buffers+rangeify_codegen, name="add local buffers"))

  # ** devectorizer (full_graph_rewrite) **
  # remove reduce
@ -94,7 +95,7 @@ def _get_rewrites_for_renderer(opts:Renderer, optimize:bool, linearizer:bool, _Q
  extra_matcher = opts.extra_matcher if opts.extra_matcher is not None else PatternMatcher([])

  # lower the index dtype to a concrete int
-  ret.append(RewriteStep(pm_lower_index_dtype+cast_folding+load_store_indexing, lambda _: opts.device, name="lower all index dtypes"))
+  ret.append(RewriteStep(load_store_indexing+pm_lower_index_dtype, lambda _: opts.device, name="lower all index dtypes"))

  # optional pre matcher
  if opts.pre_matcher is not None: ret.append(RewriteStep(opts.pre_matcher, name="pre_matcher"))
--- a/Show more
+++ b/Show more