Merge remote-tracking branch 'upstream/master' into new_x86_backend

2026-06-24 02:14:17 +00:00 · 2025-12-20 20:43:21 +00:00 · 2025-12-20 20:43:21 +00:00 · b8f06970fa
commit b8f06970fa
parent edb592f314 5228f7bd06
335 changed files with 27215 additions and 27875 deletions
--- a/.github/actions/setup-tinygrad/action.yml
+++ b/.github/actions/setup-tinygrad/action.yml
@ -61,7 +61,7 @@ runs:
      uses: actions/cache@v4
      with:
        path: ${{ github.workspace }}/.venv
-        key: venv-${{ runner.os }}-python-${{ steps.setup-python.outputs.python-version }}-${{ inputs.deps }}-${{ inputs.pydeps }}-${{ hashFiles('**/pyproject.toml') }}-${{ env.CACHE_VERSION }}
+        key: venv-${{ runner.os }}-python-${{ steps.setup-python.outputs.python-version }}-${{ inputs.deps }}-${{ inputs.pydeps }}-${{ env.CACHE_VERSION }}

    # **** Caching downloads ****

@ -70,13 +70,13 @@ runs:
      uses: actions/cache@v4
      with:
        path: ~/.cache/tinygrad/downloads/
-        key: downloads-cache-${{ inputs.key }}-${{ env.CACHE_VERSION }}
+        key: downloads-${{ github.job }}-${{ inputs.key }}-${{ env.CACHE_VERSION }}
    - name: Cache downloads (macOS)
      if: inputs.key != '' && runner.os == 'macOS'
      uses: actions/cache@v4
      with:
        path: ~/Library/Caches/tinygrad/downloads/
-        key: osx-downloads-cache-${{ inputs.key }}-${{ env.CACHE_VERSION }}
+        key: downloads-${{ github.job }}-${{ inputs.key }}-${{ env.CACHE_VERSION }}

    # **** Python deps ****

@ -221,7 +221,7 @@ runs:
        sudo mkdir -p /usr/local/lib
-        curl -s -H "Authorization: token $GH_TOKEN" curl -s https://api.github.com/repos/nimlgen/amdcomgr_dylib/releases/latest | \
+        curl -s -H "Authorization: token $GH_TOKEN" https://api.github.com/repos/nimlgen/amdcomgr_dylib/releases/latest | \
          jq -r '.assets[] | select(.name == "libamd_comgr.dylib").browser_download_url' | \
-          sudo xargs curl -L -o /usr/local/lib/libamd_comgr.dylib
+          sudo xargs curl -fL -o /usr/local/lib/libamd_comgr.dylib
        cargo build --release --manifest-path ./extra/remu/Cargo.toml

    # **** gpuocelot ****
@ -278,7 +278,7 @@ runs:
      if: inputs.webgpu == 'true' && runner.os == 'Linux'
      shell: bash
      run: |
-        sudo curl -L https://github.com/wpmed92/pydawn/releases/download/v0.1.6/libwebgpu_dawn.so -o /usr/local/lib/libwebgpu_dawn.so
+        sudo curl -fL https://github.com/wpmed92/pydawn/releases/download/v0.1.6/libwebgpu_dawn.so -o /usr/local/lib/libwebgpu_dawn.so
        sudo ldconfig
    - name: Install WebGPU dawn (macOS)
      if: inputs.webgpu == 'true' && runner.os == 'macOS'
@ -298,7 +298,7 @@ runs:
    - name: Install mesa (linux)
      if: inputs.mesa == 'true' && runner.os == 'Linux'
      shell: bash
-      run: sudo curl -L https://github.com/sirhcm/tinymesa/releases/download/tinymesa-32dc66c/libtinymesa_cpu-mesa-25.2.4-linux-amd64.so -o /usr/lib/libtinymesa_cpu.so
+      run: sudo curl -fL https://github.com/sirhcm/tinymesa/releases/download/v1/libtinymesa_cpu-mesa-25.2.7-linux-amd64.so -o /usr/lib/libtinymesa_cpu.so
    - name: Install mesa (macOS)
      if: inputs.mesa == 'true' && runner.os == 'macOS'
      shell: bash
--- a/.github/workflows/autogen.yml
+++ b/.github/workflows/autogen.yml
@ -13,9 +13,11 @@ on:
  pull_request:
    paths:
    - 'tinygrad/runtime/autogen/**/*'
+    - 'tinygrad/runtime/support/autogen.py'
  workflow_dispatch:
    paths:
    - 'tinygrad/runtime/autogen/**/*'
+    - 'tinygrad/runtime/support/autogen.py'

 jobs:
  autogen:
@ -114,11 +116,9 @@ jobs:
    - name: Verify Qualcomm autogen
      run: |
        mv tinygrad/runtime/autogen/kgsl.py /tmp/kgsl.py.bak
-        mv tinygrad/runtime/autogen/adreno.py /tmp/adreno.py.bak
        mv tinygrad/runtime/autogen/qcom_dsp.py /tmp/qcom_dsp.py.bak
-        python3 -c "from tinygrad.runtime.autogen import kgsl, adreno, qcom_dsp"
+        python3 -c "from tinygrad.runtime.autogen import kgsl, qcom_dsp"
        diff /tmp/kgsl.py.bak tinygrad/runtime/autogen/kgsl.py
-        diff /tmp/adreno.py.bak tinygrad/runtime/autogen/adreno.py
        diff /tmp/qcom_dsp.py.bak tinygrad/runtime/autogen/qcom_dsp.py
    - name: Verify libusb autogen
      run: |
--- a/.github/workflows/benchmark.yml
+++ b/.github/workflows/benchmark.yml
@ -14,12 +14,6 @@ on:
      - update_benchmark
      - update_benchmark_staging
  workflow_dispatch:
-    inputs:
-      run_process_replay:
-        description: "Run process replay tests"
-        required: false
-        default: false
-        type: boolean

 jobs:
  testmacbenchmark:
@ -39,6 +33,7 @@ jobs:
    - name: Symlink models and datasets
      run: |
        mkdir -p weights
+        mkdir -p extra/disassemblers
        ln -s ~/tinygrad/extra/disassemblers/applegpu extra/disassemblers/applegpu
        ln -s ~/tinygrad/weights/sd-v1-4.ckpt weights/sd-v1-4.ckpt
        ln -s ~/tinygrad/weights/bpe_simple_vocab_16e6.txt.gz weights/bpe_simple_vocab_16e6.txt.gz
@ -54,9 +49,9 @@ jobs:
    - name: Print macOS version
      run: sw_vers
    - name: Run Stable Diffusion
-      run: BENCHMARK_LOG=stable_diffusion JIT=1 ASSERT_MIN_STEP_TIME=800 python3.11 examples/stable_diffusion.py --fp16 --seed 0 --noshow --timing | tee sd.txt
+      run: BENCHMARK_LOG=stable_diffusion JIT=1 ASSERT_MIN_STEP_TIME=720 python3.11 examples/stable_diffusion.py --fp16 --seed 0 --noshow --timing | tee sd.txt
    - name: Run Stable Diffusion without fp16
-      run: BENCHMARK_LOG=stable_diffusion_fp32 JIT=1 ASSERT_MIN_STEP_TIME=800 python3.11 examples/stable_diffusion.py --seed 0 --noshow --timing | tee sd_no_fp16.txt
+      run: BENCHMARK_LOG=stable_diffusion_fp32 JIT=1 ASSERT_MIN_STEP_TIME=720 python3.11 examples/stable_diffusion.py --seed 0 --noshow --timing | tee sd_no_fp16.txt
    - name: Run Stable Diffusion v2
      # TODO: very slow step time
      run: BENCHMARK_LOG=stable_diffusion_v2 JIT=1 ASSERT_MIN_STEP_TIME=4500 python3.11 examples/sdv2.py --fp16 --seed 0 --noshow --timing | tee sdv2.txt
@ -64,7 +59,7 @@ jobs:
    - name: Run SDXL
      run: BENCHMARK_LOG=stable_diffusion_xl ASSERT_MIN_STEP_TIME=5000 CAPTURE_PROCESS_REPLAY=0 JIT=1 python3.11 examples/sdxl.py --seed 0 --noshow --timing | tee sdxl.txt
    - name: Run model inference benchmark
-      run: METAL=1 python3.11 test/external/external_model_benchmark.py
+      run: METAL=1 NOCLANG=1 python3.11 test/external/external_model_benchmark.py
    - name: Test speed vs torch
      run: BIG=2 MPS=1 python3.11 test/speed/external_test_speed_v_torch.py | tee torch_speed.txt
    - name: Test tensor cores
@ -124,14 +119,6 @@ jobs:
    # TODO: too slow
    # - name: Run 10 CIFAR training steps w winograd
    #   run: BENCHMARK_LOG=cifar_10steps_wino JIT=1 ASSERT_MIN_STEP_TIME=150 WINO=1 STEPS=10 python3.11 examples/hlb_cifar10.py | tee train_cifar_wino.txt
-    - name: UsbGPU boot time
-      run: sudo -E PYTHONPATH=. DEBUG=2 AM_RESET=1 AMD=1 AMD_IFACE=USB time python3.11 test/test_tiny.py TestTiny.test_plus
-    - name: UsbGPU tiny tests
-      run: sudo -E PYTHONPATH=. AMD=1 AMD_IFACE=USB python3.11 test/test_tiny.py
-    - name: UsbGPU copy speeds
-      run: sudo -E PYTHONPATH=. AMD=1 AMD_IFACE=USB python3.11 test/external/external_test_usb_asm24.py TestDevCopySpeeds
-    #- name: UsbGPU openpilot test
-    #  run: sudo -E PYTHONPATH=. AMD=1 AMD_IFACE=USB GRAPH_ONE_KERNEL=1 python3.11 examples/openpilot/compile3.py https://github.com/commaai/openpilot/raw/9118973ed03c1ae1d40cf69a29507ec2cc78efd7/selfdrive/modeld/models/supercombo.onnx
    - uses: actions/upload-artifact@v4
      with:
        name: Speed (Mac)
@ -165,6 +152,37 @@ jobs:
    - name: Run process replay tests
      run: cp test/external/process_replay/process_replay.py ./process_replay.py && git fetch origin master && git -c advice.detachedHead=false checkout origin/master && PYTHONPATH=. python3.11 process_replay.py

+  testusbgpu:
+    name: UsbGPU Benchmark
+    env:
+      PYTHONPYCACHEPREFIX: /tmp/tiny_python_pycache
+    runs-on: [self-hosted, macOS]
+    timeout-minutes: 10
+    defaults:
+      run:
+        shell: bash -e -o pipefail {0}
+    if: github.repository_owner == 'tinygrad'
+    steps:
+    - name: Checkout Code
+      uses: actions/checkout@v4
+    - name: setup staging db
+      if: github.ref == 'refs/heads/update_benchmark_staging'
+      run: |
+        echo "CACHEDB=/tmp/staging.db" >> $GITHUB_ENV
+        rm -f /tmp/staging.db /tmp/staging.db-shm /tmp/staging.db-wal
+    - name: UsbGPU boot time
+      run: sudo -E PYTHONPATH=. DEBUG=2 AM_RESET=1 AMD=1 AMD_IFACE=USB time python3.11 test/test_tiny.py TestTiny.test_plus
+    - name: UsbGPU tiny tests
+      run: sudo -E PYTHONPATH=. AMD=1 AMD_IFACE=USB python3.11 test/test_tiny.py
+    - name: UsbGPU copy speeds
+      run: sudo -E PYTHONPATH=. AMD=1 AMD_IFACE=USB python3.11 test/external/external_test_usb_asm24.py TestDevCopySpeeds
+    #- name: UsbGPU openpilot test
+    #  run: sudo -E PYTHONPATH=. AMD=1 AMD_IFACE=USB GRAPH_ONE_KERNEL=1 python3.11 examples/openpilot/compile3.py https://github.com/commaai/openpilot/raw/9118973ed03c1ae1d40cf69a29507ec2cc78efd7/selfdrive/modeld/models/supercombo.onnx
+    - name: UsbGPU (USB4/TB) boot time
+      run: PYTHONPATH=. DEBUG=3 NV=1 NV_IFACE=PCI NV_NAK=1 time python3.11 test/test_tiny.py TestTiny.test_plus
+    - name: UsbGPU (USB4/TB) tiny tests
+      run: PYTHONPATH=. NV=1 NV_IFACE=PCI NV_NAK=1 python3.11 test/test_tiny.py
+
  testnvidiabenchmark:
    name: tinybox green Benchmark
    runs-on: [self-hosted, Linux, tinyboxgreen]
@ -318,31 +336,31 @@ jobs:
    # TODO: too slow
    # - name: Fuzz Padded Tensor Core GEMM (PTX)
    #   run: NV=1 NV_PTX=1 M_START=12 M_STOP=20 M_STEP=1 N_START=6 N_STOP=10 N_STEP=1 K_START=28 K_STOP=36 K_STEP=1 HALF=1 TC_OPT=2 python3 ./extra/gemm/fuzz_matmul.py
+    - name: HEVC Decode Benchmark
+      run: VALIDATE=1 MAX_FRAMES=100 NV=1 PYTHONPATH=. python3 extra/hevc/decode.py
    - name: Train MNIST
      run: time PYTHONPATH=. NV=1 TARGET_EVAL_ACC_PCT=96.0 python3 examples/beautiful_mnist.py | tee beautiful_mnist.txt
-    # TODO: too slow
    - name: Run 10 CIFAR training steps
-      run: BENCHMARK_LOG=cifar_10steps ASSERT_MIN_STEP_TIME=1300 NV=1 STEPS=10 python3 examples/hlb_cifar10.py | tee train_cifar.txt
-    # - name: Run 10 CIFAR training steps w HALF
-    #   run: BENCHMARK_LOG=cifar_10steps_half ASSERT_MIN_STEP_TIME=240 NV=1 STEPS=10 DEFAULT_FLOAT=HALF python3 examples/hlb_cifar10.py | tee train_cifar_half.txt
-    # - name: Run 10 CIFAR training steps w BF16
-    #   run: BENCHMARK_LOG=cifar_10steps_bf16 ASSERT_MIN_STEP_TIME=270 NV=1 STEPS=10 DEFAULT_FLOAT=BFLOAT16 python3 examples/hlb_cifar10.py | tee train_cifar_bf16.txt
-    # TODO: too slow
+      run: BENCHMARK_LOG=cifar_10steps ASSERT_MIN_STEP_TIME=120 NV=1 STEPS=10 python3 examples/hlb_cifar10.py | tee train_cifar.txt
+    - name: Run 10 CIFAR training steps w HALF
+      run: BENCHMARK_LOG=cifar_10steps_half ASSERT_MIN_STEP_TIME=110 NV=1 STEPS=10 DEFAULT_FLOAT=HALF python3 examples/hlb_cifar10.py | tee train_cifar_half.txt
+    - name: Run 10 CIFAR training steps w BF16
+      run: BENCHMARK_LOG=cifar_10steps_bf16 ASSERT_MIN_STEP_TIME=120 NV=1 STEPS=10 DEFAULT_FLOAT=BFLOAT16 python3 examples/hlb_cifar10.py | tee train_cifar_bf16.txt
    # - name: Run 10 CIFAR training steps w winograd
    #   run: BENCHMARK_LOG=cifar_10steps_half_wino ASSERT_MIN_STEP_TIME=350 NV=1 CAPTURE_PROCESS_REPLAY=0 WINO=1 STEPS=10 DEFAULT_FLOAT=HALF python3 examples/hlb_cifar10.py | tee train_cifar_wino.txt
-    # - name: Run full CIFAR training w 1 GPU
-    #   run: time BENCHMARK_LOG=cifar NV=1 DEFAULT_FLOAT=HALF STEPS=1000 TARGET_EVAL_ACC_PCT=93.0 python3 examples/hlb_cifar10.py | tee train_cifar_one_gpu.txt
-    # - name: Run full CIFAR training steps w 6 GPUS
-    #   run: time BENCHMARK_LOG=cifar_6gpu CAPTURE_PROCESS_REPLAY=0 NV=1 DEFAULT_FLOAT=HALF STEPS=350 BS=1536 GPUS=6 TARGET_EVAL_ACC_PCT=93.0 python3 examples/hlb_cifar10.py | tee train_cifar_six_gpu.txt
+    - name: Run full CIFAR training w 1 GPU
+      run: time BENCHMARK_LOG=cifar NV=1 DEFAULT_FLOAT=HALF STEPS=1000 TARGET_EVAL_ACC_PCT=93.0 python3 examples/hlb_cifar10.py | tee train_cifar_one_gpu.txt
+    - name: Run full CIFAR training steps w 6 GPUS
+      run: time BENCHMARK_LOG=cifar_6gpu CAPTURE_PROCESS_REPLAY=0 NV=1 DEFAULT_FLOAT=HALF STEPS=350 BS=1536 GPUS=6 TARGET_EVAL_ACC_PCT=93.0 python3 examples/hlb_cifar10.py | tee train_cifar_six_gpu.txt
    - name: Run MLPerf resnet eval on training data
      run: time BENCHMARK_LOG=resnet_eval NV=1 MODEL=resnet python3 examples/mlperf/model_eval.py
-    #- name: Run 10 MLPerf ResNet50 training steps (1 gpu)
-    #  run: BENCHMARK_LOG=resnet_10steps NV=1 DEFAULT_FLOAT=HALF BENCHMARK=10 BS=256 GPUS=1 MODEL=resnet python3 examples/mlperf/model_train.py | tee train_resnet_one_gpu.txt
-    #- name: Run 10 MLPerf ResNet50 training steps (6 gpu)
-    #  run: BENCHMARK_LOG=resnet_10steps_6gpu NV=1 CAPTURE_PROCESS_REPLAY=0 DEFAULT_FLOAT=HALF BENCHMARK=10 BS=1536 GPUS=6 MODEL=resnet python3 examples/mlperf/model_train.py | tee train_resnet.txt
+    - name: Run 10 MLPerf ResNet50 training steps (1 gpu)
+      run: BENCHMARK_LOG=resnet_10steps NV=1 DEFAULT_FLOAT=HALF BENCHMARK=10 BS=256 GPUS=1 MODEL=resnet python3 examples/mlperf/model_train.py | tee train_resnet_one_gpu.txt
+    - name: Run 10 MLPerf ResNet50 training steps (6 gpu)
+      run: BENCHMARK_LOG=resnet_10steps_6gpu NV=1 CAPTURE_PROCESS_REPLAY=0 DEFAULT_FLOAT=HALF BENCHMARK=10 BS=1536 GPUS=6 MODEL=resnet python3 examples/mlperf/model_train.py | tee train_resnet.txt
    - name: Run 10 MLPerf Bert training steps (6 gpu)
      # TODO: remove BERT_LAYERS once scheduler is fast
-      run: BENCHMARK_LOG=bert_10steps_6gpu NV=1 CAPTURE_PROCESS_REPLAY=0 DEFAULT_FLOAT=HALF BENCHMARK=10 BS=66 GPUS=6 BERT_LAYERS=2 MODEL=bert python3 examples/mlperf/model_train.py | tee train_bert.txt
+      run: BENCHMARK_LOG=bert_10steps_6gpu NV=1 CAPTURE_PROCESS_REPLAY=0 DEFAULT_FLOAT=HALF BENCHMARK=10 BS=72 GPUS=6 BERT_LAYERS=2 MODEL=bert python3 examples/mlperf/model_train.py | tee train_bert.txt
    - uses: actions/upload-artifact@v4
      with:
        name: Speed (NVIDIA Training)
@ -433,9 +451,8 @@ jobs:
      run: time AMD=1 python3 test/test_tiny.py TestTiny.test_plus
    - name: Run Stable Diffusion
      run: BENCHMARK_LOG=stable_diffusion ASSERT_MIN_STEP_TIME=550 AMD=1 python3 examples/stable_diffusion.py --fp16 --seed 0 --noshow --timing | tee sd.txt
-    # TODO: too slow
-    # - name: Run SDXL
-    #   run: BENCHMARK_LOG=stable_diffusion_xl ASSERT_MIN_STEP_TIME=3200 CAPTURE_PROCESS_REPLAY=0 AMD=1 python3 examples/sdxl.py --seed 0 --noshow --timing | tee sdxl.txt
+    - name: Run SDXL
+      run: BENCHMARK_LOG=stable_diffusion_xl ASSERT_MIN_STEP_TIME=3200 CAPTURE_PROCESS_REPLAY=0 AMD=1 python3 examples/sdxl.py --seed 0 --noshow --timing | tee sdxl.txt
    - name: Run LLaMA 7B
      run: |
        BENCHMARK_LOG=llama_nojit AMD=1 JIT=0 python3 examples/llama.py --gen 1 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_unjitted.txt
@ -525,22 +542,19 @@ jobs:
      run: test/external/process_replay/reset.py
    - name: Train MNIST
      run: time PYTHONPATH=. AMD=1 TARGET_EVAL_ACC_PCT=96.0 python3 examples/beautiful_mnist.py | tee beautiful_mnist.txt
-    # TODO: too slow
    - name: Run 10 CIFAR training steps
-      run: BENCHMARK_LOG=cifar_10steps ASSERT_MIN_STEP_TIME=2000 AMD=1 STEPS=10 python3 examples/hlb_cifar10.py | tee train_cifar.txt
-    # - name: Run 10 CIFAR training steps w HALF
-    #   run: BENCHMARK_LOG=cifar_10steps_half ASSERT_MIN_STEP_TIME=390 AMD=1 STEPS=10 DEFAULT_FLOAT=HALF python3 examples/hlb_cifar10.py | tee train_cifar_half.txt
+      run: BENCHMARK_LOG=cifar_10steps ASSERT_MIN_STEP_TIME=200 AMD=1 STEPS=10 python3 examples/hlb_cifar10.py | tee train_cifar.txt
+    - name: Run 10 CIFAR training steps w HALF
+      run: BENCHMARK_LOG=cifar_10steps_half ASSERT_MIN_STEP_TIME=200 AMD=1 STEPS=10 DEFAULT_FLOAT=HALF python3 examples/hlb_cifar10.py | tee train_cifar_half.txt
    # - name: Run 10 CIFAR training steps w BF16
    #   run: BENCHMARK_LOG=cifar_10steps_bf16 ASSERT_MIN_STEP_TIME=288 AMD=1 STEPS=10 DEFAULT_FLOAT=BFLOAT16 python3 examples/hlb_cifar10.py | tee train_cifar_bf16.txt
    # TODO: too slow
    # - name: Run 10 CIFAR training steps w winograd
    #   run: BENCHMARK_LOG=cifar_10steps_half_wino ASSERT_MIN_STEP_TIME=66 AMD=1 WINO=1 STEPS=10 DEFAULT_FLOAT=HALF python3 examples/hlb_cifar10.py | tee train_cifar_wino.txt
-    # - name: Run full CIFAR training w 1 GPU
-    #   run: time BENCHMARK_LOG=cifar AMD=1 DEFAULT_FLOAT=HALF STEPS=1000 TARGET_EVAL_ACC_PCT=93.0 python3 examples/hlb_cifar10.py | tee train_cifar_one_gpu.txt
-    #- name: Run full CIFAR training steps w 6 GPUS
-    #  run: time BENCHMARK_LOG=cifar_6gpu AMD=1 DEFAULT_FLOAT=HALF STEPS=350 BS=1536 GPUS=6 TARGET_EVAL_ACC_PCT=93.0 python3 examples/hlb_cifar10.py | tee train_cifar_six_gpu.txt
-    #- name: Run full CIFAR training steps w 6 GPUS (REMOTE)
-    #  run: time BENCHMARK_LOG=cifar_6gpu_remote REMOTE=1 REMOTEDEV=AMD DEFAULT_FLOAT=HALF STEPS=350 BS=1536 GPUS=6 TARGET_EVAL_ACC_PCT=93.0 python3 examples/hlb_cifar10.py | tee train_cifar_six_gpu_remote.txt
+    - name: Run full CIFAR training w 1 GPU
+      run: time BENCHMARK_LOG=cifar AMD=1 DEFAULT_FLOAT=HALF STEPS=1000 TARGET_EVAL_ACC_PCT=93.0 python3 examples/hlb_cifar10.py | tee train_cifar_one_gpu.txt
+    - name: Run full CIFAR training steps w 6 GPUS
+      run: time BENCHMARK_LOG=cifar_6gpu AMD=1 DEFAULT_FLOAT=HALF STEPS=350 BS=1536 GPUS=6 TARGET_EVAL_ACC_PCT=93.0 python3 examples/hlb_cifar10.py | tee train_cifar_six_gpu.txt
    - uses: actions/upload-artifact@v4
      with:
        name: Speed (AMD Training)
@ -552,7 +566,6 @@ jobs:
          train_cifar_wino.txt
          train_cifar_one_gpu.txt
          train_cifar_six_gpu.txt
-          train_cifar_six_gpu_remote.txt
    - name: Run process replay tests
      run: cp test/external/process_replay/process_replay.py ./process_replay.py && git fetch origin master && git -c advice.detachedHead=false checkout origin/master && PYTHONPATH=. python3 process_replay.py

@ -590,13 +603,13 @@ jobs:
      run: test/external/process_replay/reset.py
    - name: Run MLPerf resnet eval
      run: time BENCHMARK_LOG=resnet_eval AMD=1 MODEL=resnet python3 examples/mlperf/model_eval.py
-    #- name: Run 10 MLPerf ResNet50 training steps (1 gpu)
-    #  run: BENCHMARK_LOG=resnet_10steps AMD=1 DEFAULT_FLOAT=HALF BENCHMARK=10 BS=256 GPUS=1 MODEL=resnet python3 examples/mlperf/model_train.py | tee train_resnet_one_gpu.txt
-    #- name: Run 10 MLPerf ResNet50 training steps (6 gpu)
-    #  run: BENCHMARK_LOG=resnet_10steps_6gpu AMD=1 CAPTURE_PROCESS_REPLAY=0 DEFAULT_FLOAT=HALF BENCHMARK=10 BS=1536 GPUS=6 MODEL=resnet python3 examples/mlperf/model_train.py | tee train_resnet.txt
+    - name: Run 10 MLPerf ResNet50 training steps (1 gpu)
+      run: BENCHMARK_LOG=resnet_10steps AMD=1 DEFAULT_FLOAT=HALF BENCHMARK=10 BS=256 GPUS=1 MODEL=resnet python3 examples/mlperf/model_train.py | tee train_resnet_one_gpu.txt
+    - name: Run 10 MLPerf ResNet50 training steps (6 gpu)
+      run: BENCHMARK_LOG=resnet_10steps_6gpu AMD=1 CAPTURE_PROCESS_REPLAY=0 DEFAULT_FLOAT=HALF BENCHMARK=10 BS=1536 GPUS=6 MODEL=resnet python3 examples/mlperf/model_train.py | tee train_resnet.txt
    - name: Run 10 MLPerf Bert training steps (6 gpu)
      # TODO: remove BERT_LAYERS once scheduler is fast
-      run: BENCHMARK_LOG=bert_10steps_6gpu AMD=1 CAPTURE_PROCESS_REPLAY=0 DEFAULT_FLOAT=HALF BENCHMARK=10 BS=66 GPUS=6 BERT_LAYERS=2 MODEL=bert python3 examples/mlperf/model_train.py | tee train_bert.txt
+      run: BENCHMARK_LOG=bert_10steps_6gpu AMD=1 CAPTURE_PROCESS_REPLAY=0 DEFAULT_FLOAT=HALF BENCHMARK=10 BS=72 GPUS=6 BERT_LAYERS=2 MODEL=bert python3 examples/mlperf/model_train.py | tee train_bert.txt
    - uses: actions/upload-artifact@v4
      with:
        name: Speed (AMD MLPerf)
@ -625,32 +638,28 @@ jobs:
        rm -f /tmp/staging.db /tmp/staging.db-shm /tmp/staging.db-wal
    - name: reset process replay
      run: test/external/process_replay/reset.py
-    # - name: openpilot compile3 0.9.9 driving_vision
-    #   run: BENCHMARK_LOG=openpilot_0_9_9_vision PYTHONPATH=. NOLOCALS=1 FLOAT16=1 IMAGE=2 QCOM=1 taskset -c 4-7 python3 examples/openpilot/compile3.py https://github.com/commaai/openpilot/raw/v0.9.9/selfdrive/modeld/models/driving_vision.onnx
-    # - name: openpilot compile3 0.9.9 driving_policy
-    #   run: BENCHMARK_LOG=openpilot_0_9_9_policy PYTHONPATH=. NOLOCALS=1 FLOAT16=1 IMAGE=2 QCOM=1 taskset -c 4-7 python3 examples/openpilot/compile3.py https://github.com/commaai/openpilot/raw/v0.9.9/selfdrive/modeld/models/driving_policy.onnx
-    # - name: openpilot compile3 0.9.9 dmonitoring
-    #   run: BENCHMARK_LOG=openpilot_0_9_9_dmonitoring PYTHONPATH=. NOLOCALS=1 FLOAT16=1 IMAGE=2 QCOM=1 taskset -c 4-7 python3 examples/openpilot/compile3.py https://github.com/commaai/openpilot/raw/v0.9.9/selfdrive/modeld/models/dmonitoring_model.onnx
    - name: openpilot compile3 0.10.0 driving_policy
      run: BENCHMARK_LOG=openpilot_0_10_0_policy PYTHONPATH="." ASSERT_MIN_STEP_TIME=4 DEV=QCOM FLOAT16=1 IMAGE=2 NOLOCALS=1 taskset -c 4-7 python3 examples/openpilot/compile3.py https://github.com/commaai/openpilot/raw/v0.10.0/selfdrive/modeld/models/driving_policy.onnx
    - name: openpilot compile3 0.10.0 dmonitoring
      run: BENCHMARK_LOG=openpilot_0_10_0_dmonitoring PYTHONPATH="." ASSERT_MIN_STEP_TIME=11 DEV=QCOM FLOAT16=1 IMAGE=2 NOLOCALS=1 taskset -c 4-7 python3 examples/openpilot/compile3.py https://github.com/commaai/openpilot/raw/v0.10.0/selfdrive/modeld/models/dmonitoring_model.onnx
    - name: DEBUG=2 openpilot compile3 0.10.1 driving_vision
      run: PYTHONPATH="." DEBUG=2 DEV=QCOM FLOAT16=1 IMAGE=2 NOLOCALS=1 taskset -c 4-7 python3 examples/openpilot/compile3.py https://github.com/commaai/openpilot/raw/720392c9a5b986981fdbed1bb8c47a6c5573a50e/selfdrive/modeld/models/driving_vision.onnx
+    - name: DEBUG=2 IMAGE=1 openpilot compile3 0.10.1 driving_vision
+      run: PYTHONPATH="." DEBUG=2 DEV=QCOM FLOAT16=1 IMAGE=1 NOLOCALS=1 taskset -c 4-7 python3 examples/openpilot/compile3.py https://github.com/commaai/openpilot/raw/720392c9a5b986981fdbed1bb8c47a6c5573a50e/selfdrive/modeld/models/driving_vision.onnx
    - name: openpilot compile3 0.10.1 driving_vision
      run: BENCHMARK_LOG=openpilot_0_10_1_vision PYTHONPATH="." ASSERT_MIN_STEP_TIME=17 DEV=QCOM FLOAT16=1 IMAGE=2 NOLOCALS=1 taskset -c 4-7 python3 examples/openpilot/compile3.py https://github.com/commaai/openpilot/raw/720392c9a5b986981fdbed1bb8c47a6c5573a50e/selfdrive/modeld/models/driving_vision.onnx
    - name: openpilot compile3 0.10.1 driving_policy
      run: BENCHMARK_LOG=openpilot_0_10_1_policy PYTHONPATH="." ASSERT_MIN_STEP_TIME=4 DEV=QCOM FLOAT16=1 IMAGE=2 NOLOCALS=1 taskset -c 4-7 python3 examples/openpilot/compile3.py https://github.com/commaai/openpilot/raw/720392c9a5b986981fdbed1bb8c47a6c5573a50e/selfdrive/modeld/models/driving_policy.onnx
    - name: openpilot compile3 0.10.1 dmonitoring
      run: BENCHMARK_LOG=openpilot_0_10_1_dmonitoring PYTHONPATH="." ASSERT_MIN_STEP_TIME=10 DEV=QCOM FLOAT16=1 IMAGE=2 NOLOCALS=1 taskset -c 4-7 python3 examples/openpilot/compile3.py https://github.com/commaai/openpilot/raw/720392c9a5b986981fdbed1bb8c47a6c5573a50e/selfdrive/modeld/models/dmonitoring_model.onnx
-    # - name: benchmark MobileNetV2 on DSP
-    #   run: |
-    #     # generate quantized weights
-    #     ln -s /data/home/tiny/tinygrad/extra/datasets/imagenet extra/datasets/imagenet
-    #     ln -s /data/home/tiny/tinygrad/testsig-*.so .
-    #     PYTHONPATH=. CC=clang-19 CPU=1 CPU_LLVM=0 QUANT=1 CNT=0 python3 examples/test_onnx_imagenet.py https://github.com/xamcat/mobcat-samples/raw/refs/heads/master/onnx_runtime/InferencingSample/InferencingSample/mobilenetv2-7.onnx /tmp/model.quant.onnx
-    #     # benchmark on DSP with NOOPT=1, the devectorizer has issues
-    #     PYTHONPATH=. CC=clang-19 DSP=1 NOOPT=1 CNT=2 DEBUG=2 python3 examples/test_onnx_imagenet.py /tmp/model.quant.onnx
+    - name: benchmark MobileNetV2 on DSP
+      run: |
+        # generate quantized weights
+        ln -s /data/home/tiny/tinygrad/extra/datasets/imagenet extra/datasets/imagenet
+        ln -s /data/home/tiny/tinygrad/testsig-*.so .
+        PYTHONPATH=. CC=clang-19 CPU=1 CPU_LLVM=0 QUANT=1 CNT=0 python3 examples/test_onnx_imagenet.py https://github.com/xamcat/mobcat-samples/raw/refs/heads/master/onnx_runtime/InferencingSample/InferencingSample/mobilenetv2-7.onnx /tmp/model.quant.onnx
+        # benchmark on DSP with NOOPT=1, the devectorizer has issues
+        PYTHONPATH=. CC=clang-19 DSP=1 NOOPT=1 CNT=2 DEBUG=2 python3 examples/test_onnx_imagenet.py /tmp/model.quant.onnx
    - name: Run process replay tests
      run: cp test/external/process_replay/process_replay.py ./process_replay.py && git fetch origin master && git -c advice.detachedHead=false checkout origin/master && PYTHONPATH=. python3 process_replay.py

@ -706,10 +715,8 @@ jobs:
      run: |
        AMD=1 GRAPH_ONE_KERNEL=1 PYTHONPATH=. NSZ=8192 python3 test/speed/external_test_copy_speed.py TestCopySpeed.testCopyDefaulttoCPUJit
        AMD=1 GRAPH_ONE_KERNEL=1 PYTHONPATH=. NSZ=8192 python3 test/speed/external_test_copy_speed.py TestCopySpeed.testCopyCPUtoDefaultJit
-    # TODO: too slow
-    # - name: Run full CIFAR training w 1 GPU
-    #   run: time BENCHMARK_LOG=cifar AMD=1 DEFAULT_FLOAT=HALF STEPS=1000 TARGET_EVAL_ACC_PCT=93.0 python3 examples/hlb_cifar10.py | tee am_train_cifar_one_gpu.txt
-    # TODO: enable
+    - name: Run full CIFAR training w 1 GPU
+      run: time BENCHMARK_LOG=cifar AMD=1 DEFAULT_FLOAT=HALF STEPS=1000 TARGET_EVAL_ACC_PCT=93.0 python3 examples/hlb_cifar10.py | tee am_train_cifar_one_gpu.txt
    # - name: Run 10 MLPerf ResNet50 training steps (1 gpu)
    #   run: BENCHMARK_LOG=resnet_10steps AMD=1 MNISTMOCK=1 DEFAULT_FLOAT=HALF BENCHMARK=10 BS=256 GPUS=1 MODEL=resnet python3 examples/mlperf/model_train.py | tee am_train_resnet_one_gpu.txt
    - name: Run 10 MLPerf Bert training steps (1 gpu)
@ -770,11 +777,10 @@ jobs:
        NV=1 GRAPH_ONE_KERNEL=1 PYTHONPATH=. NSZ=8192 python3 test/speed/external_test_copy_speed.py TestCopySpeed.testCopyCPUtoDefaultJit
    - name: Test LLAMA-3
      run: BENCHMARK_LOG=llama3_beam NV=1 JITBEAM=2 IGNORE_BEAM_CACHE=1 python3 examples/llama3.py --size 8B --benchmark --temperature 0 | tee nv_llama3_beam.txt
-    # TODO: too slow
-    # - name: Run full CIFAR training w 1 GPU
-    #   run: time BENCHMARK_LOG=cifar NV=1 DEFAULT_FLOAT=HALF STEPS=1000 TARGET_EVAL_ACC_PCT=93.0 python3 examples/hlb_cifar10.py | tee nv_train_cifar_one_gpu.txt
-    #- name: Run 10 MLPerf ResNet50 training steps (1 gpu)
-    #  run: BENCHMARK_LOG=resnet_10steps NV=1 MNISTMOCK=1 DEFAULT_FLOAT=HALF BENCHMARK=10 BS=256 GPUS=1 MODEL=resnet python3 examples/mlperf/model_train.py | tee nv_train_resnet_one_gpu.txt
+    - name: Run full CIFAR training w 1 GPU
+      run: time BENCHMARK_LOG=cifar NV=1 DEFAULT_FLOAT=HALF STEPS=1000 TARGET_EVAL_ACC_PCT=93.0 python3 examples/hlb_cifar10.py | tee nv_train_cifar_one_gpu.txt
+    - name: Run 10 MLPerf ResNet50 training steps (1 gpu)
+      run: BENCHMARK_LOG=resnet_10steps NV=1 MNISTMOCK=1 DEFAULT_FLOAT=HALF BENCHMARK=10 BS=256 GPUS=1 MODEL=resnet python3 examples/mlperf/model_train.py | tee nv_train_resnet_one_gpu.txt
    - name: Run 10 MLPerf Bert training steps (1 gpu)
      # TODO: remove BERT_LAYERS once scheduler is fast
      run: BENCHMARK_LOG=bert_10steps NV=1 CAPTURE_PROCESS_REPLAY=0 DEFAULT_FLOAT=HALF BENCHMARK=10 BS=66 GPUS=1 BERT_LAYERS=2 MODEL=bert python3 examples/mlperf/model_train.py | tee nv_train_bert_one_gpu.txt
--- a/.github/workflows/szdiff.yml
+++ b/.github/workflows/szdiff.yml
@ -56,15 +56,15 @@ jobs:
        uses: actions/checkout@v4
        with:
          path: base
-      - name: Set up Python 3.10
+      - name: Set up Python 3.12
        uses: actions/setup-python@v5
        with:
-          python-version: '3.10'
+          python-version: '3.12'
      - name: Count Line Diff
        run: |
-          pip install tabulate
          BASE="$GITHUB_WORKSPACE/base"
          PR="$GITHUB_WORKSPACE/pr"
+          pip install tabulate "$BASE"
          cp "$BASE/sz.py" .
          echo "loc_content<<EOF" >> "$GITHUB_ENV"
          python sz.py "$BASE" "$PR" >> "$GITHUB_ENV"
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@ -1,7 +1,7 @@
 name: Unit Tests
 env:
  # increment this when downloads substantially change to avoid the internet
-  CACHE_VERSION: '13'
+  CACHE_VERSION: '15'
  CAPTURE_PROCESS_REPLAY: 1
  GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
  PYTHONPATH: ${{ github.workspace }}
@ -71,9 +71,7 @@ jobs:
    - name: Test Docs Build
      run: python -m mkdocs build --strict
    - name: Test Docs
-      run: |
-        python docs/abstractions2.py
-        python docs/abstractions3.py
+      run: python docs/abstractions3.py
    - name: Test README
      run: awk '/```python/{flag=1;next}/```/{flag=0}flag' README.md > README.py &&  python README.py
    - name: Test Quickstart
@ -86,65 +84,67 @@ jobs:
        clang -O2 recognize.c -lm -o recognize
        cat test/models/efficientnet/Chicken.jpg | ./recognize | grep cock

-  # TODO: fix the torch backend and reenable
-  # torchbackend:
-  #   name: Torch Backend Tests
-  #   runs-on: ubuntu-latest
-  #   timeout-minutes: 15
-  #   steps:
-  #   - name: Checkout Code
-  #     uses: actions/checkout@v4
-  #   - name: Setup Environment
-  #     uses: ./.github/actions/setup-tinygrad
-  #     with:
-  #       key: torch-backend-pillow-torchvision-et-pt
-  #       deps: testing_minimal
-  #       pydeps: "pillow torchvision expecttest"
-  #       llvm: 'true'
-  #   - name: Install ninja
-  #     run: |
-  #       sudo apt update || true
-  #       sudo apt install -y --no-install-recommends ninja-build
-  #   - name: Lint with ruff
-  #     run: |
-  #       pip3 install --upgrade --force-reinstall ruff==0.11.0
-  #       python3 -m ruff check extra/torch_backend/backend.py
-  #   - name: Test one op
-  #     run: FORWARD_ONLY=1 TINY_BACKEND=1 python3 test/test_ops.py TestOps.test_add
-  #   - name: Test ResNet-18
-  #     run: DEBUG=2 python3 extra/torch_backend/example.py
-  #   - name: My (custom) tests
-  #     run: python3 extra/torch_backend/test.py
-  #   - name: Test one op in torch tests
-  #     run: DEBUG=2 python3 extra/torch_backend/torch_tests.py TestTinyBackendPRIVATEUSE1.test_unary_log_tiny_float32
-  #   - name: Test Ops with TINY_BACKEND
-  #     run: CPU=1 CPU_LLVM=1 LLVMOPT=0 TINY_BACKEND=1 python3 -m pytest -n auto test/test_ops.py --durations=20
-  #   - name: Test in-place operations on views
-  #     run: TORCH_DEBUG=1 python3 extra/torch_backend/test_inplace.py
-  #   - name: Test multi-gpu
-  #     run: CPU=1 CPU_LLVM=1 GPUS=4 TORCH_DEBUG=1 python3 extra/torch_backend/test_multigpu.py
+  torchbackend:
+    name: Torch Backend Tests
+    runs-on: ubuntu-latest
+    timeout-minutes: 15
+    steps:
+    - name: Checkout Code
+      uses: actions/checkout@v4
+    - name: Setup Environment
+      uses: ./.github/actions/setup-tinygrad
+      with:
+        key: torch-backend-pillow-torchvision-et-pt
+        deps: testing_minimal
+        pydeps: "pillow torchvision expecttest"
+        llvm: 'true'
+    - name: Install ninja
+      run: |
+        sudo apt update || true
+        sudo apt install -y --no-install-recommends ninja-build
+    - name: Lint with ruff
+      run: |
+        pip3 install --upgrade --force-reinstall ruff==0.11.0
+        python3 -m ruff check extra/torch_backend/backend.py
+    - name: Test one op
+      run: FORWARD_ONLY=1 TINY_BACKEND=1 python3 test/test_ops.py TestOps.test_add
+    - name: Test ResNet-18
+      run: DEBUG=2 python3 extra/torch_backend/example.py
+    - name: My (custom) tests
+      run: python3 extra/torch_backend/test.py
+    - name: Test one op in torch tests
+      run: DEBUG=2 python3 extra/torch_backend/torch_tests.py TestTinyBackendPRIVATEUSE1.test_unary_log_tiny_float32
+    - name: Test Ops with TINY_BACKEND
+      run: CPU=1 CPU_LLVM=1 LLVMOPT=0 TINY_BACKEND=1 python3 -m pytest -n auto test/test_ops.py --durations=20
+    - name: Test in-place operations on views
+      run: TORCH_DEBUG=1 python3 extra/torch_backend/test_inplace.py
+    - name: Test multi-gpu
+      run: CPU=1 CPU_LLVM=1 GPUS=4 TORCH_DEBUG=1 python3 extra/torch_backend/test_multigpu.py
+    - name: Test kernel fusion
+      run: python3 extra/torch_backend/test_kernel_fusion.py

-  # torchbackendmore:
-  #   name: Torch Backend Tests More
-  #   runs-on: ubuntu-latest
-  #   timeout-minutes: 15
-  #   steps:
-  #   - name: Checkout Code
-  #     uses: actions/checkout@v4
-  #   - name: Setup Environment
-  #     uses: ./.github/actions/setup-tinygrad
-  #     with:
-  #       key: torch-backend-pillow-torchvision-et-pt
-  #       deps: testing_minimal
-  #       llvm: 'true'
-  #   - name: Install ninja
-  #     run: |
-  #       sudo apt update || true
-  #       sudo apt install -y --no-install-recommends ninja-build
-  #   - name: Test beautiful_mnist in torch with TINY_BACKEND
-  #     run: STEPS=20 CPU=1 TARGET_EVAL_ACC_PCT=90.0 TINY_BACKEND=1 python3 examples/other_mnist/beautiful_mnist_torch.py
-  #   - name: Test some torch tests (expect failure)
-  #     run: python3 -m pytest extra/torch_backend/torch_tests.py -v --tb=no || true
+
+  torchbackendmore:
+    name: Torch Backend Tests More
+    runs-on: ubuntu-latest
+    timeout-minutes: 15
+    steps:
+    - name: Checkout Code
+      uses: actions/checkout@v4
+    - name: Setup Environment
+      uses: ./.github/actions/setup-tinygrad
+      with:
+        key: torch-backend-pillow-torchvision-et-pt
+        deps: testing_minimal
+        llvm: 'true'
+    - name: Install ninja
+      run: |
+        sudo apt update || true
+        sudo apt install -y --no-install-recommends ninja-build
+    - name: Test beautiful_mnist in torch with TINY_BACKEND
+      run: STEPS=20 CPU=1 TARGET_EVAL_ACC_PCT=90.0 TINY_BACKEND=1 python3 examples/other_mnist/beautiful_mnist_torch.py
+    - name: Test some torch tests (expect failure)
+      run: python3 -m pytest extra/torch_backend/torch_tests.py -v --tb=no || true

  bepython:
    name: Python Backend
@ -236,6 +236,7 @@ jobs:
        pip3 install --upgrade --force-reinstall ruff==0.11.0
        python3 -m ruff check .
        python3 -m ruff check examples/mlperf/ --ignore E501
+        python3 -m ruff check extra/thunder/tiny/ --ignore E501 --ignore F841 --ignore E722
    - name: Run mypy
      run: |
        python -m mypy --strict-equality --lineprecision-report .
@ -261,7 +262,9 @@ jobs:
    - name: Check Device.DEFAULT
      run: python -c "from tinygrad import Device; assert Device.DEFAULT == 'CPU', Device.DEFAULT"
    - name: Run unit tests
-      run: CPU=1 python -m pytest -n=auto test/unit/ --durations=20
+      run: |
+        CPU=1 python test/unit/test_device.py TestRunAsModule.test_module_runs
+        CPU=1 python -m pytest -n=auto test/unit/ --durations=20 --deselect=test/unit/test_device.py::TestRunAsModule::test_module_runs
    - name: Run targetted tests on NULL backend
      run: NULL=1 python3 -m unittest test.test_multitensor.TestMultiTensor.test_data_parallel_resnet_train_step test/device/test_null.py
    # TODO: too slow
@ -287,8 +290,8 @@ jobs:
        python extra/optimization/extract_dataset.py
        gzip -c /tmp/sops > extra/datasets/sops.gz
        #DEBUG=1 MIN_ASTS=1 python extra/optimization/get_action_space.py
-    - name: Repo line count < 19000 lines
-      run: MAX_LINE_COUNT=19000 python sz.py
+    - name: Repo line count < 20000 lines
+      run: MAX_LINE_COUNT=20000 python sz.py

  spec:
    strategy:
@ -306,8 +309,9 @@ jobs:
      with:
        key: spec-unit
        deps: testing_unit
+        python-version: '3.14'
    - name: Test SPEC=2
-      run: IGNORE_OOB=0 SPEC=2 PYTHONPATH="." pytest --maxfail=10 -n auto --durations=30 --ignore=test/models --ignore test/unit/test_hashing.py --timeout 60 -k "not test_setitem_big" --splits 2 --group ${{ matrix.group }}
+      run: IGNORE_OOB=0 SPEC=2 PYTHONPATH="." pytest --maxfail=10 -n auto --durations=30 --ignore=test/models --ignore test/test_custom_kernel.py --ignore test/unit/test_hashing.py --timeout 60 -k "not test_setitem_big" --splits 2 --group ${{ matrix.group }}

  fuzzing:
    name: Fuzzing
@ -323,6 +327,8 @@ jobs:
        deps: testing_unit
    - name: Fuzz Test symbolic
      run: python test/external/fuzz_symbolic.py
+    - name: Fuzz Test symbolic (symbolic divisors)
+      run: python test/external/fuzz_symbolic_symbolic_div.py
    - name: Fuzz Test fast idiv
      run: python test/external/fuzz_fast_idiv.py
    - name: Fuzz Test shape ops
@ -442,7 +448,7 @@ jobs:
        with:
          key: onnxoptl
          deps: testing
-          pydeps: "tensorflow==2.15.1 tensorflow_addons"
+          pydeps: "tensorflow==2.19"
          python-version: '3.11'
          opencl: 'true'
      - name: Test ONNX (CL)
@ -460,7 +466,7 @@ jobs:
      - name: Test Bert training
        run: NULL=1 DEFAULT_FLOAT=HALF BENCHMARK=10 BS=24 GPUS=4 BERT_LAYERS=2 MODEL=bert python3 examples/mlperf/model_train.py
      - name: Test llama 3 training
-        run: NULL=1 SAMPLES=300 BS=8 SEQLEN=512 GRADIENT_ACC_STEPS=8 FAKEDATA=1 DEFAULT_FLOAT=bfloat16 OPTIM_DTYPE=bfloat16 LLAMA3_SIZE=1B MODEL=llama3 python3 examples/mlperf/model_train.py
+        run: NULL=1 SAMPLES=300 BS=8 SEQLEN=512 GRADIENT_ACC_STEPS=1 FAKEDATA=1 DEFAULT_FLOAT=bfloat16 OPTIM_DTYPE=bfloat16 LLAMA3_SIZE=1B MODEL=llama3 python3 examples/mlperf/model_train.py
      - name: Run process replay tests
        uses: ./.github/actions/process-replay

@ -716,71 +722,6 @@ jobs:
      - name: Run process replay tests
        uses: ./.github/actions/process-replay

-  amdremote:
-    name: Linux (remote)
-    runs-on: ubuntu-22.04
-    timeout-minutes: 20
-    env:
-      REMOTE: 1
-    steps:
-      - name: Checkout Code
-        uses: actions/checkout@v4
-      - name: Setup Environment
-        uses: ./.github/actions/setup-tinygrad
-        with:
-          key: linux-remote
-          deps: testing_minimal
-          amd: 'true'
-          llvm: 'true'
-          opencl: 'true'
-      - name: Start remote server
-        run: |
-          start_server() {
-            systemd-run --user \
-              --unit="$1" \
-              --setenv=REMOTEDEV="$2" \
-              --setenv=MOCKGPU=1 \
-              --setenv=PYTHONPATH=. \
-              --setenv=PORT="$3" \
-              --working-directory="$(pwd)" \
-              python tinygrad/runtime/ops_remote.py
-          }
-
-          start_server "remote-server-amd-1" "AMD" 6667
-          start_server "remote-server-amd-2" "AMD" 6668
-          start_server "remote-server-gpu" "CL" 7667
-          start_server "remote-server-cpu" "CPU" 8667
-      - name: Check Device.DEFAULT and print some source
-        env:
-          HOST: 127.0.0.1:6667*6,127.0.0.1:6668*6
-        run: |
-          python -c "from tinygrad import Device; assert Device.DEFAULT == 'REMOTE', Device.DEFAULT"
-          python -c "from tinygrad import Device; assert Device.default.properties.real_device == 'AMD', Device.default.properties.real_device"
-          DEBUG=4 python3 test/test_tiny.py TestTiny.test_plus
-      - name: Run REMOTE=1 Test (AMD)
-        env:
-          HOST: 127.0.0.1:6667*6,127.0.0.1:6668*6
-        run: |
-          python3 -m pytest test/test_tiny.py test/test_jit.py test/test_subbuffer.py test/test_graph.py test/test_multitensor.py test/test_remote.py test/test_tensor_variable.py --durations 20
-      - name: Run REMOTE=1 Test (CL)
-        env:
-          HOST: 127.0.0.1:7667*6
-        run: |
-          python3 -m pytest test/test_tiny.py test/test_image_dtype.py test/test_jit.py --durations 20
-          IMAGE=2 python3 -m pytest test/test_tiny.py test/test_image_dtype.py
-      - name: Run REMOTE=1 Test (CPU)
-        env:
-          HOST: 127.0.0.1:8667*6
-        run: |
-          python3 -m pytest test/test_tiny.py test/test_jit.py test/test_multitensor.py --durations 20
-      - name: Show remote server logs
-        if: always()
-        run: |
-          journalctl --user -u remote-server-amd-1 --no-pager
-          journalctl --user -u remote-server-amd-2 --no-pager
-          journalctl --user -u remote-server-gpu --no-pager
-          journalctl --user -u remote-server-cpu --no-pager
-
 # ****** OSX Tests ******

  testmetal:
@ -878,30 +819,6 @@ jobs:
    - name: Test ONNX Runner (WEBGPU)
      run: WEBGPU=1 python3 test/external/external_test_onnx_runner.py

-  osxremote:
-   name: MacOS (remote metal)
-   runs-on: macos-15
-   timeout-minutes: 10
-   env:
-     REMOTE: 1
-     REMOTEDEV: METAL
-   steps:
-     - name: Checkout Code
-       uses: actions/checkout@v4
-     - name: Setup Environment
-       uses: ./.github/actions/setup-tinygrad
-       with:
-         key: macos-remote
-         deps: testing_minimal
-     - name: Check Device.DEFAULT and print some source
-       run: |
-         python -c "from tinygrad import Device; assert Device.DEFAULT == 'REMOTE', Device.DEFAULT"
-         python -c "from tinygrad import Device; assert Device.default.properties.real_device == 'METAL', Device.default.properties.real_device"
-         DEBUG=4 python3 test/test_tiny.py TestTiny.test_plus
-     - name: Run REMOTE=1 Test
-       run: |
-         python3 -m pytest test/test_tiny.py test/test_jit.py test/test_subbuffer.py test/test_graph.py test/test_multitensor.py test/test_tensor_variable.py
-
  osxtests:
    strategy:
      fail-fast: false
@ -967,3 +884,33 @@ jobs:
        run: |
          python -c "from tinygrad import Device; assert Device.DEFAULT == {'LLVM':'CPU'}.get(x:='${{ matrix.backend }}'.upper(), x), Device.DEFAULT"
          python -m pytest -n=auto test/test_tiny.py test/test_ops.py --durations=20
+
+# ****** Compile-only Tests ******
+
+  compiletests:
+    strategy:
+      fail-fast: false
+      matrix:
+        backend: [ir3, nak]
+    name: Compile-only (${{ matrix.backend }})
+    runs-on: ubuntu-24.04
+    timeout-minutes: 15
+    steps:
+      - name: Checkout Code
+        uses: actions/checkout@v4
+      - name: Setup Environment
+        uses: ./.github/actions/setup-tinygrad
+        with:
+          key: compile-${{ matrix.backend }}
+          deps: testing_minimal
+          mesa: ${{ (matrix.backend == 'ir3' || matrix.backend == 'nak') && 'true' }}
+          python-version: '3.14'
+      - name: Set env
+        shell: bash
+        run: printf "NULL=1\n${{ matrix.backend == 'ir3' && 'NULL_IR3=1' || matrix.backend == 'nak' && 'NULL_NAK=1' }}" >> $GITHUB_ENV
+      - name: Run test_ops
+        shell: bash
+        run: |
+          python -c "from tinygrad import Device; assert Device.DEFAULT == 'NULL'"
+          DEBUG=4 python3 test/test_ops.py TestOps.test_add
+          python -m pytest -n=auto test/test_ops.py --durations=20
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@ -27,8 +27,8 @@ repos:
        always_run: true
        pass_filenames: false
      - id: tests
-        name: subset of tests
-        entry: env OMP_NUM_THREADS=1 PYTHONPATH="." python3 -m pytest -n=6 test/test_ops.py test/test_dtype.py test/test_schedule.py test/test_assign.py
+        name: comprehensive test suite
+        entry: env OMP_NUM_THREADS=1 SKIP_SLOW_TEST=1 PYTHONPATH="." python3 -m pytest -n=6 test/test_ops.py test/test_schedule.py test/test_assign.py test/test_tensor.py test/test_jit.py test/unit/test_schedule_cache.py test/unit/test_pattern_matcher.py test/unit/test_uop_symbolic.py test/unit/test_helpers.py
        language: system
        always_run: true
        pass_filenames: false
--- a/CLAUDE.md
+++ b/CLAUDE.md
@ -0,0 +1,227 @@
+# Claude Code Guide for tinygrad
+
+## Architecture Overview
+
+tinygrad compiles tensor operations into optimized kernels. The pipeline:
+
+1. **Tensor** (`tensor.py`) - User-facing API, creates UOp graph
+2. **UOp** (`uop/ops.py`) - Unified IR for all operations (both tensor and kernel level)
+3. **Schedule** (`engine/schedule.py`, `schedule/`) - Converts tensor UOps to kernel UOps
+4. **Codegen** (`codegen/`) - Converts kernel UOps to device code
+5. **Runtime** (`runtime/`) - Device-specific execution
+
+## Key Concepts
+
+### UOp (Universal Operation)
+Everything is a UOp - tensors, operations, buffers, kernels. Key properties:
+- `op`: The operation type (Ops enum)
+- `dtype`: Data type
+- `src`: Tuple of source UOps
+- `arg`: Operation-specific argument
+- `tag`: Optional tag for graph transformations
+
+UOps are **immutable and cached** - creating the same UOp twice returns the same object (ucache).
+
+### PatternMatcher
+Used extensively for graph transformations:
+```python
+pm = PatternMatcher([
+  (UPat(Ops.ADD, src=(UPat.cvar("x"), UPat.cvar("x"))), lambda x: x * 2),
+])
+result = graph_rewrite(uop, pm)
+```
+
+### Schedule Cache
+Schedules are cached by graph structure. BIND nodes (variables with bound values) are unbound before cache key computation so different bound values hit the same cache entry.
+
+## Directory Structure
+
+```
+tinygrad/
+├── tensor.py          # Tensor class, user API
+├── device.py          # Buffer, device management
+├── dtype.py           # Data types
+├── helpers.py         # Utilities, environment vars
+├── uop/
+│   ├── ops.py         # UOp class, Ops enum, PatternMatcher
+│   ├── spec.py        # UOp type verification
+│   └── symbolic.py    # Symbolic math simplification
+├── engine/
+│   ├── schedule.py    # Schedule creation, caching
+│   ├── realize.py     # Tensor realization
+│   ├── jit.py         # JIT compilation
+│   └── memory.py      # Memory planning
+├── schedule/
+│   ├── rangeify.py    # Convert movements to ranges
+│   └── indexing.py    # Index calculations
+├── codegen/
+│   ├── kernel.py      # Kernel optimization
+│   └── uopgraph.py    # UOp graph transformations
+├── renderer/          # Code generation (CUDA, Metal, etc.)
+└── runtime/           # Device backends
+```
+
+## Testing
+
+```bash
+# Run specific test
+python -m pytest test/unit/test_schedule_cache.py -xvs
+
+# Run with timeout
+python -m pytest test/test_symbolic_ops.py -x --timeout=60
+
+# Debug with print
+DEBUG=2 python -m pytest test/test_schedule.py::test_name -xvs
+
+# Visualize UOp graphs
+VIZ=1 python -c "from tinygrad import Tensor; Tensor.ones(10).sum().realize()"
+```
+
+## Common Environment Variables
+
+- `DEBUG=1-4` - Increasing verbosity
+- `VIZ=1` - Enable graph visualization
+- `SPEC=1` - Enable UOp spec verification
+- `NOOPT=1` - Disable optimizations
+- `DEVICE=CPU/CUDA/AMD/METAL` - Set default device
+
+## Debugging Tips
+
+1. **Print UOp graphs**: `print(tensor.uop)` or `print(tensor.uop.sink())`
+2. **Check schedule**: `tensor.schedule()` returns list of ExecItems
+3. **Trace graph rewrites**: Use `VIZ=1` or add print in PatternMatcher callbacks
+4. **Find UOps by type**: `[u for u in uop.toposort() if u.op is Ops.SOMETHING]`
+
+## Workflow Rules
+
+- **NEVER commit without explicit user approval** - always show the diff and wait for approval
+- **NEVER amend commits** - always create a new commit instead
+- Run `pre-commit run --all-files` before committing to catch linting/type errors
+- Run tests before proposing commits
+- Test with `SPEC=2` when modifying UOp-related code
+
+## Style Notes
+
+- 2-space indentation, 150 char line limit
+- PatternMatchers should be defined at module level (slow to construct)
+- Prefer `graph_rewrite` over manual graph traversal
+- UOp methods like `.replace()` preserve tags unless explicitly changed
+- Use `.rtag(value)` to add tags to UOps
+
+## Lessons Learned
+
+### UOp ucache Behavior
+UOps are cached by their contents - creating a UOp with identical (op, dtype, src, arg, tag) returns the **same object**. This means:
+- `uop.replace(tag=None)` on a tagged UOp returns the original untagged UOp if it exists in cache
+- Two UOps with same structure are identical (`is` comparison works)
+
+### Spec Validation
+When adding new UOp patterns, update `tinygrad/uop/spec.py`. Test with:
+```bash
+SPEC=2 python3 test/unit/test_something.py
+```
+Spec issues appear as `RuntimeError: SPEC ISSUE None: UOp(...)`.
+
+### Schedule Cache Key Normalization
+The schedule cache strips values from BIND nodes so different bound values (e.g., KV cache positions) hit the same cache entry:
+- `pm_pre_sched_cache`: BIND(DEFINE_VAR, CONST) → BIND(DEFINE_VAR) for cache key
+- `pm_post_sched_cache`: restores original BIND from context
+- When accessing `bind.src[1]`, check `len(bind.src) > 1` first (might be stripped)
+- Extract var_vals from `input_buffers` dict after graph_rewrite (avoids extra toposort)
+
+### Avoiding Extra Work
+- Use ctx dict from graph_rewrite to collect info during traversal instead of separate toposort
+- Only extract var_vals when schedule is non-empty (no kernels = no vars needed)
+- PatternMatchers are slow to construct - define at module level, not in functions
+
+### Readability Over Speed
+Don't add complexity for marginal performance gains. Simpler code that's slightly slower is often better:
+```python
+# BAD: "optimized" with extra complexity
+if has_afters:  # skip toposort if no AFTERs
+  after_map = [(u, u.buf_uop) for u in big_sink.toposort() if u.op is Ops.AFTER]
+
+# GOOD: simple, always works
+after_map = [(u, u.buf_uop) for u in big_sink.toposort() if u.op is Ops.AFTER]
+```
+The conditional check adds complexity and potential bugs, often for a negligible speedup. Only optimize when profiling shows a real bottleneck.
+
+### Testing LLM Changes
+```bash
+# Quick smoke test
+echo "Hello" | DEBUG=1 python tinygrad/apps/llm.py --model "llama3.2:1b"
+
+# Check cache hits (should see "cache hit" after warmup)
+echo "Hello world" | DEBUG=1 python tinygrad/apps/llm.py --model "llama3.2:1b" 2>&1 | grep cache
+
+# Test with beam search
+echo "Hello" | BEAM=2 python tinygrad/apps/llm.py --model "llama3.2:1b"
+```
+
+## Common Patterns
+
+### Graph Transformation
+```python
+def my_transform(ctx, x):
+  # Return new UOp or None to skip
+  return x.replace(arg=new_arg)
+
+pm = PatternMatcher([
+  (UPat(Ops.SOMETHING, name="x"), my_transform),
+])
+result = graph_rewrite(input_uop, pm, ctx={})
+```
+
+### Finding Variables
+```python
+# Get all variables in a UOp graph
+variables = uop.variables()
+
+# Get bound variable values
+var, val = bind_uop.unbind()
+```
+
+### Shape Handling
+```python
+# Shapes can be symbolic (contain UOps)
+shape = tensor.shape  # tuple[sint, ...] where sint = int | UOp
+```
+
+## Performance Optimization
+
+When optimizing tinygrad internals:
+
+1. **Measure wall time, not just call counts** - Reducing `graph_rewrite` calls doesn't always improve wall time. The overhead of conditional checks can exceed the cost of the operation being skipped.
+
+2. **Profile each optimization individually** - Run benchmarks with and without each change to measure actual impact. Use `test/external/external_benchmark_schedule.py` for schedule/rewrite timing.
+
+3. **Early exits in hot paths are effective** - Simple checks like `if self.op is Ops.CONST: return self` in `simplify()` can eliminate many unnecessary `graph_rewrite` calls.
+
+4. **`graph_rewrite` is expensive** - Each call has overhead even for small graphs. Avoid calling it when the result is trivially known (e.g., simplifying a CONST returns itself).
+
+5. **Beware iterator overhead** - Checks like `all(x.op is Ops.CONST for x in self.src)` can be slower than just running the operation, especially for small sequences.
+
+6. **Verify cache hit rates before adding/keeping caches** - Measure actual hit rates with real workloads. A cache with 0% hit rate is pure overhead (e.g., `pm_cache` was removed because the algorithm guarantees each UOp is only passed to `pm_rewrite` once).
+
+7. **Use `TRACK_MATCH_STATS=2` to profile pattern matching** - This shows match rates and time per pattern. Look for patterns with 0% match rate that still cost significant time - these are pure overhead for that workload.
+
+8. **Cached properties beat manual traversal** - `backward_slice` uses `@functools.cached_property`. A DFS with early-exit sounds faster but is actually slower because it doesn't benefit from caching. The cache hit benefit often outweighs algorithmic improvements.
+
+9. **Avoid creating intermediate objects in hot paths** - For example, `any(x.op in ops for x in self.backward_slice)` is faster than `any(x.op in ops for x in {self:None, **self.backward_slice})` because it avoids dict creation.
+
+## Pattern Matching Profiling
+
+Use `TRACK_MATCH_STATS=2` to identify expensive patterns:
+
+```bash
+TRACK_MATCH_STATS=2 PYTHONPATH="." python3 test/external/external_benchmark_schedule.py
+```
+
+Output format: `matches / attempts -- match_time / total_time ms -- location`
+
+Key patterns to watch (from ResNet50 benchmark):
+- `split_load_store`: ~146ms, 31% match rate - does real work
+- `simplify_valid`: ~75ms, 0% match rate in this workload - checks AND ops for INDEX in backward slice
+- `vmin==vmax folding`: ~55ms, 0.33% match rate - checks 52K ops but rarely matches
+
+Patterns with 0% match rate are workload-specific overhead. They may be useful in other workloads, so don't remove them without understanding their purpose.
--- a/docs/abstractions2.py
+++ b/docs/abstractions2.py
@ -1,135 +0,0 @@
-# tinygrad is a tensor library, and as a tensor library it has multiple parts
-# 1. a "runtime". this allows buffer management, compilation, and running programs
-# 2. a "Device" that uses the runtime but specifies compute in an abstract way for all
-# 3. a "UOp" that fuses the compute into kernels, using memory only when needed
-# 4. a "Tensor" that provides an easy to use frontend with autograd ".backward()"
-
-
-print("******** first, the runtime ***********")
-
-from tinygrad.runtime.ops_cpu import ClangJITCompiler, CPUDevice, CPUProgram
-
-cpu = CPUDevice()
-
-# allocate some buffers
-out = cpu.allocator.alloc(4)
-a = cpu.allocator.alloc(4)
-b = cpu.allocator.alloc(4)
-
-# load in some values (little endian)
-cpu.allocator._copyin(a, memoryview(bytearray([2,0,0,0])))
-cpu.allocator._copyin(b, memoryview(bytearray([3,0,0,0])))
-
-# compile a program to a binary
-lib = ClangJITCompiler().compile("void add(int *out, int *a, int *b) { out[0] = a[0] + b[0]; }")
-
-# create a runtime for the program
-fxn = cpu.runtime("add", lib)
-
-# run the program
-fxn(out, a, b)
-
-# check the data out
-print(val := cpu.allocator._as_buffer(out).cast("I").tolist()[0])
-assert val == 5
-
-
-print("******** second, the Device ***********")
-
-DEVICE = "CPU"   # NOTE: you can change this!
-
-import struct
-from tinygrad.dtype import dtypes
-from tinygrad.device import Buffer, Device
-from tinygrad.uop.ops import UOp, Ops
-
-# allocate some buffers + load in values
-out = Buffer(DEVICE, 1, dtypes.int32).allocate()
-a = Buffer(DEVICE, 1, dtypes.int32).allocate().copyin(memoryview(bytearray(struct.pack("I", 2))))
-b = Buffer(DEVICE, 1, dtypes.int32).allocate().copyin(memoryview(bytearray(struct.pack("I", 3))))
-# NOTE: a._buf is the same as the return from cpu.allocator.alloc
-
-# describe the computation
-idx = UOp.const(dtypes.index, 0)
-buf_1 = UOp(Ops.DEFINE_GLOBAL, dtypes.int32.ptr(), (), 1)
-buf_2 = UOp(Ops.DEFINE_GLOBAL, dtypes.int32.ptr(), (), 2)
-alu = buf_1.index(idx) + buf_2.index(idx)
-output_buf = UOp(Ops.DEFINE_GLOBAL, dtypes.int32.ptr(), (), 0)
-st_0 = UOp(Ops.STORE, dtypes.void, (output_buf.index(idx), alu))
-s = UOp(Ops.SINK, dtypes.void, (st_0,))
-
-# convert the computation to a "linearized" format (print the format)
-from tinygrad.engine.realize import get_program, CompiledRunner
-program = get_program(s, Device[DEVICE].renderer)
-
-# compile a program (and print the source)
-fxn = CompiledRunner(program)
-print(fxn.p.src)
-# NOTE: fxn.clprg is the CPUProgram
-
-# run the program
-fxn.exec([out, a, b])
-
-# check the data out
-assert out.as_buffer().cast('I')[0] == 5
-
-
-print("******** third, the UOp ***********")
-
-from tinygrad.engine.realize import run_schedule
-from tinygrad.engine.schedule import create_schedule_with_vars
-from tinygrad.schedule.rangeify import get_rangeify_map
-
-# allocate some values + load in values
-a = UOp.new_buffer(DEVICE, 1, dtypes.int32)
-b = UOp.new_buffer(DEVICE, 1, dtypes.int32)
-a.buffer.allocate().copyin(memoryview(bytearray(struct.pack("I", 2))))
-b.buffer.allocate().copyin(memoryview(bytearray(struct.pack("I", 3))))
-
-# describe the computation
-out = a + b
-s = UOp(Ops.SINK, dtypes.void, (out,))
-
-# group the computation into kernels
-becomes_map = get_rangeify_map(s)
-
-# the compute maps to an assign
-assign = becomes_map[a+b].base
-
-# the first source is the output buffer (data)
-assert assign.src[0].op is Ops.BUFFER
-# the second source is the kernel (compute)
-assert assign.src[1].op is Ops.KERNEL
-
-# schedule the kernel graph in a linear list
-s = UOp(Ops.SINK, dtypes.void, (assign,))
-sched, _ = create_schedule_with_vars(s)
-assert len(sched) == 1
-
-# DEBUGGING: print the compute ast
-print(sched[-1].ast)
-# NOTE: sched[-1].ast is the same as st_0 above
-
-# the output will be stored in a new buffer
-out = assign.buf_uop
-assert out.op is Ops.BUFFER and not out.buffer.is_allocated()
-print(out)
-
-# run that schedule
-run_schedule(sched)
-
-# check the data out
-assert out.is_realized and out.buffer.as_buffer().cast('I')[0] == 5
-
-
-print("******** fourth, the Tensor ***********")
-
-from tinygrad import Tensor
-
-a = Tensor([2], dtype=dtypes.int32, device=DEVICE)
-b = Tensor([3], dtype=dtypes.int32, device=DEVICE)
-out = a + b
-
-# check the data out
-print(val:=out.item())
-assert val == 5
--- a/docs/abstractions3.py
+++ b/docs/abstractions3.py
@ -38,25 +38,19 @@ optim.schedule_step()   # this will step the optimizer without running realize
 # The weight Tensors have been assigned to, but not yet realized. Everything is still lazy at this point
 # l1.uop and l2.uop define a computation graph

-from tinygrad.engine.schedule import ScheduleItem
-schedule: List[ScheduleItem] = Tensor.schedule(l1, l2)
+from tinygrad.engine.schedule import ExecItem
+schedule: List[ExecItem] = Tensor.schedule(l1, l2)

 print(f"The schedule contains {len(schedule)} items.")
 for si in schedule: print(str(si)[:80])

 # *****
-# 4. Lower a schedule.
+# 4. Lower and run the schedule.

-from tinygrad.engine.realize import lower_schedule_item, ExecItem
-lowered: List[ExecItem] = [lower_schedule_item(si) for si in tqdm(schedule)]
+for si in tqdm(schedule): si.run()

 # *****
-# 5. Run the schedule
-
-for ei in tqdm(lowered): ei.run()
-
-# *****
-# 6. Print the weight change
+# 5. Print the weight change

 print("first weight change\n", l1.numpy()-l1n)
 print("second weight change\n", l2.numpy()-l2n)
--- a/docs/developer/developer.md
+++ b/docs/developer/developer.md
@ -17,15 +17,15 @@ The `UOp` graph specifies the compute in terms of low level tinygrad ops. Not al

 ## Scheduling

-The [scheduler](https://github.com/tinygrad/tinygrad/tree/master/tinygrad/engine/schedule.py) converts the graph of UOps into a list of `ScheduleItem`. One `ScheduleItem` is one kernel on the GPU, and the scheduler is responsible for breaking the large compute graph into subgraphs that can fit in a kernel. `ast` specifies what compute to run, and `bufs` specifies what buffers to run it on.
+The [scheduler](https://github.com/tinygrad/tinygrad/tree/master/tinygrad/engine/schedule.py) converts the graph of UOps into a list of `ExecItem`. One `ExecItem` is one kernel on the GPU, and the scheduler is responsible for breaking the large compute graph into subgraphs that can fit in a kernel. `ast` specifies what compute to run, and `bufs` specifies what buffers to run it on.

-::: tinygrad.engine.schedule.ScheduleItem
+::: tinygrad.engine.schedule.ExecItem

 ## Lowering

-The code in [realize](https://github.com/tinygrad/tinygrad/tree/master/tinygrad/engine/realize.py) lowers `ScheduleItem` to `ExecItem` with
+The code in [realize](https://github.com/tinygrad/tinygrad/tree/master/tinygrad/engine/realize.py) lowers `ExecItem` by populating its `prg` field with

-::: tinygrad.engine.realize.lower_schedule
+::: tinygrad.engine.realize.run_schedule

 There's a ton of complexity hidden behind this, see the `codegen/` directory.

--- a/docs/mnist.md
+++ b/docs/mnist.md
@ -131,7 +131,7 @@ timeit.repeat(jit_step, repeat=5, number=1)

 1.0 ms is 75x faster! Note that we aren't syncing the GPU, so GPU time may be slower.

-The slowness the first two times is the JIT capturing the kernels. And this JIT will not run any Python in the function, it will just replay the tinygrad kernels that were run, so be aware that non tinygrad Python operations won't work. Randomness functions work as expected.
+The first two runs of the function execute normally, with the JIT capturing the kernels. Starting from the third run, only the tinygrad operations are replayed, removing the overhead by skipping Python code execution. So be aware that any non-tinygrad Python values affecting the kernels will be "frozen" from the second run. Note that `Tensor` randomness functions work as expected.

 Unlike other JITs, we JIT everything, including the optimizer. Think of it as a dumb replay on different data.

--- a/docs/ramp.py
+++ b/docs/ramp.py
@ -1,293 +0,0 @@
-#!/usr/bin/env python3
-
-# this file is a "ramp" for people new to tinygrad to think about how to approach it
-# it is runnable and editable.
-# whenever you see stuff like DEBUG=2 or CPU=1 discussed, these are environment variables
-# in a unix shell like bash `DEBUG=2 CPU=1 python docs/ramp.py`
-
-# this pip installs tinygrad master for the system
-# the -e allows you to edit the tinygrad folder and update system tinygrad
-# tinygrad is pure Python, so you are encouraged to do this
-# git pull in the tinygrad directory will also get you the latest
-"""
-git clone https://github.com/tinygrad/tinygrad.git
-cd tinygrad
-python3 -m pip install -e .
-"""
-
-# %% ********
-print("******* PART 1 *******")
-
-# we start with a Device.
-# a Device is where Tensors are stored and compute is run
-# tinygrad autodetects the best device on your system and makes it the DEFAULT
-from tinygrad import Device
-print(Device.DEFAULT)  # on Mac, you can see this prints METAL
-
-# now, lets create a Tensor
-from tinygrad import Tensor, dtypes
-t = Tensor([1,2,3,4])
-
-# you can see this Tensor is on the DEFAULT device with int dtype and shape (4,)
-assert t.device == Device.DEFAULT
-assert t.dtype == dtypes.int
-assert t.shape == (4,)
-
-# unlike in torch, if we print it, it doesn't print the contents
-# this is because tinygrad is lazy
-# this Tensor has not been computed yet
-print(t)
-# <Tensor <UOp METAL (4,) int (<Ops.COPY: 7>, None)> on METAL with grad None>
-
-# the ".uop" property on Tensor contains the specification of how to compute it
-print(t.uop)
-"""
-UOp(Ops.COPY, dtypes.int, arg=None, src=(
-  UOp(Ops.BUFFER, dtypes.int, arg=4, src=(
-    UOp(Ops.UNIQUE, dtypes.void, arg=0, src=()),
-    UOp(Ops.DEVICE, dtypes.void, arg='PYTHON', src=()),)),
-  UOp(Ops.DEVICE, dtypes.void, arg='METAL', src=()),))
-"""
-# as you can see, it's specifying a copy from PYTHON device
-# which is where the [1,2,3,4] array lives
-
-# UOps are the specification language in tinygrad
-# they are immutable and form a DAG
-# they have a "Ops", a "dtype", a tuple of srcs (parents), and an arg
-
-t.realize()
-# if we want to "realize" a tensor, we can with the "realize" method
-# now when we look at the uop, it's changed
-print(t.uop)
-"""
-UOp(Ops.BUFFER, dtypes.int, arg=4, src=(
-  UOp(Ops.UNIQUE, dtypes.void, arg=1, src=()),
-  UOp(Ops.DEVICE, dtypes.void, arg='METAL', src=()),))
-"""
-# the copy was actually run, and now the "uop" of the Tensor is just a BUFFER
-# if you run this script with DEBUG=2 in the environment, you can see the copy happen
-# *** METAL      1 copy       16,   METAL <- PYTHON ...
-
-# now let's do some compute
-# we look at the uop to see the specification of the compute
-t_times_2 = t * 2
-print(t_times_2.uop)
-"""
-UOp(Ops.MUL, dtypes.int, arg=None, src=(
-  UOp(Ops.BUFFER, dtypes.int, arg=4, src=(
-    UOp(Ops.UNIQUE, dtypes.void, arg=1, src=()),
-    x2:=UOp(Ops.DEVICE, dtypes.void, arg='METAL', src=()),)),
-  UOp(Ops.EXPAND, dtypes.int, arg=(4,), src=(
-    UOp(Ops.RESHAPE, dtypes.int, arg=(1,), src=(
-      UOp(Ops.CONST, dtypes.int, arg=2, src=(
-        UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(), strides=(), offset=0, mask=None, contiguous=True),)), src=(
-           x2,)),)),)),)),))
-"""
-# the BUFFER from above is being multiplied by a CONST 2
-# it's RESHAPEd and EXPANDed to broadcast the CONST to the BUFFER
-
-# we can check the result with
-assert t_times_2.tolist() == [2, 4, 6, 8]
-
-# UOps are both immutable and globally unique
-# if i multiply the Tensor by 4 twice, these result Tensors will have the same uop specification
-t_times_4_try_1 = t * 4
-t_times_4_try_2 = t * 4
-assert t_times_4_try_1.uop is t_times_4_try_2.uop
-# the specification isn't just the same, it's the exact same Python object
-assert t_times_4_try_1 is not t_times_4_try_2
-# the Tensor is a different Python object
-
-# if we realize `t_times_4_try_1` ...
-t_times_4_try_1.realize()
-print(t_times_4_try_2.uop)
-"""
-UOp(Ops.BUFFER, dtypes.int, arg=4, src=(
-  UOp(Ops.UNIQUE, dtypes.void, arg=4, src=()),
-  UOp(Ops.DEVICE, dtypes.void, arg='METAL', src=()),))
-"""
-# ... `t_times_4_try_2` also becomes the same BUFFER
-assert t_times_4_try_1.uop is t_times_4_try_2.uop
-# so this print doesn't require any computation, just a copy back to the CPU so we can print it
-print("** only the copy start")
-print(t_times_4_try_2.tolist())  # [4, 8, 12, 16]
-print("** only the copy end")
-# you can confirm this with DEBUG=2, seeing what's printed in between the "**" prints
-
-# tinygrad has an auto differentiation engine that operates according to these same principles
-# the derivative of "log(x)" is "1/x", and you can see this on line 20 of gradient.py
-t_float = Tensor([3.0])
-t_log = t_float.log()
-t_log_grad, = t_log.sum().gradient(t_float)
-# due to how log is implemented, this gradient contains a lot of UOps
-print(t_log_grad.uop)
-# ...not shown here...
-# but if you run with DEBUG=4 (CPU=1 used here for simpler code), you can see the generated code
-"""
-void E_(float* restrict data0, float* restrict data1) {
-  float val0 = *(data1+0);
-  *(data0+0) = (1/val0);
-}
-"""
-# the derivative is close to 1/3
-assert (t_log_grad.item() - 1/3) < 1e-6
-
-# %% ********
-print("******* PART 2 *******")
-
-# we redefine the same t here so this cell can run on it's own
-from tinygrad import Tensor
-t = Tensor([1,2,3,4])
-
-# what's above gives you enough of an understanding to go use tinygrad as a library
-# however, a lot of the beauty of tinygrad is in how easy it is to interact with the internals
-# NOTE: the APIs here are subject to change
-
-t_plus_3_plus_4 = t + 3 + 4
-print(t_plus_3_plus_4.uop)
-"""
-UOp(Ops.ADD, dtypes.int, arg=None, src=(
-  UOp(Ops.ADD, dtypes.int, arg=None, src=(
-    UOp(Ops.BUFFER, dtypes.int, arg=4, src=(
-      UOp(Ops.UNIQUE, dtypes.void, arg=1, src=()),
-      x3:=UOp(Ops.DEVICE, dtypes.void, arg='CPU', src=()),)),
-    UOp(Ops.EXPAND, dtypes.int, arg=(4,), src=(
-      UOp(Ops.RESHAPE, dtypes.int, arg=(1,), src=(
-        UOp(Ops.CONST, dtypes.int, arg=3, src=(
-          x7:=UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(), strides=(), offset=0, mask=None, contiguous=True),)), src=(
-             x3,)),)),)),)),)),
-  UOp(Ops.EXPAND, dtypes.int, arg=(4,), src=(
-    UOp(Ops.RESHAPE, dtypes.int, arg=(1,), src=(
-      UOp(Ops.CONST, dtypes.int, arg=4, src=(
-         x7,)),)),)),))
-"""
-# you can see it's adding both 3 and 4
-
-# but by the time we are actually running the code, it's adding 7
-# `kernelize` will simplify and group the operations in the graph into kernels
-t_plus_3_plus_4.kernelize()
-print(t_plus_3_plus_4.uop)
-"""
-UOp(Ops.ASSIGN, dtypes.int, arg=None, src=(
-  x0:=UOp(Ops.BUFFER, dtypes.int, arg=4, src=(
-    UOp(Ops.UNIQUE, dtypes.void, arg=7, src=()),
-    x2:=UOp(Ops.DEVICE, dtypes.void, arg='CPU', src=()),)),
-  UOp(Ops.KERNEL, dtypes.void, arg=<Kernel 12 SINK(<Ops.STORE: 48>,) (__add__,)>, src=(
-     x0,
-    UOp(Ops.BUFFER, dtypes.int, arg=4, src=(
-      UOp(Ops.UNIQUE, dtypes.void, arg=1, src=()),
-       x2,)),)),))
-"""
-# ASSIGN has two srcs, src[0] is the BUFFER that's assigned to, and src[1] is the thing to assign
-# src[1] is the GPU Kernel that's going to be run
-# we can get the ast of the Kernel as follows
-kernel_ast = t_plus_3_plus_4.uop.src[1].arg.ast
-
-# almost everything in tinygrad functions as a rewrite of the UOps
-# the codegen rewrites the ast to a simplified form ready for "rendering"
-from tinygrad.codegen import full_rewrite_to_sink
-rewritten_ast = full_rewrite_to_sink(kernel_ast)
-print(rewritten_ast)
-"""
-UOp(Ops.SINK, dtypes.void, arg=None, src=(
-  UOp(Ops.STORE, dtypes.void, arg=None, src=(
-    UOp(Ops.INDEX, dtypes.int.ptr(4), arg=None, src=(
-      UOp(Ops.DEFINE_GLOBAL, dtypes.int.ptr(4), arg=0, src=()),
-      x3:=UOp(Ops.SPECIAL, dtypes.int, arg=('gidx0', 4), src=()),)),
-    UOp(Ops.ADD, dtypes.int, arg=None, src=(
-      UOp(Ops.LOAD, dtypes.int, arg=None, src=(
-        UOp(Ops.INDEX, dtypes.int.ptr(4), arg=None, src=(
-          UOp(Ops.DEFINE_GLOBAL, dtypes.int.ptr(4), arg=1, src=()),
-           x3,)),)),
-      UOp(Ops.CONST, dtypes.int, arg=7, src=()),)),)),))
-"""
-# you can see at this point we are adding 7, not 3 and 4
-
-# with DEBUG=4, we can see the code.
-# since optimizations are on, it UPCASTed the operation, explicitly writing out all 4 +7s
-t_plus_3_plus_4.realize()
-"""
-void E_4n2(int* restrict data0, int* restrict data1) {
-  int val0 = *(data1+0);
-  int val1 = *(data1+1);
-  int val2 = *(data1+2);
-  int val3 = *(data1+3);
-  *(data0+0) = (val0+7);
-  *(data0+1) = (val1+7);
-  *(data0+2) = (val2+7);
-  *(data0+3) = (val3+7);
-}
-"""
-# the function name E_4n2 is "E" for elementwise op (as opposed to "r" for reduce op)
-# "4" for the size, and "n2" for name deduping (it's the 3rd function with the same E and 4 in this session)
-# when you print the name with DEBUG=2, you'll see the 4 is yellow, meaning that it's upcasted
-# if you run with NOOPT=1 ...
-"""
-void E_4n2(int* restrict data0, int* restrict data1) {
-  for (int ridx0 = 0; ridx0 < 4; ridx0++) {
-    int val0 = *(data1+ridx0);
-    *(data0+ridx0) = (val0+7);
-  }
-}
-"""
-# ... you get this unoptimized code with a loop and the 4 is blue (for global). the color code is in kernel.py
-
-# %% ********
-print("******* PART 3 *******")
-
-# now, we go even lower and understand UOps better and how the graph rewrite engine works.
-# it's much simpler than what's in LLVM or MLIR
-
-from tinygrad import dtypes
-from tinygrad.uop.ops import UOp, Ops
-
-# first, we'll construct some const UOps
-a = UOp(Ops.CONST, dtypes.int, arg=2)
-b = UOp(Ops.CONST, dtypes.int, arg=2)
-
-# if you have been paying attention, you should know these are the same Python object
-assert a is b
-
-# UOps support normal Python math operations, so a_plus_b expresses the spec for 2 + 2
-a_plus_b = a + b
-print(a_plus_b)
-"""
-UOp(Ops.ADD, dtypes.int, arg=None, src=(
-  x0:=UOp(Ops.CONST, dtypes.int, arg=2, src=()),
-   x0,))
-"""
-
-# we could actually render this 2+2 into a language like c and run it
-# or, we can use tinygrad's graph rewrite engine to "constant fold"
-
-from tinygrad.uop.ops import graph_rewrite, UPat, PatternMatcher
-
-# a `PatternMatcher` is a list of tuples. for each element in the list:
-# [0] is the pattern to match, and [1] is the function to run.
-# this function can return either a UOp to replace the pattern with, or None to not replace
-simple_pm = PatternMatcher([
-  (UPat(Ops.ADD, src=(UPat(Ops.CONST, name="c1"), UPat(Ops.CONST, name="c2"))),
-   lambda c1,c2: UOp(Ops.CONST, dtype=c1.dtype, arg=c1.arg+c2.arg)),
-])
-# this pattern matches the addition of two CONST and rewrites it into a single CONST UOp
-
-# to actually apply the pattern to a_plus_b, we use graph_rewrite
-a_plus_b_simplified = graph_rewrite(a_plus_b, simple_pm)
-print(a_plus_b_simplified)
-"""
-UOp(Ops.CONST, dtypes.int, arg=4, src=())
-"""
-# 2+2 is in fact, 4
-
-# we can also use syntactic sugar to write the pattern nicer
-simpler_pm = PatternMatcher([
-  (UPat.cvar("c1")+UPat.cvar("c2"), lambda c1,c2: c1.const_like(c1.arg+c2.arg))
-])
-assert graph_rewrite(a_plus_b, simple_pm) is graph_rewrite(a_plus_b, simpler_pm)
-# note again the use of is, UOps are immutable and globally unique
-
-# %% ********
-
-# that brings you to an understanding of the most core concepts in tinygrad
-# you can run this with VIZ=1 to use the web based graph rewrite explorer
-# hopefully now you understand it. the nodes in the graph are just UOps
--- a/docs/tinybox.md
+++ b/docs/tinybox.md
@ -41,7 +41,7 @@ The BMC also has a web interface you can use if you find that easier.
 It is recommended that you change the BMC password after setting up the box, as the password on the screen is only the initial password.

 If you do decide to change the BMC password and no longer want the initial password to be displayed, remove the `/root/.bmc_password` file.
-Reboot after making these changes or restart the `displayservice.service` service.
+Reboot after making these changes or restart the `tinybox-display.service` service.

 ## What do I use it for?

--- a/examples/beautiful_mnist.py
+++ b/examples/beautiful_mnist.py
@ -21,7 +21,7 @@ if __name__ == "__main__":
  X_train, Y_train, X_test, Y_test = mnist(fashion=getenv("FASHION"))

  model = Model()
-  opt = (nn.optim.Adam if not getenv("MUON") else nn.optim.Muon)(nn.state.get_parameters(model))
+  opt = (nn.optim.Muon if getenv("MUON") else nn.optim.SGD if getenv("SGD") else nn.optim.Adam)(nn.state.get_parameters(model))

  @TinyJit
  @Tensor.train()
--- a/examples/coder.py
+++ b/examples/coder.py
@ -1,93 +0,0 @@
-#!/usr/bin/env python3
-import os, sys, traceback
-sys.path.append(os.getcwd())
-
-from io import StringIO
-from contextlib import redirect_stdout
-from tinygrad import Tensor, nn
-from tinygrad.helpers import Timing, colored, getenv, fetch
-from extra.models.llama import Transformer, convert_from_huggingface, fix_bf16
-from sentencepiece import SentencePieceProcessor
-
-def create_fixed_tokenizer(output_file):
-  print("creating fixed tokenizer")
-  import extra.junk.sentencepiece_model_pb2 as spb2
-  mp = spb2.ModelProto()
-  mp.ParseFromString(fetch("https://huggingface.co/teknium/OpenHermes-2.5-Mistral-7B/resolve/main/tokenizer.model?download=true").read_bytes())
-  mp.pieces.append(spb2.ModelProto.SentencePiece(piece="<|im_end|>", score=0))
-  mp.pieces.append(spb2.ModelProto.SentencePiece(piece="<|im_start|>", score=0))
-  with open(output_file, "wb") as f:
-    f.write(mp.SerializeToString())
-
-# example:
-# echo -en "write 2+2\nwrite hello world\ny\n" | TEMP=0 python3 examples/coder.py
-
-if __name__ == "__main__":
-  # https://huggingface.co/teknium/OpenHermes-2.5-Mistral-7B/blob/main/config.json
-  with Timing("create model: "):
-    model = Transformer(4096, 14336, n_heads=32, n_layers=32, norm_eps=1e-5, vocab_size=32002, n_kv_heads=8, max_context=4096, jit=getenv("JIT", 1))
-
-  with Timing("download weights: "):
-    part1 = nn.state.torch_load(fetch("https://huggingface.co/teknium/OpenHermes-2.5-Mistral-7B/resolve/main/pytorch_model-00001-of-00002.bin?download=true"))
-    part2 = nn.state.torch_load(fetch("https://huggingface.co/teknium/OpenHermes-2.5-Mistral-7B/resolve/main/pytorch_model-00002-of-00002.bin?download=true"))
-
-  with Timing("weights -> model: "):
-    nn.state.load_state_dict(model, fix_bf16(convert_from_huggingface(part1, 32, 32, 8)), strict=False)
-    nn.state.load_state_dict(model, fix_bf16(convert_from_huggingface(part2, 32, 32, 8)), strict=False)
-
-  if not os.path.isfile("/tmp/tokenizer.model"): create_fixed_tokenizer("/tmp/tokenizer.model")
-  spp = SentencePieceProcessor(model_file="/tmp/tokenizer.model")
-
-  # https://huggingface.co/teknium/OpenHermes-2.5-Mistral-7B/blob/main/tokenizer_config.json
-  #   "chat_template": "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
-  IM_END = 32000
-  IM_START = 32001
-  def encode_prompt(k, v): return [IM_START]+spp.encode(f"{k}\n{v}")+[IM_END]+spp.encode("\n")
-  def start_prompt(k): return [IM_START]+spp.encode(f"{k}\n")
-  def output(outputted, toks, color):
-    cur = spp.decode(toks)[len(outputted):]
-    sys.stdout.write(colored(cur, color))
-    sys.stdout.flush()
-    outputted += cur
-    return outputted
-
-  # *** app below this line ***
-
-  toks = [spp.bos_id()] + encode_prompt("system", "You are Quentin. Quentin is a useful assistant who writes Python code to answer questions. He keeps the code as short as possible and doesn't read from user input")
-
-  PROMPT = getenv("PROMPT", 1)
-  temperature = getenv("TEMP", 0.7)
-
-  start_pos = 0
-  outputted = output("", toks, "green")
-  turn = True
-  while 1:
-    if PROMPT:
-      toks += encode_prompt("user", input("Q: ")) + start_prompt("assistant")
-    else:
-      toks += start_prompt("user" if turn else "assistant")
-      turn = not turn
-    old_output_len = len(outputted)
-    while 1:
-      tok = model(Tensor([toks[start_pos:]]), start_pos, temperature).item()
-      start_pos = len(toks)
-      toks.append(tok)
-      outputted = output(outputted, toks, "blue" if not turn else "cyan")
-      if tok == IM_END: break
-      if tok == spp.eos_id(): break
-      new_output = outputted[old_output_len:]
-
-      if new_output.endswith("```") and '```python\n' in new_output:
-        python_code = new_output.split('```python\n')[1].split("```")[0]
-        # AI safety. Warning to user. Do not press y if the AI is trying to do unsafe things.
-        if input(colored(f" <-- PYTHON DETECTED, RUN IT? ", "red")).lower() == 'y':
-          my_stdout = StringIO()
-          try:
-            with redirect_stdout(my_stdout): exec(python_code)
-            result = my_stdout.getvalue()
-          except Exception as e:
-            result = ''.join(traceback.format_exception_only(e))
-          toks += spp.encode(f"\nOutput:\n```\n{result}```")
-          outputted = output(outputted, toks, "yellow")
-          old_output_len = len(outputted)
-    print("")
--- a/examples/conversation.py
+++ b/examples/conversation.py
@ -1,341 +0,0 @@
-import argparse
-import multiprocessing as mp
-import os
-import re
-import sys
-import time
-from contextlib import contextmanager
-from pathlib import Path
-
-import numpy as np
-import pyaudio
-import yaml
-from llama import LLaMa
-from vits import MODELS as VITS_MODELS
-from vits import Y_LENGTH_ESTIMATE_SCALARS, HParams, Synthesizer, TextMapper, get_hparams_from_file, load_model
-from whisper import init_whisper, transcribe_waveform
-from sentencepiece import SentencePieceProcessor
-
-from tinygrad.helpers import Timing, fetch
-from tinygrad import Tensor, dtypes
-
-# Whisper constants
-RATE = 16000
-CHUNK = 1600
-
-# LLaMa constants
-IM_START = 32001
-IM_END = 32002
-
-
-# Functions for encoding prompts to chatml md
-def encode_prompt(spp, k, v): return [IM_START]+spp.encode(f"{k}\n{v}")+[IM_END]+spp.encode("\n")
-def start_prompt(spp, k): return [IM_START]+spp.encode(f"{k}\n")
-
-def chunks(lst, n):
-  for i in range(0, len(lst), n): yield lst[i:i + n]
-
-def create_fixed_tokenizer():
-  """Function needed for extending tokenizer with additional chat tokens"""
-  import extra.junk.sentencepiece_model_pb2 as spb2
-  tokenizer_path = fetch("https://huggingface.co/TinyLlama/TinyLlama-1.1B-Chat-v0.4/resolve/main/tokenizer.model")
-  if SentencePieceProcessor(model_file=str(tokenizer_path)).vocab_size() != 32003:
-    print("creating fixed tokenizer")
-    mp = spb2.ModelProto()
-    mp.ParseFromString(tokenizer_path.read_bytes())
-    # https://huggingface.co/TinyLlama/TinyLlama-1.1B-Chat-v0.4/blob/main/added_tokens.json
-    mp.pieces.append(spb2.ModelProto.SentencePiece(piece="[PAD]", score=0))
-    mp.pieces.append(spb2.ModelProto.SentencePiece(piece="<|im_start|>", score=0))
-    mp.pieces.append(spb2.ModelProto.SentencePiece(piece="<|im_end|>", score=0))
-    tokenizer_path.write_bytes(mp.SerializeToString())
-  return tokenizer_path
-
-def llama_prepare(llama: LLaMa, temperature: float, pre_prompt_path: Path) -> tuple[list[int], str, str, str]:
-  """Prepares a llama model from a specified pre-prompt file"""
-  with open(str(pre_prompt_path)) as f:
-    config = yaml.safe_load(f.read())
-  toks = [llama.tokenizer.bos_id()] + encode_prompt(llama.tokenizer, "system", config["pre_prompt"].replace("\n", " "))
-  for i in config["examples"]:
-    toks += encode_prompt(llama.tokenizer, config["user_delim"], i["user_prompt"])
-    toks += encode_prompt(llama.tokenizer, config["resp_delim"], i["resp_prompt"])
-  llama.model(Tensor([toks]), 0, temperature).realize()  # NOTE: outputs are not used
-  return toks, config["user_delim"], config["resp_delim"], len(toks), llama.tokenizer.decode(toks)
-
-def llama_generate(
-  llama: LLaMa,
-  toks: list[int],
-  outputted: str,
-  prompt: str,
-  start_pos: int,
-  user_delim: str,
-  resp_delim: str,
-  temperature=0.7,
-  max_tokens=1000
-):
-  """Generates an output for the specified prompt"""
-  toks += encode_prompt(llama.tokenizer, user_delim, prompt)
-  toks += start_prompt(llama.tokenizer, resp_delim)
-
-  outputted = llama.tokenizer.decode(toks)
-  init_length = len(outputted)
-  for _ in range(max_tokens):
-    token = llama.model(Tensor([toks[start_pos:]]), start_pos, temperature).item()
-    start_pos = len(toks)
-    toks.append(token)
-
-    cur = llama.tokenizer.decode(toks)
-
-    # Print is just for debugging
-    sys.stdout.write(cur[len(outputted):])
-    sys.stdout.flush()
-    outputted = cur
-    if toks[-1] == IM_END: break
-  else:
-    toks.append(IM_END)
-  print() # because the output is flushed
-  return outputted, start_pos, outputted[init_length:].replace("<|im_end|>", "")
-
-def tts(
-  text_to_synthesize: str,
-  synth: Synthesizer,
-  hps: HParams,
-  emotion_embedding: Path,
-  speaker_id: int,
-  model_to_use: str,
-  noise_scale: float,
-  noise_scale_w: float,
-  length_scale: float,
-  estimate_max_y_length: bool,
-  text_mapper: TextMapper,
-  model_has_multiple_speakers: bool,
-  pad_length=600,
-  vits_pad_length=1000
-):
-  if model_to_use == "mmts-tts": text_to_synthesize = text_mapper.filter_oov(text_to_synthesize.lower())
-
-  # Convert the input text to a tensor.
-  stn_tst = text_mapper.get_text(text_to_synthesize, hps.data.add_blank, hps.data.text_cleaners)
-  init_shape = stn_tst.shape
-  assert init_shape[0] < pad_length, "text is too long"
-  x_tst, x_tst_lengths = stn_tst.pad(((0, pad_length - init_shape[0]),), value=1).unsqueeze(0), Tensor([init_shape[0]], dtype=dtypes.int64)
-  sid = Tensor([speaker_id], dtype=dtypes.int64) if model_has_multiple_speakers else None
-
-  # Perform inference.
-  audio_tensor = synth.infer(x_tst, x_tst_lengths, sid, noise_scale, length_scale, noise_scale_w, emotion_embedding=emotion_embedding,
-                             max_y_length_estimate_scale=Y_LENGTH_ESTIMATE_SCALARS[model_to_use] if estimate_max_y_length else None, pad_length=vits_pad_length)[0, 0]
-  # Save the audio output.
-  audio_data = (np.clip(audio_tensor.numpy(), -1.0, 1.0) * 32767).astype(np.int16)
-  return audio_data
-
-def init_vits(
-  model_to_use: str,
-  emotion_path: Path,
-  speaker_id: int,
-  seed: int,
-):
-  model_config = VITS_MODELS[model_to_use]
-
-  # Load the hyperparameters from the config file.
-  hps = get_hparams_from_file(fetch(model_config[0]))
-
-  # If model has multiple speakers, validate speaker id and retrieve name if available.
-  model_has_multiple_speakers = hps.data.n_speakers > 0
-  if model_has_multiple_speakers:
-    if speaker_id >= hps.data.n_speakers: raise ValueError(f"Speaker ID {speaker_id} is invalid for this model.")
-    if hps.__contains__("speakers"): # maps speaker ids to names
-      speakers = hps.speakers
-      if isinstance(speakers, list): speakers = {speaker: i for i, speaker in enumerate(speakers)}
-
-  # Load emotions if any. TODO: find an english model with emotions, this is untested atm.
-  emotion_embedding = None
-  if emotion_path is not None:
-    if emotion_path.endswith(".npy"): emotion_embedding = Tensor(np.load(emotion_path), dtype=dtypes.int64).unsqueeze(0)
-    else: raise ValueError("Emotion path must be a .npy file.")
-
-  # Load symbols, instantiate TextMapper and clean the text.
-  if hps.__contains__("symbols"): symbols = hps.symbols
-  elif model_to_use == "mmts-tts": symbols = [x.replace("\n", "") for x in fetch("https://huggingface.co/facebook/mms-tts/raw/main/full_models/eng/vocab.txt").open(encoding="utf-8").readlines()]
-  else: symbols = ['_'] + list(';:,.!?¡¿—…"«»“” ') + list('ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz') + list("ɑɐɒæɓʙβɔɕçɗɖðʤəɘɚɛɜɝɞɟʄɡɠɢʛɦɧħɥʜɨɪʝɭɬɫɮʟɱɯɰŋɳɲɴøɵɸθœɶʘɹɺɾɻʀʁɽʂʃʈʧʉʊʋⱱʌɣɤʍχʎʏʑʐʒʔʡʕʢǀǁǂǃˈˌːˑʼʴʰʱʲʷˠˤ˞↓↑→↗↘'̩'ᵻ")
-  text_mapper = TextMapper(apply_cleaners=True, symbols=symbols)
-
-  # Load the model.
-  if seed is not None:
-    Tensor.manual_seed(seed)
-    np.random.seed(seed)
-  net_g = load_model(text_mapper.symbols, hps, model_config)
-
-  return net_g, emotion_embedding, text_mapper, hps, model_has_multiple_speakers
-
-@contextmanager
-def output_stream(num_channels: int, sample_rate: int):
-  try:
-    p = pyaudio.PyAudio()
-    stream = p.open(format=pyaudio.paInt16, channels=num_channels, rate=sample_rate, output=True)
-    yield stream
-  except KeyboardInterrupt: pass
-  finally:
-    stream.stop_stream()
-    stream.close()
-    p.terminate()
-
-@contextmanager
-def log_writer():
-  try:
-    logs = []
-    yield logs
-  finally:
-    sep = "="*os.get_terminal_size()[1]
-    print(f"{sep[:-1]}\nCHAT LOG")
-    print(*logs, sep="\n")
-    print(sep)
-
-def listener(q: mp.Queue, event: mp.Event):
-  try:
-    p = pyaudio.PyAudio()
-    stream = p.open(format=pyaudio.paInt16, channels=1, rate=RATE, input=True, frames_per_buffer=CHUNK)
-    did_print = False
-    while True:
-      data = stream.read(CHUNK) # read data to avoid overflow
-      if event.is_set():
-        if not did_print:
-          print("listening")
-          did_print = True
-        q.put(((np.frombuffer(data, np.int16)/32768).astype(np.float32)*3))
-      else:
-        did_print = False
-  finally:
-    stream.stop_stream()
-    stream.close()
-    p.terminate()
-
-def mp_output_stream(q: mp.Queue, counter: mp.Value, num_channels: int, sample_rate: int):
-  with output_stream(num_channels, sample_rate) as stream:
-    while True:
-      try:
-        stream.write(q.get())
-        counter.value += 1
-      except KeyboardInterrupt:
-        break
-
-if __name__ == "__main__":
-  import nltk
-  nltk.download("punkt")
-  # Parse CLI arguments
-  parser = argparse.ArgumentParser("Have a tiny conversation with tinygrad")
-
-  # Whisper args
-  parser.add_argument("--whisper_model_name", type=str, default="tiny.en")
-
-  # LLAMA args
-  parser.add_argument("--llama_pre_prompt_path", type=Path, default=Path(__file__).parent / "conversation_data" / "pre_prompt_stacy.yaml", help="Path to yaml file which contains all pre-prompt data needed. ")
-  parser.add_argument("--llama_count", type=int, default=1000, help="Max number of tokens to generate")
-  parser.add_argument("--llama_temperature", type=float, default=0.7, help="Temperature in the softmax")
-  parser.add_argument("--llama_quantize", type=str, default=None, help="Quantize the weights to int8 or nf4 in memory")
-  parser.add_argument("--llama_model", type=Path, default=None, help="Folder with the original weights to load, or single .index.json, .safetensors or .bin file")
-  parser.add_argument("--llama_gen", type=str, default="tiny", required=False, help="Generation of the model to use")
-  parser.add_argument("--llama_size", type=str, default="1B-Chat", required=False, help="Size of model to use")
-  parser.add_argument("--llama_tokenizer", type=Path, default=None, required=False, help="Path to llama tokenizer.model")
-
-  # vits args
-  parser.add_argument("--vits_model_to_use", default="vctk", help="Specify the model to use. Default is 'vctk'.")
-  parser.add_argument("--vits_speaker_id", type=int, default=12, help="Specify the speaker ID. Default is 6.")
-  parser.add_argument("--vits_noise_scale", type=float, default=0.667, help="Specify the noise scale. Default is 0.667.")
-  parser.add_argument("--vits_noise_scale_w", type=float, default=0.8, help="Specify the noise scale w. Default is 0.8.")
-  parser.add_argument("--vits_length_scale", type=float, default=1, help="Specify the length scale. Default is 1.")
-  parser.add_argument("--vits_seed", type=int, default=None, help="Specify the seed (set to None if no seed). Default is 1337.")
-  parser.add_argument("--vits_num_channels", type=int, default=1, help="Specify the number of audio output channels. Default is 1.")
-  parser.add_argument("--vits_sample_width", type=int, default=2, help="Specify the number of bytes per sample, adjust if necessary. Default is 2.")
-  parser.add_argument("--vits_emotion_path", type=Path, default=None, help="Specify the path to emotion reference.")
-  parser.add_argument("--vits_estimate_max_y_length", type=str, default=False, help="If true, overestimate the output length and then trim it to the correct length, to prevent premature realization, much more performant for larger inputs, for smaller inputs not so much. Default is False.")
-  parser.add_argument("--vits_vocab_path", type=Path, default=None, help="Path to the TTS vocabulary.")
-
-  # conversation args
-  parser.add_argument("--max_sentence_length", type=int, default=20, help="Max words in one sentence to pass to vits")
-
-  args = parser.parse_args()
-
-  # Init models
-  model, enc = init_whisper(args.whisper_model_name)
-  synth, emotion_embedding, text_mapper, hps, model_has_multiple_speakers = init_vits(args.vits_model_to_use, args.vits_emotion_path, args.vits_speaker_id, args.vits_seed)
-
-  # Download tinyllama chat as a default model
-  if args.llama_model is None:
-    args.llama_model = fetch("https://huggingface.co/TinyLlama/TinyLlama-1.1B-Chat-v0.4/resolve/main/model.safetensors", "tinyllamachat.safetensors")
-    args.llama_gen = "tiny"
-    args.llama_size = "1B-Chat"
-  # Add 3 more tokens to the tokenizer
-  if args.llama_gen == "tiny" and args.llama_size.endswith("Chat"): args.llama_tokenizer = create_fixed_tokenizer()
-  tokenizer_path = args.llama_tokenizer or args.llama_model.parent / "tokenizer.model"
-  llama = LLaMa.build(args.llama_model, tokenizer_path, args.llama_gen, args.llama_size, args.llama_quantize)
-  toks, user_delim, resp_delim, start_pos, outputted = llama_prepare(llama, args.llama_temperature, args.llama_pre_prompt_path)
-
-  # Start child process for mic input
-  q = mp.Queue()
-  is_listening_event = mp.Event()
-  p = mp.Process(target=listener, args=(q, is_listening_event,))
-  p.daemon = True
-  p.start()
-
-  # Start child process for speaker output
-  out_q = mp.Queue()
-  out_counter = mp.Value("i", 0)
-  out_p = mp.Process(target=mp_output_stream, args=(out_q, out_counter, args.vits_num_channels, hps.data.sampling_rate,))
-  out_p.daemon = True
-  out_p.start()
-
-  # JIT tts
-  for i in ["Hello, I'm a chat bot", "I am capable of doing a lot of things"]:
-    tts(
-      i, synth, hps, emotion_embedding,
-      args.vits_speaker_id, args.vits_model_to_use, args.vits_noise_scale,
-      args.vits_noise_scale_w, args.vits_length_scale,
-      args.vits_estimate_max_y_length, text_mapper, model_has_multiple_speakers
-    )
-
-  # Start the pipeline
-  with log_writer() as log:
-    while True:
-      tokens = [enc._special_tokens["<|startoftranscript|>"], enc._special_tokens["<|notimestamps|>"]]
-      total = np.array([])
-      out_counter.value = 0
-
-      s = time.perf_counter()
-      is_listening_event.set()
-      prev_text = None
-      while True:
-        for _ in range(RATE // CHUNK): total = np.concatenate([total, q.get()])
-        txt = transcribe_waveform(model, enc, [total], truncate=True)
-        print(txt, end="\r")
-        if txt == "[BLANK_AUDIO]" or re.match(r"^\([\w+ ]+\)$", txt.strip()): continue
-        if prev_text is not None and prev_text == txt:
-          is_listening_event.clear()
-          break
-        prev_text = txt
-      print() # to avoid llama printing on the same line
-      log.append(f"{user_delim.capitalize()}: {txt}")
-
-      # Generate with llama
-      with Timing("llama generation: "):
-        outputted, start_pos, response = llama_generate(
-          llama, toks, outputted, txt, start_pos,
-          user_delim=user_delim, resp_delim=resp_delim, temperature=args.llama_temperature,
-          max_tokens=args.llama_count
-        )
-        log.append(f"{resp_delim.capitalize()}: {response}")
-
-      # Convert to voice
-      with Timing("tts: "):
-        sentences = nltk.sent_tokenize(response.replace('"', ""))
-        for i in sentences:
-          total = np.array([], dtype=np.int16)
-          for j in chunks(i.split(), args.max_sentence_length):
-            audio_data = tts(
-              " ".join(j), synth, hps, emotion_embedding,
-              args.vits_speaker_id, args.vits_model_to_use, args.vits_noise_scale,
-              args.vits_noise_scale_w, args.vits_length_scale,
-              args.vits_estimate_max_y_length, text_mapper, model_has_multiple_speakers
-            )
-            total = np.concatenate([total, audio_data])
-          out_q.put(total.tobytes())
-      while out_counter.value < len(sentences): continue
-      log.append(f"Total: {time.perf_counter() - s}")
--- a/examples/efficientnet.py
+++ b/examples/efficientnet.py
@ -1,89 +0,0 @@
-# load weights from
-# https://github.com/lukemelas/EfficientNet-PyTorch/releases/download/1.0/efficientnet-b0-355c32eb.pth
-# a rough copy of
-# https://github.com/lukemelas/EfficientNet-PyTorch/blob/master/efficientnet_pytorch/model.py
-import sys
-import ast
-import time
-import numpy as np
-from PIL import Image
-from tinygrad.tensor import Tensor
-from tinygrad.helpers import getenv, fetch, Timing
-from tinygrad.engine.jit import TinyJit
-from extra.models.efficientnet import EfficientNet
-np.set_printoptions(suppress=True)
-
-# TODO: you should be able to put these in the jitted function
-bias = Tensor([0.485, 0.456, 0.406])
-scale = Tensor([0.229, 0.224, 0.225])
-
-@TinyJit
-def _infer(model, img):
-  img = img.permute((2,0,1))
-  img = img / 255.0
-  img = img - bias.reshape((1,-1,1,1))
-  img = img / scale.reshape((1,-1,1,1))
-  return model.forward(img).realize()
-
-def infer(model, img):
-  # preprocess image
-  aspect_ratio = img.size[0] / img.size[1]
-  img = img.resize((int(224*max(aspect_ratio,1.0)), int(224*max(1.0/aspect_ratio,1.0))))
-
-  img = np.array(img)
-  y0,x0=(np.asarray(img.shape)[:2]-224)//2
-  retimg = img = img[y0:y0+224, x0:x0+224]
-
-  # if you want to look at the image
-  """
-  import matplotlib.pyplot as plt
-  plt.imshow(img)
-  plt.show()
-  """
-
-  # run the net
-  out = _infer(model, Tensor(img.astype("float32"))).numpy()
-
-  # if you want to look at the outputs
-  """
-  import matplotlib.pyplot as plt
-  plt.plot(out[0])
-  plt.show()
-  """
-  return out, retimg
-
-if __name__ == "__main__":
-  # instantiate my net
-  model = EfficientNet(getenv("NUM", 0))
-  model.load_from_pretrained()
-
-  # category labels
-  lbls = ast.literal_eval(fetch("https://gist.githubusercontent.com/yrevar/942d3a0ac09ec9e5eb3a/raw/238f720ff059c1f82f368259d1ca4ffa5dd8f9f5/imagenet1000_clsidx_to_labels.txt").read_text())
-
-  # load image and preprocess
-  url = sys.argv[1] if len(sys.argv) >= 2 else "https://raw.githubusercontent.com/tinygrad/tinygrad/master/docs/showcase/stable_diffusion_by_tinygrad.jpg"
-  if url == 'webcam':
-    import cv2
-    cap = cv2.VideoCapture(0)
-    cap.set(cv2.CAP_PROP_BUFFERSIZE, 1)
-    while 1:
-      _ = cap.grab() # discard one frame to circumvent capture buffering
-      ret, frame = cap.read()
-      img = Image.fromarray(frame[:, :, [2,1,0]])
-      lt = time.monotonic_ns()
-      out, retimg = infer(model, img)
-      print(f"{(time.monotonic_ns()-lt)*1e-6:7.2f} ms", np.argmax(out), np.max(out), lbls[np.argmax(out)])
-      SCALE = 3
-      simg = cv2.resize(retimg, (224*SCALE, 224*SCALE))
-      retimg = cv2.cvtColor(simg, cv2.COLOR_RGB2BGR)
-      cv2.imshow('capture', retimg)
-      if cv2.waitKey(1) & 0xFF == ord('q'):
-        break
-    cap.release()
-    cv2.destroyAllWindows()
-  else:
-    img = Image.open(fetch(url))
-    for i in range(getenv("CNT", 1)):
-      with Timing("did inference in "):
-        out, _ = infer(model, img)
-        print(np.argmax(out), np.max(out), lbls[np.argmax(out)])
--- a/examples/flux1.py
+++ b/examples/flux1.py
@ -1,498 +0,0 @@
-# pip3 install sentencepiece
-
-# This file incorporates code from the following:
-# Github Name                    | License | Link
-# black-forest-labs/flux         | Apache  | https://github.com/black-forest-labs/flux/tree/main/model_licenses
-
-from tinygrad import Tensor, nn, dtypes, TinyJit
-from tinygrad.nn.state import safe_load, load_state_dict
-from tinygrad.helpers import fetch, tqdm, colored
-from sdxl import FirstStage
-from extra.models.clip import FrozenClosedClipEmbedder
-from extra.models.t5 import T5Embedder
-import numpy as np
-
-import math, time, argparse, tempfile
-from typing import List, Dict, Optional, Union, Tuple, Callable
-from dataclasses import dataclass
-from pathlib import Path
-from PIL import Image
-
-urls:dict = {
-  "flux-schnell": "https://huggingface.co/black-forest-labs/FLUX.1-schnell/resolve/main/flux1-schnell.safetensors",
-  "flux-dev": "https://huggingface.co/camenduru/FLUX.1-dev/resolve/main/flux1-dev.sft",
-  "ae": "https://huggingface.co/black-forest-labs/FLUX.1-schnell/resolve/main/ae.safetensors",
-  "T5_1_of_2": "https://huggingface.co/black-forest-labs/FLUX.1-schnell/resolve/main/text_encoder_2/model-00001-of-00002.safetensors",
-  "T5_2_of_2": "https://huggingface.co/black-forest-labs/FLUX.1-schnell/resolve/main/text_encoder_2/model-00002-of-00002.safetensors",
-  "T5_tokenizer": "https://huggingface.co/black-forest-labs/FLUX.1-schnell/resolve/main/tokenizer_2/spiece.model",
-  "clip": "https://huggingface.co/black-forest-labs/FLUX.1-schnell/resolve/main/text_encoder/model.safetensors"
-}
-
-def tensor_identity(x:Tensor) -> Tensor: return x
-
-class AutoEncoder:
-  def __init__(self, scale_factor:float, shift_factor:float):
-    self.decoder = FirstStage.Decoder(128, 3, 3, 16, [1, 2, 4, 4], 2, 256)
-    self.scale_factor = scale_factor
-    self.shift_factor = shift_factor
-
-  def decode(self, z:Tensor) -> Tensor:
-    z = z / self.scale_factor + self.shift_factor
-    return self.decoder(z)
-
-# Conditioner
-class ClipEmbedder(FrozenClosedClipEmbedder):
-  def __call__(self, texts:Union[str, List[str], Tensor]) -> Tensor:
-    if isinstance(texts, str): texts = [texts]
-    assert isinstance(texts, (list,tuple)), f"expected list of strings, got {type(texts).__name__}"
-    tokens = Tensor.cat(*[Tensor(self.tokenizer.encode(text)) for text in texts], dim=0)
-    return self.transformer.text_model(tokens.reshape(len(texts),-1))[:, tokens.argmax(-1)]
-
-# https://github.com/black-forest-labs/flux/blob/main/src/flux/math.py
-def attention(q:Tensor, k:Tensor, v:Tensor, pe:Tensor) -> Tensor:
-  q, k = apply_rope(q, k, pe)
-  x = Tensor.scaled_dot_product_attention(q, k, v)
-  return x.rearrange("B H L D -> B L (H D)")
-
-def rope(pos:Tensor, dim:int, theta:int) -> Tensor:
-  assert dim % 2 == 0
-  scale = Tensor.arange(0, dim, 2, dtype=dtypes.float32, device=pos.device) / dim # NOTE: this is torch.float64 in reference implementation
-  omega = 1.0 / (theta**scale)
-  out = Tensor.einsum("...n,d->...nd", pos, omega)
-  out = Tensor.stack(Tensor.cos(out), -Tensor.sin(out), Tensor.sin(out), Tensor.cos(out), dim=-1)
-  out = out.rearrange("b n d (i j) -> b n d i j", i=2, j=2)
-  return out.float()
-
-def apply_rope(xq:Tensor, xk:Tensor, freqs_cis:Tensor) -> Tuple[Tensor, Tensor]:
-  xq_ = xq.float().reshape(*xq.shape[:-1], -1, 1, 2)
-  xk_ = xk.float().reshape(*xk.shape[:-1], -1, 1, 2)
-  xq_out = freqs_cis[..., 0] * xq_[..., 0] + freqs_cis[..., 1] * xq_[..., 1]
-  xk_out = freqs_cis[..., 0] * xk_[..., 0] + freqs_cis[..., 1] * xk_[..., 1]
-  return xq_out.reshape(*xq.shape).cast(xq.dtype), xk_out.reshape(*xk.shape).cast(xk.dtype)
-
-
-# https://github.com/black-forest-labs/flux/blob/main/src/flux/modules/layers.py
-class EmbedND:
-  def __init__(self, dim:int, theta:int, axes_dim:List[int]):
-    self.dim = dim
-    self.theta = theta
-    self.axes_dim = axes_dim
-
-  def __call__(self, ids:Tensor) -> Tensor:
-    n_axes = ids.shape[-1]
-    emb = Tensor.cat(*[rope(ids[..., i], self.axes_dim[i], self.theta) for i in range(n_axes)], dim=-3)
-    return emb.unsqueeze(1)
-
-class MLPEmbedder:
-  def __init__(self, in_dim:int, hidden_dim:int):
-    self.in_layer = nn.Linear(in_dim, hidden_dim, bias=True)
-    self.out_layer = nn.Linear(hidden_dim, hidden_dim, bias=True)
-
-  def __call__(self, x:Tensor) -> Tensor:
-    return self.out_layer(self.in_layer(x).silu())
-
-class QKNorm:
-  def __init__(self, dim:int):
-    self.query_norm = nn.RMSNorm(dim)
-    self.key_norm = nn.RMSNorm(dim)
-
-  def __call__(self, q:Tensor, k:Tensor) -> Tuple[Tensor, Tensor]:
-    return self.query_norm(q), self.key_norm(k)
-
-class SelfAttention:
-  def __init__(self, dim:int, num_heads:int = 8, qkv_bias:bool = False):
-    self.num_heads = num_heads
-    head_dim = dim // num_heads
-
-    self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
-    self.norm = QKNorm(head_dim)
-    self.proj = nn.Linear(dim, dim)
-
-  def __call__(self, x:Tensor, pe:Tensor) -> Tensor:
-    qkv = self.qkv(x)
-    q, k, v = qkv.rearrange("B L (K H D) -> K B H L D", K=3, H=self.num_heads)
-    q, k = self.norm(q, k)
-    x = attention(q, k, v, pe=pe)
-    return self.proj(x)
-
-@dataclass
-class ModulationOut:
-  shift:Tensor
-  scale:Tensor
-  gate:Tensor
-
-class Modulation:
-  def __init__(self, dim:int, double:bool):
-    self.is_double = double
-    self.multiplier = 6 if double else 3
-    self.lin = nn.Linear(dim, self.multiplier * dim, bias=True)
-
-  def __call__(self, vec:Tensor) -> Tuple[ModulationOut, Optional[ModulationOut]]:
-    out = self.lin(vec.silu())[:, None, :].chunk(self.multiplier, dim=-1)
-    return ModulationOut(*out[:3]), ModulationOut(*out[3:]) if self.is_double else None
-
-class DoubleStreamBlock:
-  def __init__(self, hidden_size:int, num_heads:int, mlp_ratio:float, qkv_bias:bool = False):
-    mlp_hidden_dim = int(hidden_size * mlp_ratio)
-    self.num_heads = num_heads
-    self.hidden_size = hidden_size
-    self.img_mod = Modulation(hidden_size, double=True)
-    self.img_norm1 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
-    self.img_attn = SelfAttention(dim=hidden_size, num_heads=num_heads, qkv_bias=qkv_bias)
-
-    self.img_norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
-    self.img_mlp = [nn.Linear(hidden_size, mlp_hidden_dim, bias=True), Tensor.gelu, nn.Linear(mlp_hidden_dim, hidden_size, bias=True)]
-
-    self.txt_mod = Modulation(hidden_size, double=True)
-    self.txt_norm1 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
-    self.txt_attn = SelfAttention(dim=hidden_size, num_heads=num_heads, qkv_bias=qkv_bias)
-
-    self.txt_norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
-    self.txt_mlp = [nn.Linear(hidden_size, mlp_hidden_dim, bias=True), Tensor.gelu, nn.Linear(mlp_hidden_dim, hidden_size, bias=True)]
-
-  def __call__(self, img:Tensor, txt:Tensor, vec:Tensor, pe:Tensor) -> tuple[Tensor, Tensor]:
-    img_mod1, img_mod2 = self.img_mod(vec)
-    txt_mod1, txt_mod2 = self.txt_mod(vec)
-    assert img_mod2 is not None and txt_mod2 is not None
-    # prepare image for attention
-    img_modulated = self.img_norm1(img)
-    img_modulated = (1 + img_mod1.scale) * img_modulated + img_mod1.shift
-    img_qkv = self.img_attn.qkv(img_modulated)
-    img_q, img_k, img_v = img_qkv.rearrange("B L (K H D) -> K B H L D", K=3, H=self.num_heads)
-    img_q, img_k = self.img_attn.norm(img_q, img_k)
-
-    # prepare txt for attention
-    txt_modulated = self.txt_norm1(txt)
-    txt_modulated = (1 + txt_mod1.scale) * txt_modulated + txt_mod1.shift
-    txt_qkv = self.txt_attn.qkv(txt_modulated)
-    txt_q, txt_k, txt_v = txt_qkv.rearrange("B L (K H D) -> K B H L D", K=3, H=self.num_heads)
-    txt_q, txt_k = self.txt_attn.norm(txt_q, txt_k)
-
-    # run actual attention
-    q = Tensor.cat(txt_q, img_q, dim=2)
-    k = Tensor.cat(txt_k, img_k, dim=2)
-    v = Tensor.cat(txt_v, img_v, dim=2)
-
-    attn = attention(q, k, v, pe=pe)
-    txt_attn, img_attn = attn[:, : txt.shape[1]], attn[:, txt.shape[1] :]
-
-    # calculate the img bloks
-    img = img + img_mod1.gate * self.img_attn.proj(img_attn)
-    img = img + img_mod2.gate * ((1 + img_mod2.scale) * self.img_norm2(img) + img_mod2.shift).sequential(self.img_mlp)
-
-    # calculate the txt bloks
-    txt = txt + txt_mod1.gate * self.txt_attn.proj(txt_attn)
-    txt = txt + txt_mod2.gate * ((1 + txt_mod2.scale) * self.txt_norm2(txt) + txt_mod2.shift).sequential(self.txt_mlp)
-    return img, txt
-
-
-class SingleStreamBlock:
-  """
-  A DiT block with parallel linear layers as described in
-  https://arxiv.org/abs/2302.05442 and adapted modulation interface.
-  """
-
-  def __init__(self,hidden_size:int, num_heads:int, mlp_ratio:float=4.0, qk_scale:Optional[float]=None):
-    self.hidden_dim = hidden_size
-    self.num_heads = num_heads
-    head_dim = hidden_size // num_heads
-    self.scale = qk_scale or head_dim**-0.5
-
-    self.mlp_hidden_dim = int(hidden_size * mlp_ratio)
-    # qkv and mlp_in
-    self.linear1 = nn.Linear(hidden_size, hidden_size * 3 + self.mlp_hidden_dim)
-    # proj and mlp_out
-    self.linear2 = nn.Linear(hidden_size + self.mlp_hidden_dim, hidden_size)
-
-    self.norm = QKNorm(head_dim)
-
-    self.hidden_size = hidden_size
-    self.pre_norm = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
-
-    self.mlp_act = Tensor.gelu
-    self.modulation = Modulation(hidden_size, double=False)
-
-  def __call__(self, x:Tensor, vec:Tensor, pe:Tensor) -> Tensor:
-    mod, _ = self.modulation(vec)
-    x_mod = (1 + mod.scale) * self.pre_norm(x) + mod.shift
-    qkv, mlp = Tensor.split(self.linear1(x_mod), [3 * self.hidden_size, self.mlp_hidden_dim], dim=-1)
-    q, k, v = qkv.rearrange("B L (K H D) -> K B H L D", K=3, H=self.num_heads)
-    q, k = self.norm(q, k)
-
-    # compute attention
-    attn = attention(q, k, v, pe=pe)
-    # compute activation in mlp stream, cat again and run second linear layer
-    output = self.linear2(Tensor.cat(attn, self.mlp_act(mlp), dim=2))
-    return x + mod.gate * output
-
-
-class LastLayer:
-  def __init__(self, hidden_size:int, patch_size:int, out_channels:int):
-    self.norm_final = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
-    self.linear = nn.Linear(hidden_size, patch_size * patch_size * out_channels, bias=True)
-    self.adaLN_modulation:List[Callable[[Tensor], Tensor]] = [Tensor.silu, nn.Linear(hidden_size, 2 * hidden_size, bias=True)]
-
-  def __call__(self, x:Tensor, vec:Tensor) -> Tensor:
-    shift, scale = vec.sequential(self.adaLN_modulation).chunk(2, dim=1)
-    x = (1 + scale[:, None, :]) * self.norm_final(x) + shift[:, None, :]
-    return self.linear(x)
-
-def timestep_embedding(t:Tensor, dim:int, max_period:int=10000, time_factor:float=1000.0) -> Tensor:
-  """
-  Create sinusoidal timestep embeddings.
-  :param t: a 1-D Tensor of N indices, one per batch element.
-                    These may be fractional.
-  :param dim: the dimension of the output.
-  :param max_period: controls the minimum frequency of the embeddings.
-  :return: an (N, D) Tensor of positional embeddings.
-  """
-  t = time_factor * t
-  half = dim // 2
-  freqs = Tensor.exp(-math.log(max_period) * Tensor.arange(0, stop=half, dtype=dtypes.float32) / half).to(t.device)
-
-  args = t[:, None].float() * freqs[None]
-  embedding = Tensor.cat(Tensor.cos(args), Tensor.sin(args), dim=-1)
-  if dim % 2:  embedding = Tensor.cat(*[embedding, Tensor.zeros_like(embedding[:, :1])], dim=-1)
-  if Tensor.is_floating_point(t):  embedding = embedding.cast(t.dtype)
-  return embedding
-
-# https://github.com/black-forest-labs/flux/blob/main/src/flux/model.py
-class Flux:
-  """
-  Transformer model for flow matching on sequences.
-  """
-
-  def __init__(
-      self,
-      guidance_embed:bool,
-      in_channels:int = 64,
-      vec_in_dim:int = 768,
-      context_in_dim:int = 4096,
-      hidden_size:int = 3072,
-      mlp_ratio:float = 4.0,
-      num_heads:int = 24,
-      depth:int = 19,
-      depth_single_blocks:int = 38,
-      axes_dim:Optional[List[int]] = None,
-      theta:int = 10_000,
-      qkv_bias:bool = True,
-      ):
-
-    axes_dim = axes_dim or [16, 56, 56]
-    self.guidance_embed = guidance_embed
-    self.in_channels = in_channels
-    self.out_channels = self.in_channels
-    if hidden_size % num_heads != 0:
-      raise ValueError(f"Hidden size {hidden_size} must be divisible by num_heads {num_heads}")
-    pe_dim = hidden_size // num_heads
-    if sum(axes_dim) != pe_dim:
-      raise ValueError(f"Got {axes_dim} but expected positional dim {pe_dim}")
-    self.hidden_size = hidden_size
-    self.num_heads = num_heads
-    self.pe_embedder = EmbedND(dim=pe_dim, theta=theta, axes_dim=axes_dim)
-    self.img_in = nn.Linear(self.in_channels, self.hidden_size, bias=True)
-    self.time_in = MLPEmbedder(in_dim=256, hidden_dim=self.hidden_size)
-    self.vector_in = MLPEmbedder(vec_in_dim, self.hidden_size)
-    self.guidance_in:Callable[[Tensor], Tensor] = MLPEmbedder(in_dim=256, hidden_dim=self.hidden_size) if guidance_embed else tensor_identity
-    self.txt_in = nn.Linear(context_in_dim, self.hidden_size)
-
-    self.double_blocks = [DoubleStreamBlock(self.hidden_size, self.num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias) for _ in range(depth)]
-    self.single_blocks = [SingleStreamBlock(self.hidden_size, self.num_heads, mlp_ratio=mlp_ratio) for _ in range(depth_single_blocks)]
-    self.final_layer = LastLayer(self.hidden_size, 1, self.out_channels)
-
-  def __call__(self, img:Tensor, img_ids:Tensor, txt:Tensor, txt_ids:Tensor, timesteps:Tensor, y:Tensor, guidance:Optional[Tensor] = None) -> Tensor:
-    if img.ndim != 3 or txt.ndim != 3:
-      raise ValueError("Input img and txt tensors must have 3 dimensions.")
-    # running on sequences img
-    img = self.img_in(img)
-    vec = self.time_in(timestep_embedding(timesteps, 256))
-    if self.guidance_embed:
-      if guidance is None:
-        raise ValueError("Didn't get guidance strength for guidance distilled model.")
-      vec = vec + self.guidance_in(timestep_embedding(guidance, 256))
-    vec = vec + self.vector_in(y)
-    txt = self.txt_in(txt)
-    ids = Tensor.cat(txt_ids, img_ids, dim=1)
-    pe = self.pe_embedder(ids)
-    for double_block in self.double_blocks:
-      img, txt = double_block(img=img, txt=txt, vec=vec, pe=pe)
-
-    img = Tensor.cat(txt, img, dim=1)
-    for single_block in self.single_blocks:
-      img = single_block(img, vec=vec, pe=pe)
-
-    img = img[:, txt.shape[1] :, ...]
-
-    return self.final_layer(img, vec)  # (N, T, patch_size ** 2 * out_channels)
-
-# https://github.com/black-forest-labs/flux/blob/main/src/flux/util.py
-def load_flow_model(name:str, model_path:str):
-  # Loading Flux
-  print("Init model")
-  model = Flux(guidance_embed=(name != "flux-schnell"))
-  if not model_path: model_path = fetch(urls[name])
-  state_dict = {k.replace("scale", "weight"): v for k, v in safe_load(model_path).items()}
-  load_state_dict(model, state_dict)
-  return model
-
-def load_T5(max_length:int=512):
-  # max length 64, 128, 256 and 512 should work (if your sequence is short enough)
-  print("Init T5")
-  T5 = T5Embedder(max_length, fetch(urls["T5_tokenizer"]))
-  pt_1 = fetch(urls["T5_1_of_2"])
-  pt_2 = fetch(urls["T5_2_of_2"])
-  load_state_dict(T5.encoder, safe_load(pt_1) | safe_load(pt_2), strict=False)
-  return T5
-
-def load_clip():
-  print("Init Clip")
-  clip = ClipEmbedder()
-  load_state_dict(clip.transformer, safe_load(fetch(urls["clip"])))
-  return clip
-
-def load_ae() -> AutoEncoder:
-  # Loading the autoencoder
-  print("Init AE")
-  ae = AutoEncoder(0.3611, 0.1159)
-  load_state_dict(ae, safe_load(fetch(urls["ae"])))
-  return ae
-
-# https://github.com/black-forest-labs/flux/blob/main/src/flux/sampling.py
-def prepare(T5:T5Embedder, clip:ClipEmbedder, img:Tensor, prompt:Union[str, List[str]]) -> Dict[str, Tensor]:
-  bs, _, h, w = img.shape
-  if bs == 1 and not isinstance(prompt, str):
-    bs = len(prompt)
-
-  img = img.rearrange("b c (h ph) (w pw) -> b (h w) (c ph pw)", ph=2, pw=2)
-  if img.shape[0] == 1 and bs > 1:
-    img = img.expand((bs, *img.shape[1:]))
-
-  img_ids = Tensor.zeros(h // 2, w // 2, 3).contiguous()
-  img_ids[..., 1] = img_ids[..., 1] + Tensor.arange(h // 2)[:, None]
-  img_ids[..., 2] = img_ids[..., 2] + Tensor.arange(w // 2)[None, :]
-  img_ids = img_ids.rearrange("h w c -> 1 (h w) c")
-  img_ids = img_ids.expand((bs, *img_ids.shape[1:]))
-
-  if isinstance(prompt, str):
-    prompt = [prompt]
-  txt = T5(prompt).realize()
-  if txt.shape[0] == 1 and bs > 1:
-    txt = txt.expand((bs, *txt.shape[1:]))
-  txt_ids = Tensor.zeros(bs, txt.shape[1], 3)
-
-  vec = clip(prompt).realize()
-  if vec.shape[0] == 1 and bs > 1:
-    vec = vec.expand((bs, *vec.shape[1:]))
-
-  return {"img": img, "img_ids": img_ids.to(img.device), "txt": txt.to(img.device), "txt_ids": txt_ids.to(img.device), "vec": vec.to(img.device)}
-
-
-def get_schedule(num_steps:int, image_seq_len:int, base_shift:float=0.5, max_shift:float=1.15, shift:bool=True) -> List[float]:
-  # extra step for zero
-  step_size = -1.0 / num_steps
-  timesteps = Tensor.arange(1, 0 + step_size, step_size)
-
-  # shifting the schedule to favor high timesteps for higher signal images
-  if shift:
-    # estimate mu based on linear estimation between two points
-    mu = 0.5 + (max_shift - base_shift) * (image_seq_len - 256) / (4096 - 256)
-    timesteps = math.exp(mu) / (math.exp(mu) + (1 / timesteps - 1))
-  return timesteps.tolist()
-
-@TinyJit
-def run(model, *args): return model(*args).realize()
-
-def denoise(model, img:Tensor, img_ids:Tensor, txt:Tensor, txt_ids:Tensor, vec:Tensor, timesteps:List[float], guidance:float=4.0) -> Tensor:
-  # this is ignored for schnell
-  guidance_vec = Tensor((guidance,), device=img.device, dtype=img.dtype).expand((img.shape[0],))
-  for t_curr, t_prev in tqdm(list(zip(timesteps[:-1], timesteps[1:])), "Denoising"):
-    t_vec = Tensor((t_curr,), device=img.device, dtype=img.dtype).expand((img.shape[0],))
-    pred = run(model, img, img_ids, txt, txt_ids, t_vec, vec, guidance_vec)
-    img = img + (t_prev - t_curr) * pred
-
-  return img
-
-def unpack(x:Tensor, height:int, width:int) -> Tensor:
-  return x.rearrange("b (h w) (c ph pw) -> b c (h ph) (w pw)", h=math.ceil(height / 16), w=math.ceil(width / 16), ph=2, pw=2)
-
-# https://github.com/black-forest-labs/flux/blob/main/src/flux/cli.py
-if __name__ == "__main__":
-  default_prompt = "bananas and a can of coke"
-  parser = argparse.ArgumentParser(description="Run Flux.1", formatter_class=argparse.ArgumentDefaultsHelpFormatter)
-
-  parser.add_argument("--name",       type=str,   default="flux-schnell", help="Name of the model to load")
-  parser.add_argument("--model_path", type=str,   default="",             help="path of the model file")
-  parser.add_argument("--width",      type=int,   default=512,            help="width of the sample in pixels (should be a multiple of 16)")
-  parser.add_argument("--height",     type=int,   default=512,            help="height of the sample in pixels (should be a multiple of 16)")
-  parser.add_argument("--seed",       type=int,   default=None,           help="Set a seed for sampling")
-  parser.add_argument("--prompt",     type=str,   default=default_prompt, help="Prompt used for sampling")
-  parser.add_argument('--out',        type=str,   default=Path(tempfile.gettempdir()) / "rendered.png", help="Output filename")
-  parser.add_argument("--num_steps",  type=int,   default=None,           help="number of sampling steps (default 4 for schnell, 50 for guidance distilled)") #noqa:E501
-  parser.add_argument("--guidance",   type=float, default=3.5,            help="guidance value used for guidance distillation")
-  parser.add_argument("--output_dir", type=str,   default="output",       help="output directory")
-  args = parser.parse_args()
-
-  if args.name not in ["flux-schnell", "flux-dev"]:
-    raise ValueError(f"Got unknown model name: {args.name}, chose from flux-schnell and flux-dev")
-
-  if args.num_steps is None:
-    args.num_steps = 4 if args.name == "flux-schnell" else 50
-
-  # allow for packing and conversion to latent space
-  height = 16 * (args.height // 16)
-  width = 16 * (args.width // 16)
-
-  if args.seed is None: args.seed = Tensor._seed
-  else: Tensor.manual_seed(args.seed)
-
-  print(f"Generating with seed {args.seed}:\n{args.prompt}")
-  t0 = time.perf_counter()
-
-  # prepare input noise
-  x = Tensor.randn(1, 16, 2 * math.ceil(height / 16), 2 * math.ceil(width / 16), dtype="bfloat16")
-
-  # load text embedders
-  T5 = load_T5(max_length=256 if args.name == "flux-schnell" else 512)
-  clip = load_clip()
-
-  # embed text to get inputs for model
-  inp = prepare(T5, clip, x, prompt=args.prompt)
-  timesteps = get_schedule(args.num_steps, inp["img"].shape[1], shift=(args.name != "flux-schnell"))
-
-  # done with text embedders
-  del T5, clip
-
-  # load model
-  model = load_flow_model(args.name, args.model_path)
-
-  # denoise initial noise
-  x = denoise(model, **inp, timesteps=timesteps, guidance=args.guidance)
-
-  # done with model
-  del model, run
-
-  # load autoencoder
-  ae = load_ae()
-
-  # decode latents to pixel space
-  x = unpack(x.float(), height, width)
-  x = ae.decode(x).realize()
-
-  t1 = time.perf_counter()
-  print(f"Done in {t1 - t0:.1f}s. Saving {args.out}")
-
-  # bring into PIL format and save
-  x = x.clamp(-1, 1)
-  x = x[0].rearrange("c h w -> h w c")
-  x = (127.5 * (x + 1.0)).cast("uint8")
-
-  img = Image.fromarray(x.numpy())
-
-  img.save(args.out)
-
-  # validation!
-  if args.prompt == default_prompt and args.name=="flux-schnell" and args.seed == 0 and args.width == args.height == 512:
-    ref_image = Tensor(np.array(Image.open("examples/flux1_seed0.png")))
-    distance = (((x.cast(dtypes.float) - ref_image.cast(dtypes.float)) / ref_image.max())**2).mean().item()
-    assert distance < 4e-3, colored(f"validation failed with {distance=}", "red")
-    print(colored(f"output validated with {distance=}", "green"))
--- a/examples/gradaccum_mnist.py
+++ b/examples/gradaccum_mnist.py
@ -0,0 +1,108 @@
+import itertools
+from typing import Callable
+from tinygrad import nn, Tensor, dtypes, Device, TinyJit
+from tinygrad.helpers import getenv, trange, partition
+
+class Model:
+  """
+  Small convolutional classifier: two conv/conv/batchnorm/max-pool stages, then a flatten and a linear layer with 10 outputs.
+  """
+  def __init__(self):
+    # NOTE: entries stay in their original order so the position of every layer inside `layers` is unchanged
+    self.layers: list[Callable[[Tensor], Tensor]] = [
+      # stage 1: 1 -> 32 channels, 5x5 kernels
+      nn.Conv2d(1, 32, 5), Tensor.relu, nn.Conv2d(32, 32, 5), Tensor.relu, nn.BatchNorm(32), Tensor.max_pool2d,
+      # stage 2: 32 -> 64 channels, 3x3 kernels
+      nn.Conv2d(32, 64, 3), Tensor.relu, nn.Conv2d(64, 64, 3), Tensor.relu, nn.BatchNorm(64), Tensor.max_pool2d,
+      # head: flatten everything after the batch axis and project the 576 features to 10 outputs
+      lambda x: x.flatten(1), nn.Linear(576, 10),
+    ]
+
+  def __call__(self, x:Tensor) -> Tensor:
+    # feed the input through every layer in list order
+    for layer in self.layers: x = layer(x)
+    return x
+
+# TODO: refactor this into optim/onnx
+def functional_adam(g:Tensor, m:Tensor, v:Tensor, b1_t:Tensor, b2_t:Tensor, lr=0.001, b1=0.9, b2=0.999, eps=1e-6) -> Tensor:
+  """
+  One Adam step in functional form: updates the optimizer state it is handed and returns the step for the caller to apply.
+
+  g is the gradient. m and v are the first/second moment estimates and are overwritten via assign.
+  b1_t and b2_t hold the running products of b1 and b2 and are advanced with `*=`.
+  Returns lr * m_hat / (sqrt(v_hat) + eps), where m_hat and v_hat are the bias-corrected moments.
+  """
+  # advance the beta products used for bias correction
+  b1_t *= b1
+  b2_t *= b2
+  # exponential moving averages of the gradient and of its square
+  m.assign(b1 * m + (1.0 - b1) * g)
+  v.assign(b2 * v + (1.0 - b2) * (g * g))
+  # bias-correct both moments and form the scaled step
+  denom = (v / (1.0 - b2_t)).sqrt() + eps
+  return lr * ((m / (1.0 - b1_t)) / denom)
+
+if __name__ == "__main__":
+  # Gradient accumulation on MNIST: each optimizer step sums ACC_STEPS microbatches of BS // ACC_STEPS samples.
+  # BS, ACC_STEPS (and STEPS further down) are read through getenv with the defaults shown.
+  BS = getenv("BS", 512)
+  ACC_STEPS = getenv("ACC_STEPS", 8)
+
+  X_train, Y_train, X_test, Y_test = nn.datasets.mnist()
+  model = Model()
+
+  params = nn.state.get_parameters(model)
+
+  # init params, set requires grad on the ones we need gradients of
+  for x in params:
+    if x.requires_grad is None: x.requires_grad_()
+    x.replace(x.contiguous())
+  Tensor.realize(*params)
+
+  # split params (with grads) and buffers (without)
+  params, buffers = partition(params, lambda x: x.requires_grad)
+  print(f"params: {len(params)} buffers: {len(buffers)}")
+
+  # optim params
+  # pos_params holds cumulative element counts: params[j] occupies pos_params[j]:pos_params[j+1] of the flat
+  # vectors below, and pos_params[-1] is the total number of trainable elements
+  pos_params = list(itertools.accumulate(params, lambda x,y: x+y.numel(), initial=0))
+  # Adam moments and the running beta products are created on the "CPU" device
+  adam_m = Tensor.zeros(pos_params[-1], device="CPU").contiguous()
+  adam_v = Tensor.zeros(pos_params[-1], device="CPU").contiguous()
+  adam_b1_t = Tensor.ones((1,), dtype=dtypes.float32, device="CPU", requires_grad=False).contiguous()
+  adam_b2_t = Tensor.ones((1,), dtype=dtypes.float32, device="CPU", requires_grad=False).contiguous()
+  adam_params = [adam_m, adam_v, adam_b1_t, adam_b2_t]
+
+  # create loss and grads. init all state so the JIT works on microbatch
+  for x in params: x.assign(x.detach())
+  # `loss` is a scalar accumulator and `grads` a flat accumulator with one slot per trainable element
+  loss = Tensor.zeros(tuple()).contiguous()
+  grads = Tensor.zeros(pos_params[-1]).contiguous()
+  Tensor.realize(*params, *buffers, *adam_params, loss, grads)
+
+  # One accumulation step: draw BS // ACC_STEPS random sample indices, backprop the loss scaled by 1 / ACC_STEPS,
+  # and add the loss and the flattened gradients into the persistent `loss` / `grads` accumulators.
+  @TinyJit
+  @Tensor.train()
+  def microbatch():
+    # indices are drawn independently per call — presumably with replacement; confirm against Tensor.randint
+    samples = Tensor.randint(BS // ACC_STEPS, high=X_train.shape[0])
+    for t in params: t.grad = None
+    # divide by ACC_STEPS at the loss
+    uloss = (model(X_train[samples]).sparse_categorical_crossentropy(Y_train[samples]) / ACC_STEPS).backward()
+    # flatten the per-parameter grads in `params` order, which is the layout pos_params describes
+    ugrads = Tensor.cat(*[t.grad.contiguous().flatten() for t in params], dim=0)
+    for t in params: t.grad = None
+    # concat the grads and assign them
+    loss.assign(loss + uloss)
+    grads.assign(grads + ugrads)
+    Tensor.realize(*params, *buffers, loss, grads)
+
+  # Apply one Adam update from the accumulated `grads`, then reset `loss` and `grads` for the next round.
+  @TinyJit
+  def optimizer():
+    # run optimizer (on CPU, where adam params live)
+    # functional_adam also updates adam_m, adam_v, adam_b1_t and adam_b2_t; delta is the step to subtract
+    delta = functional_adam(grads.to("CPU"), adam_m, adam_v, adam_b1_t, adam_b2_t)
+
+    # update the params, copying back the delta one at a time to avoid OOM
+    # NOTE: the scheduler is ordering things poorly, all the copies are happening before the adds
+    for j,tt in enumerate(params):
+      # slice this parameter's span out of the flat delta, restore its shape, and move it to the default device
+      tt.assign(tt.detach() - delta[pos_params[j]:pos_params[j+1]].reshape(tt.shape).to(Device.DEFAULT))
+
+    # realize everything, zero out loss and grads
+    loss.assign(Tensor.zeros_like(loss))
+    grads.assign(Tensor.zeros_like(grads))
+    Tensor.realize(*params, *adam_params, loss, grads)
+
+  # Test accuracy in percent: share of X_test samples whose argmax prediction equals Y_test, times 100.
+  @TinyJit
+  def get_test_acc() -> Tensor: return (model(X_test).argmax(axis=1) == Y_test).mean()*100
+
+  # Training loop: each of the STEPS iterations (default 70) runs ACC_STEPS microbatches followed by one optimizer step.
+  # test_acc stays NaN until the first evaluation has run
+  test_acc = float('nan')
+  for i in (t:=trange(getenv("STEPS", 70))):
+    # microbatch sets the gradients
+    for _ in range(ACC_STEPS): microbatch()
+
+    # get the loss before the optimizer clears it
+    # this is already realized so this isn't a schedule
+    loss_item = loss.item()
+
+    # run the optimizer
+    optimizer()
+
+    # eval
+    # evaluate on every 10th iteration (i = 9, 19, ...); other iterations show the last measured accuracy
+    if i%10 == 9: test_acc = get_test_acc().item()
+    t.set_description(f"loss: {loss_item:6.2f} test_accuracy: {test_acc:5.2f}%")
--- a/examples/mask_rcnn.py
+++ b/examples/mask_rcnn.py
@ -1,299 +0,0 @@
-from extra.models.mask_rcnn import MaskRCNN
-from extra.models.resnet import ResNet
-from extra.models.mask_rcnn import BoxList
-from torch.nn import functional as F
-from torchvision import transforms as T
-from torchvision.transforms import functional as Ft
-import random
-from tinygrad.tensor import Tensor
-from PIL import Image
-import numpy as np
-import torch
-import argparse
-import cv2
-
-
-class Resize:
-  def __init__(self, min_size, max_size):
-    if not isinstance(min_size, (list, tuple)):
-      min_size = (min_size,)
-    self.min_size = min_size
-    self.max_size = max_size
-
-  # modified from torchvision to add support for max size
-  def get_size(self, image_size):
-    w, h = image_size
-    size = random.choice(self.min_size)
-    max_size = self.max_size
-    if max_size is not None:
-      min_original_size = float(min((w, h)))
-      max_original_size = float(max((w, h)))
-      if max_original_size / min_original_size * size > max_size:
-        size = int(round(max_size * min_original_size / max_original_size))
-
-      if (w <= h and w == size) or (h <= w and h == size):
-        return (h, w)
-
-      if w < h:
-        ow = size
-        oh = int(size * h / w)
-      else:
-        oh = size
-        ow = int(size * w / h)
-
-      return (oh, ow)
-
-  def __call__(self, image):
-    size = self.get_size(image.size)
-    image = Ft.resize(image, size)
-    return image
-
-
-class Normalize:
-  def __init__(self, mean, std, to_bgr255=True):
-    self.mean = mean
-    self.std = std
-    self.to_bgr255 = to_bgr255
-
-  def __call__(self, image):
-    if self.to_bgr255:
-      image = image[[2, 1, 0]] * 255
-    else:
-      image = image[[0, 1, 2]] * 255
-    image = Ft.normalize(image, mean=self.mean, std=self.std)
-    return image
-
-transforms = lambda size_scale: T.Compose(
-  [
-    Resize(int(800*size_scale), int(1333*size_scale)),
-    T.ToTensor(),
-    Normalize(
-      mean=[102.9801, 115.9465, 122.7717], std=[1., 1., 1.], to_bgr255=True
-    ),
-  ]
-)
-
-def expand_boxes(boxes, scale):
-  w_half = (boxes[:, 2] - boxes[:, 0]) * .5
-  h_half = (boxes[:, 3] - boxes[:, 1]) * .5
-  x_c = (boxes[:, 2] + boxes[:, 0]) * .5
-  y_c = (boxes[:, 3] + boxes[:, 1]) * .5
-
-  w_half *= scale
-  h_half *= scale
-
-  boxes_exp = torch.zeros_like(boxes)
-  boxes_exp[:, 0] = x_c - w_half
-  boxes_exp[:, 2] = x_c + w_half
-  boxes_exp[:, 1] = y_c - h_half
-  boxes_exp[:, 3] = y_c + h_half
-  return boxes_exp
-
-
-def expand_masks(mask, padding):
-  N = mask.shape[0]
-  M = mask.shape[-1]
-  pad2 = 2 * padding
-  scale = float(M + pad2) / M
-  padded_mask = mask.new_zeros((N, 1, M + pad2, M + pad2))
-  padded_mask[:, :, padding:-padding, padding:-padding] = mask
-  return padded_mask, scale
-
-
-def paste_mask_in_image(mask, box, im_h, im_w, thresh=0.5, padding=1):
-  # TODO: remove torch
-  mask = torch.tensor(mask.numpy())
-  box = torch.tensor(box.numpy())
-  padded_mask, scale = expand_masks(mask[None], padding=padding)
-  mask = padded_mask[0, 0]
-  box = expand_boxes(box[None], scale)[0]
-  box = box.to(dtype=torch.int32)
-
-  TO_REMOVE = 1
-  w = int(box[2] - box[0] + TO_REMOVE)
-  h = int(box[3] - box[1] + TO_REMOVE)
-  w = max(w, 1)
-  h = max(h, 1)
-
-  mask = mask.expand((1, 1, -1, -1))
-
-  mask = mask.to(torch.float32)
-  mask = F.interpolate(mask, size=(h, w), mode='bilinear', align_corners=False)
-  mask = mask[0][0]
-
-  if thresh >= 0:
-    mask = mask > thresh
-  else:
-    mask = (mask * 255).to(torch.uint8)
-
-  im_mask = torch.zeros((im_h, im_w), dtype=torch.uint8)
-  x_0 = max(box[0], 0)
-  x_1 = min(box[2] + 1, im_w)
-  y_0 = max(box[1], 0)
-  y_1 = min(box[3] + 1, im_h)
-
-  im_mask[y_0:y_1, x_0:x_1] = mask[
-                              (y_0 - box[1]): (y_1 - box[1]), (x_0 - box[0]): (x_1 - box[0])
-                              ]
-  return im_mask
-
-
-class Masker:
-  def __init__(self, threshold=0.5, padding=1):
-    self.threshold = threshold
-    self.padding = padding
-
-  def forward_single_image(self, masks, boxes):
-    boxes = boxes.convert("xyxy")
-    im_w, im_h = boxes.size
-    res = [
-      paste_mask_in_image(mask[0], box, im_h, im_w, self.threshold, self.padding)
-      for mask, box in zip(masks, boxes.bbox)
-    ]
-    if len(res) > 0:
-      res = torch.stack(*res, dim=0)[:, None]
-    else:
-      res = masks.new_empty((0, 1, masks.shape[-2], masks.shape[-1]))
-    return Tensor(res.numpy())
-
-  def __call__(self, masks, boxes):
-    if isinstance(boxes, BoxList):
-      boxes = [boxes]
-
-    results = []
-    for mask, box in zip(masks, boxes):
-      result = self.forward_single_image(mask, box)
-      results.append(result)
-    return results
-
-
-masker = Masker(threshold=0.5, padding=1)
-
-def select_top_predictions(predictions, confidence_threshold=0.9):
-  scores = predictions.get_field("scores").numpy()
-  keep = [idx for idx, score in enumerate(scores) if score > confidence_threshold]
-  return predictions[keep]
-
-def compute_prediction(original_image, model, confidence_threshold, size_scale=1.0):
-  image = transforms(size_scale)(original_image).numpy()
-  image = Tensor(image, requires_grad=False)
-  predictions = model(image)
-  prediction = predictions[0]
-  prediction = select_top_predictions(prediction, confidence_threshold)
-  width, height = original_image.size
-  prediction = prediction.resize((width, height))
-
-  if prediction.has_field("mask"):
-    masks = prediction.get_field("mask")
-    masks = masker([masks], [prediction])[0]
-    prediction.add_field("mask", masks)
-  return prediction
-
-def compute_prediction_batched(batch, model, size_scale=1.0):
-  imgs = []
-  for img in batch:
-    imgs.append(transforms(size_scale)(img).numpy())
-  image = [Tensor(image, requires_grad=False) for image in imgs]
-  predictions = model(image)
-  del image
-  return predictions
-
-palette = np.array([2 ** 25 - 1, 2 ** 15 - 1, 2 ** 21 - 1])
-
-def findContours(*args, **kwargs):
-  if cv2.__version__.startswith('4'):
-    contours, hierarchy = cv2.findContours(*args, **kwargs)
-  elif cv2.__version__.startswith('3'):
-    _, contours, hierarchy = cv2.findContours(*args, **kwargs)
-  return contours, hierarchy
-
-def compute_colors_for_labels(labels):
-  l = labels[:, None]
-  colors = l * palette
-  colors = (colors % 255).astype("uint8")
-  return colors
-
-def overlay_mask(image, predictions):
-  image = np.asarray(image)
-  masks = predictions.get_field("mask").numpy()
-  labels = predictions.get_field("labels").numpy()
-
-  colors = compute_colors_for_labels(labels).tolist()
-
-  for mask, color in zip(masks, colors):
-    thresh = mask[0, :, :, None]
-    contours, hierarchy = findContours(
-        thresh, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE
-    )
-    image = cv2.drawContours(image, contours, -1, color, 3)
-
-  composite = image
-
-  return composite
-
-CATEGORIES = [
-    "__background", "person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat", "traffic light",
-    "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", "cow", "elephant",
-    "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee", "skis", "snowboard",
-    "sports ball", "kite", "baseball bat", "baseball glove", "skateboard", "surfboard", "tennis racket", "bottle",
-    "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple", "sandwich", "orange", "broccoli",
-    "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch", "potted plant", "bed", "dining table",
-    "toilet", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone", "microwave", "oven", "toaster",
-    "sink", "refrigerator", "book", "clock", "vase", "scissors", "teddy bear", "hair drier", "toothbrush",
-]
-
-def overlay_boxes(image, predictions):
-  labels = predictions.get_field("labels").numpy()
-  boxes = predictions.bbox
-  image = np.asarray(image)
-  colors = compute_colors_for_labels(labels).tolist()
-
-  for box, color in zip(boxes, colors):
-    box = torch.tensor(box.numpy())
-    box = box.to(torch.int64)
-    top_left, bottom_right = box[:2].tolist(), box[2:].tolist()
-    image = cv2.rectangle(
-        image, tuple(top_left), tuple(bottom_right), tuple(color), 1
-    )
-
-  return image
-
-def overlay_class_names(image, predictions):
-  scores = predictions.get_field("scores").numpy().tolist()
-  labels = predictions.get_field("labels").numpy().tolist()
-  labels = [CATEGORIES[int(i)] for i in labels]
-  boxes = predictions.bbox.numpy()
-  image = np.asarray(image)
-  template = "{}: {:.2f}"
-  for box, score, label in zip(boxes, scores, labels):
-    x, y = box[:2]
-    s = template.format(label, score)
-    x, y = int(x), int(y)
-    cv2.putText(
-        image, s, (x, y), cv2.FONT_HERSHEY_SIMPLEX, .5, (255, 255, 255), 1
-    )
-
-  return image
-
-
-if __name__ == '__main__':
-  parser = argparse.ArgumentParser(description='Run MaskRCNN', formatter_class=argparse.ArgumentDefaultsHelpFormatter)
-  parser.add_argument('--image', type=str, help="Path of the image to run")
-  parser.add_argument('--threshold', type=float, default=0.7, help="Detector threshold")
-  parser.add_argument('--size_scale', type=float, default=1.0, help="Image resize multiplier")
-  parser.add_argument('--out', type=str, default="/tmp/rendered.png", help="Output filename")
-  args = parser.parse_args()
-
-  resnet = ResNet(50, num_classes=None, stride_in_1x1=True)
-  model_tiny = MaskRCNN(resnet)
-  model_tiny.load_from_pretrained()
-  img = Image.open(args.image)
-  top_result_tiny = compute_prediction(img, model_tiny, confidence_threshold=args.threshold, size_scale=args.size_scale)
-  bbox_image = overlay_boxes(img, top_result_tiny)
-  mask_image = overlay_mask(bbox_image, top_result_tiny)
-  final_image = overlay_class_names(mask_image, top_result_tiny)
-
-  im = Image.fromarray(final_image)
-  print(f"saving {args.out}")
-  im.save(args.out)
-  im.show()
--- a/examples/mlperf/helpers.py
+++ b/examples/mlperf/helpers.py
@ -223,13 +223,13 @@ def get_mlperf_bert_model():

 def get_fake_data_bert(BS:int):
  return {
-    "input_ids": Tensor.empty((BS, 512), dtype=dtypes.int32, device="CPU"),
-    "input_mask": Tensor.empty((BS, 512), dtype=dtypes.int32, device="CPU"),
-    "segment_ids": Tensor.empty((BS, 512), dtype=dtypes.int32, device="CPU"),
-    "masked_lm_positions": Tensor.empty((BS, 76), dtype=dtypes.int32, device="CPU"),
-    "masked_lm_ids": Tensor.empty((BS, 76), dtype=dtypes.int32, device="CPU"),
-    "masked_lm_weights": Tensor.empty((BS, 76), dtype=dtypes.float32, device="CPU"),
-    "next_sentence_labels": Tensor.empty((BS, 1), dtype=dtypes.int32, device="CPU"),
+    "input_ids": Tensor.zeros((BS, 512), dtype=dtypes.int32, device="CPU").contiguous(),
+    "input_mask": Tensor.zeros((BS, 512), dtype=dtypes.int32, device="CPU").contiguous(),
+    "segment_ids": Tensor.zeros((BS, 512), dtype=dtypes.int32, device="CPU").contiguous(),
+    "masked_lm_positions": Tensor.zeros((BS, 76), dtype=dtypes.int32, device="CPU").contiguous(),
+    "masked_lm_ids": Tensor.zeros((BS, 76), dtype=dtypes.int32, device="CPU").contiguous(),
+    "masked_lm_weights": Tensor.zeros((BS, 76), dtype=dtypes.float32, device="CPU").contiguous(),
+    "next_sentence_labels": Tensor.zeros((BS, 1), dtype=dtypes.int32, device="CPU").contiguous(),
  }

 def find_matches(match_quality_matrix:np.ndarray, high_threshold:float=0.5, low_threshold:float=0.4, allow_low_quality_matches:bool=False) -> np.ndarray:
--- a/examples/mlperf/initializers.py
+++ b/examples/mlperf/initializers.py
@ -59,9 +59,7 @@ class EmbeddingBert(nn.Embedding):
    arange_shp, weight_shp, big_shp = (1, 1, self.vocab_sz, 1), (1, 1, self.vocab_sz, self.embed_sz), idx.shape+(self.vocab_sz, self.embed_sz,)
    if not hasattr(self, 'arange'): self.arange = Tensor.arange(self.vocab_sz, requires_grad=False, device=self.weight.device).reshape(arange_shp)
    arange, idx, vals = self.arange.expand(big_shp), idx.reshape(idx.shape+(1, 1,)).expand(big_shp), self.weight.cast(dtypes.default_float).reshape(weight_shp).expand(big_shp)
-    # TODO: contiguous() here because the embedding dropout creates different asts on each device, and search becomes very slow.
-    # Should fix with fixing random ast on multi device, and fuse arange to make embedding fast.
-    return (arange == idx).mul(vals).sum(2, dtype=vals.dtype).contiguous()
+    return (arange == idx).where(vals, 0).sum(2, dtype=vals.dtype)

 class LayerNormBert:
  def __init__(self, normalized_shape:Union[int, tuple[int, ...]], eps:float=1e-12, elementwise_affine:bool=True):
--- a/examples/mlperf/model_eval.py
+++ b/examples/mlperf/model_eval.py
@ -204,43 +204,6 @@ def eval_bert():

    st = time.perf_counter()

-def eval_mrcnn():
-  from tqdm import tqdm
-  from extra.models.mask_rcnn import MaskRCNN
-  from extra.models.resnet import ResNet
-  from extra.datasets.coco import BASEDIR, images, convert_prediction_to_coco_bbox, convert_prediction_to_coco_mask, accumulate_predictions_for_coco, evaluate_predictions_on_coco, iterate
-  from examples.mask_rcnn import compute_prediction_batched, Image
-  mdl = MaskRCNN(ResNet(50, num_classes=None, stride_in_1x1=True))
-  mdl.load_from_pretrained()
-
-  bbox_output = '/tmp/results_bbox.json'
-  mask_output = '/tmp/results_mask.json'
-
-  accumulate_predictions_for_coco([], bbox_output, rm=True)
-  accumulate_predictions_for_coco([], mask_output, rm=True)
-
-  #TODO: bs > 1 not as accurate
-  bs = 1
-
-  for batch in tqdm(iterate(images, bs=bs), total=len(images)//bs):
-    batch_imgs = []
-    for image_row in batch:
-      image_name = image_row['file_name']
-      img = Image.open(BASEDIR/f'val2017/{image_name}').convert("RGB")
-      batch_imgs.append(img)
-    batch_result = compute_prediction_batched(batch_imgs, mdl)
-    for image_row, result in zip(batch, batch_result):
-      image_name = image_row['file_name']
-      box_pred = convert_prediction_to_coco_bbox(image_name, result)
-      mask_pred = convert_prediction_to_coco_mask(image_name, result)
-      accumulate_predictions_for_coco(box_pred, bbox_output)
-      accumulate_predictions_for_coco(mask_pred, mask_output)
-    del batch_imgs
-    del batch_result
-
-  evaluate_predictions_on_coco(bbox_output, iou_type='bbox')
-  evaluate_predictions_on_coco(mask_output, iou_type='segm')
-
 def eval_llama3():
  from extra.models.llama import Transformer
  from examples.llama3 import MODEL_PARAMS, load, convert_from_huggingface
@ -541,7 +504,7 @@ if __name__ == "__main__":
  # inference only
  Tensor.training = False

-  models = getenv("MODEL", "resnet,retinanet,unet3d,rnnt,bert,mrcnn").split(",")
+  models = getenv("MODEL", "resnet,retinanet,unet3d,rnnt,bert").split(",")
  for m in models:
    nm = f"eval_{m}"
    if nm in globals():
--- a/examples/mlperf/model_train.py
+++ b/examples/mlperf/model_train.py
@ -918,40 +918,6 @@ def train_rnnt():
  # TODO: RNN-T
  pass

-@TinyJit
-def train_step_bert(model, optimizer, scheduler, loss_scaler:float, GPUS, grad_acc:int, **kwargs):
-  optimizer.zero_grad()
-
-  for i in range(grad_acc):
-    input_ids, segment_ids = kwargs[f"input_ids{i}"], kwargs[f"segment_ids{i}"]
-    # NOTE: these two have different names
-    attention_mask, masked_positions = kwargs[f"input_mask{i}"], kwargs[f"masked_lm_positions{i}"]
-    masked_lm_ids, masked_lm_weights, next_sentence_labels = kwargs[f"masked_lm_ids{i}"], kwargs[f"masked_lm_weights{i}"], kwargs[f"next_sentence_labels{i}"]
-
-    for t in [input_ids, segment_ids, attention_mask, masked_positions, masked_lm_ids, masked_lm_weights, next_sentence_labels]:
-      if len(GPUS) > 1: t.shard_(GPUS, axis=0)
-      else: t.to_(GPUS[0])
-
-    lm_logits, seq_relationship_logits = model(input_ids, attention_mask, masked_positions, segment_ids)
-    loss = model.loss(lm_logits, seq_relationship_logits, masked_lm_ids, masked_lm_weights, next_sentence_labels)
-    (loss * loss_scaler).backward()
-    # TODO: OOM without this realize with large grad_acc
-    Tensor.realize(*[p.grad for p in optimizer.params])
-
-  global_norm = Tensor(0.0, dtype=dtypes.float32, device=optimizer[0].device)
-  for p in optimizer.params:
-    p.grad = p.grad / loss_scaler
-    global_norm += p.grad.float().square().sum()
-  global_norm = global_norm.sqrt().contiguous()
-  for p in optimizer.params:
-    p.grad = (global_norm > 1.0).where((p.grad/global_norm).cast(p.grad.dtype), p.grad)
-
-  optimizer.step()
-  scheduler.step()
-  # TODO: no to("CPU") here because it blocks and messes the python time
-  Tensor.realize(loss, global_norm, optimizer.optimizers[0].lr)
-  return loss, global_norm, optimizer.optimizers[0].lr
-
@TinyJit
 def eval_step_bert(model, input_ids:Tensor, segment_ids:Tensor, attention_mask:Tensor, masked_positions:Tensor, masked_lm_ids:Tensor,
                   masked_lm_weights:Tensor, next_sentence_labels:Tensor, GPUS):
@ -1014,7 +980,8 @@ def train_bert():
  # ** hyperparameters **
  BS                 = config["BS"]                     = getenv("BS", 11 * len(GPUS) if dtypes.default_float in (dtypes.float16, dtypes.bfloat16) else 8 * len(GPUS))
  grad_acc           = config["GRADIENT_ACC_STEPS"]     = getenv("GRADIENT_ACC_STEPS", 1)
-  # TODO: mlperf logging
+  # TODO: implement grad accumulation + mlperf logging
+  assert grad_acc == 1
  GBS                = config["GLOBAL_BATCH_SIZE"]      = BS * grad_acc
  EVAL_BS            = config["EVAL_BS"]                = getenv("EVAL_BS", 1 * len(GPUS))
  max_lr             = config["OPT_BASE_LEARNING_RATE"] = getenv("OPT_BASE_LEARNING_RATE", 0.000175 * math.sqrt(GBS/96))
@ -1073,8 +1040,8 @@ def train_bert():

  # ** Optimizer **
  parameters_no_wd = [v for k, v in get_state_dict(model).items() if "bias" in k or "LayerNorm" in k]
-  parameters = [x for x in parameters if x not in set(parameters_no_wd)]
-  optimizer_wd = LAMB(parameters, lr=max_lr, b1=opt_lamb_beta_1, b2=opt_lamb_beta_2, eps=epsilon, weight_decay=decay, adam=False)
+  parameters_wd = [x for x in parameters if x not in set(parameters_no_wd)]
+  optimizer_wd = LAMB(parameters_wd, lr=max_lr, b1=opt_lamb_beta_1, b2=opt_lamb_beta_2, eps=epsilon, weight_decay=decay, adam=False)
  optimizer_no_wd = LAMB(parameters_no_wd, lr=max_lr, b1=opt_lamb_beta_1, b2=opt_lamb_beta_2, eps=epsilon, weight_decay=0.0, adam=False)
  optimizer_group = OptimizerGroup(optimizer_wd, optimizer_no_wd)

@ -1131,12 +1098,38 @@ def train_bert():
  # ** train loop **
  wc_start = time.perf_counter()

-  i, train_data = start_step, [next(train_it) for _ in range(grad_acc)]
+  i, train_data = start_step, next(train_it)

  if RUNMLPERF:
    if MLLOGGER:
      MLLOGGER.start(key=mllog_constants.EPOCH_START, value=i*GBS, metadata={"epoch_num": i*GBS})

+  # one jitted BERT training step; model, optimizer_group, scheduler_group, loss_scaler and GPUS come from the enclosing scope
+  # returns (loss, global_norm, lr of the first optimizer in the group) after realizing them
+  @TinyJit
+  def train_step_bert(input_ids:Tensor, segment_ids:Tensor, attention_mask:Tensor,
+                      masked_positions:Tensor, masked_lm_ids:Tensor, masked_lm_weights:Tensor, next_sentence_labels:Tensor):
+    # place every input: shard_ along axis 0 across GPUS when there are several, otherwise to_ the single device
+    for t in [input_ids, segment_ids, attention_mask, masked_positions, masked_lm_ids, masked_lm_weights, next_sentence_labels]:
+      if len(GPUS) > 1: t.shard_(GPUS, axis=0)
+      else: t.to_(GPUS[0])
+    optimizer_group.zero_grad()
+
+    lm_logits, seq_relationship_logits = model(input_ids, attention_mask, masked_positions, segment_ids)
+    loss = model.loss(lm_logits, seq_relationship_logits, masked_lm_ids, masked_lm_weights, next_sentence_labels)
+    # backward on the scaled loss; loss_scaler is divided back out of the grads below
+    (loss * loss_scaler).backward()
+
+    # global_norm = sqrt of the sum of squared (float-cast) unscaled grads over all params
+    global_norm = Tensor(0.0, dtype=dtypes.float32, device=optimizer_group[0].device)
+    for p in optimizer_group.params:
+      p.grad = p.grad / loss_scaler
+      global_norm += p.grad.float().square().sum()
+    global_norm = global_norm.sqrt().contiguous()
+    # clip: when global_norm > 1.0 each grad is divided by global_norm, otherwise it is kept as is
+    for p in optimizer_group.params:
+      p.grad = (global_norm > 1.0).where((p.grad/global_norm).cast(p.grad.dtype), p.grad)
+
+    optimizer_group.step()
+    scheduler_group.step()
+    # TODO: no to("CPU") here because it blocks and messes the python time
+    Tensor.realize(loss, global_norm, optimizer_group.optimizers[0].lr)
+    return loss, global_norm, optimizer_group.optimizers[0].lr
+
  while train_data is not None and i < train_steps and not achieved:
    if getenv("TRAIN", 1):
      Tensor.training = True
@ -1144,16 +1137,12 @@ def train_bert():
      st = time.perf_counter()
      GlobalCounters.reset()
      with WallTimeEvent(BenchEvent.STEP):
-        data = {f"{k}{i}":v for i,d in enumerate(train_data) for k,v in d.items()}
-        loss, global_norm, lr = train_step_bert(model, optimizer_group, scheduler_group, loss_scaler, GPUS, grad_acc, **data)
+        loss, global_norm, lr = train_step_bert(
+          train_data["input_ids"], train_data["segment_ids"], train_data["input_mask"], train_data["masked_lm_positions"], \
+          train_data["masked_lm_ids"], train_data["masked_lm_weights"], train_data["next_sentence_labels"])

        pt = time.perf_counter()
-
-        try:
-          next_data = [next(train_it) for _ in range(grad_acc)]
-        except StopIteration:
-          next_data = None
-
+        next_data = next(train_it, None)  # None once the iterator is exhausted, matching the `train_data is not None` loop guard
        dt = time.perf_counter()

        device_str = parameters[0].device if isinstance(parameters[0].device, str) else f"{parameters[0].device[0]} * {len(parameters[0].device)}"
@ -1188,8 +1177,8 @@ def train_bert():
      if MLLOGGER and RUNMLPERF:
        MLLOGGER.start(key=mllog_constants.EVAL_START, value=None, metadata={"epoch_num": i*GBS, "step_num": i})
      if getenv("RESET_STEP"): train_step_bert.reset()
-      elif getenv("FREE_INTERMEDIATE", 0) and train_step_bert.captured is not None:
-        # TODO: FREE_INTERMEDIATE nan'ed after jit step 2
+      elif getenv("FREE_INTERMEDIATE") and train_step_bert.captured is not None:
+        # TODO: this hangs on tiny green after 90 minutes of training
        train_step_bert.captured.free_intermediates()
      eval_lm_losses = []
      eval_clsf_losses = []
@ -1224,7 +1213,7 @@ def train_bert():
          return

      if getenv("RESET_STEP"): eval_step_bert.reset()
-      elif getenv("FREE_INTERMEDIATE", 0) and eval_step_bert.captured is not None: eval_step_bert.captured.free_intermediates()
+      elif getenv("FREE_INTERMEDIATE") and eval_step_bert.captured is not None: eval_step_bert.captured.free_intermediates()

      del eval_data
      avg_lm_loss = sum(eval_lm_losses) / len(eval_lm_losses)
@ -1300,6 +1289,7 @@ def train_llama3():
  BASEDIR            = config["BASEDIR"]                = Path(getenv("BASEDIR", "/raid/datasets/c4/"))
  BS                 = config["BS"]                     = getenv("BS", 16)
  grad_acc           = config["GRADIENT_ACC_STEPS"]     = getenv("GRADIENT_ACC_STEPS", 1)
+  assert grad_acc == 1, f"{grad_acc=} is not supported"
  GBS                = config["GLOBAL_BATCH_SIZE"]      = BS * grad_acc
  SEED               = config["SEED"]                   = getenv("SEED", 5760)
  SEQLEN             = config["SEQLEN"]                 = getenv("SEQLEN", 8192)
@ -1324,12 +1314,14 @@ def train_llama3():
  opt_base_learning_rate = getenv("LR", 8e-5 * GBS / 1152)  # NOTE: cannot change for benchmark
  opt_end_learning_rate = getenv("END_LR", 8e-7)

-  # TODO: confirm weights are in bf16
+  model_params = dict(MODEL_PARAMS[getenv("LLAMA3_SIZE", "8B")]["args"])  # shallow copy: the overrides below must not mutate the shared MODEL_PARAMS
  # vocab_size from the mixtral tokenizer
-  params = MODEL_PARAMS[getenv("LLAMA3_SIZE", "8B")]["args"]
-  params = params | {"vocab_size": 32000} if not SMALL else params
-  if (llama_layers:=getenv("LLAMA_LAYERS")) != 0: params['n_layers'] = llama_layers
-  model = Transformer(**params, max_context=SEQLEN, jit=False, disable_kv_cache=True)
+  if not SMALL: model_params |= {"vocab_size": 32000}
+  if (llama_layers:=getenv("LLAMA_LAYERS")) != 0: model_params['n_layers'] = llama_layers
+  model = Transformer(**model_params, max_context=SEQLEN, jit=False, disable_kv_cache=True)
+  params = get_parameters(model)
+  # weights are all bfloat16 for now
+  assert params and all(p.dtype == dtypes.bfloat16 for p in params)

  if getenv("FAKEDATA"):
    for v in get_parameters(model):
@ -1374,20 +1366,17 @@ def train_llama3():

  @TinyJit
  @Tensor.train()
-  def train_step(model, tokens:Tensor, grad_acc:int):
+  def train_step(model, tokens:Tensor):
    optim.zero_grad()
-    # grad acc
-    for batch in tokens.split(tokens.shape[0]//grad_acc):
-      if (DP := getenv("DP", 1)) > 1:
-        device = tuple(f"{Device.DEFAULT}:{i}" for i in range(DP))
-        batch = batch.shard(device, 0)
-      if (MP := getenv("MP", 1)) > 1:
-        device = tuple(f"{Device.DEFAULT}:{i}" for i in range(MP))
-        batch = batch.shard(device)
-      logits:Tensor = model(batch[:, :-1], start_pos=0, temperature=math.nan)
-      loss = logits.sparse_categorical_crossentropy(batch[:, 1:])
-      loss.backward()
-      Tensor.realize(*[p.grad for p in optim.params])
+    if (DP := getenv("DP", 1)) > 1:
+      device = tuple(f"{Device.DEFAULT}:{i}" for i in range(DP))
+      tokens = tokens.shard(device, 0)
+    if (MP := getenv("MP", 1)) > 1:
+      device = tuple(f"{Device.DEFAULT}:{i}" for i in range(MP))
+      tokens = tokens.shard(device)
+    logits:Tensor = model(tokens[:, :-1], start_pos=0, temperature=math.nan)
+    loss = logits.sparse_categorical_crossentropy(tokens[:, 1:])
+    loss.backward()
    # L2 norm grad clip
    # https://github.com/NVIDIA/NeMo/blob/3368c3fc0b4a186ab33a1d68a504315100c0b2a6/nemo/collections/nlp/modules/common/megatron/clip_grads.py#L57
    # https://docs.pytorch.org/docs/stable/generated/torch.nn.utils.clip_grad_norm_.html
@ -1422,18 +1411,18 @@ def train_llama3():
  # ** data iters **
  def fake_data(bs, samples):
    for _ in range(samples // bs):
-      yield Tensor.randint(bs, SEQLEN + 1, low=0, high=params["vocab_size"], dtype=dtypes.int32, device=Device.DEFAULT)
+      yield Tensor.randint(bs, SEQLEN + 1, low=0, high=model_params["vocab_size"], dtype=dtypes.int32, device=Device.DEFAULT)

  def get_train_iter():
    if getenv("FAKEDATA", 0):
-      return fake_data(GBS, SAMPLES)
+      return fake_data(BS, SAMPLES)
    else:
      if SMALL:
        from examples.mlperf.dataloader import batch_load_llama3_small
-        return batch_load_llama3_small(GBS, SAMPLES, SEQLEN, BASEDIR, seed=SEED, val=bool(TRAIN_ON_VAL))
+        return batch_load_llama3_small(BS, SAMPLES, SEQLEN, BASEDIR, seed=SEED, val=bool(TRAIN_ON_VAL))
      else:
        from examples.mlperf.dataloader import batch_load_llama3
-        return batch_load_llama3(GBS, SAMPLES, SEQLEN, BASEDIR, seed=SEED, val=bool(TRAIN_ON_VAL))
+        return batch_load_llama3(BS, SAMPLES, SEQLEN, BASEDIR, seed=SEED, val=bool(TRAIN_ON_VAL))

  def get_eval_iter():
    if getenv("FAKEDATA", 0):
@ -1451,7 +1440,7 @@ def train_llama3():
  for tokens in tqdm(iter, total=SAMPLES//GBS):
    t = time.perf_counter()
    GlobalCounters.reset()
-    loss, lr = train_step(model, tokens, grad_acc)
+    loss, lr = train_step(model, tokens)
    loss = loss.float().item()

    i += 1
--- a/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/bert/implementations/tinybox_8xMI350X/run_and_time.sh
+++ b/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/bert/implementations/tinybox_8xMI350X/run_and_time.sh
@ -0,0 +1,31 @@
+#!/bin/bash
+# MLPerf BERT launcher for the tinybox_8xMI350X submission platform.
+# Invokes examples/mlperf/model_train.py twice (INITMLPERF=1, then RUNMLPERF=1); both outputs go to the same $LOGFILE.
+set -e  # Exit on any error
+set -o pipefail  # Make pipeline fail if any command fails
+
+export PYTHONPATH="." AMD=1
+export MODEL="bert"
+export SUBMISSION_PLATFORM="tinybox_8xMI350X"
+export DEFAULT_FLOAT="HALF" GPUS=8 BS=1024 EVAL_BS=1024
+
+# similar to https://github.com/mlcommons/training_results_v3.1/blob/d06288b2bd675a9d88e0e6181f5bb5626b71ec19/Quanta_Cloud_Technology/results/D54U-3U/bert/result_1.txt#L54
+export OPT_BASE_LEARNING_RATE=0.0011 OPT_LAMB_BETA_1=0.60466 OPT_LAMB_BETA_2=0.85437 DECAY=0.1
+export TRAIN_STEPS=3900
+
+export IGNORE_OOB=1
+export REWRITE_STACK_LIMIT=5000000
+
+export BEAM=3 BEAM_UOPS_MAX=6000 BEAM_UPCAST_MAX=256 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=5
+export IGNORE_JIT_FIRST_BEAM=1 FREE_INTERMEDIATE=0
+export BASEDIR="/raid/datasets/wiki"
+
+# pip install -e ".[mlperf]"
+export LOGMLPERF=1
+
+# the seed and a month/day/hour/minute timestamp are baked into the log file name
+export SEED=$RANDOM
+DATETIME=$(date "+%m%d%H%M")
+LOGFILE="bert_8xMI350x_${DATETIME}_${SEED}.log"
+
+# init (BENCHMARK=10 on a BERT_LAYERS=2 model; presumably a warm-up pass, confirm against model_train.py); tee creates the log
+BENCHMARK=10 INITMLPERF=1 BERT_LAYERS=2 python3 examples/mlperf/model_train.py | tee "$LOGFILE"
+
+# run; tee -a appends to the log started by the init step
+PARALLEL=0 RUNMLPERF=1 python3 examples/mlperf/model_train.py | tee -a "$LOGFILE"
--- a/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/bert/implementations/tinybox_green/dev_beam.sh
+++ b/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/bert/implementations/tinybox_green/dev_beam.sh
@ -2,7 +2,7 @@

 export PYTHONPATH="." NV=1
 export MODEL="bert"
-export DEFAULT_FLOAT="HALF" SUM_DTYPE="HALF" GPUS=6 BS=96 EVAL_BS=96
+export DEFAULT_FLOAT="HALF" SUM_DTYPE="HALF" GPUS=6 BS=72 EVAL_BS=72

 export IGNORE_OOB=1
 export REWRITE_STACK_LIMIT=500000
--- a/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/bert/implementations/tinybox_green/dev_run.sh
+++ b/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/bert/implementations/tinybox_green/dev_run.sh
@ -2,7 +2,7 @@

 export PYTHONPATH="." NV=1
 export MODEL="bert"
-export DEFAULT_FLOAT="HALF" SUM_DTYPE="HALF" GPUS=6 BS=96 EVAL_BS=96
+export DEFAULT_FLOAT="HALF" SUM_DTYPE="HALF" GPUS=6 BS=72 EVAL_BS=72

 export IGNORE_OOB=1
 export REWRITE_STACK_LIMIT=500000
--- a/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/bert/implementations/tinybox_green/run_and_time.sh
+++ b/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/bert/implementations/tinybox_green/run_and_time.sh
@ -5,7 +5,7 @@ set -o pipefail  # Make pipeline fail if any command fails
 export PYTHONPATH="." NV=1
 export MODEL="bert"
 export SUBMISSION_PLATFORM="tinybox_green"
-export DEFAULT_FLOAT="HALF" SUM_DTYPE="HALF" GPUS=6 BS=96 EVAL_BS=96
+export DEFAULT_FLOAT="HALF" SUM_DTYPE="HALF" GPUS=6 BS=72 EVAL_BS=72

 export IGNORE_OOB=1
 export REWRITE_STACK_LIMIT=500000
--- a/examples/openelm.py
+++ b/examples/openelm.py
@ -1,118 +0,0 @@
-import json, pprint
-from tinygrad import fetch, nn, Tensor
-from tinygrad.helpers import DEBUG
-
-class FeedForward:
-  def __init__(self, model_dim, intermediate_dim):
-    self.proj_1 = nn.Linear(model_dim, 2*intermediate_dim, bias=False)
-    self.proj_2 = nn.Linear(intermediate_dim, model_dim, bias=False)
-
-  def __call__(self, x):
-    y_12 = self.proj_1(x)
-    y_1, y_2 = y_12.chunk(2, dim=-1)
-    return self.proj_2(y_1.silu() * y_2)
-
-# NOTE: this RoPE doesn't match LLaMA's?
-def _rotate_half(x: Tensor) -> Tensor:
-  x1, x2 = x.chunk(2, dim=-1)
-  return Tensor.cat(-x2, x1, dim=-1)
-
-def _apply_rotary_pos_emb(x: Tensor, pos_sin: Tensor, pos_cos: Tensor) -> Tensor:
-  return (x * pos_cos) + (_rotate_half(x) * pos_sin)
-
-class Attention:
-  def __init__(self, model_dim, num_query_heads, num_kv_heads, head_dim):
-    self.qkv_proj = nn.Linear(model_dim, (num_query_heads + num_kv_heads*2) * head_dim, bias=False)
-    self.num_query_heads, self.num_kv_heads = num_query_heads, num_kv_heads
-    self.head_dim = head_dim
-    self.q_norm = nn.RMSNorm(head_dim)
-    self.k_norm = nn.RMSNorm(head_dim)
-    self.out_proj = nn.Linear(num_query_heads * head_dim, model_dim, bias=False)
-
-  def __call__(self, x:Tensor) -> Tensor:
-    batch_size, seq_len, embed_dim = x.shape
-    qkv = self.qkv_proj(x)
-    qkv = qkv.reshape(batch_size, seq_len, self.num_query_heads+self.num_kv_heads*2, self.head_dim).transpose(1, 2)
-    xq,xk,xv = qkv.split([self.num_query_heads, self.num_kv_heads, self.num_kv_heads], dim=1)
-    xq = self.q_norm(xq)
-    xk = self.k_norm(xk)
-
-    # add positional embedding (how many kernels is this?)
-    freq_constant = 10000
-    inv_freq = 1.0 / (freq_constant ** (Tensor.arange(0, self.head_dim, 2) / self.head_dim))
-    pos_index_theta = Tensor.einsum("i,j->ij", Tensor.arange(seq_len), inv_freq)
-    emb = Tensor.cat(pos_index_theta, pos_index_theta, dim=-1)
-    cos_emb, sin_emb = emb.cos()[None, None, :, :], emb.sin()[None, None, :, :]
-    xq = _apply_rotary_pos_emb(xq, sin_emb, cos_emb)
-    xk = _apply_rotary_pos_emb(xk, sin_emb, cos_emb)
-
-    # grouped-query attention
-    num_groups = self.num_query_heads // self.num_kv_heads
-    xk = xk.repeat_interleave(num_groups, dim=1)
-    xv = xv.repeat_interleave(num_groups, dim=1)
-
-    # masked attention
-    #start_pos = 0
-    #mask = Tensor.full((1, 1, seq_len, start_pos+seq_len), float("-inf"), dtype=xq.dtype, device=xq.device).triu(start_pos+1)
-    #attn_output = xq.scaled_dot_product_attention(xk, xv, mask).transpose(1, 2)
-
-    # causal is fine, no mask needed
-    attn_output = xq.scaled_dot_product_attention(xk, xv, is_causal=True).transpose(1, 2)
-    return self.out_proj(attn_output.reshape(batch_size, seq_len, self.num_query_heads * self.head_dim))
-
-class Layer:
-  def __init__(self, model_dim, intermediate_dim, num_query_heads, num_kv_heads, head_dim):
-    self.ffn = FeedForward(model_dim, intermediate_dim)
-    self.attn = Attention(model_dim, num_query_heads, num_kv_heads, head_dim)
-    self.ffn_norm = nn.RMSNorm(model_dim)
-    self.attn_norm = nn.RMSNorm(model_dim)
-
-  def __call__(self, x:Tensor) -> Tensor: # (batch, seq_len, embed_dim)
-    x = x + self.attn(self.attn_norm(x))
-    x = x + self.ffn(self.ffn_norm(x))
-    return x
-
-# stupidly complex
-def make_divisible(v, divisor):
-  new_v = max(divisor, int(v + divisor / 2) // divisor * divisor)
-  if new_v < 0.9 * v: new_v += divisor
-  return new_v
-
-class Transformer:
-  def __init__(self, cfg):
-    if DEBUG >= 3: pprint.pp(cfg)
-    self.layers = [Layer(cfg['model_dim'], make_divisible(int(cfg["model_dim"] * cfg['ffn_multipliers'][i]), cfg['ffn_dim_divisor']),
-                         cfg['num_query_heads'][i], cfg['num_kv_heads'][i], cfg['head_dim']) for i in range(cfg['num_transformer_layers'])]
-    self.norm = nn.RMSNorm(cfg['model_dim'])
-    self.token_embeddings = nn.Embedding(cfg['vocab_size'], cfg['model_dim'])
-
-  def __call__(self, tokens:Tensor):
-    # _bsz, seqlen = tokens.shape
-    x = self.token_embeddings(tokens)
-    for l in self.layers: x = l(x)
-    return self.norm(x) @ self.token_embeddings.weight.T
-
-if __name__ == "__main__":
-  #model_name = "OpenELM-270M-Instruct"
-  model_name = "OpenELM-270M"  # this is fp32
-  model = Transformer(json.loads(fetch(f"https://huggingface.co/apple/{model_name}/resolve/main/config.json?download=true").read_bytes()))
-  weights = nn.state.safe_load(fetch(f"https://huggingface.co/apple/{model_name}/resolve/main/model.safetensors?download=true"))
-  if DEBUG >= 3:
-    for k, v in weights.items(): print(k, v.shape)
-  nn.state.load_state_dict(model, {k.removeprefix("transformer."):v for k,v in weights.items()})
-
-  from sentencepiece import SentencePieceProcessor
-  tokenizer = SentencePieceProcessor(fetch("https://github.com/karpathy/llama2.c/raw/master/tokenizer.model").as_posix())
-  toks = [tokenizer.bos_id()] + tokenizer.encode("Some car brands include")
-  for i in range(100):
-    ttoks = Tensor([toks])
-    out = model(ttoks).realize()
-    t0 = out[0].argmax(axis=-1).tolist()
-    toks.append(t0[-1])
-    # hmmm...passthrough still doesn't match (it shouldn't, it outputs the most likely)
-    print(tokenizer.decode(toks))
-    #print(toks)
-    #print(tokenizer.decode(t0))
-    #print(t0)
-
-
--- a/examples/other_mnist/beautiful_mnist_mlx.py
+++ b/examples/other_mnist/beautiful_mnist_mlx.py
@ -1,55 +0,0 @@
-from tinygrad.helpers import trange
-from tinygrad.nn.datasets import mnist
-import mlx.core as mx
-import mlx.nn as nn
-import mlx.optimizers as optim
-from functools import partial
-
-class Model(nn.Module):
-  def __init__(self):
-    super().__init__()
-    self.c1 = nn.Conv2d(1, 32, 5)
-    self.c2 = nn.Conv2d(32, 32, 5)
-    self.bn1 = nn.BatchNorm(32)
-    self.m1 = nn.MaxPool2d(2)
-    self.c3 = nn.Conv2d(32, 64, 3)
-    self.c4 = nn.Conv2d(64, 64, 3)
-    self.bn2 = nn.BatchNorm(64)
-    self.m2 = nn.MaxPool2d(2)
-    self.lin = nn.Linear(576, 10)
-  def __call__(self, x):
-    x = mx.maximum(self.c1(x), 0)
-    x = mx.maximum(self.c2(x), 0)
-    x = self.m1(self.bn1(x))
-    x = mx.maximum(self.c3(x), 0)
-    x = mx.maximum(self.c4(x), 0)
-    x = self.m2(self.bn2(x))
-    return self.lin(mx.flatten(x, 1))
-
-if __name__ == "__main__":
-  X_train, Y_train, X_test, Y_test = mnist()
-  X_train = mx.array(X_train.float().permute((0,2,3,1)).numpy())
-  Y_train = mx.array(Y_train.numpy())
-  X_test = mx.array(X_test.float().permute((0,2,3,1)).numpy())
-  Y_test = mx.array(Y_test.numpy())
-
-  model = Model()
-  optimizer = optim.Adam(1e-3)
-  def loss_fn(model, x, y): return nn.losses.cross_entropy(model(x), y).mean()
-
-  state = [model.state, optimizer.state]
-  @partial(mx.compile, inputs=state, outputs=state)
-  def step(samples):
-    # Compiled functions will also treat any inputs not in the parameter list as constants.
-    X,Y = X_train[samples], Y_train[samples]
-    loss_and_grad_fn = nn.value_and_grad(model, loss_fn)
-    loss, grads = loss_and_grad_fn(model, X, Y)
-    optimizer.update(model, grads)
-    return loss
-
-  test_acc = float('nan')
-  for i in (t:=trange(70)):
-    samples = mx.random.randint(0, X_train.shape[0], (512,))  # putting this in JIT didn't work well
-    loss = step(samples)
-    if i%10 == 9: test_acc = ((model(X_test).argmax(axis=-1) == Y_test).sum() * 100 / X_test.shape[0]).item()
-    t.set_description(f"loss: {loss.item():6.2f} test_accuracy: {test_acc:5.2f}%")
--- a/examples/rl/lightupbutton.py
+++ b/examples/rl/lightupbutton.py
@ -1,45 +0,0 @@
-import gymnasium as gym
-import numpy as np
-from gymnasium.envs.registration import register
-
-# a very simple game
-# one of <size> lights will light up
-# take the action of the lit up light
-# in <hard_mode>, you act differently based on the step number and need to track this
-
-class PressTheLightUpButton(gym.Env):
-  metadata = {"render_modes": []}
-  def __init__(self, render_mode=None, size=2, game_length=10, hard_mode=False):
-    self.size, self.game_length = size, game_length
-    self.observation_space = gym.spaces.Box(0, 1, shape=(self.size,), dtype=np.float32)
-    self.action_space = gym.spaces.Discrete(self.size)
-    self.step_num = 0
-    self.done = True
-    self.hard_mode = hard_mode
-
-  def _get_obs(self):
-    obs = [0]*self.size
-    if self.step_num < len(self.state):
-      obs[self.state[self.step_num]] = 1
-    return np.array(obs, dtype=np.float32)
-
-  def reset(self, seed=None, options=None):
-    super().reset(seed=seed)
-    self.state = np.random.randint(0, self.size, size=self.game_length)
-    self.step_num = 0
-    self.done = False
-    return self._get_obs(), {}
-
-  def step(self, action):
-    target = ((action + self.step_num) % self.size) if self.hard_mode else action
-    reward = int(target == self.state[self.step_num])
-    self.step_num += 1
-    if not reward:
-      self.done = True
-    return self._get_obs(), reward, self.done, self.step_num >= self.game_length, {}
-
-register(
-  id="PressTheLightUpButton-v0",
-  entry_point="examples.rl.lightupbutton:PressTheLightUpButton",
-  max_episode_steps=None,
-)
--- a/examples/sdv2.py
+++ b/examples/sdv2.py
@ -115,7 +115,7 @@ if __name__ == "__main__":

  with WallTimeEvent(BenchEvent.LOAD_WEIGHTS):
    if not args.fakeweights:
-      default_weights_url = 'https://huggingface.co/stabilityai/stable-diffusion-2-1/resolve/main/v2-1_768-ema-pruned.safetensors'
+      default_weights_url = 'https://huggingface.co/sd2-community/stable-diffusion-2-1/resolve/main/v2-1_768-ema-pruned.safetensors'
      weights_fn = args.weights_fn
      if not weights_fn:
        weights_url = args.weights_url if args.weights_url else default_weights_url
--- a/examples/serious_mnist.py
+++ b/examples/serious_mnist.py
@ -1,136 +0,0 @@
-#!/usr/bin/env python
-#inspired by https://github.com/Matuzas77/MNIST-0.17/blob/master/MNIST_final_solution.ipynb
-import sys
-import numpy as np
-from tinygrad.nn.state import get_parameters
-from tinygrad.tensor import Tensor
-from tinygrad.nn import BatchNorm2d, optim
-from tinygrad.helpers import getenv
-from extra.datasets import fetch_mnist
-from extra.augment import augment_img
-from extra.training import train, evaluate
-GPU = getenv("GPU")
-QUICK = getenv("QUICK")
-DEBUG = getenv("DEBUG")
-
-class SqueezeExciteBlock2D:
-  def __init__(self, filters):
-    self.filters = filters
-    self.weight1 = Tensor.scaled_uniform(self.filters, self.filters//32)
-    self.bias1 = Tensor.scaled_uniform(1,self.filters//32)
-    self.weight2 = Tensor.scaled_uniform(self.filters//32, self.filters)
-    self.bias2 = Tensor.scaled_uniform(1, self.filters)
-
-  def __call__(self, input):
-    se = input.avg_pool2d(kernel_size=(input.shape[2], input.shape[3])) #GlobalAveragePool2D
-    se = se.reshape(shape=(-1, self.filters))
-    se = se.dot(self.weight1) + self.bias1
-    se = se.relu()
-    se = se.dot(self.weight2) + self.bias2
-    se = se.sigmoid().reshape(shape=(-1,self.filters,1,1)) #for broadcasting
-    se = input.mul(se)
-    return se
-
-class ConvBlock:
-  def __init__(self, h, w, inp, filters=128, conv=3):
-    self.h, self.w = h, w
-    self.inp = inp
-    #init weights
-    self.cweights = [Tensor.scaled_uniform(filters, inp if i==0 else filters, conv, conv) for i in range(3)]
-    self.cbiases = [Tensor.scaled_uniform(1, filters, 1, 1) for i in range(3)]
-    #init layers
-    self._bn = BatchNorm2d(128)
-    self._seb = SqueezeExciteBlock2D(filters)
-
-  def __call__(self, input):
-    x = input.reshape(shape=(-1, self.inp, self.w, self.h))
-    for cweight, cbias in zip(self.cweights, self.cbiases):
-      x = x.pad(padding=[1,1,1,1]).conv2d(cweight).add(cbias).relu()
-    x = self._bn(x)
-    x = self._seb(x)
-    return x
-
-class BigConvNet:
-  def __init__(self):
-    self.conv = [ConvBlock(28,28,1), ConvBlock(28,28,128), ConvBlock(14,14,128)]
-    self.weight1 = Tensor.scaled_uniform(128,10)
-    self.weight2 = Tensor.scaled_uniform(128,10)
-
-  def parameters(self):
-    if DEBUG: #keeping this for a moment
-      pars = [par for par in get_parameters(self) if par.requires_grad]
-      no_pars = 0
-      for par in pars:
-        print(par.shape)
-        no_pars += np.prod(par.shape)
-      print('no of parameters', no_pars)
-      return pars
-    else:
-      return get_parameters(self)
-
-  def save(self, filename):
-    with open(filename+'.npy', 'wb') as f:
-      for par in get_parameters(self):
-        #if par.requires_grad:
-        np.save(f, par.numpy())
-
-  def load(self, filename):
-    with open(filename+'.npy', 'rb') as f:
-      for par in get_parameters(self):
-        #if par.requires_grad:
-        try:
-          par.numpy()[:] = np.load(f)
-          if GPU:
-            par.gpu()
-        except:
-          print('Could not load parameter')
-
-  def forward(self, x):
-    x = self.conv[0](x)
-    x = self.conv[1](x)
-    x = x.avg_pool2d(kernel_size=(2,2))
-    x = self.conv[2](x)
-    x1 = x.avg_pool2d(kernel_size=(14,14)).reshape(shape=(-1,128)) #global
-    x2 = x.max_pool2d(kernel_size=(14,14)).reshape(shape=(-1,128)) #global
-    xo = x1.dot(self.weight1) + x2.dot(self.weight2)
-    return xo
-
-
-if __name__ == "__main__":
-  lrs = [1e-4, 1e-5] if QUICK else [1e-3, 1e-4, 1e-5, 1e-5]
-  epochss = [2, 1] if QUICK else [13, 3, 3, 1]
-  BS = 32
-
-  lmbd = 0.00025
-  lossfn = lambda out,y: out.sparse_categorical_crossentropy(y) + lmbd*(model.weight1.abs() + model.weight2.abs()).sum()
-  X_train, Y_train, X_test, Y_test = fetch_mnist()
-  X_train = X_train.reshape(-1, 28, 28).astype(np.uint8)
-  X_test = X_test.reshape(-1, 28, 28).astype(np.uint8)
-  steps = len(X_train)//BS
-  np.random.seed(1337)
-  if QUICK:
-    steps = 1
-    X_test, Y_test = X_test[:BS], Y_test[:BS]
-
-  model = BigConvNet()
-
-  if len(sys.argv) > 1:
-    try:
-      model.load(sys.argv[1])
-      print('Loaded weights "'+sys.argv[1]+'", evaluating...')
-      evaluate(model, X_test, Y_test, BS=BS)
-    except:
-      print('could not load weights "'+sys.argv[1]+'".')
-
-  if GPU:
-    params = get_parameters(model)
-    [x.gpu_() for x in params]
-
-  for lr, epochs in zip(lrs, epochss):
-    optimizer = optim.Adam(model.parameters(), lr=lr)
-    for epoch in range(1,epochs+1):
-      #first epoch without augmentation
-      X_aug = X_train if epoch == 1 else augment_img(X_train)
-      train(model, X_aug, Y_train, optimizer, steps=steps, lossfn=lossfn, BS=BS)
-      accuracy = evaluate(model, X_test, Y_test, BS=BS)
-      model.save(f'examples/checkpoint{accuracy * 1e6:.0f}')
--- a/examples/simple_conv_bn.py
+++ b/examples/simple_conv_bn.py
@ -1,17 +0,0 @@
-from tinygrad.tensor import Tensor
-from tinygrad.nn import Conv2d, BatchNorm2d
-from tinygrad.nn.state import get_parameters
-
-if __name__ == "__main__":
-  with Tensor.train():
-
-    BS, C1, H, W = 4, 16, 224, 224
-    C2, K, S, P = 64, 7, 2, 1
-
-    x = Tensor.uniform(BS, C1, H, W)
-    conv = Conv2d(C1, C2, kernel_size=K, stride=S, padding=P)
-    bn = BatchNorm2d(C2, track_running_stats=False)
-    for t in get_parameters([x, conv, bn]): t.realize()
-
-    print("running network")
-    x.sequential([conv, bn]).numpy()
--- a/examples/so_vits_svc.py
+++ b/examples/so_vits_svc.py
@ -1,669 +0,0 @@
-# original implementation: https://github.com/svc-develop-team/so-vits-svc
-from __future__ import annotations
-import sys, logging, time, io, math, argparse, operator, numpy as np
-from functools import partial, reduce
-from pathlib import Path
-from typing import Tuple, Optional, Type
-from tinygrad import nn, dtypes, Tensor
-from tinygrad.helpers import getenv, fetch
-from tinygrad.nn.state import torch_load
-from examples.vits import ResidualCouplingBlock, PosteriorEncoder, Encoder, ResBlock1, ResBlock2, LRELU_SLOPE, sequence_mask, split, get_hparams_from_file, load_checkpoint, weight_norm, HParams
-from examples.sovits_helpers import preprocess
-import soundfile
-
-DEBUG = getenv("DEBUG")
-
-F0_BIN = 256
-F0_MAX = 1100.0
-F0_MIN = 50.0
-F0_MEL_MIN = 1127 * np.log(1 + F0_MIN / 700)
-F0_MEL_MAX = 1127 * np.log(1 + F0_MAX / 700)
-
-class SpeechEncoder:
-  def __init__(self, hidden_dim, model:ContentVec): self.hidden_dim, self.model = hidden_dim, model
-  def encode(self, ): raise NotImplementedError("implement me")
-  @classmethod
-  def load_from_pretrained(cls, checkpoint_path:str, checkpoint_url:str) -> ContentVec:
-    contentvec = ContentVec.load_from_pretrained(checkpoint_path, checkpoint_url)
-    return cls(contentvec)
-
-class ContentVec256L9(SpeechEncoder):
-  def __init__(self, model:ContentVec): super().__init__(hidden_dim=256, model=model)
-  def encode(self, wav: Tensor):
-    feats = wav
-    if len(feats.shape) == 2:  # double channels
-      feats = feats.mean(-1)
-    assert len(feats.shape) == 1, feats.dim()
-    feats = feats.reshape(1, -1)
-    padding_mask = Tensor.zeros_like(feats).cast(dtypes.bool)
-    logits = self.model.extract_features(feats.to(wav.device), padding_mask=padding_mask.to(wav.device), output_layer=9)
-    feats = self.model.final_proj(logits[0])
-    return feats.transpose(1,2)
-
-class ContentVec768L12(SpeechEncoder):
-  def __init__(self, model:ContentVec): super().__init__(hidden_dim=768, model=model)
-  def encode(self, wav: Tensor):
-    feats = wav
-    if len(feats.shape) == 2:  # double channels
-      feats = feats.mean(-1)
-    assert len(feats.shape) == 1, feats.dim()
-    feats = feats.reshape(1, -1)
-    padding_mask = Tensor.zeros_like(feats).cast(dtypes.bool)
-    logits = self.model.extract_features(feats.to(wav.device), padding_mask=padding_mask.to(wav.device), output_layer=12)
-    return logits[0].transpose(1,2)
-
-# original code for contentvec: https://github.com/auspicious3000/contentvec/
-class ContentVec:
-  # self.final_proj dims are hardcoded and depend on fairseq.data.dictionary Dictionary in the checkpoint. This param can't yet be loaded since there is no pickle for it. See with DEBUG=2.
-  # This means that the ContentVec only works with the hubert weights used in all SVC models
-  def __init__(self, cfg: HParams):
-    self.feature_grad_mult, self.untie_final_proj = cfg.feature_grad_mult, cfg.untie_final_proj
-    feature_enc_layers = eval(cfg.conv_feature_layers)
-    self.embed = feature_enc_layers[-1][0]
-    final_dim = cfg.final_dim if cfg.final_dim > 0 else cfg.encoder_embed_dim
-    self.feature_extractor = ConvFeatureExtractionModel(conv_layers=feature_enc_layers, dropout=0.0, mode=cfg.extractor_mode, conv_bias=cfg.conv_bias)
-    self.post_extract_proj = nn.Linear(self.embed, cfg.encoder_embed_dim) if self.embed != cfg.encoder_embed_dim else None
-    self.encoder = TransformerEncoder(cfg)
-    self.layer_norm = nn.LayerNorm(self.embed)
-    self.final_proj = nn.Linear(cfg.encoder_embed_dim, final_dim * 1) if self.untie_final_proj else nn.Linear(cfg.encoder_embed_dim, final_dim)
-    self.mask_emb = Tensor.uniform(cfg.encoder_embed_dim, dtype=dtypes.float32)
-    self.label_embs_concat = Tensor.uniform(504, final_dim, dtype=dtypes.float32)
-  def forward_features(self, source, padding_mask):
-    if self.feature_grad_mult > 0:
-      features = self.feature_extractor(source, padding_mask)
-      if self.feature_grad_mult != 1.0: pass  # training: GradMultiply.forward(features, self.feature_grad_mult)
-    else:
-      features = self.feature_extractor(source, padding_mask)
-    return features
-  def forward_padding_mask(self, features, padding_mask):  # replaces original forward_padding_mask for batch inference
-    lengths_org = tilde(padding_mask.cast(dtypes.bool)).cast(dtypes.int64).sum(1)  # ensure its bool for tilde
-    lengths = (lengths_org - 400).float().div(320).floor().cast(dtypes.int64) + 1  # intermediate float to divide
-    padding_mask = lengths_to_padding_mask(lengths)
-    return padding_mask
-  def extract_features(self, source: Tensor, spk_emb:Tensor=None, padding_mask=None, ret_conv=False, output_layer=None, tap=False):
-    features = self.forward_features(source, padding_mask)
-    if padding_mask is not None:
-      padding_mask = self.forward_padding_mask(features, padding_mask)
-    features = features.transpose(1, 2)
-    features = self.layer_norm(features)
-    if self.post_extract_proj is not None:
-      features = self.post_extract_proj(features)
-    x, _ = self.encoder(features, spk_emb, padding_mask=padding_mask, layer=(None if output_layer is None else output_layer - 1), tap=tap)
-    res = features if ret_conv else x
-    return res, padding_mask
-  @classmethod
-  def load_from_pretrained(cls, checkpoint_path:str, checkpoint_url:str) -> ContentVec:
-    fetch(checkpoint_url, checkpoint_path)
-    cfg = load_fairseq_cfg(checkpoint_path)
-    enc = cls(cfg.model)
-    _ = load_checkpoint_enc(checkpoint_path, enc, None)
-    logging.debug(f"{cls.__name__}: Loaded model with cfg={cfg}")
-    return enc
-
-class TransformerEncoder:
-  def __init__(self, cfg: HParams):
-    def make_conv() -> nn.Conv1d:
-      layer = nn.Conv1d(self.embedding_dim, self.embedding_dim, kernel_size=cfg.conv_pos, padding=cfg.conv_pos // 2, groups=cfg.conv_pos_groups)
-      std = std = math.sqrt(4 / (cfg.conv_pos * self.embedding_dim))
-      layer.weight, layer.bias = (Tensor.normal(*layer.weight.shape, std=std)), (Tensor.zeros(*layer.bias.shape))
-      # for training: layer.weights need to be weight_normed
-      return layer
-    self.dropout, self.embedding_dim, self.layer_norm_first, self.layerdrop, self.num_layers, self.num_layers_1 = cfg.dropout, cfg.encoder_embed_dim, cfg.layer_norm_first, cfg.encoder_layerdrop, cfg.encoder_layers, cfg.encoder_layers_1
-    self.pos_conv, self.pos_conv_remove = [make_conv()], (1 if cfg.conv_pos % 2 == 0 else 0)
-    self.layers = [
-      TransformerEncoderLayer(self.embedding_dim, cfg.encoder_ffn_embed_dim, cfg.encoder_attention_heads, self.dropout, cfg.attention_dropout, cfg.activation_dropout, cfg.activation_fn, self.layer_norm_first, cond_layer_norm=(i >= cfg.encoder_layers))
-      for i in range(cfg.encoder_layers + cfg.encoder_layers_1)
-      ]
-    self.layer_norm = nn.LayerNorm(self.embedding_dim)
-    self.cond_layer_norm = CondLayerNorm(self.embedding_dim) if cfg.encoder_layers_1 > 0 else None
-    # training: apply init_bert_params
-  def __call__(self, x, spk_emb, padding_mask=None, layer=None, tap=False):
-    x, layer_results = self.extract_features(x, spk_emb, padding_mask, layer, tap)
-    if self.layer_norm_first and layer is None:
-      x = self.cond_layer_norm(x, spk_emb) if (self.num_layers_1 > 0) else self.layer_norm(x)
-    return x, layer_results
-  def extract_features(self, x: Tensor, spk_emb: Tensor, padding_mask=None, tgt_layer=None, tap=False):
-    if tgt_layer is not None:  # and not self.training
-      assert tgt_layer >= 0 and tgt_layer < len(self.layers)
-    if padding_mask is not None:
-      # x[padding_mask] = 0
-      assert padding_mask.shape == x.shape[:len(padding_mask.shape)]  # first few dims of x must match padding_mask
-      tmp_mask = padding_mask.unsqueeze(-1).repeat((1, 1, x.shape[-1]))
-      tmp_mask = tilde(tmp_mask.cast(dtypes.bool))
-      x = tmp_mask.where(x, 0)
-    x_conv = self.pos_conv[0](x.transpose(1,2))
-    if self.pos_conv_remove > 0: x_conv = x_conv[:, :, : -self.pos_conv_remove]
-    x_conv = x_conv.gelu().transpose(1, 2)
-    x = (x + x_conv).transpose(0, 1)  # B x T x C -> T x B x C
-    if not self.layer_norm_first: x = self.layer_norm(x)
-    x = x.dropout(p=self.dropout)
-    layer_results = []
-    r = None
-    for i, layer in enumerate(self.layers):
-      if i < self.num_layers:  # if (not self.training or (dropout_probability > self.layerdrop)) and (i < self.num_layers):
-        assert layer.cond_layer_norm == False
-        x = layer(x, self_attn_padding_mask=padding_mask, need_weights=False)
-        if tgt_layer is not None or tap:
-          layer_results.append(x.transpose(0, 1))
-      if i>= self.num_layers:
-        assert layer.cond_layer_norm == True
-        x = layer(x, emb=spk_emb, self_attn_padding_mask=padding_mask, need_weights=False)
-      if i == tgt_layer:
-        r = x
-        break
-    if r is not None:
-      x = r
-    x = x.transpose(0, 1)  # T x B x C -> B x T x C
-    return x, layer_results
-
-class TransformerEncoderLayer:
-  def __init__(self, embedding_dim=768.0, ffn_embedding_dim=3072.0, num_attention_heads=8.0, dropout=0.1, attention_dropout=0.1, activation_dropout=0.1, activation_fn="relu", layer_norm_first=False, cond_layer_norm=False):
-    def get_activation_fn(activation):
-      if activation == "relu": return Tensor.relu
-      if activation == "gelu": return Tensor.gelu
-      else: raise RuntimeError(f"activation function={activation} is not forseen")
-    self.embedding_dim, self.dropout, self.activation_dropout, self.layer_norm_first, self.num_attention_heads, self.cond_layer_norm, self.activation_fn = embedding_dim, dropout, activation_dropout, layer_norm_first, num_attention_heads, cond_layer_norm, get_activation_fn(activation_fn)
-    self.self_attn = MultiHeadAttention(self.embedding_dim, self.num_attention_heads)
-    self.self_attn_layer_norm = nn.LayerNorm(self.embedding_dim) if not cond_layer_norm else CondLayerNorm(self.embedding_dim)
-    self.fc1 = nn.Linear(self.embedding_dim, ffn_embedding_dim)
-    self.fc2 = nn.Linear(ffn_embedding_dim, self.embedding_dim)
-    self.final_layer_norm = nn.LayerNorm(self.embedding_dim) if not cond_layer_norm else CondLayerNorm(self.embedding_dim)
-  def __call__(self, x:Tensor, self_attn_mask:Tensor=None, self_attn_padding_mask:Tensor=None, emb:Tensor=None, need_weights=False):
-    #self_attn_padding_mask = self_attn_padding_mask.reshape(x.shape[0], 1, 1, self_attn_padding_mask.shape[1]).expand(-1, self.num_attention_heads, -1, -1).reshape(x.shape[0] * self.num_attention_heads, 1, self_attn_padding_mask.shape[1]) if self_attn_padding_mask is not None else None
-    assert self_attn_mask is None and self_attn_padding_mask is not None
-    residual = x
-    if self.layer_norm_first:
-      x = self.self_attn_layer_norm(x) if not self.cond_layer_norm else self.self_attn_layer_norm(x, emb)
-      x = self.self_attn(x=x, mask=self_attn_padding_mask)
-      x = x.dropout(self.dropout)
-      x = residual + x
-      x = self.final_layer_norm(x) if not self.cond_layer_norm else self.final_layer_norm(x, emb)
-      x = self.activation_fn(self.fc1(x))
-      x = x.dropout(self.activation_dropout)
-      x = self.fc2(x)
-      x = x.dropout(self.dropout)
-      x = residual + x
-    else:
-      x = self.self_attn(x=x, mask=self_attn_padding_mask)
-      x = x.dropout(self.dropout)
-      x = residual + x
-      x = self.self_attn_layer_norm(x) if not self.cond_layer_norm else self.self_attn_layer_norm(x, emb)
-      residual = x
-      x = self.activation_fn(self.fc1(x))
-      x = x.dropout(self.activation_dropout)
-      x = self.fc2(x)
-      x = x.dropout(self.dropout)
-      x = residual + x
-      x = self.final_layer_norm(x) if not self.cond_layer_norm else self.final_layer_norm(x, emb)
-    return x
-
-class MultiHeadAttention:
-  def __init__(self, n_state, n_head):
-    self.n_state, self.n_head = n_state, n_head
-    self.q_proj, self.k_proj, self.v_proj, self.out_proj = [nn.Linear(n_state, n_state) for _ in range(4)]
-  def __call__(self, x:Tensor, xa:Optional[Tensor]=None, mask:Optional[Tensor]=None):
-    x = x.transpose(0,1)  # TxBxC -> BxTxC
-    q, k, v = self.q_proj(x), self.k_proj(xa or x), self.v_proj(xa or x)
-    q, k, v = [x.reshape(*q.shape[:2], self.n_head, -1) for x in (q, k, v)]
-    wv = Tensor.scaled_dot_product_attention(q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2), None).transpose(1, 2).reshape(*x.shape[:2], -1)
-    ret =  self.out_proj(wv).transpose(0,1)  # BxTxC -> TxBxC
-    return ret
-
-class ConvFeatureExtractionModel:
-  def __init__(self, conv_layers, dropout=.0, mode="default", conv_bias=False):
-    assert mode in {"default", "group_norm_masked", "layer_norm"}
-    def block(n_in, n_out, k, stride, is_layer_norm=False, is_group_norm=False, conv_bias=False):
-      def make_conv():
-        conv = nn.Conv1d(n_in, n_out, k, stride=stride, bias=conv_bias)
-        conv.weight = Tensor.kaiming_normal(*conv.weight.shape)
-        return conv
-      assert (is_layer_norm and is_group_norm) == False, "layer norm and group norm are exclusive"
-      if is_layer_norm:
-        return [make_conv(), partial(Tensor.dropout, p=dropout),[partial(Tensor.transpose, dim0=-2, dim1=-1), nn.LayerNorm(dim, elementwise_affine=True), partial(Tensor.transpose, dim0=-2, dim1=-1)], Tensor.gelu]
-      elif is_group_norm and mode == "default":
-        return [make_conv(), partial(Tensor.dropout, p=dropout), nn.GroupNorm(dim, dim, affine=True), Tensor.gelu]
-      elif is_group_norm and mode == "group_norm_masked":
-        return [make_conv(), partial(Tensor.dropout, p=dropout), GroupNormMasked(dim, dim, affine=True), Tensor.gelu]
-      else:
-        return [make_conv(), partial(Tensor.dropout, p=dropout), Tensor.gelu]
-    in_d, self.conv_layers, self.mode = 1, [], mode
-    for i, cl in enumerate(conv_layers):
-      assert len(cl) == 3, "invalid conv definition: " + str(cl)
-      (dim, k, stride) = cl
-      if i == 0: self.cl = cl
-      self.conv_layers.append(block(in_d, dim, k, stride, is_layer_norm=(mode == "layer_norm"), is_group_norm=((mode == "default" or mode == "group_norm_masked") and i == 0), conv_bias=conv_bias))
-      in_d = dim
-  def __call__(self, x:Tensor, padding_mask:Tensor):
-    x = x.unsqueeze(1)  # BxT -> BxCxT
-    if self.mode == "group_norm_masked":
-      if padding_mask is not None:
-        _, k, stride = self.cl
-        lengths_org = tilde(padding_mask.cast(dtypes.bool)).cast(dtypes.int64).sum(1)  # ensure padding_mask is bool for tilde
-        lengths = (((lengths_org - k) / stride) + 1).floor().cast(dtypes.int64)
-        padding_mask = tilde(lengths_to_padding_mask(lengths)).cast(dtypes.int64)  # lengths_to_padding_mask returns bool tensor
-      x = self.conv_layers[0][0](x)  # padding_mask is numeric
-      x = self.conv_layers[0][1](x)
-      x = self.conv_layers[0][2](x, padding_mask)
-      x = self.conv_layers[0][3](x)
-    else:
-      x = x.sequential(self.conv_layers[0])  # default
-    for _, conv in enumerate(self.conv_layers[1:], start=1):
-      conv = reduce(lambda a,b: operator.iconcat(a,b if isinstance(b, list) else [b]), conv, [])  # flatten
-      x = x.sequential(conv)
-    return x
-
-class CondLayerNorm:  # https://github.com/auspicious3000/contentvec/blob/main/contentvec/modules/cond_layer_norm.py#L10
-  def __init__(self, dim_last, eps=1e-5, dim_spk=256, elementwise_affine=True):
-    self.dim_last, self.eps, self.dim_spk, self.elementwise_affine = dim_last, eps, dim_spk, elementwise_affine
-    if self.elementwise_affine:
-      self.weight_ln = nn.Linear(self.dim_spk, self.dim_last, bias=False)
-      self.bias_ln = nn.Linear(self.dim_spk, self.dim_last, bias=False)
-      self.weight_ln.weight, self.bias_ln.weight = (Tensor.ones(*self.weight_ln.weight.shape)), (Tensor.zeros(*self.bias_ln.weight.shape))
-  def __call__(self, x: Tensor, spk_emb: Tensor):
-    axis = tuple(-1-i for i in range(len(x.shape[1:])))
-    x = x.layernorm(axis=axis, eps=self.eps)
-    if not self.elementwise_affine: return x
-    weights, bias = self.weight_ln(spk_emb), self.bias_ln(spk_emb)
-    return weights * x + bias
-
-class GroupNormMasked:  # https://github.com/auspicious3000/contentvec/blob/d746688a32940f4bee410ed7c87ec9cf8ff04f74/contentvec/modules/fp32_group_norm.py#L16
-  def __init__(self, num_groups, num_channels, eps=1e-5, affine=True):
-    self.num_groups, self.num_channels, self.eps, self.affine = num_groups, num_channels, eps, affine
-    self.weight, self.bias = (Tensor.ones(num_channels)), (Tensor.zeros(num_channels)) if self.affine else (None, None)
-  def __call__(self, x:Tensor, mask:Tensor):
-    bsz, n_c, length = x.shape
-    assert n_c % self.num_groups == 0
-    x = x.reshape(bsz, self.num_groups, n_c // self.num_groups, length)
-    if mask is None: mask = Tensor.ones_like(x)
-    else: mask = mask.reshape(bsz, 1, 1, length)
-    x = x * mask
-    lengths = mask.sum(axis=3, keepdim=True)
-    assert x.shape[2] == 1
-    mean_ = x.mean(dim=3, keepdim=True)
-    mean = mean_ * length / lengths
-    var = (((x.std(axis=3, keepdim=True) ** 2) + mean_**2) * length / lengths - mean**2) + self.eps
-    return x.add(-mean).div(var.sqrt()).reshape(bsz, n_c, length).mul(self.weight.reshape(1,-1,1)).add(self.bias.reshape(1,-1,1))
-
-class Synthesizer:
-  def __init__(self, spec_channels, segment_size, inter_channels, hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout, resblock, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates, upsample_initial_channel, upsample_kernel_sizes, gin_channels, ssl_dim, n_speakers, sampling_rate=44100, vol_embedding=False, n_flow_layer=4, **kwargs):
-    self.spec_channels, self.inter_channels, self.hidden_channels, self.filter_channels, self.n_heads, self.n_layers, self.kernel_size, self.p_dropout, self.resblock, self.resblock_kernel_sizes, self.resblock_dilation_sizes, self.upsample_rates, self.upsample_initial_channel, self.upsample_kernel_sizes, self.segment_size, self.n_speakers, self.gin_channels, self.vol_embedding = spec_channels, inter_channels, hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout, resblock, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates, upsample_initial_channel, upsample_kernel_sizes, segment_size, n_speakers, gin_channels, vol_embedding
-    self.emb_g = nn.Embedding(n_speakers, gin_channels)
-    if vol_embedding: self.emb_vol = nn.Linear(1, hidden_channels)
-    self.pre = nn.Conv1d(ssl_dim, hidden_channels, kernel_size=5, padding=2)
-    self.enc_p = TextEncoder(inter_channels, hidden_channels, kernel_size, n_layers, filter_channels=filter_channels, n_heads=n_heads, p_dropout=p_dropout)
-    self.dec = Generator(sampling_rate, inter_channels, resblock, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates, upsample_initial_channel, upsample_kernel_sizes, gin_channels)
-    self.enc_q = PosteriorEncoder(spec_channels, inter_channels, hidden_channels, 5, 1, 16, gin_channels=gin_channels)
-    self.flow = ResidualCouplingBlock(inter_channels, hidden_channels, 5, 1, n_flow_layer, gin_channels=gin_channels)
-    self.emb_uv = nn.Embedding(vocab_size=2, embed_size=hidden_channels)
-  def infer(self, c:Tensor, f0:Tensor, uv:Tensor, g:Tensor=None, noise_scale=0.35, seed=52468, vol=None) -> Tuple[Tensor, Tensor]:
-    Tensor.manual_seed(getenv('SEED', seed))
-    c_lengths = (Tensor.ones([c.shape[0]]) * c.shape[-1]).to(c.device)
-    if len(g.shape) == 1: g = g.unsqueeze(0)
-    g = self.emb_g(g).transpose(1, 2)
-    x_mask = sequence_mask(c_lengths, c.shape[2]).unsqueeze(1).cast(c.dtype)
-    vol = self.emb_vol(vol[:,:,None]).transpose(1,2) if vol is not None and self.vol_embedding else 0
-    x = self.pre(c) * x_mask + self.emb_uv(uv.cast(dtypes.int64)).transpose(1, 2) + vol
-    z_p, _, _, c_mask = self.enc_p.forward(x, x_mask, f0=self._f0_to_coarse(f0), noise_scale=noise_scale)
-    z = self.flow.forward(z_p, c_mask, g=g, reverse=True)
-    o = self.dec.forward(z * c_mask, g=g, f0=f0)
-    return o,f0
-  def _f0_to_coarse(self, f0 : Tensor):
-    f0_mel = 1127 * (1 + f0 / 700).log()
-    a = (F0_BIN - 2) / (F0_MEL_MAX - F0_MEL_MIN)
-    b = F0_MEL_MIN * a - 1.
-    f0_mel = (f0_mel > 0).where(f0_mel * a - b, f0_mel)
-    f0_coarse = f0_mel.ceil().cast(dtype=dtypes.int64)
-    f0_coarse = f0_coarse * (f0_coarse > 0)
-    f0_coarse = f0_coarse + ((f0_coarse < 1) * 1)
-    f0_coarse = f0_coarse * (f0_coarse < F0_BIN)
-    f0_coarse = f0_coarse + ((f0_coarse >= F0_BIN) * (F0_BIN - 1))
-    return f0_coarse
-  @classmethod
-  def load_from_pretrained(cls, config_path:str, config_url:str, weights_path:str, weights_url:str) -> Synthesizer:
-    fetch(config_url, config_path)
-    hps = get_hparams_from_file(config_path)
-    fetch(weights_url, weights_path)
-    net_g = cls(hps.data.filter_length // 2 + 1, hps.train.segment_size // hps.data.hop_length, **hps.model)
-    _ = load_checkpoint(weights_path, net_g, None, skip_list=["f0_decoder"])
-    logging.debug(f"{cls.__name__}:Loaded model with hps: {hps}")
-    return net_g, hps
-
-class TextEncoder:
-  def __init__(self, out_channels, hidden_channels, kernel_size, n_layers, gin_channels=0, filter_channels=None, n_heads=None, p_dropout=None):
-    self.out_channels, self.hidden_channels, self.kernel_size, self.n_layers, self.gin_channels = out_channels, hidden_channels, kernel_size, n_layers, gin_channels
-    self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
-    self.f0_emb = nn.Embedding(256, hidden_channels)  # n_vocab = 256
-    self.enc_ = Encoder(hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout)
-  def forward(self, x, x_mask, f0=None, noise_scale=1):
-    x = x + self.f0_emb(f0).transpose(1, 2)
-    x = self.enc_.forward(x * x_mask, x_mask)
-    stats = self.proj(x) * x_mask
-    m, logs = split(stats, self.out_channels, dim=1)
-    z = (m + randn_like(m) * logs.exp() * noise_scale) * x_mask
-    return z, m, logs, x_mask
-
-class Upsample:
-  def __init__(self, scale_factor):
-    assert scale_factor % 1 == 0, "Only integer scale factor allowed."
-    self.scale = int(scale_factor)
-  def forward(self, x:Tensor):
-    repeats = tuple([1] * len(x.shape) + [self.scale])
-    new_shape = (*x.shape[:-1], x.shape[-1] * self.scale)
-    return x.unsqueeze(-1).repeat(repeats).reshape(new_shape)
-
-class SineGen:
-  def __init__(self, samp_rate, harmonic_num=0, sine_amp=0.1, noise_std=0.003, voice_threshold=0, flag_for_pulse=False):
-    self.sine_amp, self.noise_std, self.harmonic_num, self.sampling_rate, self.voiced_threshold, self.flag_for_pulse = sine_amp, noise_std, harmonic_num, samp_rate, voice_threshold, flag_for_pulse
-    self.dim = self.harmonic_num + 1
-  def _f02uv(self, f0): return (f0 > self.voiced_threshold).float()  #generate uv signal
-  def _f02sine(self, f0_values):
-    def padDiff(x : Tensor): return (x.pad((0,0,-1,1)) - x).pad((0,0,0,-1))
-    def mod(x: Tensor, n: int) -> Tensor: return x - n * x.div(n).floor()  # this is what the % operator does in pytorch.
-    rad_values = mod((f0_values / self.sampling_rate) , 1)  # convert to F0 in rad
-    rand_ini = Tensor.rand(f0_values.shape[0], f0_values.shape[2], device=f0_values.device)  # initial phase noise
-
-    #rand_ini[:, 0] = 0
-    m = Tensor.ones(f0_values.shape[0]).unsqueeze(1).pad((0,f0_values.shape[2]-1,0,0)).cast(dtypes.bool)
-    m = tilde(m)
-    rand_ini = m.where(rand_ini, 0)
-
-    #rad_values[:, 0, :] = rad_values[:, 0, :] + rand_ini
-    tmp = rad_values[:, 0, :] + rand_ini
-    m = Tensor.ones(tmp.shape).pad((0,0,0,rad_values.shape[1]-1,0)).cast(dtypes.bool)
-    m = tilde(m)
-    tmp = tmp.unsqueeze(1).pad((0,0,0,rad_values.shape[1]-1,0))
-    rad_values = m.where(rad_values, tmp)
-
-    tmp_over_one = mod(rad_values.cumsum(1), 1)
-    tmp_over_one_idx = padDiff(tmp_over_one) < 0
-    cumsum_shift = Tensor.zeros_like(rad_values)
-
-    #cumsum_shift[:, 1:, :] = tmp_over_one_idx * -1.0
-    tmp_over_one_idx = (tmp_over_one_idx * -1.0).pad((0,0,1,0))
-    cumsum_shift = tmp_over_one_idx
-
-    sines = ((rad_values + cumsum_shift).cumsum(1) * 2 * np.pi).sin()
-    return sines
-  def forward(self, f0, upp=None):
-    fn = f0.mul(Tensor([[range(1, self.harmonic_num + 2)]], dtype=dtypes.float32).to(f0.device))
-    sine_waves = self._f02sine(fn) * self.sine_amp  #generate sine waveforms
-    uv = self._f02uv(f0)  # generate uv signal
-    noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3
-    noise = noise_amp * randn_like(sine_waves)
-    sine_waves = sine_waves * uv + noise
-    return sine_waves, uv, noise
-
-class SourceHnNSF:
-  def __init__(self, sampling_rate, harmonic_num=0, sine_amp=0.1, add_noise_std=0.003, voiced_threshold=0):
-    self.sine_amp, self.noise_std = sine_amp, add_noise_std
-    self.l_sin_gen = SineGen(sampling_rate, harmonic_num, sine_amp, add_noise_std, voiced_threshold)
-    self.l_linear = nn.Linear(harmonic_num + 1, 1)
-  def forward(self, x, upp=None):
-    sine_waves, uv, _ = self.l_sin_gen.forward(x, upp)
-    sine_merge = self.l_linear(sine_waves.cast(self.l_linear.weight.dtype)).tanh()
-    noise = randn_like(uv) * self.sine_amp / 3
-    return sine_merge, noise, uv
-
-# most of the hifigan in standard vits is reused here, but need to upsample and construct harmonic source from f0
-class Generator:
-  def __init__(self, sampling_rate, inter_channels, resblock, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates, upsample_initial_channel, upsample_kernel_sizes, gin_channels):
-    self.sampling_rate, self.inter_channels, self.resblock, self.resblock_kernel_sizes, self.resblock_dilation_sizes, self.upsample_rates, self.upsample_initial_channel, self.upsample_kernel_sizes, self.gin_channels = sampling_rate, inter_channels, resblock, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates, upsample_initial_channel, upsample_kernel_sizes, gin_channels
-    self.num_kernels, self.num_upsamples = len(resblock_kernel_sizes), len(upsample_rates)
-    self.conv_pre = nn.Conv1d(inter_channels, upsample_initial_channel, 7, 1, padding=3)
-    self.f0_upsamp = Upsample(scale_factor=np.prod(upsample_rates))
-    self.m_source = SourceHnNSF(sampling_rate, harmonic_num=8)
-    resblock = ResBlock1 if resblock == '1' else ResBlock2
-    self.ups, self.noise_convs, self.resblocks = [], [], []
-    for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
-      c_cur = upsample_initial_channel//(2**(i+1))
-      self.ups.append(nn.ConvTranspose1d(upsample_initial_channel//(2**i), c_cur, k, u, padding=(k-u)//2))
-      stride_f0 = int(np.prod(upsample_rates[i + 1:]))
-      self.noise_convs.append(nn.Conv1d(1, c_cur, kernel_size=stride_f0 * 2, stride=stride_f0, padding=(stride_f0+1) // 2) if (i + 1 < len(upsample_rates)) else nn.Conv1d(1, c_cur, kernel_size=1))
-    for i in range(len(self.ups)):
-      ch = upsample_initial_channel // (2 ** (i + 1))
-      for _, (k, d) in enumerate(zip(resblock_kernel_sizes, resblock_dilation_sizes)):
-        self.resblocks.append(resblock(ch, k, d))
-    self.conv_post = nn.Conv1d(ch, 1, 7, 1, padding=3)
-    if gin_channels != 0: self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1)
-    self.upp = np.prod(upsample_rates)
-  def forward(self, x, f0, g=None):
-    f0 = self.f0_upsamp.forward(f0[:, None]).transpose(1, 2)  # bs,n,t
-    har_source, _, _ = self.m_source.forward(f0, self.upp)
-    har_source = har_source.transpose(1, 2)
-    x = self.conv_pre(x)
-    if g is not None:  x = x + self.cond(g)
-    for i in range(self.num_upsamples):
-      x, xs = self.ups[i](x.leaky_relu(LRELU_SLOPE)), None
-      x_source = self.noise_convs[i](har_source)
-      x = x + x_source
-      for j in range(self.num_kernels):
-        if xs is None: xs = self.resblocks[i * self.num_kernels + j].forward(x)
-        else: xs += self.resblocks[i * self.num_kernels + j].forward(x)
-      x = xs / self.num_kernels
-    return self.conv_post(x.leaky_relu()).tanh()
-
-# **** helpers ****
-
-def randn_like(x:Tensor) -> Tensor: return Tensor.randn(*x.shape, dtype=x.dtype).to(device=x.device)
-
-def tilde(x: Tensor) -> Tensor:
-  if x.dtype == dtypes.bool: return (1 - x).cast(dtypes.bool)
-  return (x + 1) * -1  # this seems to be what the ~ operator does in pytorch for non bool
-
-def lengths_to_padding_mask(lens:Tensor) -> Tensor:
-  bsz, max_lens = lens.shape[0], lens.max().numpy().item()
-  mask = Tensor.arange(max_lens).to(lens.device).reshape(1, max_lens)
-  mask = mask.expand(bsz, -1) >= lens.reshape(bsz, 1).expand(-1, max_lens)
-  return mask.cast(dtypes.bool)
-
-def repeat_expand_2d_left(content, target_len): # content : [h, t]
-  src_len = content.shape[-1]
-  temp = np.arange(src_len+1) * target_len / src_len
-  current_pos, cols = 0, []
-  for i in range(target_len):
-    if i >= temp[current_pos+1]:
-      current_pos += 1
-    cols.append(content[:, current_pos])
-  return Tensor.stack(*cols).transpose(0, 1)
-
-def load_fairseq_cfg(checkpoint_path):
-  assert Path(checkpoint_path).is_file()
-  state = torch_load(checkpoint_path)
-  cfg = state["cfg"] if ("cfg" in state and state["cfg"] is not None) else None
-  if cfg is None: raise RuntimeError(f"No cfg exist in state keys = {state.keys()}")
-  return HParams(**cfg)
-
-def load_checkpoint_enc(checkpoint_path, model: ContentVec, optimizer=None, skip_list=[]):
-  assert Path(checkpoint_path).is_file()
-  start_time = time.time()
-  checkpoint_dict = torch_load(checkpoint_path)
-  saved_state_dict = checkpoint_dict['model']
-  weight_g, weight_v, parent = None, None, None
-  for key, v in saved_state_dict.items():
-    if any(layer in key for layer in skip_list): continue
-    try:
-      obj, skip = model, False
-      for k in key.split('.'):
-        if k.isnumeric(): obj = obj[int(k)]
-        elif isinstance(obj, dict): obj = obj[k]
-        else:
-          if k in ["weight_g", "weight_v"]:
-            parent, skip = obj, True
-            if k == "weight_g": weight_g = v
-            else: weight_v = v
-          if not skip:
-            parent = obj
-            obj = getattr(obj, k)
-      if weight_g and weight_v:
-        setattr(obj, "weight_g", weight_g.numpy())
-        setattr(obj, "weight_v", weight_v.numpy())
-        obj, v = getattr(parent, "weight"), weight_norm(weight_v, weight_g, 0)
-        weight_g, weight_v, parent, skip = None, None, None, False
-      if not skip and obj.shape == v.shape:
-        if "feature_extractor" in key and (isinstance(parent, (nn.GroupNorm, nn.LayerNorm))):  # cast
-          obj.assign(v.to(obj.device).float())
-        else:
-          obj.assign(v.to(obj.device))
-      elif not skip: logging.error(f"MISMATCH SHAPE IN {key}, {obj.shape} {v.shape}")
-    except Exception as e: raise e
-  logging.info(f"Loaded checkpoint '{checkpoint_path}' in {time.time() - start_time:.4f}s")
-  return model, optimizer
-
-def pad_array(arr, target_length):
-  current_length = arr.shape[0]
-  if current_length >= target_length: return arr
-  pad_width = target_length - current_length
-  pad_left = pad_width // 2
-  pad_right = pad_width - pad_left
-  padded_arr = np.pad(arr, (pad_left, pad_right), 'constant', constant_values=(0, 0))
-  return padded_arr
-
-def split_list_by_n(list_collection, n, pre=0):
-  for i in range(0, len(list_collection), n):
-    yield list_collection[i-pre if i-pre>=0 else i: i + n]
-
-def get_sid(spk2id:HParams, speaker:str) -> Tensor:
-  speaker_id = spk2id[speaker]
-  if not speaker_id and type(speaker) is int:
-    if len(spk2id.__dict__) >= speaker: speaker_id = speaker
-  if speaker_id is None: raise RuntimeError(f"speaker={speaker} not in the speaker list")
-  return Tensor([int(speaker_id)], dtype=dtypes.int64).unsqueeze(0)
-
-def get_encoder(ssl_dim) -> Type[SpeechEncoder]:
-  if ssl_dim == 256: return ContentVec256L9
-  if ssl_dim == 768: return ContentVec768L12
-
-#########################################################################################
-# CODE: https://github.com/svc-develop-team/so-vits-svc
-#########################################################################################
-# CONTENTVEC:
-#   CODE: https://github.com/auspicious3000/contentvec
-#   PAPER: https://arxiv.org/abs/2204.09224
-#########################################################################################
-# INSTALLATION: dependencies are for preprocessing and loading/saving audio.
-# pip3 install soundfile librosa praat-parselmouth
-#########################################################################################
-# EXAMPLE USAGE:
-# python3 examples/so_vits_svc.py --model tf2spy --file ~/recording.wav
-#########################################################################################
-# DEMO USAGE (uses audio sample from LJ-Speech):
-# python3 examples/so_vits_svc.py --model saul_goodman
-#########################################################################################
-SO_VITS_SVC_PATH = Path(__file__).parents[1] / "weights/So-VITS-SVC"
-VITS_MODELS = { # config_path, weights_path, config_url, weights_url
-  "saul_goodman" : (SO_VITS_SVC_PATH / "config_saul_gman.json", SO_VITS_SVC_PATH / "pretrained_saul_gman.pth", "https://huggingface.co/Amo/so-vits-svc-4.0_GA/resolve/main/ModelsFolder/Saul_Goodman_80000/config.json", "https://huggingface.co/Amo/so-vits-svc-4.0_GA/resolve/main/ModelsFolder/Saul_Goodman_80000/G_80000.pth"),
-  "drake" : (SO_VITS_SVC_PATH / "config_drake.json", SO_VITS_SVC_PATH / "pretrained_drake.pth", "https://huggingface.co/jaspa/so-vits-svc/resolve/main/aubrey/config_aubrey.json", "https://huggingface.co/jaspa/so-vits-svc/resolve/main/aubrey/pretrained_aubrey.pth"),
-  "cartman" : (SO_VITS_SVC_PATH / "config_cartman.json", SO_VITS_SVC_PATH / "pretrained_cartman.pth", "https://huggingface.co/marcoc2/so-vits-svc-4.0-models/resolve/main/EricCartman/config.json", "https://huggingface.co/marcoc2/so-vits-svc-4.0-models/resolve/main/EricCartman/G_10200.pth"),
-  "tf2spy" : (SO_VITS_SVC_PATH / "config_tf2spy.json", SO_VITS_SVC_PATH / "pretrained_tf2spy.pth", "https://huggingface.co/Amo/so-vits-svc-4.0_GA/resolve/main/ModelsFolder/TF2_spy_60k/config.json", "https://huggingface.co/Amo/so-vits-svc-4.0_GA/resolve/main/ModelsFolder/TF2_spy_60k/G_60000.pth"),
-  "tf2heavy" : (SO_VITS_SVC_PATH / "config_tf2heavy.json", SO_VITS_SVC_PATH / "pretrained_tf2heavy.pth", "https://huggingface.co/Amo/so-vits-svc-4.0_GA/resolve/main/ModelsFolder/TF2_heavy_100k/config.json", "https://huggingface.co/Amo/so-vits-svc-4.0_GA/resolve/main/ModelsFolder/TF2_heavy_100k/G_100000.pth"),
-  "lady_gaga" : (SO_VITS_SVC_PATH / "config_gaga.json", SO_VITS_SVC_PATH / "pretrained_gaga.pth", "https://huggingface.co/marcoc2/so-vits-svc-4.0-models/resolve/main/LadyGaga/config.json", "https://huggingface.co/marcoc2/so-vits-svc-4.0-models/resolve/main/LadyGaga/G_14400.pth")
-}
-ENCODER_MODELS = { # weights_path, weights_url
-  "contentvec": (SO_VITS_SVC_PATH / "contentvec_checkpoint.pt", "https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/hubert_base.pt")
-}
-ENCODER_MODEL = "contentvec"
-DEMO_PATH, DEMO_URL = Path(__file__).parents[1] / "temp/LJ037-0171.wav", "https://keithito.com/LJ-Speech-Dataset/LJ037-0171.wav"
-if __name__=="__main__":
-  logging.basicConfig(stream=sys.stdout, level=(logging.INFO if DEBUG < 1 else logging.DEBUG))
-  parser = argparse.ArgumentParser()
-  parser.add_argument("-m", "--model", default=None, help=f"Specify the model to use. All supported models: {VITS_MODELS.keys()}", required=True)
-  parser.add_argument("-f", "--file", default=DEMO_PATH, help=f"Specify the path of the input file")
-  parser.add_argument("--out_dir", default=str(Path(__file__).parents[1] / "temp"), help="Specify the output path.")
-  parser.add_argument("--out_path", default=None, help="Specify the full output path. Overrides the --out_dir and --name parameter.")
-  parser.add_argument("--base_name", default="test", help="Specify the base of the output file name. Default is 'test'.")
-  parser.add_argument("--speaker", default=None, help="If not specified, the first available speaker is chosen. Usually there is only one speaker per model.")
-  parser.add_argument("--noise_scale", default=0.4)
-  parser.add_argument("--tran", default=0.0, help="Pitch shift, supports positive and negative (semitone) values. Default 0.0")
-  parser.add_argument("--pad_seconds", default=0.5)
-  parser.add_argument("--lg_num", default=0.0)
-  parser.add_argument("--clip_seconds", default=0.0)
-  parser.add_argument("--slice_db", default=-40)
-  args = parser.parse_args()
-
-  vits_model = args.model
-  encoder_location, vits_location = ENCODER_MODELS[ENCODER_MODEL], VITS_MODELS[vits_model]
-
-  Tensor.training = False
-  # Get Synthesizer and ContentVec
-  net_g, hps = Synthesizer.load_from_pretrained(vits_location[0], vits_location[2], vits_location[1], vits_location[3])
-  Encoder = get_encoder(hps.model.ssl_dim)
-  encoder = Encoder.load_from_pretrained(encoder_location[0], encoder_location[1])
-
-  # model config args
-  target_sample, spk2id, hop_length, target_sample = hps.data.sampling_rate, hps.spk, hps.data.hop_length, hps.data.sampling_rate
-  vol_embedding = hps.model.vol_embedding if hasattr(hps.data, "vol_embedding") and hps.model.vol_embedding is not None else False
-
-  # args
-  slice_db, clip_seconds, lg_num, pad_seconds, tran, noise_scale, audio_path = args.slice_db, args.clip_seconds, args.lg_num, args.pad_seconds, args.tran, args.noise_scale, args.file
-  speaker = args.speaker if args.speaker is not None else list(hps.spk.__dict__.keys())[0]
-
-  ### Loading audio and slicing ###
-  if audio_path == DEMO_PATH: fetch(DEMO_URL, DEMO_PATH)
-  assert Path(audio_path).is_file() and Path(audio_path).suffix == ".wav"
-  chunks = preprocess.cut(audio_path, db_thresh=slice_db)
-  audio_data, audio_sr = preprocess.chunks2audio(audio_path, chunks)
-
-  per_size = int(clip_seconds * audio_sr)
-  lg_size = int(lg_num * audio_sr)
-
-  ### Infer per slice ###
-  global_frame = 0
-  audio = []
-  for (slice_tag, data) in audio_data:
-    print(f"\n====segment start, {round(len(data) / audio_sr, 3)}s====")
-    length = int(np.ceil(len(data) / audio_sr * target_sample))
-
-    if slice_tag:
-      print("empty segment")
-      _audio = np.zeros(length)
-      audio.extend(list(pad_array(_audio, length)))
-      global_frame += length // hop_length
-      continue
-
-    datas = [data] if per_size == 0 else split_list_by_n(data, per_size, lg_size)
-
-    for k, dat in enumerate(datas):
-      per_length = int(np.ceil(len(dat) / audio_sr * target_sample)) if clip_seconds!=0 else length
-      pad_len = int(audio_sr * pad_seconds)
-      dat = np.concatenate([np.zeros([pad_len]), dat, np.zeros([pad_len])])
-      raw_path = io.BytesIO()
-      soundfile.write(raw_path, dat, audio_sr, format="wav")
-      raw_path.seek(0)
-
-      ### Infer START ###
-      wav, sr = preprocess.load_audiofile(raw_path)
-      wav = preprocess.sinc_interp_resample(wav, sr, target_sample)[0]
-      wav16k, f0, uv = preprocess.get_unit_f0(wav, tran, hop_length, target_sample)
-      sid = get_sid(spk2id, speaker)
-      n_frames = f0.shape[1]
-
-      # ContentVec infer
-      start = time.time()
-      c = encoder.encode(wav16k)
-      c = repeat_expand_2d_left(c.squeeze(0).realize(), f0.shape[1])  # interpolate speech encoding to match f0
-      c = c.unsqueeze(0).realize()
-      enc_time = time.time() - start
-
-      # VITS infer
-      vits_start = time.time()
-      out_audio, f0 = net_g.infer(c, f0=f0, uv=uv, g=sid, noise_scale=noise_scale, vol=None)
-      out_audio = out_audio[0,0].float().realize()
-      vits_time = time.time() - vits_start
-
-      infer_time = time.time() - start
-      logging.info("total infer time:{:.2f}s, speech_enc time:{:.2f}s, vits time:{:.2f}s".format(infer_time, enc_time, vits_time))
-      ### Infer END ###
-
-      out_sr, out_frame = out_audio.shape[-1], n_frames
-      global_frame += out_frame
-      _audio = out_audio.numpy()
-      pad_len = int(target_sample * pad_seconds)
-      _audio = _audio[pad_len:-pad_len]
-      _audio = pad_array(_audio, per_length)
-      audio.extend(list(_audio))
-
-  audio = np.array(audio)
-  out_path = Path(args.out_path or Path(args.out_dir)/f"{args.model}{f'_spk_{speaker}'}_{args.base_name}.wav")
-  out_path.parent.mkdir(parents=True, exist_ok=True)
-  soundfile.write(out_path, audio, target_sample, format="flac")
-  logging.info(f"Saved audio output to {out_path}")
--- a/examples/sovits_helpers/preprocess.py
+++ b/examples/sovits_helpers/preprocess.py
@ -1,204 +0,0 @@
-import math
-from typing import Optional, Tuple
-from tinygrad import Tensor, dtypes
-import librosa
-import soundfile
-import numpy as np
-import parselmouth
-
-class PMF0Predictor:  # from https://github.com/svc-develop-team/so-vits-svc/
-  def __init__(self,hop_length=512,f0_min=50,f0_max=1100,sampling_rate=44100):
-    self.hop_length, self.f0_min, self.f0_max, self.sampling_rate, self.name = hop_length, f0_min, f0_max, sampling_rate, "pm"
-  def interpolate_f0(self,f0):
-    vuv_vector = np.zeros_like(f0, dtype=np.float32)
-    vuv_vector[f0 > 0.0] = 1.0
-    vuv_vector[f0 <= 0.0] = 0.0
-    nzindex = np.nonzero(f0)[0]
-    data = f0[nzindex]
-    nzindex = nzindex.astype(np.float32)
-    time_org = self.hop_length / self.sampling_rate * nzindex
-    time_frame = np.arange(f0.shape[0]) * self.hop_length / self.sampling_rate
-    if data.shape[0] <= 0: return np.zeros(f0.shape[0], dtype=np.float32),vuv_vector
-    if data.shape[0] == 1: return np.ones(f0.shape[0], dtype=np.float32) * f0[0],vuv_vector
-    f0 = np.interp(time_frame, time_org, data, left=data[0], right=data[-1])
-    return f0,vuv_vector
-  def compute_f0(self,wav,p_len=None):
-    x = wav
-    if p_len is None: p_len = x.shape[0]//self.hop_length
-    else: assert abs(p_len-x.shape[0]//self.hop_length) < 4, "pad length error"
-    time_step = self.hop_length / self.sampling_rate * 1000
-    f0 = parselmouth.Sound(x, self.sampling_rate) \
-                    .to_pitch_ac(time_step=time_step / 1000, voicing_threshold=0.6,pitch_floor=self.f0_min, pitch_ceiling=self.f0_max) \
-                    .selected_array['frequency']
-    pad_size=(p_len - len(f0) + 1) // 2
-    if(pad_size>0 or p_len - len(f0) - pad_size>0):
-      f0 = np.pad(f0,[[pad_size,p_len - len(f0) - pad_size]], mode='constant')
-    f0,uv = self.interpolate_f0(f0)
-    return f0
-  def compute_f0_uv(self,wav,p_len=None):
-    x = wav
-    if p_len is None: p_len = x.shape[0]//self.hop_length
-    else: assert abs(p_len-x.shape[0]//self.hop_length) < 4, "pad length error"
-    time_step = self.hop_length / self.sampling_rate * 1000
-    f0 = parselmouth.Sound(x, self.sampling_rate).to_pitch_ac(
-      time_step=time_step / 1000, voicing_threshold=0.6,
-      pitch_floor=self.f0_min, pitch_ceiling=self.f0_max).selected_array['frequency']
-    pad_size=(p_len - len(f0) + 1) // 2
-    if(pad_size>0 or p_len - len(f0) - pad_size>0):
-      f0 = np.pad(f0,[[pad_size,p_len - len(f0) - pad_size]], mode='constant')
-    f0,uv = self.interpolate_f0(f0)
-    return f0,uv
-
-class Slicer:  # from https://github.com/svc-develop-team/so-vits-svc/
-  def __init__(self, sr: int, threshold: float = -40., min_length: int = 5000, min_interval: int = 300, hop_size: int = 20, max_sil_kept: int = 5000):
-    if not min_length >= min_interval >= hop_size:
-      raise ValueError('The following condition must be satisfied: min_length >= min_interval >= hop_size')
-    if not max_sil_kept >= hop_size:
-      raise ValueError('The following condition must be satisfied: max_sil_kept >= hop_size')
-    min_interval = sr * min_interval / 1000
-    self.threshold = 10 ** (threshold / 20.)
-    self.hop_size = round(sr * hop_size / 1000)
-    self.win_size = min(round(min_interval), 4 * self.hop_size)
-    self.min_length = round(sr * min_length / 1000 / self.hop_size)
-    self.min_interval = round(min_interval / self.hop_size)
-    self.max_sil_kept = round(sr * max_sil_kept / 1000 / self.hop_size)
-  def _apply_slice(self, waveform, begin, end):
-    if len(waveform.shape) > 1: return waveform[:, begin * self.hop_size: min(waveform.shape[1], end * self.hop_size)]
-    else: return waveform[begin * self.hop_size: min(waveform.shape[0], end * self.hop_size)]
-  def slice(self, waveform):
-    samples = librosa.to_mono(waveform) if len(waveform.shape) > 1 else waveform
-    if samples.shape[0] <= self.min_length: return {"0": {"slice": False, "split_time": f"0,{len(waveform)}"}}
-    rms_list = librosa.feature.rms(y=samples, frame_length=self.win_size, hop_length=self.hop_size).squeeze(0)
-    sil_tags, silence_start, clip_start = [], None, 0
-    for i, rms in enumerate(rms_list):
-      if rms < self.threshold:  # Keep looping while frame is silent.
-        if silence_start is None:  # Record start of silent frames.
-          silence_start = i
-        continue
-      if silence_start is None: continue  # Keep looping while frame is not silent and silence start has not been recorded.
-      # Clear recorded silence start if interval is not enough or clip is too short
-      is_leading_silence = silence_start == 0 and i > self.max_sil_kept
-      need_slice_middle = i - silence_start >= self.min_interval and i - clip_start >= self.min_length
-      if not is_leading_silence and not need_slice_middle:
-        silence_start = None
-        continue
-      if i - silence_start <= self.max_sil_kept:  # Need slicing. Record the range of silent frames to be removed.
-        pos = rms_list[silence_start: i + 1].argmin() + silence_start
-        sil_tags.append((0, pos) if silence_start == 0 else (pos, pos))
-        clip_start = pos
-      elif i - silence_start <= self.max_sil_kept * 2:
-        pos = rms_list[i - self.max_sil_kept: silence_start + self.max_sil_kept + 1].argmin()
-        pos += i - self.max_sil_kept
-        pos_l = rms_list[silence_start: silence_start + self.max_sil_kept + 1].argmin() + silence_start
-        pos_r = rms_list[i - self.max_sil_kept: i + 1].argmin() + i - self.max_sil_kept
-        if silence_start == 0:
-          sil_tags.append((0, pos_r))
-          clip_start = pos_r
-        else:
-          sil_tags.append((min(pos_l, pos), max(pos_r, pos)))
-          clip_start = max(pos_r, pos)
-      else:
-        pos_l = rms_list[silence_start: silence_start + self.max_sil_kept + 1].argmin() + silence_start
-        pos_r = rms_list[i - self.max_sil_kept: i + 1].argmin() + i - self.max_sil_kept
-        sil_tags.append((0, pos_r) if silence_start == 0 else (pos_l, pos_r))
-        clip_start = pos_r
-      silence_start = None
-    total_frames = rms_list.shape[0]
-    if silence_start is not None and total_frames - silence_start >= self.min_interval:  # Deal with trailing silence.
-      silence_end = min(total_frames, silence_start + self.max_sil_kept)
-      pos = rms_list[silence_start: silence_end + 1].argmin() + silence_start
-      sil_tags.append((pos, total_frames + 1))
-    if len(sil_tags) == 0: return {"0": {"slice": False, "split_time": f"0,{len(waveform)}"}}  # Apply and return slices.
-    chunks = []
-    if sil_tags[0][0]:
-      chunks.append({"slice": False, "split_time": f"0,{min(waveform.shape[0], sil_tags[0][0] * self.hop_size)}"})
-    for i in range(0, len(sil_tags)):
-      if i: chunks.append({"slice": False, "split_time": f"{sil_tags[i - 1][1] * self.hop_size},{min(waveform.shape[0], sil_tags[i][0] * self.hop_size)}"})
-      chunks.append({"slice": True, "split_time": f"{sil_tags[i][0] * self.hop_size},{min(waveform.shape[0], sil_tags[i][1] * self.hop_size)}"})
-    if sil_tags[-1][1] * self.hop_size < len(waveform):
-      chunks.append({"slice": False, "split_time": f"{sil_tags[-1][1] * self.hop_size},{len(waveform)}"})
-    chunk_dict = {}
-    for i in range(len(chunks)): chunk_dict[str(i)] = chunks[i]
-    return chunk_dict
-
-# sinc_interp_hann audio resampling
-class Resample:
-  def __init__(self, orig_freq:int=16000, new_freq:int=16000, lowpass_filter_width:int=6, rolloff:float=0.99, beta:Optional[float]=None, dtype:Optional[dtypes]=None):
-    self.orig_freq, self.new_freq, self.lowpass_filter_width, self.rolloff, self.beta = orig_freq, new_freq, lowpass_filter_width, rolloff, beta
-    self.gcd = math.gcd(int(self.orig_freq), int(self.new_freq))
-    self.kernel, self.width = self._get_sinc_resample_kernel(dtype) if self.orig_freq != self.new_freq else (None, None)
-  def __call__(self, waveform:Tensor) -> Tensor:
-    if self.orig_freq == self.new_freq: return waveform
-    return self._apply_sinc_resample_kernel(waveform)
-  def _apply_sinc_resample_kernel(self, waveform:Tensor):
-    if not waveform.is_floating_point(): raise TypeError(f"Waveform tensor expected to be of type float, but received {waveform.dtype}.")
-    orig_freq, new_freq = (int(self.orig_freq) // self.gcd), (int(self.new_freq) // self.gcd)
-    shape = waveform.shape
-    waveform = waveform.reshape(-1, shape[-1])  # pack batch
-    num_wavs, length = waveform.shape
-    target_length = int(math.ceil(new_freq * length / orig_freq))
-    waveform = waveform.pad((self.width, self.width + orig_freq))
-    resampled = waveform[:, None].conv2d(self.kernel, stride=orig_freq)
-    resampled = resampled.transpose(1, 2).reshape(num_wavs, -1)
-    resampled = resampled[..., :target_length]
-    resampled = resampled.reshape(shape[:-1] + resampled.shape[-1:])  # unpack batch
-    return resampled
-  def _get_sinc_resample_kernel(self, dtype=None):
-    orig_freq, new_freq = (int(self.orig_freq) // self.gcd), (int(self.new_freq) // self.gcd)
-    if self.lowpass_filter_width <= 0: raise ValueError("Low pass filter width should be positive.")
-    base_freq = min(orig_freq, new_freq)
-    base_freq *= self.rolloff
-    width = math.ceil(self.lowpass_filter_width * orig_freq / base_freq)
-    idx = Tensor.arange(-width, width + orig_freq, dtype=(dtype if dtype is not None else dtypes.float32))[None, None] / orig_freq
-    t = Tensor.arange(0, -new_freq, -1, dtype=dtype)[:, None, None] / new_freq + idx
-    t *= base_freq
-    t = t.clip(-self.lowpass_filter_width, self.lowpass_filter_width)
-    window = (t * math.pi / self.lowpass_filter_width / 2).cos() ** 2
-    t *= math.pi
-    scale = base_freq / orig_freq
-    kernels = Tensor.where(t == 0, Tensor(1.0, dtype=t.dtype).to(t.device), t.sin() / t)
-    kernels *= window * scale
-    if dtype is None: kernels = kernels.cast(dtype=dtypes.float32)
-    return kernels, width
-
-def sinc_interp_resample(x:Tensor, orig_freq:int=16000, new_freq:int=1600, lowpass_filter_width:int=6, rolloff:float=0.99, beta:Optional[float]=None):
-  resamp = Resample(orig_freq, new_freq, lowpass_filter_width, rolloff, beta, x.dtype)
-  return resamp(x)
-
-def cut(audio_path, db_thresh=-30, min_len=5000):
-  audio, sr = librosa.load(audio_path, sr=None)
-  slicer = Slicer(sr=sr, threshold=db_thresh, min_length=min_len)
-  chunks = slicer.slice(audio)
-  return chunks
-
-def chunks2audio(audio_path, chunks):
-  chunks = dict(chunks)
-  audio, sr = load_audiofile(audio_path)
-  if len(audio.shape) == 2 and audio.shape[1] >= 2:
-    audio = audio.mean(0).unsqueeze(0)
-  audio = audio.numpy()[0]
-  result = []
-  for k, v in chunks.items():
-    tag = v["split_time"].split(",")
-    if tag[0] != tag[1]:
-      result.append((v["slice"], audio[int(tag[0]):int(tag[1])]))
-  return result, sr
-
-def load_audiofile(filepath:str, frame_offset:int=0, num_frames:int=-1, channels_first:bool=True):
-  with soundfile.SoundFile(filepath, "r") as file_:
-    frames = file_._prepare_read(frame_offset, None, num_frames)
-    waveform = file_.read(frames, "float32", always_2d=True)
-    sample_rate = file_.samplerate
-  waveform = Tensor(waveform)
-  if channels_first: waveform = waveform.transpose(0, 1)
-  return waveform, sample_rate
-
-def get_unit_f0(wav:Tensor, tran, hop_length, target_sample, f0_filter=False) -> Tuple[Tensor,Tensor,Tensor]:
-  f0_predictor = PMF0Predictor(hop_length, sampling_rate=target_sample)
-  f0, uv = f0_predictor.compute_f0_uv(wav.numpy())
-  if f0_filter and sum(f0) == 0: raise RuntimeError("No voice detected")
-  f0 = Tensor(f0.astype(np.float32)).float()
-  f0 = (f0 * 2 ** (tran / 12)).unsqueeze(0)
-  uv = Tensor(uv.astype(np.float32)).float().unsqueeze(0)
-  wav16k = sinc_interp_resample(wav[None,:], target_sample, 16000)[0]
-  return wav16k.realize(), f0.realize(), uv.realize()
--- a/examples/stable_diffusion.py
+++ b/examples/stable_diffusion.py
@ -9,7 +9,7 @@ from typing import Dict, Any
 from PIL import Image
 import numpy as np
 from tinygrad import Device, GlobalCounters, dtypes, Tensor, TinyJit
-from tinygrad.helpers import Timing, Context, getenv, fetch, colored, tqdm, flatten
+from tinygrad.helpers import Timing, Context, getenv, fetch, colored, tqdm, flatten, profile_marker
 from tinygrad.nn import Conv2d, GroupNorm
 from tinygrad.nn.state import torch_load, load_state_dict, get_state_dict
 from extra.models.clip import Closed, Tokenizer, FrozenOpenClipEmbedder
@ -266,13 +266,16 @@ if __name__ == "__main__":
  parser.add_argument('--fakeweights', action='store_true', help="Skip loading checkpoints and use fake weights")
  args = parser.parse_args()

+  profile_marker("create model")
  model = StableDiffusion()

-  # load in weights
+  profile_marker("load in weights")
  with WallTimeEvent(BenchEvent.LOAD_WEIGHTS):
    if not args.fakeweights:
      model_bin = fetch('https://huggingface.co/CompVis/stable-diffusion-v-1-4-original/resolve/main/sd-v1-4.ckpt', 'sd-v1-4.ckpt')
-      load_state_dict(model, torch_load(model_bin)['state_dict'], verbose=False, strict=False, realize=False)
+      state_dict = torch_load(model_bin)['state_dict']
+      profile_marker("state dict loaded")
+      load_state_dict(model, state_dict, verbose=False, strict=False, realize=False)

    if args.fp16:
      for k,v in get_state_dict(model).items():
@ -281,12 +284,13 @@ if __name__ == "__main__":

    Tensor.realize(*get_state_dict(model).values())

-  # run through CLIP to get context
+  profile_marker("run clip (conditional)")
  tokenizer = Tokenizer.ClipTokenizer()
  prompt = Tensor([tokenizer.encode(args.prompt)])
  context = model.cond_stage_model.transformer.text_model(prompt).realize()
  print("got CLIP context", context.shape)

+  profile_marker("run clip (unconditional)")
  prompt = Tensor([tokenizer.encode("")])
  unconditional_context = model.cond_stage_model.transformer.text_model(prompt).realize()
  print("got unconditional CLIP context", unconditional_context.shape)
@ -310,6 +314,7 @@ if __name__ == "__main__":
  step_times = []
  with Context(BEAM=getenv("LATEBEAM")):
    for index, timestep in (t:=tqdm(list(enumerate(timesteps))[::-1])):
+      profile_marker(f"step {len(timesteps)-index-1}")
      GlobalCounters.reset()
      st = time.perf_counter_ns()
      t.set_description("%3d %3d" % (index, timestep))
@ -319,24 +324,26 @@ if __name__ == "__main__":
          latent = run(model, unconditional_context, context, latent, Tensor([timestep]), alphas[tid], alphas_prev[tid], Tensor([args.guidance]))
          if args.timing: Device[Device.DEFAULT].synchronize()
      step_times.append((time.perf_counter_ns() - st)*1e-6)
+    # done with diffusion model
    del run
+    del model.model

  if (assert_time:=getenv("ASSERT_MIN_STEP_TIME")):
    min_time = min(step_times)
    assert min_time < assert_time, f"Speed regression, expected min step time of < {assert_time} ms but took: {min_time} ms"
-  # upsample latent space to image with autoencoder
-  x = model.decode(latent)
+  profile_marker("run decoder") # upsample latent space to image with autoencoder
+  x = model.decode(latent).realize()
  print(x.shape)

-  # save image
+  profile_marker("save image")
  im = Image.fromarray(x.numpy())
  print(f"saving {args.out}")
  im.save(args.out)
  # Open image.
  if not args.noshow: im.show()

-  # validation!
  if args.prompt == default_prompt and args.steps == 6 and args.seed == 0 and args.guidance == 7.5:
+    profile_marker("validate")
    ref_image = Tensor(np.array(Image.open(Path(__file__).parent / "stable_diffusion_seed0.png")))
    distance = (((x.cast(dtypes.float) - ref_image.cast(dtypes.float)) / ref_image.max())**2).mean().item()
    assert distance < 3e-3, colored(f"validation failed with {distance=}", "red")  # higher distance with WINO
--- a/examples/tools/bandwidth_test.py
+++ b/examples/tools/bandwidth_test.py
--- a/examples/tools/gpuburn.py
+++ b/examples/tools/gpuburn.py
--- a/examples/train_efficientnet.py
+++ b/examples/train_efficientnet.py
@ -1,104 +0,0 @@
-import traceback
-import time
-from multiprocessing import Process, Queue
-import numpy as np
-from tinygrad.nn.state import get_parameters
-from tinygrad.nn import optim
-from tinygrad.helpers import getenv, trange
-from tinygrad.tensor import Tensor
-from extra.datasets import fetch_cifar
-from extra.models.efficientnet import EfficientNet
-
-class TinyConvNet:
-  def __init__(self, classes=10):
-    conv = 3
-    inter_chan, out_chan = 8, 16   # for speed
-    self.c1 = Tensor.uniform(inter_chan,3,conv,conv)
-    self.c2 = Tensor.uniform(out_chan,inter_chan,conv,conv)
-    self.l1 = Tensor.uniform(out_chan*6*6, classes)
-
-  def forward(self, x):
-    x = x.conv2d(self.c1).relu().max_pool2d()
-    x = x.conv2d(self.c2).relu().max_pool2d()
-    x = x.reshape(shape=[x.shape[0], -1])
-    return x.dot(self.l1)
-
-if __name__ == "__main__":
-  IMAGENET = getenv("IMAGENET")
-  classes = 1000 if IMAGENET else 10
-
-  TINY = getenv("TINY")
-  TRANSFER = getenv("TRANSFER")
-  if TINY:
-    model = TinyConvNet(classes)
-  elif TRANSFER:
-    model = EfficientNet(getenv("NUM", 0), classes, has_se=True)
-    model.load_from_pretrained()
-  else:
-    model = EfficientNet(getenv("NUM", 0), classes, has_se=False)
-
-  parameters = get_parameters(model)
-  print("parameter count", len(parameters))
-  optimizer = optim.Adam(parameters, lr=0.001)
-
-  BS, steps = getenv("BS", 64 if TINY else 16), getenv("STEPS", 2048)
-  print(f"training with batch size {BS} for {steps} steps")
-
-  if IMAGENET:
-    from extra.datasets.imagenet import fetch_batch
-    def loader(q):
-      while 1:
-        try:
-          q.put(fetch_batch(BS))
-        except Exception:
-          traceback.print_exc()
-    q = Queue(16)
-    for i in range(2):
-      p = Process(target=loader, args=(q,))
-      p.daemon = True
-      p.start()
-  else:
-    X_train, Y_train, _, _ = fetch_cifar()
-    X_train = X_train.reshape((-1, 3, 32, 32))
-    Y_train = Y_train.reshape((-1,))
-
-  with Tensor.train():
-    for i in (t := trange(steps)):
-      if IMAGENET:
-        X, Y = q.get(True)
-      else:
-        samp = np.random.randint(0, X_train.shape[0], size=(BS))
-        X, Y = X_train.numpy()[samp], Y_train.numpy()[samp]
-
-      st = time.time()
-      out = model.forward(Tensor(X.astype(np.float32), requires_grad=False))
-      fp_time = (time.time()-st)*1000.0
-
-      y = np.zeros((BS,classes), np.float32)
-      y[range(y.shape[0]),Y] = -classes
-      y = Tensor(y, requires_grad=False)
-      loss = out.log_softmax().mul(y).mean()
-
-      optimizer.zero_grad()
-
-      st = time.time()
-      loss.backward()
-      bp_time = (time.time()-st)*1000.0
-
-      st = time.time()
-      optimizer.step()
-      opt_time = (time.time()-st)*1000.0
-
-      st = time.time()
-      loss = loss.numpy()
-      cat = out.argmax(axis=1).numpy()
-      accuracy = (cat == Y).mean()
-      finish_time = (time.time()-st)*1000.0
-
-      # printing
-      t.set_description("loss %.2f accuracy %.2f -- %.2f + %.2f + %.2f + %.2f = %.2f" %
-        (loss, accuracy,
-        fp_time, bp_time, opt_time, finish_time,
-        fp_time + bp_time + opt_time + finish_time))
-
-      del out, y, loss
--- a/examples/vit.py
+++ b/examples/vit.py
@ -1,46 +0,0 @@
-import ast
-import numpy as np
-from PIL import Image
-from tinygrad.tensor import Tensor
-from tinygrad.helpers import getenv, fetch
-from extra.models.vit import ViT
-"""
-fn = "gs://vit_models/augreg/Ti_16-i21k-300ep-lr_0.001-aug_none-wd_0.03-do_0.0-sd_0.0.npz"
-import tensorflow as tf
-with tf.io.gfile.GFile(fn, "rb") as f:
-  dat = f.read()
-  with open("cache/"+ fn.rsplit("/", 1)[1], "wb") as g:
-    g.write(dat)
-"""
-
-Tensor.training = False
-if getenv("LARGE", 0) == 1:
-  m = ViT(embed_dim=768, num_heads=12)
-else:
-  # tiny
-  m = ViT(embed_dim=192, num_heads=3)
-m.load_from_pretrained()
-
-# category labels
-lbls = ast.literal_eval(fetch("https://gist.githubusercontent.com/yrevar/942d3a0ac09ec9e5eb3a/raw/238f720ff059c1f82f368259d1ca4ffa5dd8f9f5/imagenet1000_clsidx_to_labels.txt").read_text())
-
-#url = "https://upload.wikimedia.org/wikipedia/commons/4/41/Chicken.jpg"
-url = "https://repository-images.githubusercontent.com/296744635/39ba6700-082d-11eb-98b8-cb29fb7369c0"
-
-# junk
-img = Image.open(fetch(url))
-aspect_ratio = img.size[0] / img.size[1]
-img = img.resize((int(224*max(aspect_ratio,1.0)), int(224*max(1.0/aspect_ratio,1.0))))
-img = np.array(img)
-y0,x0=(np.asarray(img.shape)[:2]-224)//2
-img = img[y0:y0+224, x0:x0+224]
-img = np.moveaxis(img, [2,0,1], [0,1,2])
-img = img.astype(np.float32)[:3].reshape(1,3,224,224)
-img /= 255.0
-img -= 0.5
-img /= 0.5
-
-out = m.forward(Tensor(img))
-outnp = out.numpy().ravel()
-choice = outnp.argmax()
-print(out.shape, choice, outnp[choice], lbls[choice])
--- a/examples/vits.py
+++ b/examples/vits.py
@ -1,740 +0,0 @@
-import json, logging, math, re, sys, time, wave, argparse, numpy as np
-from phonemizer.phonemize import default_separator, _phonemize
-from phonemizer.backend import EspeakBackend
-from phonemizer.punctuation import Punctuation
-from functools import reduce
-from pathlib import Path
-from typing import List
-from tinygrad import nn, dtypes
-from tinygrad.helpers import fetch
-from tinygrad.nn.state import torch_load
-from tinygrad.tensor import Tensor
-from tinygrad.engine.jit import TinyJit
-from unidecode import unidecode
-
-LRELU_SLOPE = 0.1
-
-class Synthesizer:
-  def __init__(self, n_vocab, spec_channels, segment_size, inter_channels, hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout, resblock, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates, upsample_initial_channel, upsample_kernel_sizes, n_speakers=0, gin_channels=0, use_sdp=True, emotion_embedding=False, **kwargs):
-    self.n_vocab, self.spec_channels, self.inter_channels, self.hidden_channels, self.filter_channels, self.n_heads, self.n_layers, self.kernel_size, self.p_dropout, self.resblock, self.resblock_kernel_sizes, self.resblock_dilation_sizes, self.upsample_rates, self.upsample_initial_channel, self.upsample_kernel_sizes, self.segment_size, self.n_speakers, self.gin_channels, self.use_sdp = n_vocab, spec_channels, inter_channels, hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout, resblock, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates, upsample_initial_channel, upsample_kernel_sizes, segment_size, n_speakers, gin_channels, use_sdp
-    self.enc_p = TextEncoder(n_vocab, inter_channels, hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout, emotion_embedding)
-    self.dec = Generator(inter_channels, resblock, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates, upsample_initial_channel, upsample_kernel_sizes, gin_channels=gin_channels)
-    self.enc_q = PosteriorEncoder(spec_channels, inter_channels, hidden_channels, 5, 1, 16, gin_channels=gin_channels)
-    self.flow = ResidualCouplingBlock(inter_channels, hidden_channels, 5, 1, 4, gin_channels=gin_channels)
-    self.dp = StochasticDurationPredictor(hidden_channels, 192, 3, 0.5, 4, gin_channels=gin_channels) if use_sdp else DurationPredictor(hidden_channels, 256, 3, 0.5, gin_channels=gin_channels)
-    if n_speakers > 1: self.emb_g = nn.Embedding(n_speakers, gin_channels)
-  def infer(self, x, x_lengths, sid=None, noise_scale=1.0, length_scale=1, noise_scale_w=1., max_len=None, emotion_embedding=None, max_y_length_estimate_scale=None, pad_length=-1):
-    x, m_p, logs_p, x_mask = self.enc_p.forward(x.realize(), x_lengths.realize(), emotion_embedding.realize() if emotion_embedding is not None else emotion_embedding)
-    g = self.emb_g(sid.reshape(1, 1)).squeeze(1).unsqueeze(-1) if self.n_speakers > 0 else None
-    logw = self.dp.forward(x, x_mask.realize(), g=g.realize(), reverse=self.use_sdp, noise_scale=noise_scale_w if self.use_sdp else 1.0)
-    w_ceil = Tensor.ceil(logw.exp() * x_mask * length_scale)
-    y_lengths = Tensor.maximum(w_ceil.sum([1, 2]), 1).cast(dtypes.int64)
-    return self.generate(g, logs_p, m_p, max_len, max_y_length_estimate_scale, noise_scale, w_ceil, x, x_mask, y_lengths, pad_length)
-  def generate(self, g, logs_p, m_p, max_len, max_y_length_estimate_scale, noise_scale, w_ceil, x, x_mask, y_lengths, pad_length):
-    max_y_length = y_lengths.max().item() if max_y_length_estimate_scale is None else max(15, x.shape[-1]) * max_y_length_estimate_scale
-    y_mask = sequence_mask(y_lengths, max_y_length).unsqueeze(1).cast(x_mask.dtype)
-    attn_mask = x_mask.unsqueeze(2) * y_mask.unsqueeze(-1)
-    attn = generate_path(w_ceil, attn_mask)
-    m_p_2 = attn.squeeze(1).matmul(m_p.transpose(1, 2)).transpose(1, 2)        # [b, t', t], [b, t, d] -> [b, d, t']
-    logs_p_2 = attn.squeeze(1).matmul(logs_p.transpose(1, 2)).transpose(1, 2)  # [b, t', t], [b, t, d] -> [b, d, t']
-    z_p = m_p_2 + Tensor.randn(*m_p_2.shape, dtype=m_p_2.dtype) * logs_p_2.exp() * noise_scale
-    row_len = y_mask.shape[2]
-    if pad_length > -1:
-      # Pad flow forward inputs to enable JIT
-      assert pad_length > row_len, "pad length is too small"
-      y_mask = y_mask.pad(((0, 0), (0, 0), (0, pad_length - row_len))).cast(z_p.dtype)
-      # New y_mask tensor to remove sts mask
-      y_mask = Tensor(y_mask.numpy(), device=y_mask.device, dtype=y_mask.dtype, requires_grad=y_mask.requires_grad)
-      z_p = z_p.squeeze(0).pad(((0, 0), (0, pad_length - z_p.shape[2])), value=1).unsqueeze(0)
-    z = self.flow.forward(z_p.realize(), y_mask.realize(), g=g.realize(), reverse=True)
-    result_length = reduce(lambda x, y: x * y, self.dec.upsample_rates, row_len)
-    o = self.dec.forward((z * y_mask)[:, :, :max_len], g=g)[:, :, :result_length]
-    if max_y_length_estimate_scale is not None:
-      length_scaler = o.shape[-1] / max_y_length
-      o.realize()
-      real_max_y_length = y_lengths.max().numpy()
-      if real_max_y_length > max_y_length:
-        logging.warning(f"Underestimated max length by {(((real_max_y_length / max_y_length) * 100) - 100):.2f}%, recomputing inference without estimate...")
-        return self.generate(g, logs_p, m_p, max_len, None, noise_scale, w_ceil, x, x_mask, y_lengths)
-      if real_max_y_length < max_y_length:
-        overestimation = ((max_y_length / real_max_y_length) * 100) - 100
-        logging.info(f"Overestimated max length by {overestimation:.2f}%")
-        if overestimation > 10: logging.warning("Warning: max length overestimated by more than 10%")
-      o = o[:, :, :(real_max_y_length * length_scaler).astype(np.int32)]
-    return o
-
-class StochasticDurationPredictor:
-  def __init__(self, in_channels, filter_channels, kernel_size, p_dropout, n_flows=4, gin_channels=0):
-    filter_channels = in_channels # it needs to be removed from future version.
-    self.in_channels, self.filter_channels, self.kernel_size, self.p_dropout, self.n_flows, self.gin_channels = in_channels, filter_channels, kernel_size, p_dropout, n_flows, gin_channels
-    self.log_flow, self.flows = Log(), [ElementwiseAffine(2)]
-    for _ in range(n_flows):
-      self.flows.append(ConvFlow(2, filter_channels, kernel_size, n_layers=3))
-      self.flows.append(Flip())
-    self.post_pre, self.post_proj = nn.Conv1d(1, filter_channels, 1), nn.Conv1d(filter_channels, filter_channels, 1)
-    self.post_convs = DDSConv(filter_channels, kernel_size, n_layers=3, p_dropout=p_dropout)
-    self.post_flows = [ElementwiseAffine(2)]
-    for _ in range(4):
-      self.post_flows.append(ConvFlow(2, filter_channels, kernel_size, n_layers=3))
-      self.post_flows.append(Flip())
-    self.pre, self.proj = nn.Conv1d(in_channels, filter_channels, 1), nn.Conv1d(filter_channels, filter_channels, 1)
-    self.convs = DDSConv(filter_channels, kernel_size, n_layers=3, p_dropout=p_dropout)
-    if gin_channels != 0: self.cond = nn.Conv1d(gin_channels, filter_channels, 1)
-  @TinyJit
-  def forward(self, x: Tensor, x_mask, w=None, g=None, reverse=False, noise_scale=1.0):
-    x = self.pre(x.detach())
-    if g is not None: x = x + self.cond(g.detach())
-    x = self.convs.forward(x, x_mask)
-    x = self.proj(x) * x_mask
-    if not reverse:
-      flows = self.flows
-      assert w is not None
-      log_det_tot_q = 0
-      h_w = self.post_proj(self.post_convs.forward(self.post_pre(w), x_mask)) * x_mask
-      e_q = Tensor.randn(w.size(0), 2, w.size(2), dtype=x.dtype).to(device=x.device) * x_mask
-      z_q = e_q
-      for flow in self.post_flows:
-        z_q, log_det_q = flow.forward(z_q, x_mask, g=(x + h_w))
-        log_det_tot_q += log_det_q
-      z_u, z1 = z_q.split([1, 1], 1)
-      u = z_u.sigmoid() * x_mask
-      z0 = (w - u) * x_mask
-      log_det_tot_q += Tensor.sum((z_u.logsigmoid() + (-z_u).logsigmoid()) * x_mask, [1,2])
-      log_q = Tensor.sum(-0.5 * (math.log(2*math.pi) + (e_q**2)) * x_mask, [1,2]) - log_det_tot_q
-      log_det_tot = 0
-      z0, log_det = self.log_flow.forward(z0, x_mask)
-      log_det_tot += log_det
-      z = z0.cat(z1, 1)
-      for flow in flows:
-        z, log_det = flow.forward(z, x_mask, g=x, reverse=reverse)
-        log_det_tot = log_det_tot + log_det
-      nll = Tensor.sum(0.5 * (math.log(2*math.pi) + (z**2)) * x_mask, [1,2]) - log_det_tot
-      return (nll + log_q).realize() # [b]
-    flows = list(reversed(self.flows))
-    flows = flows[:-2] + [flows[-1]] # remove a useless vflow
-    z = Tensor.randn(x.shape[0], 2, x.shape[2], dtype=x.dtype).to(device=x.device) * noise_scale
-    for flow in flows: z = flow.forward(z, x_mask, g=x, reverse=reverse)
-    z0, z1 = z.split([1, 1], 1)
-    return z0.realize()
-
-class DurationPredictor:
-  def __init__(self, in_channels, filter_channels, kernel_size, p_dropout, gin_channels=0):
-    self.in_channels, self.filter_channels, self.kernel_size, self.p_dropout, self.gin_channels = in_channels, filter_channels, kernel_size, p_dropout, gin_channels
-    self.conv_1, self.norm_1 = nn.Conv1d(in_channels, filter_channels, kernel_size, padding=kernel_size//2), LayerNorm(filter_channels)
-    self.conv_2, self.norm_2 = nn.Conv1d(filter_channels, filter_channels, kernel_size, padding=kernel_size//2), LayerNorm(filter_channels)
-    self.proj = nn.Conv1d(filter_channels, 1, 1)
-    if gin_channels != 0: self.cond = nn.Conv1d(gin_channels, in_channels, 1)
-  def forward(self, x: Tensor, x_mask, g=None):
-    x = x.detach()
-    if g is not None: x = x + self.cond(g.detach())
-    x = self.conv_1(x * x_mask).relu()
-    x = self.norm_1(x).dropout(self.p_dropout)
-    x = self.conv_2(x * x_mask).relu(x)
-    x = self.norm_2(x).dropout(self.p_dropout)
-    return self.proj(x * x_mask) * x_mask
-
-class TextEncoder:
-  def __init__(self, n_vocab, out_channels, hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout, emotion_embedding):
-    self.n_vocab, self.out_channels, self.hidden_channels, self.filter_channels, self.n_heads, self.n_layers, self.kernel_size, self.p_dropout = n_vocab, out_channels, hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout
-    if n_vocab!=0:self.emb = nn.Embedding(n_vocab, hidden_channels)
-    if emotion_embedding: self.emo_proj = nn.Linear(1024, hidden_channels)
-    self.encoder = Encoder(hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout)
-    self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
-  @TinyJit
-  def forward(self, x: Tensor, x_lengths: Tensor, emotion_embedding=None):
-    if self.n_vocab!=0: x = (self.emb(x) * math.sqrt(self.hidden_channels))
-    if emotion_embedding: x = x + self.emo_proj(emotion_embedding).unsqueeze(1)
-    x = x.transpose(1, -1)  # [b, t, h] -transpose-> [b, h, t]
-    x_mask = sequence_mask(x_lengths, x.shape[2]).unsqueeze(1).cast(x.dtype)
-    x = self.encoder.forward(x * x_mask, x_mask)
-    m, logs = (self.proj(x) * x_mask).split(self.out_channels, dim=1)
-    return x.realize(), m.realize(), logs.realize(), x_mask.realize()
-
-class ResidualCouplingBlock:
-  def __init__(self, channels, hidden_channels, kernel_size, dilation_rate, n_layers, n_flows=4, gin_channels=0):
-    self.channels, self.hidden_channels, self.kernel_size, self.dilation_rate, self.n_layers, self.n_flows, self.gin_channels = channels, hidden_channels, kernel_size, dilation_rate, n_layers, n_flows, gin_channels
-    self.flows = []
-    for _ in range(n_flows):
-      self.flows.append(ResidualCouplingLayer(channels, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=gin_channels, mean_only=True))
-      self.flows.append(Flip())
-  @TinyJit
-  def forward(self, x, x_mask, g=None, reverse=False):
-    for flow in reversed(self.flows) if reverse else self.flows: x = flow.forward(x, x_mask, g=g, reverse=reverse)
-    return x.realize()
-
-class PosteriorEncoder:
-  def __init__(self, in_channels, out_channels, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=0):
-    self.in_channels, self.out_channels, self.hidden_channels, self.kernel_size, self.dilation_rate, self.n_layers, self.gin_channels = in_channels, out_channels, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels
-    self.pre, self.proj = nn.Conv1d(in_channels, hidden_channels, 1), nn.Conv1d(hidden_channels, out_channels * 2, 1)
-    self.enc = WN(hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=gin_channels)
-  def forward(self, x, x_lengths, g=None):
-    x_mask = sequence_mask(x_lengths, x.size(2)).unsqueeze(1).cast(x.dtype)
-    stats = self.proj(self.enc.forward(self.pre(x) * x_mask, x_mask, g=g)) * x_mask
-    m, logs = stats.split(self.out_channels, dim=1)
-    z = (m + Tensor.randn(m.shape, m.dtype) * logs.exp()) * x_mask
-    return z, m, logs, x_mask
-
-class Generator:
-  def __init__(self, initial_channel, resblock, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates, upsample_initial_channel, upsample_kernel_sizes, gin_channels=0):
-    self.num_kernels, self.num_upsamples = len(resblock_kernel_sizes), len(upsample_rates)
-    self.conv_pre = nn.Conv1d(initial_channel, upsample_initial_channel, 7, 1, padding=3)
-    resblock = ResBlock1 if resblock == '1' else ResBlock2
-    self.ups = [nn.ConvTranspose1d(upsample_initial_channel//(2**i), upsample_initial_channel//(2**(i+1)), k, u, padding=(k-u)//2) for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes))]
-    self.resblocks = []
-    self.upsample_rates = upsample_rates
-    for i in range(len(self.ups)):
-      ch = upsample_initial_channel // (2 ** (i + 1))
-      for _, (k, d) in enumerate(zip(resblock_kernel_sizes, resblock_dilation_sizes)):
-        self.resblocks.append(resblock(ch, k, d))
-    self.conv_post = nn.Conv1d(ch, 1, 7, 1, padding=3, bias=False)
-    if gin_channels != 0: self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1)
-  @TinyJit
-  def forward(self, x: Tensor, g=None):
-    x = self.conv_pre(x)
-    if g is not None:  x = x + self.cond(g)
-    for i in range(self.num_upsamples):
-      x = self.ups[i](x.leaky_relu(LRELU_SLOPE))
-      xs = sum(self.resblocks[i * self.num_kernels + j].forward(x) for j in range(self.num_kernels))
-      x = (xs / self.num_kernels).realize()
-    res = self.conv_post(x.leaky_relu()).tanh().realize()
-    return res
-
-class LayerNorm(nn.LayerNorm):
-  def __init__(self, channels, eps=1e-5): super().__init__(channels, eps, elementwise_affine=True)
-  def forward(self, x: Tensor): return self.__call__(x.transpose(1, -1)).transpose(1, -1)
-
-class WN:
-  def __init__(self, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=0, p_dropout=0):
-    assert (kernel_size % 2 == 1)
-    self.hidden_channels, self.kernel_size, self.dilation_rate, self.n_layers, self.gin_channels, self.p_dropout = hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels, p_dropout
-    self.in_layers, self.res_skip_layers = [], []
-    if gin_channels != 0: self.cond_layer = nn.Conv1d(gin_channels, 2 * hidden_channels * n_layers, 1)
-    for i in range(n_layers):
-      dilation = dilation_rate ** i
-      self.in_layers.append(nn.Conv1d(hidden_channels, 2 * hidden_channels, kernel_size, dilation=dilation, padding=int((kernel_size * dilation - dilation) / 2)))
-      self.res_skip_layers.append(nn.Conv1d(hidden_channels, 2 * hidden_channels if i < n_layers - 1 else hidden_channels, 1))
-  def forward(self, x, x_mask, g=None, **kwargs):
-    output = Tensor.zeros_like(x)
-    if g is not None: g = self.cond_layer(g)
-    for i in range(self.n_layers):
-      x_in = self.in_layers[i](x)
-      if g is not None:
-        cond_offset = i * 2 * self.hidden_channels
-        g_l = g[:, cond_offset:cond_offset + 2 * self.hidden_channels, :]
-      else:
-        g_l = Tensor.zeros_like(x_in)
-      acts = fused_add_tanh_sigmoid_multiply(x_in, g_l, self.hidden_channels)
-      res_skip_acts = self.res_skip_layers[i](acts)
-      if i < self.n_layers - 1:
-        x = (x + res_skip_acts[:, :self.hidden_channels, :]) * x_mask
-        output = output + res_skip_acts[:, self.hidden_channels:, :]
-      else:
-        output = output + res_skip_acts
-    return output * x_mask
-
-class ResBlock1:
-  def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5)):
-    self.convs1 = [nn.Conv1d(channels, channels, kernel_size, 1, dilation=dilation[i], padding=get_padding(kernel_size, dilation[i])) for i in range(3)]
-    self.convs2 = [nn.Conv1d(channels, channels, kernel_size, 1, dilation=1, padding=get_padding(kernel_size, 1)) for _ in range(3)]
-  def forward(self, x: Tensor, x_mask=None):
-    for c1, c2 in zip(self.convs1, self.convs2):
-      xt = x.leaky_relu(LRELU_SLOPE)
-      xt = c1(xt if x_mask is None else xt * x_mask).leaky_relu(LRELU_SLOPE)
-      x = c2(xt if x_mask is None else xt * x_mask) + x
-    return x if x_mask is None else x * x_mask
-
-class ResBlock2:
-  def __init__(self, channels, kernel_size=3, dilation=(1, 3)):
-    self.convs = [nn.Conv1d(channels, channels, kernel_size, 1, dilation=dilation[i], padding=get_padding(kernel_size, dilation[i])) for i in range(2)]
-  def forward(self, x, x_mask=None):
-    for c in self.convs:
-      xt = x.leaky_relu(LRELU_SLOPE)
-      xt = c(xt if x_mask is None else xt * x_mask)
-      x = xt + x
-    return x if x_mask is None else x * x_mask
-
-class DDSConv: # Dilated and Depth-Separable Convolution
-  def __init__(self, channels, kernel_size, n_layers, p_dropout=0.):
-    self.channels, self.kernel_size, self.n_layers, self.p_dropout = channels, kernel_size, n_layers, p_dropout
-    self.convs_sep, self.convs_1x1, self.norms_1, self.norms_2 = [], [], [], []
-    for i in range(n_layers):
-      dilation = kernel_size ** i
-      padding = (kernel_size * dilation - dilation) // 2
-      self.convs_sep.append(nn.Conv1d(channels, channels, kernel_size, groups=channels, dilation=dilation, padding=padding))
-      self.convs_1x1.append(nn.Conv1d(channels, channels, 1))
-      self.norms_1.append(LayerNorm(channels))
-      self.norms_2.append(LayerNorm(channels))
-  def forward(self, x, x_mask, g=None):
-    if g is not None: x = x + g
-    for i in range(self.n_layers):
-      y = self.convs_sep[i](x * x_mask)
-      y = self.norms_1[i].forward(y).gelu()
-      y = self.convs_1x1[i](y)
-      y = self.norms_2[i].forward(y).gelu()
-      x = x + y.dropout(self.p_dropout)
-    return x * x_mask
-
-class ConvFlow:
-  def __init__(self, in_channels, filter_channels, kernel_size, n_layers, num_bins=10, tail_bound=5.0):
-    self.in_channels, self.filter_channels, self.kernel_size, self.n_layers, self.num_bins, self.tail_bound = in_channels, filter_channels, kernel_size, n_layers, num_bins, tail_bound
-    self.half_channels = in_channels // 2
-    self.pre = nn.Conv1d(self.half_channels, filter_channels, 1)
-    self.convs = DDSConv(filter_channels, kernel_size, n_layers, p_dropout=0.)
-    self.proj = nn.Conv1d(filter_channels, self.half_channels * (num_bins * 3 - 1), 1)
-  def forward(self, x, x_mask, g=None, reverse=False):
-    x0, x1 = x.split([self.half_channels] * 2, 1)
-    h = self.proj(self.convs.forward(self.pre(x0), x_mask, g=g)) * x_mask
-    b, c, t = x0.shape
-    h = h.reshape(b, c, -1, t).permute(0, 1, 3, 2) # [b, cx?, t] -> [b, c, t, ?]
-    un_normalized_widths = h[..., :self.num_bins] / math.sqrt(self.filter_channels)
-    un_normalized_heights = h[..., self.num_bins:2*self.num_bins] / math.sqrt(self.filter_channels)
-    un_normalized_derivatives = h[..., 2 * self.num_bins:]
-    x1, log_abs_det = piecewise_rational_quadratic_transform(x1, un_normalized_widths, un_normalized_heights, un_normalized_derivatives, inverse=reverse, tails='linear', tail_bound=self.tail_bound)
-    x = x0.cat(x1, dim=1) * x_mask
-    return x if reverse else (x, Tensor.sum(log_abs_det * x_mask, [1,2]))
-
-class ResidualCouplingLayer:
-  def __init__(self, channels, hidden_channels, kernel_size, dilation_rate, n_layers, p_dropout=0, gin_channels=0, mean_only=False):
-    assert channels % 2 == 0, "channels should be divisible by 2"
-    self.channels, self.hidden_channels, self.kernel_size, self.dilation_rate, self.n_layers, self.mean_only = channels, hidden_channels, kernel_size, dilation_rate, n_layers, mean_only
-    self.half_channels = channels // 2
-    self.pre = nn.Conv1d(self.half_channels, hidden_channels, 1)
-    self.enc = WN(hidden_channels, kernel_size, dilation_rate, n_layers, p_dropout=p_dropout, gin_channels=gin_channels)
-    self.post = nn.Conv1d(hidden_channels, self.half_channels * (2 - mean_only), 1)
-  def forward(self, x, x_mask, g=None, reverse=False):
-    x0, x1 = x.split([self.half_channels] * 2, 1)
-    stats = self.post(self.enc.forward(self.pre(x0) * x_mask, x_mask, g=g)) * x_mask
-    if not self.mean_only:
-      m, logs = stats.split([self.half_channels] * 2, 1)
-    else:
-      m = stats
-      logs = Tensor.zeros_like(m)
-    if not reverse: return x0.cat((m + x1 * logs.exp() * x_mask), dim=1)
-    return x0.cat(((x1 - m) * (-logs).exp() * x_mask), dim=1)
-
-class Log:
-  def forward(self, x : Tensor, x_mask, reverse=False):
-    if not reverse:
-      y = x.maximum(1e-5).log() * x_mask
-      return y, (-y).sum([1, 2])
-    return x.exp() * x_mask
-
-class Flip:
-  def forward(self, x: Tensor, *args, reverse=False, **kwargs):
-    return x.flip([1]) if reverse else (x.flip([1]), Tensor.zeros(x.shape[0], dtype=x.dtype).to(device=x.device))
-
-class ElementwiseAffine:
-  def __init__(self, channels): self.m, self.logs = Tensor.zeros(channels, 1), Tensor.zeros(channels, 1)
-  def forward(self, x, x_mask, reverse=False, **kwargs): # x if reverse else y, logdet
-    return (x - self.m) * Tensor.exp(-self.logs) * x_mask if reverse \
-      else ((self.m + Tensor.exp(self.logs) * x) * x_mask, Tensor.sum(self.logs * x_mask, [1, 2]))
-
-class MultiHeadAttention:
-  # Multi-head attention over [b, d, t] tensors using kernel-size-1 Conv1d projections, with optional
-  # windowed relative-position embeddings (enabled when window_size is not None).
-  def __init__(self, channels, out_channels, n_heads, p_dropout=0., window_size=None, heads_share=True, block_length=None, proximal_bias=False, proximal_init=False):
-    # channels are split evenly across the heads
-    assert channels % n_heads == 0
-    self.channels, self.out_channels, self.n_heads, self.p_dropout, self.window_size, self.heads_share, self.block_length, self.proximal_bias, self.proximal_init = channels, out_channels, n_heads, p_dropout, window_size, heads_share, block_length, proximal_bias, proximal_init
-    self.attn, self.k_channels  = None, channels // n_heads
-    self.conv_q, self.conv_k, self.conv_v = [nn.Conv1d(channels, channels, 1) for _ in range(3)]
-    self.conv_o = nn.Conv1d(channels, out_channels, 1)
-    # relative key/value embeddings hold 2*window_size+1 offsets; one set is shared by all heads when heads_share is set
-    if window_size is not None: self.emb_rel_k, self.emb_rel_v = [Tensor.randn(1 if heads_share else n_heads, window_size * 2 + 1, self.k_channels) * (self.k_channels ** -0.5) for _ in range(2)]
-  def forward(self, x, c, attn_mask=None):
-    # queries are projected from x, keys and values from c; the attention weights are kept on self.attn
-    q, k, v = self.conv_q(x), self.conv_k(c), self.conv_v(c)
-    x, self.attn = self.attention(q, k, v, mask=attn_mask)
-    return self.conv_o(x)
-  def attention(self, query: Tensor, key: Tensor, value: Tensor, mask=None):# reshape [b, d, t] -> [b, n_h, t, d_k]
-    # Returns (output [b, d, t_t], attention weights [b, n_h, t_t, t_s]).
-    b, d, t_s, t_t = key.shape[0], key.shape[1], key.shape[2], query.shape[2]
-    query = query.reshape(b, self.n_heads, self.k_channels, t_t).transpose(2, 3)
-    key = key.reshape(b, self.n_heads, self.k_channels, t_s).transpose(2, 3)
-    value = value.reshape(b, self.n_heads, self.k_channels, t_s).transpose(2, 3)
-    # scaled dot-product scores
-    scores = (query / math.sqrt(self.k_channels)) @ key.transpose(-2, -1)
-    if self.window_size is not None:
-      assert t_s == t_t, "Relative attention is only available for self-attention."
-      key_relative_embeddings = self._get_relative_embeddings(self.emb_rel_k, t_s)
-      rel_logits = self._matmul_with_relative_keys(query / math.sqrt(self.k_channels), key_relative_embeddings)
-      scores = scores + self._relative_position_to_absolute_position(rel_logits)
-    if mask is not None:
-      # masked-out positions get a large negative score instead of -inf
-      scores = Tensor.where(mask, scores, -1e4)
-      # note: the block_length band restriction is only applied when a mask was passed
-      if self.block_length is not None:
-        assert t_s == t_t, "Local attention is only available for self-attention."
-        scores = Tensor.where(Tensor.ones_like(scores).triu(-self.block_length).tril(self.block_length), scores, -1e4)
-    p_attn = scores.softmax(axis=-1)  # [b, n_h, t_t, t_s]
-    output = p_attn.matmul(value)
-    if self.window_size is not None:
-      relative_weights = self._absolute_position_to_relative_position(p_attn)
-      value_relative_embeddings = self._get_relative_embeddings(self.emb_rel_v, t_s)
-      output = output + self._matmul_with_relative_values(relative_weights, value_relative_embeddings)
-    output = output.transpose(2, 3).contiguous().reshape(b, d, t_t)  # [b, n_h, t_t, d_k] -> [b, d, t_t]
-    return output, p_attn
-  def _matmul_with_relative_values(self, x, y): return x.matmul(y.unsqueeze(0))                 # x: [b, h, l, m], y: [h or 1, m, d], ret: [b, h, l, d]
-  def _matmul_with_relative_keys(self, x, y): return x.matmul(y.unsqueeze(0).transpose(-2, -1)) # x: [b, h, l, d], y: [h or 1, m, d], ret: [b, h, l, m]
-  def _get_relative_embeddings(self, relative_embeddings, length):
-    # Pad (when length exceeds window_size + 1) and then slice the embedding table to 2*length-1 entries along axis 1.
-    pad_length, slice_start_position = max(length - (self.window_size + 1), 0), max((self.window_size + 1) - length, 0)
-    padded_relative_embeddings = relative_embeddings if pad_length <= 0\
-      else relative_embeddings.pad(convert_pad_shape([[0, 0], [pad_length, pad_length], [0, 0]]))
-    return padded_relative_embeddings[:, slice_start_position:(slice_start_position + 2 * length - 1)] #used_relative_embeddings
-  def _relative_position_to_absolute_position(self, x: Tensor): # x: [b, h, l, 2*l-1] -> [b, h, l, l]
-    batch, heads, length, _ = x.shape
-    # pad one column, flatten, pad length-1 more entries, then reshape so each row is shifted by one relative to the previous
-    x = x.pad(convert_pad_shape([[0,0],[0,0],[0,0],[0,1]]))
-    x_flat = x.reshape([batch, heads, length * 2 * length]).pad(convert_pad_shape([[0,0],[0,0],[0,length-1]]))
-    return x_flat.reshape([batch, heads, length+1, 2*length-1])[:, :, :length, length-1:]
-  def _absolute_position_to_relative_position(self, x: Tensor): # x: [b, h, l, l] -> [b, h, l, 2*l-1]
-    batch, heads, length, _ = x.shape
-    # pad length-1 columns, flatten, prepend `length` entries, then reshape and drop the first column
-    x = x.pad(convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, length-1]]))
-    x_flat = x.reshape([batch, heads, length**2 + length*(length -1)]).pad(convert_pad_shape([[0, 0], [0, 0], [length, 0]]))
-    return x_flat.reshape([batch, heads, length, 2*length])[:,:,:,1:]
-
-class FFN:
-  # Two-convolution feed-forward block; the input is masked and padded before each Conv1d.
-  def __init__(self, in_channels, out_channels, filter_channels, kernel_size, p_dropout=0., activation=None, causal=False):
-    self.in_channels, self.out_channels, self.filter_channels = in_channels, out_channels, filter_channels
-    self.kernel_size, self.p_dropout = kernel_size, p_dropout
-    self.activation, self.causal = activation, causal
-    # causal mode pads on the left only; otherwise the padding is split around the sequence
-    if causal: self.padding = self._causal_padding
-    else: self.padding = self._same_padding
-    self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size)
-    self.conv_2 = nn.Conv1d(filter_channels, out_channels, kernel_size)
-  def forward(self, x, x_mask):
-    hidden = self.conv_1(self.padding(x * x_mask))
-    # "gelu" selects the x * sigmoid(1.702 * x) form; any other setting uses ReLU
-    if self.activation == "gelu": hidden = hidden * (1.702 * hidden).sigmoid()
-    else: hidden = hidden.relu()
-    hidden = hidden.dropout(self.p_dropout) * x_mask
-    return self.conv_2(self.padding(hidden)) * x_mask
-  def _causal_padding(self, x):
-    # a kernel of size 1 needs no padding
-    if self.kernel_size == 1: return x
-    return x.pad(convert_pad_shape([[0, 0], [0, 0], [self.kernel_size - 1, 0]]))
-  def _same_padding(self, x):
-    if self.kernel_size == 1: return x
-    left, right = (self.kernel_size - 1) // 2, self.kernel_size // 2
-    return x.pad(convert_pad_shape([[0, 0], [0, 0], [left, right]]))
-
-class Encoder:
-  # Stack of n_layers blocks, each: attention -> LayerNorm -> FFN -> LayerNorm, with residual additions.
-  def __init__(self, hidden_channels, filter_channels, n_heads, n_layers, kernel_size=1, p_dropout=0., window_size=4, **kwargs):
-    self.hidden_channels, self.filter_channels, self.n_heads, self.n_layers = hidden_channels, filter_channels, n_heads, n_layers
-    self.kernel_size, self.p_dropout, self.window_size = kernel_size, p_dropout, window_size
-    self.attn_layers, self.norm_layers_1, self.ffn_layers, self.norm_layers_2 = [], [], [], []
-    for _ in range(n_layers):
-      # sub-layers are created layer by layer, in the order they are applied
-      attn = MultiHeadAttention(hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout, window_size=window_size)
-      self.attn_layers.append(attn)
-      self.norm_layers_1.append(LayerNorm(hidden_channels))
-      ffn = FFN(hidden_channels, hidden_channels, filter_channels, kernel_size, p_dropout=p_dropout)
-      self.ffn_layers.append(ffn)
-      self.norm_layers_2.append(LayerNorm(hidden_channels))
-  def forward(self, x, x_mask):
-    # pairwise mask: the product of x_mask expanded along two different axes
-    attn_mask = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1)
-    x = x * x_mask
-    for attn, norm_1, ffn, norm_2 in zip(self.attn_layers, self.norm_layers_1, self.ffn_layers, self.norm_layers_2):
-      attended = attn.forward(x, x, attn_mask).dropout(self.p_dropout)
-      x = norm_1.forward(x + attended)
-      transformed = ffn.forward(x, x_mask).dropout(self.p_dropout)
-      x = norm_2.forward(x + transformed)
-    return x * x_mask
-
-DEFAULT_MIN_BIN_WIDTH, DEFAULT_MIN_BIN_HEIGHT, DEFAULT_MIN_DERIVATIVE = 1e-3, 1e-3, 1e-3
-def piecewise_rational_quadratic_transform(inputs, un_normalized_widths, un_normalized_heights, un_normalized_derivatives, inverse=False, tails=None, tail_bound=1., min_bin_width=DEFAULT_MIN_BIN_WIDTH, min_bin_height=DEFAULT_MIN_BIN_HEIGHT, min_derivative=DEFAULT_MIN_DERIVATIVE):
-  # Dispatch to the bounded spline (tails is None) or to the unconstrained spline with the given tails.
-  # Returns whatever the selected spline function returns.
-  shared_kwargs = dict(inputs=inputs, inverse=inverse, min_bin_width=min_bin_width, min_bin_height=min_bin_height, min_derivative=min_derivative)
-  if tails is None:
-    # rational_quadratic_spline spells its parameters `unnormalized_*`, so the keywords must be renamed here;
-    # forwarding the `un_normalized_*` names raised a TypeError
-    return rational_quadratic_spline(unnormalized_widths=un_normalized_widths, unnormalized_heights=un_normalized_heights, unnormalized_derivatives=un_normalized_derivatives, **shared_kwargs)
-  return unconstrained_rational_quadratic_spline(un_normalized_widths=un_normalized_widths, un_normalized_heights=un_normalized_heights, un_normalized_derivatives=un_normalized_derivatives, tails=tails, tail_bound=tail_bound, **shared_kwargs)
-def unconstrained_rational_quadratic_spline(inputs, un_normalized_widths, un_normalized_heights, un_normalized_derivatives, inverse=False, tails='linear', tail_bound=1., min_bin_width=DEFAULT_MIN_BIN_WIDTH, min_bin_height=DEFAULT_MIN_BIN_HEIGHT, min_derivative=DEFAULT_MIN_DERIVATIVE):
-  # Spline on [-tail_bound, tail_bound] in both directions; only 'linear' tails are supported.
-  # The two leading axes of every tensor are squeezed before the spline call and restored on the results.
-  if tails != 'linear': raise RuntimeError('{} tails are not implemented.'.format(tails))
-  strip_leading = lambda t: t.squeeze(dim=0).squeeze(dim=0)
-  restore_leading = lambda t: t.unsqueeze(dim=0).unsqueeze(dim=0)
-  # boundary value appended on both ends of the derivatives
-  edge_value = np.log(np.exp(1 - min_derivative) - 1).item()
-  padded_derivatives = cat_lr(un_normalized_derivatives, edge_value, edge_value)
-  output, log_abs_det = rational_quadratic_spline(
-    inputs=strip_leading(inputs),
-    unnormalized_widths=strip_leading(un_normalized_widths),
-    unnormalized_heights=strip_leading(un_normalized_heights),
-    unnormalized_derivatives=strip_leading(padded_derivatives),
-    inverse=inverse, left=-tail_bound, right=tail_bound, bottom=-tail_bound, top=tail_bound,
-    min_bin_width=min_bin_width, min_bin_height=min_bin_height, min_derivative=min_derivative)
-  return restore_leading(output), restore_leading(log_abs_det)
-def rational_quadratic_spline(inputs: Tensor, unnormalized_widths: Tensor, unnormalized_heights: Tensor, unnormalized_derivatives: Tensor, inverse=False, left=0., right=1., bottom=0., top=1., min_bin_width=DEFAULT_MIN_BIN_WIDTH, min_bin_height=DEFAULT_MIN_BIN_HEIGHT, min_derivative=DEFAULT_MIN_DERIVATIVE):
-  # Piecewise rational-quadratic spline from [left, right] to [bottom, top].
-  # Returns (outputs, log-derivative term); with inverse=True the spline is inverted and the log term is negated.
-  # Raises ValueError when the minimum bin width/height times the number of bins exceeds 1.
-  num_bins = unnormalized_widths.shape[-1]
-  if min_bin_width * num_bins > 1.0: raise ValueError('Minimal bin width too large for the number of bins')
-  if min_bin_height * num_bins > 1.0: raise ValueError('Minimal bin height too large for the number of bins')
-  # softmax-normalised widths, each at least min_bin_width
-  widths = min_bin_width + (1 - min_bin_width * num_bins) * unnormalized_widths.softmax(axis=-1)
-  # knot positions along x: `left`, the scaled cumulative widths, then `right` (plus 1e-6 in the forward direction)
-  # NOTE(review): cumsum runs over axis 1, which is the last axis only for 2-D inputs - confirm against callers
-  cum_widths = cat_lr(((right - left) * widths[..., :-1].cumsum(axis=1) + left), left, right + 1e-6 if not inverse else right)
-  # widths are recomputed from the knots so they are consistent with the appended edges
-  widths = cum_widths[..., 1:] - cum_widths[..., :-1]
-  # log(exp(x) + 1) keeps the knot derivatives above min_derivative
-  derivatives = min_derivative + (unnormalized_derivatives.exp()+1).log()
-  heights = min_bin_height + (1 - min_bin_height * num_bins) * unnormalized_heights.softmax(axis=-1)
-  cum_heights = cat_lr(((top - bottom) * heights[..., :-1].cumsum(axis=1) + bottom), bottom, top + 1e-6 if inverse else top)
-  heights = cum_heights[..., 1:] - cum_heights[..., :-1]
-  # bin index = number of knots that are <= the input, minus one (knots along y when inverting, along x otherwise)
-  bin_idx = ((inputs[..., None] >= (cum_heights if inverse else cum_widths)).sum(axis=-1) - 1)[..., None]
-  # per-input parameters of the selected bin
-  input_cum_widths = gather(cum_widths, bin_idx, axis=-1)[..., 0]
-  input_bin_widths = gather(widths, bin_idx, axis=-1)[..., 0]
-  input_cum_heights = gather(cum_heights, bin_idx, axis=-1)[..., 0]
-  input_delta = gather(heights / widths, bin_idx, axis=-1)[..., 0]
-  input_derivatives = gather(derivatives, bin_idx, axis=-1)[..., 0]
-  input_derivatives_plus_one = gather(derivatives[..., 1:], bin_idx, axis=-1)[..., 0]
-  input_heights = gather(heights, bin_idx, axis=-1)[..., 0]
-  if inverse:
-    # coefficients of the quadratic a*root^2 + b*root + c = 0 for the position inside the bin
-    a = ((inputs - input_cum_heights) * (input_derivatives + input_derivatives_plus_one - 2 * input_delta) + input_heights * (input_delta - input_derivatives))
-    b = (input_heights * input_derivatives - (inputs - input_cum_heights) * (input_derivatives + input_derivatives_plus_one - 2 * input_delta))
-    c = - input_delta * (inputs - input_cum_heights)
-    discriminant = b.square() - 4 * a * c
-    # assert (discriminant.numpy() >= 0).all()
-    # root written as 2c / (-b - sqrt(disc)) rather than the textbook (-b + sqrt(disc)) / 2a
-    root = (2 * c) / (-b - discriminant.sqrt())
-    theta_one_minus_theta = root * (1 - root)
-    denominator = input_delta + ((input_derivatives + input_derivatives_plus_one - 2 * input_delta) * theta_one_minus_theta)
-    derivative_numerator = input_delta.square() * (input_derivatives_plus_one * root.square() + 2 * input_delta * theta_one_minus_theta + input_derivatives * (1 - root).square())
-    return root * input_bin_widths + input_cum_widths, -(derivative_numerator.log() - 2 * denominator.log())
-  # forward direction: theta is the relative position of the input inside its bin
-  theta = (inputs - input_cum_widths) / input_bin_widths
-  theta_one_minus_theta = theta * (1 - theta)
-  numerator = input_heights * (input_delta * theta.pow(2) + input_derivatives * theta_one_minus_theta)
-  denominator = input_delta + ((input_derivatives + input_derivatives_plus_one - 2 * input_delta) * theta_one_minus_theta)
-  derivative_numerator = input_delta.pow(2) * (input_derivatives_plus_one * theta.pow(2) + 2 * input_delta * theta_one_minus_theta + input_derivatives * (1 - theta).pow(2))
-  return input_cum_heights + numerator / denominator, derivative_numerator.log() - 2 * denominator.log()
-
-def sequence_mask(length: Tensor, max_length):
-  # Boolean mask with one row per entry of `length`: position p in a row is set when p < that entry.
-  positions = Tensor.arange(max_length, dtype=length.dtype, device=length.device)
-  return positions.unsqueeze(0) < length.unsqueeze(1)
-def generate_path(duration: Tensor, mask: Tensor):  # duration: [b, 1, t_x], mask: [b, 1, t_y, t_x]
-  # Turn per-token durations into an alignment between the t_x tokens and the t_y output frames.
-  batch, _, t_y, t_x = mask.shape
-  # cumulative durations give the frame at which each token ends
-  end_frames = duration.cumsum(axis=2).reshape(batch * t_x)
-  covered = sequence_mask(end_frames, t_y).cast(mask.dtype).reshape(batch, t_x, t_y)
-  # subtracting the mask of the previous token leaves only the frames belonging to each token
-  previous = covered.pad(convert_pad_shape([[0, 0], [1, 0], [0, 0]]))[:, :-1]
-  return (covered - previous).unsqueeze(1).transpose(2, 3) * mask
-def fused_add_tanh_sigmoid_multiply(input_a: Tensor, input_b: Tensor, n_channels: int):
-  # Gated activation: tanh of the first n_channels channels (axis 1) of a+b times sigmoid of the remaining channels.
-  summed = input_a + input_b
-  tanh_part = summed[:, :n_channels, :].tanh()
-  gate_part = summed[:, n_channels:, :].sigmoid()
-  return tanh_part * gate_part
-
-def cat_lr(t, left, right):
-  # Extend `t` along its last axis with one `left`-valued entry in front and one `right`-valued entry behind.
-  edge_shape = get_shape(t)
-  left_edge, right_edge = Tensor.full(edge_shape, left), Tensor.full(edge_shape, right)
-  return left_edge.cat(t, dim=-1).cat(right_edge, dim=-1)
-def get_shape(tensor):
-  # Shape of `tensor` as a tuple, with the size of the last axis replaced by 1.
-  dims = list(tensor.shape)
-  dims[-1] = 1
-  return tuple(dims)
-def convert_pad_shape(pad_shape):
-  # Convert a nested sequence of per-axis paddings into a tuple of tuples.
-  return tuple(map(tuple, pad_shape))
-def get_padding(kernel_size, dilation=1):
-  # Half of the dilated kernel extent (kernel_size*dilation - dilation), truncated to an int.
-  extent = kernel_size*dilation - dilation
-  return int(extent/2)
-
-def gather(x, indices, axis):
-  # Select values of `x` along `axis` at the positions given by `indices`, implemented as a
-  # one-hot comparison followed by a multiply and a sum rather than by indexing.
-  # negative indices are wrapped by adding the size of `axis`; axes 0 and `axis` of indices are then swapped
-  indices = (indices < 0).where(indices + x.shape[axis], indices).transpose(0, axis)
-  # build a permutation of x that swaps axes 0 and `axis`, then moves the original `axis` to the last position
-  permute_args = list(range(x.ndim))
-  permute_args[0], permute_args[axis] = permute_args[axis], permute_args[0]
-  permute_args.append(permute_args.pop(0))
-  x = x.permute(*permute_args)
-  reshape_arg = [1] * x.ndim + [x.shape[-1]]
-  # compare every index against arange(size) to get a one-hot selector, weight x with it and sum the extra axis away
-  return ((indices.unsqueeze(indices.ndim).expand(*indices.shape, x.shape[-1]) ==
-           Tensor.arange(x.shape[-1]).reshape(*reshape_arg).expand(*indices.shape, x.shape[-1])) * x).sum(indices.ndim).transpose(0, axis)
-
-def norm_except_dim(v, dim):
-  # 2-norm of the numpy array `v` taken over every axis except `dim`.
-  # dim == -1 means "keep no axis": the norm of the whole array is returned as a scalar.
-  # Otherwise the result has v.ndim axes, all of size 1 except axis `dim`, which keeps its size.
-  if dim == -1: return np.linalg.norm(v)
-  if dim == 0:
-    (output_shape := [1] * v.ndim)[0] = v.shape[0]
-    return np.linalg.norm(v.reshape(v.shape[0], -1), axis=1).reshape(output_shape)
-  if dim == v.ndim - 1:
-    (output_shape := [1] * v.ndim)[-1] = v.shape[-1]
-    return np.linalg.norm(v.reshape(-1, v.shape[-1]), axis=0).reshape(output_shape)
-  # Interior axis: bring `dim` to the front, reduce, then move the kept axis back to position `dim`.
-  # Re-applying the forward permutation is not its inverse in general and produced a wrongly shaped
-  # result for arrays with four or more axes.
-  norms = norm_except_dim(np.moveaxis(v, dim, 0), 0)
-  return np.moveaxis(norms, 0, dim)
-def weight_norm(v: Tensor, g: Tensor, dim):
-  # Rebuild a weight tensor from direction `v` and magnitude `g`: v * g / norm(v), computed in numpy.
-  direction, magnitude = v.numpy(), g.numpy()
-  scale = magnitude / norm_except_dim(direction, dim)
-  return Tensor(direction * scale)
-
-# HPARAMS LOADING
-def get_hparams_from_file(path):
-  # Parse the JSON file at `path` and wrap its top-level object in an HParams instance.
-  with open(path, "r") as config_file:
-    config = json.load(config_file)
-  return HParams(**config)
-class HParams:
-  # Attribute-style container for hyperparameters; nested dicts become nested HParams.
-  # Supports the mapping helpers keys/items/values, len(), `in`, and item get/set, all backed by the instance __dict__.
-  def __init__(self, **kwargs):
-    for name, value in kwargs.items():
-      # only plain dicts are wrapped; every other value is stored as given
-      if type(value) is dict: value = HParams(**value)
-      self[name] = value
-  def keys(self): return vars(self).keys()
-  def items(self): return vars(self).items()
-  def values(self): return vars(self).values()
-  def __len__(self): return len(vars(self))
-  def __getitem__(self, key): return getattr(self, key)
-  def __setitem__(self, key, value): return setattr(self, key, value)
-  def __contains__(self, key): return key in vars(self)
-  def __repr__(self): return repr(vars(self))
-
-# MODEL LOADING
-def load_model(symbols, hps, model) -> Synthesizer:
-  # Build a Synthesizer sized from the hyperparameters and load the checkpoint fetched from model[1] into it.
-  spec_channels = hps.data.filter_length // 2 + 1
-  segment_size = hps.train.segment_size // hps.data.hop_length
-  net_g = Synthesizer(len(symbols), spec_channels, segment_size, n_speakers = hps.data.n_speakers, **hps.model)
-  # the values returned by load_checkpoint are not needed here
-  load_checkpoint(fetch(model[1]), net_g, None)
-  return net_g
-def load_checkpoint(checkpoint_path, model: Synthesizer, optimizer=None, skip_list=[]):
-  # Load a checkpoint file into `model` (and into `optimizer` when one is given).
-  # Keys containing any entry of `skip_list` are ignored.
-  # Returns (model, optimizer, learning_rate, iteration).
-  # NOTE(review): skip_list defaults to a shared mutable list; it is only read in this function.
-  assert Path(checkpoint_path).is_file()
-  start_time = time.time()
-  checkpoint_dict = torch_load(checkpoint_path)
-  iteration, learning_rate = checkpoint_dict['iteration'], checkpoint_dict['learning_rate']
-  if optimizer: optimizer.load_state_dict(checkpoint_dict['optimizer'])
-  saved_state_dict = checkpoint_dict['model']
-  # weight_g / weight_v arrive under separate keys; they are held here until both have been seen
-  weight_g, weight_v, parent = None, None, None
-  for key, v in saved_state_dict.items():
-    if any(layer in key for layer in skip_list): continue
-    try:
-      # walk the dotted key through the model: numeric parts index, dict parts look up, the rest are attributes
-      obj, skip = model, False
-      for k in key.split('.'):
-        if k.isnumeric(): obj = obj[int(k)]
-        elif isinstance(obj, dict): obj = obj[k]
-        else:
-          # LayerNorm parameters are stored as gamma/beta in the checkpoint
-          if isinstance(obj, (LayerNorm, nn.LayerNorm)) and k in ["gamma", "beta"]:
-            k = "weight" if k == "gamma" else "bias"
-          elif k in ["weight_g", "weight_v"]:
-            # remember the owning layer and do not descend further for this key
-            parent, skip = obj, True
-            if k == "weight_g": weight_g = v
-            else: weight_v = v
-          if not skip: obj = getattr(obj, k)
-      if weight_g is not None and weight_v is not None:
-        # both halves are present: keep the raw values on the layer and assign the combined weight
-        setattr(obj, "weight_g", weight_g.numpy())
-        setattr(obj, "weight_v", weight_v.numpy())
-        obj, v = getattr(parent, "weight"), weight_norm(weight_v, weight_g, 0)
-        weight_g, weight_v, parent, skip = None, None, None, False
-      # a shape mismatch is logged, not raised
-      if not skip and obj.shape == v.shape: obj.assign(v.to(obj.device))
-      elif not skip: logging.error(f"MISMATCH SHAPE IN {key}, {obj.shape} {v.shape}")
-    # re-raises unchanged, so this handler has no effect on behavior
-    except Exception as e: raise e
-  logging.info(f"Loaded checkpoint '{checkpoint_path}' (iteration {iteration}) in {time.time() - start_time:.4f}s")
-  return model, optimizer, learning_rate, iteration
-
-# Used for cleaning input text and mapping to symbols
-class TextMapper: # Based on https://github.com/keithito/tacotron
-  # Cleans input text with named cleaner methods and maps the resulting characters to symbol ids.
-  def __init__(self, symbols, apply_cleaners=True):
-    self.apply_cleaners, self.symbols, self._inflect = apply_cleaners, symbols, None
-    self._symbol_to_id = {s: i for i, s in enumerate(symbols)}
-    self._whitespace_re, self._abbreviations = re.compile(r'\s+'), [(re.compile('\\b%s\\.' % x[0], re.IGNORECASE), x[1]) for x in [('mrs', 'misess'), ('mr', 'mister'), ('dr', 'doctor'), ('st', 'saint'), ('co', 'company'), ('jr', 'junior'), ('maj', 'major'), ('gen', 'general'), ('drs', 'doctors'), ('rev', 'reverend'), ('lt', 'lieutenant'), ('hon', 'honorable'), ('sgt', 'sergeant'), ('capt', 'captain'), ('esq', 'esquire'), ('ltd', 'limited'), ('col', 'colonel'), ('ft', 'fort'), ]]
-    self.phonemizer = EspeakBackend(
-        language="en-us", punctuation_marks=Punctuation.default_marks(), preserve_punctuation=True, with_stress=True,
-    )
-  def text_to_sequence(self, text, cleaner_names):
-    # Apply the named cleaners in order (or just strip when cleaning is disabled) and map each character to its id.
-    # Raises ModuleNotFoundError for an unknown cleaner name and KeyError for a character that is not a known symbol.
-    if self.apply_cleaners:
-      for name in cleaner_names:
-        # default of None so that an unknown name reaches the explicit error below instead of an AttributeError
-        cleaner = getattr(self, name, None)
-        if not cleaner: raise ModuleNotFoundError('Unknown cleaner: %s' % name)
-        text = cleaner(text)
-    else: text = text.strip()
-    return [self._symbol_to_id[symbol] for symbol in text]
-  def get_text(self, text, add_blank=False, cleaners=('english_cleaners2',)):
-    # Returns an int64 Tensor of symbol ids, with id 0 interleaved between them when add_blank is set.
-    text_norm = self.text_to_sequence(text, cleaners)
-    return Tensor(self.intersperse(text_norm, 0) if add_blank else text_norm, dtype=dtypes.int64)
-  def intersperse(self, lst, item):
-    # [a, b] -> [item, a, item, b, item]
-    (result := [item] * (len(lst) * 2 + 1))[1::2] = lst
-    return result
-  def phonemize(self, text, strip=True): return _phonemize(self.phonemizer, text, default_separator, strip, 1, False, False)
-  def filter_oov(self, text): return "".join(list(filter(lambda x: x in self._symbol_to_id, text)))
-  def base_english_cleaners(self, text): return self.collapse_whitespace(self.phonemize(self.expand_abbreviations(unidecode(text.lower()))))
-  def english_cleaners2(self, text): return self.base_english_cleaners(text)
-  def transliteration_cleaners(self, text): return self.collapse_whitespace(unidecode(text.lower()))
-  def cjke_cleaners(self, text): return re.sub(r'([^\.,!\?\-…~])$', r'\1.', re.sub(r'\s+$', '', self.english_to_ipa2(text).replace('ɑ', 'a').replace('ɔ', 'o').replace('ɛ', 'e').replace('ɪ', 'i').replace('ʊ', 'u')))
-  def cjke_cleaners2(self, text): return re.sub(r'([^\.,!\?\-…~])$', r'\1.', re.sub(r'\s+$', '', self.english_to_ipa2(text)))
-  def cjks_cleaners(self, text): return re.sub(r'([^\.,!\?\-…~])$', r'\1.', re.sub(r'\s+$', '', self.english_to_lazy_ipa(text)))
-  def english_to_ipa2(self, text):
-    _ipa_to_ipa2 = [(re.compile('%s' % x[0]), x[1]) for x in [ ('r', 'ɹ'), ('ʤ', 'dʒ'), ('ʧ', 'tʃ')]]
-    return reduce(lambda t, rx: re.sub(rx[0], rx[1], t), _ipa_to_ipa2, self.mark_dark_l(self.english_to_ipa(text))).replace('...', '…')
-  def mark_dark_l(self, text): return re.sub(r'l([^aeiouæɑɔəɛɪʊ ]*(?: |$))', lambda x: 'ɫ' + x.group(1), text)
-  def english_to_ipa(self, text):
-    import eng_to_ipa as ipa
-    return self.collapse_whitespace(ipa.convert(self.normalize_numbers(self.expand_abbreviations(unidecode(text).lower()))))
-  def english_to_lazy_ipa(self, text):
-    _lazy_ipa = [(re.compile('%s' % x[0]), x[1]) for x in [('r', 'ɹ'), ('æ', 'e'), ('ɑ', 'a'), ('ɔ', 'o'), ('ð', 'z'), ('θ', 's'), ('ɛ', 'e'), ('ɪ', 'i'), ('ʊ', 'u'), ('ʒ', 'ʥ'), ('ʤ', 'ʥ'), ('ˈ', '↓')]]
-    return reduce(lambda t, rx: re.sub(rx[0], rx[1], t), _lazy_ipa, self.english_to_ipa(text))
-  def expand_abbreviations(self, text): return reduce(lambda t, abbr: re.sub(abbr[0], abbr[1], t), self._abbreviations, text)
-  def collapse_whitespace(self, text): return re.sub(self._whitespace_re, ' ', text)
-  def normalize_numbers(self, text):
-    # Spell out numbers; the substitutions run from the most specific pattern (commas, currency) to plain digits.
-    import inflect
-    self._inflect = inflect.engine()
-    text = re.sub(re.compile(r'([0-9][0-9\,]+[0-9])'), self._remove_commas, text)
-    text = re.sub(re.compile(r'£([0-9\,]*[0-9]+)'), r'\1 pounds', text)
-    text = re.sub(re.compile(r'\$([0-9\.\,]*[0-9]+)'), self._expand_dollars, text)
-    text = re.sub(re.compile(r'([0-9]+\.[0-9]+)'), self._expand_decimal_point, text)
-    text = re.sub(re.compile(r'[0-9]+(st|nd|rd|th)'), self._expand_ordinal, text)
-    text = re.sub(re.compile(r'[0-9]+'), self._expand_number, text)
-    return text
-  def _remove_commas(self, m): return m.group(1).replace(',', '') # george won't like this
-  def _expand_dollars(self, m):
-    match = m.group(1)
-    parts = match.split('.')
-    if len(parts) > 2: return match + ' dollars'  # Unexpected format
-    dollars, cents = int(parts[0]) if parts[0] else 0, int(parts[1]) if len(parts) > 1 and parts[1] else 0
-    if dollars and cents: return '%s %s, %s %s' % (dollars, 'dollar' if dollars == 1 else 'dollars', cents, 'cent' if cents == 1 else 'cents')
-    if dollars: return '%s %s' % (dollars, 'dollar' if dollars == 1 else 'dollars')
-    if cents: return '%s %s' % (cents, 'cent' if cents == 1 else 'cents')
-    return 'zero dollars'
-  def _expand_decimal_point(self, m): return m.group(1).replace('.', ' point ')
-  def _expand_ordinal(self, m): return self._inflect.number_to_words(m.group(0))
-  def _expand_number(self, _inflect, m=None):
-    # re.sub calls its callback with the match object only, so a single positional argument is the match
-    # and the engine stored on the instance is used; the explicit (engine, match) form keeps working.
-    if m is None: m, _inflect = _inflect, self._inflect
-    num = int(m.group(0))
-    if 1000 < num < 3000:
-      # values in this range are read like years
-      if num == 2000: return 'two thousand'
-      if 2000 < num < 2010: return 'two thousand ' + self._inflect.number_to_words(num % 100)
-      if num % 100 == 0: return self._inflect.number_to_words(num // 100) + ' hundred'
-      return _inflect.number_to_words(num, andword='', zero='oh', group=2).replace(', ', ' ')
-    return self._inflect.number_to_words(num, andword='')
-
-#########################################################################################
-# PAPER: https://arxiv.org/abs/2106.06103
-# CODE: https://github.com/jaywalnut310/vits/tree/main
-#########################################################################################
-# INSTALLATION: this is based on default config, dependencies are for preprocessing.
-# vctk, ljs                      | pip3 install unidecode phonemizer          | phonemizer requires [eSpeak](https://espeak.sourceforge.net) backend to be installed on your system
-# mmts-tts                       | pip3 install unidecode                     |
-# uma_trilingual, cjks, voistock | pip3 install unidecode inflect eng_to_ipa  |
-#########################################################################################
-# Some good speakers to try out, there may be much better ones, I only tried out a few:
-# male vctk 1  | --model_to_use vctk --speaker_id 2
-# male vctk 2  | --model_to_use vctk --speaker_id 6
-# anime lady 1 | --model_to_use uma_trilingual --speaker_id 36
-# anime lady 2 | --model_to_use uma_trilingual --speaker_id 121
-#########################################################################################
-VITS_PATH = Path(__file__).parents[1] / "weights/VITS/"
-MODELS = { # config_url, weights_url
-  "ljs": ("https://raw.githubusercontent.com/jaywalnut310/vits/main/configs/ljs_base.json", "https://drive.google.com/uc?export=download&id=1q86w74Ygw2hNzYP9cWkeClGT5X25PvBT&confirm=t"),
-  "vctk": ("https://huggingface.co/csukuangfj/vits-vctk/resolve/main/vctk_base.json", "https://huggingface.co/csukuangfj/vits-vctk/resolve/main/pretrained_vctk.pth"),
-  "mmts-tts": ("https://huggingface.co/facebook/mms-tts/raw/main/full_models/eng/config.json", "https://huggingface.co/facebook/mms-tts/resolve/main/full_models/eng/G_100000.pth"),
-  "uma_trilingual": ("https://huggingface.co/spaces/Plachta/VITS-Umamusume-voice-synthesizer/raw/main/configs/uma_trilingual.json", "https://huggingface.co/spaces/Plachta/VITS-Umamusume-voice-synthesizer/resolve/main/pretrained_models/G_trilingual.pth"),
-  "cjks": ("https://huggingface.co/spaces/skytnt/moe-tts/resolve/main/saved_model/14/config.json", "https://huggingface.co/spaces/skytnt/moe-tts/resolve/main/saved_model/14/model.pth"),
-  "voistock": ("https://huggingface.co/spaces/skytnt/moe-tts/resolve/main/saved_model/15/config.json", "https://huggingface.co/spaces/skytnt/moe-tts/resolve/main/saved_model/15/model.pth"),
-}
-Y_LENGTH_ESTIMATE_SCALARS = {"ljs": 2.8, "vctk": 1.74, "mmts-tts": 1.9, "uma_trilingual": 2.3, "cjks": 3.3, "voistock": 3.1}
-if __name__ == '__main__':
-  # Command-line entry point: parse the options, load config and weights, synthesize the text and write a WAV file.
-  def _seed_arg(value):
-    # the --seed help text allows "None" to disable seeding, which a plain type=int rejects
-    return None if value.strip().lower() == "none" else int(value)
-  def _flag_arg(value):
-    # explicit "false"-like spellings disable the flag; any other value keeps enabling it, as before
-    return value.strip().lower() not in ("", "0", "false", "no", "n", "off", "none")
-  logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
-  parser = argparse.ArgumentParser()
-  parser.add_argument("--model_to_use", default="vctk", help="Specify the model to use. Default is 'vctk'.")
-  parser.add_argument("--speaker_id", type=int, default=6, help="Specify the speaker ID. Default is 6.")
-  parser.add_argument("--out_path", default=None, help="Specify the full output path. Overrides the --out_dir and --base_name parameter.")
-  parser.add_argument("--out_dir", default=str(Path(__file__).parents[1] / "temp"), help="Specify the output path.")
-  parser.add_argument("--base_name", default="test", help="Specify the base of the output file name. Default is 'test'.")
-  parser.add_argument("--text_to_synthesize", default="""Hello person. If the code you are contributing isn't some of the highest quality code you've written in your life, either put in the effort to make it great, or don't bother.""", help="Specify the text to synthesize. Default is a greeting message.")
-  parser.add_argument("--noise_scale", type=float, default=0.667, help="Specify the noise scale. Default is 0.667.")
-  parser.add_argument("--noise_scale_w", type=float, default=0.8, help="Specify the noise scale w. Default is 0.8.")
-  parser.add_argument("--length_scale", type=float, default=1, help="Specify the length scale. Default is 1.")
-  parser.add_argument("--seed", type=_seed_arg, default=1337, help="Specify the seed (set to None if no seed). Default is 1337.")
-  parser.add_argument("--num_channels", type=int, default=1, help="Specify the number of audio output channels. Default is 1.")
-  parser.add_argument("--sample_width", type=int, default=2, help="Specify the number of bytes per sample, adjust if necessary. Default is 2.")
-  parser.add_argument("--emotion_path", type=str, default=None, help="Specify the path to emotion reference.")
-  parser.add_argument("--estimate_max_y_length", type=_flag_arg, default=False, help="If true, overestimate the output length and then trim it to the correct length, to prevent premature realization, much more performant for larger inputs, for smaller inputs not so much. Default is False.")
-  args = parser.parse_args()
-
-  model_config = MODELS[args.model_to_use]
-
-  # Load the hyperparameters from the config file.
-  hps = get_hparams_from_file(fetch(model_config[0]))
-
-  # If model has multiple speakers, validate speaker id and retrieve name if available.
-  model_has_multiple_speakers = hps.data.n_speakers > 0
-  if model_has_multiple_speakers:
-    logging.info(f"Model has {hps.data.n_speakers} speakers")
-    if args.speaker_id >= hps.data.n_speakers: raise ValueError(f"Speaker ID {args.speaker_id} is invalid for this model.")
-    speaker_name = "?"
-    if hps.__contains__("speakers"): # maps speaker ids to names
-      speakers = hps.speakers
-      if isinstance(speakers, List): speakers = {speaker: i for i, speaker in enumerate(speakers)}
-      speaker_name = next((key for key, value in speakers.items() if value == args.speaker_id), None)
-    logging.info(f"You selected speaker {args.speaker_id} (name: {speaker_name})")
-
-  # Load emotions if any. TODO: find an english model with emotions, this is untested atm.
-  emotion_embedding = None
-  if args.emotion_path is not None:
-    if args.emotion_path.endswith(".npy"): emotion_embedding = Tensor(np.load(args.emotion_path), dtype=dtypes.int64).unsqueeze(0)
-    else: raise ValueError("Emotion path must be a .npy file.")
-
-  # Load symbols, instantiate TextMapper and clean the text.
-  if hps.__contains__("symbols"): symbols = hps.symbols
-  elif args.model_to_use == "mmts-tts": symbols = [x.replace("\n", "") for x in fetch("https://huggingface.co/facebook/mms-tts/raw/main/full_models/eng/vocab.txt").open(encoding="utf-8").readlines()]
-  else: symbols = ['_'] + list(';:,.!?¡¿—…"«»“” ') + list('ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz') + list("ɑɐɒæɓʙβɔɕçɗɖðʤəɘɚɛɜɝɞɟʄɡɠɢʛɦɧħɥʜɨɪʝɭɬɫɮʟɱɯɰŋɳɲɴøɵɸθœɶʘɹɺɾɻʀʁɽʂʃʈʧʉʊʋⱱʌɣɤʍχʎʏʑʐʒʔʡʕʢǀǁǂǃˈˌːˑʼʴʰʱʲʷˠˤ˞↓↑→↗↘'̩'ᵻ")
-  text_mapper = TextMapper(apply_cleaners=True, symbols=symbols)
-
-  # Load the model.
-  if args.seed is not None:
-    Tensor.manual_seed(args.seed)
-    np.random.seed(args.seed)
-  net_g = load_model(text_mapper.symbols, hps, model_config)
-  logging.debug(f"Loaded model with hps: {hps}")
-
-  # Convert the input text to a tensor.
-  text_to_synthesize = args.text_to_synthesize
-  if args.model_to_use == "mmts-tts": text_to_synthesize = text_mapper.filter_oov(text_to_synthesize.lower())
-  stn_tst = text_mapper.get_text(text_to_synthesize, hps.data.add_blank, hps.data.text_cleaners)
-  logging.debug(f"Converted input text to tensor \"{text_to_synthesize}\" -> Tensor({stn_tst.shape}): {stn_tst.numpy()}")
-  x_tst, x_tst_lengths = stn_tst.unsqueeze(0), Tensor([stn_tst.shape[0]], dtype=dtypes.int64)
-  sid = Tensor([args.speaker_id], dtype=dtypes.int64) if model_has_multiple_speakers else None
-
-  # Perform inference.
-  start_time = time.time()
-  audio_tensor = net_g.infer(x_tst, x_tst_lengths, sid, args.noise_scale, args.length_scale, args.noise_scale_w, emotion_embedding=emotion_embedding,
-                             max_y_length_estimate_scale=Y_LENGTH_ESTIMATE_SCALARS[args.model_to_use] if args.estimate_max_y_length else None)[0, 0].realize()
-  logging.info(f"Inference took {(time.time() - start_time):.2f}s")
-
-  # Save the audio output.
-  # samples are clipped to [-1, 1] and scaled to 16-bit integers
-  audio_data = (np.clip(audio_tensor.numpy(), -1.0, 1.0) * 32767).astype(np.int16)
-  out_path = Path(args.out_path or Path(args.out_dir)/f"{args.model_to_use}{f'_sid_{args.speaker_id}' if model_has_multiple_speakers else ''}_{args.base_name}.wav")
-  out_path.parent.mkdir(parents=True, exist_ok=True)
-  with wave.open(str(out_path), 'wb') as wav_file:
-    wav_file.setnchannels(args.num_channels)
-    wav_file.setsampwidth(args.sample_width)
-    wav_file.setframerate(hps.data.sampling_rate)
-    wav_file.setnframes(len(audio_data))
-    wav_file.writeframes(audio_data.tobytes())
-  logging.info(f"Saved audio output to {out_path}")
--- a/extra/amdpci/am_smi.py
+++ b/extra/amdpci/am_smi.py
@ -26,11 +26,13 @@ def color_temp(temp):
 def color_voltage(voltage): return colored(f"{voltage/1000:>5.3f}V", "cyan")

 def draw_bar(percentage, width=40, fill='|', empty=' ', opt_text='', color='cyan'):
+  # Render a bracketed bar of `width` characters; the filled part is colored and a label is overlaid on the right end.
+  # NaN is the only value that differs from itself, so it is mapped to 0 before clamping to [0, 1].
+  percentage = 0.0 if percentage != percentage else percentage  # NaN guard
+  percentage = max(0.0, min(1.0, float(percentage)))
   filled_width = int(width * percentage)
+  # without an explicit label, the percentage itself is used as the label
   if not opt_text: opt_text = f'{percentage*100:.1f}%'

   bar = fill * filled_width + empty * (width - filled_width)
-  bar = (bar[:-len(opt_text)] + opt_text) if opt_text else bar
+  # the label replaces the right end of the bar and is left out when it is longer than the bar
+  if opt_text and len(opt_text) <= len(bar): bar = (bar[:-len(opt_text)] + opt_text)
   bar = colored(bar[:filled_width], color) + bar[filled_width:]
   return f'[{bar}]'

@ -88,6 +90,7 @@ class SMICtx:
    self.opened_pci_resources = {}
    self.prev_lines_cnt = 0
    self.prev_terminal_width = 0
+    self.prev_terminal_height = 0

    remove_parts = ["Advanced Micro Devices, Inc. [AMD/ATI]", "VGA compatible controller:"]
    lspci = subprocess.check_output(["lspci"]).decode("utf-8").splitlines()
@ -95,6 +98,20 @@ class SMICtx:
    for k,v in self.lspci.items():
      for part in remove_parts: self.lspci[k] = self.lspci[k].replace(part, "").strip().rstrip()

+  def _smuq10_round(self, v:int) -> int:
+    v = int(v)
+    return (v + 512) >> 10  # SMUQ10_ROUND
+
+  def _fmt_kb(self, kb:int) -> str:
+    kb = int(kb)
+    if kb < 1024: return f"{kb}KB"
+    mb = kb / 1024.0
+    if mb < 1024: return f"{mb:.1f}MB"
+    gb = mb / 1024.0
+    if gb < 1024: return f"{gb:.2f}GB"
+    tb = gb / 1024.0
+    return f"{tb:.2f}TB"
+
  def _open_am_device(self, pcibus):
    if pcibus not in self.opened_pci_resources:
      bar_fds = {bar: os.open(f"/sys/bus/pci/devices/{pcibus}/resource{bar}", os.O_RDWR | os.O_SYNC) for bar in [0, 2, 5]}
@ -116,6 +133,7 @@ class SMICtx:
  def rescan_devs(self):
    pattern = os.path.join('/tmp', 'am_*.lock')
    for d in [f[8:-5] for f in glob.glob(pattern)]:
+      if d.startswith("usb"): continue
      if d not in self.opened_pcidevs:
        self._open_am_device(d)

@ -131,21 +149,53 @@ class SMICtx:
        os.system('clear')
        if DEBUG >= 2: print(f"Removed AM device {d.pcibus}")

-  def collect(self): return {d: d.smu.read_metrics() if d.pci_state == "D0" else None for d in self.devs}
+  def collect(self):
+    tables = {}
+    for dev in self.devs:
+      match dev.ip_ver[am.MP1_HWIP]:
+        case (13,0,6): table_t = dev.smu.smu_mod.MetricsTableX_t
+        case (13,0,12): table_t = dev.smu.smu_mod.MetricsTableV2_t
+        case _: table_t = dev.smu.smu_mod.SmuMetricsExternal_t
+      tables[dev] = dev.smu.read_table(table_t, dev.smu.smu_mod.SMU_TABLE_SMU_METRICS) if dev.pci_state == "D0" else None
+    return tables

-  def get_gfx_activity(self, dev, metrics): return metrics.SmuMetrics.AverageGfxActivity
-  def get_mem_activity(self, dev, metrics): return metrics.SmuMetrics.AverageUclkActivity
+  def _pick_nonzero_avg(self, vals) -> int:
+    xs = [x for x in vals if x > 0]
+    return int(sum(xs) / len(xs)) if xs else 0
+
+  def get_gfx_activity(self, dev, metrics):
+    match dev.ip_ver[am.MP1_HWIP]:
+      case (13,0,6): return max(0, min(100, self._smuq10_round(metrics.SocketGfxBusy)))
+      case _: return metrics.SmuMetrics.AverageGfxActivity
+
+  def get_mem_activity(self, dev, metrics):
+    match dev.ip_ver[am.MP1_HWIP]:
+      case (13,0,6): return max(0, min(100, self._smuq10_round(metrics.DramBandwidthUtilization)))
+      case _: return metrics.SmuMetrics.AverageUclkActivity

  def get_temps(self, dev, metrics, compact=False):
-    temps_keys = [(k, name) for k, name in dev.smu.smu_mod.c__EA_TEMP_e__enumvalues.items()
-                  if k < dev.smu.smu_mod.TEMP_COUNT and metrics.SmuMetrics.AvgTemperature[k] != 0]
-    if compact: temps_keys = [(k, name) for k, name in temps_keys if k in (dev.smu.smu_mod.TEMP_HOTSPOT, dev.smu.smu_mod.TEMP_MEM)]
-    return {name: metrics.SmuMetrics.AvgTemperature[k] for k, name in temps_keys}
+    match dev.ip_ver[am.MP1_HWIP]:
+      case (13,0,6):
+        temps = {
+          "Hotspot": self._smuq10_round(metrics.MaxSocketTemperature),
+          "HBM": self._smuq10_round(metrics.MaxHbmTemperature),
+          "VR": self._smuq10_round(metrics.MaxVrTemperature),
+        }
+        if compact: return {k: temps[k] for k in ("Hotspot", "HBM") if temps.get(k, 0) != 0}
+        return {k: v for k, v in temps.items() if v != 0}
+      case _:
+        temps_keys = [(k, name) for k, name in dev.smu.smu_mod.c__EA_TEMP_e__enumvalues.items()
+                      if k < dev.smu.smu_mod.TEMP_COUNT and metrics.SmuMetrics.AvgTemperature[k] != 0]
+        if compact: temps_keys = [(k, name) for k, name in temps_keys if k in (dev.smu.smu_mod.TEMP_HOTSPOT, dev.smu.smu_mod.TEMP_MEM)]
+        return {name: metrics.SmuMetrics.AvgTemperature[k] for k, name in temps_keys}

  def get_voltage(self, dev, metrics, compact=False):
-    voltage_keys = [(k, name) for k, name in dev.smu.smu_mod.c__EA_SVI_PLANE_e__enumvalues.items()
+    match dev.ip_ver[am.MP1_HWIP]:
+      case (13,0,6): return {}
+      case _:
+        voltage_keys = [(k, name) for k, name in dev.smu.smu_mod.c__EA_SVI_PLANE_e__enumvalues.items()
                        if k < dev.smu.smu_mod.SVI_PLANE_COUNT and metrics.SmuMetrics.AvgVoltage[k] != 0]
-    return {name: metrics.SmuMetrics.AvgVoltage[k] for k, name in voltage_keys}
+        return {name: metrics.SmuMetrics.AvgVoltage[k] for k, name in voltage_keys}

  def get_busy_threshold(self, dev):
    match dev.ip_ver[am.MP1_HWIP]:
@ -153,22 +203,40 @@ class SMICtx:
      case _: return 15

  def get_gfx_freq(self, dev, metrics):
-    return metrics.SmuMetrics.AverageGfxclkFrequencyPostDs if self.get_gfx_activity(dev, metrics) <= self.get_busy_threshold(dev) else \
-          metrics.SmuMetrics.AverageGfxclkFrequencyPreDs
+    if metrics is None: return 0
+    match dev.ip_ver[am.MP1_HWIP]:
+      case (13,0,6): return self._smuq10_round(metrics.GfxclkFrequency[0])
+      case _:
+        return metrics.SmuMetrics.AverageGfxclkFrequencyPostDs if self.get_gfx_activity(dev, metrics) <= self.get_busy_threshold(dev) else \
+               metrics.SmuMetrics.AverageGfxclkFrequencyPreDs

  def get_mem_freq(self, dev, metrics):
-    return metrics.SmuMetrics.AverageMemclkFrequencyPostDs if self.get_mem_activity(dev, metrics) <= self.get_busy_threshold(dev) else \
-           metrics.SmuMetrics.AverageMemclkFrequencyPreDs
+    match dev.ip_ver[am.MP1_HWIP]:
+      case (13,0,6): return self._smuq10_round(metrics.UclkFrequency)
+      case _:
+        return metrics.SmuMetrics.AverageMemclkFrequencyPostDs if self.get_mem_activity(dev, metrics) <= self.get_busy_threshold(dev) else \
+               metrics.SmuMetrics.AverageMemclkFrequencyPreDs

  def get_fckl_freq(self, dev, metrics):
-    return metrics.SmuMetrics.AverageFclkFrequencyPostDs if self.get_mem_activity(dev, metrics) <= self.get_busy_threshold(dev) else \
-           metrics.SmuMetrics.AverageFclkFrequencyPreDs
+    match dev.ip_ver[am.MP1_HWIP]:
+      case (13,0,6): return self._smuq10_round(metrics.FclkFrequency)
+      case _:
+        return metrics.SmuMetrics.AverageFclkFrequencyPostDs if self.get_mem_activity(dev, metrics) <= self.get_busy_threshold(dev) else \
+               metrics.SmuMetrics.AverageFclkFrequencyPreDs

-  def get_fan_rpm_pwm(self, dev, metrics): return metrics.SmuMetrics.AvgFanRpm, metrics.SmuMetrics.AvgFanPwm
+  def get_fan_rpm_pwm(self, dev, metrics):
+    match dev.ip_ver[am.MP1_HWIP]:
+      case (13,0,6): return None, None
+      case _: return metrics.SmuMetrics.AvgFanRpm, metrics.SmuMetrics.AvgFanPwm

-  def get_power(self, dev, metrics): return metrics.SmuMetrics.AverageSocketPower, metrics.SmuMetrics.dGPU_W_MAX
+  def get_power(self, dev, metrics):
+    match dev.ip_ver[am.MP1_HWIP]:
+      case (13,0,6): return self._smuq10_round(metrics.SocketPower), self._smuq10_round(metrics.MaxSocketPowerLimit)
+      case _: return metrics.SmuMetrics.AverageSocketPower, metrics.SmuMetrics.dGPU_W_MAX

  def get_mem_usage(self, dev):
+    return 0
+
    usage = 0
    pt_stack = [dev.mm.root_page_table]
    while len(pt_stack) > 0:
@ -177,7 +245,7 @@ class SMICtx:
        entry = pt.entries[i]

        if (entry & am.AMDGPU_PTE_VALID) == 0: continue
-        if pt.lv!=am.AMDGPU_VM_PTB and not dev.gmc.is_pte_huge_page(entry):
+        if pt.lv!=am.AMDGPU_VM_PTB and not dev.gmc.is_pte_huge_page(pt.lv, entry):
          pt_stack.append(AMPageTableEntry(dev, entry & 0x0000FFFFFFFFF000, lv=pt.lv+1))
          continue
        if (entry & am.AMDGPU_PTE_SYSTEM) != 0: continue
@ -219,23 +287,28 @@ class SMICtx:
      temps_table_compact = ["Temps (°C):" + '/'.join([f"{color_temp(val)} {name}" for name, val in temps_data_compact.items()])]

      fan_rpm, fan_pwm = self.get_fan_rpm_pwm(dev, metrics)
-      power_table = ["=== Power ==="] + [f"Fan Speed: {fan_rpm} RPM"] + [f"Fan Power: {fan_pwm}%"]
+      power_table = ["=== Power ==="]
+      power_table += ["Fan: N/A"] if fan_rpm is None or fan_pwm is None else [f"Fan Speed: {fan_rpm} RPM", f"Fan Power: {fan_pwm}%"]

      total_power, max_power = self.get_power(dev, metrics)
-      power_line = [f"Power: " + draw_bar(total_power / max_power, 16, opt_text=f"{total_power}/{max_power}W")]
-      power_line_compact = [f"Power:       " + draw_bar(total_power / max_power, activity_line_width, opt_text=f"{total_power}/{max_power}W")]
+      if max_power > 0:
+        power_line = [f"Power: " + draw_bar(total_power / max_power, 16, opt_text=f"{total_power}/{max_power}W")]
+        power_line_compact = [f"Power:       " + draw_bar(total_power / max_power, activity_line_width, opt_text=f"{total_power}/{max_power}W")]
+      else:
+        power_line = ["Power: N/A"]
+        power_line_compact = ["Power: N/A"]

      voltage_data = self.get_voltage(dev, metrics)
-      voltage_table = ["=== Voltages ==="] + [f"{name:<20}: {color_voltage(voltage)}" for name, voltage in voltage_data.items()]
+      voltage_table = None if not voltage_data else (["=== Voltages ==="] + [f"{name:<20}: {color_voltage(voltage)}" for name, voltage in voltage_data.items()])

      gfx_freq = self.get_gfx_freq(dev, metrics)
      mclk_freq = self.get_mem_freq(dev, metrics)
      fclk_freq = self.get_fckl_freq(dev, metrics)
-
      frequency_table = ["=== Frequencies ===", f"GFXCLK: {gfx_freq:>4} MHz", f"FCLK  : {fclk_freq:>4} MHz", f"MCLK  : {mclk_freq:>4} MHz"]

      if self.prev_terminal_width >= 231:
-        power_table += power_line + [""] + voltage_table
+        power_table += power_line
+        if voltage_table is not None: power_table += [""] + voltage_table
        activity_line += [""]
      elif self.prev_terminal_width >= 171:
        power_table += power_line + [""] + frequency_table
@ -307,4 +380,5 @@ if __name__ == "__main__":
      smi_ctx.draw(args.list)
      if args.list: break
      time.sleep(1)
-  except KeyboardInterrupt: print("Exiting...")
+  except KeyboardInterrupt:
+    print("Exiting...")
--- a/extra/amdpci/hive_reset.py
+++ b/extra/amdpci/hive_reset.py
@ -0,0 +1,14 @@
+#!/usr/bin/env python3
+
+from tinygrad.helpers import Context
+from tinygrad.runtime.support.system import System, PCIDevice, PCIDevImplBase
+from tinygrad.runtime.support.am.amdev import AMDev
+
+if __name__ == "__main__":
+  gpus = System.pci_scan_bus(0x1002, [(0xffff, [0x74a1, 0x75a0])])
+  pcidevs = [PCIDevice(f"reset:{gpu}", gpu, bars=[0, 2, 5]) for gpu in gpus]
+  amdevs = []
+  with Context(DEBUG=2):
+    for pcidev in pcidevs:
+      amdevs.append(AMDev(pcidev, reset_mode=True))
+    for amdev in amdevs: amdev.smu.mode1_reset()
--- a/extra/amdpci/proclogs.py
+++ b/extra/amdpci/proclogs.py
@ -1,48 +1,65 @@
 import re, ctypes, sys, importlib
+from tinygrad.helpers import getenv

 from tinygrad.runtime.support.am.amdev import AMDev, AMRegister
+
+class GFXFake:
+  def __init__(self): self.xccs = 8
+
 class AMDFake(AMDev):
-  def __init__(self, devfmt, vram, doorbell, mmio, dma_regions=None):
-    self.devfmt, self.vram, self.doorbell64, self.mmio, self.dma_regions = devfmt, vram, doorbell, mmio, dma_regions
+  def __init__(self, pci_dev, dma_regions=None):
+    self.pci_dev, self.devfmt, self.dma_regions = pci_dev, pci_dev.pcibus, dma_regions
+    self.vram, self.doorbell64, self.mmio = self.pci_dev.map_bar(0), self.pci_dev.map_bar(2, fmt='Q'), self.pci_dev.map_bar(5, fmt='I')
    self._run_discovery()
    self._build_regs()

+    self.gfx = GFXFake()
+
 amdev = importlib.import_module("tinygrad.runtime.support.am.amdev")
 amdev.AMDev = AMDFake
-
 from tinygrad.runtime.ops_amd import PCIIface

-def parse_amdgpu_logs(log_content, register_names=None):
-  register_map = register_names
+def parse_amdgpu_logs(log_content, register_names=None, *, only_xcc0: bool = False):
+  register_map = register_names or {}

-  final = ""
  def replace_register(match):
-    register = match.group(1)
-    return f"Reading register {register_map.get(int(register, base=16), register)}"
+    reg = match.group(1)
+    return f"Reading register {register_map.get(int(reg, 16), reg)}"

-  pattern = r'Reading register (0x[0-9a-fA-F]+)'
-
-  processed_log = re.sub(pattern, replace_register, log_content)
+  processed_log = re.sub(r'Reading register (0x[0-9a-fA-F]+)', replace_register, log_content)

  def replace_register_2(match):
-    register = match.group(1)
-    return f"Writing register {register_map.get(int(register, base=16), register)}"
+    reg = match.group(1)
+    return f"Writing register {register_map.get(int(reg, 16), reg)}"
+
+  processed_log = re.sub(r'Writing register (0x[0-9a-fA-F]+)', replace_register_2, processed_log)
+
+  # remove timing prefix
+  processed_log = re.sub(r'^\[\s*\d+(?:\.\d+)?\]\s*', '', processed_log, flags=re.MULTILINE)
+
+  # keep only xcc=0 lines (but keep lines with no xcc at all)
+  if only_xcc0:
+    kept = []
+    for line in processed_log.splitlines(True):
+      if "xcc=" not in line or re.search(r'\bxcc=0\b', line): kept.append(line)
+    processed_log = "".join(kept)

-  pattern = r'Writing register (0x[0-9a-fA-F]+)'
-  processed_log = re.sub(pattern, replace_register_2, processed_log)
  return processed_log

 def main():
+  only_xcc0 = bool(getenv("ONLY_XCC0", 0))
+
  reg_names = {}
  dev = PCIIface(None, 0)
  for x, y in dev.dev_impl.__dict__.items():
    if isinstance(y, AMRegister):
-      for inst, addr in y.addr.items(): reg_names[addr] = f"{x}, xcc={inst}"
+      for xcc, addr in y.addr.items():
+        reg_names[addr] = f"{x}, xcc={xcc}"

  with open(sys.argv[1], 'r') as f:
-    log_content = log_content_them = f.read()
+    log_content = f.read()

-  processed_log = parse_amdgpu_logs(log_content, reg_names)
+  processed_log = parse_amdgpu_logs(log_content, reg_names, only_xcc0=only_xcc0)

  with open(sys.argv[2], 'w') as f:
    f.write(processed_log)
@ -51,5 +68,4 @@ if __name__ == '__main__':
  if len(sys.argv) != 3:
    print("Usage: <input_file_path> <output_file_path>")
    sys.exit(1)
-
-  main()
+  main()
--- a/extra/assembly/assembly.py
+++ b/extra/assembly/assembly.py
@ -1,189 +0,0 @@
-from typing import Tuple, List, NamedTuple, Any, Dict, Optional, Union, DefaultDict, cast
-from tinygrad.codegen.opt.kernel import Ops, MemOp, UOp
-from tinygrad.uop.ops import BinaryOps, UnaryOps
-from tinygrad.dtype import DType, dtypes
-from tinygrad.helpers import DEBUG
-from tinygrad.uop.ops import Variable, NumNode, MulNode, DivNode, ModNode, LtNode, SumNode, AndNode
-import functools
-import math
-from collections import defaultdict
-
-_type_to_letter = {dtypes.float32: 'f', dtypes.bool: 'p', dtypes.int32: 'i', dtypes.int64: 'a', dtypes.uint32: 'u', dtypes.uint64: 'b', dtypes.float.vec(4): 'x', dtypes.uint8: 'uc', dtypes.float16: 'h',
-                   dtypes.int8: 'c', dtypes.uint16: 'us', dtypes.float64: 'd'}
-
-class Register(NamedTuple):
-  nm:str
-  dtype:DType
-  scalar:bool
-  off:Optional[int] = None
-  def __repr__(self): return self.nm if self.off is None else f"{self.nm}:{self.off}"
-  def subregs(self):
-    if self.dtype == dtypes.float.vec(4):
-      return [Register(self.nm, dtypes.float, False, off=off) for off in range(4)]
-    return []
-
-class AssemblyInstruction(NamedTuple):
-  op: Ops
-  out: Optional[Register]
-  vin: List[Union[Register, int, float]]
-  arg: Any = None
-
-# warp size of 32, s registers are shared across the warp, v are 32-wide vectors
-class AssemblyLanguage:
-  supports_load3: bool = False
-  sin_is_sin2pi: bool = False
-  no_div: bool = False
-  #TODO: these should be global vars
-  cnts:DefaultDict[Tuple[DType, bool], int] = defaultdict(int)
-  tor: Dict[Any, Register] = {}
-  ins: List[AssemblyInstruction] = []
-
-  def type_to_letter(self,x): return _type_to_letter[x[0]].upper() if x[1] else _type_to_letter[x[0]]
-  def newreg(self, tok, dtype=dtypes.float32, scalar=False) -> Register:
-    self.tor[tok] = ret = Register(f"%{self.type_to_letter((dtype, scalar))}{self.cnts[(dtype, scalar)]}", dtype, scalar)
-    if dtype == dtypes.float.vec(4):
-      for off in range(4):
-        self.tor[tok] = Register(ret.nm, dtypes.float, ret.scalar, off)
-    self.cnts[(dtype, scalar)] += 1
-    return ret
-
-  def render_numnode(self, b) -> Register:
-    key = ("num", b)
-    if key not in self.tor: self.ins.append(AssemblyInstruction(Ops.LOAD, self.newreg(key, scalar=True, dtype=dtypes.int32), [], b))
-    return self.tor[key]
-
-  def render_alu(self, op, a:Register, b:Union[Register, int, float], dtype=dtypes.int32) -> Register:
-    key = (op, a, b)
-    if key not in self.tor:
-      #if not isinstance(b, Register): b = render_numnode(b)
-      self.ins.append(AssemblyInstruction(Ops.ALU, self.newreg(key, dtype=dtype, scalar=a.scalar and (not isinstance(b, Register) or b.scalar)), [a, b], op))
-    return self.tor[key]
-
-  def render_cast(self, a:Register, new_dtype:DType) -> Register:
-    if a.dtype == new_dtype: return a
-    key = (a, new_dtype)
-    if key not in self.tor:
-      self.ins.append(AssemblyInstruction(Ops.CAST, self.newreg(key, dtype=new_dtype), [a]))
-    return self.tor[key]
-
-  render_ops: Any = { Variable: lambda self, ops, ctx: ctx.tor[self], NumNode: lambda self, ops, ctx: ctx.render_numnode(self.b),
-                 MulNode: lambda self, ops, ctx: ctx.render_alu(BinaryOps.MUL, self.a.render(ops, ctx), self.b),
-                 DivNode: lambda self, ops, ctx: ctx.render_alu(BinaryOps.DIV, self.a.render(ops, ctx), self.b),
-                 ModNode: lambda self, ops, ctx: ctx.render_alu(BinaryOps.MOD, self.a.render(ops, ctx), self.b),
-                 LtNode: lambda self, ops, ctx: ctx.render_alu(BinaryOps.CMPLT, self.a.render(ops, ctx), self.b, dtype=dtypes.bool),
-    SumNode: lambda self,ops,ctx: functools.reduce(lambda a,b: ctx.render_alu(BinaryOps.ADD, a, b.render(ops,ctx)), self.nodes[1:], self.nodes[0].render(ops,ctx)),
-    AndNode: lambda self,ops,ctx: functools.reduce(lambda a,b: ctx.render_alu(BinaryOps.MUL, a, b.render(ops,ctx), dtype=dtypes.bool), self.nodes[1:], self.nodes[0].render(ops,ctx)) }
-
-  def addr_w_offset(self, args):
-    assert isinstance(args, MemOp)
-    idx = args.idx*args.memory_dtype.itemsize
-    off = 0  # TODO: should this be None?
-    if isinstance(idx, SumNode):
-      nums = [n.b for n in idx.nodes if isinstance(n, NumNode)]
-      if nums and nums[0] < 4096 and (idx-nums[0]).min >= 0:  # TODO: different for each GPU?
-        idx -= nums[0]
-        off = cast(int, nums[0])
-    reg = idx.render(self.render_ops, self)
-    if self.supports_load3:
-      if reg.scalar:
-        new_reg = self.newreg((reg.nm, 'vec'), dtype=reg.dtype)
-        self.ins.append(AssemblyInstruction(Ops.ALU, new_reg, [reg], UnaryOps.NOOP))
-        reg = new_reg
-      return self.tor[args.name], reg, off
-    reg = self.render_alu(BinaryOps.ADD, self.render_cast(reg, dtypes.uint64), self.tor[args.name], dtype=dtypes.uint64)
-    return reg, None, off
-
-def uops_to_asmstyle(lang, function_name:str, uops:List[UOp]):
-  #TODO: Do not use clear()
-  lang.ins.clear()
-  lang.tor.clear()
-  lang.cnts.clear()
-  buf_to_dtype = {args:dtype for uop,dtype,_,args,_ in uops if uop == Ops.DEFINE_GLOBAL}
-  global_size, local_size = [], []
-  skipload_branch = 0
-  lang.ins += [AssemblyInstruction(Ops.SPECIAL, lang.newreg(buf, dtype=dtypes.uint64, scalar=True), [], buf) for buf in buf_to_dtype]
-  for u in uops:
-    uop,dtype,vin,args,_ = u
-    if uop == Ops.DEFINE_LOCAL:
-      lang.ins.append(AssemblyInstruction(Ops.DEFINE_LOCAL, None, [], args))
-      lang.ins.append(AssemblyInstruction(Ops.ALU, lang.newreg(args[0], dtype=dtypes.uint64), [args[0]], UnaryOps.NOOP))
-    elif uop == Ops.LOOP:
-      if args[1] == "global":
-        for i,var in enumerate(args[0]):
-          global_size.append(var.max+1)
-          lang.ins.append(AssemblyInstruction(Ops.SPECIAL, lang.newreg(var, dtype=dtypes.int32), [], f"gid{len(args[0])-1-i}"))
-      elif args[1] == "local":
-        for i,var in enumerate(args[0]):
-          local_size.append(var.max+1)
-          lang.ins.append(AssemblyInstruction(Ops.SPECIAL, lang.newreg(var, dtype=dtypes.int32), [], f"lid{len(args[0])-1-i}"))
-      else:
-        for var in args[0]:
-          if not isinstance(var, NumNode):  # TODO: why is this coming through?
-            lang.ins.append(AssemblyInstruction(Ops.LOAD, lang.newreg(var, dtype=dtypes.int32, scalar=True), [], 0))
-            lang.ins.append(AssemblyInstruction(Ops.LABEL, None, [], "$loop_"+var.expr))
-    elif uop == Ops.ENDLOOP:
-      if args[1] not in ["global", "local", "global+local"]:
-        for var in reversed(args[0]):
-          if not isinstance(var, NumNode):  # TODO: why is this coming through?
-            lang.ins.append(AssemblyInstruction(Ops.ALU, lang.tor[var], [lang.tor[var], 1], BinaryOps.ADD))
-            pred = lang.render_alu(BinaryOps.CMPLT, lang.tor[var], var.max+1, dtypes.bool)
-            lang.ins.append(AssemblyInstruction(Ops.COND_BRANCH, None, [pred], ("$loop_"+var.expr, True)))
-      elif args[1] == "global+local":
-        for i, var in enumerate(reversed(args[0])):
-          lang.ins.append(AssemblyInstruction(Ops.ENDLOOP, None, [lang.tor[var]], (var.max+1, f"gid{i}")))
-      elif args[1] == 'local':
-        for i, var in enumerate(reversed(args[0])):
-          lang.ins.append(AssemblyInstruction(Ops.ENDLOOP, None, [lang.tor[var]], (var.max+1, f"lid{i}")))
-    elif uop == Ops.CAST:
-      # TODO: we should reconsider outputting CAST in the linearizer. these are needless copies
-      out = lang.newreg(u, dtype)
-      for i,sr in enumerate(out.subregs()):
-        lang.ins.append(AssemblyInstruction(Ops.ALU, sr, [lang.tor[vin[i]]], UnaryOps.NOOP))
-    elif uop == Ops.ALU:
-      out = lang.newreg(u, dtype) if u not in lang.tor else lang.tor[u]
-      # this is the only thing that can violate SSA
-      if args in [BinaryOps.CMPLT]:
-        pred_reg = lang.newreg((u, 'pred'), dtype=dtypes.bool)
-        lang.ins.append(AssemblyInstruction(Ops.ALU, pred_reg, [lang.tor[x] for x in vin], args))
-        lang.ins.append(AssemblyInstruction(Ops.CAST, out, [pred_reg], args))
-      elif args == BinaryOps.DIV and lang.no_div:
-        tmp = lang.newreg((u, "rcp"))
-        lang.ins.append(AssemblyInstruction(Ops.ALU, tmp, [lang.tor[vin[1]]], UnaryOps.RECIP))
-        lang.ins.append(AssemblyInstruction(Ops.ALU, out, [lang.tor[vin[0]], tmp], BinaryOps.MUL))
-      elif args == UnaryOps.SIN and lang.sin_is_sin2pi:
-        tmp = lang.newreg((u, "2pi"))
-        lang.ins.append(AssemblyInstruction(Ops.ALU, tmp, [lang.tor[vin[0]], 1/(math.pi*2)], BinaryOps.MUL))
-        lang.ins.append(AssemblyInstruction(Ops.ALU, out, [tmp], args))
-      else:
-        lang.ins.append(AssemblyInstruction(Ops.ALU, out, [lang.tor[x] for x in vin], args))
-    elif uop == Ops.DEFINE_REG:
-      reg = lang.newreg(u, dtype=dtype)
-      lang.ins.append(AssemblyInstruction(Ops.LOAD, reg, [], args))
-    elif uop == Ops.SPECIAL:
-      lang.tor[u] = lang.tor[args]
-    elif uop == Ops.CONST:
-      lang.ins.append(AssemblyInstruction(Ops.LOAD, lang.newreg(u, dtype=dtype), [], args))
-    elif uop == Ops.LOAD:
-      idx, treg, off = lang.addr_w_offset(args)
-      reg = lang.newreg(u, dtype=dtype, scalar=(idx.scalar and (not isinstance(treg, Register) or treg.scalar)))
-      if args.valid.min == 0:
-        lang.ins.append(AssemblyInstruction(Ops.LOAD, reg, [], 0))
-        if args.valid.max == 1:
-          pred = args.valid.render(lang.render_ops, lang)
-          lang.ins.append(AssemblyInstruction(Ops.COND_BRANCH, None, [pred], (f"$skipload_{skipload_branch}", False)))
-      if args.valid.max == 1:
-          # NOTE: you can't compute the index in here, because it assumes it's all available later
-        lang.ins.append(AssemblyInstruction(Ops.LOAD, reg, [idx] + ([treg] if treg is not None else []), (off, 'global' if not args.local else 'shared', args.memory_dtype if args.memory_dtype != dtypes.float else None)))
-      if args.valid.min == 0 and args.valid.max == 1:
-        lang.ins.append(AssemblyInstruction(Ops.LABEL, None, [], f"$skipload_{skipload_branch}"))
-        skipload_branch += 1
-    elif uop == Ops.STORE:
-      if args is None:
-        lang.ins.append(AssemblyInstruction(Ops.ALU, lang.tor[vin[0]], [lang.tor[vin[1]]], UnaryOps.NOOP))
-      else:
-        idx, treg, off = lang.addr_w_offset(args)
-        lang.ins.append(AssemblyInstruction(Ops.STORE, None, [idx, lang.tor[vin[0]]] + ([treg] if treg is not None else []), (off, 'global' if not args.local else 'shared', args.memory_dtype if args.memory_dtype != dtypes.float else None)))
-
-  if DEBUG >= 4:
-    for tins in lang.ins: print(tins)
-  return global_size, local_size
--- a/extra/assembly/assembly_arm64.py
+++ b/extra/assembly/assembly_arm64.py
@ -1,177 +0,0 @@
-import struct
-from platform import system
-from typing import Tuple, Dict, List, Optional
-from tinygrad import dtypes
-from tinygrad.uop.ops import BinaryOps, UnaryOps, TernaryOps
-from tinygrad.codegen.opt.kernel import Ops, UOp
-from tinygrad.helpers import CI
-from tinygrad.codegen.assembly import uops_to_asmstyle, AssemblyLanguage
-
-def float_to_hex(x): return "%02X%02X%02X%02X" % tuple(struct.pack("f",x)[::-1])
-def compute_offsets(total):
-  quotient, remainder = divmod(total, 4096)
-  return [4096]*quotient + [remainder] if remainder else [4096]*quotient
-
-#NOTE: Darwin needs names to start with a "_"
-def get_name(name): return ('_' if system() == 'Darwin' else '') + name
-
-class ARM64Language(AssemblyLanguage): pass
-
-def specialize_to_arm64(fn_nm, asm):
-  var_size = 16
-  prev_uop:Optional[Ops] = None
-  ins = []
-  x_regs = ['x' + str(i) for i in reversed(range(12))]
-  s_regs = ['s' + str(i) for i in reversed(range(3,32)) if i <= 7 or i >= 16]
-  type_to_reg = {dtypes.double: "d", dtypes.half: 'h', dtypes.float32: 's', dtypes.bool: 'w', dtypes.int8:'w', dtypes.int32: 'w', dtypes.int64: 'x', dtypes.uint8:'w', dtypes.uint32: 'w', dtypes.uint64: 'x'}
-  alu = {BinaryOps.ADD: "add", BinaryOps.SUB: "sub", BinaryOps.MUL: "mul", BinaryOps.DIV: "div", BinaryOps.MAX: "max",
-          BinaryOps.MOD: "", BinaryOps.CMPLT: "subs",
-          UnaryOps.NOOP: "mov", UnaryOps.NEG: "neg",
-          UnaryOps.SIN:'bl ' + get_name('sinf'), UnaryOps.LOG2: 'bl ' + get_name("log2f"), UnaryOps.EXP2: 'bl ' + get_name("exp2f"), UnaryOps.SQRT: 'bl ' + get_name("sqrtf"),
-          TernaryOps.MULACC: "madd", TernaryOps.WHERE: "fcsel"}
-
-  def mov_imm(value, reg):
-    # Manually move value into reg if value can't fit
-    if value.__class__ is not float and abs(value) > abs(65535):
-      ins.append(f"movz w15, #{value & 0xffff}")
-      ins.append(f"movk w15, #{(value >> 16) & 0xffff}, lsl #16")
-      ins.append(f"sxtw {reg}, w15")
-    elif reg[0] == 's':
-      ins.append(f"movz x15, 0x{float_to_hex(value)[4:]}")
-      ins.append(f"movk x15, 0x{float_to_hex(value)[:4]}, lsl #16")
-      ins.append("str x15, [sp, 16]")
-      ins.append(f"ldr {reg}, [sp, 16]")
-    else:
-      ins.append(f"mov {reg}, #{value}")
-
-  # Get variables intervals
-  live_range:Dict[str, List[int]] = {}
-  for i, (uop, out, vin, arg) in enumerate(asm):
-    for var in ([v for v in [out] + vin if v is not None and v.__class__ is not int]):
-      live_range[var.nm] = [i,i] if var.nm not in live_range else [live_range[var.nm][0], i]
-
-  mem_vars:Dict[str, int] = {}
-  rtor:Dict[str, str] = {}
-  def allocate_regs(mvars):
-    nonlocal var_size
-    for v in [v for v in mvars if v is not None and v.__class__ is not int and v.nm not in rtor]:
-      available_regs = s_regs if dtypes.is_float(v[1]) else x_regs
-      #NOTE: Very simple spill, everything that don't fit in regs goes to mem
-      if not available_regs:
-        # ARM needs the stack 16-byte aligned
-        var_size += 16
-        available_regs.append('s0' if dtypes.is_float(out[1]) else 'x12')
-        mem_vars[v.nm] = var_size
-      rtor[v.nm] = available_regs.pop()
-
-  temp_floats = ['s0', 's1', 's2']
-  temp_ints = ['x12', 'x13', 'x16']
-  for i, (uop, out, vin, arg) in enumerate(asm):
-    # Clear regs out of interval
-    for var, reg in list(rtor.items()):
-      available_regs = s_regs if reg[0] == 's' else x_regs
-      if var[1] not in 'B' and var not in mem_vars and i > live_range[var][1]:
-        available_regs.append(rtor.pop(var))
-    # Assign a registers to the variables using live ranges.
-    allocate_regs([out] + vin)
-    # Assign temp regs to vin and load them before direct use
-    for i, v in enumerate([v for v in vin if v.__class__ is not int and v.nm in mem_vars]):
-      rtor[v.nm] = temp_floats[i] if dtypes.is_float(v[1]) else temp_ints[i]
-      # ARM64 addressing constraints https://devblogs.microsoft.com/oldnewthing/20220728-00/?p=106912
-      ins.append(f"mov x15, {mem_vars[v.nm]}")
-      ins.append(f"ldr {rtor[v.nm]}, [sp, x15]")
-
-    if uop == Ops.SPECIAL:
-      if arg.startswith('data'):
-        # data 8 to n into the stack
-        if int(arg[4:]) >= 8:
-          ins.append(f"ldr x15, [x17, #{(int(arg[4:]) - 8) * 8}]")
-          ins.append(f"mov {rtor[out.nm]}, x15")
-      else:
-        ins.append(f"mov {rtor[out.nm]}, #0")
-        ins.append(f"loop_{arg}:")
-    elif uop == Ops.CAST:
-      if arg == BinaryOps.CMPLT:
-        if rtor[out.nm][0] == 's':
-          mov_imm(0.0, 's0')
-          mov_imm(1.0, 's1')
-          ins.append(f"fcsel {rtor[out.nm]}, s1, s0, lt")
-        if rtor[out.nm][0] == 'x':
-          mov_imm(0, 'x14')
-          mov_imm(1, 'x15')
-          ins.append(f"csel {rtor[out.nm]}, x15, x14, lt")
-      else:
-        ins.append(f"sxtw {rtor[out.nm]}, w{rtor[vin[0].nm][1:]}")
-    elif uop == Ops.ALU:
-      if len(vin)==2 and vin[1].__class__ is int: mov_imm(vin[1], 'x15')
-      if arg == BinaryOps.MUL and out.dtype == dtypes.bool:
-        ins.append(f"ands {','.join('x15' if v.__class__ is int else rtor[v.nm] for v in [out] + vin)}")
-      elif arg == TernaryOps.WHERE:
-        ins.append(f"fcmp {rtor[vin[0].nm]}, #0.0" if rtor[vin[0].nm][0] == 's' else f"cmp {rtor[vin[0].nm]}, #0")
-        ins.append(f"{alu[arg]} {rtor[out.nm]}, {rtor[vin[1].nm]}, {rtor[vin[2].nm]}, ne")
-      elif arg in [UnaryOps.LOG2, UnaryOps.SIN, UnaryOps.EXP2, UnaryOps.SQRT]:
-        #NOTE: Not a real instruction, use to emulate a ext call in unicorn
-        if CI: ins.append(f"{alu[arg]} {rtor[out.nm]} {rtor[vin[0].nm]}")
-        else:
-          save_regs = [k for k in rtor.keys() if k != out.nm and k not in mem_vars]
-          ins.append(f"sub sp, sp, #{(len(save_regs))*16}")
-          # Save the registers before they are cleared by func call
-          for i,k in enumerate(save_regs,1):
-            ins.append(f"str {rtor[k]}, [sp, #{16*i}]")
-          ins.append("stp x29, x30, [sp, #0]!")
-          ins.append("mov x29, sp")
-          ins.append(f"fmov s0, {rtor[vin[0].nm]}")
-          ins.append(alu[arg])
-          ins.append(f"fmov {rtor[out.nm]}, s0")
-          ins.append("mov sp, x29")
-          ins.append("ldp x29, x30, [sp], #0")
-          for i,k in enumerate(save_regs,1):
-            ins.append(f"ldr {rtor[k]}, [sp, #{16*i}]")
-          ins.append(f"add sp, sp, #{len(save_regs)*16}")
-      elif arg == BinaryOps.CMPLT:
-        ins.append(f"{alu[arg]} {','.join('x15' if v.__class__ is int else rtor[v.nm] for v in [out] + vin)}" if not dtypes.is_float(vin[0][1]) else f"fcmp {rtor[vin[0].nm]}, {rtor[vin[1].nm]}")
-      elif arg == BinaryOps.MOD:
-        rhs = 'x15' if vin[1].__class__ is int else rtor[vin[1].nm]
-        ins.append(f"udiv x14, {rtor[vin[0].nm]}, {rhs}")
-        ins.append(f"msub {rtor[out.nm]}, x14, {rhs}, {rtor[vin[0].nm]}")
-      else:
-        ins.append(f"{'f' if dtypes.is_float(vin[0][1]) else 's' if arg == BinaryOps.DIV else ''}{alu[arg]} {', '.join('x15' if v.__class__ is int else rtor[v.nm] for v in [out] + vin)}")
-    elif uop == Ops.LOAD:
-      if arg.__class__ in (int, float):
-        mov_imm(arg, rtor[out.nm])
-      else:
-        #NOTE: if need casting load var in s/h0 or x/w12 temp regs
-        reg_in = type_to_reg[arg[2]] + ('0' if dtypes.is_float(arg[2]) else '12') if arg[2] is not None else rtor[out.nm]
-        mov_imm(arg[0], "x15")
-        ins.append(f"add x15, {rtor[vin[0].nm]}, x15")
-        ins.append(f"ldr{'sb' if arg[2] is not None and arg[2] in (dtypes.int8, dtypes.uint8, dtypes.bool) else ''} {reg_in}, [x15]")
-        if arg[2] is not None: ins.append(f"{'fcvt' if arg[2] in [dtypes.half, dtypes.double] else 'scvtf'} {rtor[out.nm]}, {reg_in}")
-    elif uop == Ops.STORE:
-      #NOTE: if need casting load var in s/h0 or x/w12 temp regs
-      reg_out = (type_to_reg[arg[2]] + ('0' if dtypes.is_float(arg[2]) else '12') if arg[2] is not None else rtor[vin[1].nm])
-      if arg[2] is not None: ins.append(f"fcvt{'zs' if arg[2] not in [dtypes.half, dtypes.double] else '' } {reg_out}, {rtor[vin[1].nm]}")
-      ins.append(f"mov x15, #{arg[0]}")
-      ins.append(f"str {reg_out}, [{rtor[vin[0].nm]}, x15, lsl #0]")
-    elif uop == Ops.COND_BRANCH:
-      #TODO: this is a hack it shouldn't always be a cmp before a cond branch?
-      if prev_uop == Ops.LOAD:
-        ins.append(f"cmp {rtor[vin[0].nm]}, #0")
-      ins.append(f"b.{'lt' if arg[1] else 'ge'} {arg[0][1:]}")
-    elif uop == Ops.LABEL:
-      ins.append(f"{arg[1:]}:")
-    elif uop == Ops.ENDLOOP:
-      mov_imm(arg[0], "x15")
-      ins.append(f"add {rtor[vin[0].nm]}, {rtor[vin[0].nm]}, #1")
-      ins.append(f"cmp {rtor[vin[0].nm]}, x15")
-      ins.append(f"b.lt loop_{arg[1]}")
-    prev_uop = uop
-    # store regs into memory if needed
-    if out is not None and out.nm in mem_vars:
-      ins.append(f"mov x15, {mem_vars[out.nm]}")
-      ins.append(f"str {rtor[out.nm]}, [sp, x15]")
-  return "\n".join([f"//varsize {var_size}",".arch armv8-a",".text", f".global {get_name(fn_nm)}",".p2align 2", f"{get_name(fn_nm)}:", "mov x17, sp"] + [f"sub sp, sp, #{offset}" for offset in compute_offsets(var_size)]+ ins + [f"add sp, sp, #{offset}" for offset in compute_offsets(var_size)] +["ret", "\n"])
-
-def uops_to_arm64_asm(fn_nm:str, uops:List[UOp]) -> Tuple[str, List[int], List[int], bool]:
-  lang = ARM64Language()
-  global_size, local_size = uops_to_asmstyle(lang, fn_nm, uops)
-  return specialize_to_arm64(fn_nm, lang.ins), global_size[::-1], local_size[::-1], True
--- a/extra/assembly/assembly_ptx.py
+++ b/extra/assembly/assembly_ptx.py
@ -1,105 +0,0 @@
-from typing import List
-import struct
-from tinygrad.codegen.assembly import uops_to_asmstyle, AssemblyLanguage
-from tinygrad.codegen.opt.kernel import Ops, UOp
-from tinygrad import dtypes
-from tinygrad.uop.ops import BinaryOps, UnaryOps, TernaryOps
-from tinygrad.runtime.ops_cuda import arch
-
-dtype_to_nvtype = {dtypes.float32: "f32", dtypes.float16: "f16", dtypes.int64: "s64", dtypes.int32: "s32", dtypes.int8: "s8", dtypes.bool: "pred", dtypes.uint64: "u64", dtypes.uint32: "u32", dtypes.uint16: "u16", dtypes.uint8: "u8", "bits16": "b16", dtypes.float64: "f64"}
-def float_to_hex(x): return "%02X%02X%02X%02X" % tuple(struct.pack("f",x)[::-1])
-
-def ptx_needs_cast(dest_dtype, src_dtype): return dtypes.is_float(dest_dtype) and dtypes.is_int(src_dtype) or dtypes.is_int(dest_dtype) and dtypes.is_float(src_dtype) or (dtypes.is_float(src_dtype) and dtypes.is_float(dest_dtype) and dest_dtype.itemsize != src_dtype.itemsize)
-
-def render_cast(ins, inp, out):
-  if inp.dtype == dtypes.bool and (dtypes.is_float(out.dtype) or dtypes.is_int(out.dtype)):
-    ins.append(f"selp.{dtype_to_nvtype[out.dtype]} {out}, {'0f3F800000, 0f00000000' if dtypes.is_float(out.dtype) else '1, 0'}, {inp};")
-  elif out.dtype == dtypes.bool:
-    if inp.dtype == dtypes.bool:
-      ins.append(f"mov.pred {out}, {inp};")
-    else:
-      ins.append(f"setp.ne.{dtype_to_nvtype[inp.dtype]} {out}, {'0f00000000' if dtypes.is_float(inp.dtype) else '0'}, {inp};")
-  else:
-    round_mod = ".rzi" if dtypes.is_int(out.dtype) and dtypes.is_float(inp.dtype) else '.rz' if dtypes.is_float(out.dtype) and (dtypes.is_int(inp.dtype) or dtypes.is_float(inp.dtype) and inp.dtype.itemsize > out.dtype.itemsize) else ''
-    ins.append(f"cvt{round_mod}.{dtype_to_nvtype[out.dtype]}.{dtype_to_nvtype[inp.dtype]} {out}, {inp};")
-
-# https://docs.nvidia.com/cuda/parallel-thread-execution/#
-
-class PTXLanguage(AssemblyLanguage):
-  supports_constant_folding: bool = True
-
-def specialize_to_ptx(lang, function_name):
-  param_cnt = 0
-  ins = []
-  alu = {BinaryOps.ADD: "add", BinaryOps.SUB: "sub", BinaryOps.MUL: "mul", BinaryOps.DIV: "div", BinaryOps.MAX: "max",
-         BinaryOps.MOD: "rem", BinaryOps.CMPLT: "setp.lt", UnaryOps.SQRT: "sqrt.approx",
-         UnaryOps.NOOP: "mov", UnaryOps.NEG: "neg",
-         UnaryOps.SIN: "sin.approx", UnaryOps.LOG2: "lg2.approx", UnaryOps.EXP2: "ex2.approx.ftz",
-         TernaryOps.MULACC: "fma.rn", TernaryOps.WHERE: "selp"}
-  for uop, out, vin, arg in lang.ins:
-    if uop == Ops.ENDLOOP:
-      ins.append("bar.sync 0;")
-    elif uop == Ops.DEFINE_LOCAL:
-      ins.append(f".shared .align 4 .b8 {arg[0]}[{arg[1]*4}];")
-    elif uop == Ops.SPECIAL:
-      if arg.startswith('data'):
-        param_cnt += 1
-        ins.append(f"ld.param.u64 {out}, [{arg}];")
-        # TODO: we sometimes want this to be local, nvcc converts to global most of the time, not sure when we would need to?
-        # ins.append(f"cvta.to.global.u64 {out}, {out};")
-      elif arg.startswith('gid'):
-        ins.append(f"mov.u32 {out}, %ctaid.{'xyz'[int(arg[3:])]};")
-      elif arg.startswith('lid'):
-        ins.append(f"mov.u32 {out}, %tid.{'xyz'[int(arg[3:])]};")
-    elif uop == Ops.ALU:
-      if arg == BinaryOps.MUL and out.dtype == dtypes.bool:
-        ins.append(f"and.pred {out}, {', '.join(str(x) for x in vin)};")
-      else:
-        otype = vin[0].dtype if arg in [BinaryOps.CMPLT] else out.dtype
-        if arg == TernaryOps.WHERE:
-          if vin[0].dtype == dtypes.bool:
-            reg = vin[0]
-          else:
-            reg = lang.newreg((vin[0], 'bool'), dtypes.bool)
-            ins.append(f"setp.ne.{dtype_to_nvtype[vin[0].dtype]} {reg}, {'0f00000000' if dtypes.is_float(vin[0].dtype) else '0'}, {vin[0]};")
-          vin = vin[1:] + [reg]
-        ins.append(f"{alu[arg]}{'.lo' if arg == BinaryOps.MUL and out.dtype != dtypes.float32 else ''}{'.rn' if arg == BinaryOps.DIV and out.dtype == dtypes.float32 else ''}.{dtype_to_nvtype[otype]} {out}, {', '.join(str(x) for x in vin)};")
-    elif uop == Ops.LOAD:
-      if arg.__class__ in (int, float):
-        ins.append(f"mov.{dtype_to_nvtype[out.dtype]} {out}, {'0f'+float_to_hex(arg) if dtypes.is_float(out.dtype) else int(arg)};")
-      elif arg[2] is not None and (arg[2] == dtypes.bool or arg[2] != out.dtype):
-        dt = ('u16', dtypes.uint16) if arg[2] == dtypes.bool == out.dtype else ('u8', dtypes.uint8) if arg[2] == dtypes.bool else ('b16', dtypes.float16) if arg[2] == dtypes.half else (dtype_to_nvtype[arg[2]], arg[2])
-        reg = lang.newreg((out, dt[0]), dtype=dt[1])
-        ins.append(f"ld.{arg[1]}.{dt[0]} {reg}, [{vin[0]}{f'+{arg[0]}' if arg[0] is not None else ''}];")
-        render_cast(ins, reg, out)
-      else:
-        ins.append(f"ld.{arg[1]}.{dtype_to_nvtype[dtypes.float if arg[2] is None else arg[2]]} {out}, [{vin[0]}{f'+{arg[0]}' if arg[0] is not None else ''}];")
-    elif uop == Ops.STORE:
-      if ptx_needs_cast(dtypes.float if arg[2] is None else arg[2], vin[1].dtype) or arg[2] == dtypes.bool:
-        if arg[2] == dtypes.bool != vin[1].dtype:
-          prereg = lang.newreg((vin[1],'bool'), dtype=dtypes.bool)
-          render_cast(ins, vin[1], prereg)
-        else: prereg = vin[1]
-        reg = lang.newreg((prereg, dtypes.uint16 if arg[2] == dtypes.bool else arg[2]), dtype=dtypes.uint16 if arg[2] == dtypes.bool else dtypes.float if arg[2] is None else arg[2])
-        render_cast(ins, prereg, reg)
-        ins.append(f"st.{arg[1]}.{dtype_to_nvtype['bits16' if arg[2] == dtypes.float16 else dtypes.uint8 if arg[2] == dtypes.bool else dtypes.float if arg[2] is None else arg[2]]} [{vin[0]}{f'+{arg[0]}' if arg[0] is not None else ''}], {reg};")
-      else:
-        ins.append(f"st.{arg[1]}.{dtype_to_nvtype[dtypes.float if arg[2] is None else arg[2]]} [{vin[0]}{f'+{arg[0]}' if arg[0] is not None else ''}], {vin[1]};")
-    elif uop == Ops.CAST:
-      render_cast(ins, vin[0], out)
-    elif uop == Ops.LABEL:
-      ins.append(f"{arg}:")
-    elif uop == Ops.COND_BRANCH:
-      ins.append(f"@{'!' if not arg[1] else ''}{vin[0]} bra {arg[0]};")
-
-  ins_prefix = [".version 7.8", ".target " + arch(), ".address_size 64",
-                f".visible .entry {function_name}({', '.join(f'.param .u64 data{i}' for i in range(param_cnt))}) {{"]
-  for arg in [(dtype, lang.type_to_letter(dtype), c) for dtype,c in lang.cnts.items()]: ins_prefix.append(f".reg .{dtype_to_nvtype[arg[0][0]]} %{arg[1]}<{arg[2]}>;",)
-  ins = ins_prefix + ins
-  ins += ["ret;", "}"]
-  return '\n'.join(ins)
-
-def uops_to_ptx_asm(function_name:str, uops:List[UOp]):
-  lang = PTXLanguage()
-  global_size, local_size = uops_to_asmstyle(lang, function_name, uops)
-  return specialize_to_ptx(lang, function_name), global_size[::-1], local_size[::-1], True
--- a/extra/assembly/assembly_rdna.py
+++ b/extra/assembly/assembly_rdna.py
@ -1,203 +0,0 @@
-import yaml
-from typing import Tuple, Set, Dict
-from tinygrad import dtypes
-from tinygrad.codegen.assembly import AssemblyCodegen, Register
-from tinygrad.codegen.opt.kernel import Ops
-from tinygrad.uop.ops import BinaryOps, UnaryOps, TernaryOps
-from tinygrad.runtime.ops_cl import ROCM_LLVM_PATH
-
-# ugh, is this really needed?
-from extra.helpers import enable_early_exec
-early_exec = enable_early_exec()
-
-boilerplate_start = """
-.global _start
-_start:
-.rodata
-.align 0x10
-.global code.kd
-.type code.kd,STT_OBJECT
-.amdhsa_kernel code"""
-
-code_start = """.end_amdhsa_kernel
-.text
-code:
-"""
-
-# https://github.com/RadeonOpenCompute/ROCm_Documentation/blob/master/ROCm_Compiler_SDK/ROCm-Codeobj-format.rst
-# https://github.com/ROCm-Developer-Tools/ROCm-ComputeABI-Doc/blob/master/AMDGPU-ABI.md#initial-kernel-register-state
-# RDNA3 is actually a SIMD machine!
-class RDNACodegen(AssemblyCodegen):
-  supports_float4: bool = True
-  supports_float4_alu: bool = True
-  supports_load3: bool = True
-  sin_is_sin2pi: bool = True
-  no_div: bool = True
-
-  def specialize(self, asm) -> Tuple[str, str]:
-    args = []
-    for i,b in enumerate(self.bufs): args.append({'.address_space': 'global', '.name': f'buf_{i}', '.offset': i*8, '.size': 8, '.type_name': b.dtype.name+"*", '.value_kind': 'global_buffer'})
-    ins = []
-
-    v_cnt = 3  # v[0:2] is local_xyz
-    s_cnt = 5  # s[0:1] is the address, s[2:4] is global_xyz
-
-    dtype_to_rdnatype = {dtypes.float32: "f32", dtypes.int64: "i64", dtypes.int32: "i32", dtypes.uint64: "u64", dtypes.bool: "i32"}
-    alu = {BinaryOps.ADD: "add", BinaryOps.SUB: "sub", BinaryOps.MUL: "mul", TernaryOps.MULACC: "fma",
-           BinaryOps.MAX: "max", UnaryOps.RECIP: "rcp",
-           UnaryOps.NOOP: "mov", UnaryOps.SIN: "sin", UnaryOps.LOG2: "log", UnaryOps.EXP2: "exp",
-           BinaryOps.CMPLT: "cmp_lt"}
-
-    pend_regs:Set[Register] = set()
-    rtor:Dict[Register, str] = {}
-    def reg_in(x):
-      nonlocal pend_regs
-      #print("reg_in", x, rtor[x], pend_regs)
-      if x in pend_regs:
-        #print("clear")
-        ins.append('s_waitcnt lgkmcnt(0), vmcnt(0)')
-        pend_regs.clear()
-      return rtor[x]
-    def reg_out(x):
-      return rtor[x]
-    for uop, out, vin, arg in asm:
-      if uop == Ops.DEFINE_REGISTER:
-        if arg[0][0] in [dtypes.uint32, dtypes.uint64, dtypes.int64, dtypes.int32, dtypes.float32, dtypes.float.vec(4)]:
-          for i in range(arg[2]):
-            # TODO: Re-use gaps created by this to avoid wasting registers
-            align = int(arg[0][0].itemsize / 4)
-            if arg[0][1]:
-              s_cnt += s_cnt % align
-              reg_name = f"s[{s_cnt}:{s_cnt + align - 1}]" if align > 1 else f"s{s_cnt}"
-              s_cnt += align
-            else:
-              v_cnt += v_cnt % align
-              reg_name = f"v[{v_cnt}:{v_cnt + align - 1}]" if align > 1 else f"v{v_cnt}"
-              v_cnt += align
-            rtor[Register(f"%{arg[1]}{i}", *arg[0])] = reg_name
-
-            if arg[0][0] == dtypes.float.vec(4):
-              for off in range(4):
-                reg_name = f"s{s_cnt-align+off}" if arg[0][1] else f"v{v_cnt-align+off}"
-                rtor[Register(f"%{arg[1]}{i}", dtypes.float, False, off=off)] = reg_name
-        elif arg[0][0] == dtypes.bool:
-          for i in range(arg[2]):
-            reg_name = "scc" if arg[0][1] else "vcc_lo" # `_lo` suffix since we're running wavefront_size=32
-            rtor[Register(f"%{arg[1]}{i}", *arg[0])] = reg_name
-        else:
-          raise NotImplementedError("DEFINE_REGISTER not implemented for arg: ", arg)
-      elif uop == Ops.SPECIAL:
-        if arg.startswith('buf'):
-          i = int(arg[3:])
-          ins.append(f's_load_b64 {reg_out(out)}, s[0:1], {i*8}')
-          pend_regs.add(out)
-          for r in out.subregs(): pend_regs.add(r)
-        elif arg.startswith('gid'):
-          ins.append(f'v_mov_b32 {reg_out(out)}, s{2+int(arg[3])}')
-          # the docs lied, this is actually y
-          if int(arg[3]) == 2: ins.append("v_bfe_u32 v2, v0, 20, 10")  # untested
-          if int(arg[3]) == 1: ins.append("v_bfe_u32 v1, v0, 10, 10")
-          elif int(arg[3]) == 0: ins.append("v_and_b32_e32 v0, 0x3ff, v0")
-          # get local size
-          offset = len(args)*8
-          args.append({".offset": offset, ".value_kind": f"hidden_group_size_{'xyz'[int(arg[3])]}", ".size": 8})
-          ins.append(f's_load_b32 s{2+int(arg[3])}, s[0:1], {offset}')
-          ins.append('s_waitcnt vmcnt(0) lgkmcnt(0)')
-          pend_regs.clear()
-          ins.append(f'v_mul_i32_i24 {reg_out(out)}, {reg_out(out)}, s{2+int(arg[3])}')
-          ins.append(f'v_add_nc_u32 {reg_out(out)}, v{int(arg[3])}, {reg_out(out)}')
-      elif uop == Ops.CONST:
-        if arg == float('inf'): arg = "0x7f800000"
-        elif arg == float('-inf'): arg = "0xff800000"
-        if out.dtype == dtypes.float.vec(4):
-          for off in range(4):
-            ins.append(f"{'s_' if out.scalar else 'v_'}mov_b32 {reg_out(Register(out.nm, dtypes.float, False, off=off))}, {arg}")
-        else:
-          ins.append(f"{'s_' if out.scalar else 'v_'}mov_b32 {reg_out(out)}, {arg}")
-      elif uop == Ops.ALU:
-        if arg in [BinaryOps.CMPLT]:
-          ins.append(f"{'s' if out.scalar else 'v'}_{alu[arg]}_{dtype_to_rdnatype[out.dtype]} {', '.join(reg_in(x) if x.__class__ is Register else str(x) for x in vin)}")
-        else:
-          alu_arg = alu[arg]
-          if arg == TernaryOps.MULACC and out == vin[2]:
-            alu_arg = "fmac"
-            vin = vin[0:2]
-          if out.dtype == dtypes.float.vec(4):
-            for rr in zip(*[x.subregs() if x.dtype == dtypes.float.vec(4) else [x,x,x,x] for x in [out]+vin]):
-              ins.append(f"{'s_' if rr[0].scalar else 'v_'}{alu_arg}_{dtype_to_rdnatype[rr[0].dtype]} {reg_out(rr[0])}, {', '.join(reg_in(x) if x.__class__ is Register else str(x) for x in rr[1:])}")
-          else:
-            ins.append(f"{'s_' if out.scalar else 'v_'}{alu_arg}_{dtype_to_rdnatype[out.dtype] if arg != UnaryOps.NOOP else 'b32'}{'_i24' if arg == BinaryOps.MUL and out.dtype != dtypes.float32 and not out.scalar else ''} {reg_out(out)}, {', '.join(reg_in(x) if x.__class__ is Register else str(x) for x in vin)}")
-      elif uop == Ops.LOAD:
-        if out.scalar:
-          # swap arg order
-          ins.append(f's_load_b32 {reg_out(out)}, {reg_in(vin[0])}, {reg_in(vin[1])} offset:{arg[0]}')
-        else:
-          ins.append(f'global_load_{"b128" if out.dtype == dtypes.float.vec(4) else "b32"} {reg_out(out)}, {reg_in(vin[1])}, {reg_in(vin[0])} offset:{arg[0]}')
-        pend_regs.add(out)
-        for r in out.subregs(): pend_regs.add(r)
-      elif uop == Ops.STORE:
-        ins.append(f'global_store_{"b128" if vin[1].dtype == dtypes.float.vec(4) else "b32"} {reg_in(vin[2])}, {reg_in(vin[1])}, {reg_in(vin[0])} offset:{arg[0]}')
-      elif uop == Ops.LABEL:
-        ins.append(f"{arg}:")
-      elif uop == Ops.COND_BRANCH:
-        ins.append(f"s_cbranch_scc{'1' if arg[1] else '0'} {arg[0]}")
-      elif uop == Ops.CAST:
-        if vin[0].dtype == dtypes.bool:
-          if out.dtype == dtypes.float32:
-            ins.append(f"v_cndmask_b32 {reg_out(out)}, 0.0, 1.0, {reg_in(vin[0])}")
-        else:
-          raise NotImplementedError(f"cast {vin[0].dtype} -> {out.dtype}")
-      else:
-        raise NotImplementedError(uop)
-
-    ins += ['s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)', 's_endpgm', 's_code_end']
-
-    # dual alu group
-    seen = set()
-    new_ins = []
-    for i,tins in enumerate(ins):
-      if tins in seen: continue
-      if tins.startswith("v_fmac_f32"):
-        for gins in reversed(ins[i+1:]):
-          if gins in seen: continue
-          if gins.startswith("v_fmac_f32"):
-            r0 = [int(x[1:].strip(',')) for x in tins.split(" ")[1:]]
-            r1 = [int(x[1:].strip(',')) for x in gins.split(" ")[1:]]
-            if r0[0]%2 == r1[0]%2: continue
-            if r0[1]%2 == r1[1]%2: continue
-            if r0[2]%2 == r1[2]%2: continue
-            new_ins.append(tins.replace("v_", "v_dual_")+" :: " + gins.replace("v_", "v_dual_"))
-            seen.add(tins)
-            seen.add(gins)
-            break
-      if tins not in seen:
-        new_ins.append(tins)
-    ins = new_ins
-
-    return 'code', self.assemble(args, ins, v_cnt, s_cnt)
-
-  def assemble(self, args, ins, v_cnt, s_cnt):
-    kernel_desc = {'.amdhsa_group_segment_fixed_size': 0, '.amdhsa_private_segment_fixed_size': 0, '.amdhsa_kernarg_size': 0,
-                   '.amdhsa_next_free_vgpr': v_cnt,   # this matters!
-                   '.amdhsa_reserve_vcc': 0, '.amdhsa_reserve_xnack_mask': 0,
-                   '.amdhsa_next_free_sgpr': s_cnt,
-                   '.amdhsa_float_round_mode_32': 0, '.amdhsa_float_round_mode_16_64': 0, '.amdhsa_float_denorm_mode_32': 3, '.amdhsa_float_denorm_mode_16_64': 3, '.amdhsa_dx10_clamp': 1, '.amdhsa_ieee_mode': 1,
-                   '.amdhsa_fp16_overflow': 0, '.amdhsa_workgroup_processor_mode': 1, '.amdhsa_memory_ordered': 1, '.amdhsa_forward_progress': 0, '.amdhsa_enable_private_segment': 0,
-                   '.amdhsa_system_sgpr_workgroup_id_x': 1, '.amdhsa_system_sgpr_workgroup_id_y': 1, '.amdhsa_system_sgpr_workgroup_id_z': 1,
-                   '.amdhsa_system_sgpr_workgroup_info': 0, '.amdhsa_system_vgpr_workitem_id': 2, # is amdhsa_system_vgpr_workitem_id real?
-                   '.amdhsa_exception_fp_ieee_invalid_op': 0, '.amdhsa_exception_fp_denorm_src': 0, '.amdhsa_exception_fp_ieee_div_zero': 0, '.amdhsa_exception_fp_ieee_overflow': 0, '.amdhsa_exception_fp_ieee_underflow': 0,
-                   '.amdhsa_exception_fp_ieee_inexact': 0, '.amdhsa_exception_int_div_zero': 0, '.amdhsa_user_sgpr_dispatch_ptr': 0, '.amdhsa_user_sgpr_queue_ptr': 0, '.amdhsa_user_sgpr_kernarg_segment_ptr': 1,
-                   '.amdhsa_user_sgpr_dispatch_id': 0, '.amdhsa_user_sgpr_private_segment_size': 0, '.amdhsa_wavefront_size32': 1, '.amdhsa_uses_dynamic_stack': 0}
-
-    metadata = {'amdhsa.kernels': [{'.args': args,
-                  '.group_segment_fixed_size': 0, '.kernarg_segment_align': 8, '.kernarg_segment_size': args[-1][".offset"] + args[-1][".size"],
-                  '.language': 'OpenCL C', '.language_version': [1, 2], '.max_flat_workgroup_size': 256,
-                  '.name': 'code', '.private_segment_fixed_size': 0, '.sgpr_count': s_cnt, '.sgpr_spill_count': 0,
-                  '.symbol': 'code.kd', '.uses_dynamic_stack': False, '.vgpr_count': v_cnt, '.vgpr_spill_count': 0,
-                  '.wavefront_size': 32}],
-                'amdhsa.target': 'amdgcn-amd-amdhsa--gfx1100', 'amdhsa.version': [1, 2]}
-
-    code = boilerplate_start + "\n" + '\n'.join("%s %d" % x for x in kernel_desc.items()) + "\n" +  code_start + '\n'.join(ins) + "\n.amdgpu_metadata\n" + yaml.dump(metadata) + ".end_amdgpu_metadata"
-    obj = early_exec(([ROCM_LLVM_PATH / "llvm-mc", '--arch=amdgcn', '--mcpu=gfx1100', '--triple=amdgcn-amd-amdhsa', '--filetype=obj', '-'], code.encode("utf-8")))
-    asm = early_exec(([ROCM_LLVM_PATH / "ld.lld", "/dev/stdin", "-o", "/dev/stdout", "--pie"], obj))
-    return asm
--- a/extra/assembly/ptx/test.py
+++ b/extra/assembly/ptx/test.py
@ -1,23 +0,0 @@
-#!/usr/bin/env python3
-import numpy as np
-from tinygrad.runtime.ops_cuda import CUDAProgram, RawCUDABuffer
-
-if __name__ == "__main__":
-  test = RawCUDABuffer.fromCPU(np.zeros(10, np.float32))
-  prg = CUDAProgram("test", """
-  .version 7.8
-  .target sm_86
-  .address_size 64
-  .visible .entry test(.param .u64 x) {
-    .reg .b32       %r<2>;
-    .reg .b64       %rd<3>;
-
-    ld.param.u64    %rd1, [x];
-    cvta.to.global.u64      %rd2, %rd1;
-    mov.u32         %r1, 0x40000000; // 2.0 in float
-    st.global.u32   [%rd2], %r1;
-    ret;
-  }""", binary=True)
-  prg([1], [1], test)
-  print(test.toCPU())
-
--- a/extra/assembly/rdna3/generate.py
+++ b/extra/assembly/rdna3/generate.py
@ -0,0 +1,136 @@
+import os, sys, struct
+sys.path.append(os.getcwd())
+# PROFILE=1 to use
+#os.environ["PROFILE"] = "1"
+os.environ["SQTT"] = "1"
+os.environ["SQTT_ITRACE_SE_MASK"] = "1"
+os.environ["SQTT_LIMIT_SE"] = "1"
+import xml.etree.ElementTree as ET
+
+from tinygrad import nn, Tensor, Device
+from tinygrad.helpers import get_single_element
+from tinygrad.runtime.support.elf import elf_loader
+from tinygrad.runtime.ops_amd import ProfileSQTTEvent
+from extra.sqtt.attempt_sqtt_parse import parse_sqtt_print_packets
+
+def disassemble(text, root:ET.Element):
+  """Decode the instruction words in `text` with the machine-readable ISA XML and print them.
+
+  text: bytes holding the instruction stream (the caller passes a slice of an ELF .text section).
+  root: root element of the ISA XML; queried through ./ISA/Encodings, ./ISA/Instructions
+        and ./ISA/OperandTypes.
+
+  Prints one line per instruction: byte offset, raw value, then either the mnemonic with its
+  operands and the leftover fields, or just the encoding name when no instruction matched.
+  Raises RuntimeError when the first dword matches no encoding identifier. An instruction that
+  is cut off by the end of `text` is reported and decoding stops instead of raising struct.error.
+  """
+  # the XML tree is not modified while decoding, so query it once instead of once per instruction
+  encodings = root.findall("./ISA/Encodings/Encoding")
+  instructions = root.findall("./ISA/Instructions/Instruction")
+  operand_values = None  # OperandTypeName -> {value: name}, built the first time an operand needs it
+
+  i = 0
+  while i < len(text):
+    if len(text) - i < 4:
+      print(f"{i:4X} : {len(text)-i} trailing byte(s), not a full dword")
+      break
+    # explicit little-endian so the decode does not depend on the host byte order
+    ins = struct.unpack("<I", text[i:i+4])[0]
+
+    # 1. Get the encoding: the first one with an identifier equal to the masked first dword
+    enc_el = None
+    for cand in encodings:
+      mask = cand.findtext("EncodingIdentifierMask")
+      assert len(mask)%32 == 0
+      bit_mask = int(mask, 2)
+      if any(ins&bit_mask == int(x.text, 2) for x in cand.find("EncodingIdentifiers").findall("EncodingIdentifier")):
+        enc_el = cand
+        break
+    if enc_el is None: raise RuntimeError(f"unknown instruction {ins:08X}")
+    encoding_name = enc_el.findtext("EncodingName")
+
+    # the mask is a bit string, so its length gives the instruction size
+    ins_len = len(mask) // 8
+    if i + ins_len > len(text):
+      print(f"{i:4X} : {ins:16x} -- {encoding_name} (truncated: needs {ins_len} bytes, {len(text)-i} left)")
+      break
+    if len(mask) >= 64: ins |= struct.unpack("<I", text[i+4:i+8])[0] << 32
+    if len(mask) >= 96: ins |= struct.unpack("<I", text[i+8:i+12])[0] << 64
+
+    # 2. Parse the Fields for this Encoding
+    field_data = {}
+    for field in enc_el.findall("MicrocodeFormat/BitMap/Field"):
+      # Fields can be split into multiple ranges (RangeCount > 1); concatenate them in Order, lowest bits first
+      ranges = sorted(field.findall("BitLayout/Range"), key=lambda x: int(x.attrib.get('Order')))
+      val = 0
+      current_shift = 0
+      for rng in ranges:
+        width = int(rng.find("BitCount").text)
+        chunk = (ins >> int(rng.find("BitOffset").text)) & ((1 << width) - 1)
+        val |= (chunk << current_shift)
+        current_shift += width
+      field_data[field.find("FieldName").text] = val
+    # this is already used (it selected the encoding above)
+    field_data.pop("ENCODING", None)
+
+    # 3. Find the instruction: first one with this encoding name whose Opcode equals the OP field
+    match = None
+    if "OP" in field_data:
+      for ins_el in instructions:
+        for ins_enc in ins_el.findall("InstructionEncodings/InstructionEncoding"):
+          if ins_enc.findtext("EncodingName") == encoding_name and int(ins_enc.findtext("Opcode")) == field_data["OP"]:
+            match = (ins_el, ins_enc)
+            break
+        if match is not None: break
+
+    if match is None:
+      print(f"{i:4X} : {ins:16x} -- {encoding_name}")
+    else:
+      ins_el, ins_enc = match
+      ins_name = ins_el.findtext("InstructionName")
+      del field_data["OP"]
+      params = []
+
+      # 4. Extract the operands
+      for op_ins in ins_enc.findall("Operands/Operand"):
+        op_type = op_ins.findtext("OperandType")
+        op_size = op_ins.findtext("OperandSize")
+        op_fmt = op_ins.findtext("DataFormatName")
+        op_field_name = op_ins.findtext("FieldName")
+        if op_field_name is None: continue
+        assert op_field_name in field_data
+        if operand_values is None:
+          # build the predefined-value tables once; on a repeated OperandTypeName the first definition wins
+          operand_values = {}
+          for op_el in root.findall("./ISA/OperandTypes/OperandType"):
+            operand_values.setdefault(op_el.findtext("OperandTypeName"),
+              {int(op_val.findtext("Value")): op_val.findtext("Name") for op_val in op_el.findall("OperandPredefinedValues/PredefinedValue")})
+        # an operand type the XML does not define leaves the field in field_data, so it shows up in the leftovers
+        if op_type not in operand_values: continue
+        val_dict = operand_values[op_type]
+        field_val = field_data.pop(op_field_name)
+        if field_val in val_dict:
+          print(op_type, op_size, op_fmt)
+          params.append(val_dict[field_val])
+        else:
+          params.append(f"{op_type}({field_val})")
+
+      print(f"{i:4X} : {ins:16x} -- {ins_name.lower()} {', '.join(params)}", field_data)
+
+    # advance
+    i += ins_len
+
+  #print(ET.tostring(root).decode())
+
+if __name__ == "__main__":
+  # Script flow: load the ISA XML, lower and run the schedule of `Tensor.empty(16)+1`,
+  # print the packets of the first captured SQTT event, then decode the start of the kernel's .text.
+  # human readable manual at https://docs.amd.com/v/u/en-US/rdna35_instruction_set_architecture
+  fns = nn.state.zip_extract(Tensor.from_url("https://gpuopen.com/download/machine-readable-isa/latest/"))
+  xml_str = fns['amdgpu_isa_rdna3_5.xml'].to("CPU").data()
+  # the XML is also written to /tmp/rdna35.xml; nothing in this script reads it back
+  with open("/tmp/rdna35.xml", "wb") as f: f.write(bytes(xml_str))
+  root = ET.fromstring(xml_str)
+
+  a = Tensor.empty(16)+1
+  for ei in a.schedule():
+    ei.lower()
+    # get text
+    _, hdr, _ = elf_loader(ei.prg.lib)
+    text = get_single_element([x for x in hdr if x.name==".text"]).content
+
+    # llvm disassembler
+    Device["AMD"].compiler.disassemble(ei.prg.lib)
+
+    # run program
+    ei.run()
+
+  sqtt_events = [e for e in Device["AMD"].profile_events if isinstance(e, ProfileSQTTEvent)]
+  for e in sqtt_events[0:1]: # only the first SE
+    parse_sqtt_print_packets(e.blob)
+
+  # NOTE(review): `text` is bound only inside the loop above, so this is the .text of the last
+  # schedule item and would be unbound for an empty schedule. Only the first 0x40 bytes are decoded.
+  disassemble(text[:0x40], root)
--- a/extra/assembly/rdna3/parse.py
+++ b/extra/assembly/rdna3/parse.py
@ -0,0 +1,15 @@
+from tinygrad import Tensor, nn
+import xml.etree.ElementTree as ET
+
+if __name__ == "__main__":
+  # human readable manual at https://docs.amd.com/v/u/en-US/rdna35_instruction_set_architecture
+  isa_files = nn.state.zip_extract(Tensor.from_url("https://gpuopen.com/download/machine-readable-isa/latest/"))
+  root = ET.fromstring(isa_files['amdgpu_isa_rdna3_5.xml'].to("CPU").data())
+
+  # print each operand type name next to its {value: name} table of predefined values
+  for operand_type in root.findall("./ISA/OperandTypes/OperandType"):
+    type_name = operand_type.findtext("OperandTypeName")
+    predefined = {int(entry.findtext("Value")): entry.findtext("Name")
+                  for entry in operand_type.findall("OperandPredefinedValues/PredefinedValue")}
+    print(type_name, predefined)
--- a/extra/augment.py
+++ b/extra/augment.py
@ -1,42 +0,0 @@
-import numpy as np
-from PIL import Image
-from pathlib import Path
-import sys
-cwd = Path.cwd()
-sys.path.append(cwd.as_posix())
-sys.path.append((cwd / 'test').as_posix())
-from extra.datasets import fetch_mnist
-from tqdm import trange
-
-def augment_img(X, rotate=10, px=3):
-  Xaug = np.zeros_like(X)
-  for i in trange(len(X)):
-    im = Image.fromarray(X[i])
-    im = im.rotate(np.random.randint(-rotate,rotate), resample=Image.BICUBIC)
-    w, h = X.shape[1:]
-    #upper left, lower left, lower right, upper right
-    quad = np.random.randint(-px,px,size=(8)) + np.array([0,0,0,h,w,h,w,0])
-    im = im.transform((w, h), Image.QUAD, quad, resample=Image.BICUBIC)
-    Xaug[i] = im
-  return Xaug
-
-if __name__ == "__main__":
-  import matplotlib.pyplot as plt
-  X_train, Y_train, X_test, Y_test = fetch_mnist()
-  X_train = X_train.reshape(-1, 28, 28).astype(np.uint8)
-  X_test = X_test.reshape(-1, 28, 28).astype(np.uint8)
-  X = np.vstack([X_train[:1]]*10+[X_train[1:2]]*10)
-  fig, a = plt.subplots(2,len(X))
-  Xaug = augment_img(X)
-  for i in range(len(X)):
-    a[0][i].imshow(X[i], cmap='gray')
-    a[1][i].imshow(Xaug[i],cmap='gray')
-    a[0][i].axis('off')
-    a[1][i].axis('off')
-  plt.show()
-
-  #create some nice gifs for doc?!
-  for i in range(10):
-    im = Image.fromarray(X_train[7353+i])
-    im_aug = [Image.fromarray(x) for x in augment_img(np.array([X_train[7353+i]]*100))]
-    im.save(f"aug{i}.gif", save_all=True, append_images=im_aug, duration=100, loop=0)
--- a/extra/backends/clang_graph.py
+++ b/extra/backends/clang_graph.py
@ -1,39 +0,0 @@
-from typing import List, Dict, cast
-import ctypes
-from tinygrad.helpers import dedup, cpu_time_execution, DEBUG
-from tinygrad.engine.jit import GraphRunner, GraphException
-from tinygrad.device import Buffer, Device
-from tinygrad.engine.realize import ExecItem, CompiledRunner
-from tinygrad.uop.ops import Variable
-from tinygrad.runtime.ops_cpu import ClangProgram
-from tinygrad.renderer.cstyle import ClangRenderer
-render_dtype = ClangRenderer().render_dtype
-
-class ClangGraph(GraphRunner):
-  def __init__(self, jit_cache: List[ExecItem], input_rawbuffers: List[Buffer], var_vals: Dict[str, int]):
-    super().__init__(jit_cache, input_rawbuffers, var_vals)
-    if not all(isinstance(ji.prg, CompiledRunner) for ji in jit_cache): raise GraphException
-
-    prgs = '\n'.join(dedup([cast(CompiledRunner, ji.prg).p.src for ji in jit_cache]))
-    args = [f"{render_dtype(x.dtype)}* arg{i}" for i,x in enumerate(input_rawbuffers)]
-    args += sorted([f"int {v}" for v in var_vals])
-    code = ["void batched("+','.join(args)+") {"]
-    for ji in jit_cache:
-      args = []
-      for buf in ji.bufs:
-        assert buf is not None
-        if buf in input_rawbuffers:
-          args.append(f"arg{input_rawbuffers.index(buf)}")
-        else:
-          args.append(f"({render_dtype(buf.dtype)}*)0x{ctypes.addressof(buf._buf):X}")
-      args += [x.expr for x in cast(CompiledRunner, ji.prg).p.vars]
-      code.append(f"  {cast(CompiledRunner, ji.prg).p.function_name}({','.join(args)});")
-    code.append("}")
-    if DEBUG >= 4: print("\n".join(code))
-    compiler = Device["CPU"].compiler
-    assert compiler is not None
-    self._prg = ClangProgram("batched", compiler.compile(prgs+"\n"+"\n".join(code))) # no point in caching the pointers
-
-  def __call__(self, rawbufs: List[Buffer], var_vals: Dict[str, int], wait=False):
-    return cpu_time_execution(
-    lambda: self._prg(*[x._buf for x in rawbufs], *[x[1] for x in sorted(var_vals.items(), key=lambda x: x[0])]), enable=wait)
--- a/extra/backends/graph_hip.py
+++ b/extra/backends/graph_hip.py
@ -1,27 +0,0 @@
-import ctypes
-from typing import Tuple
-import tinygrad.runtime.autogen.hip as hip
-from tinygrad.helpers import init_c_var, time_execution_cuda_style
-from tinygrad.runtime.ops_hip import check, hip_set_device
-from tinygrad.runtime.graph.cuda import CUDAGraph
-
-# TODO: this is only used in graph
-def hip_time_execution(cb, enable=False): return time_execution_cuda_style(cb, hip.hipEvent_t, hip.hipEventCreate, hip.hipEventRecord, hip.hipEventSynchronize, hip.hipEventDestroy, hip.hipEventElapsedTime, enable=enable)  # noqa: E501
-
-class HIPGraph(CUDAGraph):
-  def __del__(self):
-    if hasattr(self, 'graph'): check(hip.hipGraphDestroy(self.graph))
-    if hasattr(self, 'instance'): check(hip.hipGraphExecDestroy(self.instance))
-  def set_device(self): hip_set_device(self.dev)
-  def encode_args_info(self): return (hip.hipDeviceptr_t, (1,2,3))
-  def graph_create(self): return init_c_var(hip.hipGraph_t(), lambda x: check(hip.hipGraphCreate(ctypes.byref(x), 0)))
-  def graph_instantiate(self, graph):
-    return init_c_var(hip.hipGraphExec_t(), lambda x: check(hip.hipGraphInstantiate(ctypes.byref(x), graph, None, None, 0)))
-  def graph_add_kernel_node(self, graph, c_deps, c_params):
-    return init_c_var(hip.hipGraphNode_t(), lambda x: check(hip.hipGraphAddKernelNode(ctypes.byref(x), graph, c_deps, ctypes.sizeof(c_deps)//8 if c_deps else 0, ctypes.byref(c_params))))  # noqa: E501
-  def graph_launch(self, *args, wait=False): return hip_time_execution(lambda: check(hip.hipGraphLaunch(*args)), enable=wait)
-  def graph_exec_kernel_node_set_params(self, *args): return check(hip.hipGraphExecKernelNodeSetParams(*args))
-  def build_kernel_node_params(self, prg, global_size, local_size, c_config):
-    return hip.hipKernelNodeParams(hip.dim3(*local_size), c_config, ctypes.cast(prg.clprg.prg, ctypes.c_void_p), hip.dim3(*global_size), None, 0)
-  def set_kernel_node_launch_dims(self, node, global_size: Tuple[int, int, int], local_size: Tuple[int, int, int]):
-    node.blockDim.x, node.blockDim.y, node.blockDim.z, node.gridDim.x, node.gridDim.y, node.gridDim.z = *local_size, *global_size
--- a/extra/backends/hsa_driver.py
+++ b/extra/backends/hsa_driver.py
@ -1,143 +0,0 @@
-import ctypes, collections
-import tinygrad.runtime.autogen.hsa as hsa
-from tinygrad.helpers import init_c_var
-
-def check(status):
-  if status != 0:
-    hsa.hsa_status_string(status, ctypes.byref(status_str := ctypes.POINTER(ctypes.c_char)()))
-    raise RuntimeError(f"HSA Error {status}: {ctypes.string_at(status_str).decode()}")
-
-# Precalulated AQL info
-AQL_PACKET_SIZE = ctypes.sizeof(hsa.hsa_kernel_dispatch_packet_t)
-EMPTY_SIGNAL = hsa.hsa_signal_t()
-
-DISPATCH_KERNEL_SETUP = 3 << hsa.HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS
-DISPATCH_KERNEL_HEADER  = 1 << hsa.HSA_PACKET_HEADER_BARRIER
-DISPATCH_KERNEL_HEADER |= hsa.HSA_FENCE_SCOPE_SYSTEM << hsa.HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE
-DISPATCH_KERNEL_HEADER |= hsa.HSA_FENCE_SCOPE_SYSTEM << hsa.HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE
-DISPATCH_KERNEL_HEADER |= hsa.HSA_PACKET_TYPE_KERNEL_DISPATCH << hsa.HSA_PACKET_HEADER_TYPE
-
-BARRIER_HEADER  = 1 << hsa.HSA_PACKET_HEADER_BARRIER
-BARRIER_HEADER |= hsa.HSA_FENCE_SCOPE_SYSTEM << hsa.HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE
-BARRIER_HEADER |= hsa.HSA_FENCE_SCOPE_SYSTEM << hsa.HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE
-BARRIER_HEADER |= hsa.HSA_PACKET_TYPE_BARRIER_AND << hsa.HSA_PACKET_HEADER_TYPE
-
-class AQLQueue:
-  def __init__(self, device, sz=-1):
-    self.device = device
-
-    check(hsa.hsa_agent_get_info(self.device.agent, hsa.HSA_AGENT_INFO_QUEUE_MAX_SIZE, ctypes.byref(max_queue_size := ctypes.c_uint32())))
-    queue_size = min(max_queue_size.value, sz) if sz != -1 else max_queue_size.value
-
-    null_func = ctypes.CFUNCTYPE(None, hsa.hsa_status_t, ctypes.POINTER(hsa.struct_hsa_queue_s), ctypes.c_void_p)()
-    self.hw_queue = init_c_var(ctypes.POINTER(hsa.hsa_queue_t)(), lambda x: check(
-      hsa.hsa_queue_create(self.device.agent, queue_size, hsa.HSA_QUEUE_TYPE_SINGLE, null_func, None, (1<<32)-1, (1<<32)-1, ctypes.byref(x))))
-
-    self.next_doorbell_index = 0
-    self.queue_base = self.hw_queue.contents.base_address
-    self.queue_size = self.hw_queue.contents.size * AQL_PACKET_SIZE # in bytes
-    self.write_addr = self.queue_base
-    self.write_addr_end = self.queue_base + self.queue_size - 1 # precalc saves some time
-    self.available_packet_slots = self.hw_queue.contents.size
-
-    check(hsa.hsa_amd_queue_set_priority(self.hw_queue, hsa.HSA_AMD_QUEUE_PRIORITY_HIGH))
-    check(hsa.hsa_amd_profiling_set_profiler_enabled(self.hw_queue, 1))
-
-  def __del__(self):
-    if hasattr(self, 'hw_queue'): check(hsa.hsa_queue_destroy(self.hw_queue))
-
-  def submit_kernel(self, prg, global_size, local_size, kernargs, completion_signal=None):
-    if self.available_packet_slots == 0: self._wait_queue()
-
-    packet = hsa.hsa_kernel_dispatch_packet_t.from_address(self.write_addr)
-    packet.workgroup_size_x = local_size[0]
-    packet.workgroup_size_y = local_size[1]
-    packet.workgroup_size_z = local_size[2]
-    packet.reserved0 = 0
-    packet.grid_size_x = global_size[0] * local_size[0]
-    packet.grid_size_y = global_size[1] * local_size[1]
-    packet.grid_size_z = global_size[2] * local_size[2]
-    packet.private_segment_size = prg.private_segment_size
-    packet.group_segment_size = prg.group_segment_size
-    packet.kernel_object = prg.handle
-    packet.kernarg_address = kernargs
-    packet.reserved2 = 0
-    packet.completion_signal = completion_signal if completion_signal else EMPTY_SIGNAL
-    packet.setup = DISPATCH_KERNEL_SETUP
-    packet.header = DISPATCH_KERNEL_HEADER
-    self._submit_packet()
-
-  def submit_barrier(self, wait_signals=None, completion_signal=None):
-    assert wait_signals is None or len(wait_signals) <= 5
-    if self.available_packet_slots == 0: self._wait_queue()
-
-    packet = hsa.hsa_barrier_and_packet_t.from_address(self.write_addr)
-    packet.reserved0 = 0
-    packet.reserved1 = 0
-    for i in range(5):
-      packet.dep_signal[i] = wait_signals[i] if wait_signals and len(wait_signals) > i else EMPTY_SIGNAL
-    packet.reserved2 = 0
-    packet.completion_signal = completion_signal if completion_signal else EMPTY_SIGNAL
-    packet.header = BARRIER_HEADER
-    self._submit_packet()
-
-  def blit_packets(self, packet_addr, packet_cnt):
-    if self.available_packet_slots < packet_cnt: self._wait_queue(packet_cnt)
-
-    tail_blit_packets = min((self.queue_base + self.queue_size - self.write_addr) // AQL_PACKET_SIZE, packet_cnt)
-    rem_packet_cnt = packet_cnt - tail_blit_packets
-    ctypes.memmove(self.write_addr, packet_addr, AQL_PACKET_SIZE * tail_blit_packets)
-    if rem_packet_cnt > 0: ctypes.memmove(self.queue_base, packet_addr + AQL_PACKET_SIZE * tail_blit_packets, AQL_PACKET_SIZE * rem_packet_cnt)
-
-    self._submit_packet(packet_cnt)
-
-  def wait(self):
-    self.submit_barrier([], finish_signal := self.device.alloc_signal(reusable=True))
-    hsa.hsa_signal_wait_scacquire(finish_signal, hsa.HSA_SIGNAL_CONDITION_LT, 1, (1 << 64) - 1, hsa.HSA_WAIT_STATE_ACTIVE)
-    self.available_packet_slots = self.queue_size // AQL_PACKET_SIZE
-
-  def _wait_queue(self, need_packets=1):
-    while self.available_packet_slots < need_packets:
-      rindex = hsa.hsa_queue_load_read_index_relaxed(self.hw_queue)
-      self.available_packet_slots = self.queue_size // AQL_PACKET_SIZE - (self.next_doorbell_index - rindex)
-
-  def _submit_packet(self, cnt=1):
-    self.available_packet_slots -= cnt
-    self.next_doorbell_index += cnt
-    hsa.hsa_queue_store_write_index_relaxed(self.hw_queue, self.next_doorbell_index)
-    hsa.hsa_signal_store_screlease(self.hw_queue.contents.doorbell_signal, self.next_doorbell_index-1)
-
-    self.write_addr += AQL_PACKET_SIZE * cnt
-    if self.write_addr > self.write_addr_end:
-      self.write_addr = self.queue_base + (self.write_addr - self.queue_base) % self.queue_size
-
-def scan_agents():
-  agents = collections.defaultdict(list)
-
-  @ctypes.CFUNCTYPE(hsa.hsa_status_t, hsa.hsa_agent_t, ctypes.c_void_p)
-  def __scan_agents(agent, data):
-    status = hsa.hsa_agent_get_info(agent, hsa.HSA_AGENT_INFO_DEVICE, ctypes.byref(device_type := hsa.hsa_device_type_t()))
-    if status == 0: agents[device_type.value].append(agent)
-    return hsa.HSA_STATUS_SUCCESS
-
-  hsa.hsa_iterate_agents(__scan_agents, None)
-  return agents
-
-def find_memory_pool(agent, segtyp=-1, location=-1):
-  @ctypes.CFUNCTYPE(hsa.hsa_status_t, hsa.hsa_amd_memory_pool_t, ctypes.c_void_p)
-  def __filter_amd_memory_pools(mem_pool, data):
-    check(hsa.hsa_amd_memory_pool_get_info(mem_pool, hsa.HSA_AMD_MEMORY_POOL_INFO_SEGMENT, ctypes.byref(segment := hsa.hsa_amd_segment_t())))
-    if segtyp >= 0 and segment.value != segtyp: return hsa.HSA_STATUS_SUCCESS
-
-    check(hsa.hsa_amd_memory_pool_get_info(mem_pool, hsa.HSA_AMD_MEMORY_POOL_INFO_LOCATION, ctypes.byref(loc:=hsa.hsa_amd_memory_pool_location_t())))
-    if location >= 0 and loc.value != location: return hsa.HSA_STATUS_SUCCESS
-
-    check(hsa.hsa_amd_memory_pool_get_info(mem_pool, hsa.HSA_AMD_MEMORY_POOL_INFO_SIZE, ctypes.byref(sz := ctypes.c_size_t())))
-    if sz.value == 0: return hsa.HSA_STATUS_SUCCESS
-
-    ret = ctypes.cast(data, ctypes.POINTER(hsa.hsa_amd_memory_pool_t))
-    ret[0] = mem_pool
-    return hsa.HSA_STATUS_INFO_BREAK
-
-  hsa.hsa_amd_agent_iterate_memory_pools(agent, __filter_amd_memory_pools, ctypes.byref(region := hsa.hsa_amd_memory_pool_t()))
-  return region
--- a/extra/backends/hsa_graph.py
+++ b/extra/backends/hsa_graph.py
@ -1,171 +0,0 @@
-import ctypes, collections, time, itertools
-from typing import List, Any, Dict, cast, Optional, Tuple
-from tinygrad.helpers import init_c_var, round_up
-from tinygrad.device import Buffer, BufferSpec
-from tinygrad.device import Compiled, Device
-from tinygrad.uop.ops import Variable
-from tinygrad.runtime.ops_hsa import HSADevice, PROFILE, Profiler
-from tinygrad.engine.realize import ExecItem, BufferXfer, CompiledRunner
-from tinygrad.engine.jit import MultiGraphRunner, GraphException
-import tinygrad.runtime.autogen.hsa as hsa
-from tinygrad.runtime.support.hsa import check, AQLQueue, AQL_PACKET_SIZE, EMPTY_SIGNAL
-
-def dedup_signals(signals): return [hsa.hsa_signal_t(hndl) for hndl in set([x.handle for x in signals if isinstance(x, hsa.hsa_signal_t)])]
-
-class VirtAQLQueue(AQLQueue):
-  def __init__(self, device, sz):
-    self.device = device
-    self.virt_queue = (hsa.hsa_kernel_dispatch_packet_t * sz)()
-    self.queue_base = self.write_addr = ctypes.addressof(self.virt_queue)
-    self.packets_count = 0
-    self.available_packet_slots = sz
-  def _wait_queue(self, need_packets=1): assert False, f"VirtQueue is too small to handle {self.packets_count+need_packets} packets!"
-  def _submit_packet(self):
-    self.write_addr += AQL_PACKET_SIZE
-    self.packets_count += 1
-    self.available_packet_slots -= 1
-
-class HSAGraph(MultiGraphRunner):
-  def __init__(self, jit_cache: List[ExecItem], input_rawbuffers: List[Buffer], var_vals: Dict[str, int]):
-    super().__init__(jit_cache, input_rawbuffers, var_vals)
-
-    # Check all jit items are compatible.
-    compiled_devices = set()
-    for ji in self.jit_cache:
-      if isinstance(ji.prg, CompiledRunner): compiled_devices.add(ji.prg.dev)
-      elif isinstance(ji.prg, BufferXfer):
-        for x in ji.bufs[0:2]: compiled_devices.add(Device[cast(Buffer, x).device])
-      else: raise GraphException
-    if any(not isinstance(d, HSADevice) for d in compiled_devices): raise GraphException
-
-    self.devices: List[HSADevice] = list(compiled_devices) #type:ignore
-
-    # Allocate kernel args.
-    kernargs_size: Dict[Compiled, int] = collections.defaultdict(int)
-    for ji in self.jit_cache:
-      if isinstance(ji.prg, CompiledRunner): kernargs_size[ji.prg.dev] += round_up(ctypes.sizeof(ji.prg._prg.args_struct_t), 16)
-    kernargs_ptrs: Dict[Compiled, int] = {dev:dev.allocator._alloc(sz, BufferSpec()) for dev,sz in kernargs_size.items()}
-
-    # Fill initial arguments.
-    self.ji_kargs_structs: Dict[int, ctypes.Structure] = {}
-    for j,ji in enumerate(self.jit_cache):
-      if not isinstance(ji.prg, CompiledRunner): continue
-      self.ji_kargs_structs[j] = ji.prg._prg.args_struct_t.from_address(kernargs_ptrs[ji.prg.dev])
-      kernargs_ptrs[ji.prg.dev] += round_up(ctypes.sizeof(ji.prg._prg.args_struct_t), 16)
-      for i in range(len(ji.bufs)): self.ji_kargs_structs[j].__setattr__(f'f{i}', cast(Buffer, ji.bufs[i])._buf)
-      for i in range(len(ji.prg.p.vars)): self.ji_kargs_structs[j].__setattr__(f'v{i}', var_vals[ji.prg.p.vars[i].expr])
-
-    # Build queues.
-    self.virt_aql_queues: Dict[Compiled, VirtAQLQueue] = {dev:VirtAQLQueue(dev, 2*len(self.jit_cache)+16) for dev in self.devices}
-    self.packets = {}
-    self.transfers = []
-    self.ji_to_transfer: Dict[int, int] = {} # faster to store transfers as list and update using this mapping table.
-    self.signals_to_reset: List[hsa.hsa_signal_t] = []
-    self.signals_to_devices: Dict[ctypes.c_uint64, List[HSADevice]] = {}
-    self.profile_info: Dict[Compiled, List[Tuple[Any, ...]]] = collections.defaultdict(list)
-
-    # Special packet to wait for the world.
-    self.kickoff_signals: Dict[HSADevice, hsa.hsa_signal_t] = {dev:self.alloc_signal(reset_on_start=True) for dev in self.devices}
-    for dev in self.devices: self.virt_aql_queues[dev].submit_barrier([], self.kickoff_signals[dev])
-
-    for j,ji in enumerate(self.jit_cache):
-      if isinstance(ji.prg, CompiledRunner):
-        wait_signals = self.access_resources(ji.bufs, ji.prg.p.outs, new_dependency=j, sync_with_aql_packets=False)
-        for i in range(0, len(wait_signals), 5):
-          self.virt_aql_queues[ji.prg.dev].submit_barrier(wait_signals[i:i+5])
-        self.packets[j] = hsa.hsa_kernel_dispatch_packet_t.from_address(self.virt_aql_queues[ji.prg.dev].write_addr)
-
-        sync_signal = self.alloc_signal(reset_on_start=True) if PROFILE else None
-        self.virt_aql_queues[ji.prg.dev].submit_kernel(ji.prg._prg, *ji.prg.p.launch_dims(var_vals), #type:ignore
-                                                          ctypes.addressof(self.ji_kargs_structs[j]), completion_signal=sync_signal)
-        if PROFILE: self.profile_info[ji.prg.dev].append((sync_signal, ji.prg._prg.name, False))
-      elif isinstance(ji.prg, BufferXfer):
-        dest, src = [cast(Buffer, x) for x in ji.bufs[0:2]]
-        dest_dev, src_dev = cast(HSADevice, Device[dest.device]), cast(HSADevice, Device[src.device])
-        sync_signal = self.alloc_signal(reset_on_start=True, wait_on=[dest_dev, src_dev])
-
-        wait_signals = self.access_resources([dest, src], write=[0], new_dependency=sync_signal, sync_with_aql_packets=True)
-        self.transfers.append([dest._buf, dest_dev.agent, src._buf, src_dev.agent, dest.nbytes, len(wait_signals),
-                              (hsa.hsa_signal_t*len(wait_signals))(*wait_signals), sync_signal, hsa.HSA_AMD_SDMA_ENGINE_0, True])
-        self.ji_to_transfer[j] = len(self.transfers) - 1
-        if PROFILE: self.profile_info[src_dev].append((sync_signal, f"transfer: HSA:{src_dev.device_id} -> HSA:{dest_dev.device_id}", True))
-
-    # Wait for all active signals to finish the graph
-    wait_signals_to_finish: Dict[HSADevice, List[hsa.hsa_signal_t]] = collections.defaultdict(list)
-    for v in dedup_signals(list(self.w_dependency_map.values()) + list(itertools.chain.from_iterable(self.r_dependency_map.values()))):
-      for dev in self.signals_to_devices[v.handle]:
-        wait_signals_to_finish[dev].append(v)
-
-    self.finish_signal = init_c_var(hsa.hsa_signal_t(), lambda x: check(hsa.hsa_amd_signal_create(1, 0, None, 0, ctypes.byref(x))))
-    for dev in self.devices:
-      wait_signals = wait_signals_to_finish[dev]
-      for i in range(0, max(1, len(wait_signals)), 5):
-        self.virt_aql_queues[dev].submit_barrier(wait_signals[i:i+5], completion_signal=self.finish_signal if i+5>=len(wait_signals) else None)
-
-    # Zero signals to allow graph to start and execute.
-    for sig in self.signals_to_reset: hsa.hsa_signal_silent_store_relaxed(sig, 0)
-    hsa.hsa_signal_silent_store_relaxed(self.finish_signal, 0)
-
-  def __call__(self, input_rawbuffers: List[Buffer], var_vals: Dict[str, int], wait=False) -> Optional[float]:
-    # Wait and restore signals
-    hsa.hsa_signal_wait_scacquire(self.finish_signal, hsa.HSA_SIGNAL_CONDITION_LT, 1, (1 << 64) - 1, hsa.HSA_WAIT_STATE_ACTIVE)
-    for sig in self.signals_to_reset: hsa.hsa_signal_silent_store_relaxed(sig, 1)
-    hsa.hsa_signal_silent_store_relaxed(self.finish_signal, len(self.devices))
-
-    # Update rawbuffers
-    for (j,i),input_idx in self.input_replace.items():
-      if j in self.ji_kargs_structs:
-        self.ji_kargs_structs[j].__setattr__(f'f{i}', input_rawbuffers[input_idx]._buf)
-      else:
-        if i == 0: self.transfers[self.ji_to_transfer[j]][0] = input_rawbuffers[input_idx]._buf # dest
-        elif i == 1: self.transfers[self.ji_to_transfer[j]][2] = input_rawbuffers[input_idx]._buf # src
-
-    # Update var_vals
-    for j in self.jc_idx_with_updatable_var_vals:
-      for i,v in enumerate(cast(CompiledRunner, self.jit_cache[j].prg).p.vars):
-        self.ji_kargs_structs[j].__setattr__(f'v{i}', var_vals[v.expr])
-
-    # Update launch dims
-    for j in self.jc_idx_with_updatable_launch_dims:
-      gl, lc = cast(CompiledRunner, self.jit_cache[j].prg).p.launch_dims(var_vals)
-      self.packets[j].workgroup_size_x = lc[0]
-      self.packets[j].workgroup_size_y = lc[1]
-      self.packets[j].workgroup_size_z = lc[2]
-      self.packets[j].grid_size_x = gl[0] * lc[0]
-      self.packets[j].grid_size_y = gl[1] * lc[1]
-      self.packets[j].grid_size_z = gl[2] * lc[2]
-
-    for dev in self.devices:
-      dev.flush_hdp()
-      dev.hw_queue.blit_packets(self.virt_aql_queues[dev].queue_base, self.virt_aql_queues[dev].packets_count)
-
-    for transfer_data in self.transfers:
-      check(hsa.hsa_amd_memory_async_copy_on_engine(*transfer_data))
-
-    et = None
-    if wait:
-      st = time.perf_counter()
-      hsa.hsa_signal_wait_scacquire(self.finish_signal, hsa.HSA_SIGNAL_CONDITION_LT, 1, (1 << 64) - 1, hsa.HSA_WAIT_STATE_ACTIVE)
-      et = time.perf_counter() - st
-
-    for profdev,profdata in self.profile_info.items(): Profiler.tracked_signals[profdev] += profdata
-    return et
-
-  def alloc_signal(self, reset_on_start=False, wait_on=None):
-    sync_signal = init_c_var(hsa.hsa_signal_t(), lambda x: check(hsa.hsa_amd_signal_create(1, 0, None, 0, ctypes.byref(x))))
-    if reset_on_start: self.signals_to_reset.append(sync_signal)
-    if wait_on is not None: self.signals_to_devices[sync_signal.handle] = wait_on
-    return sync_signal
-
-  def dependency_as_signal(self, dep, sync_with_aql_packets) -> Optional[hsa.hsa_signal_t]:
-    if isinstance(dep, hsa.hsa_signal_t): return dep
-    elif sync_with_aql_packets and isinstance(packet := self.packets.get(dep), hsa.hsa_kernel_dispatch_packet_t):
-      if packet.completion_signal.handle == EMPTY_SIGNAL.handle: packet.completion_signal = self.alloc_signal(reset_on_start=True)
-      return packet.completion_signal
-    return None
-
-  def access_resources(self, rawbufs, write, new_dependency, sync_with_aql_packets=False):
-    rdeps = self._access_resources(rawbufs, write, new_dependency)
-    wait_signals = [self.dependency_as_signal(dep, sync_with_aql_packets=sync_with_aql_packets) for dep in rdeps]
-    if sync_with_aql_packets: wait_signals += [self.kickoff_signals[cast(HSADevice, Device[rawbuf.device])] for rawbuf in rawbufs]
-    return dedup_signals(wait_signals)
--- a/extra/backends/ops_hsa.py
+++ b/extra/backends/ops_hsa.py
@ -1,275 +0,0 @@
-from __future__ import annotations
-import ctypes, functools, subprocess, io, atexit, collections, json
-from typing import Tuple, TypeVar, List, Dict, Any
-import tinygrad.runtime.autogen.hsa as hsa
-from tinygrad.helpers import DEBUG, init_c_var, from_mv, round_up, to_mv, init_c_struct_t, getenv, PROFILE
-from tinygrad.device import Compiled, Compiler, CompileError, BufferSpec, LRUAllocator
-from tinygrad.renderer.cstyle import HIPRenderer
-from tinygrad.runtime.support.hsa import check, scan_agents, find_memory_pool, AQLQueue
-from tinygrad.runtime.support.hip_comgr import compile_hip
-if getenv("IOCTL"): import extra.hip_gpu_driver.hip_ioctl  # noqa: F401
-
-class HSAProfiler:
-  def __init__(self):
-    self.tracked_signals = collections.defaultdict(list)
-    self.collected_events: List[Tuple[Any, ...]] = []
-    self.copy_timings = hsa.hsa_amd_profiling_async_copy_time_t()
-    self.disp_timings = hsa.hsa_amd_profiling_dispatch_time_t()
-
-  def track(self, signal, device, name, is_copy=False): self.tracked_signals[device].append((signal, name, is_copy))
-  def process(self, device):
-    # Process all tracked signals, should be called before any of tracked signals are reused.
-    for sig,name,is_copy in self.tracked_signals[device]:
-      if is_copy: check(hsa.hsa_amd_profiling_get_async_copy_time(sig, ctypes.byref(timings :=  self.copy_timings)))
-      else: check(hsa.hsa_amd_profiling_get_dispatch_time(device.agent, sig, ctypes.byref(timings := self.disp_timings))) #type:ignore
-      self.collected_events.append((device.device_id, 1 if is_copy else 0, name, timings.start, timings.end))
-    self.tracked_signals.pop(device)
-
-  def save(self, path):
-    mjson = []
-    for i in range(len(HSADevice.devices)):
-      mjson.append({"name": "process_name", "ph": "M", "pid": i, "args": {"name": "HSA"}})
-      mjson.append({"name": "thread_name", "ph": "M", "pid": i, "tid": 0, "args": {"name": "AQL"}})
-      mjson.append({"name": "thread_name", "ph": "M", "pid": i, "tid": 1, "args": {"name": "SDMA"}})
-
-    for dev_id,queue_id,name,st,et in self.collected_events:
-      mjson.append({"name": name, "ph": "B", "pid": dev_id, "tid": queue_id, "ts": st*1e-3})
-      mjson.append({"name": name, "ph": "E", "pid": dev_id, "tid": queue_id, "ts": et*1e-3})
-    with open(path, "w") as f: f.write(json.dumps({"traceEvents": mjson}))
-    print(f"Saved HSA profile to {path}")
-Profiler = HSAProfiler()
-
-class HSACompiler(Compiler):
-  def __init__(self, arch:str):
-    self.arch = arch
-    super().__init__(f"compile_hip_{self.arch}")
-  def compile(self, src:str) -> bytes:
-    try: return compile_hip(src, self.arch)
-    except RuntimeError as e: raise CompileError(e)
-
-class HSAProgram:
-  def __init__(self, device:HSADevice, name:str, lib:bytes):
-    self.device, self.name, self.lib = device, name, lib
-
-    if DEBUG >= 6:
-      asm = subprocess.check_output(["/opt/rocm/llvm/bin/llvm-objdump", '-d', '-'], input=lib)
-      print('\n'.join([x for x in asm.decode('utf-8').split("\n") if 's_code_end' not in x]))
-
-    self.exec = init_c_var(hsa.hsa_executable_t(), lambda x: check(hsa.hsa_executable_create_alt(hsa.HSA_PROFILE_FULL, hsa.HSA_DEFAULT_FLOAT_ROUNDING_MODE_DEFAULT, None, ctypes.byref(x)))) # noqa: E501
-    self.code_reader = init_c_var(hsa.hsa_code_object_reader_t(),
-                                  lambda x: check(hsa.hsa_code_object_reader_create_from_memory(lib, len(lib), ctypes.byref(x))))
-    check(hsa.hsa_executable_load_agent_code_object(self.exec, self.device.agent, self.code_reader, None, None))
-    check(hsa.hsa_executable_freeze(self.exec, None))
-
-    self.kernel = init_c_var(hsa.hsa_executable_symbol_t(), lambda x: check(hsa.hsa_executable_get_symbol_by_name(self.exec, (name+".kd").encode("utf-8"), ctypes.byref(self.device.agent), ctypes.byref(x)))) # noqa: E501
-    self.handle = init_c_var(ctypes.c_uint64(), lambda x: check(hsa.hsa_executable_symbol_get_info(self.kernel, hsa.HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, ctypes.byref(x)))) # noqa: E501
-    self.kernargs_segment_size = init_c_var(ctypes.c_uint32(), lambda x: check(hsa.hsa_executable_symbol_get_info(self.kernel, hsa.HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE, ctypes.byref(x)))).value # noqa: E501
-    self.group_segment_size = init_c_var(ctypes.c_uint32(), lambda x: check(hsa.hsa_executable_symbol_get_info(self.kernel, hsa.HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE, ctypes.byref(x)))).value # noqa: E501
-    self.private_segment_size = init_c_var(ctypes.c_uint32(), lambda x: check(hsa.hsa_executable_symbol_get_info(self.kernel, hsa.HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE, ctypes.byref(x)))).value # noqa: E501
-
-  def __del__(self):
-    self.device.synchronize()
-    if hasattr(self, 'code_reader'): check(hsa.hsa_code_object_reader_destroy(self.code_reader))
-    if hasattr(self, 'exec'): check(hsa.hsa_executable_destroy(self.exec))
-
-  def __call__(self, *args, global_size:Tuple[int,int,int]=(1,1,1), local_size:Tuple[int,int,int]=(1,1,1), vals:Tuple[int, ...]=(), wait=False):
-    if not hasattr(self, "args_struct_t"):
-      self.args_struct_t = init_c_struct_t(tuple([(f'f{i}', ctypes.c_void_p) for i in range(len(args))] +
-                                                 [(f'v{i}', ctypes.c_int) for i in range(len(vals))]))
-      if ctypes.sizeof(self.args_struct_t) != self.kernargs_segment_size:
-        raise RuntimeError(f"HSAProgram.__call__: incorrect args struct size {ctypes.sizeof(self.args_struct_t)} != {self.kernargs_segment_size}")
-
-    kernargs = None
-    if self.kernargs_segment_size > 0:
-      kernargs = self.device.alloc_kernargs(self.kernargs_segment_size)
-      args_st = self.args_struct_t.from_address(kernargs)
-      for i in range(len(args)): args_st.__setattr__(f'f{i}', args[i])
-      for i in range(len(vals)): args_st.__setattr__(f'v{i}', vals[i])
-      self.device.flush_hdp()
-
-    signal = self.device.alloc_signal(reusable=True) if wait or PROFILE else None
-    self.device.hw_queue.submit_kernel(self, global_size, local_size, kernargs, completion_signal=signal)
-    if PROFILE: Profiler.track(signal, self.device, self.name)
-    if wait:
-      hsa.hsa_signal_wait_scacquire(signal, hsa.HSA_SIGNAL_CONDITION_LT, 1, (1 << 64) - 1, hsa.HSA_WAIT_STATE_ACTIVE)
-      check(hsa.hsa_amd_profiling_get_dispatch_time(self.device.agent, signal, ctypes.byref(timings := hsa.hsa_amd_profiling_dispatch_time_t())))
-      return (timings.end - timings.start) * self.device.clocks_to_time
-
-T = TypeVar("T")
-CHUNK_SIZE, PAGE_SIZE = 256*1024*1024, 0x1000
-class HSAAllocator(LRUAllocator):
-  def __init__(self, device:HSADevice):
-    self.device = device
-    super().__init__()
-
-  def _alloc(self, size:int, options:BufferSpec):
-    if options.host:
-      check(hsa.hsa_amd_memory_pool_allocate(HSADevice.cpu_mempool, size, 0, ctypes.byref(mem := ctypes.c_void_p())))
-      check(hsa.hsa_amd_agents_allow_access(2, (hsa.hsa_agent_t*2)(HSADevice.cpu_agent, self.device.agent), None, mem))
-      return mem.value
-    c_agents = (hsa.hsa_agent_t * len(HSADevice.agents[hsa.HSA_DEVICE_TYPE_GPU]))(*HSADevice.agents[hsa.HSA_DEVICE_TYPE_GPU])
-    check(hsa.hsa_amd_memory_pool_allocate(self.device.gpu_mempool, size, 0, ctypes.byref(buf := ctypes.c_void_p())))
-    check(hsa.hsa_amd_agents_allow_access(len(HSADevice.agents[hsa.HSA_DEVICE_TYPE_GPU]), c_agents, None, buf))
-    return buf.value
-
-  def _free(self, opaque:T, options:BufferSpec):
-    HSADevice.synchronize_system()
-    check(hsa.hsa_amd_memory_pool_free(opaque))
-
-  def _copyin(self, dest:T, src: memoryview):
-    # Async copyin sync model uses barriers on the main hw queue, since barriers are guaranteed to execute in order with all other packets.
-    self.device.hw_queue.submit_barrier([], sync_signal := self.device.alloc_signal(reusable=True))
-    mem = self._alloc(src.nbytes, BufferSpec(host=True))
-    ctypes.memmove(mem, from_mv(src), src.nbytes)
-    check(hsa.hsa_amd_memory_async_copy_on_engine(dest, self.device.agent, mem, HSADevice.cpu_agent, src.nbytes, 1, ctypes.byref(sync_signal),
-                                                  copy_signal := self.device.alloc_signal(reusable=True), hsa.HSA_AMD_SDMA_ENGINE_0, True))
-    self.device.hw_queue.submit_barrier([copy_signal])
-    self.device.delayed_free.append(mem)
-    if PROFILE: Profiler.track(copy_signal, self.device, f"copyin: CPU -> HSA:{self.device.device_id}", is_copy=True)
-
-  def copy_from_fd(self, dest, fd, offset, size):
-    self.device.hw_queue.submit_barrier([], sync_signal := self.device.alloc_signal(reusable=True))
-
-    if not hasattr(self, 'hb'):
-      self.hb = [self._alloc(CHUNK_SIZE, BufferSpec(host=True)) for _ in range(2)]
-      self.hb_signals = [self.device.alloc_signal(reusable=False) for _ in range(2)]
-      self.hb_polarity = 0
-      self.sdma = [hsa.HSA_AMD_SDMA_ENGINE_0, hsa.HSA_AMD_SDMA_ENGINE_1]
-      for sig in self.hb_signals: hsa.hsa_signal_store_relaxed(sig, 0)
-
-    fo = io.FileIO(fd, "a+b", closefd=False)
-    fo.seek(offset - (minor_offset:=offset % PAGE_SIZE))
-
-    copies_called = 0
-    copied_in = 0
-    for local_offset in range(0, size+minor_offset, CHUNK_SIZE):
-      local_size = min(round_up(size+minor_offset, PAGE_SIZE)-local_offset, CHUNK_SIZE)
-      copy_size = min(local_size-minor_offset, size-copied_in)
-      if copy_size == 0: break
-
-      hsa.hsa_signal_wait_scacquire(self.hb_signals[self.hb_polarity], hsa.HSA_SIGNAL_CONDITION_LT, 1, (1 << 64) - 1, hsa.HSA_WAIT_STATE_ACTIVE)
-      self.device.reusable_signals.append(self.hb_signals[self.hb_polarity]) # it's free now and can be reused
-      self.hb_signals[self.hb_polarity] = self.device.alloc_signal(reusable=False)
-
-      fo.readinto(to_mv(self.hb[self.hb_polarity], local_size))
-      check(hsa.hsa_amd_memory_async_copy_on_engine(dest+copied_in, self.device.agent, self.hb[self.hb_polarity]+minor_offset, HSADevice.cpu_agent,
-                                                    copy_size, 1, ctypes.byref(sync_signal), self.hb_signals[self.hb_polarity],
-                                                    self.sdma[self.hb_polarity], True))
-      copied_in += copy_size
-      self.hb_polarity = (self.hb_polarity + 1) % len(self.hb)
-      minor_offset = 0 # only on the first
-      copies_called += 1
-
-    wait_signals = [self.hb_signals[self.hb_polarity - 1]]
-    if copies_called > 1: wait_signals.append(self.hb_signals[self.hb_polarity])
-    self.device.hw_queue.submit_barrier(wait_signals)
-
-  def _copyout(self, dest:memoryview, src:T):
-    HSADevice.synchronize_system()
-    copy_signal = self.device.alloc_signal(reusable=True)
-    c_agents = (hsa.hsa_agent_t*2)(self.device.agent, HSADevice.cpu_agent)
-    check(hsa.hsa_amd_memory_lock_to_pool(from_mv(dest), dest.nbytes, c_agents, 2, HSADevice.cpu_mempool, 0, ctypes.byref(addr:=ctypes.c_void_p())))
-    check(hsa.hsa_amd_memory_async_copy(addr, HSADevice.cpu_agent, src, self.device.agent, dest.nbytes, 0, None, copy_signal))
-    hsa.hsa_signal_wait_scacquire(copy_signal, hsa.HSA_SIGNAL_CONDITION_LT, 1, (1 << 64) - 1, hsa.HSA_WAIT_STATE_ACTIVE)
-    check(hsa.hsa_amd_memory_unlock(from_mv(dest)))
-    if PROFILE: Profiler.track(copy_signal, self.device, f"copyout: HSA:{self.device.device_id} -> CPU", is_copy=True)
-
-  def transfer(self, dest:T, src:T, sz:int, src_dev=None, dest_dev=None):
-    src_dev.hw_queue.submit_barrier([], sync_signal_1 := src_dev.alloc_signal(reusable=True))
-    dest_dev.hw_queue.submit_barrier([], sync_signal_2 := dest_dev.alloc_signal(reusable=True))
-    c_wait_signal = (hsa.hsa_signal_t*2)(sync_signal_1, sync_signal_2)
-    check(hsa.hsa_amd_memory_async_copy_on_engine(dest, dest_dev.agent, src, src_dev.agent, sz, 2, c_wait_signal,
-                                                  copy_signal := dest_dev.alloc_signal(reusable=False), hsa.HSA_AMD_SDMA_ENGINE_0, True))
-    src_dev.hw_queue.submit_barrier([copy_signal])
-    dest_dev.hw_queue.submit_barrier([copy_signal])
-    if PROFILE: Profiler.track(copy_signal, src_dev, f"transfer: HSA:{src_dev.device_id} -> HSA:{dest_dev.device_id}", is_copy=True)
-
-class HSADevice(Compiled):
-  devices: List[HSADevice] = []
-  agents: Dict[int, List[hsa.hsa_agent_t]] = {}
-  cpu_agent: hsa.hsa_agent_t
-  cpu_mempool: hsa.hsa_amd_memory_pool_t
-  def __init__(self, device:str=""):
-    if not HSADevice.agents:
-      check(hsa.hsa_init())
-      atexit.register(hsa_terminate)
-      HSADevice.agents = scan_agents()
-      HSADevice.cpu_agent = HSADevice.agents[hsa.HSA_DEVICE_TYPE_CPU][0]
-      HSADevice.cpu_mempool = find_memory_pool(HSADevice.cpu_agent, segtyp=hsa.HSA_AMD_SEGMENT_GLOBAL, location=hsa.HSA_AMD_MEMORY_POOL_LOCATION_CPU)
-      if PROFILE: check(hsa.hsa_amd_profiling_async_copy_enable(1))
-
-    self.device_id = int(device.split(":")[1]) if ":" in device else 0
-    self.agent = HSADevice.agents[hsa.HSA_DEVICE_TYPE_GPU][self.device_id]
-    self.gpu_mempool = find_memory_pool(self.agent, segtyp=hsa.HSA_AMD_SEGMENT_GLOBAL, location=hsa.HSA_AMD_MEMORY_POOL_LOCATION_GPU)
-    self.hw_queue = AQLQueue(self)
-    HSADevice.devices.append(self)
-
-    check(hsa.hsa_agent_get_info(self.agent, hsa.HSA_AGENT_INFO_NAME, ctypes.byref(agent_name_buf := ctypes.create_string_buffer(256))))
-    self.arch = ctypes.string_at(agent_name_buf).decode()
-
-    check(hsa.hsa_system_get_info(hsa.HSA_SYSTEM_INFO_TIMESTAMP_FREQUENCY, ctypes.byref(gpu_freq := ctypes.c_uint64())))
-    self.clocks_to_time: float = 1 / gpu_freq.value
-
-    check(hsa.hsa_agent_get_info(self.agent, hsa.HSA_AMD_AGENT_INFO_HDP_FLUSH, ctypes.byref(hdp_flush := hsa.hsa_amd_hdp_flush_t())))
-    self.hdp_flush = hdp_flush
-
-    self.delayed_free: List[int] = []
-    self.reusable_signals: List[hsa.hsa_signal_t] = []
-
-    from tinygrad.runtime.graph.hsa import HSAGraph
-    super().__init__(device, HSAAllocator(self), HIPRenderer(), HSACompiler(self.arch), functools.partial(HSAProgram, self), HSAGraph)
-
-    # Finish init: preallocate some signals + space for kernargs
-    self.signal_pool = [init_c_var(hsa.hsa_signal_t(), lambda x: check(hsa.hsa_signal_create(1, 0, None, ctypes.byref(x)))) for _ in range(4096)]
-    self._new_kernargs_region(16 << 20) # initial region size is 16mb
-
-  def synchronize(self):
-    self.hw_queue.wait()
-
-    for sig in self.reusable_signals: hsa.hsa_signal_silent_store_relaxed(sig, 1)
-    self.signal_pool.extend(self.reusable_signals)
-    self.reusable_signals.clear()
-
-    for opaque_to_free in self.delayed_free: check(hsa.hsa_amd_memory_pool_free(opaque_to_free))
-    self.delayed_free.clear()
-
-    self.kernarg_next_addr = self.kernarg_start_addr
-    Profiler.process(self)
-
-  @staticmethod
-  def synchronize_system():
-    for d in HSADevice.devices: d.synchronize()
-
-  def alloc_signal(self, reusable=False):
-    if len(self.signal_pool): signal = self.signal_pool.pop()
-    else: check(hsa.hsa_amd_signal_create(1, 0, None, 0, ctypes.byref(signal := hsa.hsa_signal_t())))
-
-    # reusable means a signal could be reused after synchronize for the device it's allocated from is called.
-    if reusable: self.reusable_signals.append(signal)
-    return signal
-
-  def alloc_kernargs(self, sz):
-    if self.kernarg_next_addr + sz >= self.kernarg_start_addr + self.kernarg_pool_sz: self._new_kernargs_region(int(self.kernarg_pool_sz * 2))
-    result = self.kernarg_next_addr
-    self.kernarg_next_addr = round_up(self.kernarg_next_addr + sz, 16)
-    return result
-
-  def _new_kernargs_region(self, sz:int):
-    if hasattr(self, 'kernarg_start_addr'): self.delayed_free.append(self.kernarg_start_addr)
-    self.kernarg_start_addr: int = self.allocator._alloc(sz, BufferSpec())
-    self.kernarg_next_addr = self.kernarg_start_addr
-    self.kernarg_pool_sz: int = sz
-
-  def flush_hdp(self): self.hdp_flush.HDP_MEM_FLUSH_CNTL[0] = 1
-
-def hsa_terminate():
-  # Need to stop/delete aql queue before hsa shut down, this leads to gpu hangs.
-  for dev in HSADevice.devices:
-    Profiler.process(dev)
-    del dev.hw_queue
-
-  # hsa_shut_down cleans up all hsa-related resources.
-  hsa.hsa_shut_down()
-  HSADevice.synchronize = lambda: None #type:ignore
-  HSAProgram.__del__ = lambda _: None #type:ignore
-  if Profiler.collected_events: Profiler.save("/tmp/profile.json")
--- a/extra/backends/rdna.py
+++ b/extra/backends/rdna.py
@ -1,127 +0,0 @@
-from typing import Dict, Set
-import yaml
-from tinygrad.codegen.uops import UOpGraph, UOps, UOp
-from tinygrad.uop.ops import BinaryOps
-from tinygrad.dtype import dtypes
-
-def uops_to_rdna(function_name:str, uops:UOpGraph) -> str:
-  replace: Dict[UOp, UOp] = {}
-  seen: Set[UOp] = set()
-  for u in uops:
-    if u in seen: continue
-    seen.add(u)
-    for o,n in replace.items():
-      if o in u.vin and u is not n:
-        u.vin = tuple(n if x == o else x for x in u.vin)
-    # pointer indexing
-    if u.uop in {UOps.LOAD, UOps.STORE} and u.vin[0].dtype.itemsize > 1:
-      val = UOp(UOps.CONST, dtypes.int, tuple(), arg=u.vin[0].dtype.itemsize, insert_at=uops.uops.index(u))
-      ptr = UOp(UOps.ALU, dtypes.int, (u.vin[1], val), arg=BinaryOps.MUL, insert_at=uops.uops.index(u))
-      u.vin = (u.vin[0], ptr) + u.vin[2:]
-  #uops.print()
-
-  args = []
-  ins = []
-
-  v_cnt = 3  # v[0:2] is local_xyz
-  s_cnt = 5  # s[0:1] is the address, s[2:4] is global_xyz
-
-  r: Dict[UOp, str] = {}
-  for u in uops:
-    if u.uop == UOps.SPECIAL:
-      if u.arg.startswith("lidx"):
-        r[u] = f'v{u.src[0].arg}'
-      elif u.arg.startswith("gidx"):
-        r[u] = f's{2+u.src[0].arg}'
-      else:
-        raise NotImplementedError
-    elif u.uop == UOps.CONST:
-      #r[u] = u.arg
-
-      # TODO: sometimes we can use s
-      #r[u] = f"s{s_cnt}"
-      #s_cnt += 1
-      #ins.append(f"s_mov_b32 {r[u]}, {u.arg}")
-
-      r[u] = f"v{v_cnt}"
-      v_cnt += 1
-      ins.append(f"v_mov_b32 {r[u]}, {u.arg}")
-    elif u.uop == UOps.ALU:
-      if u.arg == BinaryOps.ADD:
-        r[u] = f"v{v_cnt}"
-        v_cnt += 1
-        ins.append(f"v_add_f32_e32 {r[u]}, {r[u.vin[0]]}, {r[u.vin[1]]}")
-      elif u.arg == BinaryOps.MUL:
-        r[u] = f"v{v_cnt}"
-        v_cnt += 1
-        if dtypes.is_float(u.dtype):
-          ins.append(f"v_mul_f32_e32 {r[u]}, {r[u.vin[0]]}, {r[u.vin[1]]}")
-        else:
-          ins.append(f"v_mul_u32_u24 {r[u]}, {r[u.vin[0]]}, {r[u.vin[1]]}")
-      else:
-        raise NotImplementedError
-    elif u.uop == UOps.LOAD:
-      r[u] = f"v{v_cnt}"
-      v_cnt += 1
-      ins.append(f"global_load_b32 {r[u]}, {r[u.vin[1]]}, {r[u.vin[0]]}")
-      ins.append("s_waitcnt vmcnt(0)")
-    elif u.uop == UOps.STORE:
-      ins.append(f"global_store_b32 {r[u.vin[1]]}, {r[u.vin[2]]}, {r[u.vin[0]]}")
-    elif u.uop == UOps.DEFINE_GLOBAL:
-      i = u.arg[0]
-      args.append({'.address_space': 'global', '.name': f'buf_{i}', '.offset': i*8, '.size': 8,
-                   '.type_name': u.dtype.name+"*", '.value_kind': 'global_buffer'})
-      s_cnt += s_cnt%2  # skip
-      r[u] = f"s[{s_cnt}:{s_cnt+1}]"
-      s_cnt += 2
-      ins.append(f"s_load_b64 {r[u]}, s[0:1], {i*8}")
-      ins.append("s_waitcnt lgkmcnt(0)")
-    else:
-      raise NotImplementedError(f"can't render {u.uop}")
-
-  # *** boilerplate rendering ***
-
-  metadata = {
-    'amdhsa.kernels': [{'.args': args,
-      '.group_segment_fixed_size': 0, '.kernarg_segment_align': 8, '.kernarg_segment_size': args[-1][".offset"] + args[-1][".size"],
-      '.language': 'OpenCL C', '.language_version': [1, 2], '.max_flat_workgroup_size': 256,
-      '.name': function_name, '.private_segment_fixed_size': 0, '.sgpr_count': s_cnt, '.sgpr_spill_count': 0,
-      '.symbol': f'{function_name}.kd', '.uses_dynamic_stack': False, '.vgpr_count': v_cnt, '.vgpr_spill_count': 0,
-      '.wavefront_size': 32}],
-    'amdhsa.target': 'amdgcn-amd-amdhsa--gfx1100', 'amdhsa.version': [1, 2]}
-
-  boilerplate_start = f"""
-.rodata
-.global {function_name}.kd
-.type {function_name}.kd,STT_OBJECT
-.align 0x10
-.amdhsa_kernel {function_name}"""
-
-  kernel_desc = {
-    '.amdhsa_group_segment_fixed_size': 0, '.amdhsa_private_segment_fixed_size': 0, '.amdhsa_kernarg_size': 0,
-    '.amdhsa_next_free_vgpr': v_cnt,   # this matters!
-    '.amdhsa_reserve_vcc': 0, '.amdhsa_reserve_xnack_mask': 0,
-    '.amdhsa_next_free_sgpr': s_cnt,
-    '.amdhsa_float_round_mode_32': 0, '.amdhsa_float_round_mode_16_64': 0, '.amdhsa_float_denorm_mode_32': 3, '.amdhsa_float_denorm_mode_16_64': 3,
-    '.amdhsa_dx10_clamp': 1, '.amdhsa_ieee_mode': 1, '.amdhsa_fp16_overflow': 0,
-    '.amdhsa_workgroup_processor_mode': 1, '.amdhsa_memory_ordered': 1, '.amdhsa_forward_progress': 0, '.amdhsa_enable_private_segment': 0,
-    '.amdhsa_system_sgpr_workgroup_id_x': 1, '.amdhsa_system_sgpr_workgroup_id_y': 1, '.amdhsa_system_sgpr_workgroup_id_z': 1,
-    '.amdhsa_system_sgpr_workgroup_info': 0, '.amdhsa_system_vgpr_workitem_id': 2, # is amdhsa_system_vgpr_workitem_id real?
-    '.amdhsa_exception_fp_ieee_invalid_op': 0, '.amdhsa_exception_fp_denorm_src': 0,
-    '.amdhsa_exception_fp_ieee_div_zero': 0, '.amdhsa_exception_fp_ieee_overflow': 0, '.amdhsa_exception_fp_ieee_underflow': 0,
-    '.amdhsa_exception_fp_ieee_inexact': 0, '.amdhsa_exception_int_div_zero': 0,
-    '.amdhsa_user_sgpr_dispatch_ptr': 0, '.amdhsa_user_sgpr_queue_ptr': 0, '.amdhsa_user_sgpr_kernarg_segment_ptr': 1,
-    '.amdhsa_user_sgpr_dispatch_id': 0, '.amdhsa_user_sgpr_private_segment_size': 0, '.amdhsa_wavefront_size32': 1, '.amdhsa_uses_dynamic_stack': 0}
-
-  code_start = f""".end_amdhsa_kernel
-.text
-.global {function_name}
-.type {function_name},@function
-.p2align 8
-{function_name}:
-"""
-
-  ins += ['s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)', 's_endpgm', 's_code_end']
-  return ".amdgpu_metadata\n" + yaml.dump(metadata) + ".end_amdgpu_metadata" + \
-         boilerplate_start + "\n" + '\n'.join("%s %d" % x for x in kernel_desc.items()) + "\n" + code_start + \
-         '\n'.join(ins) + f"\n.size {function_name}, .-{function_name}"
--- a/extra/backends/triton.py
+++ b/extra/backends/triton.py
@ -1,131 +0,0 @@
-from typing import Dict, List, Final, Callable, DefaultDict
-from collections import defaultdict
-from tinygrad.uop.ops import UnaryOps, BinaryOps, TernaryOps, Op
-from tinygrad.helpers import DType, PtrDType, dtypes, ImageDType, DEBUG, getenv
-from tinygrad.codegen.opt.kernel import  UOp, Ops
-from triton.compiler import compile as triton_compile
-import linecache
-import math
-import re
-
-triton_dtypes = {dtypes.double: "tl.float64", dtypes.float32: "tl.float32", dtypes.float16: "tl.float16", dtypes.bool: "tl.int1", dtypes.int8: "tl.int8", dtypes.uint8: "tl.uint8", dtypes.int32: "tl.int32", dtypes.int64: "tl.int64", dtypes.uint32: "tl.uint32", dtypes.uint64: "tl.uint64", dtypes.int16: "tl.int16", dtypes.uint16: "tl.uint16"}
-signature_dtypes = {dtypes.double: "fp64",dtypes.float32: "fp32", dtypes.float16: "fp16", dtypes.bool: "i8", dtypes.int8: "i1", dtypes.uint8: "u8", dtypes.int32: "i32", dtypes.int64: "i64", dtypes.uint32: "u32", dtypes.uint64: "u64", dtypes.int16: "i16", dtypes.uint16: "u16"}
-
-def next_power_of_2(x):
-  return 1 << (x - 1).bit_length()
-
-def render_valid(valid):
-  return '(' * (len(valid) -1) + ') and '.join(valid) if len(valid) else 'True'
-
-#NOTE Triton requires matching dimensions for load/store, disable this and see TestOps::test_output_padded_conv_transpose2d fail to compile
-def fill_dims_for_idx(idx, dims):
-  return "(" + idx + "+ (" + (f"0*({'+'.join(d for d in dims)})))") if len(dims) else idx
-
-def get_max(var):
-  if isinstance(var, int): return var
-  return re.sub(r'\[(.*?)\]', '', str(var))[1:-1]
-
-#NOTE can be removed after https://github.com/gpuocelot/gpuocelot/issues/8 gets resolved
-def remove_single_scalar_curly_braces(ptx_code):
-  return '\n'.join([re.sub(r'\{\s*(%\w+)\s*\}', r'\1', line) for line in ptx_code.split('\n')])
-
-def render_const(args,dtype:DType):
-  return (('-' if args<0 else '') + 'tl.where(1,float("inf"),0)') if math.isinf(args) else ('tl.where(1,float("nan"),0)' if math.isnan(args) else f"{int(args)}" if dtypes.is_int(dtype) else str(args))
-
-def render_cast(x:str, dtype:DType, bitcast=False):
-  return f"{x}.to({triton_dtypes[dtype]}, bitcast={bitcast})"
-
-def define_scalar(local_size, dtype, args):
-  if len(local_size) > 0: return f"tl.full(({','.join([str(next_power_of_2(x)) for x in local_size])},),{render_const(args,dtype)}, dtype={triton_dtypes[dtype]})"
-  return render_const(args,dtype)
-
-def uops_to_triton(function_name:str, uops:List[UOp]):
-  local_size: List[int] = []
-  depth = 1
-  signatures, dims, bufs, kernel, valid = [], [], [], [], [] #type: ignore
-
-  c: DefaultDict[str, int] = defaultdict(int)
-  r: Dict[UOp, str] = {}
-  def ssa(u, prefix="t"):
-    nonlocal c, r
-    c[prefix] += 1
-    r[u]=f"{prefix}{c[prefix]-1}"
-    return r[u]
-
-  child_count: DefaultDict[UOp, int] = defaultdict(int)
-  for ru in uops:
-    for v in ru.vin:
-      child_count[v] += 1
-
-  def kk(s): kernel.append("  "*depth+s)
-  code_for_op: Final[Dict[Op, Callable]] = {
-    UnaryOps.EXP2: lambda x,dtype,: f"tl.math.exp2({x})",
-    UnaryOps.LOG2: lambda x,dtype,: f"tl.math.log2({x})",
-    UnaryOps.SIN: lambda x,dtype: f"tl.sin({x})",
-    UnaryOps.SQRT: lambda x,dtype: f"tl.sqrt({x})",
-    UnaryOps.NEG: lambda x,dtype: f"-{x}",
-    BinaryOps.ADD: lambda x,y,dtype: f"({x}+{y})", BinaryOps.SUB: lambda x,y,: f"({x}-{y})",
-    BinaryOps.MUL: lambda x,y,dtype: f"({x}*{y})", BinaryOps.DIV: lambda x,y,: f"({x}/{y})" if y != '0.0' else f"{x}*tl.where({x}==0.0, float('nan'), float('inf'))",
-    BinaryOps.MAX: lambda x,y,dtype: f"tl.maximum({x},{y})",
-    BinaryOps.CMPLT: lambda x,y,dtype: f"({x}<{y})",
-    BinaryOps.MOD: lambda x,y,dtype: f"tl.abs({x})%tl.abs({y})*tl.where({x}<0,-1,1)",
-    TernaryOps.MULACC: lambda x,y,z,dtype: f"(({x}*{y})+{z})",
-    TernaryOps.WHERE: lambda x,y,z,dtype: f"tl.where({x},{y},{z})",
-  }
-  def int_div(x,y): return f"({x}//{y})" if y != '0' else f"{x}*tl.where({x}==0, float('nan'), float('inf'))"
-  for u in uops:
-    uop,dtype,vin,args = u.uop,u.dtype,u.vin,u.arg
-    if uop == Ops.LOOP:
-      kk(f"for {ssa(u, 'ridx')} in range({vin[0].arg}, {r[vin[1]]}):")
-      depth += 1
-    elif uop == Ops.END: depth -= 1
-    elif uop == Ops.ALU:
-      assert dtype is not None
-      val = code_for_op[args](*[r[x] for x in vin])
-      if child_count[u] <=1 or dtypes.is_int(dtype): r[u] = int_div(*[r[x] for x in vin]) if args == BinaryOps.DIV and dtypes.is_int(dtype) else val
-      else: kk(f"{ssa(u, 'alu')} = ({val})")
-    elif uop == Ops.LOAD:
-      assert dtype is not None
-      if len(vin) == 2: kk(f"{ssa(u, 'val')} = {render_cast(f'tl.load({r[vin[0]]} + { fill_dims_for_idx(r[vin[1]], dims)}, mask = {render_valid(valid)})', dtype)}")
-      else: kk(f"{ssa(u, 'val')} = {render_cast(f'tl.where({r[vin[2]]}, tl.load({r[vin[0]]}+{fill_dims_for_idx(r[vin[1]],dims)} , mask={render_valid(valid+[r[vin[2]]])}), 0.0)', dtype)}")
-    elif uop == Ops.DEFINE_REG: kk(f"{ssa(u, 'acc')} = {define_scalar(local_size, dtype, args).replace('//', '/')}")
-    elif uop == Ops.CONST: r[u] = define_scalar([], dtype, args)
-    elif uop == Ops.ASSIGN:
-      kk(f"{r[vin[0]]} = {r[vin[1]].replace('//', '/')}")
-      r[u] = r[vin[0]]
-    elif uop == Ops.STORE:
-      assert not isinstance(dtype, ImageDType), "unimplemented: image store"
-      kk(f"{'if '+r[vin[3]]+': ' if len(vin)>3 else ''}tl.store({r[vin[0]]} + {r[vin[1]]}, {r[vin[2]].replace('//', '/')}, mask = {render_valid(valid)}) ")
-    elif uop == Ops.DEFINE_GLOBAL:
-      bufs.append(args)
-      signatures.append("*" if isinstance(dtype, PtrDType) else "" +  signature_dtypes[dtype])
-      r[u] = args
-    elif uop == Ops.SPECIAL:
-      dims.append(args[1])
-      valid.append(f"{args[1]}<{get_max(args[2])}")
-      if args[1].startswith("g"): kk(f"{args[1]} = tl.program_id({args[0]}) # {args[2]}")
-      elif args[1].startswith("l"):
-        kk(f"{args[1]} = tl.arange({0}, {next_power_of_2(args[2])})")
-        local_size.append(args[2])
-      r[u] = args[1]
-    elif uop == Ops.CAST and dtype is not None: r[u] = render_cast(r[vin[0]], dtype, isinstance(args, tuple) and args[1])
-    else: raise NotImplementedError(f"unimplemented: {uop}")
-
-  prg = f"import triton\nimport triton.language as tl\ntl.core.TRITON_MAX_TENSOR_NUMEL = float('inf')\n@triton.jit\ndef {function_name}("+','.join(bufs)+"):\n"
-  for i, line in enumerate(list(filter(lambda line: "tl.arange" in line, kernel))): kernel[kernel.index(line)] +=  f"[{', '.join([':' if i == j else 'None' for j in range(len(local_size))])}]"
-  prg += "\n".join(kernel)
-
-  acc_local_size = 1
-  for x in local_size: acc_local_size *= next_power_of_2(x)
-  local_size = [acc_local_size] + [1] * (len(local_size) - 1)
-
-  if DEBUG >= 4: print(prg)
-  getlines = linecache.getlines
-  linecache.getlines = lambda filename, module_globals=None: prg.splitlines(keepends=True) if "<triton>" == filename else getlines(filename, module_globals)
-  exec(compile(prg, "<triton>", "exec"), globals()) # pylint: disable=W0122\
-  compiled = triton_compile(globals()[function_name], signature=",".join(signatures), device_type="cuda", debug=False, cc=(35 if getenv("CUDACPU", 0) else None))
-  prg = remove_single_scalar_curly_braces(compiled.asm["ptx"].split(".file")[0].split(".visible .func")[0])
-  max_local_size =  [int(x) for x in prg.split(".maxntid ")[1].split("\n")[0].split(", ")]
-  for i in range(len(local_size)): local_size[i] = min(local_size[i], max_local_size[i])
-
-  return prg, {"shared":compiled.metadata["shared"], "local_size":local_size + [1]*(3-len(local_size))}
--- a/extra/datasets/coco.py
+++ b/extra/datasets/coco.py
@ -1,199 +0,0 @@
-import json
-import pathlib
-import zipfile
-import numpy as np
-from tinygrad.helpers import fetch
-import pycocotools._mask as _mask
-from examples.mask_rcnn import Masker
-from pycocotools.coco import COCO
-from pycocotools.cocoeval import COCOeval
-
-iou         = _mask.iou
-merge       = _mask.merge
-frPyObjects = _mask.frPyObjects
-
-BASEDIR = pathlib.Path(__file__).parent / "COCO"
-BASEDIR.mkdir(exist_ok=True)
-
-def create_dict(key_row, val_row, rows): return {row[key_row]:row[val_row] for row in rows}
-
-
-if not pathlib.Path(BASEDIR/'val2017').is_dir():
-  fn = fetch('http://images.cocodataset.org/zips/val2017.zip')
-  with zipfile.ZipFile(fn, 'r') as zip_ref:
-    zip_ref.extractall(BASEDIR)
-  fn.unlink()
-
-
-if not pathlib.Path(BASEDIR/'annotations').is_dir():
-  fn = fetch('http://images.cocodataset.org/annotations/annotations_trainval2017.zip')
-  with zipfile.ZipFile(fn, 'r') as zip_ref:
-    zip_ref.extractall(BASEDIR)
-  fn.unlink()
-
-with open(BASEDIR/'annotations/instances_val2017.json', 'r') as f:
-  annotations_raw = json.loads(f.read())
-images = annotations_raw['images']
-categories = annotations_raw['categories']
-annotations = annotations_raw['annotations']
-file_name_to_id = create_dict('file_name', 'id', images)
-id_to_width = create_dict('id', 'width', images)
-id_to_height = create_dict('id', 'height', images)
-json_category_id_to_contiguous_id = {v['id']: i + 1 for i, v in enumerate(categories)}
-contiguous_category_id_to_json_id = {v:k for k,v in json_category_id_to_contiguous_id.items()}
-
-
-def encode(bimask):
-  if len(bimask.shape) == 3:
-    return _mask.encode(bimask)
-  elif len(bimask.shape) == 2:
-    h, w = bimask.shape
-    return _mask.encode(bimask.reshape((h, w, 1), order='F'))[0]
-
-def decode(rleObjs):
-  if type(rleObjs) == list:
-    return _mask.decode(rleObjs)
-  else:
-    return _mask.decode([rleObjs])[:,:,0]
-
-def area(rleObjs):
-  if type(rleObjs) == list:
-    return _mask.area(rleObjs)
-  else:
-    return _mask.area([rleObjs])[0]
-
-def toBbox(rleObjs):
-  if type(rleObjs) == list:
-    return _mask.toBbox(rleObjs)
-  else:
-    return _mask.toBbox([rleObjs])[0]
-
-
-def convert_prediction_to_coco_bbox(file_name, prediction):
-  coco_results = []
-  try:
-    original_id = file_name_to_id[file_name]
-    if len(prediction) == 0:
-      return coco_results
-
-    image_width = id_to_width[original_id]
-    image_height = id_to_height[original_id]
-    prediction = prediction.resize((image_width, image_height))
-    prediction = prediction.convert("xywh")
-
-    boxes = prediction.bbox.numpy().tolist()
-    scores = prediction.get_field("scores").numpy().tolist()
-    labels = prediction.get_field("labels").numpy().tolist()
-
-    mapped_labels = [contiguous_category_id_to_json_id[int(i)] for i in labels]
-
-    coco_results.extend(
-      [
-        {
-          "image_id": original_id,
-          "category_id": mapped_labels[k],
-          "bbox": box,
-          "score": scores[k],
-        }
-          for k, box in enumerate(boxes)
-      ]
-    )
-  except Exception as e:
-    print(file_name, e)
-  return coco_results
-
-masker = Masker(threshold=0.5, padding=1)
-
-def convert_prediction_to_coco_mask(file_name, prediction):
-  coco_results = []
-  try:
-    original_id = file_name_to_id[file_name]
-    if len(prediction) == 0:
-      return coco_results
-
-    image_width = id_to_width[original_id]
-    image_height = id_to_height[original_id]
-    prediction = prediction.resize((image_width, image_height))
-    masks = prediction.get_field("mask")
-
-    scores = prediction.get_field("scores").numpy().tolist()
-    labels = prediction.get_field("labels").numpy().tolist()
-
-    masks = masker([masks], [prediction])[0].numpy()
-
-    rles = [
-      encode(np.array(mask[0, :, :, np.newaxis], order="F"))[0]
-      for mask in masks
-    ]
-    for rle in rles:
-      rle["counts"] = rle["counts"].decode("utf-8")
-
-    mapped_labels = [contiguous_category_id_to_json_id[int(i)] for i in labels]
-
-    coco_results.extend(
-      [
-        {
-          "image_id": original_id,
-          "category_id": mapped_labels[k],
-          "segmentation": rle,
-          "score": scores[k],
-        }
-          for k, rle in enumerate(rles)
-      ]
-    )
-  except Exception as e:
-    print(file_name, e)
-  return coco_results
-
-
-
-def accumulate_predictions_for_coco(coco_results, json_result_file, rm=False):
-  path = pathlib.Path(json_result_file)
-  if rm and path.exists(): path.unlink()
-  with open(path, "a") as f:
-    for s in coco_results:
-      f.write(json.dumps(s))
-      f.write('\n')
-
-def remove_dup(l):
-  seen = set()
-  seen_add = seen.add
-  return [x for x in l if not (x in seen or seen_add(x))]
-
-class NpEncoder(json.JSONEncoder):
-  def default(self, obj):
-    if isinstance(obj, np.integer):
-      return int(obj)
-    if isinstance(obj, np.floating):
-      return float(obj)
-    if isinstance(obj, np.ndarray):
-      return obj.tolist()
-    return super(NpEncoder, self).default(obj)
-
-
-def evaluate_predictions_on_coco(json_result_file, iou_type="bbox"):
-  coco_results = []
-  with open(json_result_file, "r") as f:
-    for line in f:
-      coco_results.append(json.loads(line))
-
-  coco_gt = COCO(str(BASEDIR/'annotations/instances_val2017.json'))
-  set_of_json = remove_dup([json.dumps(d, cls=NpEncoder) for d in coco_results])
-  unique_list = [json.loads(s) for s in set_of_json]
-
-  with open(f'{json_result_file}.flattend', "w") as f:
-    json.dump(unique_list, f)
-
-  coco_dt = coco_gt.loadRes(str(f'{json_result_file}.flattend'))
-  coco_eval = COCOeval(coco_gt, coco_dt, iou_type)
-  coco_eval.evaluate()
-  coco_eval.accumulate()
-  coco_eval.summarize()
-  return coco_eval
-
-def iterate(files, bs=1):
-  batch = []
-  for file in files:
-    batch.append(file)
-    if len(batch) >= bs: yield batch; batch = []
-  if len(batch) > 0: yield batch; batch = []
--- a/extra/disassemblers/adreno/.gitignore
+++ b/extra/disassemblers/adreno/.gitignore
@ -1 +0,0 @@
-disasm.so
--- a/extra/disassemblers/adreno/README
+++ b/extra/disassemblers/adreno/README
@ -1,5 +0,0 @@
-From the Freedreno project
-
-https://gallium.readthedocs.io/en/latest/gallium/drivers/freedreno.html
-
-In Mesa3D, so licensed MIT.
--- a/extra/disassemblers/adreno/init.py
+++ b/extra/disassemblers/adreno/init.py
@ -1,22 +0,0 @@
-import ctypes
-import os
-import pathlib
-import struct
-from hexdump import hexdump
-
-fxn = None
-def disasm_raw(buf):
-  global fxn
-  if fxn is None:
-    shared = pathlib.Path(__file__).parent / "disasm.so"
-    if not shared.is_file():
-      os.system(f'cd {pathlib.Path(__file__).parent} && gcc -shared disasm-a3xx.c -o disasm.so')
-    fxn = ctypes.CDLL(shared.as_posix())['disasm']
-  fxn(buf, len(buf))
-
-def disasm(buf):
-  def _read_lib(off): return struct.unpack("I", buf[off:off+4])[0]
-
-  image_offset = _read_lib(0xc0)
-  image_size = _read_lib(0x100)
-  disasm_raw(buf[image_offset:image_offset+image_size])
--- a/extra/disassemblers/adreno/disasm-a3xx.c
+++ b/extra/disassemblers/adreno/disasm-a3xx.c
--- a/extra/disassemblers/adreno/instr-a3xx.h
+++ b/extra/disassemblers/adreno/instr-a3xx.h
--- a/extra/disassemblers/adreno/ir3.h
+++ b/extra/disassemblers/adreno/ir3.h
--- a/extra/disassemblers/adreno/shader_enums.h
+++ b/extra/disassemblers/adreno/shader_enums.h
@ -1,906 +0,0 @@
-/*
- * Mesa 3-D graphics library
- *
- * Copyright (C) 1999-2008  Brian Paul   All Rights Reserved.
- * Copyright (C) 2009  VMware, Inc.  All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included
- * in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
- * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
- * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
- * OTHER DEALINGS IN THE SOFTWARE.
- */
-
-#ifndef SHADER_ENUMS_H
-#define SHADER_ENUMS_H
-
-#include <stdbool.h>
-
-/* Project-wide (GL and Vulkan) maximum. */
-#define MAX_DRAW_BUFFERS 8
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-/**
- * Shader stages.
- *
- * The order must match how shaders are ordered in the pipeline.
- * The GLSL linker assumes that if i<j, then the j-th shader is
- * executed later than the i-th shader.
- */
-typedef enum
-{
-   MESA_SHADER_NONE = -1,
-   MESA_SHADER_VERTEX = 0,
-   MESA_SHADER_TESS_CTRL = 1,
-   MESA_SHADER_TESS_EVAL = 2,
-   MESA_SHADER_GEOMETRY = 3,
-   MESA_SHADER_FRAGMENT = 4,
-   MESA_SHADER_COMPUTE = 5,
-   /* must be last so it doesn't affect the GL pipeline */
-   MESA_SHADER_KERNEL = 6,
-} gl_shader_stage;
-
-static inline bool
-gl_shader_stage_is_compute(gl_shader_stage stage)
-{
-   return stage == MESA_SHADER_COMPUTE || stage == MESA_SHADER_KERNEL;
-}
-
-/**
- * Number of STATE_* values we need to address any GL state.
- * Used to dimension arrays.
- */
-#define STATE_LENGTH 5
-
-typedef short gl_state_index16; /* see enum gl_state_index */
-
-const char *gl_shader_stage_name(gl_shader_stage stage);
-
-/**
- * Translate a gl_shader_stage to a short shader stage name for debug
- * printouts and error messages.
- */
-const char *_mesa_shader_stage_to_string(unsigned stage);
-
-/**
- * Translate a gl_shader_stage to a shader stage abbreviation (VS, GS, FS)
- * for debug printouts and error messages.
- */
-const char *_mesa_shader_stage_to_abbrev(unsigned stage);
-
-/**
- * GL related stages (not including CL)
- */
-#define MESA_SHADER_STAGES (MESA_SHADER_COMPUTE + 1)
-
-/**
- * All stages
- */
-#define MESA_ALL_SHADER_STAGES (MESA_SHADER_KERNEL + 1)
-
-
-/**
- * Indexes for vertex program attributes.
- * GL_NV_vertex_program aliases generic attributes over the conventional
- * attributes.  In GL_ARB_vertex_program shader the aliasing is optional.
- * In GL_ARB_vertex_shader / OpenGL 2.0 the aliasing is disallowed (the
- * generic attributes are distinct/separate).
- */
-typedef enum
-{
-   VERT_ATTRIB_POS,
-   VERT_ATTRIB_NORMAL,
-   VERT_ATTRIB_COLOR0,
-   VERT_ATTRIB_COLOR1,
-   VERT_ATTRIB_FOG,
-   VERT_ATTRIB_COLOR_INDEX,
-   VERT_ATTRIB_EDGEFLAG,
-   VERT_ATTRIB_TEX0,
-   VERT_ATTRIB_TEX1,
-   VERT_ATTRIB_TEX2,
-   VERT_ATTRIB_TEX3,
-   VERT_ATTRIB_TEX4,
-   VERT_ATTRIB_TEX5,
-   VERT_ATTRIB_TEX6,
-   VERT_ATTRIB_TEX7,
-   VERT_ATTRIB_POINT_SIZE,
-   VERT_ATTRIB_GENERIC0,
-   VERT_ATTRIB_GENERIC1,
-   VERT_ATTRIB_GENERIC2,
-   VERT_ATTRIB_GENERIC3,
-   VERT_ATTRIB_GENERIC4,
-   VERT_ATTRIB_GENERIC5,
-   VERT_ATTRIB_GENERIC6,
-   VERT_ATTRIB_GENERIC7,
-   VERT_ATTRIB_GENERIC8,
-   VERT_ATTRIB_GENERIC9,
-   VERT_ATTRIB_GENERIC10,
-   VERT_ATTRIB_GENERIC11,
-   VERT_ATTRIB_GENERIC12,
-   VERT_ATTRIB_GENERIC13,
-   VERT_ATTRIB_GENERIC14,
-   VERT_ATTRIB_GENERIC15,
-   VERT_ATTRIB_MAX
-} gl_vert_attrib;
-
-const char *gl_vert_attrib_name(gl_vert_attrib attrib);
-
-/**
- * Symbolic constats to help iterating over
- * specific blocks of vertex attributes.
- *
- * VERT_ATTRIB_FF
- *   includes all fixed function attributes as well as
- *   the aliased GL_NV_vertex_program shader attributes.
- * VERT_ATTRIB_TEX
- *   include the classic texture coordinate attributes.
- *   Is a subset of VERT_ATTRIB_FF.
- * VERT_ATTRIB_GENERIC
- *   include the OpenGL 2.0+ GLSL generic shader attributes.
- *   These alias the generic GL_ARB_vertex_shader attributes.
- * VERT_ATTRIB_MAT
- *   include the generic shader attributes used to alias
- *   varying material values for the TNL shader programs.
- *   They are located at the end of the generic attribute
- *   block not to overlap with the generic 0 attribute.
- */
-#define VERT_ATTRIB_FF(i)           (VERT_ATTRIB_POS + (i))
-#define VERT_ATTRIB_FF_MAX          VERT_ATTRIB_GENERIC0
-
-#define VERT_ATTRIB_TEX(i)          (VERT_ATTRIB_TEX0 + (i))
-#define VERT_ATTRIB_TEX_MAX         MAX_TEXTURE_COORD_UNITS
-
-#define VERT_ATTRIB_GENERIC(i)      (VERT_ATTRIB_GENERIC0 + (i))
-#define VERT_ATTRIB_GENERIC_MAX     MAX_VERTEX_GENERIC_ATTRIBS
-
-#define VERT_ATTRIB_MAT0            \
-   (VERT_ATTRIB_GENERIC_MAX - VERT_ATTRIB_MAT_MAX)
-#define VERT_ATTRIB_MAT(i)          \
-   VERT_ATTRIB_GENERIC((i) + VERT_ATTRIB_MAT0)
-#define VERT_ATTRIB_MAT_MAX         MAT_ATTRIB_MAX
-
-/**
- * Bitflags for vertex attributes.
- * These are used in bitfields in many places.
- */
-/*@{*/
-#define VERT_BIT_POS             BITFIELD_BIT(VERT_ATTRIB_POS)
-#define VERT_BIT_NORMAL          BITFIELD_BIT(VERT_ATTRIB_NORMAL)
-#define VERT_BIT_COLOR0          BITFIELD_BIT(VERT_ATTRIB_COLOR0)
-#define VERT_BIT_COLOR1          BITFIELD_BIT(VERT_ATTRIB_COLOR1)
-#define VERT_BIT_FOG             BITFIELD_BIT(VERT_ATTRIB_FOG)
-#define VERT_BIT_COLOR_INDEX     BITFIELD_BIT(VERT_ATTRIB_COLOR_INDEX)
-#define VERT_BIT_EDGEFLAG        BITFIELD_BIT(VERT_ATTRIB_EDGEFLAG)
-#define VERT_BIT_TEX0            BITFIELD_BIT(VERT_ATTRIB_TEX0)
-#define VERT_BIT_TEX1            BITFIELD_BIT(VERT_ATTRIB_TEX1)
-#define VERT_BIT_TEX2            BITFIELD_BIT(VERT_ATTRIB_TEX2)
-#define VERT_BIT_TEX3            BITFIELD_BIT(VERT_ATTRIB_TEX3)
-#define VERT_BIT_TEX4            BITFIELD_BIT(VERT_ATTRIB_TEX4)
-#define VERT_BIT_TEX5            BITFIELD_BIT(VERT_ATTRIB_TEX5)
-#define VERT_BIT_TEX6            BITFIELD_BIT(VERT_ATTRIB_TEX6)
-#define VERT_BIT_TEX7            BITFIELD_BIT(VERT_ATTRIB_TEX7)
-#define VERT_BIT_POINT_SIZE      BITFIELD_BIT(VERT_ATTRIB_POINT_SIZE)
-#define VERT_BIT_GENERIC0        BITFIELD_BIT(VERT_ATTRIB_GENERIC0)
-
-#define VERT_BIT(i)              BITFIELD_BIT(i)
-#define VERT_BIT_ALL             BITFIELD_RANGE(0, VERT_ATTRIB_MAX)
-
-#define VERT_BIT_FF(i)           VERT_BIT(i)
-#define VERT_BIT_FF_ALL          BITFIELD_RANGE(0, VERT_ATTRIB_FF_MAX)
-#define VERT_BIT_TEX(i)          VERT_BIT(VERT_ATTRIB_TEX(i))
-#define VERT_BIT_TEX_ALL         \
-   BITFIELD_RANGE(VERT_ATTRIB_TEX(0), VERT_ATTRIB_TEX_MAX)
-
-#define VERT_BIT_GENERIC(i)      VERT_BIT(VERT_ATTRIB_GENERIC(i))
-#define VERT_BIT_GENERIC_ALL     \
-   BITFIELD_RANGE(VERT_ATTRIB_GENERIC(0), VERT_ATTRIB_GENERIC_MAX)
-
-#define VERT_BIT_MAT(i)	         VERT_BIT(VERT_ATTRIB_MAT(i))
-#define VERT_BIT_MAT_ALL         \
-   BITFIELD_RANGE(VERT_ATTRIB_MAT(0), VERT_ATTRIB_MAT_MAX)
-/*@}*/
-
-#define MAX_VARYING 32 /**< number of float[4] vectors */
-
-/**
- * Indexes for vertex shader outputs, geometry shader inputs/outputs, and
- * fragment shader inputs.
- *
- * Note that some of these values are not available to all pipeline stages.
- *
- * When this enum is updated, the following code must be updated too:
- * - vertResults (in prog_print.c's arb_output_attrib_string())
- * - fragAttribs (in prog_print.c's arb_input_attrib_string())
- * - _mesa_varying_slot_in_fs()
- */
-typedef enum
-{
-   VARYING_SLOT_POS,
-   VARYING_SLOT_COL0, /* COL0 and COL1 must be contiguous */
-   VARYING_SLOT_COL1,
-   VARYING_SLOT_FOGC,
-   VARYING_SLOT_TEX0, /* TEX0-TEX7 must be contiguous */
-   VARYING_SLOT_TEX1,
-   VARYING_SLOT_TEX2,
-   VARYING_SLOT_TEX3,
-   VARYING_SLOT_TEX4,
-   VARYING_SLOT_TEX5,
-   VARYING_SLOT_TEX6,
-   VARYING_SLOT_TEX7,
-   VARYING_SLOT_PSIZ, /* Does not appear in FS */
-   VARYING_SLOT_BFC0, /* Does not appear in FS */
-   VARYING_SLOT_BFC1, /* Does not appear in FS */
-   VARYING_SLOT_EDGE, /* Does not appear in FS */
-   VARYING_SLOT_CLIP_VERTEX, /* Does not appear in FS */
-   VARYING_SLOT_CLIP_DIST0,
-   VARYING_SLOT_CLIP_DIST1,
-   VARYING_SLOT_CULL_DIST0,
-   VARYING_SLOT_CULL_DIST1,
-   VARYING_SLOT_PRIMITIVE_ID, /* Does not appear in VS */
-   VARYING_SLOT_LAYER, /* Appears as VS or GS output */
-   VARYING_SLOT_VIEWPORT, /* Appears as VS or GS output */
-   VARYING_SLOT_FACE, /* FS only */
-   VARYING_SLOT_PNTC, /* FS only */
-   VARYING_SLOT_TESS_LEVEL_OUTER, /* Only appears as TCS output. */
-   VARYING_SLOT_TESS_LEVEL_INNER, /* Only appears as TCS output. */
-   VARYING_SLOT_BOUNDING_BOX0, /* Only appears as TCS output. */
-   VARYING_SLOT_BOUNDING_BOX1, /* Only appears as TCS output. */
-   VARYING_SLOT_VIEW_INDEX,
-   VARYING_SLOT_VIEWPORT_MASK, /* Does not appear in FS */
-   VARYING_SLOT_VAR0, /* First generic varying slot */
-   /* the remaining are simply for the benefit of gl_varying_slot_name()
-    * and not to be construed as an upper bound:
-    */
-   VARYING_SLOT_VAR1,
-   VARYING_SLOT_VAR2,
-   VARYING_SLOT_VAR3,
-   VARYING_SLOT_VAR4,
-   VARYING_SLOT_VAR5,
-   VARYING_SLOT_VAR6,
-   VARYING_SLOT_VAR7,
-   VARYING_SLOT_VAR8,
-   VARYING_SLOT_VAR9,
-   VARYING_SLOT_VAR10,
-   VARYING_SLOT_VAR11,
-   VARYING_SLOT_VAR12,
-   VARYING_SLOT_VAR13,
-   VARYING_SLOT_VAR14,
-   VARYING_SLOT_VAR15,
-   VARYING_SLOT_VAR16,
-   VARYING_SLOT_VAR17,
-   VARYING_SLOT_VAR18,
-   VARYING_SLOT_VAR19,
-   VARYING_SLOT_VAR20,
-   VARYING_SLOT_VAR21,
-   VARYING_SLOT_VAR22,
-   VARYING_SLOT_VAR23,
-   VARYING_SLOT_VAR24,
-   VARYING_SLOT_VAR25,
-   VARYING_SLOT_VAR26,
-   VARYING_SLOT_VAR27,
-   VARYING_SLOT_VAR28,
-   VARYING_SLOT_VAR29,
-   VARYING_SLOT_VAR30,
-   VARYING_SLOT_VAR31,
-} gl_varying_slot;
-
-
-#define VARYING_SLOT_MAX	(VARYING_SLOT_VAR0 + MAX_VARYING)
-#define VARYING_SLOT_PATCH0	(VARYING_SLOT_MAX) /* patch slots begin exactly at VARYING_SLOT_MAX */
-#define VARYING_SLOT_TESS_MAX	(VARYING_SLOT_PATCH0 + MAX_VARYING)
-#define MAX_VARYINGS_INCL_PATCH (VARYING_SLOT_TESS_MAX - VARYING_SLOT_VAR0) /* expands to 2 * MAX_VARYING */
-
-const char *gl_varying_slot_name(gl_varying_slot slot);
-
-/**
- * Bitflags for varying slots.
- *
- * Each VARYING_BIT_x is BITFIELD64_BIT() applied to the VARYING_SLOT_x
- * enumerator of the same name.  VARYING_BIT_TEX(U) and VARYING_BIT_VAR(V)
- * take an offset that is added to VARYING_SLOT_TEX0 / VARYING_SLOT_VAR0
- * before the bit is formed.  VARYING_BITS_COLOR is the OR of the COL0, COL1,
- * BFC0 and BFC1 bits.  VARYING_BITS_TEX_ANY passes VARYING_SLOT_TEX0 and
- * MAX_TEXTURE_COORD_UNITS to BITFIELD64_RANGE() (presumably "first bit,
- * number of bits" -- that macro is defined outside this header).
- */
-/*@{*/
-#define VARYING_BIT_POS BITFIELD64_BIT(VARYING_SLOT_POS)
-#define VARYING_BIT_COL0 BITFIELD64_BIT(VARYING_SLOT_COL0)
-#define VARYING_BIT_COL1 BITFIELD64_BIT(VARYING_SLOT_COL1)
-#define VARYING_BIT_FOGC BITFIELD64_BIT(VARYING_SLOT_FOGC)
-#define VARYING_BIT_TEX0 BITFIELD64_BIT(VARYING_SLOT_TEX0)
-#define VARYING_BIT_TEX1 BITFIELD64_BIT(VARYING_SLOT_TEX1)
-#define VARYING_BIT_TEX2 BITFIELD64_BIT(VARYING_SLOT_TEX2)
-#define VARYING_BIT_TEX3 BITFIELD64_BIT(VARYING_SLOT_TEX3)
-#define VARYING_BIT_TEX4 BITFIELD64_BIT(VARYING_SLOT_TEX4)
-#define VARYING_BIT_TEX5 BITFIELD64_BIT(VARYING_SLOT_TEX5)
-#define VARYING_BIT_TEX6 BITFIELD64_BIT(VARYING_SLOT_TEX6)
-#define VARYING_BIT_TEX7 BITFIELD64_BIT(VARYING_SLOT_TEX7)
-#define VARYING_BIT_TEX(U) BITFIELD64_BIT(VARYING_SLOT_TEX0 + (U))
-#define VARYING_BITS_TEX_ANY BITFIELD64_RANGE(VARYING_SLOT_TEX0, \
-                                              MAX_TEXTURE_COORD_UNITS)
-#define VARYING_BIT_PSIZ BITFIELD64_BIT(VARYING_SLOT_PSIZ)
-#define VARYING_BIT_BFC0 BITFIELD64_BIT(VARYING_SLOT_BFC0)
-#define VARYING_BIT_BFC1 BITFIELD64_BIT(VARYING_SLOT_BFC1)
-#define VARYING_BITS_COLOR (VARYING_BIT_COL0 | \
-                            VARYING_BIT_COL1 |        \
-                            VARYING_BIT_BFC0 |        \
-                            VARYING_BIT_BFC1)
-#define VARYING_BIT_EDGE BITFIELD64_BIT(VARYING_SLOT_EDGE)
-#define VARYING_BIT_CLIP_VERTEX BITFIELD64_BIT(VARYING_SLOT_CLIP_VERTEX)
-#define VARYING_BIT_CLIP_DIST0 BITFIELD64_BIT(VARYING_SLOT_CLIP_DIST0)
-#define VARYING_BIT_CLIP_DIST1 BITFIELD64_BIT(VARYING_SLOT_CLIP_DIST1)
-#define VARYING_BIT_CULL_DIST0 BITFIELD64_BIT(VARYING_SLOT_CULL_DIST0)
-#define VARYING_BIT_CULL_DIST1 BITFIELD64_BIT(VARYING_SLOT_CULL_DIST1)
-#define VARYING_BIT_PRIMITIVE_ID BITFIELD64_BIT(VARYING_SLOT_PRIMITIVE_ID)
-#define VARYING_BIT_LAYER BITFIELD64_BIT(VARYING_SLOT_LAYER)
-#define VARYING_BIT_VIEWPORT BITFIELD64_BIT(VARYING_SLOT_VIEWPORT)
-#define VARYING_BIT_FACE BITFIELD64_BIT(VARYING_SLOT_FACE)
-#define VARYING_BIT_PNTC BITFIELD64_BIT(VARYING_SLOT_PNTC)
-#define VARYING_BIT_TESS_LEVEL_OUTER BITFIELD64_BIT(VARYING_SLOT_TESS_LEVEL_OUTER)
-#define VARYING_BIT_TESS_LEVEL_INNER BITFIELD64_BIT(VARYING_SLOT_TESS_LEVEL_INNER)
-#define VARYING_BIT_BOUNDING_BOX0 BITFIELD64_BIT(VARYING_SLOT_BOUNDING_BOX0)
-#define VARYING_BIT_BOUNDING_BOX1 BITFIELD64_BIT(VARYING_SLOT_BOUNDING_BOX1)
-#define VARYING_BIT_VIEWPORT_MASK BITFIELD64_BIT(VARYING_SLOT_VIEWPORT_MASK)
-#define VARYING_BIT_VAR(V) BITFIELD64_BIT(VARYING_SLOT_VAR0 + (V))
-/*@}*/
-
-/**
- * Bitflags for system values.
- *
- * Each macro is a uint64_t mask with only the bit numbered by the matching
- * SYSTEM_VALUE_* enumerator set.
- */
-#define SYSTEM_BIT_SAMPLE_ID ((uint64_t)1 << SYSTEM_VALUE_SAMPLE_ID)
-#define SYSTEM_BIT_SAMPLE_POS ((uint64_t)1 << SYSTEM_VALUE_SAMPLE_POS)
-#define SYSTEM_BIT_SAMPLE_MASK_IN ((uint64_t)1 << SYSTEM_VALUE_SAMPLE_MASK_IN)
-#define SYSTEM_BIT_LOCAL_INVOCATION_ID ((uint64_t)1 << SYSTEM_VALUE_LOCAL_INVOCATION_ID)
-
-/**
- * If the gl_register_file is PROGRAM_SYSTEM_VALUE, the register index will be
- * one of these values.  If a NIR variable's mode is nir_var_system_value, it
- * will be one of these values.
- *
- * No enumerator carries an explicit initializer, so values are assigned
- * consecutively from 0 in declaration order and SYSTEM_VALUE_MAX ends up
- * equal to the number of system values.
- */
-typedef enum
-{
-   /**
-    * \name System values applicable to all shaders
-    */
-   /*@{*/
-
-   /**
-    * Builtin variables added by GL_ARB_shader_ballot.
-    */
-   /*@{*/
-
-   /**
-    * From the GL_ARB_shader_ballot spec:
-    *
-    *    "A sub-group is a collection of invocations which execute in lockstep.
-    *     The variable <gl_SubGroupSizeARB> is the maximum number of
-    *     invocations in a sub-group. The maximum <gl_SubGroupSizeARB>
-    *     supported in this extension is 64."
-    *
-    * The spec defines this as a uniform. However, it's highly unlikely that
-    * implementations actually treat it as a uniform (which is loaded from a
-    * constant buffer). Most likely, this is an implementation-wide constant,
-    * or perhaps something that depends on the shader stage.
-    */
-   SYSTEM_VALUE_SUBGROUP_SIZE,
-
-   /**
-    * From the GL_ARB_shader_ballot spec:
-    *
-    *    "The variable <gl_SubGroupInvocationARB> holds the index of the
-    *     invocation within sub-group. This variable is in the range 0 to
-    *     <gl_SubGroupSizeARB>-1, where <gl_SubGroupSizeARB> is the total
-    *     number of invocations in a sub-group."
-    */
-   SYSTEM_VALUE_SUBGROUP_INVOCATION,
-
-   /**
-    * From the GL_ARB_shader_ballot spec:
-    *
-    *    "The <gl_SubGroup??MaskARB> variables provide a bitmask for all
-    *     invocations, with one bit per invocation starting with the least
-    *     significant bit, according to the following table,
-    *
-    *       variable               equation for bit values
-    *       --------------------   ------------------------------------
-    *       gl_SubGroupEqMaskARB   bit index == gl_SubGroupInvocationARB
-    *       gl_SubGroupGeMaskARB   bit index >= gl_SubGroupInvocationARB
-    *       gl_SubGroupGtMaskARB   bit index >  gl_SubGroupInvocationARB
-    *       gl_SubGroupLeMaskARB   bit index <= gl_SubGroupInvocationARB
-    *       gl_SubGroupLtMaskARB   bit index <  gl_SubGroupInvocationARB"
-    */
-   SYSTEM_VALUE_SUBGROUP_EQ_MASK,
-   SYSTEM_VALUE_SUBGROUP_GE_MASK,
-   SYSTEM_VALUE_SUBGROUP_GT_MASK,
-   SYSTEM_VALUE_SUBGROUP_LE_MASK,
-   SYSTEM_VALUE_SUBGROUP_LT_MASK,
-   /*@}*/
-
-   /**
-    * Builtin variables added by VK_KHR_subgroups
-    */
-   /*@{*/
-   SYSTEM_VALUE_NUM_SUBGROUPS,
-   SYSTEM_VALUE_SUBGROUP_ID,
-   /*@}*/
-
-   /*@}*/
-
-   /**
-    * \name Vertex shader system values
-    */
-   /*@{*/
-   /**
-    * OpenGL-style vertex ID.
-    *
-    * Section 2.11.7 (Shader Execution), subsection Shader Inputs, of the
-    * OpenGL 3.3 core profile spec says:
-    *
-    *     "gl_VertexID holds the integer index i implicitly passed by
-    *     DrawArrays or one of the other drawing commands defined in section
-    *     2.8.3."
-    *
-    * Section 2.8.3 (Drawing Commands) of the same spec says:
-    *
-    *     "The commands....are equivalent to the commands with the same base
-    *     name (without the BaseVertex suffix), except that the ith element
-    *     transferred by the corresponding draw call will be taken from
-    *     element indices[i] + basevertex of each enabled array."
-    *
-    * Additionally, the overview in the GL_ARB_shader_draw_parameters spec
-    * says:
-    *
-    *     "In unextended GL, vertex shaders have inputs named gl_VertexID and
-    *     gl_InstanceID, which contain, respectively the index of the vertex
-    *     and instance. The value of gl_VertexID is the implicitly passed
-    *     index of the vertex being processed, which includes the value of
-    *     baseVertex, for those commands that accept it."
-    *
-    * gl_VertexID gets basevertex added in.  This differs from DirectX where
-    * SV_VertexID does \b not get basevertex added in.
-    *
-    * \note
-    * If all system values are available, \c SYSTEM_VALUE_VERTEX_ID will be
-    * equal to \c SYSTEM_VALUE_VERTEX_ID_ZERO_BASE plus
-    * \c SYSTEM_VALUE_BASE_VERTEX.
-    *
-    * \sa SYSTEM_VALUE_VERTEX_ID_ZERO_BASE, SYSTEM_VALUE_BASE_VERTEX
-    */
-   SYSTEM_VALUE_VERTEX_ID,
-
-   /**
-    * Instance ID as supplied to gl_InstanceID
-    *
-    * Values assigned to gl_InstanceID always begin with zero, regardless of
-    * the value of baseinstance.
-    *
-    * Section 11.1.3.9 (Shader Inputs) of the OpenGL 4.4 core profile spec
-    * says:
-    *
-    *     "gl_InstanceID holds the integer instance number of the current
-    *     primitive in an instanced draw call (see section 10.5)."
-    *
-    * Through a big chain of pseudocode, section 10.5 describes that
-    * baseinstance is not counted by gl_InstanceID.  In that section, notice
-    *
-    *     "If an enabled vertex attribute array is instanced (it has a
-    *     non-zero divisor as specified by VertexAttribDivisor), the element
-    *     index that is transferred to the GL, for all vertices, is given by
-    *
-    *         floor(instance/divisor) + baseinstance
-    *
-    *     If an array corresponding to an attribute required by a vertex
-    *     shader is not enabled, then the corresponding element is taken from
-    *     the current attribute state (see section 10.2)."
-    *
-    * Note that baseinstance is \b not included in the value of instance.
-    */
-   SYSTEM_VALUE_INSTANCE_ID,
-
-   /**
-    * Vulkan InstanceIndex.
-    *
-    * InstanceIndex = gl_InstanceID + gl_BaseInstance
-    */
-   SYSTEM_VALUE_INSTANCE_INDEX,
-
-   /**
-    * DirectX-style vertex ID.
-    *
-    * Unlike \c SYSTEM_VALUE_VERTEX_ID, this system value does \b not include
-    * the value of basevertex.
-    *
-    * \sa SYSTEM_VALUE_VERTEX_ID, SYSTEM_VALUE_BASE_VERTEX
-    */
-   SYSTEM_VALUE_VERTEX_ID_ZERO_BASE,
-
-   /**
-    * Value of \c basevertex passed to \c glDrawElementsBaseVertex and similar
-    * functions.
-    *
-    * \sa SYSTEM_VALUE_VERTEX_ID, SYSTEM_VALUE_VERTEX_ID_ZERO_BASE
-    */
-   SYSTEM_VALUE_BASE_VERTEX,
-
-   /**
-    * Depending on the type of the draw call (indexed or non-indexed),
-    * is the value of \c basevertex passed to \c glDrawElementsBaseVertex and
-    * similar, or is the value of \c first passed to \c glDrawArrays and
-    * similar.
-    *
-    * \note
-    * It can be used to calculate the \c SYSTEM_VALUE_VERTEX_ID as
-    * \c SYSTEM_VALUE_VERTEX_ID_ZERO_BASE plus \c SYSTEM_VALUE_FIRST_VERTEX.
-    *
-    * \sa SYSTEM_VALUE_VERTEX_ID_ZERO_BASE, SYSTEM_VALUE_VERTEX_ID
-    */
-   SYSTEM_VALUE_FIRST_VERTEX,
-
-   /**
-    * Whether the draw command that started the rendering was an indexed
-    * draw (~0) or not (0). Useful to calculate \c SYSTEM_VALUE_BASE_VERTEX as
-    * \c SYSTEM_VALUE_IS_INDEXED_DRAW & \c SYSTEM_VALUE_FIRST_VERTEX.
-    */
-   SYSTEM_VALUE_IS_INDEXED_DRAW,
-
-   /**
-    * Value of \c baseinstance passed to instanced draw entry points
-    *
-    * \sa SYSTEM_VALUE_INSTANCE_ID
-    */
-   SYSTEM_VALUE_BASE_INSTANCE,
-
-   /**
-    * From GL_ARB_shader_draw_parameters:
-    *
-    *   "Additionally, this extension adds a further built-in variable,
-    *    gl_DrawID to the shading language. This variable contains the index
-    *    of the draw currently being processed by a Multi* variant of a
-    *    drawing command (such as MultiDrawElements or
-    *    MultiDrawArraysIndirect)."
-    *
-    * If GL_ARB_multi_draw_indirect is not supported, this is always 0.
-    */
-   SYSTEM_VALUE_DRAW_ID,
-   /*@}*/
-
-   /**
-    * \name Geometry shader system values
-    */
-   /*@{*/
-   SYSTEM_VALUE_INVOCATION_ID,  /**< (Also in Tessellation Control shader) */
-   /*@}*/
-
-   /**
-    * \name Fragment shader system values
-    */
-   /*@{*/
-   SYSTEM_VALUE_FRAG_COORD,
-   SYSTEM_VALUE_POINT_COORD,
-   SYSTEM_VALUE_FRONT_FACE,
-   SYSTEM_VALUE_SAMPLE_ID,
-   SYSTEM_VALUE_SAMPLE_POS,
-   SYSTEM_VALUE_SAMPLE_MASK_IN,
-   SYSTEM_VALUE_HELPER_INVOCATION,
-   SYSTEM_VALUE_COLOR0,
-   SYSTEM_VALUE_COLOR1,
-   /*@}*/
-
-   /**
-    * \name Tessellation Evaluation shader system values
-    */
-   /*@{*/
-   SYSTEM_VALUE_TESS_COORD,
-   SYSTEM_VALUE_VERTICES_IN,    /**< Tessellation vertices in input patch */
-   SYSTEM_VALUE_PRIMITIVE_ID,
-   SYSTEM_VALUE_TESS_LEVEL_OUTER, /**< TES input */
-   SYSTEM_VALUE_TESS_LEVEL_INNER, /**< TES input */
-   SYSTEM_VALUE_TESS_LEVEL_OUTER_DEFAULT, /**< TCS input for passthru TCS */
-   SYSTEM_VALUE_TESS_LEVEL_INNER_DEFAULT, /**< TCS input for passthru TCS */
-   /*@}*/
-
-   /**
-    * \name Compute shader system values
-    */
-   /*@{*/
-   SYSTEM_VALUE_LOCAL_INVOCATION_ID,
-   SYSTEM_VALUE_LOCAL_INVOCATION_INDEX,
-   SYSTEM_VALUE_GLOBAL_INVOCATION_ID,
-   SYSTEM_VALUE_GLOBAL_INVOCATION_INDEX,
-   SYSTEM_VALUE_WORK_GROUP_ID,
-   SYSTEM_VALUE_NUM_WORK_GROUPS,
-   SYSTEM_VALUE_LOCAL_GROUP_SIZE,
-   SYSTEM_VALUE_GLOBAL_GROUP_SIZE,
-   SYSTEM_VALUE_WORK_DIM,
-   SYSTEM_VALUE_USER_DATA_AMD,
-   /*@}*/
-
-   /** Required for VK_KHR_device_group */
-   SYSTEM_VALUE_DEVICE_INDEX,
-
-   /** Required for VK_KHX_multiview */
-   SYSTEM_VALUE_VIEW_INDEX,
-
-   /**
-    * Driver internal vertex-count, used (for example) for drivers to
-    * calculate stride for stream-out outputs.  Not externally visible.
-    */
-   SYSTEM_VALUE_VERTEX_CNT,
-
-   /**
-    * Required for AMD_shader_explicit_vertex_parameter and also used for
-    * varying-fetch instructions.
-    *
-    * The _SIZE value is "primitive size", used to scale i/j in primitive
-    * space to pixel space.
-    */
-   SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL,
-   SYSTEM_VALUE_BARYCENTRIC_PERSP_SAMPLE,
-   SYSTEM_VALUE_BARYCENTRIC_PERSP_CENTROID,
-   SYSTEM_VALUE_BARYCENTRIC_PERSP_SIZE,
-   SYSTEM_VALUE_BARYCENTRIC_LINEAR_PIXEL,
-   SYSTEM_VALUE_BARYCENTRIC_LINEAR_CENTROID,
-   SYSTEM_VALUE_BARYCENTRIC_LINEAR_SAMPLE,
-   SYSTEM_VALUE_BARYCENTRIC_PULL_MODEL,
-
-   /**
-    * IR3 specific geometry shader and tessellation control shader system
-    * values that pack invocation id, thread id and vertex id.  Having this
-    * as a nir level system value lets us do the unpacking in nir.
-    */
-   SYSTEM_VALUE_GS_HEADER_IR3,
-   SYSTEM_VALUE_TCS_HEADER_IR3,
-
-   SYSTEM_VALUE_MAX             /**< Number of values */
-} gl_system_value;
-
-const char *gl_system_value_name(gl_system_value sysval);
-
-/**
- * The possible interpolation qualifiers that can be applied to a fragment
- * shader input in GLSL.
- *
- * Note: INTERP_MODE_NONE must be 0 so that memsetting the
- * ir_variable data structure to 0 causes the default behavior.
- *
- * The remaining enumerators follow consecutively (1..4), which makes
- * INTERP_MODE_COUNT equal to the number of real modes, NONE included.
- */
-enum glsl_interp_mode
-{
-   INTERP_MODE_NONE = 0,
-   INTERP_MODE_SMOOTH,
-   INTERP_MODE_FLAT,
-   INTERP_MODE_NOPERSPECTIVE,
-   INTERP_MODE_EXPLICIT,
-   INTERP_MODE_COUNT /**< Number of interpolation qualifiers */
-};
-
-enum glsl_interface_packing { /* NOTE(review): presumably the std140/shared/packed/std430 block layouts -- confirm */
-   GLSL_INTERFACE_PACKING_STD140, /* first enumerator, so its value is 0 */
-   GLSL_INTERFACE_PACKING_SHARED,
-   GLSL_INTERFACE_PACKING_PACKED,
-   GLSL_INTERFACE_PACKING_STD430
-};
-
-const char *glsl_interp_mode_name(enum glsl_interp_mode qual); /* only declared here; defined elsewhere */
-
-/**
- * Fragment program results
- *
- * The first five enumerators have explicit values 0..4; FRAG_RESULT_DATA1
- * through FRAG_RESULT_DATA7 continue consecutively from FRAG_RESULT_DATA0,
- * i.e. 5..11.
- */
-typedef enum
-{
-   FRAG_RESULT_DEPTH = 0,
-   FRAG_RESULT_STENCIL = 1,
-   /* If a single color should be written to all render targets, this
-    * register is written.  No FRAG_RESULT_DATAn will be written.
-    */
-   FRAG_RESULT_COLOR = 2,
-   FRAG_RESULT_SAMPLE_MASK = 3,
-
-   /* FRAG_RESULT_DATAn are the per-render-target (GLSL gl_FragData[n]
-    * or ARB_fragment_program fragment.color[n]) color results.  If
-    * any are written, FRAG_RESULT_COLOR will not be written.
-    * FRAG_RESULT_DATA1 and up are simply for the benefit of
-    * gl_frag_result_name() and not to be construed as an upper bound
-    */
-   FRAG_RESULT_DATA0 = 4,
-   FRAG_RESULT_DATA1,
-   FRAG_RESULT_DATA2,
-   FRAG_RESULT_DATA3,
-   FRAG_RESULT_DATA4,
-   FRAG_RESULT_DATA5,
-   FRAG_RESULT_DATA6,
-   FRAG_RESULT_DATA7,
-} gl_frag_result;
-
-const char *gl_frag_result_name(gl_frag_result result);
-
-#define FRAG_RESULT_MAX		(FRAG_RESULT_DATA0 + MAX_DRAW_BUFFERS) /* bound comes from MAX_DRAW_BUFFERS, not from the DATAn names above */
-
-/**
- * \brief Layout qualifiers for gl_FragDepth.
- *
- * Extension AMD_conservative_depth allows gl_FragDepth to be redeclared with
- * a layout qualifier.
- *
- * Enumerators take the default values 0..4 in declaration order, so
- * FRAG_DEPTH_LAYOUT_NONE is 0.
- *
- * \see enum ir_depth_layout
- */
-enum gl_frag_depth_layout
-{
-   FRAG_DEPTH_LAYOUT_NONE, /**< No layout is specified. */
-   FRAG_DEPTH_LAYOUT_ANY,
-   FRAG_DEPTH_LAYOUT_GREATER,
-   FRAG_DEPTH_LAYOUT_LESS,
-   FRAG_DEPTH_LAYOUT_UNCHANGED
-};
-
-/**
- * \brief Buffer access qualifiers
- *
- * Every enumerator is a distinct single bit (bits 0 through 7), so several
- * qualifiers can be held in one value by OR-ing them together.
- */
-enum gl_access_qualifier
-{
-   ACCESS_COHERENT      = (1 << 0),
-   ACCESS_RESTRICT      = (1 << 1),
-   ACCESS_VOLATILE      = (1 << 2),
-   ACCESS_NON_READABLE  = (1 << 3),
-   ACCESS_NON_WRITEABLE = (1 << 4),
-
-   /** The access may use a non-uniform buffer or image index */
-   ACCESS_NON_UNIFORM   = (1 << 5),
-
-   /* This has the same semantics as NIR_INTRINSIC_CAN_REORDER, only to be
-    * used with loads. In other words, it means that the load can be
-    * arbitrarily reordered, or combined with other loads to the same address.
-    * It is implied by ACCESS_NON_WRITEABLE together with ACCESS_RESTRICT, and
-    * a lack of ACCESS_COHERENT and ACCESS_VOLATILE.
-    */
-   ACCESS_CAN_REORDER = (1 << 6),
-
-   /** Use as little cache space as possible. */
-   ACCESS_STREAM_CACHE_POLICY = (1 << 7),
-};
-
-/**
- * \brief Blend support qualifiers
- *
- * Each blend mode occupies its own bit (0x0001 through 0x4000, fifteen bits
- * in total).  BLEND_NONE has no bit set and BLEND_ALL (0x7fff) is the OR of
- * all fifteen mode bits.
- */
-enum gl_advanced_blend_mode
-{
-   BLEND_NONE           = 0x0000,
-
-   BLEND_MULTIPLY       = 0x0001,
-   BLEND_SCREEN         = 0x0002,
-   BLEND_OVERLAY        = 0x0004,
-   BLEND_DARKEN         = 0x0008,
-   BLEND_LIGHTEN        = 0x0010,
-   BLEND_COLORDODGE     = 0x0020,
-   BLEND_COLORBURN      = 0x0040,
-   BLEND_HARDLIGHT      = 0x0080,
-   BLEND_SOFTLIGHT      = 0x0100,
-   BLEND_DIFFERENCE     = 0x0200,
-   BLEND_EXCLUSION      = 0x0400,
-   BLEND_HSL_HUE        = 0x0800,
-   BLEND_HSL_SATURATION = 0x1000,
-   BLEND_HSL_COLOR      = 0x2000,
-   BLEND_HSL_LUMINOSITY = 0x4000,
-
-   BLEND_ALL            = 0x7fff,
-};
-
-enum blend_func /* plain enumeration: values 0..4 in declaration order, not bit flags */
-{
-   BLEND_FUNC_ADD,
-   BLEND_FUNC_SUBTRACT,
-   BLEND_FUNC_REVERSE_SUBTRACT,
-   BLEND_FUNC_MIN,
-   BLEND_FUNC_MAX,
-};
-
-enum blend_factor /* plain enumeration: values 0..7 in declaration order, not bit flags */
-{
-   BLEND_FACTOR_ZERO,
-   BLEND_FACTOR_SRC_COLOR,
-   BLEND_FACTOR_DST_COLOR,
-   BLEND_FACTOR_SRC_ALPHA,
-   BLEND_FACTOR_DST_ALPHA,
-   BLEND_FACTOR_CONSTANT_COLOR,
-   BLEND_FACTOR_CONSTANT_ALPHA,
-   BLEND_FACTOR_SRC_ALPHA_SATURATE,
-};
-
-enum gl_tess_spacing /* NOTE(review): presumably the tessellation spacing layout qualifiers -- confirm */
-{
-   TESS_SPACING_UNSPECIFIED, /* first enumerator, so a zero-initialized field reads as UNSPECIFIED */
-   TESS_SPACING_EQUAL,
-   TESS_SPACING_FRACTIONAL_ODD,
-   TESS_SPACING_FRACTIONAL_EVEN,
-};
-
-/**
- * A compare function enum for use in compiler lowering passes.  This is in
- * the same order as GL's compare functions (shifted down by GL_NEVER), and is
- * exactly the same as gallium's PIPE_FUNC_*.
- *
- * No enumerator has an explicit initializer, so the values are 0
- * (COMPARE_FUNC_NEVER) through 7 (COMPARE_FUNC_ALWAYS).
- */
-enum compare_func
-{
-   COMPARE_FUNC_NEVER,
-   COMPARE_FUNC_LESS,
-   COMPARE_FUNC_EQUAL,
-   COMPARE_FUNC_LEQUAL,
-   COMPARE_FUNC_GREATER,
-   COMPARE_FUNC_NOTEQUAL,
-   COMPARE_FUNC_GEQUAL,
-   COMPARE_FUNC_ALWAYS,
-};
-
-/**
- * Arrangements for grouping invocations from NV_compute_shader_derivatives.
- *
- *   The extension provides new layout qualifiers that support two different
- *   arrangements of compute shader invocations for the purpose of derivative
- *   computation.  When specifying
- *
- *     layout(derivative_group_quadsNV) in;
- *
- *   compute shader invocations are grouped into 2x2x1 arrays whose four local
- *   invocation ID values follow the pattern:
- *
- *       +-----------------+------------------+
- *       | (2x+0, 2y+0, z) |  (2x+1, 2y+0, z) |
- *       +-----------------+------------------+
- *       | (2x+0, 2y+1, z) |  (2x+1, 2y+1, z) |
- *       +-----------------+------------------+
- *
- *   where Y increases from bottom to top.  When specifying
- *
- *     layout(derivative_group_linearNV) in;
- *
- *   compute shader invocations are grouped into 2x2x1 arrays whose four local
- *   invocation index values follow the pattern:
- *
- *       +------+------+
- *       | 4n+0 | 4n+1 |
- *       +------+------+
- *       | 4n+2 | 4n+3 |
- *       +------+------+
- *
- *   If neither layout qualifier is specified, derivatives in compute shaders
- *   return zero, which is consistent with the handling of built-in texture
- *   functions like texture() in GLSL 4.50 compute shaders.
- */
-enum gl_derivative_group {
-   DERIVATIVE_GROUP_NONE = 0, /* presumably the "neither qualifier specified" case described above */
-   DERIVATIVE_GROUP_QUADS,    /* presumably layout(derivative_group_quadsNV) */
-   DERIVATIVE_GROUP_LINEAR,   /* presumably layout(derivative_group_linearNV) */
-};
-
-enum float_controls /* bit flags: each non-zero enumerator is one distinct bit, in FP16/FP32/FP64 triples */
-{
-   FLOAT_CONTROLS_DEFAULT_FLOAT_CONTROL_MODE        = 0x0000, /* no flag set */
-   FLOAT_CONTROLS_DENORM_PRESERVE_FP16              = 0x0001,
-   FLOAT_CONTROLS_DENORM_PRESERVE_FP32              = 0x0002,
-   FLOAT_CONTROLS_DENORM_PRESERVE_FP64              = 0x0004,
-   FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP16         = 0x0008,
-   FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP32         = 0x0010,
-   FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP64         = 0x0020,
-   FLOAT_CONTROLS_SIGNED_ZERO_INF_NAN_PRESERVE_FP16 = 0x0040,
-   FLOAT_CONTROLS_SIGNED_ZERO_INF_NAN_PRESERVE_FP32 = 0x0080,
-   FLOAT_CONTROLS_SIGNED_ZERO_INF_NAN_PRESERVE_FP64 = 0x0100,
-   FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP16            = 0x0200,
-   FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP32            = 0x0400,
-   FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP64            = 0x0800,
-   FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP16            = 0x1000,
-   FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP32            = 0x2000,
-   FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP64            = 0x4000,
-};
-
-#ifdef __cplusplus
-} /* extern "C" */
-#endif
-
-#endif /* SHADER_ENUMS_H */
--- a/extra/disassemblers/adreno/util/bitscan.h
+++ b/extra/disassemblers/adreno/util/bitscan.h
@ -1,326 +0,0 @@
-/**************************************************************************
- *
- * Copyright 2008 VMware, Inc.
- * All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- **************************************************************************/
-
-
-#ifndef BITSCAN_H
-#define BITSCAN_H
-
-#include <assert.h>
-#include <stdint.h>
-#include <stdbool.h>
-#include <string.h>
-
-#if defined(_MSC_VER)
-#include <intrin.h>
-#endif
-
-#if defined(__POPCNT__)
-#include <popcntintrin.h>
-#endif
-
-//#include "c99_compat.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-
-/**
- * Find first bit set in word.  Least significant bit is 1.
- * Return 0 if no bits set.
- *
- * ffs() is provided in one of three ways, chosen at preprocessing time:
- *  - HAVE___BUILTIN_FFS defined: ffs becomes a macro alias for
- *    __builtin_ffs;
- *  - MSVC on the architectures listed below: an inline wrapper around
- *    _BitScanForward();
- *  - otherwise: only an extern declaration, so the definition has to be
- *    supplied at link time.
- *
- * ffsll() is the long long counterpart and is selected the same way, using
- * HAVE___BUILTIN_FFSLL and _BitScanForward64().  Note that its MSVC branch
- * lists a different set of architectures than the ffs() one.
- */
-#ifdef HAVE___BUILTIN_FFS
-#define ffs __builtin_ffs
-#elif defined(_MSC_VER) && (_M_IX86 || _M_ARM || _M_AMD64 || _M_IA64)
-static inline
-int ffs(int i)
-{
-   unsigned long index;
-   if (_BitScanForward(&index, i)) /* taken when a set bit was found */
-      return index + 1;            /* +1 turns the scan result into the 1-based answer */
-   else
-      return 0;
-}
-#else
-extern
-int ffs(int i);
-#endif
-
-#ifdef HAVE___BUILTIN_FFSLL
-#define ffsll __builtin_ffsll
-#elif defined(_MSC_VER) && (_M_AMD64 || _M_ARM64 || _M_IA64)
-static inline int
-ffsll(long long int i)
-{
-   unsigned long index;
-   if (_BitScanForward64(&index, i)) /* taken when a set bit was found */
-      return index + 1;              /* +1 turns the scan result into the 1-based answer */
-   else
-      return 0;
-}
-#else
-extern int
-ffsll(long long int val);
-#endif
-
-
-/* Destructively loop over all of the bits in a mask as in:
- *
- * while (mymask) {
- *   int i = u_bit_scan(&mymask);
- *   ... process element i
- * }
- *
- * Each call returns the 0-based index of the lowest set bit of *mask and
- * clears that bit in place.  The mask must be non-zero on entry (the loop
- * above guarantees it): for 0, ffs() returns 0, i becomes -1 and the shift
- * count is negative.
- *
- * u_bit_scan64() is the same operation on a uint64_t mask, using ffsll().
- */
-static inline int
-u_bit_scan(unsigned *mask)
-{
-   const int i = ffs(*mask) - 1; /* ffs() is 1-based, i is 0-based */
-   *mask ^= (1u << i);           /* bit i is known to be set, so XOR clears it */
-   return i;
-}
-
-static inline int
-u_bit_scan64(uint64_t *mask)
-{
-   const int i = ffsll(*mask) - 1;  /* ffsll() is 1-based, i is 0-based */
-   *mask ^= (((uint64_t)1) << i);   /* bit i is known to be set, so XOR clears it */
-   return i;
-}
-
-/* Determine if an unsigned value is a power of two.
- *
- * \note
- * Zero is treated as a power of two.
- *
- * v & (v - 1) clears the lowest set bit of v, so the result is 0 exactly
- * when v has at most one bit set.
- */
-static inline bool
-util_is_power_of_two_or_zero(unsigned v)
-{
-   return (v & (v - 1)) == 0;
-}
-
-/* Determine if a uint64_t value is a power of two.
- *
- * \note
- * Zero is treated as a power of two.
- *
- * 64-bit counterpart of util_is_power_of_two_or_zero(); same expression,
- * wider operand.
- */
-static inline bool
-util_is_power_of_two_or_zero64(uint64_t v)
-{
-   return (v & (v - 1)) == 0;
-}
-
-/* Determine if an unsigned value is a power of two.
- *
- * \note
- * Zero is \b not treated as a power of two.
- *
- * Both preprocessor branches below answer the same question, "does v have
- * exactly one bit set?"; they differ only in how it is computed.
- */
-static inline bool
-util_is_power_of_two_nonzero(unsigned v)
-{
-   /* __POPCNT__ is different from HAVE___BUILTIN_POPCOUNT.  The latter
-    * indicates the existence of the __builtin_popcount function.  The former
-    * indicates that _mm_popcnt_u32 exists and is a native instruction.
-    *
-    * The other alternative is to use SSE 4.2 compile-time flags.  This has
-    * two drawbacks.  First, there is currently no build infrastructure for
-    * SSE 4.2 (only 4.1), so that would have to be added.  Second, some AMD
-    * CPUs support POPCNT but not SSE 4.2 (e.g., Barcelona).
-    */
-#ifdef __POPCNT__
-   return _mm_popcnt_u32(v) == 1;
-#else
-   return v != 0 && (v & (v - 1)) == 0; /* v != 0 excludes the zero case that the bit trick alone accepts */
-#endif
-}
-
-/* For looping over a bitmask when you want to loop over consecutive bits
- * manually, for example:
- *
- * while (mask) {
- *    int start, count, i;
- *
- *    u_bit_scan_consecutive_range(&mask, &start, &count);
- *
- *    for (i = 0; i < count; i++)
- *       ... process element (start+i)
- * }
- *
- * On return *start is the 0-based index of the lowest set bit of the
- * incoming mask, *count is the length of the run of consecutive set bits
- * beginning there, and that run has been cleared from *mask.  The mask must
- * be non-zero on entry (as in the loop above): for 0, ffs() returns 0 and
- * *start becomes -1.
- *
- * The all-ones mask is handled up front: without that, ~(*mask >> *start)
- * would be 0, ffs() would return 0 and *count would come out as -1.
- *
- * u_bit_scan_consecutive_range64() is the uint64_t variant built on ffsll().
- */
-static inline void
-u_bit_scan_consecutive_range(unsigned *mask, int *start, int *count)
-{
-   if (*mask == 0xffffffff) {
-      *start = 0;
-      *count = 32;
-      *mask = 0;
-      return;
-   }
-   *start = ffs(*mask) - 1;                     /* lowest set bit, 0-based */
-   *count = ffs(~(*mask >> *start)) - 1;        /* first clear bit above start == run length */
-   *mask &= ~(((1u << *count) - 1) << *start);  /* drop the run just reported */
-}
-
-static inline void
-u_bit_scan_consecutive_range64(uint64_t *mask, int *start, int *count)
-{
-   if (*mask == ~0ull) {
-      *start = 0;
-      *count = 64;
-      *mask = 0;
-      return;
-   }
-   *start = ffsll(*mask) - 1;                              /* lowest set bit, 0-based */
-   *count = ffsll(~(*mask >> *start)) - 1;                 /* first clear bit above start == run length */
-   *mask &= ~(((((uint64_t)1) << *count) - 1) << *start);  /* drop the run just reported */
-}
-
-
-/**
- * Find last bit set in a word.  The least significant bit is 1.
- * Return 0 if no bits are set.
- * Essentially ffs() in the reverse direction.
- *
- * Three implementations are selected at preprocessing time: the
- * __builtin_clz() form when HAVE___BUILTIN_CLZ is defined, an MSVC
- * _BitScanReverse() form, and a portable shift-and-count loop otherwise.
- */
-static inline unsigned
-util_last_bit(unsigned u)
-{
-#if defined(HAVE___BUILTIN_CLZ)
-   return u == 0 ? 0 : 32 - __builtin_clz(u); /* u == 0 is tested first and never reaches the builtin */
-#elif defined(_MSC_VER) && (_M_IX86 || _M_ARM || _M_AMD64 || _M_IA64)
-   unsigned long index;
-   if (_BitScanReverse(&index, u))
-      return index + 1;
-   else
-      return 0;
-#else
-   unsigned r = 0;
-   while (u) { /* one iteration per bit position up to the highest set bit */
-      r++;
-      u >>= 1;
-   }
-   return r;
-#endif
-}
-
-/**
- * Find last bit set in a word.  The least significant bit is 1.
- * Return 0 if no bits are set.
- * Essentially ffsll() in the reverse direction.
- *
- * Three implementations are selected at preprocessing time: the
- * __builtin_clzll() form when HAVE___BUILTIN_CLZLL is defined, an MSVC
- * _BitScanReverse64() form, and a portable shift-and-count loop otherwise.
- */
-static inline unsigned
-util_last_bit64(uint64_t u)
-{
-#if defined(HAVE___BUILTIN_CLZLL)
-   return u == 0 ? 0 : 64 - __builtin_clzll(u); /* u == 0 is tested first and never reaches the builtin */
-#elif defined(_MSC_VER) && (_M_AMD64 || _M_ARM64 || _M_IA64)
-   unsigned long index;
-   if (_BitScanReverse64(&index, u))
-      return index + 1;
-   else
-      return 0;
-#else
-   unsigned r = 0;
-   while (u) { /* one iteration per bit position up to the highest set bit */
-      r++;
-      u >>= 1;
-   }
-   return r;
-#endif
-}
-
-/**
- * Find last bit in a word that does not match the sign bit. The least
- * significant bit is 1.
- * Return 0 if no bit differs from the sign bit, i.e. for 0 and for -1.
- *
- * Non-negative inputs are passed straight to util_last_bit(); negative
- * inputs are bitwise complemented first, so the search is for the highest
- * zero bit of the original value.
- */
-static inline unsigned
-util_last_bit_signed(int i)
-{
-   if (i >= 0)
-      return util_last_bit(i);
-   else
-      return util_last_bit(~(unsigned)i);
-}
-
-/* Returns a bitfield in which the first count bits starting at start are
- * set.
- *
- * start + count must not exceed the width of the result (32 here, 64 for
- * u_bit_consecutive64()); this is only checked by assert().  A full-width
- * count is returned directly as all-ones instead of going through the
- * shift expression, whose shift count would equal the type width.
- */
-static inline unsigned
-u_bit_consecutive(unsigned start, unsigned count)
-{
-   assert(start + count <= 32);
-   if (count == 32)
-      return ~0; /* given the assert above, start is 0 here */
-   return ((1u << count) - 1) << start;
-}
-
-static inline uint64_t
-u_bit_consecutive64(unsigned start, unsigned count)
-{
-   assert(start + count <= 64);
-   if (count == 64)
-      return ~(uint64_t)0; /* given the assert above, start is 0 here */
-   return (((uint64_t)1 << count) - 1) << start;
-}
-
-/**
- * Return number of bits set in n.
- *
- * util_bitcount() uses __builtin_popcount() when HAVE___BUILTIN_POPCOUNT is
- * defined and a clear-lowest-bit loop otherwise.  util_bitcount64() uses
- * __builtin_popcountll() when HAVE___BUILTIN_POPCOUNTLL is defined and
- * otherwise adds up two util_bitcount() calls.
- */
-static inline unsigned
-util_bitcount(unsigned n)
-{
-#if defined(HAVE___BUILTIN_POPCOUNT)
-   return __builtin_popcount(n);
-#else
-   /* K&R classic bitcount.
-    *
-    * For each iteration, clear the LSB from the bitfield.
-    * Requires only one iteration per set bit, instead of
-    * one iteration per bit less than highest set bit.
-    */
-   unsigned bits;
-   for (bits = 0; n; bits++) {
-      n &= n - 1;
-   }
-   return bits;
-#endif
-}
-
-static inline unsigned
-util_bitcount64(uint64_t n)
-{
-#ifdef HAVE___BUILTIN_POPCOUNTLL
-   return __builtin_popcountll(n);
-#else
-   return util_bitcount(n) + util_bitcount(n >> 32); /* first call sees n converted to unsigned; assumes unsigned is 32 bits -- TODO confirm */
-#endif
-}
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* BITSCAN_H */
--- a/extra/disassemblers/adreno/util/bitset.h
+++ b/extra/disassemblers/adreno/util/bitset.h
@ -1,261 +0,0 @@
-/*
- * Mesa 3-D graphics library
- *
- * Copyright (C) 2006  Brian Paul   All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included
- * in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
- * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
- * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
- * OTHER DEALINGS IN THE SOFTWARE.
- */
-
-/**
- * \file bitset.h
- * \brief Bitset of arbitrary size definitions.
- * \author Michal Krol
- */
-
-#ifndef BITSET_H
-#define BITSET_H
-
-//#include "util/bitscan.h"
-//#include "util/macros.h"
-
-/****************************************************************************
- * generic bitset implementation
- */
-
-#define BITSET_WORD unsigned int
-#define BITSET_WORDBITS (sizeof (BITSET_WORD) * 8)
-
-/* bitset declarations
- */
-#define BITSET_WORDS(bits) (((bits) + BITSET_WORDBITS - 1) / BITSET_WORDBITS)
-#define BITSET_DECLARE(name, bits) BITSET_WORD name[BITSET_WORDS(bits)]
-
-/* bitset operations
- */
-#define BITSET_COPY(x, y) memcpy( (x), (y), sizeof (x) )
-#define BITSET_EQUAL(x, y) (memcmp( (x), (y), sizeof (x) ) == 0)
-#define BITSET_ZERO(x) memset( (x), 0, sizeof (x) )
-#define BITSET_ONES(x) memset( (x), 0xff, sizeof (x) )
-
-#define BITSET_BITWORD(b) ((b) / BITSET_WORDBITS)
-#define BITSET_BIT(b) (1u << ((b) % BITSET_WORDBITS))
-
-/* single bit operations
- */
-#define BITSET_TEST(x, b) (((x)[BITSET_BITWORD(b)] & BITSET_BIT(b)) != 0)
-#define BITSET_SET(x, b) ((x)[BITSET_BITWORD(b)] |= BITSET_BIT(b))
-#define BITSET_CLEAR(x, b) ((x)[BITSET_BITWORD(b)] &= ~BITSET_BIT(b))
-
-#define BITSET_MASK(b) (((b) % BITSET_WORDBITS == 0) ? ~0 : BITSET_BIT(b) - 1)
-#define BITSET_RANGE(b, e) ((BITSET_MASK((e) + 1)) & ~(BITSET_BIT(b) - 1))
-
-/* bit range operations
- */
-#define BITSET_TEST_RANGE(x, b, e) \
-   (BITSET_BITWORD(b) == BITSET_BITWORD(e) ? \
-   (((x)[BITSET_BITWORD(b)] & BITSET_RANGE(b, e)) != 0) : \
-   (assert (!"BITSET_TEST_RANGE: bit range crosses word boundary"), 0))
-#define BITSET_SET_RANGE(x, b, e) \
-   (BITSET_BITWORD(b) == BITSET_BITWORD(e) ? \
-   ((x)[BITSET_BITWORD(b)] |= BITSET_RANGE(b, e)) : \
-   (assert (!"BITSET_SET_RANGE: bit range crosses word boundary"), 0))
-#define BITSET_CLEAR_RANGE(x, b, e) \
-   (BITSET_BITWORD(b) == BITSET_BITWORD(e) ? \
-   ((x)[BITSET_BITWORD(b)] &= ~BITSET_RANGE(b, e)) : \
-   (assert (!"BITSET_CLEAR_RANGE: bit range crosses word boundary"), 0))
-
-/* Get first bit set in a bitset.
- */
-static inline int
-__bitset_ffs(const BITSET_WORD *x, int n)
-{
-   int i;
-
-   for (i = 0; i < n; i++) {
-      if (x[i])
-	 return ffs(x[i]) + BITSET_WORDBITS * i;
-   }
-
-   return 0;
-}
-
-#define BITSET_FFS(x) __bitset_ffs(x, ARRAY_SIZE(x))
-
-static inline unsigned
-__bitset_next_set(unsigned i, BITSET_WORD *tmp,
-                  const BITSET_WORD *set, unsigned size)
-{
-   unsigned bit, word;
-
-   /* NOTE: The initial conditions for this function are very specific.  At
-    * the start of the loop, the tmp variable must be set to *set and the
-    * initial i value set to 0.  This way, if there is a bit set in the first
-    * word, we ignore the i-value and just grab that bit (so 0 is ok, even
-    * though 0 may be returned).  If the first word is 0, then the value of
-    * `word` will be 0 and we will go on to look at the second word.
-    */
-   word = BITSET_BITWORD(i);
-   while (*tmp == 0) {
-      word++;
-
-      if (word >= BITSET_WORDS(size))
-         return size;
-
-      *tmp = set[word];
-   }
-
-   /* Find the next set bit in the non-zero word */
-   bit = ffs(*tmp) - 1;
-
-   /* Unset the bit */
-   *tmp &= ~(1ull << bit);
-
-   return word * BITSET_WORDBITS + bit;
-}
-
-/**
- * Iterates over each set bit in a set
- *
- * @param __i    iteration variable, bit number
- * @param __set  the bitset to iterate (will not be modified)
- * @param __size number of bits in the set to consider
- */
-#define BITSET_FOREACH_SET(__i, __set, __size) \
-   for (BITSET_WORD __tmp = *(__set), *__foo = &__tmp; __foo != NULL; __foo = NULL) \
-      for (__i = 0; \
-           (__i = __bitset_next_set(__i, &__tmp, __set, __size)) < __size;)
-
-#ifdef __cplusplus
-
-/**
- * Simple C++ wrapper of a bitset type of static size, with value semantics
- * and basic bitwise arithmetic operators.  The operators defined below are
- * expected to have the same semantics as the same operator applied to other
- * fundamental integer types.  T is the name of the struct to instantiate
- * it as, and N is the number of bits in the bitset.
- */
-#define DECLARE_BITSET_T(T, N) struct T {                       \
-      EXPLICIT_CONVERSION                                       \
-      operator bool() const                                     \
-      {                                                         \
-         for (unsigned i = 0; i < BITSET_WORDS(N); i++)         \
-            if (words[i])                                       \
-               return true;                                     \
-         return false;                                          \
-      }                                                         \
-                                                                \
-      T &                                                       \
-      operator=(int x)                                          \
-      {                                                         \
-         const T c = {{ (BITSET_WORD)x }};                      \
-         return *this = c;                                      \
-      }                                                         \
-                                                                \
-      friend bool                                               \
-      operator==(const T &b, const T &c)                        \
-      {                                                         \
-         return BITSET_EQUAL(b.words, c.words);                 \
-      }                                                         \
-                                                                \
-      friend bool                                               \
-      operator!=(const T &b, const T &c)                        \
-      {                                                         \
-         return !(b == c);                                      \
-      }                                                         \
-                                                                \
-      friend bool                                               \
-      operator==(const T &b, int x)                             \
-      {                                                         \
-         const T c = {{ (BITSET_WORD)x }};                      \
-         return b == c;                                         \
-      }                                                         \
-                                                                \
-      friend bool                                               \
-      operator!=(const T &b, int x)                             \
-      {                                                         \
-         return !(b == x);                                      \
-      }                                                         \
-                                                                \
-      friend T                                                  \
-      operator~(const T &b)                                     \
-      {                                                         \
-         T c;                                                   \
-         for (unsigned i = 0; i < BITSET_WORDS(N); i++)         \
-            c.words[i] = ~b.words[i];                           \
-         return c;                                              \
-      }                                                         \
-                                                                \
-      T &                                                       \
-      operator|=(const T &b)                                    \
-      {                                                         \
-         for (unsigned i = 0; i < BITSET_WORDS(N); i++)         \
-            words[i] |= b.words[i];                             \
-         return *this;                                          \
-      }                                                         \
-                                                                \
-      friend T                                                  \
-      operator|(const T &b, const T &c)                         \
-      {                                                         \
-         T d = b;                                               \
-         d |= c;                                                \
-         return d;                                              \
-      }                                                         \
-                                                                \
-      T &                                                       \
-      operator&=(const T &b)                                    \
-      {                                                         \
-         for (unsigned i = 0; i < BITSET_WORDS(N); i++)         \
-            words[i] &= b.words[i];                             \
-         return *this;                                          \
-      }                                                         \
-                                                                \
-      friend T                                                  \
-      operator&(const T &b, const T &c)                         \
-      {                                                         \
-         T d = b;                                               \
-         d &= c;                                                \
-         return d;                                              \
-      }                                                         \
-                                                                \
-      bool                                                      \
-      test(unsigned i) const                                    \
-      {                                                         \
-         return BITSET_TEST(words, i);                          \
-      }                                                         \
-                                                                \
-      T &                                                       \
-      set(unsigned i)                                           \
-      {                                                         \
-         BITSET_SET(words, i);                                  \
-         return *this;                                          \
-      }                                                         \
-                                                                \
-      T &                                                       \
-      clear(unsigned i)                                         \
-      {                                                         \
-         BITSET_CLEAR(words, i);                                \
-         return *this;                                          \
-      }                                                         \
-                                                                \
-      BITSET_WORD words[BITSET_WORDS(N)];                       \
-   }
-
-#endif
-
-#endif
--- a/extra/disassemblers/adreno/util/list.h
+++ b/extra/disassemblers/adreno/util/list.h
@ -1,262 +0,0 @@
-/**************************************************************************
- *
- * Copyright 2006 VMware, Inc., Bismarck, ND. USA.
- * All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
- * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM,
- * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
- * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
- * USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- *
- **************************************************************************/
-
-/**
- * \file
- * List macros heavily inspired by the Linux kernel
- * list handling. No list looping yet.
- *
- * Is not threadsafe, so common operations need to
- * be protected using an external mutex.
- */
-
-#ifndef _UTIL_LIST_H_
-#define _UTIL_LIST_H_
-
-
-#include <stdbool.h>
-#include <stddef.h>
-#include <assert.h>
-
-#ifdef DEBUG
-#  define list_assert(cond, msg)  assert(cond && msg)
-#else
-#  define list_assert(cond, msg)  (void)(0 && (cond))
-#endif
-
-struct list_head
-{
-    struct list_head *prev;
-    struct list_head *next;
-};
-
-static inline void list_inithead(struct list_head *item)
-{
-    item->prev = item;
-    item->next = item;
-}
-
-static inline void list_add(struct list_head *item, struct list_head *list)
-{
-    item->prev = list;
-    item->next = list->next;
-    list->next->prev = item;
-    list->next = item;
-}
-
-static inline void list_addtail(struct list_head *item, struct list_head *list)
-{
-    item->next = list;
-    item->prev = list->prev;
-    list->prev->next = item;
-    list->prev = item;
-}
-
-static inline bool list_is_empty(const struct list_head *list);
-
-static inline void list_replace(struct list_head *from, struct list_head *to)
-{
-    if (list_is_empty(from)) {
-        list_inithead(to);
-    } else {
-        to->prev = from->prev;
-        to->next = from->next;
-        from->next->prev = to;
-        from->prev->next = to;
-    }
-}
-
-static inline void list_del(struct list_head *item)
-{
-    item->prev->next = item->next;
-    item->next->prev = item->prev;
-    item->prev = item->next = NULL;
-}
-
-static inline void list_delinit(struct list_head *item)
-{
-    item->prev->next = item->next;
-    item->next->prev = item->prev;
-    item->next = item;
-    item->prev = item;
-}
-
-static inline bool list_is_empty(const struct list_head *list)
-{
-   return list->next == list;
-}
-
-/**
- * Returns whether the list has exactly one element.
- */
-static inline bool list_is_singular(const struct list_head *list)
-{
-   return list->next != NULL && list->next != list && list->next->next == list;
-}
-
-static inline unsigned list_length(const struct list_head *list)
-{
-   struct list_head *node;
-   unsigned length = 0;
-   for (node = list->next; node != list; node = node->next)
-      length++;
-   return length;
-}
-
-static inline void list_splice(struct list_head *src, struct list_head *dst)
-{
-   if (list_is_empty(src))
-      return;
-
-   src->next->prev = dst;
-   src->prev->next = dst->next;
-   dst->next->prev = src->prev;
-   dst->next = src->next;
-}
-
-static inline void list_splicetail(struct list_head *src, struct list_head *dst)
-{
-   if (list_is_empty(src))
-      return;
-
-   src->prev->next = dst;
-   src->next->prev = dst->prev;
-   dst->prev->next = src->next;
-   dst->prev = src->prev;
-}
-
-static inline void list_validate(const struct list_head *list)
-{
-   struct list_head *node;
-   assert(list->next->prev == list && list->prev->next == list);
-   for (node = list->next; node != list; node = node->next)
-      assert(node->next->prev == node && node->prev->next == node);
-}
-
-#define LIST_ENTRY(__type, __item, __field)   \
-    ((__type *)(((char *)(__item)) - offsetof(__type, __field)))
-
-/**
- * Cast from a pointer to a member of a struct back to the containing struct.
- *
- * 'sample' MUST be initialized, or else the result is undefined!
- */
-#ifndef container_of
-#define container_of(ptr, sample, member)				\
-    (void *)((char *)(ptr)						\
-	     - ((char *)&(sample)->member - (char *)(sample)))
-#endif
-
-#define list_first_entry(ptr, type, member) \
-        LIST_ENTRY(type, (ptr)->next, member)
-
-#define list_last_entry(ptr, type, member) \
-        LIST_ENTRY(type, (ptr)->prev, member)
-
-
-#define LIST_FOR_EACH_ENTRY(pos, head, member)				\
-   for (pos = NULL, pos = container_of((head)->next, pos, member);	\
-	&pos->member != (head);						\
-	pos = container_of(pos->member.next, pos, member))
-
-#define LIST_FOR_EACH_ENTRY_SAFE(pos, storage, head, member)	\
-   for (pos = NULL, pos = container_of((head)->next, pos, member),	\
-	storage = container_of(pos->member.next, pos, member);	\
-	&pos->member != (head);						\
-	pos = storage, storage = container_of(storage->member.next, storage, member))
-
-#define LIST_FOR_EACH_ENTRY_SAFE_REV(pos, storage, head, member)	\
-   for (pos = NULL, pos = container_of((head)->prev, pos, member),	\
-	storage = container_of(pos->member.prev, pos, member);		\
-	&pos->member != (head);						\
-	pos = storage, storage = container_of(storage->member.prev, storage, member))
-
-#define LIST_FOR_EACH_ENTRY_FROM(pos, start, head, member)		\
-   for (pos = NULL, pos = container_of((start), pos, member);		\
-	&pos->member != (head);						\
-	pos = container_of(pos->member.next, pos, member))
-
-#define LIST_FOR_EACH_ENTRY_FROM_REV(pos, start, head, member)		\
-   for (pos = NULL, pos = container_of((start), pos, member);		\
-	&pos->member != (head);						\
-	pos = container_of(pos->member.prev, pos, member))
-
-#define list_for_each_entry(type, pos, head, member)                    \
-   for (type *pos = LIST_ENTRY(type, (head)->next, member),             \
-	     *__next = LIST_ENTRY(type, pos->member.next, member);      \
-	&pos->member != (head);                                         \
-	pos = LIST_ENTRY(type, pos->member.next, member),               \
-	list_assert(pos == __next, "use _safe iterator"),               \
-	__next = LIST_ENTRY(type, __next->member.next, member))
-
-#define list_for_each_entry_safe(type, pos, head, member)               \
-   for (type *pos = LIST_ENTRY(type, (head)->next, member),             \
-	     *__next = LIST_ENTRY(type, pos->member.next, member);      \
-	&pos->member != (head);                                         \
-	pos = __next,                                                   \
-	__next = LIST_ENTRY(type, __next->member.next, member))
-
-#define list_for_each_entry_rev(type, pos, head, member)                \
-   for (type *pos = LIST_ENTRY(type, (head)->prev, member),             \
-	     *__prev = LIST_ENTRY(type, pos->member.prev, member);      \
-	&pos->member != (head);                                         \
-	pos = LIST_ENTRY(type, pos->member.prev, member),               \
-	list_assert(pos == __prev, "use _safe iterator"),               \
-	__prev = LIST_ENTRY(type, __prev->member.prev, member))
-
-#define list_for_each_entry_safe_rev(type, pos, head, member)           \
-   for (type *pos = LIST_ENTRY(type, (head)->prev, member),             \
-	     *__prev = LIST_ENTRY(type, pos->member.prev, member);      \
-	&pos->member != (head);                                         \
-	pos = __prev,                                                   \
-        __prev = LIST_ENTRY(type, __prev->member.prev, member))
-
-#define list_for_each_entry_from(type, pos, start, head, member)        \
-   for (type *pos = LIST_ENTRY(type, (start), member);                  \
-	&pos->member != (head);                                         \
-	pos = LIST_ENTRY(type, pos->member.next, member))
-
-#define list_for_each_entry_from_safe(type, pos, start, head, member)   \
-   for (type *pos = LIST_ENTRY(type, (start), member),                  \
-	     *__next = LIST_ENTRY(type, pos->member.next, member);      \
-	&pos->member != (head);                                         \
-	pos = __next,                                                   \
-	__next = LIST_ENTRY(type, __next->member.next, member))
-
-#define list_for_each_entry_from_rev(type, pos, start, head, member)    \
-   for (type *pos = LIST_ENTRY(type, (start), member);                  \
-	&pos->member != (head);                                         \
-	pos = LIST_ENTRY(type, pos->member.prev, member))
-
-#define list_pair_for_each_entry(type, pos1, pos2, head1, head2, member) \
-   for (type *pos1 = LIST_ENTRY(type, (head1)->next, member),           \
-             *pos2 = LIST_ENTRY(type, (head2)->next, member);           \
-        &pos1->member != (head1) && &pos2->member != (head2);           \
-	pos1 = LIST_ENTRY(type, pos1->member.next, member),               \
-	pos2 = LIST_ENTRY(type, pos2->member.next, member))
-
-#endif /*_UTIL_LIST_H_*/
--- a/extra/disassemblers/adreno/util/macros.h
+++ b/extra/disassemblers/adreno/util/macros.h
@ -1,346 +0,0 @@
-/*
- * Copyright © 2014 Intel Corporation
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- */
-
-#ifndef UTIL_MACROS_H
-#define UTIL_MACROS_H
-
-#include <assert.h>
-
-/* Compute the size of an array */
-#ifndef ARRAY_SIZE
-#  define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
-#endif
-
-/* For compatibility with Clang's __has_builtin() */
-#ifndef __has_builtin
-#  define __has_builtin(x) 0
-#endif
-
-/**
- * __builtin_expect macros
- */
-#if !defined(HAVE___BUILTIN_EXPECT)
-#  define __builtin_expect(x, y) (x)
-#endif
-
-#ifndef likely
-#  ifdef HAVE___BUILTIN_EXPECT
-#    define likely(x)   __builtin_expect(!!(x), 1)
-#    define unlikely(x) __builtin_expect(!!(x), 0)
-#  else
-#    define likely(x)   (x)
-#    define unlikely(x) (x)
-#  endif
-#endif
-
-
-/**
- * Static (compile-time) assertion.
- * Basically, use COND to dimension an array.  If COND is false/zero the
- * array size will be -1 and we'll get a compilation error.
- */
-#define STATIC_ASSERT(COND) \
-   do { \
-      (void) sizeof(char [1 - 2*!(COND)]); \
-   } while (0)
-
-
-/**
- * Unreachable macro. Useful for suppressing "control reaches end of non-void
- * function" warnings.
- */
-#if defined(HAVE___BUILTIN_UNREACHABLE) || __has_builtin(__builtin_unreachable)
-#define unreachable(str)    \
-do {                        \
-   assert(!str);            \
-   __builtin_unreachable(); \
-} while (0)
-#elif defined (_MSC_VER)
-#define unreachable(str)    \
-do {                        \
-   assert(!str);            \
-   __assume(0);             \
-} while (0)
-#else
-#define unreachable(str) assert(!str)
-#endif
-
-/**
- * Assume macro. Useful for expressing our assumptions to the compiler,
- * typically for purposes of silencing warnings.
- */
-#if __has_builtin(__builtin_assume)
-#define assume(expr)       \
-do {                       \
-   assert(expr);           \
-   __builtin_assume(expr); \
-} while (0)
-#elif defined HAVE___BUILTIN_UNREACHABLE
-#define assume(expr) ((expr) ? ((void) 0) \
-                             : (assert(!"assumption failed"), \
-                                __builtin_unreachable()))
-#elif defined (_MSC_VER)
-#define assume(expr) __assume(expr)
-#else
-#define assume(expr) assert(expr)
-#endif
-
-/* Attribute const is used for functions that have no effects other than their
- * return value, and only rely on the argument values to compute the return
- * value.  As a result, calls to it can be CSEed.  Note that using memory
- * pointed to by the arguments is not allowed for const functions.
- */
-#ifdef HAVE_FUNC_ATTRIBUTE_CONST
-#define ATTRIBUTE_CONST __attribute__((__const__))
-#else
-#define ATTRIBUTE_CONST
-#endif
-
-#ifdef HAVE_FUNC_ATTRIBUTE_FLATTEN
-#define FLATTEN __attribute__((__flatten__))
-#else
-#define FLATTEN
-#endif
-
-#ifdef HAVE_FUNC_ATTRIBUTE_FORMAT
-#define PRINTFLIKE(f, a) __attribute__ ((format(__printf__, f, a)))
-#else
-#define PRINTFLIKE(f, a)
-#endif
-
-#ifdef HAVE_FUNC_ATTRIBUTE_MALLOC
-#define MALLOCLIKE __attribute__((__malloc__))
-#else
-#define MALLOCLIKE
-#endif
-
-/* Forced function inlining */
-/* Note: Clang also sets __GNUC__ (see other cases below) */
-#ifndef ALWAYS_INLINE
-#  if defined(__GNUC__)
-#    define ALWAYS_INLINE inline __attribute__((always_inline))
-#  elif defined(_MSC_VER)
-#    define ALWAYS_INLINE __forceinline
-#  else
-#    define ALWAYS_INLINE inline
-#  endif
-#endif
-
-/* Used to optionally mark structures with misaligned elements or size as
- * packed, to trade off performance for space.
- */
-#ifdef HAVE_FUNC_ATTRIBUTE_PACKED
-#define PACKED __attribute__((__packed__))
-#else
-#define PACKED
-#endif
-
-/* Attribute pure is used for functions that have no effects other than their
- * return value.  As a result, calls to it can be dead code eliminated.
- */
-#ifdef HAVE_FUNC_ATTRIBUTE_PURE
-#define ATTRIBUTE_PURE __attribute__((__pure__))
-#else
-#define ATTRIBUTE_PURE
-#endif
-
-#ifdef HAVE_FUNC_ATTRIBUTE_RETURNS_NONNULL
-#define ATTRIBUTE_RETURNS_NONNULL __attribute__((__returns_nonnull__))
-#else
-#define ATTRIBUTE_RETURNS_NONNULL
-#endif
-
-#ifndef NORETURN
-#  ifdef _MSC_VER
-#    define NORETURN __declspec(noreturn)
-#  elif defined HAVE_FUNC_ATTRIBUTE_NORETURN
-#    define NORETURN __attribute__((__noreturn__))
-#  else
-#    define NORETURN
-#  endif
-#endif
-
-#ifdef __cplusplus
-/**
- * Macro function that evaluates to true if T is a trivially
- * destructible type -- that is, if its (non-virtual) destructor
- * performs no action and all member variables and base classes are
- * trivially destructible themselves.
- */
-#   if (defined(__clang__) && defined(__has_feature))
-#      if __has_feature(has_trivial_destructor)
-#         define HAS_TRIVIAL_DESTRUCTOR(T) __has_trivial_destructor(T)
-#      endif
-#   elif defined(__GNUC__)
-#      if ((__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ >= 3)))
-#         define HAS_TRIVIAL_DESTRUCTOR(T) __has_trivial_destructor(T)
-#      endif
-#   elif defined(_MSC_VER) && !defined(__INTEL_COMPILER)
-#      define HAS_TRIVIAL_DESTRUCTOR(T) __has_trivial_destructor(T)
-#   endif
-#   ifndef HAS_TRIVIAL_DESTRUCTOR
-       /* It's always safe (if inefficient) to assume that a
-        * destructor is non-trivial.
-        */
-#      define HAS_TRIVIAL_DESTRUCTOR(T) (false)
-#   endif
-#endif
-
-/**
- * PUBLIC/USED macros
- *
- * If we build the library with gcc's -fvisibility=hidden flag, we'll
- * use the PUBLIC macro to mark functions that are to be exported.
- *
- * We also need to define a USED attribute, so the optimizer doesn't
- * inline a static function that we later use in an alias. - ajax
- */
-#ifndef PUBLIC
-#  if defined(__GNUC__)
-#    define PUBLIC __attribute__((visibility("default")))
-#    define USED __attribute__((used))
-#  elif defined(_MSC_VER)
-#    define PUBLIC __declspec(dllexport)
-#    define USED
-#  else
-#    define PUBLIC
-#    define USED
-#  endif
-#endif
-
-/**
- * UNUSED marks variables (or sometimes functions) that have to be defined,
- * but are sometimes (or always) unused beyond that. A common case is for
- * a function parameter to be used in some build configurations but not others.
- * Another case is fallback vfuncs that don't do anything with their params.
- *
- * Note that this should not be used for identifiers used in `assert()`;
- * see ASSERTED below.
- */
-#ifdef HAVE_FUNC_ATTRIBUTE_UNUSED
-#define UNUSED __attribute__((unused))
-#else
-#define UNUSED
-#endif
-
-/**
- * Use ASSERTED to indicate that an identifier is unused outside of an `assert()`,
- * so that assert-free builds don't get "unused variable" warnings.
- */
-#ifdef NDEBUG
-#define ASSERTED UNUSED
-#else
-#define ASSERTED
-#endif
-
-#ifdef HAVE_FUNC_ATTRIBUTE_WARN_UNUSED_RESULT
-#define MUST_CHECK __attribute__((warn_unused_result))
-#else
-#define MUST_CHECK
-#endif
-
-#if defined(__GNUC__)
-#define ATTRIBUTE_NOINLINE __attribute__((noinline))
-#else
-#define ATTRIBUTE_NOINLINE
-#endif
-
-
-/**
- * Check that STRUCT::FIELD can hold MAXVAL.  We use a lot of bitfields
- * in Mesa/gallium.  We have to be sure they're of sufficient size to
- * hold the largest expected value.
- * Note that with MSVC, enums are signed and enum bitfields need one extra
- * high bit (always zero) to ensure the max value is handled correctly.
- * This macro will detect that with MSVC, but not GCC.
- */
-#define ASSERT_BITFIELD_SIZE(STRUCT, FIELD, MAXVAL) \
-   do { \
-      ASSERTED STRUCT s; \
-      s.FIELD = (MAXVAL); \
-      assert((int) s.FIELD == (MAXVAL) && "Insufficient bitfield size!"); \
-   } while (0)
-
-
-/** Compute ceiling of integer quotient of A divided by B. */
-#define DIV_ROUND_UP( A, B )  ( ((A) + (B) - 1) / (B) )
-
-/** Clamp X to [MIN,MAX].  Turn NaN into MIN, arbitrarily. */
-#define CLAMP( X, MIN, MAX )  ( (X)>(MIN) ? ((X)>(MAX) ? (MAX) : (X)) : (MIN) )
-
-/** Minimum of two values: */
-#define MIN2( A, B )   ( (A)<(B) ? (A) : (B) )
-
-/** Maximum of two values: */
-#define MAX2( A, B )   ( (A)>(B) ? (A) : (B) )
-
-/** Minimum and maximum of three values: */
-#define MIN3( A, B, C ) ((A) < (B) ? MIN2(A, C) : MIN2(B, C))
-#define MAX3( A, B, C ) ((A) > (B) ? MAX2(A, C) : MAX2(B, C))
-
-/** Align a value to a power of two */
-#define ALIGN_POT(x, pot_align) (((x) + (pot_align) - 1) & ~((pot_align) - 1))
-
-/**
- * Macro for declaring an explicit conversion operator.  Defaults to an
- * implicit conversion if C++11 is not supported.
- */
-#if __cplusplus >= 201103L
-#define EXPLICIT_CONVERSION explicit
-#elif defined(__cplusplus)
-#define EXPLICIT_CONVERSION
-#endif
-
-/** Set a single bit */
-#define BITFIELD_BIT(b)      (1u << (b))
-/** Set all bits up to excluding bit b */
-#define BITFIELD_MASK(b)      \
-   ((b) == 32 ? (~0u) : BITFIELD_BIT((b) % 32) - 1)
-/** Set count bits starting from bit b  */
-#define BITFIELD_RANGE(b, count) \
-   (BITFIELD_MASK((b) + (count)) & ~BITFIELD_MASK(b))
-
-/** Set a single bit */
-#define BITFIELD64_BIT(b)      (1ull << (b))
-/** Set all bits up to excluding bit b */
-#define BITFIELD64_MASK(b)      \
-   ((b) == 64 ? (~0ull) : BITFIELD64_BIT(b) - 1)
-/** Set count bits starting from bit b  */
-#define BITFIELD64_RANGE(b, count) \
-   (BITFIELD64_MASK((b) + (count)) & ~BITFIELD64_MASK(b))
-
-/* TODO: In future we should try to move this to u_debug.h once header
- * dependencies are reorganised to allow this.
- */
-enum pipe_debug_type
-{
-   PIPE_DEBUG_TYPE_OUT_OF_MEMORY = 1,
-   PIPE_DEBUG_TYPE_ERROR,
-   PIPE_DEBUG_TYPE_SHADER_INFO,
-   PIPE_DEBUG_TYPE_PERF_INFO,
-   PIPE_DEBUG_TYPE_INFO,
-   PIPE_DEBUG_TYPE_FALLBACK,
-   PIPE_DEBUG_TYPE_CONFORMANCE,
-};
-
-#endif /* UTIL_MACROS_H */
--- a/extra/disk_read_speed.py
+++ b/extra/disk_read_speed.py
@ -1,120 +0,0 @@
-#!/usr/bin/env python3
-import os, ctypes, ctypes.util, io, mmap, pathlib
-from tinygrad import Tensor, dtypes, Device
-from tinygrad.helpers import Timing, from_mv
-libc = ctypes.CDLL(ctypes.util.find_library("c"))
-
-#from extra.hip_gpu_driver import hip_ioctl
-
-# sudo su -c "echo 3 > /proc/sys/vm/drop_caches"
-
-# sudo su -c 'echo 8 > /proc/sys/kernel/printk'
-# sudo su -c "echo 'module amdgpu +p' > /sys/kernel/debug/dynamic_debug/control"
-
-libc.memcpy.argtypes = [ctypes.c_void_p, ctypes.c_void_p, ctypes.c_size_t]
-
-libc.read.argtypes = [ctypes.c_int, ctypes.c_void_p, ctypes.c_size_t]
-libc.read.restype = ctypes.c_size_t
-
-libc.malloc.argtypes = [ctypes.c_size_t]
-libc.malloc.restype = ctypes.c_void_p
-
-def read_direct(fd, sz):
-  with Timing("mmap: ", lambda x: f", {sz/x:.2f} GB/s"):
-    buf = mmap.mmap(-1, sz, flags=mmap.MAP_SHARED|mmap.MAP_POPULATE)
-  with Timing("read: ", lambda x: f", {sz/x:.2f} GB/s"):
-    ret = libc.read(fd, from_mv(buf), sz)
-  assert ret == sz
-
-def read_mmap(fd, sz):
-  with Timing("mmfd:       ", lambda x: f", {sz/x:.2f} GB/s"):
-    buf = mmap.mmap(fd, sz, flags=mmap.MAP_SHARED|mmap.MAP_POPULATE) #|MAP_LOCKED)
-    t = 0
-    for i in range(0, sz, 0x1000): t += buf[i]
-
-# def _copyin_async(self, dest:T, src:T, size:int): check(hip.hipMemcpyAsync(dest, src, size, hip.hipMemcpyHostToDevice, None))
-
-def read_to_gpu_mmap(fd, sz, gpubuf):
-  with Timing("gpu copyin: ", lambda x: f", {sz/x:.2f} GB/s"):
-    with Timing("mmfd:       ", lambda x: f", {sz/x:.2f} GB/s"):
-      buf = mmap.mmap(fd, sz, flags=mmap.MAP_SHARED|mmap.MAP_POPULATE) #|MAP_LOCKED)
-    dev.allocator._copyin_async(gpubuf, from_mv(buf), sz)
-    dev.synchronize()
-
-def read_to_gpu_single(fd, sz, gpubuf):
-  os.lseek(fd, 0, os.SEEK_SET)
-  with Timing("total: ", lambda x: f", {sz/x:.2f} GB/s"):
-    with Timing("gpu host alloc: ", lambda x: f", {sz/x:.2f} GB/s"):
-      hst = dev.allocator._hostalloc(sz)
-    with Timing("read to host:   ", lambda x: f", {sz/x:.2f} GB/s"):
-      ret = libc.read(fd, hst, sz)
-    with Timing("gpu host copy:  ", lambda x: f", {sz/x:.2f} GB/s"):
-      dev.allocator._copyin_async(gpubuf, hst, sz)
-      dev.synchronize()
-
-def read_to_gpu_pingpong(fd, sz, gpubuf):
-  psz = 256*1024*1024
-  print(f"piece size {psz/(1024*1024):.2f} MB")
-  with Timing("gpu host alloc: ", lambda x: f", {sz/x:.2f} GB/s"):
-    hst1 = dev.allocator._hostalloc(psz)
-    hst2 = dev.allocator._hostalloc(psz)
-
-  os.lseek(fd, 0, os.SEEK_SET)
-  with Timing("total: ", lambda x: f", {sz/x:.2f} GB/s"):
-    for i in range(sz//(psz*2)):
-      with Timing("tfer(0):           ", lambda x: f", {psz/x:.2f} GB/s"):
-        ret = libc.read(fd, hst1, psz)
-        dev.synchronize()
-        dev.allocator._copyin_async(gpubuf, hst1, psz)
-      with Timing("tfer(1):           ", lambda x: f", {psz/x:.2f} GB/s"):
-        ret = libc.read(fd, hst2, psz)
-        dev.synchronize()
-        dev.allocator._copyin_async(gpubuf, hst2, psz)
-    dev.synchronize()
-
-MAP_LOCKED = 0x2000
-MAP_HUGETLB = 0x40000
-
-if __name__ == "__main__":
-  dev = Device[Device.DEFAULT]
-
-  warm = (Tensor.ones(1024, device=Device.DEFAULT).contiguous() + Tensor.ones(1024, device=Device.DEFAULT).contiguous()).realize()
-  #fn = "/home/tiny/tinygrad/weights/rng"
-  fn = pathlib.Path(__file__).parents[1] / "weights/LLaMA-2/70B/consolidated.00.pth"
-  sz = os.stat(fn).st_size
-  t = Tensor.empty(sz, dtype=dtypes.uint8, device=f"disk:{fn}")
-  with Timing("copy:  ", lambda x: f", {sz/x:.2f} GB/s"):
-    on_dev = t.to(Device.DEFAULT).realize()
-
-  exit(0)
-
-  # 4GB of random numbers
-  #fd = os.open("/home/tiny/tinygrad/weights/rng", os.O_RDWR|os.O_DIRECT)
-  #sz = os.fstat(fd).st_size // 4
-  fd = os.open("/home/tiny/tinygrad/weights/LLaMA/7B/consolidated.00.pth", os.O_RDWR|os.O_DIRECT)
-  sz = os.fstat(fd).st_size
-  print(f"read {sz} from {fd}")
-
-  with Timing("gpu alloc:  ", lambda x: f", {sz/x:.2f} GB/s"):
-    gpubuf = dev.allocator._alloc(sz)
-  # warmup
-  dev.allocator._copyin_async(gpubuf, from_mv(bytearray(b"\x00\x00\x00\x00"*0x1000)), 0x4000)
-  print("copying, is warm")
-
-  print("****** read to gpu pingpong")
-  read_to_gpu_pingpong(fd, sz, gpubuf)
-  exit(0)
-
-  print("****** read direct")
-  read_direct(fd, sz)
-
-  print("****** read mmap")
-  read_mmap(fd, sz)
-
-  print("****** read to gpu single")
-  read_to_gpu_single(fd, sz, gpubuf)
-
-  print("****** read to gpu mmap")
-  read_to_gpu_mmap(fd, sz, gpubuf)
-
-  os._exit(0)
--- a/extra/dump_cache.py
+++ b/extra/dump_cache.py
@ -1,21 +0,0 @@
-import sys, sqlite3, pickle
-from tinygrad.helpers import CACHEDB
-
-if __name__ == "__main__":
-  fn = sys.argv[1] if len(sys.argv) > 1 else CACHEDB
-  conn = sqlite3.connect(fn)
-  cur = conn.cursor()
-  cur.execute("SELECT name FROM sqlite_master WHERE type='table'")
-  for f in cur.fetchall():
-    table = f[0]
-    cur2 = conn.cursor()
-    cur2.execute(f"SELECT COUNT(*) FROM {table}")
-    cnt = cur2.fetchone()[0]
-    print(f"{table:20s} : {cnt}")
-
-    cur3 = conn.cursor()
-    cur3.execute(f"SELECT * FROM {table} LIMIT 10")
-    for f in cur3.fetchall():
-      v = pickle.loads(f[-1])
-      print("   ", len(f[0]) if isinstance(f[0], str) else f[0], f[1:-1], str(v)[0:50])
-      #print(f"{len(k):10d}, {sk} -> {v}")
--- a/extra/gemm/amd_matmul.py
+++ b/extra/gemm/amd_matmul.py
@ -36,7 +36,7 @@ if __name__ == "__main__":
    for _ in range(run_count): tc = (a@b).realize()

  GlobalCounters.reset()
-  ei = ExecItem(runner, [a.uop.buffer, b.uop.buffer, c.uop.buffer])
+  ei = ExecItem(ast, [a.uop.buffer, b.uop.buffer, c.uop.buffer], prg=runner)
  with Context(DEBUG=2):
    for _ in range(run_count): ei.run(wait=True)
  print(f"custom  {(c-tc).square().mean().item()}")
--- a/extra/gemm/amd_uop_matmul.py
+++ b/extra/gemm/amd_uop_matmul.py
@ -1,3 +1,4 @@
+import numpy as np
 from tinygrad import Tensor, Device, Context, GlobalCounters, dtypes
 from tinygrad.uop.ops import UOp, KernelInfo, sint, AxisType
 from tinygrad.engine.realize import ExecItem, get_runner
@ -140,15 +141,14 @@ def hand_spec_kernel3():
  return sink.sink(arg=KernelInfo(opts_to_apply=())).simplify()

 def test_matmul(sink:UOp, N=N):
-  with Context(DEBUG=0):
-    a = Tensor.randn(N, N)
-    b = Tensor.randn(N, N)
-    hc = Tensor.empty(N, N)
-    Tensor.realize(a, b, hc)
+  rng = np.random.default_rng()
+  a = Tensor(rng.random((N, N), dtype=np.float32)-0.5)
+  b = Tensor(rng.random((N, N), dtype=np.float32)-0.5)
+  hc = Tensor.empty(N, N)
+  Tensor.realize(a, b, hc)

-  ei = ExecItem(get_runner(Device.DEFAULT, sink), [t.uop.buffer for t in [hc, a, b]])
+  ei = ExecItem(sink, [t.uop.buffer for t in [hc, a, b]], prg=get_runner(Device.DEFAULT, sink))

-  GlobalCounters.reset()
  ets = []
  with Context(DEBUG=2):
    for _ in range(run_count):
--- a/extra/gemm/jax_pmatmul.py
+++ b/extra/gemm/jax_pmatmul.py
@ -1,27 +0,0 @@
-#!/usr/bin/env python3
-import time
-import jax
-import jax.numpy as jnp
-
-print(jax.devices())
-DEVICES = len(jax.devices())
-BS = 32
-N = 4096
-dtype = jnp.float16
-A = jnp.zeros((DEVICES, BS, N, N), dtype)
-B = jnp.zeros((1, 1, N, N), dtype)
-A = jax.device_put_sharded([A[i] for i in range(DEVICES)], jax.devices())
-B = jax.device_put_sharded([B for i in range(DEVICES)], jax.devices())
-
-OPS = DEVICES*BS*N*N*N*2
-def matmul(A,B): return jnp.matmul(A,B,preferred_element_type=jnp.float32)
-pmatmul = jax.pmap(matmul)
-
-MAX_TFLOPS = 123*DEVICES  # Peak FP16 Tensor TFLOPS with FP32 Acc (7900XTX)
-for i in range(10):
-  st = time.perf_counter()
-  C = pmatmul(A,B).block_until_ready()
-  et = time.perf_counter()-st
-  tflops = (OPS*1e-12)/et
-  print(f"time {et*1e3:.2f} ms, TFLOPS {tflops:6.2f}, MFU {(tflops/MAX_TFLOPS)*100:4.2f}% out shape {C.shape} dtype {C.dtype}")
-
--- a/extra/gemm/mlx_matmul.py
+++ b/extra/gemm/mlx_matmul.py
@ -1,10 +0,0 @@
-import mlx.core as mx
-from tinygrad.helpers import Timing
-N = 4096
-x = mx.random.normal((N,N))
-w = mx.random.normal((N,N))
-
-FLOPS = N*N*N*2
-for i in range(10):
-  with Timing("", lambda x: f"  {FLOPS/x:.2f} GFLOPS"):
-    mx.eval(x@w)
--- a/extra/gemm/simple_matmul.py
+++ b/extra/gemm/simple_matmul.py
@ -3,7 +3,6 @@ from tinygrad import dtypes, Tensor
 from tinygrad.helpers import getenv, get_single_element
 from tinygrad.dtype import _to_np_dtype
 from tinygrad.codegen.opt import OptOps
-from tinygrad.engine.realize import lower_schedule

 dtype_in = (dtypes.half if getenv("HALF") else dtypes.bfloat16 if getenv("BFLOAT16") else
            dtypes.fp8e4m3 if getenv("FP8E4M3") else dtypes.fp8e5m2 if getenv("FP8E5M2") else dtypes.float)
@ -40,8 +39,8 @@ if __name__ == "__main__":

  if getenv("SHOULD_USE_TC"):
    sched = a.matmul(b, dtype=acc_dtype).schedule()
-    lowered = list(lower_schedule(sched))
-    ei = get_single_element(lowered)[1]
+    ei = get_single_element(sched)
+    ei.lower()
    assert any(opt.op is OptOps.TC for opt in ei.prg.p.applied_opts), f"TC not triggered, {ei.prg.p.applied_opts}"

  ref = a.numpy().astype(np.float32) @ b.numpy().astype(np.float32)
--- a/extra/gemm/tf_gemm.py
+++ b/extra/gemm/tf_gemm.py
@ -1,33 +0,0 @@
-import time
-import tensorflow as tf
-
-gpus = tf.config.list_physical_devices('GPU')
-if gpus:
-  try:
-    # Currently, memory growth needs to be the same across GPUs
-    for gpu in gpus:
-      tf.config.experimental.set_memory_growth(gpu, True)
-    logical_gpus = tf.config.list_logical_devices('GPU')
-    print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
-  except RuntimeError as e:
-    # Memory growth must be set before GPUs have been initialized
-    print(e)
-
-for dtype in [tf.float16, tf.float32]:
-  for N in [256, 512, 1024, 2048, 4096, 8192]:
-    FLOPS = N*N*N*2
-
-    b = tf.random.uniform((N, N), dtype=dtype)
-    c = tf.random.uniform((N, N), dtype=dtype)
-
-    b = tf.Variable(b)
-    c = tf.Variable(c)
-
-    def tf_prog(b, c):
-      st = time.perf_counter()
-      a = tf.matmul(b, c)
-      tf.debugging.check_numerics(a, "Nan or Inf in result") # Ensures that the calculation is done.
-      return time.perf_counter() - st
-
-    tm = min([tf_prog(b, c) for _ in range(20)])
-    print(f"{N*N:10d} {tm*1e6:9.2f} us, would be {FLOPS*1e-9/tm:9.2f} GFLOPS {N:4d}x{N:4d}x{N:4d} matmul in {dtype}")
--- a/extra/gemm/tinygrad_nv_matmul.py
+++ b/extra/gemm/tinygrad_nv_matmul.py
@ -33,5 +33,5 @@ if __name__ == "__main__":
  new_src = prg.src
  # can mod source here
  prg = replace(prg, src=new_src)
-  ei = ExecItem(CompiledRunner(prg), [x.ensure_allocated() for x in si.bufs], si.metadata)
+  ei = ExecItem(si.ast, [x.ensure_allocated() for x in si.bufs], si.metadata, prg=CompiledRunner(prg))
  for i in range(5): ei.run(wait=True)
--- a/extra/gemm/triton_nv_matmul.py
+++ b/extra/gemm/triton_nv_matmul.py
@ -88,7 +88,7 @@ if __name__ == "__main__":
  prg = ProgramSpec("matmul_kernel", src, device=Device.DEFAULT,
                global_size=[M//BLOCK_SIZE_M, N//BLOCK_SIZE_N, 1], local_size=[32*compiled.metadata.num_warps, 1, 1],
                mem_estimate=A.nbytes() + B.nbytes() + C.nbytes())
-  ei = ExecItem(CompiledRunner(prg), [x.ensure_allocated() for x in si.bufs], si.metadata)
+  ei = ExecItem(si.ast, [x.ensure_allocated() for x in si.bufs], si.metadata, prg=CompiledRunner(prg))
  tflops = []
  for i in range(5):
    tm = ei.run(wait=True)
--- a/extra/hevc/.gitignore
+++ b/extra/hevc/.gitignore
@ -0,0 +1 @@
+out/
--- a/extra/hevc/decode.py
+++ b/extra/hevc/decode.py
@ -0,0 +1,83 @@
+import argparse, os, hashlib
+from tinygrad.helpers import getenv, DEBUG, round_up, Timing, tqdm, fetch
+from extra.hevc.hevc import parse_hevc_file_headers, untile_nv12, to_bgr, nv_gpu
+from tinygrad import Tensor, dtypes, Device, Variable, TinyJit
+
+if __name__ == "__main__":  # script entry: decode an HEVC file on the NV device, then validate or dump the frames
+  parser = argparse.ArgumentParser()
+  parser.add_argument("--input_file", type=str, default="")
+  parser.add_argument("--output_dir", type=str, default="extra/hevc/out")
+  args = parser.parse_args()
+
+  if args.input_file == "":
+    url = "https://github.com/haraschax/filedump/raw/09a497959f7fa6fd8dba501a25f2cdb3a41ecb12/comma_video.hevc"
+    hevc_tensor = Tensor.from_url(url, device="CPU")  # no --input_file given: fetch the sample clip
+  else:
+    hevc_tensor = Tensor.empty(os.stat(args.input_file).st_size, dtype=dtypes.uint8, device=f"disk:{args.input_file}").to("CPU")
+
+  dat = bytes(hevc_tensor.data())
+  dat_hash = hashlib.md5(dat).hexdigest()  # used below to pick the matching reference-frame pickle
+
+  with Timing("prep infos: "):
+    dat_nv = hevc_tensor.to("NV")  # NOTE(review): dat_nv is not referenced again in this script
+    opaque, frame_info, w, h, luma_w, luma_h, chroma_off = parse_hevc_file_headers(dat)
+
+  frame_info = frame_info[:getenv("MAX_FRAMES", len(frame_info))]  # MAX_FRAMES env var caps the number of frames
+
+  # move all needed data to gpu
+  #all_slices = []
+  with Timing("copy to gpu: "):
+    opaque_nv = opaque.to("NV").contiguous().realize()
+    hevc_tensor = hevc_tensor.to("NV")
+
+  out_image_size = luma_h + (luma_h + 1) // 2, round_up(luma_w, 64)  # (rows, cols): luma rows plus half as many more, width rounded up to 64
+  max_hist = max(history_sz for _, _, _, history_sz, _ in frame_info)
+
+  # define variables
+  v_pos = Variable("pos", 0, max_hist + 1)
+  v_offset = Variable("offset", 0, hevc_tensor.numel()-1)
+  v_sz = Variable("sz", 0, hevc_tensor.numel())
+  v_i = Variable("i", 0, len(frame_info)-1)
+
+  @TinyJit
+  def decode_jit(pos:Variable, src:Tensor, data:Tensor, *hist:Tensor):
+    return src.decode_hevc_frame(pos, out_image_size, data, hist).realize()
+
+  # warm up
+  history = [Tensor.empty(*out_image_size, dtype=dtypes.uint8, device="NV") for _ in range(max_hist)]
+  for i in range(3):  # three calls on frame 0 with placeholder history buffers
+    hevc_frame = hevc_tensor.shrink((((bound_offset:=v_offset.bind(frame_info[0][0])), bound_offset+v_sz.bind(frame_info[0][1])),))
+    decode_jit(v_pos.bind(0), hevc_frame, opaque_nv[v_i.bind(0)], *history)
+
+  out_images = []
+  with Timing("decoding whole file: ", on_exit=(lambda et: f", {len(frame_info)} frames, {len(frame_info)/(et/1e9):.2f} fps")):
+    for i, (offset, sz, frame_pos, history_sz, is_hist) in enumerate(frame_info):
+      history = history[-max_hist:] if max_hist > 0 else []  # keep at most max_hist most recent reference frames
+      # TODO: this shrink should work as a slice
+      hevc_frame = hevc_tensor.shrink((((bound_offset:=v_offset.bind(offset)), bound_offset+v_sz.bind(sz)),))
+
+      outimg = decode_jit(v_pos.bind(frame_pos), hevc_frame, opaque_nv[v_i.bind(i)], *history).clone()
+      out_images.append(outimg)
+      if is_hist: history.append(outimg)  # only frames flagged is_hist become references for later frames
+
+    Device.default.synchronize()
+
+  if getenv("VALIDATE", 0):
+    import pickle
+    if dat_hash == "b813bfdbec194fd17fdf0e3ceb8cea1c":
+      url = "https://github.com/nimlgen/hevc_validate_set/raw/refs/heads/main/decoded_frames_b813bfdbec194fd17fdf0e3ceb8cea1c.pkl"
+      decoded_frames = pickle.load(fetch(url).open("rb"))  # NOTE(review): unpickling downloaded data can run arbitrary code; URL follows a branch head, not a pinned commit
+    else: decoded_frames = pickle.load(open(f"extra/hevc/decoded_frames_{dat_hash}.pkl", "rb"))  # NOTE(review): file handle is never closed explicitly
+  else: import cv2  # cv2 is only needed for the PNG-writing path below
+
+  for i, img in tqdm(enumerate(out_images)):
+    if getenv("VALIDATE", 0):
+      if i < len(decoded_frames) and len(decoded_frames[i]) > 0:  # frames with no (or empty) reference entry are skipped
+        img = untile_nv12(img, h, w, luma_w, chroma_off).realize()
+        assert img.data() == decoded_frames[i], f"Frame {i} does not match reference decoder!"
+        print(f"Frame {i} matches reference decoder!")
+    else:
+      if len(args.output_dir):  # an empty --output_dir disables writing images
+        os.makedirs(args.output_dir, exist_ok=True)  # NOTE(review): loop-invariant; could be hoisted above the loop
+        img = to_bgr(img, h, w, luma_w, chroma_off).realize()
+        cv2.imwrite(f"{args.output_dir}/out_frame_{i:04d}.png", img.numpy())
--- a/extra/hevc/hevc.py
+++ b/extra/hevc/hevc.py
@ -0,0 +1,450 @@
+import dataclasses, enum, argparse, os, itertools, time, ctypes
+from typing import Any
+from tinygrad import Tensor, dtypes, Device, TinyJit
+from tinygrad.helpers import DEBUG, round_up, ceildiv, Timing, prod
+from tinygrad.runtime.autogen import avcodec, nv_570 as nv_gpu
+
+class BitReader:  # MSB-first bit reader over a bytes object; bytes are pulled lazily into an integer buffer
+  def __init__(self, data:bytes): self.reader, self.current_bits, self.bits, self.read_bits, self.total = iter(data), 0, 0, 0, len(data) * 8
+  def empty(self): return self.read_bits == self.total and self.current_bits == 0  # all bytes buffered and all buffered bits consumed
+  def peak_bits(self, n):  # return the next n bits without consuming them; next() raises StopIteration if data runs out
+    while self.current_bits < n:
+      self.bits = (self.bits << 8) | next(self.reader)
+      self.current_bits += 8
+      self.read_bits += 8
+    return (self.bits >> (self.current_bits - n)) & ((1 << n) - 1)
+  def _next_bits(self, n):  # consume and return the next n bits
+    val = self.peak_bits(n)
+    self.bits &= (1 << (self.current_bits - n)) - 1
+    self.current_bits -= n
+    return val
+
+  def u(self, n): return self._next_bits(n)  # u(n): unsigned integer using n bits
+
+  # 9.2 Parsing process for 0-th order Exp-Golomb codes
+  def ue_v(self):
+    leading_zero_bits = -1
+    while True:
+      bit = self.u(1)
+      leading_zero_bits += 1
+      if bit == 1: break
+
+    part = self.u(leading_zero_bits)
+
+    if leading_zero_bits == 0: return 0
+    return (1 << leading_zero_bits) - 1 + part
+
+  # 9.2.2 Mapping process for signed Exp-Golomb codes: value = (-1)**(k+1) * Ceil(k/2)
+  def se_v(self):
+    k = self.ue_v()
+    return (k + 1) // 2 if k & 1 else -(k // 2)  # odd k -> +Ceil(k/2), even k -> -k/2 (k = 0,1,2,3,4 -> 0,1,-1,2,-2)
+
+# 7.3.1.1 General NAL unit syntax
+def _hevc_get_rbsp(dat:bytes, off=0) -> bytes:  # copy dat[off:] with every 00 00 03 sequence replaced by 00 00
+  rbsp = bytearray()  # mutable buffer: appending is amortised O(1), whereas bytes += copies the whole result each time
+  while off < len(dat):
+    if off + 2 < len(dat) and dat[off:off+3] == b'\x00\x00\x03':
+      rbsp += b'\x00\x00'  # keep the two zero bytes, drop the 0x03
+      off += 3
+    else:
+      rbsp.append(dat[off])
+      off += 1
+  return bytes(rbsp)
+
+class HevcSlice:  # shared syntax-element parsers inherited by SPS, PPS and SliceSegment
+  # 7.3.3 Profile, tier and level syntax
+  def profile_tier_level(self, r:BitReader, enable:bool, max_sub_layers:int):
+    assert enable and max_sub_layers == 0, "no sublayers supported"
+    self._notimpl_profile_tier_level = r.u(88)  # 88 bits read as one opaque integer, not decoded further
+    self.general_level_idc = r.u(8)
+
+  # 7.3.7 Short-term reference picture set syntax
+  def st_ref_pic_set(self, r:BitReader, stRpsIdx:int, num_short_term_ref_pic_sets:int=0, sps=None):
+    inter_ref_pic_set_prediction_flag = r.u(1) if stRpsIdx != 0 else 0  # flag is not coded for the first set
+
+    if inter_ref_pic_set_prediction_flag:
+      if stRpsIdx == num_short_term_ref_pic_sets:
+        delta_idx_minus1 = r.ue_v()
+      delta_rps_sign = r.u(1)
+      abs_delta_rps_minus1 = r.ue_v()
+
+      NumDeltaPocs = sps.num_negative_pics + sps.num_positive_pics  # NOTE(review): raises AttributeError when sps is None (the default), as in the call from SPS.__init__
+      for i in range(NumDeltaPocs + 1):
+        used_by_curr_pic_flag = r.u(1)
+        if not used_by_curr_pic_flag:
+          use_delta_flag = r.u(1)
+    else:
+      self.num_negative_pics = r.ue_v()  # only the two counts are stored on self; the per-picture values below are read and discarded
+      self.num_positive_pics = r.ue_v()
+      for i in range(self.num_negative_pics):
+        delta_poc_s0_minus1 = r.ue_v()
+        used_by_curr_pic_s0_flag = r.u(1)
+      for i in range(self.num_positive_pics):
+        delta_poc_s1_minus1 = r.ue_v()
+        used_by_curr_pic_s1_flag = r.u(1)
+
+# 7.3.2.2 Sequence parameter set RBSP syntax
+class SPS(HevcSlice):  # reads the sequence parameter set fields from r in bitstream order; unsupported features assert
+  def __init__(self, r:BitReader):
+    self.sps_video_parameter_set_id = r.u(4)
+    self.sps_max_sub_layers_minus1 = r.u(3)
+    self.sps_temporal_id_nesting_flag = r.u(1)
+
+    self.profile_tier_level(r, True, self.sps_max_sub_layers_minus1)  # asserts sps_max_sub_layers_minus1 == 0
+
+    self.sps_seq_parameter_set_id = r.ue_v()
+    self.chroma_format_idc = r.ue_v()
+    self.separate_colour_plane_flag = r.u(1) if self.chroma_format_idc == 3 else 0
+    self.pic_width_in_luma_samples = r.ue_v()
+    self.pic_height_in_luma_samples = r.ue_v()
+    self.conformance_window_flag = r.u(1)
+
+    if self.conformance_window_flag:
+      self.conf_win_left_offset = r.ue_v()
+      self.conf_win_right_offset = r.ue_v()
+      self.conf_win_top_offset = r.ue_v()
+      self.conf_win_bottom_offset = r.ue_v()
+    else: self.conf_win_left_offset = self.conf_win_right_offset = self.conf_win_top_offset = self.conf_win_bottom_offset = 0
+
+    self.bit_depth_luma = r.ue_v() + 8  # stored as coded value + 8
+    self.bit_depth_chroma = r.ue_v() + 8
+    self.log2_max_pic_order_cnt_lsb_minus4 = r.ue_v()
+    self.sps_sub_layer_ordering_info_present_flag = r.u(1)
+    self.sps_max_dec_pic_buffering, self.sps_max_num_reorder_pics, self.sps_max_latency_increase_plus1 = [], [], []
+    for i in range((0 if self.sps_sub_layer_ordering_info_present_flag else self.sps_max_sub_layers_minus1), self.sps_max_sub_layers_minus1 + 1):
+      self.sps_max_dec_pic_buffering.append(r.ue_v() + 1)
+      self.sps_max_num_reorder_pics.append(r.ue_v())
+      self.sps_max_latency_increase_plus1.append(r.ue_v())
+    self.log2_min_luma_coding_block_size = r.ue_v() + 3
+    self.log2_max_luma_coding_block_size = self.log2_min_luma_coding_block_size + r.ue_v()  # min size plus the coded difference
+    self.log2_min_transform_block_size = r.ue_v() + 2
+    self.log2_max_transform_block_size = self.log2_min_transform_block_size + r.ue_v()
+    self.max_transform_hierarchy_depth_inter = r.ue_v()
+    self.max_transform_hierarchy_depth_intra = r.ue_v()
+    if scaling_list_enabled_flag := r.u(1):  # the enabled flag itself is not stored on self
+      if sps_scaling_list_data_present_flag := r.u(1): assert False, "scaling_list_data parsing not implemented"
+    self.amp_enabled_flag = r.u(1)
+    self.sample_adaptive_offset_enabled_flag = r.u(1)
+    self.pcm_enabled_flag = r.u(1)
+    assert self.pcm_enabled_flag == 0, "pcm not implemented"
+    self.num_short_term_ref_pic_sets = r.ue_v()
+    for i in range(self.num_short_term_ref_pic_sets):
+      self.st_ref_pic_set(r, i, self.num_short_term_ref_pic_sets)  # sps stays None here; each explicit set overwrites self.num_negative_pics/num_positive_pics
+    self.long_term_ref_pics_present_flag = r.u(1)
+    if self.long_term_ref_pics_present_flag: assert False, "long_term_ref_pics parsing not implemented"
+    self.sps_temporal_mvp_enabled_flag = r.u(1)
+    self.strong_intra_smoothing_enabled_flag = r.u(1)  # parsing stops here; any later SPS fields are not read
+
+# 7.3.2.3 Picture parameter set RBSP syntax
+class PPS(HevcSlice):  # reads the picture parameter set fields from r in bitstream order; unsupported features assert
+  def __init__(self, r:BitReader):
+    self.pps_pic_parameter_set_id = r.ue_v()
+    self.pps_seq_parameter_set_id = r.ue_v()
+    self.dependent_slice_segments_enabled_flag = r.u(1)
+    self.output_flag_present_flag = r.u(1)
+    self.num_extra_slice_header_bits = r.u(3)
+    self.sign_data_hiding_enabled_flag = r.u(1)
+    self.cabac_init_present_flag = r.u(1)
+    self.num_ref_idx_l0_default_active = r.ue_v() + 1  # stored as coded value + 1
+    self.num_ref_idx_l1_default_active = r.ue_v() + 1
+    self.init_qp = r.se_v() + 26  # stored as coded signed value + 26
+    self.constrained_intra_pred_flag = r.u(1)
+    self.transform_skip_enabled_flag = r.u(1)
+    self.cu_qp_delta_enabled_flag = r.u(1)
+    if self.cu_qp_delta_enabled_flag: self.diff_cu_qp_delta_depth = r.ue_v()  # attribute exists only when the flag is set
+
+    self.pps_cb_qp_offset = r.se_v()
+    self.pps_cr_qp_offset = r.se_v()
+    self.pps_slice_chroma_qp_offsets_present_flag = r.u(1)
+    self.weighted_pred_flag = r.u(1)
+    self.weighted_bipred_flag = r.u(1)
+    self.transquant_bypass_enabled_flag = r.u(1)
+    self.tiles_enabled_flag = r.u(1)
+    self.entropy_coding_sync_enabled_flag = r.u(1)
+    if self.tiles_enabled_flag:  # the tile attributes below exist only when tiles are enabled
+      self.num_tile_columns_minus1 = r.ue_v()
+      self.num_tile_rows_minus1 = r.ue_v()
+      self.uniform_spacing_flag = r.u(1)
+      self.column_width_minus1, self.row_height_minus1 = [], []
+      if not self.uniform_spacing_flag:
+        for i in range(self.num_tile_columns_minus1): self.column_width_minus1.append(r.ue_v())  # one entry per column except the last
+        for i in range(self.num_tile_rows_minus1): self.row_height_minus1.append(r.ue_v())  # one entry per row except the last
+      self.loop_filter_across_tiles_enabled_flag = r.u(1)
+    self.loop_filter_across_slices_enabled_flag = r.u(1)
+    self.deblocking_filter_control_present_flag = r.u(1)
+    if self.deblocking_filter_control_present_flag: assert False, "deblocking_filter parsing not implemented"
+    self.scaling_list_data_present_flag = r.u(1)
+    if self.scaling_list_data_present_flag: assert False, "scaling_list_data parsing not implemented"
+    self.lists_modification_present_flag = r.u(1)
+    self.log2_parallel_merge_level = r.ue_v() + 2  # stored as coded value + 2; later PPS fields are not read
+
+# 7.3.6 Slice segment header syntax
+class SliceSegment(HevcSlice):  # parses the leading part of a slice segment header (7.3.6.1) and records the sw_skip_start/sw_skip_end bit offsets
+  def __init__(self, r:BitReader, nal_unit_type:int, sps:SPS, pps:PPS):
+    self.first_slice_segment_in_pic_flag = r.u(1)
+    if nal_unit_type >= avcodec.HEVC_NAL_BLA_W_LP and nal_unit_type <= avcodec.HEVC_NAL_RSV_IRAP_VCL23:
+      self.no_output_of_prior_pics_flag = r.u(1)
+    self.slice_pic_parameter_set_id = r.ue_v()
+    if not self.first_slice_segment_in_pic_flag:
+      if pps.dependent_slice_segments_enabled_flag:
+        self.dependent_slice_segment_flag = r.u(1)
+      self.slice_segment_address = r.ue_v()
+    self.dependent_slice_segment_flag = 0  # NOTE(review): unconditionally discards the flag read above, so every segment is parsed as an independent one
+    if not self.dependent_slice_segment_flag:
+      r.u(pps.num_extra_slice_header_bits) # extra bits ignored
+      self.slice_type = r.ue_v()
+
+      self.sw_skip_start = r.read_bits - r.current_bits  # number of bits consumed from r so far
+      self.pic_output_flag = r.u(1) if pps.output_flag_present_flag else 0
+      self.colour_plane_id = r.u(2) if sps.separate_colour_plane_flag else 0
+
+      if nal_unit_type != avcodec.HEVC_NAL_IDR_W_RADL and nal_unit_type != avcodec.HEVC_NAL_IDR_N_LP:
+        self.slice_pic_order_cnt_lsb = r.u(sps.log2_max_pic_order_cnt_lsb_minus4 + 4)
+
+        self.short_term_ref_pic_set_sps_flag = r.u(1)
+        if not self.short_term_ref_pic_set_sps_flag:
+          self.short_term_ref_pics_in_slice_start = r.read_bits - r.current_bits
+          self.st_ref_pic_set(r, sps.num_short_term_ref_pic_sets, sps.num_short_term_ref_pic_sets, sps=sps) # 7.3.7: stRpsIdx equals the SPS set count here, so delta_idx_minus1 must be read
+          self.short_term_ref_pics_in_slice_end = r.read_bits - r.current_bits
+        elif sps.num_short_term_ref_pic_sets > 1: assert False, "short_term_ref_pic_set parsing not implemented"
+
+        if sps.long_term_ref_pics_present_flag: assert False, "long_term_ref_pics parsing not implemented"
+
+        self.sw_skip_end = r.read_bits - r.current_bits
+        self.slice_temporal_mvp_enabled_flag = r.u(1) if sps.sps_temporal_mvp_enabled_flag else 0
+      else: self.slice_pic_order_cnt_lsb, self.sw_skip_end = 0, self.sw_skip_start  # IDR pictures: POC lsb is 0 and the skip range is empty
+
+      if sps.sample_adaptive_offset_enabled_flag:
+        slice_sao_luma_flag = r.u(1)
+        ChromaArrayType = sps.chroma_format_idc if sps.separate_colour_plane_flag == 0 else 0
+        slice_sao_chroma_flag = r.u(1) if ChromaArrayType != 0 else 0
+
+      if self.slice_type in {avcodec.HEVC_SLICE_P, avcodec.HEVC_SLICE_B}: # 7.3.6.1: the override flag is coded for both P and B slices
+        if num_ref_idx_active_override_flag := r.u(1):
+          num_ref_idx_l0_active_minus1 = r.ue_v()
+          num_ref_idx_l1_active_minus1 = r.ue_v() if self.slice_type == avcodec.HEVC_SLICE_B else 0
+
+def fill_sps_into_dev_context(device_ctx, sps:SPS):  # copy parsed SPS fields onto the same-named attributes of device_ctx (mutated in place, returns None)
+  device_ctx.chroma_format_idc = sps.chroma_format_idc
+  device_ctx.pic_width_in_luma_samples = sps.pic_width_in_luma_samples
+  device_ctx.pic_height_in_luma_samples = sps.pic_height_in_luma_samples
+  device_ctx.bit_depth_luma = sps.bit_depth_luma  # SPS stores these as coded value + 8
+  device_ctx.bit_depth_chroma = sps.bit_depth_chroma
+  device_ctx.log2_max_pic_order_cnt_lsb_minus4 = sps.log2_max_pic_order_cnt_lsb_minus4
+  device_ctx.log2_min_luma_coding_block_size = sps.log2_min_luma_coding_block_size
+  device_ctx.log2_max_luma_coding_block_size = sps.log2_max_luma_coding_block_size
+  device_ctx.log2_min_transform_block_size = sps.log2_min_transform_block_size
+  device_ctx.log2_max_transform_block_size = sps.log2_max_transform_block_size
+  device_ctx.amp_enabled_flag = sps.amp_enabled_flag
+  device_ctx.pcm_enabled_flag = sps.pcm_enabled_flag  # always 0: SPS.__init__ asserts pcm is disabled
+  device_ctx.sample_adaptive_offset_enabled_flag = sps.sample_adaptive_offset_enabled_flag
+  device_ctx.sps_temporal_mvp_enabled_flag = sps.sps_temporal_mvp_enabled_flag
+  device_ctx.strong_intra_smoothing_enabled_flag = sps.strong_intra_smoothing_enabled_flag
+
+def fill_pps_into_dev_context(device_ctx, pps:PPS):  # copy parsed PPS fields onto the same-named attributes of device_ctx (mutated in place, returns None)
+  device_ctx.sign_data_hiding_enabled_flag = pps.sign_data_hiding_enabled_flag
+  device_ctx.cabac_init_present_flag = pps.cabac_init_present_flag
+  device_ctx.num_ref_idx_l0_default_active = pps.num_ref_idx_l0_default_active
+  device_ctx.num_ref_idx_l1_default_active = pps.num_ref_idx_l1_default_active
+  device_ctx.init_qp = pps.init_qp
+  device_ctx.cu_qp_delta_enabled_flag = pps.cu_qp_delta_enabled_flag
+  device_ctx.diff_cu_qp_delta_depth = getattr(pps, 'diff_cu_qp_delta_depth', 0)  # PPS sets this only when cu_qp_delta_enabled_flag is set; default 0 otherwise
+  device_ctx.pps_cb_qp_offset = pps.pps_cb_qp_offset
+  device_ctx.pps_cr_qp_offset = pps.pps_cr_qp_offset
+  device_ctx.pps_slice_chroma_qp_offsets_present_flag = pps.pps_slice_chroma_qp_offsets_present_flag
+  device_ctx.weighted_pred_flag = pps.weighted_pred_flag
+  device_ctx.weighted_bipred_flag = pps.weighted_bipred_flag
+  device_ctx.transquant_bypass_enabled_flag = pps.transquant_bypass_enabled_flag
+  device_ctx.tiles_enabled_flag = pps.tiles_enabled_flag
+  device_ctx.entropy_coding_sync_enabled_flag = pps.entropy_coding_sync_enabled_flag
+  device_ctx.loop_filter_across_slices_enabled_flag = pps.loop_filter_across_slices_enabled_flag
+  device_ctx.deblocking_filter_control_present_flag = pps.deblocking_filter_control_present_flag  # always 0: PPS.__init__ asserts when it is set
+  device_ctx.scaling_list_data_present_flag = pps.scaling_list_data_present_flag  # always 0: PPS.__init__ asserts when it is set
+  device_ctx.lists_modification_present_flag = pps.lists_modification_present_flag
+  device_ctx.log2_parallel_merge_level = pps.log2_parallel_merge_level
+  device_ctx.loop_filter_across_tiles_enabled_flag = getattr(pps, 'loop_filter_across_tiles_enabled_flag', 0)  # PPS sets this only when tiles are enabled; default 0 otherwise
+
+def parse_hevc_file_headers(dat:bytes, device="NV"):
+  res = []
+  nal_unit_start = 1
+  history:list[tuple[int, int, int]] = []
+  device_ctx = nv_gpu.nvdec_hevc_pic_s(gptimer_timeout_value=92720000, tileformat=1, sw_start_code_e=1, pattern_id=2)
+  nal_infos = []
+  ctx_bytes = bytes()
+  align_ctx_bytes_size = 0x300
+
+  def _flush_picture():
+    nonlocal res, history, device_ctx, nal_infos, ctx_bytes, align_ctx_bytes_size
+
+    if not len(nal_infos): return
+
+    hdr, nal_unit_type = nal_infos[0][0]
+    assert all(nal_unit_type == x[0][1] for x in nal_infos), "all NAL units in a picture must be of the same type"
+
+    device_ctx.curr_pic_idx = next(i for i in range(16) if all(d[0] != i for d in history))
+
+    if nal_unit_type in {avcodec.HEVC_NAL_IDR_W_RADL, avcodec.HEVC_NAL_IDR_N_LP}:
+      history = []
+
+    device_ctx.num_ref_frames = len(history)
+    device_ctx.IDR_picture_flag = int(nal_unit_type in {avcodec.HEVC_NAL_IDR_W_RADL, avcodec.HEVC_NAL_IDR_N_LP})
+    device_ctx.RAP_picture_flag = int(nal_unit_type >= avcodec.HEVC_NAL_BLA_W_LP and nal_unit_type <= avcodec.HEVC_NAL_RSV_IRAP_VCL23)
+    device_ctx.RefDiffPicOrderCnts=(ctypes.c_int16 * 16)()
+    device_ctx.colMvBuffersize = (round_up(sps.pic_width_in_luma_samples, 64) * round_up(sps.pic_height_in_luma_samples, 64) // 16) // 256
+    device_ctx.framestride=(ctypes.c_uint32 * 2)(round_up(sps.pic_width_in_luma_samples, 64), round_up(sps.pic_width_in_luma_samples, 64))
+    device_ctx.sw_hdr_skip_length = hdr.sw_skip_end - hdr.sw_skip_start
+    device_ctx.num_bits_short_term_ref_pics_in_slice = max(0, device_ctx.sw_hdr_skip_length - 9)
+    device_ctx.stream_len = sum(x[2] for x in nal_infos)
+
+    if pps.tiles_enabled_flag:
+      device_ctx.num_tile_columns = pps.num_tile_columns_minus1 + 1
+      device_ctx.num_tile_rows = pps.num_tile_rows_minus1 + 1
+
+    device_ctx.num_short_term_ref_pic_sets = sps.num_short_term_ref_pic_sets
+
+    luma_h_rounded = round_up(sps.pic_height_in_luma_samples, 64)
+    device_ctx.HevcSaoBufferOffset = (608 * luma_h_rounded) >> 8
+    device_ctx.HevcBsdCtrlOffset = ((device_ctx.HevcSaoBufferOffset<<8) + 4864 * luma_h_rounded) >> 8
+
+    device_ctx.v1.hevc_main10_444_ext.HevcFltAboveOffset = ((device_ctx.HevcBsdCtrlOffset<<8) + 152 * luma_h_rounded) >> 8
+    device_ctx.v1.hevc_main10_444_ext.HevcSaoAboveOffset = ((device_ctx.v1.hevc_main10_444_ext.HevcFltAboveOffset<<8) + 2000 * luma_h_rounded) >> 8
+    device_ctx.v3.HevcSliceEdgeOffset = device_ctx.v1.hevc_main10_444_ext.HevcSaoAboveOffset
+
+    before_list, after_list = [], []
+    for pic_idx, poc, _ in history:
+      device_ctx.RefDiffPicOrderCnts[pic_idx] = hdr.slice_pic_order_cnt_lsb - poc
+      if hdr.slice_pic_order_cnt_lsb < poc: after_list.append((poc - hdr.slice_pic_order_cnt_lsb, pic_idx))
+      else: before_list.append((hdr.slice_pic_order_cnt_lsb - poc, pic_idx))
+    before_list.sort()
+    after_list.sort()
+
+    device_ctx.initreflistidxl0 = (ctypes.c_uint8 * 16)(*[idx for _,idx in before_list + after_list])
+    if hdr.slice_type == avcodec.HEVC_SLICE_B: device_ctx.initreflistidxl1 = (ctypes.c_uint8 * 16)(*[idx for _,idx in after_list + before_list])
+
+    locl_ctx_bytes = bytes(device_ctx)
+    locl_ctx_bytes += b'\x00\x00\x00\x00\x00\x00\x00\x00\x10\x00\x00\x00' # blackwell extension
+    locl_ctx_bytes += bytes(0x200 - len(locl_ctx_bytes)) # pad to 512 bytes
+
+    pic_width_in_ctbs = ceildiv(sps.pic_width_in_luma_samples, (1 << sps.log2_max_luma_coding_block_size))
+    pic_height_in_ctbs = ceildiv(sps.pic_height_in_luma_samples, (1 << sps.log2_max_luma_coding_block_size))
+    # append tile sizes 0x200
+    if pps.tiles_enabled_flag and pps.uniform_spacing_flag:
+      assert device_ctx.num_tile_columns == 1 and device_ctx.num_tile_rows == 1, "not implemented: uniform spacing with multiple tiles"
+      locl_ctx_bytes += pic_width_in_ctbs.to_bytes(2, "little") + pic_height_in_ctbs.to_bytes(2, "little")
+    else:
+      if pps.tiles_enabled_flag and not getattr(pps, 'uniform_spacing_flag', 0):
+        column_width = [cw_minus1 + 1 for cw_minus1 in pps.column_width_minus1[0:pps.num_tile_columns_minus1]]
+        row_height = [rh_minus1 + 1 for rh_minus1 in pps.row_height_minus1[0:pps.num_tile_rows_minus1]]
+      else:
+        column_width = []
+        row_height = []
+
+      column_width.append(pic_width_in_ctbs - sum(column_width))
+      row_height.append(pic_height_in_ctbs - sum(row_height))
+
+      for c in column_width:
+        for r in row_height: locl_ctx_bytes += c.to_bytes(2, "little") + r.to_bytes(2, "little")
+
+    luma_size = round_up(sps.pic_width_in_luma_samples, 64) * round_up(sps.pic_height_in_luma_samples, 64)
+    chroma_size = round_up(sps.pic_width_in_luma_samples, 64) * round_up((sps.pic_height_in_luma_samples + 1) // 2, 64)
+    is_hist = nal_unit_type in {avcodec.HEVC_NAL_TRAIL_R, avcodec.HEVC_NAL_IDR_N_LP, avcodec.HEVC_NAL_IDR_W_RADL}
+
+    res.append((nal_infos[0][1], device_ctx.stream_len, device_ctx.curr_pic_idx, len(history), is_hist))
+
+    locl_ctx_bytes += (align_ctx_bytes_size - len(locl_ctx_bytes)) * b'\x00'
+    ctx_bytes += locl_ctx_bytes
+
+    if nal_unit_type in {avcodec.HEVC_NAL_TRAIL_R, avcodec.HEVC_NAL_IDR_N_LP, avcodec.HEVC_NAL_IDR_W_RADL}:
+      history.append((device_ctx.curr_pic_idx, hdr.slice_pic_order_cnt_lsb, None))
+
+    if len(history) >= sps.sps_max_dec_pic_buffering[0]:
+      # remove the oldest poc
+      history.pop(0)
+
+    nal_infos = []
+
+  cnt = 0
+  while nal_unit_start < len(dat):
+    assert dat[nal_unit_start:nal_unit_start+3] == b"\x00\x00\x01", "NAL unit start code not found"
+
+    pos = dat.find(b"\x00\x00\x01", nal_unit_start + 3)
+    nal_unit_len = (pos if pos != -1 else len(dat)) - nal_unit_start
+
+    # 7.3.1.1 General NAL unit syntax
+    nal_unit_type = (dat[nal_unit_start+3] >> 1) & 0x3F
+    slice_dat = dat[nal_unit_start+5:nal_unit_start+nal_unit_len]
+
+    if nal_unit_type == avcodec.HEVC_NAL_SPS:
+      sps = SPS(BitReader(_hevc_get_rbsp(slice_dat)))
+      fill_sps_into_dev_context(device_ctx, sps)
+    elif nal_unit_type == avcodec.HEVC_NAL_PPS:
+      pps = PPS(BitReader(_hevc_get_rbsp(slice_dat)))
+      fill_pps_into_dev_context(device_ctx, pps)
+    elif nal_unit_type in {avcodec.HEVC_NAL_IDR_N_LP, avcodec.HEVC_NAL_IDR_W_RADL, avcodec.HEVC_NAL_TRAIL_R, avcodec.HEVC_NAL_TRAIL_N}:
+      hdr = SliceSegment(BitReader(slice_dat), nal_unit_type, sps, pps)
+
+      if hdr.first_slice_segment_in_pic_flag == 1: _flush_picture()
+      nal_infos.append(((hdr, nal_unit_type), nal_unit_start, nal_unit_len))
+
+    nal_unit_start += nal_unit_len
+  _flush_picture()
+
+  w = sps.pic_width_in_luma_samples - 2 * (sps.conf_win_left_offset + sps.conf_win_right_offset)
+  h = sps.pic_height_in_luma_samples - 2 * (sps.conf_win_top_offset  + sps.conf_win_bottom_offset)
+  chroma_off = round_up(sps.pic_width_in_luma_samples, 64) * round_up(sps.pic_height_in_luma_samples, 64)
+  opaque = Tensor(ctx_bytes, device=device).reshape(len(res), align_ctx_bytes_size)
+  return opaque, res, w, h, sps.pic_width_in_luma_samples, sps.pic_height_in_luma_samples, chroma_off
+
+def _addr_table(h, w, w_aligned):
+  GOB_W, GOB_H = 64, 8
+  GOB_SIZE = GOB_W * GOB_H
+  BLOCK_H_GOBS = 2
+
+  xs = Tensor.arange(w, dtype=dtypes.uint32).reshape(1, w)
+  ys = Tensor.arange(h, dtype=dtypes.uint32).reshape(h, 1)
+
+  gob_x = xs // GOB_W
+  gob_y = ys // GOB_H
+  super_block_y = gob_y // BLOCK_H_GOBS
+  gob_y_in_block = gob_y  % BLOCK_H_GOBS
+  stride_gobs = w_aligned // GOB_W
+
+  base = ((super_block_y * stride_gobs + gob_x) * BLOCK_H_GOBS + gob_y_in_block) * GOB_SIZE
+
+  lx, ly = xs % GOB_W, ys % GOB_H
+  swiz = (lx & 0x0F) | ((ly & 0x03) << 4) | ((lx & 0x10) << 2) | ((ly & 0x04) << 5) | ((lx & 0x20) << 3)
+  return (base + swiz).reshape(-1)
+
+def nv12_to_bgr_from_planes(luma: Tensor, chroma: Tensor, h: int, w: int) -> Tensor:
+  Y = luma.reshape(h, w).cast(dtypes.float32)
+
+  uv = chroma.reshape(h // 2, w // 2, 2).cast(dtypes.float32)
+  U_small = uv[..., 0]
+  V_small = uv[..., 1]
+
+  U = U_small.reshape(h // 2, 1, w // 2, 1).expand(h // 2, 2, w // 2, 2).reshape(h, w)
+  V = V_small.reshape(h // 2, 1, w // 2, 1).expand(h // 2, 2, w // 2, 2).reshape(h, w)
+
+  C = Y - 16.0
+  D = U - 128.0
+  E = V - 128.0
+
+  R = 1.1643835616438356 * C + 1.5960267857142858 * E
+  G = 1.1643835616438356 * C - 0.39176229009491365 * D - 0.8129676472377708 * E
+  B = 1.1643835616438356 * C + 2.017232142857143  * D
+
+  R = R.maximum(0.0).minimum(255.0)
+  G = G.maximum(0.0).minimum(255.0)
+  B = B.maximum(0.0).minimum(255.0)
+
+  return Tensor.stack([B, G, R], dim=2).cast(dtypes.uint8)
+
+def untile_nv12(src:Tensor, h:int, w:int, luma_w:int, chroma_off:int) -> Tensor:
+  luma = src.reshape(-1)[_addr_table(h, w, round_up(luma_w, 64))]
+  chroma = src.reshape(-1)[chroma_off:][_addr_table((h + 1) // 2, w, round_up(luma_w, 64))]
+  return luma.cat(chroma).realize()
+
+def to_bgr(tensor:Tensor, h:int, w:int, luma_w:int, chroma_off:int) -> Tensor:
+  luma = tensor.reshape(-1)[_addr_table(h, w, round_up(luma_w, 64))]
+  chroma = tensor.reshape(-1)[chroma_off:][_addr_table((h + 1) // 2, w, round_up(luma_w, 64))]
+  return nv12_to_bgr_from_planes(luma, chroma, h, w).realize()
--- a/extra/hip_events.py
+++ b/extra/hip_events.py
@ -1,12 +0,0 @@
-import ctypes
-import tinygrad.runtime.autogen.hip as hip
-from tinygrad.runtime.ops_hip import check
-from tinygrad.helpers import init_c_var
-
-if __name__ == "__main__":
-  check(hip.hipSetDevice(0))
-  evt = init_c_var(hip.hipEvent_t(), lambda x: check(hip.hipEventCreate(ctypes.byref(x))))
-  check(hip.hipSetDevice(1))
-  check(hip.hipStreamWaitEvent(None, evt, 0))
-  check(hip.hipSetDevice(0))
-  check(hip.hipEventRecord(evt, None))
--- a/extra/hip_gpu_driver/hip_ioctl.py
+++ b/extra/hip_gpu_driver/hip_ioctl.py
@ -66,7 +66,7 @@ def ioctl(fd, request, argp):
    print(f"{(st-start)*1000:7.2f} ms +{et*1000.:7.2f} ms : {ret:2d} = {name:40s}", ' '.join(format_struct(s)))
    if name == "AMDKFD_IOC_SVM":
      out = ctypes.cast(s.attrs, ctypes.POINTER(kfd_ioctl.struct_kfd_ioctl_svm_attribute))
-      for i in range(s.nattr): print(f"{i}: {kfd_ioctl.kfd_ioctl_svm_attr_type__enumvalues[out[i].type]:40s}: {out[i].value:#x}")
+      for i in range(s.nattr): print(f"{i}: {kfd_ioctl.enum_kfd_ioctl_svm_attr_type.get(out[i].type):40s}: {out[i].value:#x}")
  else:
    print(f"{(st-start)*1000:7.2f} ms +{et*1000.:7.2f} ms : ioctl",
          f"{idir=} {size=} {itype=} {nr=} {fd=} {ret=}", os.readlink(f"/proc/self/fd/{fd}") if fd >= 0 else "")
--- a/extra/junk/sentencepiece_model_pb2.py
+++ b/extra/junk/sentencepiece_model_pb2.py
@ -1,45 +0,0 @@
-# -*- coding: utf-8 -*-
-# Generated by the protocol buffer compiler.  DO NOT EDIT!
-# source: sentencepiece_model.proto
-# Protobuf Python Version: 4.25.1
-"""Generated protocol buffer code."""
-from google.protobuf import descriptor as _descriptor
-from google.protobuf import descriptor_pool as _descriptor_pool
-from google.protobuf import symbol_database as _symbol_database
-from google.protobuf.internal import builder as _builder
-# @@protoc_insertion_point(imports)
-
-_sym_db = _symbol_database.Default()
-
-
-
-
-DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x19sentencepiece_model.proto\x12\rsentencepiece\"\x80\x0c\n\x0bTrainerSpec\x12\r\n\x05input\x18\x01 \x03(\t\x12\x14\n\x0cinput_format\x18\x07 \x01(\t\x12\x14\n\x0cmodel_prefix\x18\x02 \x01(\t\x12\x41\n\nmodel_type\x18\x03 \x01(\x0e\x32$.sentencepiece.TrainerSpec.ModelType:\x07UNIGRAM\x12\x18\n\nvocab_size\x18\x04 \x01(\x05:\x04\x38\x30\x30\x30\x12\x17\n\x0f\x61\x63\x63\x65pt_language\x18\x05 \x03(\t\x12 \n\x15self_test_sample_size\x18\x06 \x01(\x05:\x01\x30\x12*\n\x1b\x65nable_differential_privacy\x18\x32 \x01(\x08:\x05\x66\x61lse\x12+\n differential_privacy_noise_level\x18\x33 \x01(\x02:\x01\x30\x12\x32\n\'differential_privacy_clipping_threshold\x18\x34 \x01(\x04:\x01\x30\x12\"\n\x12\x63haracter_coverage\x18\n \x01(\x02:\x06\x30.9995\x12\x1e\n\x13input_sentence_size\x18\x0b \x01(\x04:\x01\x30\x12$\n\x16shuffle_input_sentence\x18\x13 \x01(\x08:\x04true\x12 \n\x14mining_sentence_size\x18\x0c \x01(\x05\x42\x02\x18\x01\x12\"\n\x16training_sentence_size\x18\r \x01(\x05\x42\x02\x18\x01\x12(\n\x17seed_sentencepiece_size\x18\x0e \x01(\x05:\x07\x31\x30\x30\x30\x30\x30\x30\x12\x1e\n\x10shrinking_factor\x18\x0f \x01(\x02:\x04\x30.75\x12!\n\x13max_sentence_length\x18\x12 \x01(\x05:\x04\x34\x31\x39\x32\x12\x17\n\x0bnum_threads\x18\x10 \x01(\x05:\x02\x31\x36\x12\x1d\n\x12num_sub_iterations\x18\x11 \x01(\x05:\x01\x32\x12$\n\x18max_sentencepiece_length\x18\x14 \x01(\x05:\x02\x31\x36\x12%\n\x17split_by_unicode_script\x18\x15 \x01(\x08:\x04true\x12\x1d\n\x0fsplit_by_number\x18\x17 \x01(\x08:\x04true\x12!\n\x13split_by_whitespace\x18\x16 \x01(\x08:\x04true\x12)\n\x1atreat_whitespace_as_suffix\x18\x18 \x01(\x08:\x05\x66\x61lse\x12+\n\x1c\x61llow_whitespace_only_pieces\x18\x1a \x01(\x08:\x05\x66\x61lse\x12\x1b\n\x0csplit_digits\x18\x19 \x01(\x08:\x05\x66\x61lse\x12#\n\x19pretokenization_delimiter\x18\x35 \x01(\t:\x00\x12\x17\n\x0f\x63ontrol_symbols\x18\x1e \x03(\t\x12\x1c\n\x14user_defined_symbols\x18\x1f 
\x03(\t\x12\x16\n\x0erequired_chars\x18$ \x01(\t\x12\x1c\n\rbyte_fallback\x18# \x01(\x08:\x05\x66\x61lse\x12+\n\x1dvocabulary_output_piece_score\x18  \x01(\x08:\x04true\x12\x1e\n\x10hard_vocab_limit\x18! \x01(\x08:\x04true\x12\x1c\n\ruse_all_vocab\x18\" \x01(\x08:\x05\x66\x61lse\x12\x11\n\x06unk_id\x18( \x01(\x05:\x01\x30\x12\x11\n\x06\x62os_id\x18) \x01(\x05:\x01\x31\x12\x11\n\x06\x65os_id\x18* \x01(\x05:\x01\x32\x12\x12\n\x06pad_id\x18+ \x01(\x05:\x02-1\x12\x18\n\tunk_piece\x18- \x01(\t:\x05<unk>\x12\x16\n\tbos_piece\x18. \x01(\t:\x03<s>\x12\x17\n\teos_piece\x18/ \x01(\t:\x04</s>\x12\x18\n\tpad_piece\x18\x30 \x01(\t:\x05<pad>\x12\x1a\n\x0bunk_surface\x18, \x01(\t:\x05 \xe2\x81\x87 \x12+\n\x1ctrain_extremely_large_corpus\x18\x31 \x01(\x08:\x05\x66\x61lse\"5\n\tModelType\x12\x0b\n\x07UNIGRAM\x10\x01\x12\x07\n\x03\x42PE\x10\x02\x12\x08\n\x04WORD\x10\x03\x12\x08\n\x04\x43HAR\x10\x04*\t\x08\xc8\x01\x10\x80\x80\x80\x80\x02\"\xd1\x01\n\x0eNormalizerSpec\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x1c\n\x14precompiled_charsmap\x18\x02 \x01(\x0c\x12\x1e\n\x10\x61\x64\x64_dummy_prefix\x18\x03 \x01(\x08:\x04true\x12&\n\x18remove_extra_whitespaces\x18\x04 \x01(\x08:\x04true\x12 \n\x12\x65scape_whitespaces\x18\x05 \x01(\x08:\x04true\x12\x1e\n\x16normalization_rule_tsv\x18\x06 \x01(\t*\t\x08\xc8\x01\x10\x80\x80\x80\x80\x02\"y\n\x0cSelfTestData\x12\x33\n\x07samples\x18\x01 \x03(\x0b\x32\".sentencepiece.SelfTestData.Sample\x1a)\n\x06Sample\x12\r\n\x05input\x18\x01 \x01(\t\x12\x10\n\x08\x65xpected\x18\x02 \x01(\t*\t\x08\xc8\x01\x10\x80\x80\x80\x80\x02\"\xfe\x03\n\nModelProto\x12\x37\n\x06pieces\x18\x01 \x03(\x0b\x32\'.sentencepiece.ModelProto.SentencePiece\x12\x30\n\x0ctrainer_spec\x18\x02 \x01(\x0b\x32\x1a.sentencepiece.TrainerSpec\x12\x36\n\x0fnormalizer_spec\x18\x03 \x01(\x0b\x32\x1d.sentencepiece.NormalizerSpec\x12\x33\n\x0eself_test_data\x18\x04 \x01(\x0b\x32\x1b.sentencepiece.SelfTestData\x12\x38\n\x11\x64\x65normalizer_spec\x18\x05 
\x01(\x0b\x32\x1d.sentencepiece.NormalizerSpec\x1a\xd2\x01\n\rSentencePiece\x12\r\n\x05piece\x18\x01 \x01(\t\x12\r\n\x05score\x18\x02 \x01(\x02\x12\x42\n\x04type\x18\x03 \x01(\x0e\x32,.sentencepiece.ModelProto.SentencePiece.Type:\x06NORMAL\"T\n\x04Type\x12\n\n\x06NORMAL\x10\x01\x12\x0b\n\x07UNKNOWN\x10\x02\x12\x0b\n\x07\x43ONTROL\x10\x03\x12\x10\n\x0cUSER_DEFINED\x10\x04\x12\x08\n\x04\x42YTE\x10\x06\x12\n\n\x06UNUSED\x10\x05*\t\x08\xc8\x01\x10\x80\x80\x80\x80\x02*\t\x08\xc8\x01\x10\x80\x80\x80\x80\x02\x42\x02H\x03')
-
-_globals = globals()
-_builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals)
-_builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, 'sentencepiece_model_pb2', _globals)
-if _descriptor._USE_C_DESCRIPTORS == False:
-  _globals['DESCRIPTOR']._options = None
-  _globals['DESCRIPTOR']._serialized_options = b'H\003'
-  _globals['_TRAINERSPEC'].fields_by_name['mining_sentence_size']._options = None
-  _globals['_TRAINERSPEC'].fields_by_name['mining_sentence_size']._serialized_options = b'\030\001'
-  _globals['_TRAINERSPEC'].fields_by_name['training_sentence_size']._options = None
-  _globals['_TRAINERSPEC'].fields_by_name['training_sentence_size']._serialized_options = b'\030\001'
-  _globals['_TRAINERSPEC']._serialized_start=45
-  _globals['_TRAINERSPEC']._serialized_end=1581
-  _globals['_TRAINERSPEC_MODELTYPE']._serialized_start=1517
-  _globals['_TRAINERSPEC_MODELTYPE']._serialized_end=1570
-  _globals['_NORMALIZERSPEC']._serialized_start=1584
-  _globals['_NORMALIZERSPEC']._serialized_end=1793
-  _globals['_SELFTESTDATA']._serialized_start=1795
-  _globals['_SELFTESTDATA']._serialized_end=1916
-  _globals['_SELFTESTDATA_SAMPLE']._serialized_start=1864
-  _globals['_SELFTESTDATA_SAMPLE']._serialized_end=1905
-  _globals['_MODELPROTO']._serialized_start=1919
-  _globals['_MODELPROTO']._serialized_end=2429
-  _globals['_MODELPROTO_SENTENCEPIECE']._serialized_start=2208
-  _globals['_MODELPROTO_SENTENCEPIECE']._serialized_end=2418
-  _globals['_MODELPROTO_SENTENCEPIECE_TYPE']._serialized_start=2323
-  _globals['_MODELPROTO_SENTENCEPIECE_TYPE']._serialized_end=2407
-# @@protoc_insertion_point(module_scope)
--- a/extra/mcts_search.py
+++ b/extra/mcts_search.py
@ -1,176 +0,0 @@
-from __future__ import annotations
-from typing import List, Optional, Dict, cast
-import numpy as np
-np.set_printoptions(suppress=True)
-import math, functools, time, random, statistics
-from tinygrad.helpers import DEBUG, getenv, CACHELEVEL, diskcache_get, diskcache_put, colored, Profiling
-from tinygrad.codegen.opt.kernel import Kernel
-from tinygrad.device import Buffer, Device, CompileError
-from tinygrad.codegen.opt.search import _ensure_buffer_alloc, get_kernel_actions, _time_program
-from tinygrad.engine.realize import get_program
-
-class MCTSNode:
-  def __init__(self, kernel:Kernel, parent=None):
-    self.kernel:Kernel = kernel
-    self.t = math.inf
-    self.n = 0
-    self.tm = math.inf
-    self.i = -1
-    self.parents: List[MCTSNode] = [parent] if parent is not None else []
-    self.children: Optional[List[MCTSNode]] = None
-    self.removed_children: List[MCTSNode] = []
-
-def expand_node(node:MCTSNode):
-  assert node.children is None
-  node.children = [MCTSNode(x, node) for x in get_kernel_actions(node.kernel, include_0=False).values()]
-
-def remove_node(node:MCTSNode):
-  for parent in node.parents:
-    assert parent.children is not None
-    parent.children.remove(node)
-    parent.removed_children.append(node)
-
-C = math.sqrt(2)
-TEMP = 0.5
-def _sample_tree(node:MCTSNode, best_tm:float) -> MCTSNode:
-  if node.children is None or len(node.children) == 0: return node
-  unexplored_children = []
-  explored_children = []
-  ucb_explored_children: List[float] = []
-  for child in node.children:
-    if child.n == 0: unexplored_children.append(child)
-    else:
-      ucb = -child.t/best_tm + C*math.sqrt(math.log(node.n)/child.n)
-      if not math.isinf(ucb):
-        explored_children.append(child)
-        ucb_explored_children.append(ucb)
-  if len(unexplored_children): return random.choice(unexplored_children)
-  if not len(explored_children): return node
-  # safe softmax
-  ucb_exp = np.exp((np.array(ucb_explored_children)-max(ucb_explored_children))/TEMP)
-  return _sample_tree(explored_children[np.random.choice(len(ucb_exp), p=ucb_exp/np.sum(ucb_exp))], best_tm)
-
-# this will expand/remove sometimes
-def sample_tree(root:MCTSNode, best_tm:float) -> Optional[MCTSNode]:
-  if root.children is None: expand_node(root)
-  while root.children:
-    # tree traversal
-    node = _sample_tree(root, best_tm)
-
-    if node.children is not None and len(node.children) == 0:
-      remove_node(node)
-      continue
-
-    # node expansion
-    if node.n != 0:
-      if node.children is None: expand_node(node)
-      assert node.children is not None
-      if len(node.children) == 0:
-        remove_node(node)
-        continue
-      node = random.choice(node.children)
-    return node
-  return None
-
-def backprop(bnode:MCTSNode, tm, strength=1.0):
-  if bnode.t > tm: bnode.t = tm
-  bnode.n += strength
-  for parent in bnode.parents: backprop(parent, tm, strength/len(bnode.parents))
-
-graph_mcts_cnt = 0
-def mcts_search(lin:Kernel, rawbufs:List[Buffer], amt:int) -> Kernel:
-  global graph_mcts_cnt
-  # TODO: copied from BEAM
-  key = {"ast": lin.ast.key, "amt": amt, "device": lin.opts.device, "suffix": lin.opts.suffix}
-  if not getenv("IGNORE_MCTS_CACHE") and CACHELEVEL >= 1 and (val:=diskcache_get("mcts_search", key)) is not None:
-    ret = lin.copy()
-    for o in val[len(lin.applied_opts):]: ret.apply_opt(o)
-    return ret
-
-  rawbufs = _ensure_buffer_alloc(rawbufs)
-  var_vals = {k.expr:(k.vmax+k.vmin)//2 for k in lin.ast.variables()}
-  dev = Device[lin.opts.device]
-  root = MCTSNode(lin)
-
-  st = time.perf_counter()
-  best, best_idx, best_tm = lin, 0, math.inf
-  seen_libs: Dict[bytes, MCTSNode] = {}
-  seen_asts: Dict[bytes, MCTSNode] = {}
-  compile_time, runtime_time = 0.0, 0.0
-  for i in range(amt):
-    node = sample_tree(root, best_tm)  # sample and expand
-    if node is None: break  # finished the whole tree
-    node.i = i  # when was node explored
-
-    opt_ast = node.kernel.get_optimized_ast()
-    if (sibling_node:=seen_asts.get(opt_ast.key, None)) is not None:
-      # early check for same optimized AST hit
-      remove_node(node)
-      tm = sibling_node.t
-    else:
-      seen_asts[opt_ast.key] = node
-
-      # lowering (50% of the time)
-      p = get_program(node.kernel.get_optimized_ast(name_override="test"), node.kernel.opts)
-
-      # rollout
-      tm1 = time.perf_counter()
-      try:
-        lib = dev.compiler.compile(p.src)
-      except CompileError:
-        # NOTE: many of these "compiler errors" are caused by bad code output from the lowerer
-        lib = None
-      tm2 = time.perf_counter()
-      if lib is None:
-        tm = math.inf
-      else:
-        if (sibling_node:=seen_libs.get(lib, None)) is not None:
-          # NOTE: these should all be caught by the AST check, need to canonicalize
-          # remove this node, it's a duplicate
-          remove_node(node)
-          tm = sibling_node.t
-        else:
-          seen_libs[lib] = node
-          try: tm = statistics.median(_time_program(p, lib, var_vals, rawbufs, cnt=3, early_stop=best_tm*5/1e6))*1e6
-          except RuntimeError: tm = math.inf
-          node.tm = tm
-      tm3 = time.perf_counter()
-      compile_time += tm2-tm1
-      runtime_time += tm3-tm2
-
-      # mock rollout
-      #node.tm = tm = random.random() + 0.1
-
-    if tm < best_tm: best, best_idx, best_tm = node.kernel, i, tm
-    et = time.perf_counter() - st
-    if DEBUG>=2: print(f"\r{et:7.2f}s {colored(f'{compile_time*100/et:3.0f}%', 'cyan')} {colored(f'{runtime_time*100/et:3.0f}%', 'red')}: {tm:12.2f} us     best: {best_tm:12.2f} us @ {best_idx+1:4d}      {i+1:4d}/{amt:4d}  {int(round((i+1)/et)):4d}/s     {node.kernel.colored_shape()}\033[K", end="")  # noqa: E501
-
-    # backprop
-    backprop(node, tm)
-  if DEBUG>=2: print()
-
-  if getenv("MCTSGRAPH"):
-    import networkx as nx
-    import os
-    GRAPHPATH = "/tmp/net"
-    def save_graph(G, fn, opt=""):
-      print("saving", G, f"to {fn}.svg")
-      nx.drawing.nx_pydot.write_dot(G, f'{fn}.dot')
-      os.system(f'dot {opt} -Tsvg {fn}.dot -o {fn}.svg')
-
-    G = nx.DiGraph()
-    def add_node(node:MCTSNode):
-      if node.n == 0: return
-      for parent in node.parents: G.add_edge(parent, node)
-      gopts = node.kernel.applied_opts
-      edge_lbl = f"{str(gopts[-1].op)[7:]} {gopts[-1].axis} {gopts[-1].arg}" if len(gopts) else "ROOT"
-      G.add_node(node, label=f"{node.i+1}\n{node.tm:.2f} us\n{edge_lbl}\nt {node.t:.2f}\nn {node.n}",
-                 fillcolor="#80ff8080" if node.tm == best_tm else "#ffff8080", style='filled' if node.t == best_tm else '')
-      if node.children is not None:
-        for child in node.children+node.removed_children: add_node(child)
-    add_node(root)
-    save_graph(G, f"{GRAPHPATH}.{graph_mcts_cnt}.mcts", '-Grankdir=LR')
-    graph_mcts_cnt += 1
-
-  if CACHELEVEL >= 1: diskcache_put("mcts_search", key, best.applied_opts)
-  return best
--- a/extra/mmapeak/mmapeak.py
+++ b/extra/mmapeak/mmapeak.py
@ -48,7 +48,7 @@ if __name__=="__main__":
  COMPILER = HIPCompiler(DEV.arch)
  if DEV.arch in {'gfx1100', 'gfx1103', 'gfx1151'}:
    if DEV.arch == 'gfx1103': NUM_WORKGROUPS = 8
-    if DEV.arch == 'gfx1151': NUM_WORKGROUPS = 40
+    if DEV.arch == 'gfx1151': NUM_WORKGROUPS = 32
    launchBenchmark("v_wmma_bf16_16x16x16_bf16", (7,8,15))
    launchBenchmark("v_wmma_f16_16x16x16_f16", (7,8,15))
    launchBenchmark("v_wmma_f32_16x16x16_bf16", (7,8,15))
--- a/extra/models/bert.py
+++ b/extra/models/bert.py
@ -242,7 +242,8 @@ class BertIntermediate:
  def __call__(self, hidden_states):
    x = self.dense(hidden_states)
    # tinygrad gelu is openai gelu but we need the original bert gelu
-    return gelu(x)
+    # NOTE: contiguous for speed
+    return gelu(x).contiguous()

 class BertAttention:
  def __init__(self, hidden_size, num_attention_heads, attention_probs_dropout_prob, hidden_dropout_prob):
--- a/extra/nv_gpu_driver/clc9b0.h
+++ b/extra/nv_gpu_driver/clc9b0.h
@ -0,0 +1,603 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef clc9b0_h_
+#define clc9b0_h_
+
+#include "nvtypes.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define NVC9B0_VIDEO_DECODER                                                       (0x0000C9B0)
+
+#define NVC9B0_NOP                                                              (0x00000100)
+#define NVC9B0_NOP_PARAMETER                                                    31:0
+#define NVC9B0_PM_TRIGGER                                                       (0x00000140)
+#define NVC9B0_PM_TRIGGER_V                                                     31:0
+#define NVC9B0_SET_APPLICATION_ID                                               (0x00000200)
+#define NVC9B0_SET_APPLICATION_ID_ID                                            31:0
+#define NVC9B0_SET_APPLICATION_ID_ID_MPEG12                                     (0x00000001)
+#define NVC9B0_SET_APPLICATION_ID_ID_VC1                                        (0x00000002)
+#define NVC9B0_SET_APPLICATION_ID_ID_H264                                       (0x00000003)
+#define NVC9B0_SET_APPLICATION_ID_ID_MPEG4                                      (0x00000004)
+#define NVC9B0_SET_APPLICATION_ID_ID_VP8                                        (0x00000005)
+#define NVC9B0_SET_APPLICATION_ID_ID_CTR64                                      (0x00000006)
+#define NVC9B0_SET_APPLICATION_ID_ID_HEVC                                       (0x00000007)
+#define NVC9B0_SET_APPLICATION_ID_ID_NEW_H264                                   (0x00000008)
+#define NVC9B0_SET_APPLICATION_ID_ID_VP9                                        (0x00000009)
+#define NVC9B0_SET_APPLICATION_ID_ID_PASS1                                      (0x0000000A)
+#define NVC9B0_SET_APPLICATION_ID_ID_HEVC_PARSER                                (0x0000000C)
+#define NVC9B0_SET_APPLICATION_ID_ID_UCODE_TEST                                 (0x0000000D)
+#define NVC9B0_SET_APPLICATION_ID_ID_HWDRM_PR_DECRYPTAUDIO                      (0x0000000E)
+#define NVC9B0_SET_APPLICATION_ID_ID_HWDRM_PR_DECRYPTAUDIOMULTIPLE              (0x0000000F)
+#define NVC9B0_SET_APPLICATION_ID_ID_HWDRM_PR_PREPROCESSENCRYPTEDDATA           (0x00000010)
+#define NVC9B0_SET_APPLICATION_ID_ID_VP9_WITH_PARSER                            (0x00000011)
+#define NVC9B0_SET_APPLICATION_ID_ID_AVD                                        (0x00000012)
+#define NVC9B0_SET_APPLICATION_ID_ID_HW_DRM_PR4_DECRYPTCONTENTMULTIPLE          (0x00000013)
+#define NVC9B0_SET_APPLICATION_ID_ID_DHKE                                       (0x00000020)
+#define NVC9B0_SET_WATCHDOG_TIMER                                               (0x00000204)
+#define NVC9B0_SET_WATCHDOG_TIMER_TIMER                                         31:0
+#define NVC9B0_SEMAPHORE_A                                                      (0x00000240)
+#define NVC9B0_SEMAPHORE_A_UPPER                                                7:0
+#define NVC9B0_SEMAPHORE_B                                                      (0x00000244)
+#define NVC9B0_SEMAPHORE_B_LOWER                                                31:0
+#define NVC9B0_SEMAPHORE_C                                                      (0x00000248)
+#define NVC9B0_SEMAPHORE_C_PAYLOAD                                              31:0
+#define NVC9B0_CTX_SAVE_AREA                                                    (0x0000024C)
+#define NVC9B0_CTX_SAVE_AREA_OFFSET                                             31:0
+#define NVC9B0_CTX_SWITCH                                                       (0x00000250)
+#define NVC9B0_CTX_SWITCH_OP                                                    1:0
+#define NVC9B0_CTX_SWITCH_OP_CTX_UPDATE                                         (0x00000000)
+#define NVC9B0_CTX_SWITCH_OP_CTX_SAVE                                           (0x00000001)
+#define NVC9B0_CTX_SWITCH_OP_CTX_RESTORE                                        (0x00000002)
+#define NVC9B0_CTX_SWITCH_OP_CTX_FORCERESTORE                                   (0x00000003)
+#define NVC9B0_CTX_SWITCH_CTXID_VALID                                           2:2
+#define NVC9B0_CTX_SWITCH_CTXID_VALID_FALSE                                     (0x00000000)
+#define NVC9B0_CTX_SWITCH_CTXID_VALID_TRUE                                      (0x00000001)
+#define NVC9B0_CTX_SWITCH_RESERVED0                                             7:3
+#define NVC9B0_CTX_SWITCH_CTX_ID                                                23:8
+#define NVC9B0_CTX_SWITCH_RESERVED1                                             31:24
+#define NVC9B0_SET_SEMAPHORE_PAYLOAD_LOWER                                      (0x00000254)
+#define NVC9B0_SET_SEMAPHORE_PAYLOAD_LOWER_PAYLOAD_LOWER                        31:0
+#define NVC9B0_SET_SEMAPHORE_PAYLOAD_UPPER                                      (0x00000258)
+#define NVC9B0_SET_SEMAPHORE_PAYLOAD_UPPER_PAYLOAD_UPPER                        31:0
+#define NVC9B0_SET_MONITORED_FENCE_SIGNAL_ADDRESS_BASE_A                        (0x0000025C)
+#define NVC9B0_SET_MONITORED_FENCE_SIGNAL_ADDRESS_BASE_A_LOWER                  31:0
+#define NVC9B0_SET_MONITORED_FENCE_SIGNAL_ADDRESS_BASE_B                        (0x00000260)
+#define NVC9B0_SET_MONITORED_FENCE_SIGNAL_ADDRESS_BASE_B_UPPER                  31:0
+#define NVC9B0_EXECUTE                                                          (0x00000300)
+#define NVC9B0_EXECUTE_NOTIFY                                                   0:0
+#define NVC9B0_EXECUTE_NOTIFY_DISABLE                                           (0x00000000)
+#define NVC9B0_EXECUTE_NOTIFY_ENABLE                                            (0x00000001)
+#define NVC9B0_EXECUTE_NOTIFY_ON                                                1:1
+#define NVC9B0_EXECUTE_NOTIFY_ON_END                                            (0x00000000)
+#define NVC9B0_EXECUTE_NOTIFY_ON_BEGIN                                          (0x00000001)
+#define NVC9B0_EXECUTE_PREDICATION                                              2:2
+#define NVC9B0_EXECUTE_PREDICATION_DISABLE                                      (0x00000000)
+#define NVC9B0_EXECUTE_PREDICATION_ENABLE                                       (0x00000001)
+#define NVC9B0_EXECUTE_PREDICATION_OP                                           3:3
+#define NVC9B0_EXECUTE_PREDICATION_OP_EQUAL_ZERO                                (0x00000000)
+#define NVC9B0_EXECUTE_PREDICATION_OP_NOT_EQUAL_ZERO                            (0x00000001)
+#define NVC9B0_EXECUTE_AWAKEN                                                   8:8
+#define NVC9B0_EXECUTE_AWAKEN_DISABLE                                           (0x00000000)
+#define NVC9B0_EXECUTE_AWAKEN_ENABLE                                            (0x00000001)
+#define NVC9B0_SEMAPHORE_D                                                      (0x00000304)
+#define NVC9B0_SEMAPHORE_D_STRUCTURE_SIZE                                       1:0
+#define NVC9B0_SEMAPHORE_D_STRUCTURE_SIZE_ONE                                   (0x00000000)
+#define NVC9B0_SEMAPHORE_D_STRUCTURE_SIZE_FOUR                                  (0x00000001)
+#define NVC9B0_SEMAPHORE_D_STRUCTURE_SIZE_TWO                                   (0x00000002)
+#define NVC9B0_SEMAPHORE_D_AWAKEN_ENABLE                                        8:8
+#define NVC9B0_SEMAPHORE_D_AWAKEN_ENABLE_FALSE                                  (0x00000000)
+#define NVC9B0_SEMAPHORE_D_AWAKEN_ENABLE_TRUE                                   (0x00000001)
+#define NVC9B0_SEMAPHORE_D_OPERATION                                            17:16
+#define NVC9B0_SEMAPHORE_D_OPERATION_RELEASE                                    (0x00000000)
+#define NVC9B0_SEMAPHORE_D_OPERATION_RESERVED_0                                 (0x00000001)
+#define NVC9B0_SEMAPHORE_D_OPERATION_RESERVED_1                                 (0x00000002)
+#define NVC9B0_SEMAPHORE_D_OPERATION_TRAP                                       (0x00000003)
+#define NVC9B0_SEMAPHORE_D_FLUSH_DISABLE                                        21:21
+#define NVC9B0_SEMAPHORE_D_FLUSH_DISABLE_FALSE                                  (0x00000000)
+#define NVC9B0_SEMAPHORE_D_FLUSH_DISABLE_TRUE                                   (0x00000001)
+#define NVC9B0_SEMAPHORE_D_TRAP_TYPE                                            23:22
+#define NVC9B0_SEMAPHORE_D_TRAP_TYPE_UNCONDITIONAL                              (0x00000000)
+#define NVC9B0_SEMAPHORE_D_TRAP_TYPE_CONDITIONAL                                (0x00000001)
+#define NVC9B0_SEMAPHORE_D_TRAP_TYPE_CONDITIONAL_EXT                            (0x00000002)
+#define NVC9B0_SEMAPHORE_D_PAYLOAD_SIZE                                         24:24
+#define NVC9B0_SEMAPHORE_D_PAYLOAD_SIZE_32BIT                                   (0x00000000)
+#define NVC9B0_SEMAPHORE_D_PAYLOAD_SIZE_64BIT                                   (0x00000001)
+#define NVC9B0_SET_PREDICATION_OFFSET_UPPER                                     (0x00000308)
+#define NVC9B0_SET_PREDICATION_OFFSET_UPPER_OFFSET                              7:0
+#define NVC9B0_SET_PREDICATION_OFFSET_LOWER                                     (0x0000030C)
+#define NVC9B0_SET_PREDICATION_OFFSET_LOWER_OFFSET                              31:0
+#define NVC9B0_SET_AUXILIARY_DATA_BUFFER                                        (0x00000310)
+#define NVC9B0_SET_AUXILIARY_DATA_BUFFER_OFFSET                                 31:0
+#define NVC9B0_SET_CONTROL_PARAMS                                               (0x00000400)
+#define NVC9B0_SET_CONTROL_PARAMS_CODEC_TYPE                                    3:0
+#define NVC9B0_SET_CONTROL_PARAMS_CODEC_TYPE_MPEG1                              (0x00000000)
+#define NVC9B0_SET_CONTROL_PARAMS_CODEC_TYPE_MPEG2                              (0x00000001)
+#define NVC9B0_SET_CONTROL_PARAMS_CODEC_TYPE_VC1                                (0x00000002)
+#define NVC9B0_SET_CONTROL_PARAMS_CODEC_TYPE_H264                               (0x00000003)
+#define NVC9B0_SET_CONTROL_PARAMS_CODEC_TYPE_MPEG4                              (0x00000004)
+#define NVC9B0_SET_CONTROL_PARAMS_CODEC_TYPE_DIVX3                              (0x00000004)
+#define NVC9B0_SET_CONTROL_PARAMS_CODEC_TYPE_VP8                                (0x00000005)
+#define NVC9B0_SET_CONTROL_PARAMS_CODEC_TYPE_HEVC                               (0x00000007)
+#define NVC9B0_SET_CONTROL_PARAMS_CODEC_TYPE_VP9                                (0x00000009)
+#define NVC9B0_SET_CONTROL_PARAMS_CODEC_TYPE_AV1                                (0x0000000A)
+#define NVC9B0_SET_CONTROL_PARAMS_GPTIMER_ON                                    4:4
+#define NVC9B0_SET_CONTROL_PARAMS_RET_ERROR                                     5:5
+#define NVC9B0_SET_CONTROL_PARAMS_ERR_CONCEAL_ON                                6:6
+#define NVC9B0_SET_CONTROL_PARAMS_ERROR_FRM_IDX                                 12:7
+#define NVC9B0_SET_CONTROL_PARAMS_MBTIMER_ON                                    13:13
+#define NVC9B0_SET_CONTROL_PARAMS_EC_INTRA_FRAME_USING_PSLC                     14:14
+#define NVC9B0_SET_CONTROL_PARAMS_IGNORE_SOME_FIELDS_CRC_CHECK                  15:15
+#define NVC9B0_SET_CONTROL_PARAMS_EVENT_TRACE_LOGGING_ON                        16:16
+#define NVC9B0_SET_CONTROL_PARAMS_ALL_INTRA_FRAME                               17:17
+#define NVC9B0_SET_CONTROL_PARAMS_TESTRUN_ENV                                   19:18
+#define NVC9B0_SET_CONTROL_PARAMS_TESTRUN_ENV_TRACE3D_RUN                       (0x00000000)
+#define NVC9B0_SET_CONTROL_PARAMS_TESTRUN_ENV_PROD_RUN                          (0x00000001)
+#define NVC9B0_SET_CONTROL_PARAMS_HINT_DUMP_EN                                  20:20
+#define NVC9B0_SET_CONTROL_PARAMS_RESERVED                                      25:21
+#define NVC9B0_SET_CONTROL_PARAMS_NVDECSIM_SKIP_SCP                             26:26
+#define NVC9B0_SET_CONTROL_PARAMS_ENABLE_ENCRYPT                                27:27
+#define NVC9B0_SET_CONTROL_PARAMS_ENCRYPTMODE                                   31:28
+#define NVC9B0_SET_DRV_PIC_SETUP_OFFSET                                         (0x00000404)
+#define NVC9B0_SET_DRV_PIC_SETUP_OFFSET_OFFSET                                  31:0
+#define NVC9B0_SET_IN_BUF_BASE_OFFSET                                           (0x00000408)
+#define NVC9B0_SET_IN_BUF_BASE_OFFSET_OFFSET                                    31:0
+#define NVC9B0_SET_PICTURE_INDEX                                                (0x0000040C)
+#define NVC9B0_SET_PICTURE_INDEX_INDEX                                          31:0
+#define NVC9B0_SET_SLICE_OFFSETS_BUF_OFFSET                                     (0x00000410)
+#define NVC9B0_SET_SLICE_OFFSETS_BUF_OFFSET_OFFSET                              31:0
+#define NVC9B0_SET_COLOC_DATA_OFFSET                                            (0x00000414)
+#define NVC9B0_SET_COLOC_DATA_OFFSET_OFFSET                                     31:0
+#define NVC9B0_SET_HISTORY_OFFSET                                               (0x00000418)
+#define NVC9B0_SET_HISTORY_OFFSET_OFFSET                                        31:0
+#define NVC9B0_SET_DISPLAY_BUF_SIZE                                             (0x0000041C)
+#define NVC9B0_SET_DISPLAY_BUF_SIZE_SIZE                                        31:0
+#define NVC9B0_SET_HISTOGRAM_OFFSET                                             (0x00000420)
+#define NVC9B0_SET_HISTOGRAM_OFFSET_OFFSET                                      31:0
+#define NVC9B0_SET_NVDEC_STATUS_OFFSET                                          (0x00000424)
+#define NVC9B0_SET_NVDEC_STATUS_OFFSET_OFFSET                                   31:0
+#define NVC9B0_SET_DISPLAY_BUF_LUMA_OFFSET                                      (0x00000428)
+#define NVC9B0_SET_DISPLAY_BUF_LUMA_OFFSET_OFFSET                               31:0
+#define NVC9B0_SET_DISPLAY_BUF_CHROMA_OFFSET                                    (0x0000042C)
+#define NVC9B0_SET_DISPLAY_BUF_CHROMA_OFFSET_OFFSET                             31:0
+#define NVC9B0_SET_PICTURE_LUMA_OFFSET0                                         (0x00000430)
+#define NVC9B0_SET_PICTURE_LUMA_OFFSET0_OFFSET                                  31:0
+#define NVC9B0_SET_PICTURE_LUMA_OFFSET1                                         (0x00000434)
+#define NVC9B0_SET_PICTURE_LUMA_OFFSET1_OFFSET                                  31:0
+#define NVC9B0_SET_PICTURE_LUMA_OFFSET2                                         (0x00000438)
+#define NVC9B0_SET_PICTURE_LUMA_OFFSET2_OFFSET                                  31:0
+#define NVC9B0_SET_PICTURE_LUMA_OFFSET3                                         (0x0000043C)
+#define NVC9B0_SET_PICTURE_LUMA_OFFSET3_OFFSET                                  31:0
+#define NVC9B0_SET_PICTURE_LUMA_OFFSET4                                         (0x00000440)
+#define NVC9B0_SET_PICTURE_LUMA_OFFSET4_OFFSET                                  31:0
+#define NVC9B0_SET_PICTURE_LUMA_OFFSET5                                         (0x00000444)
+#define NVC9B0_SET_PICTURE_LUMA_OFFSET5_OFFSET                                  31:0
+#define NVC9B0_SET_PICTURE_LUMA_OFFSET6                                         (0x00000448)
+#define NVC9B0_SET_PICTURE_LUMA_OFFSET6_OFFSET                                  31:0
+#define NVC9B0_SET_PICTURE_LUMA_OFFSET7                                         (0x0000044C)
+#define NVC9B0_SET_PICTURE_LUMA_OFFSET7_OFFSET                                  31:0
+#define NVC9B0_SET_PICTURE_LUMA_OFFSET8                                         (0x00000450)
+#define NVC9B0_SET_PICTURE_LUMA_OFFSET8_OFFSET                                  31:0
+#define NVC9B0_SET_PICTURE_LUMA_OFFSET9                                         (0x00000454)
+#define NVC9B0_SET_PICTURE_LUMA_OFFSET9_OFFSET                                  31:0
+#define NVC9B0_SET_PICTURE_LUMA_OFFSET10                                        (0x00000458)
+#define NVC9B0_SET_PICTURE_LUMA_OFFSET10_OFFSET                                 31:0
+#define NVC9B0_SET_PICTURE_LUMA_OFFSET11                                        (0x0000045C)
+#define NVC9B0_SET_PICTURE_LUMA_OFFSET11_OFFSET                                 31:0
+#define NVC9B0_SET_PICTURE_LUMA_OFFSET12                                        (0x00000460)
+#define NVC9B0_SET_PICTURE_LUMA_OFFSET12_OFFSET                                 31:0
+#define NVC9B0_SET_PICTURE_LUMA_OFFSET13                                        (0x00000464)
+#define NVC9B0_SET_PICTURE_LUMA_OFFSET13_OFFSET                                 31:0
+#define NVC9B0_SET_PICTURE_LUMA_OFFSET14                                        (0x00000468)
+#define NVC9B0_SET_PICTURE_LUMA_OFFSET14_OFFSET                                 31:0
+#define NVC9B0_SET_PICTURE_LUMA_OFFSET15                                        (0x0000046C)
+#define NVC9B0_SET_PICTURE_LUMA_OFFSET15_OFFSET                                 31:0
+#define NVC9B0_SET_PICTURE_LUMA_OFFSET16                                        (0x00000470)
+#define NVC9B0_SET_PICTURE_LUMA_OFFSET16_OFFSET                                 31:0
+#define NVC9B0_SET_PICTURE_CHROMA_OFFSET0                                       (0x00000474)
+#define NVC9B0_SET_PICTURE_CHROMA_OFFSET0_OFFSET                                31:0
+#define NVC9B0_SET_PICTURE_CHROMA_OFFSET1                                       (0x00000478)
+#define NVC9B0_SET_PICTURE_CHROMA_OFFSET1_OFFSET                                31:0
+#define NVC9B0_SET_PICTURE_CHROMA_OFFSET2                                       (0x0000047C)
+#define NVC9B0_SET_PICTURE_CHROMA_OFFSET2_OFFSET                                31:0
+#define NVC9B0_SET_PICTURE_CHROMA_OFFSET3                                       (0x00000480)
+#define NVC9B0_SET_PICTURE_CHROMA_OFFSET3_OFFSET                                31:0
+#define NVC9B0_SET_PICTURE_CHROMA_OFFSET4                                       (0x00000484)
+#define NVC9B0_SET_PICTURE_CHROMA_OFFSET4_OFFSET                                31:0
+#define NVC9B0_SET_PICTURE_CHROMA_OFFSET5                                       (0x00000488)
+#define NVC9B0_SET_PICTURE_CHROMA_OFFSET5_OFFSET                                31:0
+#define NVC9B0_SET_PICTURE_CHROMA_OFFSET6                                       (0x0000048C)
+#define NVC9B0_SET_PICTURE_CHROMA_OFFSET6_OFFSET                                31:0
+#define NVC9B0_SET_PICTURE_CHROMA_OFFSET7                                       (0x00000490)
+#define NVC9B0_SET_PICTURE_CHROMA_OFFSET7_OFFSET                                31:0
+#define NVC9B0_SET_PICTURE_CHROMA_OFFSET8                                       (0x00000494)
+#define NVC9B0_SET_PICTURE_CHROMA_OFFSET8_OFFSET                                31:0
+#define NVC9B0_SET_PICTURE_CHROMA_OFFSET9                                       (0x00000498)
+#define NVC9B0_SET_PICTURE_CHROMA_OFFSET9_OFFSET                                31:0
+#define NVC9B0_SET_PICTURE_CHROMA_OFFSET10                                      (0x0000049C)
+#define NVC9B0_SET_PICTURE_CHROMA_OFFSET10_OFFSET                               31:0
+#define NVC9B0_SET_PICTURE_CHROMA_OFFSET11                                      (0x000004A0)
+#define NVC9B0_SET_PICTURE_CHROMA_OFFSET11_OFFSET                               31:0
+#define NVC9B0_SET_PICTURE_CHROMA_OFFSET12                                      (0x000004A4)
+#define NVC9B0_SET_PICTURE_CHROMA_OFFSET12_OFFSET                               31:0
+#define NVC9B0_SET_PICTURE_CHROMA_OFFSET13                                      (0x000004A8)
+#define NVC9B0_SET_PICTURE_CHROMA_OFFSET13_OFFSET                               31:0
+#define NVC9B0_SET_PICTURE_CHROMA_OFFSET14                                      (0x000004AC)
+#define NVC9B0_SET_PICTURE_CHROMA_OFFSET14_OFFSET                               31:0
+#define NVC9B0_SET_PICTURE_CHROMA_OFFSET15                                      (0x000004B0)
+#define NVC9B0_SET_PICTURE_CHROMA_OFFSET15_OFFSET                               31:0
+#define NVC9B0_SET_PICTURE_CHROMA_OFFSET16                                      (0x000004B4)
+#define NVC9B0_SET_PICTURE_CHROMA_OFFSET16_OFFSET                               31:0
+#define NVC9B0_SET_PIC_SCRATCH_BUF_OFFSET                                       (0x000004B8)
+#define NVC9B0_SET_PIC_SCRATCH_BUF_OFFSET_OFFSET                                31:0
+#define NVC9B0_SET_EXTERNAL_MVBUFFER_OFFSET                                     (0x000004BC)
+#define NVC9B0_SET_EXTERNAL_MVBUFFER_OFFSET_OFFSET                              31:0
+#define NVC9B0_SET_SUB_SAMPLE_MAP_OFFSET                                        (0x000004C0)
+#define NVC9B0_SET_SUB_SAMPLE_MAP_OFFSET_OFFSET                                 31:0
+#define NVC9B0_SET_SUB_SAMPLE_MAP_IV_OFFSET                                     (0x000004C4)
+#define NVC9B0_SET_SUB_SAMPLE_MAP_IV_OFFSET_OFFSET                              31:0
+#define NVC9B0_SET_INTRA_TOP_BUF_OFFSET                                         (0x000004C8)
+#define NVC9B0_SET_INTRA_TOP_BUF_OFFSET_OFFSET                                  31:0
+#define NVC9B0_SET_TILE_SIZE_BUF_OFFSET                                         (0x000004CC)
+#define NVC9B0_SET_TILE_SIZE_BUF_OFFSET_OFFSET                                  31:0
+#define NVC9B0_SET_FILTER_BUFFER_OFFSET                                         (0x000004D0)
+#define NVC9B0_SET_FILTER_BUFFER_OFFSET_OFFSET                                  31:0
+#define NVC9B0_SET_CRC_STRUCT_OFFSET                                            (0x000004D4)
+#define NVC9B0_SET_CRC_STRUCT_OFFSET_OFFSET                                     31:0
+#define NVC9B0_SET_PR_SSM_CONTENT_INFO_BUF_OFFSET                               (0x000004D8)
+#define NVC9B0_SET_PR_SSM_CONTENT_INFO_BUF_OFFSET_OFFSET                        31:0
+#define NVC9B0_H264_SET_MBHIST_BUF_OFFSET                                       (0x00000500)
+#define NVC9B0_H264_SET_MBHIST_BUF_OFFSET_OFFSET                                31:0
+#define NVC9B0_VP8_SET_PROB_DATA_OFFSET                                         (0x00000540)
+#define NVC9B0_VP8_SET_PROB_DATA_OFFSET_OFFSET                                  31:0
+#define NVC9B0_VP8_SET_HEADER_PARTITION_BUF_BASE_OFFSET                         (0x00000544)
+#define NVC9B0_VP8_SET_HEADER_PARTITION_BUF_BASE_OFFSET_OFFSET                  31:0
+#define NVC9B0_HEVC_SET_SCALING_LIST_OFFSET                                     (0x00000580)
+#define NVC9B0_HEVC_SET_SCALING_LIST_OFFSET_OFFSET                              31:0
+#define NVC9B0_HEVC_SET_TILE_SIZES_OFFSET                                       (0x00000584)
+#define NVC9B0_HEVC_SET_TILE_SIZES_OFFSET_OFFSET                                31:0
+#define NVC9B0_HEVC_SET_FILTER_BUFFER_OFFSET                                    (0x00000588)
+#define NVC9B0_HEVC_SET_FILTER_BUFFER_OFFSET_OFFSET                             31:0
+#define NVC9B0_HEVC_SET_SAO_BUFFER_OFFSET                                       (0x0000058C)
+#define NVC9B0_HEVC_SET_SAO_BUFFER_OFFSET_OFFSET                                31:0
+#define NVC9B0_HEVC_SET_SLICE_INFO_BUFFER_OFFSET                                (0x00000590)
+#define NVC9B0_HEVC_SET_SLICE_INFO_BUFFER_OFFSET_OFFSET                         31:0
+#define NVC9B0_HEVC_SET_SLICE_GROUP_INDEX                                       (0x00000594)
+#define NVC9B0_HEVC_SET_SLICE_GROUP_INDEX_OFFSET                                31:0
+#define NVC9B0_VP9_SET_PROB_TAB_BUF_OFFSET                                      (0x000005C0)
+#define NVC9B0_VP9_SET_PROB_TAB_BUF_OFFSET_OFFSET                               31:0
+#define NVC9B0_VP9_SET_CTX_COUNTER_BUF_OFFSET                                   (0x000005C4)
+#define NVC9B0_VP9_SET_CTX_COUNTER_BUF_OFFSET_OFFSET                            31:0
+#define NVC9B0_VP9_SET_SEGMENT_READ_BUF_OFFSET                                  (0x000005C8)
+#define NVC9B0_VP9_SET_SEGMENT_READ_BUF_OFFSET_OFFSET                           31:0
+#define NVC9B0_VP9_SET_SEGMENT_WRITE_BUF_OFFSET                                 (0x000005CC)
+#define NVC9B0_VP9_SET_SEGMENT_WRITE_BUF_OFFSET_OFFSET                          31:0
+#define NVC9B0_VP9_SET_TILE_SIZE_BUF_OFFSET                                     (0x000005D0)
+#define NVC9B0_VP9_SET_TILE_SIZE_BUF_OFFSET_OFFSET                              31:0
+#define NVC9B0_VP9_SET_COL_MVWRITE_BUF_OFFSET                                   (0x000005D4)
+#define NVC9B0_VP9_SET_COL_MVWRITE_BUF_OFFSET_OFFSET                            31:0
+#define NVC9B0_VP9_SET_COL_MVREAD_BUF_OFFSET                                    (0x000005D8)
+#define NVC9B0_VP9_SET_COL_MVREAD_BUF_OFFSET_OFFSET                             31:0
+#define NVC9B0_VP9_SET_FILTER_BUFFER_OFFSET                                     (0x000005DC)
+#define NVC9B0_VP9_SET_FILTER_BUFFER_OFFSET_OFFSET                              31:0
+#define NVC9B0_VP9_PARSER_SET_PIC_SETUP_OFFSET                                  (0x000005E0)
+#define NVC9B0_VP9_PARSER_SET_PIC_SETUP_OFFSET_OFFSET                           31:0
+#define NVC9B0_VP9_PARSER_SET_PREV_PIC_SETUP_OFFSET                             (0x000005E4)
+#define NVC9B0_VP9_PARSER_SET_PREV_PIC_SETUP_OFFSET_OFFSET                      31:0
+#define NVC9B0_VP9_PARSER_SET_PROB_TAB_BUF_OFFSET                               (0x000005E8)
+#define NVC9B0_VP9_PARSER_SET_PROB_TAB_BUF_OFFSET_OFFSET                        31:0
+#define NVC9B0_VP9_SET_HINT_DUMP_BUF_OFFSET                                     (0x000005EC)
+#define NVC9B0_VP9_SET_HINT_DUMP_BUF_OFFSET_OFFSET                              31:0
+#define NVC9B0_PASS1_SET_CLEAR_HEADER_OFFSET                                    (0x00000600)
+#define NVC9B0_PASS1_SET_CLEAR_HEADER_OFFSET_OFFSET                             31:0
+#define NVC9B0_PASS1_SET_RE_ENCRYPT_OFFSET                                      (0x00000604)
+#define NVC9B0_PASS1_SET_RE_ENCRYPT_OFFSET_OFFSET                               31:0
+#define NVC9B0_PASS1_SET_VP8_TOKEN_OFFSET                                       (0x00000608)
+#define NVC9B0_PASS1_SET_VP8_TOKEN_OFFSET_OFFSET                                31:0
+#define NVC9B0_PASS1_SET_INPUT_DATA_OFFSET                                      (0x0000060C)
+#define NVC9B0_PASS1_SET_INPUT_DATA_OFFSET_OFFSET                               31:0
+#define NVC9B0_PASS1_SET_OUTPUT_DATA_SIZE_OFFSET                                (0x00000610)
+#define NVC9B0_PASS1_SET_OUTPUT_DATA_SIZE_OFFSET_OFFSET                         31:0
+#define NVC9B0_AV1_SET_PROB_TAB_READ_BUF_OFFSET                                 (0x00000640)
+#define NVC9B0_AV1_SET_PROB_TAB_READ_BUF_OFFSET_OFFSET                          31:0
+#define NVC9B0_AV1_SET_PROB_TAB_WRITE_BUF_OFFSET                                (0x00000644)
+#define NVC9B0_AV1_SET_PROB_TAB_WRITE_BUF_OFFSET_OFFSET                         31:0
+#define NVC9B0_AV1_SET_SEGMENT_READ_BUF_OFFSET                                  (0x00000648)
+#define NVC9B0_AV1_SET_SEGMENT_READ_BUF_OFFSET_OFFSET                           31:0
+#define NVC9B0_AV1_SET_SEGMENT_WRITE_BUF_OFFSET                                 (0x0000064C)
+#define NVC9B0_AV1_SET_SEGMENT_WRITE_BUF_OFFSET_OFFSET                          31:0
+#define NVC9B0_AV1_SET_COL_MV0_READ_BUF_OFFSET                                  (0x00000650)
+#define NVC9B0_AV1_SET_COL_MV0_READ_BUF_OFFSET_OFFSET                           31:0
+#define NVC9B0_AV1_SET_COL_MV1_READ_BUF_OFFSET                                  (0x00000654)
+#define NVC9B0_AV1_SET_COL_MV1_READ_BUF_OFFSET_OFFSET                           31:0
+#define NVC9B0_AV1_SET_COL_MV2_READ_BUF_OFFSET                                  (0x00000658)
+#define NVC9B0_AV1_SET_COL_MV2_READ_BUF_OFFSET_OFFSET                           31:0
+#define NVC9B0_AV1_SET_COL_MVWRITE_BUF_OFFSET                                   (0x0000065C)
+#define NVC9B0_AV1_SET_COL_MVWRITE_BUF_OFFSET_OFFSET                            31:0
+#define NVC9B0_AV1_SET_GLOBAL_MODEL_BUF_OFFSET                                  (0x00000660)
+#define NVC9B0_AV1_SET_GLOBAL_MODEL_BUF_OFFSET_OFFSET                           31:0
+#define NVC9B0_AV1_SET_FILM_GRAIN_BUF_OFFSET                                    (0x00000664)
+#define NVC9B0_AV1_SET_FILM_GRAIN_BUF_OFFSET_OFFSET                             31:0
+#define NVC9B0_AV1_SET_TILE_STREAM_INFO_BUF_OFFSET                              (0x00000668)
+#define NVC9B0_AV1_SET_TILE_STREAM_INFO_BUF_OFFSET_OFFSET                       31:0
+#define NVC9B0_AV1_SET_SUB_STREAM_ENTRY_BUF_OFFSET                              (0x0000066C)
+#define NVC9B0_AV1_SET_SUB_STREAM_ENTRY_BUF_OFFSET_OFFSET                       31:0
+#define NVC9B0_AV1_SET_HINT_DUMP_BUF_OFFSET                                     (0x00000670)
+#define NVC9B0_AV1_SET_HINT_DUMP_BUF_OFFSET_OFFSET                              31:0
+#define NVC9B0_H264_SET_SCALING_LIST_OFFSET                                     (0x00000680)
+#define NVC9B0_H264_SET_SCALING_LIST_OFFSET_OFFSET                              31:0
+#define NVC9B0_H264_SET_VLDHIST_BUF_OFFSET                                      (0x00000684)
+#define NVC9B0_H264_SET_VLDHIST_BUF_OFFSET_OFFSET                               31:0
+#define NVC9B0_H264_SET_EDOBOFFSET0                                             (0x00000688)
+#define NVC9B0_H264_SET_EDOBOFFSET0_OFFSET                                      31:0
+#define NVC9B0_H264_SET_EDOBOFFSET1                                             (0x0000068C)
+#define NVC9B0_H264_SET_EDOBOFFSET1_OFFSET                                      31:0
+#define NVC9B0_H264_SET_EDOBOFFSET2                                             (0x00000690)
+#define NVC9B0_H264_SET_EDOBOFFSET2_OFFSET                                      31:0
+#define NVC9B0_H264_SET_EDOBOFFSET3                                             (0x00000694)
+#define NVC9B0_H264_SET_EDOBOFFSET3_OFFSET                                      31:0
+#define NVC9B0_SET_CONTENT_INITIAL_VECTOR(b)                                    (0x00000C00 + (b)*0x00000004)
+#define NVC9B0_SET_CONTENT_INITIAL_VECTOR_VALUE                                 31:0
+#define NVC9B0_SET_CTL_COUNT                                                    (0x00000C10)
+#define NVC9B0_SET_CTL_COUNT_VALUE                                              31:0
+#define NVC9B0_SET_UPPER_SRC                                                    (0x00000C14)
+#define NVC9B0_SET_UPPER_SRC_OFFSET                                             7:0
+#define NVC9B0_SET_LOWER_SRC                                                    (0x00000C18)
+#define NVC9B0_SET_LOWER_SRC_OFFSET                                             31:0
+#define NVC9B0_SET_UPPER_DST                                                    (0x00000C1C)
+#define NVC9B0_SET_UPPER_DST_OFFSET                                             7:0
+#define NVC9B0_SET_LOWER_DST                                                    (0x00000C20)
+#define NVC9B0_SET_LOWER_DST_OFFSET                                             31:0
+#define NVC9B0_SET_BLOCK_COUNT                                                  (0x00000C24)
+#define NVC9B0_SET_BLOCK_COUNT_VALUE                                            31:0
+#define NVC9B0_PR_SET_REQUEST_BUF_OFFSET                                        (0x00000D00)
+#define NVC9B0_PR_SET_REQUEST_BUF_OFFSET_OFFSET                                 31:0
+#define NVC9B0_PR_SET_REQUEST_BUF_SIZE                                          (0x00000D04)
+#define NVC9B0_PR_SET_REQUEST_BUF_SIZE_SIZE                                     31:0
+#define NVC9B0_PR_SET_RESPONSE_BUF_OFFSET                                       (0x00000D08)
+#define NVC9B0_PR_SET_RESPONSE_BUF_OFFSET_OFFSET                                31:0
+#define NVC9B0_PR_SET_RESPONSE_BUF_SIZE                                         (0x00000D0C)
+#define NVC9B0_PR_SET_RESPONSE_BUF_SIZE_SIZE                                    31:0
+#define NVC9B0_PR_SET_REQUEST_MESSAGE_BUF_OFFSET                                (0x00000D10)
+#define NVC9B0_PR_SET_REQUEST_MESSAGE_BUF_OFFSET_OFFSET                         31:0
+#define NVC9B0_PR_SET_RESPONSE_MESSAGE_BUF_OFFSET                               (0x00000D14)
+#define NVC9B0_PR_SET_RESPONSE_MESSAGE_BUF_OFFSET_OFFSET                        31:0
+#define NVC9B0_PR_SET_LOCAL_DECRYPT_BUF_OFFSET                                  (0x00000D18)
+#define NVC9B0_PR_SET_LOCAL_DECRYPT_BUF_OFFSET_OFFSET                           31:0
+#define NVC9B0_PR_SET_LOCAL_DECRYPT_BUF_SIZE                                    (0x00000D1C)
+#define NVC9B0_PR_SET_LOCAL_DECRYPT_BUF_SIZE_SIZE                               31:0
+#define NVC9B0_PR_SET_CONTENT_DECRYPT_INFO_BUF_OFFSET                           (0x00000D20)
+#define NVC9B0_PR_SET_CONTENT_DECRYPT_INFO_BUF_OFFSET_OFFSET                    31:0
+#define NVC9B0_PR_SET_REENCRYPTED_BITSTREAM_BUF_OFFSET                          (0x00000D24)
+#define NVC9B0_PR_SET_REENCRYPTED_BITSTREAM_BUF_OFFSET_OFFSET                   31:0
+#define NVC9B0_DH_KE_SET_CHALLENGE_BUF_OFFSET                                   (0x00000E00)
+#define NVC9B0_DH_KE_SET_CHALLENGE_BUF_OFFSET_OFFSET                            31:0
+#define NVC9B0_DH_KE_SET_RESPONSE_BUF_OFFSET                                    (0x00000E04)
+#define NVC9B0_DH_KE_SET_RESPONSE_BUF_OFFSET_OFFSET                             31:0
+#define NVC9B0_SET_SESSION_KEY(b)                                               (0x00000F00 + (b)*0x00000004)
+#define NVC9B0_SET_SESSION_KEY_VALUE                                            31:0
+#define NVC9B0_SET_CONTENT_KEY(b)                                               (0x00000F10 + (b)*0x00000004)
+#define NVC9B0_SET_CONTENT_KEY_VALUE                                            31:0
+#define NVC9B0_PM_TRIGGER_END                                                   (0x00001114)
+#define NVC9B0_PM_TRIGGER_END_V                                                 31:0
+
+#define NVC9B0_ERROR_NONE                                                       (0x00000000)
+#define NVC9B0_OS_ERROR_EXECUTE_INSUFFICIENT_DATA                               (0x00000001)
+#define NVC9B0_OS_ERROR_SEMAPHORE_INSUFFICIENT_DATA                             (0x00000002)
+#define NVC9B0_OS_ERROR_INVALID_METHOD                                          (0x00000003)
+#define NVC9B0_OS_ERROR_INVALID_DMA_PAGE                                        (0x00000004)
+#define NVC9B0_OS_ERROR_UNHANDLED_INTERRUPT                                     (0x00000005)
+#define NVC9B0_OS_ERROR_EXCEPTION                                               (0x00000006)
+#define NVC9B0_OS_ERROR_INVALID_CTXSW_REQUEST                                   (0x00000007)
+#define NVC9B0_OS_ERROR_APPLICATION                                             (0x00000008)
+#define NVC9B0_OS_ERROR_SW_BREAKPT                                              (0x00000009)
+#define NVC9B0_OS_INTERRUPT_EXECUTE_AWAKEN                                      (0x00000100)
+#define NVC9B0_OS_INTERRUPT_BACKEND_SEMAPHORE_AWAKEN                            (0x00000200)
+#define NVC9B0_OS_INTERRUPT_CTX_ERROR_FBIF                                      (0x00000300)
+#define NVC9B0_OS_INTERRUPT_LIMIT_VIOLATION                                     (0x00000400)
+#define NVC9B0_OS_INTERRUPT_LIMIT_AND_FBIF_CTX_ERROR                            (0x00000500)
+#define NVC9B0_OS_INTERRUPT_HALT_ENGINE                                         (0x00000600)
+#define NVC9B0_OS_INTERRUPT_TRAP_NONSTALL                                       (0x00000700)
+#define NVC9B0_H264_VLD_ERR_SEQ_DATA_INCONSISTENT                               (0x00004001)
+#define NVC9B0_H264_VLD_ERR_PIC_DATA_INCONSISTENT                               (0x00004002)
+#define NVC9B0_H264_VLD_ERR_SLC_DATA_BUF_ADDR_OUT_OF_BOUNDS                     (0x00004100)
+#define NVC9B0_H264_VLD_ERR_BITSTREAM_ERROR                                     (0x00004101)
+#define NVC9B0_H264_VLD_ERR_CTX_DMA_ID_CTRL_IN_INVALID                          (0x000041F8)
+#define NVC9B0_H264_VLD_ERR_SLC_HDR_OUT_SIZE_NOT_MULT256                        (0x00004200)
+#define NVC9B0_H264_VLD_ERR_SLC_DATA_OUT_SIZE_NOT_MULT256                       (0x00004201)
+#define NVC9B0_H264_VLD_ERR_CTX_DMA_ID_FLOW_CTRL_INVALID                        (0x00004203)
+#define NVC9B0_H264_VLD_ERR_CTX_DMA_ID_SLC_HDR_OUT_INVALID                      (0x00004204)
+#define NVC9B0_H264_VLD_ERR_SLC_HDR_OUT_BUF_TOO_SMALL                           (0x00004205)
+#define NVC9B0_H264_VLD_ERR_SLC_HDR_OUT_BUF_ALREADY_VALID                       (0x00004206)
+#define NVC9B0_H264_VLD_ERR_SLC_DATA_OUT_BUF_TOO_SMALL                          (0x00004207)
+#define NVC9B0_H264_VLD_ERR_DATA_BUF_CNT_TOO_SMALL                              (0x00004208)
+#define NVC9B0_H264_VLD_ERR_BITSTREAM_EMPTY                                     (0x00004209)
+#define NVC9B0_H264_VLD_ERR_FRAME_WIDTH_TOO_LARGE                               (0x0000420A)
+#define NVC9B0_H264_VLD_ERR_FRAME_HEIGHT_TOO_LARGE                              (0x0000420B)
+#define NVC9B0_H264_VLD_ERR_HIST_BUF_TOO_SMALL                                  (0x00004300)
+#define NVC9B0_VC1_VLD_ERR_PIC_DATA_BUF_ADDR_OUT_OF_BOUND                       (0x00005100)
+#define NVC9B0_VC1_VLD_ERR_BITSTREAM_ERROR                                      (0x00005101)
+#define NVC9B0_VC1_VLD_ERR_PIC_HDR_OUT_SIZE_NOT_MULT256                         (0x00005200)
+#define NVC9B0_VC1_VLD_ERR_PIC_DATA_OUT_SIZE_NOT_MULT256                        (0x00005201)
+#define NVC9B0_VC1_VLD_ERR_CTX_DMA_ID_CTRL_IN_INVALID                           (0x00005202)
+#define NVC9B0_VC1_VLD_ERR_CTX_DMA_ID_FLOW_CTRL_INVALID                         (0x00005203)
+#define NVC9B0_VC1_VLD_ERR_CTX_DMA_ID_PIC_HDR_OUT_INVALID                       (0x00005204)
+#define NVC9B0_VC1_VLD_ERR_SLC_HDR_OUT_BUF_TOO_SMALL                            (0x00005205)
+#define NVC9B0_VC1_VLD_ERR_PIC_HDR_OUT_BUF_ALREADY_VALID                        (0x00005206)
+#define NVC9B0_VC1_VLD_ERR_PIC_DATA_OUT_BUF_TOO_SMALL                           (0x00005207)
+#define NVC9B0_VC1_VLD_ERR_DATA_INFO_IN_BUF_TOO_SMALL                           (0x00005208)
+#define NVC9B0_VC1_VLD_ERR_BITSTREAM_EMPTY                                      (0x00005209)
+#define NVC9B0_VC1_VLD_ERR_FRAME_WIDTH_TOO_LARGE                                (0x0000520A)
+#define NVC9B0_VC1_VLD_ERR_FRAME_HEIGHT_TOO_LARGE                               (0x0000520B)
+#define NVC9B0_VC1_VLD_ERR_PIC_DATA_OUT_BUF_FULL_TIME_OUT                       (0x00005300)
+#define NVC9B0_MPEG12_VLD_ERR_SLC_DATA_BUF_ADDR_OUT_OF_BOUNDS                   (0x00006100)
+#define NVC9B0_MPEG12_VLD_ERR_BITSTREAM_ERROR                                   (0x00006101)
+#define NVC9B0_MPEG12_VLD_ERR_SLC_DATA_OUT_SIZE_NOT_MULT256                     (0x00006200)
+#define NVC9B0_MPEG12_VLD_ERR_CTX_DMA_ID_CTRL_IN_INVALID                        (0x00006201)
+#define NVC9B0_MPEG12_VLD_ERR_CTX_DMA_ID_FLOW_CTRL_INVALID                      (0x00006202)
+#define NVC9B0_MPEG12_VLD_ERR_SLC_DATA_OUT_BUF_TOO_SMALL                        (0x00006203)
+#define NVC9B0_MPEG12_VLD_ERR_DATA_INFO_IN_BUF_TOO_SMALL                        (0x00006204)
+#define NVC9B0_MPEG12_VLD_ERR_BITSTREAM_EMPTY                                   (0x00006205)
+#define NVC9B0_MPEG12_VLD_ERR_INVALID_PIC_STRUCTURE                             (0x00006206)
+#define NVC9B0_MPEG12_VLD_ERR_INVALID_PIC_CODING_TYPE                           (0x00006207)
+#define NVC9B0_MPEG12_VLD_ERR_FRAME_WIDTH_TOO_LARGE                             (0x00006208)
+#define NVC9B0_MPEG12_VLD_ERR_FRAME_HEIGHT_TOO_LARGE                            (0x00006209)
+#define NVC9B0_MPEG12_VLD_ERR_SLC_DATA_OUT_BUF_FULL_TIME_OUT                    (0x00006300)
+#define NVC9B0_CMN_VLD_ERR_PDEC_RETURNED_ERROR                                  (0x00007101)
+#define NVC9B0_CMN_VLD_ERR_EDOB_FLUSH_TIME_OUT                                  (0x00007102)
+#define NVC9B0_CMN_VLD_ERR_EDOB_REWIND_TIME_OUT                                 (0x00007103)
+#define NVC9B0_CMN_VLD_ERR_VLD_WD_TIME_OUT                                      (0x00007104)
+#define NVC9B0_CMN_VLD_ERR_NUM_SLICES_ZERO                                      (0x00007105)
+#define NVC9B0_MPEG4_VLD_ERR_PIC_DATA_BUF_ADDR_OUT_OF_BOUND                     (0x00008100)
+#define NVC9B0_MPEG4_VLD_ERR_BITSTREAM_ERROR                                    (0x00008101)
+#define NVC9B0_MPEG4_VLD_ERR_PIC_HDR_OUT_SIZE_NOT_MULT256                       (0x00008200)
+#define NVC9B0_MPEG4_VLD_ERR_PIC_DATA_OUT_SIZE_NOT_MULT256                      (0x00008201)
+#define NVC9B0_MPEG4_VLD_ERR_CTX_DMA_ID_CTRL_IN_INVALID                         (0x00008202)
+#define NVC9B0_MPEG4_VLD_ERR_CTX_DMA_ID_FLOW_CTRL_INVALID                       (0x00008203)
+#define NVC9B0_MPEG4_VLD_ERR_CTX_DMA_ID_PIC_HDR_OUT_INVALID                     (0x00008204)
+#define NVC9B0_MPEG4_VLD_ERR_SLC_HDR_OUT_BUF_TOO_SMALL                          (0x00008205)
+#define NVC9B0_MPEG4_VLD_ERR_PIC_HDR_OUT_BUF_ALREADY_VALID                      (0x00008206)
+#define NVC9B0_MPEG4_VLD_ERR_PIC_DATA_OUT_BUF_TOO_SMALL                         (0x00008207)
+#define NVC9B0_MPEG4_VLD_ERR_DATA_INFO_IN_BUF_TOO_SMALL                         (0x00008208)
+#define NVC9B0_MPEG4_VLD_ERR_BITSTREAM_EMPTY                                    (0x00008209)
+#define NVC9B0_MPEG4_VLD_ERR_FRAME_WIDTH_TOO_LARGE                              (0x0000820A)
+#define NVC9B0_MPEG4_VLD_ERR_FRAME_HEIGHT_TOO_LARGE                             (0x0000820B)
+#define NVC9B0_MPEG4_VLD_ERR_PIC_DATA_OUT_BUF_FULL_TIME_OUT                     (0x00051E01)
+#define NVC9B0_DEC_ERROR_MPEG12_APPTIMER_EXPIRED                                (0xDEC10001)
+#define NVC9B0_DEC_ERROR_MPEG12_MVTIMER_EXPIRED                                 (0xDEC10002)
+#define NVC9B0_DEC_ERROR_MPEG12_INVALID_TOKEN                                   (0xDEC10003)
+#define NVC9B0_DEC_ERROR_MPEG12_SLICEDATA_MISSING                               (0xDEC10004)
+#define NVC9B0_DEC_ERROR_MPEG12_HWERR_INTERRUPT                                 (0xDEC10005)
+#define NVC9B0_DEC_ERROR_MPEG12_DETECTED_VLD_FAILURE                            (0xDEC10006)
+#define NVC9B0_DEC_ERROR_MPEG12_PICTURE_INIT                                    (0xDEC10100)
+#define NVC9B0_DEC_ERROR_MPEG12_STATEMACHINE_FAILURE                            (0xDEC10101)
+#define NVC9B0_DEC_ERROR_MPEG12_INVALID_CTXID_PIC                               (0xDEC10901)
+#define NVC9B0_DEC_ERROR_MPEG12_INVALID_CTXID_UCODE                             (0xDEC10902)
+#define NVC9B0_DEC_ERROR_MPEG12_INVALID_CTXID_FC                                (0xDEC10903)
+#define NVC9B0_DEC_ERROR_MPEG12_INVALID_CTXID_SLH                               (0xDEC10904)
+#define NVC9B0_DEC_ERROR_MPEG12_INVALID_UCODE_SIZE                              (0xDEC10905)
+#define NVC9B0_DEC_ERROR_MPEG12_INVALID_SLICE_COUNT                             (0xDEC10906)
+#define NVC9B0_DEC_ERROR_VC1_APPTIMER_EXPIRED                                   (0xDEC20001)
+#define NVC9B0_DEC_ERROR_VC1_MVTIMER_EXPIRED                                    (0xDEC20002)
+#define NVC9B0_DEC_ERROR_VC1_INVALID_TOKEN                                      (0xDEC20003)
+#define NVC9B0_DEC_ERROR_VC1_SLICEDATA_MISSING                                  (0xDEC20004)
+#define NVC9B0_DEC_ERROR_VC1_HWERR_INTERRUPT                                    (0xDEC20005)
+#define NVC9B0_DEC_ERROR_VC1_DETECTED_VLD_FAILURE                               (0xDEC20006)
+#define NVC9B0_DEC_ERROR_VC1_TIMEOUT_POLLING_FOR_DATA                           (0xDEC20007)
+#define NVC9B0_DEC_ERROR_VC1_PDEC_PIC_END_UNALIGNED                             (0xDEC20008)
+#define NVC9B0_DEC_ERROR_VC1_WDTIMER_EXPIRED                                    (0xDEC20009)
+#define NVC9B0_DEC_ERROR_VC1_ERRINTSTART                                        (0xDEC20010)
+#define NVC9B0_DEC_ERROR_VC1_IQT_ERRINT                                         (0xDEC20011)
+#define NVC9B0_DEC_ERROR_VC1_MC_ERRINT                                          (0xDEC20012)
+#define NVC9B0_DEC_ERROR_VC1_MC_IQT_ERRINT                                      (0xDEC20013)
+#define NVC9B0_DEC_ERROR_VC1_REC_ERRINT                                         (0xDEC20014)
+#define NVC9B0_DEC_ERROR_VC1_REC_IQT_ERRINT                                     (0xDEC20015)
+#define NVC9B0_DEC_ERROR_VC1_REC_MC_ERRINT                                      (0xDEC20016)
+#define NVC9B0_DEC_ERROR_VC1_REC_MC_IQT_ERRINT                                  (0xDEC20017)
+#define NVC9B0_DEC_ERROR_VC1_DBF_ERRINT                                         (0xDEC20018)
+#define NVC9B0_DEC_ERROR_VC1_DBF_IQT_ERRINT                                     (0xDEC20019)
+#define NVC9B0_DEC_ERROR_VC1_DBF_MC_ERRINT                                      (0xDEC2001A)
+#define NVC9B0_DEC_ERROR_VC1_DBF_MC_IQT_ERRINT                                  (0xDEC2001B)
+#define NVC9B0_DEC_ERROR_VC1_DBF_REC_ERRINT                                     (0xDEC2001C)
+#define NVC9B0_DEC_ERROR_VC1_DBF_REC_IQT_ERRINT                                 (0xDEC2001D)
+#define NVC9B0_DEC_ERROR_VC1_DBF_REC_MC_ERRINT                                  (0xDEC2001E)
+#define NVC9B0_DEC_ERROR_VC1_DBF_REC_MC_IQT_ERRINT                              (0xDEC2001F)
+#define NVC9B0_DEC_ERROR_VC1_PICTURE_INIT                                       (0xDEC20100)
+#define NVC9B0_DEC_ERROR_VC1_STATEMACHINE_FAILURE                               (0xDEC20101)
+#define NVC9B0_DEC_ERROR_VC1_INVALID_CTXID_PIC                                  (0xDEC20901)
+#define NVC9B0_DEC_ERROR_VC1_INVALID_CTXID_UCODE                                (0xDEC20902)
+#define NVC9B0_DEC_ERROR_VC1_INVALID_CTXID_FC                                   (0xDEC20903)
+#define NVC9B0_DEC_ERROR_VC1_INVAILD_CTXID_SLH                                  (0xDEC20904)
+#define NVC9B0_DEC_ERROR_VC1_INVALID_UCODE_SIZE                                 (0xDEC20905)
+#define NVC9B0_DEC_ERROR_VC1_INVALID_SLICE_COUNT                                (0xDEC20906)
+#define NVC9B0_DEC_ERROR_H264_APPTIMER_EXPIRED                                  (0xDEC30001)
+#define NVC9B0_DEC_ERROR_H264_MVTIMER_EXPIRED                                   (0xDEC30002)
+#define NVC9B0_DEC_ERROR_H264_INVALID_TOKEN                                     (0xDEC30003)
+#define NVC9B0_DEC_ERROR_H264_SLICEDATA_MISSING                                 (0xDEC30004)
+#define NVC9B0_DEC_ERROR_H264_HWERR_INTERRUPT                                   (0xDEC30005)
+#define NVC9B0_DEC_ERROR_H264_DETECTED_VLD_FAILURE                              (0xDEC30006)
+#define NVC9B0_DEC_ERROR_H264_ERRINTSTART                                       (0xDEC30010)
+#define NVC9B0_DEC_ERROR_H264_IQT_ERRINT                                        (0xDEC30011)
+#define NVC9B0_DEC_ERROR_H264_MC_ERRINT                                         (0xDEC30012)
+#define NVC9B0_DEC_ERROR_H264_MC_IQT_ERRINT                                     (0xDEC30013)
+#define NVC9B0_DEC_ERROR_H264_REC_ERRINT                                        (0xDEC30014)
+#define NVC9B0_DEC_ERROR_H264_REC_IQT_ERRINT                                    (0xDEC30015)
+#define NVC9B0_DEC_ERROR_H264_REC_MC_ERRINT                                     (0xDEC30016)
+#define NVC9B0_DEC_ERROR_H264_REC_MC_IQT_ERRINT                                 (0xDEC30017)
+#define NVC9B0_DEC_ERROR_H264_DBF_ERRINT                                        (0xDEC30018)
+#define NVC9B0_DEC_ERROR_H264_DBF_IQT_ERRINT                                    (0xDEC30019)
+#define NVC9B0_DEC_ERROR_H264_DBF_MC_ERRINT                                     (0xDEC3001A)
+#define NVC9B0_DEC_ERROR_H264_DBF_MC_IQT_ERRINT                                 (0xDEC3001B)
+#define NVC9B0_DEC_ERROR_H264_DBF_REC_ERRINT                                    (0xDEC3001C)
+#define NVC9B0_DEC_ERROR_H264_DBF_REC_IQT_ERRINT                                (0xDEC3001D)
+#define NVC9B0_DEC_ERROR_H264_DBF_REC_MC_ERRINT                                 (0xDEC3001E)
+#define NVC9B0_DEC_ERROR_H264_DBF_REC_MC_IQT_ERRINT                             (0xDEC3001F)
+#define NVC9B0_DEC_ERROR_H264_PICTURE_INIT                                      (0xDEC30100)
+#define NVC9B0_DEC_ERROR_H264_STATEMACHINE_FAILURE                              (0xDEC30101)
+#define NVC9B0_DEC_ERROR_H264_INVALID_CTXID_PIC                                 (0xDEC30901)
+#define NVC9B0_DEC_ERROR_H264_INVALID_CTXID_UCODE                               (0xDEC30902)
+#define NVC9B0_DEC_ERROR_H264_INVALID_CTXID_FC                                  (0xDEC30903)
+#define NVC9B0_DEC_ERROR_H264_INVALID_CTXID_SLH                                 (0xDEC30904)
+#define NVC9B0_DEC_ERROR_H264_INVALID_UCODE_SIZE                                (0xDEC30905)
+#define NVC9B0_DEC_ERROR_H264_INVALID_SLICE_COUNT                               (0xDEC30906)
+#define NVC9B0_DEC_ERROR_MPEG4_APPTIMER_EXPIRED                                 (0xDEC40001)
+#define NVC9B0_DEC_ERROR_MPEG4_MVTIMER_EXPIRED                                  (0xDEC40002)
+#define NVC9B0_DEC_ERROR_MPEG4_INVALID_TOKEN                                    (0xDEC40003)
+#define NVC9B0_DEC_ERROR_MPEG4_SLICEDATA_MISSING                                (0xDEC40004)
+#define NVC9B0_DEC_ERROR_MPEG4_HWERR_INTERRUPT                                  (0xDEC40005)
+#define NVC9B0_DEC_ERROR_MPEG4_DETECTED_VLD_FAILURE                             (0xDEC40006)
+#define NVC9B0_DEC_ERROR_MPEG4_TIMEOUT_POLLING_FOR_DATA                         (0xDEC40007)
+#define NVC9B0_DEC_ERROR_MPEG4_PDEC_PIC_END_UNALIGNED                           (0xDEC40008)
+#define NVC9B0_DEC_ERROR_MPEG4_WDTIMER_EXPIRED                                  (0xDEC40009)
+#define NVC9B0_DEC_ERROR_MPEG4_ERRINTSTART                                      (0xDEC40010)
+#define NVC9B0_DEC_ERROR_MPEG4_IQT_ERRINT                                       (0xDEC40011)
+#define NVC9B0_DEC_ERROR_MPEG4_MC_ERRINT                                        (0xDEC40012)
+#define NVC9B0_DEC_ERROR_MPEG4_MC_IQT_ERRINT                                    (0xDEC40013)
+#define NVC9B0_DEC_ERROR_MPEG4_REC_ERRINT                                       (0xDEC40014)
+#define NVC9B0_DEC_ERROR_MPEG4_REC_IQT_ERRINT                                   (0xDEC40015)
+#define NVC9B0_DEC_ERROR_MPEG4_REC_MC_ERRINT                                    (0xDEC40016)
+#define NVC9B0_DEC_ERROR_MPEG4_REC_MC_IQT_ERRINT                                (0xDEC40017)
+#define NVC9B0_DEC_ERROR_MPEG4_DBF_ERRINT                                       (0xDEC40018)
+#define NVC9B0_DEC_ERROR_MPEG4_DBF_IQT_ERRINT                                   (0xDEC40019)
+#define NVC9B0_DEC_ERROR_MPEG4_DBF_MC_ERRINT                                    (0xDEC4001A)
+#define NVC9B0_DEC_ERROR_MPEG4_DBF_MC_IQT_ERRINT                                (0xDEC4001B)
+#define NVC9B0_DEC_ERROR_MPEG4_DBF_REC_ERRINT                                   (0xDEC4001C)
+#define NVC9B0_DEC_ERROR_MPEG4_DBF_REC_IQT_ERRINT                               (0xDEC4001D)
+#define NVC9B0_DEC_ERROR_MPEG4_DBF_REC_MC_ERRINT                                (0xDEC4001E)
+#define NVC9B0_DEC_ERROR_MPEG4_DBF_REC_MC_IQT_ERRINT                            (0xDEC4001F)
+#define NVC9B0_DEC_ERROR_MPEG4_PICTURE_INIT                                     (0xDEC40100)
+#define NVC9B0_DEC_ERROR_MPEG4_STATEMACHINE_FAILURE                             (0xDEC40101)
+#define NVC9B0_DEC_ERROR_MPEG4_INVALID_CTXID_PIC                                (0xDEC40901)
+#define NVC9B0_DEC_ERROR_MPEG4_INVALID_CTXID_UCODE                              (0xDEC40902)
+#define NVC9B0_DEC_ERROR_MPEG4_INVALID_CTXID_FC                                 (0xDEC40903)
+#define NVC9B0_DEC_ERROR_MPEG4_INVALID_CTXID_SLH                                (0xDEC40904)
+#define NVC9B0_DEC_ERROR_MPEG4_INVALID_UCODE_SIZE                               (0xDEC40905)
+#define NVC9B0_DEC_ERROR_MPEG4_INVALID_SLICE_COUNT                              (0xDEC40906)
+
+#ifdef __cplusplus
+};     /* extern "C" */
+#endif
+#endif // clc9b0_h
--- a/extra/nv_gpu_driver/nv_ioctl.py
+++ b/extra/nv_gpu_driver/nv_ioctl.py
@ -64,14 +64,17 @@ nvcmds = {getattr(nv_gpu, x):(x, getattr(nv_gpu, "struct_"+x+"_PARAMS", getattr(
          x.startswith("NV") and x[6:].startswith("_CTRL_") and isinstance(getattr(nv_gpu, x), int)}

 def get_classes():
-  hdrpy = (pathlib.Path(__file__).parent.parent.parent / "tinygrad/runtime/autogen/nv_570.py").read_text()
-  clss = re.search(r'NV01_ROOT.*?NV_SEMAPHORE_SURFACE = \(0x000000da\) # macro', hdrpy, re.DOTALL).group()
-  pattern = r'([0-9a-zA-Z_]*) = +\((0x[0-9a-fA-F]+)\)'
-  matches = re.findall(pattern, clss, re.MULTILINE)
-  return {int(num, base=16):name for name, num in matches}
+  # Build {value: name} from nv_gpu: every int constant in (0x3000, 0xffff), plus the names listed below regardless of value.
+  always_included = {"NV01_DEVICE_0", "NV01_ROOT", "NV1_MEMORY_SYSTEM", "NV01_MEMORY_VIRTUAL", "NV1_MEMORY_USER", "NV50_MEMORY_VIRTUAL",
+                     "NV_FERMI_VASPACE_A", "NV20_SUBDEVICE_0"}
+  id_to_name = {}
+  for name, value in nv_gpu.__dict__.items():
+    # when several names share a value, the one seen last wins
+    if isinstance(value, int) and (0x3000 < value < 0xffff or name in always_included): id_to_name[value] = name
+  return id_to_name
 nvclasses = get_classes()
 nvuvms = {getattr(nv_gpu, x):x for x in dir(nv_gpu) if x.startswith("UVM_") and nv_gpu.__dict__.get(x+"_PARAMS")}
-nvqcmds = {int(getattr(nv_gpu, x)):x for x in dir(nv_gpu) if x[:7] in {"NVC6C0_", "NVC56F_", "NVC6B5_"} and isinstance(getattr(nv_gpu, x), int)}
+nvqcmds = {int(getattr(nv_gpu, x)):x for x in dir(nv_gpu) if x[:7] in {"NVC9B0_", "NVC6C0_", "NVC56F_", "NVC6B5_"} and isinstance(getattr(nv_gpu, x), int)}

 global_ioctl_id = 0
 gpus_user_modes = []
--- a/extra/nv_gpu_driver/nvdec_drv.h
+++ b/extra/nv_gpu_driver/nvdec_drv.h
--- a/extra/qcom_gpu_driver/qcom_opencl_interop.py
+++ b/extra/qcom_gpu_driver/qcom_opencl_interop.py
@ -29,8 +29,9 @@ rawbuf_ptr = to_mv(cl_buf_desc_ptr, 0x100).cast('Q')[20] # offset 0xA0 is a raw

 # create QCOM tensor with the externally managed buffer
 x = Tensor.from_blob(rawbuf_ptr, (8, 8), dtype=dtypes.int, device='QCOM')
-y = (x + 1).numpy()
-print(y)
+y = (x + 1).reshape(-1).tolist()
+print(y[:10])
+assert y == [i + 1 for i in range(64)]

 # all calculations are done, save to free the object
 cl.clReleaseMemObject(cl_buf)
@ -49,7 +50,7 @@ for i in range(4):
  cl_buf_desc_ptr = to_mv(ctypes.addressof(cl_buf), 8).cast('Q')[0]
  rawbuf_ptr = to_mv(cl_buf_desc_ptr, 0x100).cast('Q')[20]

-  y = calc(x = Tensor.from_blob(rawbuf_ptr, (2, 2), dtype=dtypes.int, device='QCOM')).numpy()
+  y = calc(x = Tensor.from_blob(rawbuf_ptr, (2, 2), dtype=dtypes.int, device='QCOM')).tolist()
  print(f'jit {i}\n', y)

  # all calculations are done, save to free the object
@ -80,8 +81,19 @@ rawbuf_ptr = to_mv(cl_buf_desc_ptr, 0x100).cast('Q')[20] # offset 0xA0 is a raw
 # dtypes.imageh = cl.cl_image_format(cl.CL_RGBA, cl.CL_HALF_FLOAT)
 # dtypes.imagef = cl.cl_image_format(cl.CL_RGBA, cl.CL_FLOAT)
 x = Tensor.from_blob(rawbuf_ptr, (h*w*4,), dtype=dtypes.imagef((h,w)), device='QCOM')
-y = (x + 1).numpy()
-print(y)
+y = (x + 1).tolist()
+print(y[:10])

 # all calculations are done, save to free the object
 cl.clReleaseMemObject(cl_img)
+
+# wrap memory owned by a numpy array with from_blob
+import numpy as np
+
+YUV_SIZE = 50
+a_np = (32*np.random.randn(YUV_SIZE).astype(np.float32) + 128).clip(0,255).astype(np.uint8)
+a = Tensor.from_blob(a_np.ctypes.data, (YUV_SIZE,), dtype=dtypes.uint8, device='QCOM').realize()
+
+print(a.numpy()[:10], a_np[:10])
+assert np.all(a.numpy() == a_np)
+assert np.all((a - 1).numpy() == a_np - 1)
--- a/extra/reduce_speed.py
+++ b/extra/reduce_speed.py
@ -1,128 +0,0 @@
-import numpy as np
-import ctypes
-from tinygrad import Tensor, GlobalCounters, Context
-from tinygrad.engine.realize import lower_schedule, CompiledRunner
-from tinygrad.device import CPUProgram
-from dataclasses import replace
-from keystone import Ks, KS_ARCH_ARM64, KS_MODE_LITTLE_ENDIAN
-
-# only the memory access, over 100 GB/s! (sometimes)
-reduce_asm = """
-movi  v0.2d, #0000000000000000
-mov   w9, #0x30
-mov   w10, #0x20
-mov   x8, #-0x10
-movi  v1.2d, #0000000000000000
-movk  w9, #0x300, lsl #16
-movi  v2.2d, #0000000000000000
-movk  w10, #0x200, lsl #16
-movi  v3.2d, #0000000000000000
-mov   w11, #0x1000000
-mov   w12, #0x3ffff0
-loop:
-ldp   q4, q5, [x1]
-add   x13, x1, x11
-add   x15, x1, x10
-add   x14, x1, x9
-add   x8, x8, #0x10
-cmp   x8, x12
-ldp   q6, q7, [x1, #0x20]
-add   x1, x1, #0x40
-ldp   q4, q5, [x13]
-ldp   q6, q7, [x13, #0x20]
-ldp   q4, q5, [x15, #-0x20]
-ldp   q6, q7, [x15]
-ldp   q4, q5, [x14, #-0x30]
-ldp   q6, q7, [x14, #-0x10]
-b.lo  loop
-fadd  v0.4s, v1.4s, v0.4s
-fadd  v0.4s, v2.4s, v0.4s
-fadd  v0.4s, v3.4s, v0.4s
-dup   v1.4s, v0.s[1]
-dup   v2.4s, v0.s[2]
-fadd  v1.4s, v0.4s, v1.4s
-dup   v0.4s, v0.s[3]
-fadd  v1.4s, v2.4s, v1.4s
-fadd  v0.4s, v0.4s, v1.4s
-str   s0, [x0]
-ret
-"""
-
-ks = Ks(KS_ARCH_ARM64, KS_MODE_LITTLE_ENDIAN)
-arm_bytecode, _ = ks.asm(reduce_asm)
-arm_bytecode = bytes(arm_bytecode)
-
-reduce_src = """
-// data1 is 16M inputs
-typedef float float4 __attribute__((aligned(32),vector_size(16)));
-void reduce(float* restrict data0, float* restrict data1) {
-  float4 acc0 = {0.0f, 0.0f, 0.0f, 0.0f};
-  float4 acc1 = {0.0f, 0.0f, 0.0f, 0.0f};
-  float4 acc2 = {0.0f, 0.0f, 0.0f, 0.0f};
-  float4 acc3 = {0.0f, 0.0f, 0.0f, 0.0f};
-  float4 acc4 = {0.0f, 0.0f, 0.0f, 0.0f};
-  float4 acc5 = {0.0f, 0.0f, 0.0f, 0.0f};
-  float4 acc6 = {0.0f, 0.0f, 0.0f, 0.0f};
-  float4 acc7 = {0.0f, 0.0f, 0.0f, 0.0f};
-  float* data1_1 = data1+4194304;
-  float* data1_2 = data1+(4194304*2);
-  float* data1_3 = data1+(4194304*3);
-  for (int ridx0 = 0; ridx0 < 16777216/4; ridx0+=16) {
-    float4 val0 = *(float4*)((data1+(ridx0+0)));
-    float4 val1 = *(float4*)((data1+(ridx0+4)));
-    float4 val2 = *(float4*)((data1+(ridx0+8)));
-    float4 val3 = *(float4*)((data1+(ridx0+12)));
-    acc0 += val0;
-    acc1 += val1;
-    acc2 += val2;
-    acc3 += val3;
-    val0 = *(float4*)((data1_1+(ridx0+0)));
-    val1 = *(float4*)((data1_1+(ridx0+4)));
-    val2 = *(float4*)((data1_1+(ridx0+8)));
-    val3 = *(float4*)((data1_1+(ridx0+12)));
-    acc4 += val0;
-    acc5 += val1;
-    acc6 += val2;
-    acc7 += val3;
-    val0 = *(float4*)((data1_2+(ridx0+0)));
-    val1 = *(float4*)((data1_2+(ridx0+4)));
-    val2 = *(float4*)((data1_2+(ridx0+8)));
-    val3 = *(float4*)((data1_2+(ridx0+12)));
-    acc0 += val0;
-    acc1 += val1;
-    acc2 += val2;
-    acc3 += val3;
-    val0 = *(float4*)((data1_3+(ridx0+0)));
-    val1 = *(float4*)((data1_3+(ridx0+4)));
-    val2 = *(float4*)((data1_3+(ridx0+8)));
-    val3 = *(float4*)((data1_3+(ridx0+12)));
-    acc4 += val0;
-    acc5 += val1;
-    acc6 += val2;
-    acc7 += val3;
-  }
-  float4 out = acc0+acc1+acc2+acc3+acc4+acc5+acc6+acc7;
-  *(data0+0) = out[0]+out[1]+out[2]+out[3];
-}
-"""
-
-if __name__ == "__main__":
-  a = Tensor(np_array:=(np.random.default_rng().random((4096, 4096), dtype=np.float32)-0.5)).realize()
-  with Context(SPLIT_REDUCEOP=0):
-    # TODO: make it easy to alter the OptOps for a ScheduleItem
-    GlobalCounters.reset()
-    out = a.sum()
-    sis = out.schedule()
-    for i,(_,ei) in enumerate(lower_schedule(sis)):
-      if i == 0:
-        # change the source code
-        prg_spec = ei.prg.p
-        prg_spec = replace(prg_spec, name="reduce", src=reduce_src)
-        prg = CompiledRunner(prg_spec)
-        # change the assembly
-        #prg._prg = CPUProgram(prg_spec.name, arm_bytecode)
-        print("buffer at:",hex(ctypes.addressof(ei.bufs[1]._buf)))
-        ei = replace(ei, prg=prg)
-      ei.run()
-    print(out.item())
-    np.testing.assert_allclose(out.item(), np_array.sum(), atol=1, rtol=1e-4)
--- a/extra/remu/src/helpers.rs
+++ b/extra/remu/src/helpers.rs
@ -1,5 +1,5 @@
 use half::f16;
-use num_traits::{float::FloatCore, PrimInt, Unsigned};
+use num_traits::{float::FloatCore, PrimInt, Unsigned, clamp};

 pub fn bits<T>(word: T, hi: usize, lo: usize) -> T where T: PrimInt + Unsigned {
    assert!(hi >= lo);
@ -48,6 +48,7 @@ impl IEEEClass<u64> for f64 {
 pub trait VOPModifier<T> {
    fn negate(&self, pos: usize, modifier: usize) -> T;
    fn absolute(&self, pos: usize, modifier: usize) -> T;
+    fn clmp(&self, cm: bool) -> T;
 }
 impl<T> VOPModifier<T> for T
 where
@ -65,6 +66,11 @@ where
            _ => *self,
        }
    }
+    fn clmp(&self, cm: bool) -> T {
+        if !cm { return *self; } // clamp flag clear: the value is returned untouched
+        let limited = clamp(*self, T::zero(), T::one());
+        if limited != T::zero() { limited } else { T::zero() } // a result equal to zero is returned as T::zero()
+    }
 }

 pub fn extract_mantissa(x: f64) -> f64 {
--- a/extra/remu/src/thread.rs
+++ b/extra/remu/src/thread.rs
@ -1024,7 +1024,7 @@ impl<'a> Thread<'a> {
                    let vdst = (instr & 0xff) as usize;
                    let abs = ((instr >> 8) & 0x7) as usize;
                    let opsel = ((instr >> 11) & 0xf) as usize;
-                    let cm = (instr >> 15) & 0x1;
+                    let cm = ((instr >> 15) & 0x1) != 0;

                    let s = |n: usize| ((instr >> n) & 0x1ff) as usize;
                    let src = (s(32), s(41), s(50));
@ -1032,7 +1032,9 @@ impl<'a> Thread<'a> {
                    let omod = (instr >> 59) & 0x3;
                    let neg = ((instr >> 61) & 0x7) as usize;
                    assert_eq!(omod, 0);
-                    assert_eq!(cm, 0);
+                    if op != 272 && cm {
+                        return todo_instr!(op); // TODO: add VOP3 clamp for all ops
+                    }
                    assert_eq!(opsel, 0);

                    match op {
@ -1266,7 +1268,7 @@ impl<'a> Thread<'a> {
                            }

                            let ret = match op {
-                                257 | 259 | 299 | 260 | 261 | 264 | 272 | 392 | 426 | 430 | 531 | 537 | 540 | 551 | 567 | 796 => {
+                                257 | 259 | 299 | 260 | 261 | 264 | 272 | 392 | 426 | 430 | 531 | 537 | 540 | 543 | 551 | 567 | 606 | 796 => {
                                    let s0 = f32::from_bits(s0).negate(0, neg).absolute(0, abs);
                                    let s1 = f32::from_bits(s1).negate(1, neg).absolute(1, abs);
                                    let s2 = f32::from_bits(s2).negate(2, neg).absolute(2, abs);
@ -1275,12 +1277,26 @@ impl<'a> Thread<'a> {
                                        260 => s0 - s1,
                                        261 => s1 - s0,
                                        264 => s0 * s1,
-                                        272 => f32::max(s0, s1),
+                                        272 => f32::max(s0, s1).clmp(cm),
                                        299 => f32::mul_add(s0, s1, f32::from_bits(self.vec_reg[vdst])),
                                        426 => s0.recip(),
                                        430 => 1.0 / f32::sqrt(s0),
                                        531 => f32::mul_add(s0, s1, s2),
                                        537 => f32::min(f32::min(s0, s1), s2),
+                                        543 => {
+                                            if s0.is_nan() || s1.is_nan() || s2.is_nan() {
+                                                f32::min(f32::min(s0, s1), s2)
+                                            } else {
+                                                let max = f32::max(f32::max(s0, s1), s2);
+                                                if max == s0 {
+                                                    f32::max(s1, s2)
+                                                } else if max == s1 {
+                                                    f32::max(s0, s2)
+                                                } else {
+                                                    f32::max(s0, s1)
+                                                }
+                                            }
+                                        },
                                        540 => f32::max(f32::max(s0, s1), s2),
                                        551 => s2 / s1,
                                        567 => {
@ -1290,6 +1306,7 @@ impl<'a> Thread<'a> {
                                                false => ret,
                                            }
                                        }
+                                        606 => f32::min(f32::max(s0, s1), s2),
                                        796 => s0 * 2f32.powi(s1.to_bits() as i32),
                                        // cnd_mask isn't a float only ALU but supports neg
                                        257 => {
--- a/extra/replay_pkl.py
+++ b/extra/replay_pkl.py
@ -1,75 +0,0 @@
-import pickle, sys
-from dataclasses import replace
-from tinygrad import Device, Context, Tensor, GlobalCounters
-from tinygrad.device import Buffer
-from tinygrad.helpers import getenv, BEAM
-from tinygrad.engine.jit import TinyJit
-from tinygrad.engine.realize import CompiledRunner, ExecItem, ScheduleItem, lower_schedule_item, get_program
-from tinygrad.renderer import ProgramSpec
-from tinygrad.codegen.opt.kernel import Kernel, Opt, OptOps
-from tinygrad.codegen.opt.heuristic import hand_coded_optimizations
-import numpy as np
-
-def move_jit_captured_to_dev(captured, device="DSP"):
-  captured.expected_st_vars_dtype_device = [x[:3] + (device,) for x in captured.expected_st_vars_dtype_device]
-
-  assign = {}
-  def move_buffer(b):
-    if b in assign: return assign[b]
-
-    if b._base is not None:
-      newbuf = Buffer(device, b.size, b.dtype, base=move_buffer(b._base), offset=b.offset)
-    else:
-      newbuf = Buffer(device, b.size, b.dtype)
-      if b.is_allocated(): newbuf.ensure_allocated().copyin(b.as_buffer())
-    assign[b] = newbuf
-    return assign[b]
-
-  for item in captured.jit_cache:
-    for b in item.bufs:
-      if b is not None: move_buffer(b)
-  captured.jit_cache = [ExecItem(item.prg, [assign.get(b,b) for b in item.bufs]) for item in captured.jit_cache]
-  return captured
-
-if __name__ == "__main__":
-  with Context(DEBUG=0):
-    with open(sys.argv[1], "rb") as f:
-      fxn: TinyJit = pickle.load(f)
-      print(f"{f.tell()/1e6:.2f}M loaded")
-    print(type(fxn))
-
-  # Move all buffers to DSP device.
-  fxn.captured = move_jit_captured_to_dev(fxn.captured, "DSP")
-  new_jit = []
-
-  knum = 1
-  for ei in fxn.captured.jit_cache:
-    # skip the copy and the first kernel
-    if isinstance(ei.prg, CompiledRunner) and all(x is not None for x in ei.bufs):
-      if knum == (pknum:=getenv("KNUM", 0)) or pknum == 0:
-        p: ProgramSpec = ei.prg.p
-        k = Kernel(p.ast, Device["DSP"].renderer)
-
-        if getenv("VALIDATE"):
-          with Context(NOOPT=1):
-            lower_schedule_item(ScheduleItem(p.ast, ei.bufs)).run()
-            correct = ei.bufs[0].numpy()
-            ei.bufs[0].copyin(memoryview(bytearray(b'\x00'*ei.bufs[0].nbytes)))
-            GlobalCounters.kernel_count -= 1
-
-        if not getenv("NOOPT"): k.apply_opts(hand_coded_optimizations(k))
-        p2 = get_program(k.ast, k.opts, k.applied_opts)
-        new_ei = replace(ei, prg=CompiledRunner(p2))
-        new_ei.run()
-        new_jit.append(new_ei)
-        test = ei.bufs[0].numpy()
-
-        if getenv("VALIDATE"):
-          import numpy as np
-          np.testing.assert_allclose(correct, test, rtol=1e-3, atol=1e-3)
-      knum += 1
-
-  if getenv("RUN_JIT", 0):
-    fxn.captured.free_intermediates()
-    fxn.captured.jit_cache = new_jit
-    fxn(input=Tensor(np.zeros((1, 3, 224, 224), dtype=np.float32), device="DSP"))
--- a/extra/resnet18/resnet_mlx.py
+++ b/extra/resnet18/resnet_mlx.py
@ -1,114 +0,0 @@
-# code from https://x.com/awnihannun/status/1832511021602500796
-from huggingface_hub import snapshot_download
-import mlx.core as mx
-import mlx.nn as nn
-import time
-
-
-class Block(nn.Module):
-  def __init__(self, in_dims, dims, stride=1):
-    super().__init__()
-
-    self.conv1 = nn.Conv2d(
-      in_dims, dims, kernel_size=3, stride=stride, padding=1, bias=False
-    )
-    self.bn1 = nn.BatchNorm(dims)
-
-    self.conv2 = nn.Conv2d(
-      dims, dims, kernel_size=3, stride=1, padding=1, bias=False
-    )
-    self.bn2 = nn.BatchNorm(dims)
-
-    self.downsample = []
-    if stride != 1:
-      self.downsample = [
-        nn.Conv2d(in_dims, dims, kernel_size=1, stride=stride, bias=False),
-        nn.BatchNorm(dims)
-      ]
-
-  def __call__(self, x):
-    out = nn.relu(self.bn1(self.conv1(x)))
-    out = self.bn2(self.conv2(out))
-    for l in self.downsample:
-      x = l(x)
-    out += x
-    out = nn.relu(out)
-    return out
-
-
-class ResNet(nn.Module):
-  def __init__(self, block, num_blocks, num_classes=10):
-    super().__init__()
-    self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, bias=False)
-    self.bn1 = nn.BatchNorm(64)
-
-    self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
-    self.layer1 = self._make_layer(block, 64, 64, num_blocks[0], stride=1)
-    self.layer2 = self._make_layer(block, 64, 128, num_blocks[1], stride=2)
-    self.layer3 = self._make_layer(block, 128, 256, num_blocks[2], stride=2)
-    self.layer4 = self._make_layer(block, 256, 512, num_blocks[3], stride=2)
-
-    self.fc = nn.Linear(512, num_classes)
-
-  def _make_layer(self, block, in_dims, dims, num_blocks, stride):
-    strides = [stride] + [1] * (num_blocks - 1)
-    layers = []
-    for stride in strides:
-      layers.append(block(in_dims, dims, stride))
-      in_dims = dims
-    return layers
-
-  def __call__(self, x):
-    x = nn.relu(self.bn1(self.conv1(x)))
-    x = self.maxpool(x)
-    for l in self.layer1 + self.layer2 + self.layer3 + self.layer4:
-      x = l(x)
-    x = mx.mean(x, axis=[1, 2])
-    x = self.fc(x)
-    return x
-
-
-
-def load():
-  model = ResNet(Block, [2, 2, 2, 2], num_classes=1000)
-  file = "model.safetensors"
-  model_path = snapshot_download(
-    repo_id="awni/resnet18-mlx",
-    allow_patterns=[file],
-  )
-  model.load_weights(model_path + "/" + file)
-  model.eval()
-  mx.eval(model)
-  return model
-
-if __name__ == "__main__":
-
-  resnet18 = load()
-
-  @mx.compile
-  def forward(im):
-    return resnet18(im)
-
-  batch_sizes = [1, 2, 4, 8, 16, 32, 64]
-  #its = 200
-  #batch_sizes = [64]
-  its = 20
-  print(f"Batch Size | Images-per-second | Milliseconds-per-image")
-  print(f"---- | ---- | ---- ")
-  for N in batch_sizes:
-    image = mx.random.uniform(shape=(N, 288, 288, 3))
-
-    # Warmup
-    for _ in range(5):
-      output = forward(image)
-      mx.eval(output)
-
-    tic = time.time()
-    for _ in range(its):
-      output = forward(image)
-      mx.async_eval(output)
-    mx.eval(output)
-    toc = time.time()
-    ims_per_sec = N * its / (toc - tic)
-    ms_per_im = 1e3 / ims_per_sec
-    print(f"{N} | {ims_per_sec:.3f} | {ms_per_im:.3f}")
--- a/Show more
+++ b/Show more