Merge remote-tracking branch 'upstream/master' into new_x86_backend

This commit is contained in:
ttomsa 2025-12-20 20:43:21 +00:00
commit b8f06970fa
335 changed files with 27215 additions and 27875 deletions

View file

@ -61,7 +61,7 @@ runs:
uses: actions/cache@v4
with:
path: ${{ github.workspace }}/.venv
key: venv-${{ runner.os }}-python-${{ steps.setup-python.outputs.python-version }}-${{ inputs.deps }}-${{ inputs.pydeps }}-${{ hashFiles('**/pyproject.toml') }}-${{ env.CACHE_VERSION }}
key: venv-${{ runner.os }}-python-${{ steps.setup-python.outputs.python-version }}-${{ inputs.deps }}-${{ inputs.pydeps }}-${{ env.CACHE_VERSION }}
# **** Caching downloads ****
@ -70,13 +70,13 @@ runs:
uses: actions/cache@v4
with:
path: ~/.cache/tinygrad/downloads/
key: downloads-cache-${{ inputs.key }}-${{ env.CACHE_VERSION }}
key: downloads-${{ github.job }}-${{ inputs.key }}-${{ env.CACHE_VERSION }}
- name: Cache downloads (macOS)
if: inputs.key != '' && runner.os == 'macOS'
uses: actions/cache@v4
with:
path: ~/Library/Caches/tinygrad/downloads/
key: osx-downloads-cache-${{ inputs.key }}-${{ env.CACHE_VERSION }}
key: downloads-${{ github.job }}-${{ inputs.key }}-${{ env.CACHE_VERSION }}
# **** Python deps ****
@ -221,7 +221,7 @@ runs:
# Query the latest nimlgen/amdcomgr_dylib release through the GitHub API, pick the
# libamd_comgr.dylib asset URL with jq, and download it into /usr/local/lib.
# The API request takes exactly one URL: a stray second `curl -s` on that line was
# being parsed by curl as an additional (unresolvable) URL and has been removed.
sudo mkdir -p /usr/local/lib
curl -s -H "Authorization: token $GH_TOKEN" https://api.github.com/repos/nimlgen/amdcomgr_dylib/releases/latest | \
jq -r '.assets[] | select(.name == "libamd_comgr.dylib").browser_download_url' | \
sudo xargs curl -L -o /usr/local/lib/libamd_comgr.dylib
sudo xargs curl -fL -o /usr/local/lib/libamd_comgr.dylib
cargo build --release --manifest-path ./extra/remu/Cargo.toml
# **** gpuocelot ****
@ -278,7 +278,7 @@ runs:
if: inputs.webgpu == 'true' && runner.os == 'Linux'
shell: bash
run: |
sudo curl -L https://github.com/wpmed92/pydawn/releases/download/v0.1.6/libwebgpu_dawn.so -o /usr/local/lib/libwebgpu_dawn.so
sudo curl -fL https://github.com/wpmed92/pydawn/releases/download/v0.1.6/libwebgpu_dawn.so -o /usr/local/lib/libwebgpu_dawn.so
sudo ldconfig
- name: Install WebGPU dawn (macOS)
if: inputs.webgpu == 'true' && runner.os == 'macOS'
@ -298,7 +298,7 @@ runs:
- name: Install mesa (linux)
if: inputs.mesa == 'true' && runner.os == 'Linux'
shell: bash
run: sudo curl -L https://github.com/sirhcm/tinymesa/releases/download/tinymesa-32dc66c/libtinymesa_cpu-mesa-25.2.4-linux-amd64.so -o /usr/lib/libtinymesa_cpu.so
run: sudo curl -fL https://github.com/sirhcm/tinymesa/releases/download/v1/libtinymesa_cpu-mesa-25.2.7-linux-amd64.so -o /usr/lib/libtinymesa_cpu.so
- name: Install mesa (macOS)
if: inputs.mesa == 'true' && runner.os == 'macOS'
shell: bash

View file

@ -13,9 +13,11 @@ on:
# Pull requests trigger this workflow only when they touch one of the listed paths.
pull_request:
paths:
- 'tinygrad/runtime/autogen/**/*'
- 'tinygrad/runtime/support/autogen.py'
# NOTE(review): `paths` is documented as a filter for push / pull_request events;
# confirm it has any effect under workflow_dispatch (manual runs) - TODO verify.
workflow_dispatch:
paths:
- 'tinygrad/runtime/autogen/**/*'
- 'tinygrad/runtime/support/autogen.py'
jobs:
autogen:
@ -114,11 +116,9 @@ jobs:
- name: Verify Qualcomm autogen
run: |
mv tinygrad/runtime/autogen/kgsl.py /tmp/kgsl.py.bak
mv tinygrad/runtime/autogen/adreno.py /tmp/adreno.py.bak
mv tinygrad/runtime/autogen/qcom_dsp.py /tmp/qcom_dsp.py.bak
python3 -c "from tinygrad.runtime.autogen import kgsl, adreno, qcom_dsp"
python3 -c "from tinygrad.runtime.autogen import kgsl, qcom_dsp"
diff /tmp/kgsl.py.bak tinygrad/runtime/autogen/kgsl.py
diff /tmp/adreno.py.bak tinygrad/runtime/autogen/adreno.py
diff /tmp/qcom_dsp.py.bak tinygrad/runtime/autogen/qcom_dsp.py
- name: Verify libusb autogen
run: |

View file

@ -14,12 +14,6 @@ on:
- update_benchmark
- update_benchmark_staging
workflow_dispatch:
inputs:
run_process_replay:
description: "Run process replay tests"
required: false
default: false
type: boolean
jobs:
testmacbenchmark:
@ -39,6 +33,7 @@ jobs:
- name: Symlink models and datasets
run: |
mkdir -p weights
mkdir -p extra/disassemblers
ln -s ~/tinygrad/extra/disassemblers/applegpu extra/disassemblers/applegpu
ln -s ~/tinygrad/weights/sd-v1-4.ckpt weights/sd-v1-4.ckpt
ln -s ~/tinygrad/weights/bpe_simple_vocab_16e6.txt.gz weights/bpe_simple_vocab_16e6.txt.gz
@ -54,9 +49,9 @@ jobs:
- name: Print macOS version
run: sw_vers
- name: Run Stable Diffusion
run: BENCHMARK_LOG=stable_diffusion JIT=1 ASSERT_MIN_STEP_TIME=800 python3.11 examples/stable_diffusion.py --fp16 --seed 0 --noshow --timing | tee sd.txt
run: BENCHMARK_LOG=stable_diffusion JIT=1 ASSERT_MIN_STEP_TIME=720 python3.11 examples/stable_diffusion.py --fp16 --seed 0 --noshow --timing | tee sd.txt
- name: Run Stable Diffusion without fp16
run: BENCHMARK_LOG=stable_diffusion_fp32 JIT=1 ASSERT_MIN_STEP_TIME=800 python3.11 examples/stable_diffusion.py --seed 0 --noshow --timing | tee sd_no_fp16.txt
run: BENCHMARK_LOG=stable_diffusion_fp32 JIT=1 ASSERT_MIN_STEP_TIME=720 python3.11 examples/stable_diffusion.py --seed 0 --noshow --timing | tee sd_no_fp16.txt
- name: Run Stable Diffusion v2
# TODO: very slow step time
run: BENCHMARK_LOG=stable_diffusion_v2 JIT=1 ASSERT_MIN_STEP_TIME=4500 python3.11 examples/sdv2.py --fp16 --seed 0 --noshow --timing | tee sdv2.txt
@ -64,7 +59,7 @@ jobs:
- name: Run SDXL
run: BENCHMARK_LOG=stable_diffusion_xl ASSERT_MIN_STEP_TIME=5000 CAPTURE_PROCESS_REPLAY=0 JIT=1 python3.11 examples/sdxl.py --seed 0 --noshow --timing | tee sdxl.txt
- name: Run model inference benchmark
run: METAL=1 python3.11 test/external/external_model_benchmark.py
run: METAL=1 NOCLANG=1 python3.11 test/external/external_model_benchmark.py
- name: Test speed vs torch
run: BIG=2 MPS=1 python3.11 test/speed/external_test_speed_v_torch.py | tee torch_speed.txt
- name: Test tensor cores
@ -124,14 +119,6 @@ jobs:
# TODO: too slow
# - name: Run 10 CIFAR training steps w winograd
# run: BENCHMARK_LOG=cifar_10steps_wino JIT=1 ASSERT_MIN_STEP_TIME=150 WINO=1 STEPS=10 python3.11 examples/hlb_cifar10.py | tee train_cifar_wino.txt
- name: UsbGPU boot time
run: sudo -E PYTHONPATH=. DEBUG=2 AM_RESET=1 AMD=1 AMD_IFACE=USB time python3.11 test/test_tiny.py TestTiny.test_plus
- name: UsbGPU tiny tests
run: sudo -E PYTHONPATH=. AMD=1 AMD_IFACE=USB python3.11 test/test_tiny.py
- name: UsbGPU copy speeds
run: sudo -E PYTHONPATH=. AMD=1 AMD_IFACE=USB python3.11 test/external/external_test_usb_asm24.py TestDevCopySpeeds
#- name: UsbGPU openpilot test
# run: sudo -E PYTHONPATH=. AMD=1 AMD_IFACE=USB GRAPH_ONE_KERNEL=1 python3.11 examples/openpilot/compile3.py https://github.com/commaai/openpilot/raw/9118973ed03c1ae1d40cf69a29507ec2cc78efd7/selfdrive/modeld/models/supercombo.onnx
- uses: actions/upload-artifact@v4
with:
name: Speed (Mac)
@ -165,6 +152,37 @@ jobs:
- name: Run process replay tests
run: cp test/external/process_replay/process_replay.py ./process_replay.py && git fetch origin master && git -c advice.detachedHead=false checkout origin/master && PYTHONPATH=. python3.11 process_replay.py
# UsbGPU benchmark job: runs on a self-hosted macOS runner with a 10 minute timeout,
# and is skipped unless the repository owner is 'tinygrad'.
testusbgpu:
name: UsbGPU Benchmark
env:
# Python writes its bytecode cache under this prefix instead of next to the sources.
PYTHONPYCACHEPREFIX: /tmp/tiny_python_pycache
runs-on: [self-hosted, macOS]
timeout-minutes: 10
defaults:
run:
# -e and pipefail: a failing command, including one inside a pipeline, fails the step.
shell: bash -e -o pipefail {0}
if: github.repository_owner == 'tinygrad'
steps:
- name: Checkout Code
uses: actions/checkout@v4
# Only on the update_benchmark_staging branch: export CACHEDB=/tmp/staging.db to the
# following steps via GITHUB_ENV and delete any existing db, -shm and -wal files.
- name: setup staging db
if: github.ref == 'refs/heads/update_benchmark_staging'
run: |
echo "CACHEDB=/tmp/staging.db" >> $GITHUB_ENV
rm -f /tmp/staging.db /tmp/staging.db-shm /tmp/staging.db-wal
# The AMD_IFACE=USB steps run under `sudo -E`, which keeps the job environment.
- name: UsbGPU boot time
run: sudo -E PYTHONPATH=. DEBUG=2 AM_RESET=1 AMD=1 AMD_IFACE=USB time python3.11 test/test_tiny.py TestTiny.test_plus
- name: UsbGPU tiny tests
run: sudo -E PYTHONPATH=. AMD=1 AMD_IFACE=USB python3.11 test/test_tiny.py
- name: UsbGPU copy speeds
run: sudo -E PYTHONPATH=. AMD=1 AMD_IFACE=USB python3.11 test/external/external_test_usb_asm24.py TestDevCopySpeeds
#- name: UsbGPU openpilot test
# run: sudo -E PYTHONPATH=. AMD=1 AMD_IFACE=USB GRAPH_ONE_KERNEL=1 python3.11 examples/openpilot/compile3.py https://github.com/commaai/openpilot/raw/9118973ed03c1ae1d40cf69a29507ec2cc78efd7/selfdrive/modeld/models/supercombo.onnx
# The USB4/TB steps use NV=1 with NV_IFACE=PCI and, unlike the steps above, run without sudo.
- name: UsbGPU (USB4/TB) boot time
run: PYTHONPATH=. DEBUG=3 NV=1 NV_IFACE=PCI NV_NAK=1 time python3.11 test/test_tiny.py TestTiny.test_plus
- name: UsbGPU (USB4/TB) tiny tests
run: PYTHONPATH=. NV=1 NV_IFACE=PCI NV_NAK=1 python3.11 test/test_tiny.py
testnvidiabenchmark:
name: tinybox green Benchmark
runs-on: [self-hosted, Linux, tinyboxgreen]
@ -318,31 +336,31 @@ jobs:
# TODO: too slow
# - name: Fuzz Padded Tensor Core GEMM (PTX)
# run: NV=1 NV_PTX=1 M_START=12 M_STOP=20 M_STEP=1 N_START=6 N_STOP=10 N_STEP=1 K_START=28 K_STOP=36 K_STEP=1 HALF=1 TC_OPT=2 python3 ./extra/gemm/fuzz_matmul.py
- name: HEVC Decode Benchmark
run: VALIDATE=1 MAX_FRAMES=100 NV=1 PYTHONPATH=. python3 extra/hevc/decode.py
- name: Train MNIST
run: time PYTHONPATH=. NV=1 TARGET_EVAL_ACC_PCT=96.0 python3 examples/beautiful_mnist.py | tee beautiful_mnist.txt
# TODO: too slow
- name: Run 10 CIFAR training steps
run: BENCHMARK_LOG=cifar_10steps ASSERT_MIN_STEP_TIME=1300 NV=1 STEPS=10 python3 examples/hlb_cifar10.py | tee train_cifar.txt
# - name: Run 10 CIFAR training steps w HALF
# run: BENCHMARK_LOG=cifar_10steps_half ASSERT_MIN_STEP_TIME=240 NV=1 STEPS=10 DEFAULT_FLOAT=HALF python3 examples/hlb_cifar10.py | tee train_cifar_half.txt
# - name: Run 10 CIFAR training steps w BF16
# run: BENCHMARK_LOG=cifar_10steps_bf16 ASSERT_MIN_STEP_TIME=270 NV=1 STEPS=10 DEFAULT_FLOAT=BFLOAT16 python3 examples/hlb_cifar10.py | tee train_cifar_bf16.txt
# TODO: too slow
run: BENCHMARK_LOG=cifar_10steps ASSERT_MIN_STEP_TIME=120 NV=1 STEPS=10 python3 examples/hlb_cifar10.py | tee train_cifar.txt
- name: Run 10 CIFAR training steps w HALF
run: BENCHMARK_LOG=cifar_10steps_half ASSERT_MIN_STEP_TIME=110 NV=1 STEPS=10 DEFAULT_FLOAT=HALF python3 examples/hlb_cifar10.py | tee train_cifar_half.txt
- name: Run 10 CIFAR training steps w BF16
run: BENCHMARK_LOG=cifar_10steps_bf16 ASSERT_MIN_STEP_TIME=120 NV=1 STEPS=10 DEFAULT_FLOAT=BFLOAT16 python3 examples/hlb_cifar10.py | tee train_cifar_bf16.txt
# - name: Run 10 CIFAR training steps w winograd
# run: BENCHMARK_LOG=cifar_10steps_half_wino ASSERT_MIN_STEP_TIME=350 NV=1 CAPTURE_PROCESS_REPLAY=0 WINO=1 STEPS=10 DEFAULT_FLOAT=HALF python3 examples/hlb_cifar10.py | tee train_cifar_wino.txt
# - name: Run full CIFAR training w 1 GPU
# run: time BENCHMARK_LOG=cifar NV=1 DEFAULT_FLOAT=HALF STEPS=1000 TARGET_EVAL_ACC_PCT=93.0 python3 examples/hlb_cifar10.py | tee train_cifar_one_gpu.txt
# - name: Run full CIFAR training steps w 6 GPUS
# run: time BENCHMARK_LOG=cifar_6gpu CAPTURE_PROCESS_REPLAY=0 NV=1 DEFAULT_FLOAT=HALF STEPS=350 BS=1536 GPUS=6 TARGET_EVAL_ACC_PCT=93.0 python3 examples/hlb_cifar10.py | tee train_cifar_six_gpu.txt
- name: Run full CIFAR training w 1 GPU
run: time BENCHMARK_LOG=cifar NV=1 DEFAULT_FLOAT=HALF STEPS=1000 TARGET_EVAL_ACC_PCT=93.0 python3 examples/hlb_cifar10.py | tee train_cifar_one_gpu.txt
- name: Run full CIFAR training steps w 6 GPUS
run: time BENCHMARK_LOG=cifar_6gpu CAPTURE_PROCESS_REPLAY=0 NV=1 DEFAULT_FLOAT=HALF STEPS=350 BS=1536 GPUS=6 TARGET_EVAL_ACC_PCT=93.0 python3 examples/hlb_cifar10.py | tee train_cifar_six_gpu.txt
- name: Run MLPerf resnet eval on training data
run: time BENCHMARK_LOG=resnet_eval NV=1 MODEL=resnet python3 examples/mlperf/model_eval.py
#- name: Run 10 MLPerf ResNet50 training steps (1 gpu)
# run: BENCHMARK_LOG=resnet_10steps NV=1 DEFAULT_FLOAT=HALF BENCHMARK=10 BS=256 GPUS=1 MODEL=resnet python3 examples/mlperf/model_train.py | tee train_resnet_one_gpu.txt
#- name: Run 10 MLPerf ResNet50 training steps (6 gpu)
# run: BENCHMARK_LOG=resnet_10steps_6gpu NV=1 CAPTURE_PROCESS_REPLAY=0 DEFAULT_FLOAT=HALF BENCHMARK=10 BS=1536 GPUS=6 MODEL=resnet python3 examples/mlperf/model_train.py | tee train_resnet.txt
- name: Run 10 MLPerf ResNet50 training steps (1 gpu)
run: BENCHMARK_LOG=resnet_10steps NV=1 DEFAULT_FLOAT=HALF BENCHMARK=10 BS=256 GPUS=1 MODEL=resnet python3 examples/mlperf/model_train.py | tee train_resnet_one_gpu.txt
- name: Run 10 MLPerf ResNet50 training steps (6 gpu)
run: BENCHMARK_LOG=resnet_10steps_6gpu NV=1 CAPTURE_PROCESS_REPLAY=0 DEFAULT_FLOAT=HALF BENCHMARK=10 BS=1536 GPUS=6 MODEL=resnet python3 examples/mlperf/model_train.py | tee train_resnet.txt
- name: Run 10 MLPerf Bert training steps (6 gpu)
# TODO: remove BERT_LAYERS once scheduler is fast
run: BENCHMARK_LOG=bert_10steps_6gpu NV=1 CAPTURE_PROCESS_REPLAY=0 DEFAULT_FLOAT=HALF BENCHMARK=10 BS=66 GPUS=6 BERT_LAYERS=2 MODEL=bert python3 examples/mlperf/model_train.py | tee train_bert.txt
run: BENCHMARK_LOG=bert_10steps_6gpu NV=1 CAPTURE_PROCESS_REPLAY=0 DEFAULT_FLOAT=HALF BENCHMARK=10 BS=72 GPUS=6 BERT_LAYERS=2 MODEL=bert python3 examples/mlperf/model_train.py | tee train_bert.txt
- uses: actions/upload-artifact@v4
with:
name: Speed (NVIDIA Training)
@ -433,9 +451,8 @@ jobs:
run: time AMD=1 python3 test/test_tiny.py TestTiny.test_plus
- name: Run Stable Diffusion
run: BENCHMARK_LOG=stable_diffusion ASSERT_MIN_STEP_TIME=550 AMD=1 python3 examples/stable_diffusion.py --fp16 --seed 0 --noshow --timing | tee sd.txt
# TODO: too slow
# - name: Run SDXL
# run: BENCHMARK_LOG=stable_diffusion_xl ASSERT_MIN_STEP_TIME=3200 CAPTURE_PROCESS_REPLAY=0 AMD=1 python3 examples/sdxl.py --seed 0 --noshow --timing | tee sdxl.txt
- name: Run SDXL
run: BENCHMARK_LOG=stable_diffusion_xl ASSERT_MIN_STEP_TIME=3200 CAPTURE_PROCESS_REPLAY=0 AMD=1 python3 examples/sdxl.py --seed 0 --noshow --timing | tee sdxl.txt
- name: Run LLaMA 7B
run: |
BENCHMARK_LOG=llama_nojit AMD=1 JIT=0 python3 examples/llama.py --gen 1 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_unjitted.txt
@ -525,22 +542,19 @@ jobs:
run: test/external/process_replay/reset.py
- name: Train MNIST
run: time PYTHONPATH=. AMD=1 TARGET_EVAL_ACC_PCT=96.0 python3 examples/beautiful_mnist.py | tee beautiful_mnist.txt
# TODO: too slow
- name: Run 10 CIFAR training steps
run: BENCHMARK_LOG=cifar_10steps ASSERT_MIN_STEP_TIME=2000 AMD=1 STEPS=10 python3 examples/hlb_cifar10.py | tee train_cifar.txt
# - name: Run 10 CIFAR training steps w HALF
# run: BENCHMARK_LOG=cifar_10steps_half ASSERT_MIN_STEP_TIME=390 AMD=1 STEPS=10 DEFAULT_FLOAT=HALF python3 examples/hlb_cifar10.py | tee train_cifar_half.txt
run: BENCHMARK_LOG=cifar_10steps ASSERT_MIN_STEP_TIME=200 AMD=1 STEPS=10 python3 examples/hlb_cifar10.py | tee train_cifar.txt
- name: Run 10 CIFAR training steps w HALF
run: BENCHMARK_LOG=cifar_10steps_half ASSERT_MIN_STEP_TIME=200 AMD=1 STEPS=10 DEFAULT_FLOAT=HALF python3 examples/hlb_cifar10.py | tee train_cifar_half.txt
# - name: Run 10 CIFAR training steps w BF16
# run: BENCHMARK_LOG=cifar_10steps_bf16 ASSERT_MIN_STEP_TIME=288 AMD=1 STEPS=10 DEFAULT_FLOAT=BFLOAT16 python3 examples/hlb_cifar10.py | tee train_cifar_bf16.txt
# TODO: too slow
# - name: Run 10 CIFAR training steps w winograd
# run: BENCHMARK_LOG=cifar_10steps_half_wino ASSERT_MIN_STEP_TIME=66 AMD=1 WINO=1 STEPS=10 DEFAULT_FLOAT=HALF python3 examples/hlb_cifar10.py | tee train_cifar_wino.txt
# - name: Run full CIFAR training w 1 GPU
# run: time BENCHMARK_LOG=cifar AMD=1 DEFAULT_FLOAT=HALF STEPS=1000 TARGET_EVAL_ACC_PCT=93.0 python3 examples/hlb_cifar10.py | tee train_cifar_one_gpu.txt
#- name: Run full CIFAR training steps w 6 GPUS
# run: time BENCHMARK_LOG=cifar_6gpu AMD=1 DEFAULT_FLOAT=HALF STEPS=350 BS=1536 GPUS=6 TARGET_EVAL_ACC_PCT=93.0 python3 examples/hlb_cifar10.py | tee train_cifar_six_gpu.txt
#- name: Run full CIFAR training steps w 6 GPUS (REMOTE)
# run: time BENCHMARK_LOG=cifar_6gpu_remote REMOTE=1 REMOTEDEV=AMD DEFAULT_FLOAT=HALF STEPS=350 BS=1536 GPUS=6 TARGET_EVAL_ACC_PCT=93.0 python3 examples/hlb_cifar10.py | tee train_cifar_six_gpu_remote.txt
- name: Run full CIFAR training w 1 GPU
run: time BENCHMARK_LOG=cifar AMD=1 DEFAULT_FLOAT=HALF STEPS=1000 TARGET_EVAL_ACC_PCT=93.0 python3 examples/hlb_cifar10.py | tee train_cifar_one_gpu.txt
- name: Run full CIFAR training steps w 6 GPUS
run: time BENCHMARK_LOG=cifar_6gpu AMD=1 DEFAULT_FLOAT=HALF STEPS=350 BS=1536 GPUS=6 TARGET_EVAL_ACC_PCT=93.0 python3 examples/hlb_cifar10.py | tee train_cifar_six_gpu.txt
- uses: actions/upload-artifact@v4
with:
name: Speed (AMD Training)
@ -552,7 +566,6 @@ jobs:
train_cifar_wino.txt
train_cifar_one_gpu.txt
train_cifar_six_gpu.txt
train_cifar_six_gpu_remote.txt
- name: Run process replay tests
run: cp test/external/process_replay/process_replay.py ./process_replay.py && git fetch origin master && git -c advice.detachedHead=false checkout origin/master && PYTHONPATH=. python3 process_replay.py
@ -590,13 +603,13 @@ jobs:
run: test/external/process_replay/reset.py
- name: Run MLPerf resnet eval
run: time BENCHMARK_LOG=resnet_eval AMD=1 MODEL=resnet python3 examples/mlperf/model_eval.py
#- name: Run 10 MLPerf ResNet50 training steps (1 gpu)
# run: BENCHMARK_LOG=resnet_10steps AMD=1 DEFAULT_FLOAT=HALF BENCHMARK=10 BS=256 GPUS=1 MODEL=resnet python3 examples/mlperf/model_train.py | tee train_resnet_one_gpu.txt
#- name: Run 10 MLPerf ResNet50 training steps (6 gpu)
# run: BENCHMARK_LOG=resnet_10steps_6gpu AMD=1 CAPTURE_PROCESS_REPLAY=0 DEFAULT_FLOAT=HALF BENCHMARK=10 BS=1536 GPUS=6 MODEL=resnet python3 examples/mlperf/model_train.py | tee train_resnet.txt
- name: Run 10 MLPerf ResNet50 training steps (1 gpu)
run: BENCHMARK_LOG=resnet_10steps AMD=1 DEFAULT_FLOAT=HALF BENCHMARK=10 BS=256 GPUS=1 MODEL=resnet python3 examples/mlperf/model_train.py | tee train_resnet_one_gpu.txt
- name: Run 10 MLPerf ResNet50 training steps (6 gpu)
run: BENCHMARK_LOG=resnet_10steps_6gpu AMD=1 CAPTURE_PROCESS_REPLAY=0 DEFAULT_FLOAT=HALF BENCHMARK=10 BS=1536 GPUS=6 MODEL=resnet python3 examples/mlperf/model_train.py | tee train_resnet.txt
- name: Run 10 MLPerf Bert training steps (6 gpu)
# TODO: remove BERT_LAYERS once scheduler is fast
run: BENCHMARK_LOG=bert_10steps_6gpu AMD=1 CAPTURE_PROCESS_REPLAY=0 DEFAULT_FLOAT=HALF BENCHMARK=10 BS=66 GPUS=6 BERT_LAYERS=2 MODEL=bert python3 examples/mlperf/model_train.py | tee train_bert.txt
run: BENCHMARK_LOG=bert_10steps_6gpu AMD=1 CAPTURE_PROCESS_REPLAY=0 DEFAULT_FLOAT=HALF BENCHMARK=10 BS=72 GPUS=6 BERT_LAYERS=2 MODEL=bert python3 examples/mlperf/model_train.py | tee train_bert.txt
- uses: actions/upload-artifact@v4
with:
name: Speed (AMD MLPerf)
@ -625,32 +638,28 @@ jobs:
rm -f /tmp/staging.db /tmp/staging.db-shm /tmp/staging.db-wal
- name: reset process replay
run: test/external/process_replay/reset.py
# - name: openpilot compile3 0.9.9 driving_vision
# run: BENCHMARK_LOG=openpilot_0_9_9_vision PYTHONPATH=. NOLOCALS=1 FLOAT16=1 IMAGE=2 QCOM=1 taskset -c 4-7 python3 examples/openpilot/compile3.py https://github.com/commaai/openpilot/raw/v0.9.9/selfdrive/modeld/models/driving_vision.onnx
# - name: openpilot compile3 0.9.9 driving_policy
# run: BENCHMARK_LOG=openpilot_0_9_9_policy PYTHONPATH=. NOLOCALS=1 FLOAT16=1 IMAGE=2 QCOM=1 taskset -c 4-7 python3 examples/openpilot/compile3.py https://github.com/commaai/openpilot/raw/v0.9.9/selfdrive/modeld/models/driving_policy.onnx
# - name: openpilot compile3 0.9.9 dmonitoring
# run: BENCHMARK_LOG=openpilot_0_9_9_dmonitoring PYTHONPATH=. NOLOCALS=1 FLOAT16=1 IMAGE=2 QCOM=1 taskset -c 4-7 python3 examples/openpilot/compile3.py https://github.com/commaai/openpilot/raw/v0.9.9/selfdrive/modeld/models/dmonitoring_model.onnx
- name: openpilot compile3 0.10.0 driving_policy
run: BENCHMARK_LOG=openpilot_0_10_0_policy PYTHONPATH="." ASSERT_MIN_STEP_TIME=4 DEV=QCOM FLOAT16=1 IMAGE=2 NOLOCALS=1 taskset -c 4-7 python3 examples/openpilot/compile3.py https://github.com/commaai/openpilot/raw/v0.10.0/selfdrive/modeld/models/driving_policy.onnx
- name: openpilot compile3 0.10.0 dmonitoring
run: BENCHMARK_LOG=openpilot_0_10_0_dmonitoring PYTHONPATH="." ASSERT_MIN_STEP_TIME=11 DEV=QCOM FLOAT16=1 IMAGE=2 NOLOCALS=1 taskset -c 4-7 python3 examples/openpilot/compile3.py https://github.com/commaai/openpilot/raw/v0.10.0/selfdrive/modeld/models/dmonitoring_model.onnx
- name: DEBUG=2 openpilot compile3 0.10.1 driving_vision
run: PYTHONPATH="." DEBUG=2 DEV=QCOM FLOAT16=1 IMAGE=2 NOLOCALS=1 taskset -c 4-7 python3 examples/openpilot/compile3.py https://github.com/commaai/openpilot/raw/720392c9a5b986981fdbed1bb8c47a6c5573a50e/selfdrive/modeld/models/driving_vision.onnx
- name: DEBUG=2 IMAGE=1 openpilot compile3 0.10.1 driving_vision
run: PYTHONPATH="." DEBUG=2 DEV=QCOM FLOAT16=1 IMAGE=1 NOLOCALS=1 taskset -c 4-7 python3 examples/openpilot/compile3.py https://github.com/commaai/openpilot/raw/720392c9a5b986981fdbed1bb8c47a6c5573a50e/selfdrive/modeld/models/driving_vision.onnx
- name: openpilot compile3 0.10.1 driving_vision
run: BENCHMARK_LOG=openpilot_0_10_1_vision PYTHONPATH="." ASSERT_MIN_STEP_TIME=17 DEV=QCOM FLOAT16=1 IMAGE=2 NOLOCALS=1 taskset -c 4-7 python3 examples/openpilot/compile3.py https://github.com/commaai/openpilot/raw/720392c9a5b986981fdbed1bb8c47a6c5573a50e/selfdrive/modeld/models/driving_vision.onnx
- name: openpilot compile3 0.10.1 driving_policy
run: BENCHMARK_LOG=openpilot_0_10_1_policy PYTHONPATH="." ASSERT_MIN_STEP_TIME=4 DEV=QCOM FLOAT16=1 IMAGE=2 NOLOCALS=1 taskset -c 4-7 python3 examples/openpilot/compile3.py https://github.com/commaai/openpilot/raw/720392c9a5b986981fdbed1bb8c47a6c5573a50e/selfdrive/modeld/models/driving_policy.onnx
- name: openpilot compile3 0.10.1 dmonitoring
run: BENCHMARK_LOG=openpilot_0_10_1_dmonitoring PYTHONPATH="." ASSERT_MIN_STEP_TIME=10 DEV=QCOM FLOAT16=1 IMAGE=2 NOLOCALS=1 taskset -c 4-7 python3 examples/openpilot/compile3.py https://github.com/commaai/openpilot/raw/720392c9a5b986981fdbed1bb8c47a6c5573a50e/selfdrive/modeld/models/dmonitoring_model.onnx
# - name: benchmark MobileNetV2 on DSP
# run: |
# # generate quantized weights
# ln -s /data/home/tiny/tinygrad/extra/datasets/imagenet extra/datasets/imagenet
# ln -s /data/home/tiny/tinygrad/testsig-*.so .
# PYTHONPATH=. CC=clang-19 CPU=1 CPU_LLVM=0 QUANT=1 CNT=0 python3 examples/test_onnx_imagenet.py https://github.com/xamcat/mobcat-samples/raw/refs/heads/master/onnx_runtime/InferencingSample/InferencingSample/mobilenetv2-7.onnx /tmp/model.quant.onnx
# # benchmark on DSP with NOOPT=1, the devectorizer has issues
# PYTHONPATH=. CC=clang-19 DSP=1 NOOPT=1 CNT=2 DEBUG=2 python3 examples/test_onnx_imagenet.py /tmp/model.quant.onnx
- name: benchmark MobileNetV2 on DSP
run: |
# generate quantized weights
ln -s /data/home/tiny/tinygrad/extra/datasets/imagenet extra/datasets/imagenet
ln -s /data/home/tiny/tinygrad/testsig-*.so .
PYTHONPATH=. CC=clang-19 CPU=1 CPU_LLVM=0 QUANT=1 CNT=0 python3 examples/test_onnx_imagenet.py https://github.com/xamcat/mobcat-samples/raw/refs/heads/master/onnx_runtime/InferencingSample/InferencingSample/mobilenetv2-7.onnx /tmp/model.quant.onnx
# benchmark on DSP with NOOPT=1, the devectorizer has issues
PYTHONPATH=. CC=clang-19 DSP=1 NOOPT=1 CNT=2 DEBUG=2 python3 examples/test_onnx_imagenet.py /tmp/model.quant.onnx
- name: Run process replay tests
run: cp test/external/process_replay/process_replay.py ./process_replay.py && git fetch origin master && git -c advice.detachedHead=false checkout origin/master && PYTHONPATH=. python3 process_replay.py
@ -706,10 +715,8 @@ jobs:
run: |
AMD=1 GRAPH_ONE_KERNEL=1 PYTHONPATH=. NSZ=8192 python3 test/speed/external_test_copy_speed.py TestCopySpeed.testCopyDefaulttoCPUJit
AMD=1 GRAPH_ONE_KERNEL=1 PYTHONPATH=. NSZ=8192 python3 test/speed/external_test_copy_speed.py TestCopySpeed.testCopyCPUtoDefaultJit
# TODO: too slow
# - name: Run full CIFAR training w 1 GPU
# run: time BENCHMARK_LOG=cifar AMD=1 DEFAULT_FLOAT=HALF STEPS=1000 TARGET_EVAL_ACC_PCT=93.0 python3 examples/hlb_cifar10.py | tee am_train_cifar_one_gpu.txt
# TODO: enable
- name: Run full CIFAR training w 1 GPU
run: time BENCHMARK_LOG=cifar AMD=1 DEFAULT_FLOAT=HALF STEPS=1000 TARGET_EVAL_ACC_PCT=93.0 python3 examples/hlb_cifar10.py | tee am_train_cifar_one_gpu.txt
# - name: Run 10 MLPerf ResNet50 training steps (1 gpu)
# run: BENCHMARK_LOG=resnet_10steps AMD=1 MNISTMOCK=1 DEFAULT_FLOAT=HALF BENCHMARK=10 BS=256 GPUS=1 MODEL=resnet python3 examples/mlperf/model_train.py | tee am_train_resnet_one_gpu.txt
- name: Run 10 MLPerf Bert training steps (1 gpu)
@ -770,11 +777,10 @@ jobs:
NV=1 GRAPH_ONE_KERNEL=1 PYTHONPATH=. NSZ=8192 python3 test/speed/external_test_copy_speed.py TestCopySpeed.testCopyCPUtoDefaultJit
- name: Test LLAMA-3
run: BENCHMARK_LOG=llama3_beam NV=1 JITBEAM=2 IGNORE_BEAM_CACHE=1 python3 examples/llama3.py --size 8B --benchmark --temperature 0 | tee nv_llama3_beam.txt
# TODO: too slow
# - name: Run full CIFAR training w 1 GPU
# run: time BENCHMARK_LOG=cifar NV=1 DEFAULT_FLOAT=HALF STEPS=1000 TARGET_EVAL_ACC_PCT=93.0 python3 examples/hlb_cifar10.py | tee nv_train_cifar_one_gpu.txt
#- name: Run 10 MLPerf ResNet50 training steps (1 gpu)
# run: BENCHMARK_LOG=resnet_10steps NV=1 MNISTMOCK=1 DEFAULT_FLOAT=HALF BENCHMARK=10 BS=256 GPUS=1 MODEL=resnet python3 examples/mlperf/model_train.py | tee nv_train_resnet_one_gpu.txt
- name: Run full CIFAR training w 1 GPU
run: time BENCHMARK_LOG=cifar NV=1 DEFAULT_FLOAT=HALF STEPS=1000 TARGET_EVAL_ACC_PCT=93.0 python3 examples/hlb_cifar10.py | tee nv_train_cifar_one_gpu.txt
- name: Run 10 MLPerf ResNet50 training steps (1 gpu)
run: BENCHMARK_LOG=resnet_10steps NV=1 MNISTMOCK=1 DEFAULT_FLOAT=HALF BENCHMARK=10 BS=256 GPUS=1 MODEL=resnet python3 examples/mlperf/model_train.py | tee nv_train_resnet_one_gpu.txt
- name: Run 10 MLPerf Bert training steps (1 gpu)
# TODO: remove BERT_LAYERS once scheduler is fast
run: BENCHMARK_LOG=bert_10steps NV=1 CAPTURE_PROCESS_REPLAY=0 DEFAULT_FLOAT=HALF BENCHMARK=10 BS=66 GPUS=1 BERT_LAYERS=2 MODEL=bert python3 examples/mlperf/model_train.py | tee nv_train_bert_one_gpu.txt

View file

@ -56,15 +56,15 @@ jobs:
uses: actions/checkout@v4
with:
path: base
- name: Set up Python 3.10
- name: Set up Python 3.12
uses: actions/setup-python@v5
with:
python-version: '3.10'
python-version: '3.12'
- name: Count Line Diff
run: |
pip install tabulate
BASE="$GITHUB_WORKSPACE/base"
PR="$GITHUB_WORKSPACE/pr"
pip install tabulate $BASE
cp "$BASE/sz.py" .
echo "loc_content<<EOF" >> "$GITHUB_ENV"
python sz.py "$BASE" "$PR" >> "$GITHUB_ENV"

View file

@ -1,7 +1,7 @@
name: Unit Tests
env:
# increment this when downloads substantially change to avoid the internet
CACHE_VERSION: '13'
CACHE_VERSION: '15'
CAPTURE_PROCESS_REPLAY: 1
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
PYTHONPATH: ${{ github.workspace }}
@ -71,9 +71,7 @@ jobs:
- name: Test Docs Build
run: python -m mkdocs build --strict
- name: Test Docs
run: |
python docs/abstractions2.py
python docs/abstractions3.py
run: python docs/abstractions3.py
- name: Test README
run: awk '/```python/{flag=1;next}/```/{flag=0}flag' README.md > README.py && python README.py
- name: Test Quickstart
@ -86,65 +84,67 @@ jobs:
clang -O2 recognize.c -lm -o recognize
cat test/models/efficientnet/Chicken.jpg | ./recognize | grep cock
# TODO: fix the torch backend and reenable
# torchbackend:
# name: Torch Backend Tests
# runs-on: ubuntu-latest
# timeout-minutes: 15
# steps:
# - name: Checkout Code
# uses: actions/checkout@v4
# - name: Setup Environment
# uses: ./.github/actions/setup-tinygrad
# with:
# key: torch-backend-pillow-torchvision-et-pt
# deps: testing_minimal
# pydeps: "pillow torchvision expecttest"
# llvm: 'true'
# - name: Install ninja
# run: |
# sudo apt update || true
# sudo apt install -y --no-install-recommends ninja-build
# - name: Lint with ruff
# run: |
# pip3 install --upgrade --force-reinstall ruff==0.11.0
# python3 -m ruff check extra/torch_backend/backend.py
# - name: Test one op
# run: FORWARD_ONLY=1 TINY_BACKEND=1 python3 test/test_ops.py TestOps.test_add
# - name: Test ResNet-18
# run: DEBUG=2 python3 extra/torch_backend/example.py
# - name: My (custom) tests
# run: python3 extra/torch_backend/test.py
# - name: Test one op in torch tests
# run: DEBUG=2 python3 extra/torch_backend/torch_tests.py TestTinyBackendPRIVATEUSE1.test_unary_log_tiny_float32
# - name: Test Ops with TINY_BACKEND
# run: CPU=1 CPU_LLVM=1 LLVMOPT=0 TINY_BACKEND=1 python3 -m pytest -n auto test/test_ops.py --durations=20
# - name: Test in-place operations on views
# run: TORCH_DEBUG=1 python3 extra/torch_backend/test_inplace.py
# - name: Test multi-gpu
# run: CPU=1 CPU_LLVM=1 GPUS=4 TORCH_DEBUG=1 python3 extra/torch_backend/test_multigpu.py
# Torch backend test job: runs on ubuntu-latest with a 15 minute timeout.
torchbackend:
name: Torch Backend Tests
runs-on: ubuntu-latest
timeout-minutes: 15
steps:
- name: Checkout Code
uses: actions/checkout@v4
# Repository-local setup action; passes the testing_minimal deps group, three extra
# pip packages (pillow, torchvision, expecttest) and llvm: 'true'.
- name: Setup Environment
uses: ./.github/actions/setup-tinygrad
with:
key: torch-backend-pillow-torchvision-et-pt
deps: testing_minimal
pydeps: "pillow torchvision expecttest"
llvm: 'true'
# A failing `apt update` is tolerated (`|| true`); the ninja-build install is not guarded.
- name: Install ninja
run: |
sudo apt update || true
sudo apt install -y --no-install-recommends ninja-build
# ruff is pinned to 0.11.0 and checks only extra/torch_backend/backend.py.
- name: Lint with ruff
run: |
pip3 install --upgrade --force-reinstall ruff==0.11.0
python3 -m ruff check extra/torch_backend/backend.py
# Single test_ops case (TestOps.test_add) run with TINY_BACKEND=1 and FORWARD_ONLY=1.
- name: Test one op
run: FORWARD_ONLY=1 TINY_BACKEND=1 python3 test/test_ops.py TestOps.test_add
- name: Test ResNet-18
run: DEBUG=2 python3 extra/torch_backend/example.py
- name: My (custom) tests
run: python3 extra/torch_backend/test.py
- name: Test one op in torch tests
run: DEBUG=2 python3 extra/torch_backend/torch_tests.py TestTinyBackendPRIVATEUSE1.test_unary_log_tiny_float32
# Full test/test_ops.py run through pytest with TINY_BACKEND=1; prints the 20 slowest tests.
- name: Test Ops with TINY_BACKEND
run: CPU=1 CPU_LLVM=1 LLVMOPT=0 TINY_BACKEND=1 python3 -m pytest -n auto test/test_ops.py --durations=20
- name: Test in-place operations on views
run: TORCH_DEBUG=1 python3 extra/torch_backend/test_inplace.py
- name: Test multi-gpu
run: CPU=1 CPU_LLVM=1 GPUS=4 TORCH_DEBUG=1 python3 extra/torch_backend/test_multigpu.py
- name: Test kernel fusion
run: python3 extra/torch_backend/test_kernel_fusion.py
# torchbackendmore:
# name: Torch Backend Tests More
# runs-on: ubuntu-latest
# timeout-minutes: 15
# steps:
# - name: Checkout Code
# uses: actions/checkout@v4
# - name: Setup Environment
# uses: ./.github/actions/setup-tinygrad
# with:
# key: torch-backend-pillow-torchvision-et-pt
# deps: testing_minimal
# llvm: 'true'
# - name: Install ninja
# run: |
# sudo apt update || true
# sudo apt install -y --no-install-recommends ninja-build
# - name: Test beautiful_mnist in torch with TINY_BACKEND
# run: STEPS=20 CPU=1 TARGET_EVAL_ACC_PCT=90.0 TINY_BACKEND=1 python3 examples/other_mnist/beautiful_mnist_torch.py
# - name: Test some torch tests (expect failure)
# run: python3 -m pytest extra/torch_backend/torch_tests.py -v --tb=no || true
# Additional torch backend tests: runs on ubuntu-latest with a 15 minute timeout.
torchbackendmore:
name: Torch Backend Tests More
runs-on: ubuntu-latest
timeout-minutes: 15
steps:
- name: Checkout Code
uses: actions/checkout@v4
# Repository-local setup action; passes the testing_minimal deps group and llvm: 'true'
# (no pydeps are given here).
- name: Setup Environment
uses: ./.github/actions/setup-tinygrad
with:
key: torch-backend-pillow-torchvision-et-pt
deps: testing_minimal
llvm: 'true'
# A failing `apt update` is tolerated (`|| true`); the ninja-build install is not guarded.
- name: Install ninja
run: |
sudo apt update || true
sudo apt install -y --no-install-recommends ninja-build
- name: Test beautiful_mnist in torch with TINY_BACKEND
run: STEPS=20 CPU=1 TARGET_EVAL_ACC_PCT=90.0 TINY_BACKEND=1 python3 examples/other_mnist/beautiful_mnist_torch.py
# `|| true` discards the pytest exit status, so failures here do not fail the step.
- name: Test some torch tests (expect failure)
run: python3 -m pytest extra/torch_backend/torch_tests.py -v --tb=no || true
bepython:
name: Python Backend
@ -236,6 +236,7 @@ jobs:
pip3 install --upgrade --force-reinstall ruff==0.11.0
python3 -m ruff check .
python3 -m ruff check examples/mlperf/ --ignore E501
python3 -m ruff check extra/thunder/tiny/ --ignore E501 --ignore F841 --ignore E722
- name: Run mypy
run: |
python -m mypy --strict-equality --lineprecision-report .
@ -261,7 +262,9 @@ jobs:
- name: Check Device.DEFAULT
run: python -c "from tinygrad import Device; assert Device.DEFAULT == 'CPU', Device.DEFAULT"
- name: Run unit tests
run: CPU=1 python -m pytest -n=auto test/unit/ --durations=20
# Multi-line form: TestRunAsModule.test_module_runs is run directly on its own first,
# then deselected from the pytest run over test/unit/ so it does not run twice.
run: |
CPU=1 python test/unit/test_device.py TestRunAsModule.test_module_runs
CPU=1 python -m pytest -n=auto test/unit/ --durations=20 --deselect=test/unit/test_device.py::TestRunAsModule::test_module_runs
- name: Run targetted tests on NULL backend
run: NULL=1 python3 -m unittest test.test_multitensor.TestMultiTensor.test_data_parallel_resnet_train_step test/device/test_null.py
# TODO: too slow
@ -287,8 +290,8 @@ jobs:
python extra/optimization/extract_dataset.py
gzip -c /tmp/sops > extra/datasets/sops.gz
#DEBUG=1 MIN_ASTS=1 python extra/optimization/get_action_space.py
- name: Repo line count < 19000 lines
run: MAX_LINE_COUNT=19000 python sz.py
- name: Repo line count < 20000 lines
run: MAX_LINE_COUNT=20000 python sz.py
spec:
strategy:
@ -306,8 +309,9 @@ jobs:
with:
key: spec-unit
deps: testing_unit
python-version: '3.14'
- name: Test SPEC=2
run: IGNORE_OOB=0 SPEC=2 PYTHONPATH="." pytest --maxfail=10 -n auto --durations=30 --ignore=test/models --ignore test/unit/test_hashing.py --timeout 60 -k "not test_setitem_big" --splits 2 --group ${{ matrix.group }}
run: IGNORE_OOB=0 SPEC=2 PYTHONPATH="." pytest --maxfail=10 -n auto --durations=30 --ignore=test/models --ignore test/test_custom_kernel.py --ignore test/unit/test_hashing.py --timeout 60 -k "not test_setitem_big" --splits 2 --group ${{ matrix.group }}
fuzzing:
name: Fuzzing
@ -323,6 +327,8 @@ jobs:
deps: testing_unit
- name: Fuzz Test symbolic
run: python test/external/fuzz_symbolic.py
- name: Fuzz Test symbolic (symbolic divisors)
run: python test/external/fuzz_symbolic_symbolic_div.py
- name: Fuzz Test fast idiv
run: python test/external/fuzz_fast_idiv.py
- name: Fuzz Test shape ops
@ -442,7 +448,7 @@ jobs:
with:
key: onnxoptl
deps: testing
pydeps: "tensorflow==2.15.1 tensorflow_addons"
pydeps: "tensorflow==2.19"
python-version: '3.11'
opencl: 'true'
- name: Test ONNX (CL)
@ -460,7 +466,7 @@ jobs:
- name: Test Bert training
run: NULL=1 DEFAULT_FLOAT=HALF BENCHMARK=10 BS=24 GPUS=4 BERT_LAYERS=2 MODEL=bert python3 examples/mlperf/model_train.py
- name: Test llama 3 training
run: NULL=1 SAMPLES=300 BS=8 SEQLEN=512 GRADIENT_ACC_STEPS=8 FAKEDATA=1 DEFAULT_FLOAT=bfloat16 OPTIM_DTYPE=bfloat16 LLAMA3_SIZE=1B MODEL=llama3 python3 examples/mlperf/model_train.py
run: NULL=1 SAMPLES=300 BS=8 SEQLEN=512 GRADIENT_ACC_STEPS=1 FAKEDATA=1 DEFAULT_FLOAT=bfloat16 OPTIM_DTYPE=bfloat16 LLAMA3_SIZE=1B MODEL=llama3 python3 examples/mlperf/model_train.py
- name: Run process replay tests
uses: ./.github/actions/process-replay
@ -716,71 +722,6 @@ jobs:
- name: Run process replay tests
uses: ./.github/actions/process-replay
# Job: exercises the REMOTE device against locally started server processes (ubuntu-22.04, 20 minute cap).
amdremote:
name: Linux (remote)
runs-on: ubuntu-22.04
timeout-minutes: 20
# REMOTE=1 is set for every step of the job.
env:
REMOTE: 1
steps:
- name: Checkout Code
uses: actions/checkout@v4
- name: Setup Environment
uses: ./.github/actions/setup-tinygrad
with:
key: linux-remote
deps: testing_minimal
amd: 'true'
llvm: 'true'
opencl: 'true'
# Launches four transient systemd user units, each running tinygrad/runtime/ops_remote.py with
# MOCKGPU=1 and its own REMOTEDEV/PORT: AMD on 6667 and 6668, CL on 7667, CPU on 8667.
- name: Start remote server
run: |
start_server() {
systemd-run --user \
--unit="$1" \
--setenv=REMOTEDEV="$2" \
--setenv=MOCKGPU=1 \
--setenv=PYTHONPATH=. \
--setenv=PORT="$3" \
--working-directory="$(pwd)" \
python tinygrad/runtime/ops_remote.py
}
start_server "remote-server-amd-1" "AMD" 6667
start_server "remote-server-amd-2" "AMD" 6668
start_server "remote-server-gpu" "CL" 7667
start_server "remote-server-cpu" "CPU" 8667
# HOST names both AMD servers; the `*6` suffix is presumably a per-server device count — TODO confirm.
# Asserts the default device is REMOTE and that the device behind it reports AMD, then runs one test at DEBUG=4.
- name: Check Device.DEFAULT and print some source
env:
HOST: 127.0.0.1:6667*6,127.0.0.1:6668*6
run: |
python -c "from tinygrad import Device; assert Device.DEFAULT == 'REMOTE', Device.DEFAULT"
python -c "from tinygrad import Device; assert Device.default.properties.real_device == 'AMD', Device.default.properties.real_device"
DEBUG=4 python3 test/test_tiny.py TestTiny.test_plus
- name: Run REMOTE=1 Test (AMD)
env:
HOST: 127.0.0.1:6667*6,127.0.0.1:6668*6
run: |
python3 -m pytest test/test_tiny.py test/test_jit.py test/test_subbuffer.py test/test_graph.py test/test_multitensor.py test/test_remote.py test/test_tensor_variable.py --durations 20
# Same idea against the CL server (port 7667); the second command repeats two test files with IMAGE=2.
- name: Run REMOTE=1 Test (CL)
env:
HOST: 127.0.0.1:7667*6
run: |
python3 -m pytest test/test_tiny.py test/test_image_dtype.py test/test_jit.py --durations 20
IMAGE=2 python3 -m pytest test/test_tiny.py test/test_image_dtype.py
# Same idea against the CPU server (port 8667).
- name: Run REMOTE=1 Test (CPU)
env:
HOST: 127.0.0.1:8667*6
run: |
python3 -m pytest test/test_tiny.py test/test_jit.py test/test_multitensor.py --durations 20
# `if: always()` dumps the journal of every server unit even when an earlier step failed.
- name: Show remote server logs
if: always()
run: |
journalctl --user -u remote-server-amd-1 --no-pager
journalctl --user -u remote-server-amd-2 --no-pager
journalctl --user -u remote-server-gpu --no-pager
journalctl --user -u remote-server-cpu --no-pager
# ****** OSX Tests ******
testmetal:
@ -878,30 +819,6 @@ jobs:
- name: Test ONNX Runner (WEBGPU)
run: WEBGPU=1 python3 test/external/external_test_onnx_runner.py
# Job: REMOTE device tests on macos-15 (10 minute cap).
osxremote:
name: MacOS (remote metal)
runs-on: macos-15
timeout-minutes: 10
# Both variables apply to every step; no separate server-start step exists in this job.
env:
REMOTE: 1
REMOTEDEV: METAL
steps:
- name: Checkout Code
uses: actions/checkout@v4
- name: Setup Environment
uses: ./.github/actions/setup-tinygrad
with:
key: macos-remote
deps: testing_minimal
# Asserts the default device is REMOTE and that the device behind it reports METAL, then runs one test at DEBUG=4.
- name: Check Device.DEFAULT and print some source
run: |
python -c "from tinygrad import Device; assert Device.DEFAULT == 'REMOTE', Device.DEFAULT"
python -c "from tinygrad import Device; assert Device.default.properties.real_device == 'METAL', Device.default.properties.real_device"
DEBUG=4 python3 test/test_tiny.py TestTiny.test_plus
- name: Run REMOTE=1 Test
run: |
python3 -m pytest test/test_tiny.py test/test_jit.py test/test_subbuffer.py test/test_graph.py test/test_multitensor.py test/test_tensor_variable.py
osxtests:
strategy:
fail-fast: false
@ -967,3 +884,33 @@ jobs:
run: |
python -c "from tinygrad import Device; assert Device.DEFAULT == {'LLVM':'CPU'}.get(x:='${{ matrix.backend }}'.upper(), x), Device.DEFAULT"
python -m pytest -n=auto test/test_tiny.py test/test_ops.py --durations=20
# ****** Compile-only Tests ******
# Job: runs test_ops with NULL=1 plus a backend-specific flag, once per matrix backend (ir3, nak).
compiletests:
strategy:
# fail-fast disabled: one backend failing does not cancel the other matrix entry.
fail-fast: false
matrix:
backend: [ir3, nak]
name: Compile-only (${{ matrix.backend }})
runs-on: ubuntu-24.04
timeout-minutes: 15
steps:
- name: Checkout Code
uses: actions/checkout@v4
- name: Setup Environment
uses: ./.github/actions/setup-tinygrad
with:
key: compile-${{ matrix.backend }}
deps: testing_minimal
# Evaluates to 'true' for both backends currently listed in the matrix.
mesa: ${{ (matrix.backend == 'ir3' || matrix.backend == 'nak') && 'true' }}
python-version: '3.14'
# Appends NULL=1 and the per-backend flag (NULL_IR3=1 or NULL_NAK=1) to $GITHUB_ENV so later steps inherit them.
- name: Set env
shell: bash
run: printf "NULL=1\n${{ matrix.backend == 'ir3' && 'NULL_IR3=1' || matrix.backend == 'nak' && 'NULL_NAK=1' }}" >> $GITHUB_ENV
# Asserts the default device is NULL, runs a single op test at DEBUG=4, then runs all of test_ops under pytest.
- name: Run test_ops
shell: bash
run: |
python -c "from tinygrad import Device; assert Device.DEFAULT == 'NULL'"
DEBUG=4 python3 test/test_ops.py TestOps.test_add
python -m pytest -n=auto test/test_ops.py --durations=20

View file

@ -27,8 +27,8 @@ repos:
always_run: true
pass_filenames: false
- id: tests
name: subset of tests
entry: env OMP_NUM_THREADS=1 PYTHONPATH="." python3 -m pytest -n=6 test/test_ops.py test/test_dtype.py test/test_schedule.py test/test_assign.py
name: comprehensive test suite
entry: env OMP_NUM_THREADS=1 SKIP_SLOW_TEST=1 PYTHONPATH="." python3 -m pytest -n=6 test/test_ops.py test/test_schedule.py test/test_assign.py test/test_tensor.py test/test_jit.py test/unit/test_schedule_cache.py test/unit/test_pattern_matcher.py test/unit/test_uop_symbolic.py test/unit/test_helpers.py
language: system
always_run: true
pass_filenames: false

227
CLAUDE.md Normal file
View file

@ -0,0 +1,227 @@
# Claude Code Guide for tinygrad
## Architecture Overview
tinygrad compiles tensor operations into optimized kernels. The pipeline:
1. **Tensor** (`tensor.py`) - User-facing API, creates UOp graph
2. **UOp** (`uop/ops.py`) - Unified IR for all operations (both tensor and kernel level)
3. **Schedule** (`engine/schedule.py`, `schedule/`) - Converts tensor UOps to kernel UOps
4. **Codegen** (`codegen/`) - Converts kernel UOps to device code
5. **Runtime** (`runtime/`) - Device-specific execution
## Key Concepts
### UOp (Universal Operation)
Everything is a UOp - tensors, operations, buffers, kernels. Key properties:
- `op`: The operation type (Ops enum)
- `dtype`: Data type
- `src`: Tuple of source UOps
- `arg`: Operation-specific argument
- `tag`: Optional tag for graph transformations
UOps are **immutable and cached** - creating the same UOp twice returns the same object (ucache).
### PatternMatcher
Used extensively for graph transformations:
```python
pm = PatternMatcher([
(UPat(Ops.ADD, src=(UPat.cvar("x"), UPat.cvar("x"))), lambda x: x * 2),
])
result = graph_rewrite(uop, pm)
```
### Schedule Cache
Schedules are cached by graph structure. BIND nodes (variables with bound values) are unbound before cache key computation so different values hit the same cache.
## Directory Structure
```
tinygrad/
├── tensor.py # Tensor class, user API
├── device.py # Buffer, device management
├── dtype.py # Data types
├── helpers.py # Utilities, environment vars
├── uop/
│ ├── ops.py # UOp class, Ops enum, PatternMatcher
│ ├── spec.py # UOp type verification
│ └── symbolic.py # Symbolic math simplification
├── engine/
│ ├── schedule.py # Schedule creation, caching
│ ├── realize.py # Tensor realization
│ ├── jit.py # JIT compilation
│ └── memory.py # Memory planning
├── schedule/
│ ├── rangeify.py # Convert movements to ranges
│ └── indexing.py # Index calculations
├── codegen/
│ ├── kernel.py # Kernel optimization
│ └── uopgraph.py # UOp graph transformations
├── renderer/ # Code generation (CUDA, Metal, etc.)
└── runtime/ # Device backends
```
## Testing
```bash
# Run specific test
python -m pytest test/unit/test_schedule_cache.py -xvs
# Run with timeout
python -m pytest test/test_symbolic_ops.py -x --timeout=60
# Debug with print
DEBUG=2 python -m pytest test/test_schedule.py::test_name -xvs
# Visualize UOp graphs
VIZ=1 python -c "from tinygrad import Tensor; Tensor.ones(10).sum().realize()"
```
## Common Environment Variables
- `DEBUG=1-4` - Increasing verbosity
- `VIZ=1` - Enable graph visualization
- `SPEC=1` - Enable UOp spec verification
- `NOOPT=1` - Disable optimizations
- `DEVICE=CPU/CUDA/AMD/METAL` - Set default device
## Debugging Tips
1. **Print UOp graphs**: `print(tensor.uop)` or `print(tensor.uop.sink())`
2. **Check schedule**: `tensor.schedule()` returns list of ExecItems
3. **Trace graph rewrites**: Use `VIZ=1` or add print in PatternMatcher callbacks
4. **Find UOps by type**: `[u for u in uop.toposort() if u.op is Ops.SOMETHING]`
## Workflow Rules
- **NEVER commit without explicit user approval** - always show the diff and wait for approval
- **NEVER amend commits** - always create a new commit instead
- Run `pre-commit run --all-files` before committing to catch linting/type errors
- Run tests before proposing commits
- Test with `SPEC=2` when modifying UOp-related code
## Style Notes
- 2-space indentation, 150 char line limit
- PatternMatchers should be defined at module level (slow to construct)
- Prefer `graph_rewrite` over manual graph traversal
- UOp methods like `.replace()` preserve tags unless explicitly changed
- Use `.rtag(value)` to add tags to UOps
## Lessons Learned
### UOp ucache Behavior
UOps are cached by their contents - creating a UOp with identical (op, dtype, src, arg) returns the **same object**. This means:
- `uop.replace(tag=None)` on a tagged UOp returns the original untagged UOp if it exists in cache
- Two UOps with same structure are identical (`is` comparison works)
### Spec Validation
When adding new UOp patterns, update `tinygrad/uop/spec.py`. Test with:
```bash
SPEC=2 python3 test/unit/test_something.py
```
Spec issues appear as `RuntimeError: SPEC ISSUE None: UOp(...)`.
### Schedule Cache Key Normalization
The schedule cache strips values from BIND nodes so different bound values (e.g., KV cache positions) hit the same cache entry:
- `pm_pre_sched_cache`: BIND(DEFINE_VAR, CONST) → BIND(DEFINE_VAR) for cache key
- `pm_post_sched_cache`: restores original BIND from context
- When accessing `bind.src[1]`, check `len(bind.src) > 1` first (might be stripped)
- Extract var_vals from `input_buffers` dict after graph_rewrite (avoids extra toposort)
### Avoiding Extra Work
- Use ctx dict from graph_rewrite to collect info during traversal instead of separate toposort
- Only extract var_vals when schedule is non-empty (no kernels = no vars needed)
- PatternMatchers are slow to construct - define at module level, not in functions
### Readability Over Speed
Don't add complexity for marginal performance gains. Simpler code that's slightly slower is often better:
```python
# BAD: "optimized" with extra complexity
if has_afters:  # skip toposort if no AFTERs
  after_map = [(u, u.buf_uop) for u in big_sink.toposort() if u.op is Ops.AFTER]
# GOOD: simple, always works
after_map = [(u, u.buf_uop) for u in big_sink.toposort() if u.op is Ops.AFTER]
```
The conditional check adds complexity, potential bugs, and often negligible speedup. Only optimize when profiling shows a real bottleneck.
### Testing LLM Changes
```bash
# Quick smoke test
echo "Hello" | DEBUG=1 python tinygrad/apps/llm.py --model "llama3.2:1b"
# Check cache hits (should see "cache hit" after warmup)
echo "Hello world" | DEBUG=1 python tinygrad/apps/llm.py --model "llama3.2:1b" 2>&1 | grep cache
# Test with beam search
echo "Hello" | BEAM=2 python tinygrad/apps/llm.py --model "llama3.2:1b"
```
## Common Patterns
### Graph Transformation
```python
def my_transform(ctx, x):
  # Return new UOp or None to skip
  return x.replace(arg=new_arg)
pm = PatternMatcher([
(UPat(Ops.SOMETHING, name="x"), my_transform),
])
result = graph_rewrite(input_uop, pm, ctx={})
```
### Finding Variables
```python
# Get all variables in a UOp graph
variables = uop.variables()
# Get bound variable values
var, val = bind_uop.unbind()
```
### Shape Handling
```python
# Shapes can be symbolic (contain UOps)
shape = tensor.shape # tuple[sint, ...] where sint = int | UOp
```
## Performance Optimization
When optimizing tinygrad internals:
1. **Measure wall time, not just call counts** - Reducing `graph_rewrite` calls doesn't always improve wall time. The overhead of conditional checks can exceed the cost of the operation being skipped.
2. **Profile each optimization individually** - Run benchmarks with and without each change to measure actual impact. Use `test/external/external_benchmark_schedule.py` for schedule/rewrite timing.
3. **Early exits in hot paths are effective** - Simple checks like `if self.op is Ops.CONST: return self` in `simplify()` can eliminate many unnecessary `graph_rewrite` calls.
4. **`graph_rewrite` is expensive** - Each call has overhead even for small graphs. Avoid calling it when the result is trivially known (e.g., simplifying a CONST returns itself).
5. **Beware iterator overhead** - Checks like `all(x.op is Ops.CONST for x in self.src)` can be slower than just running the operation, especially for small sequences.
6. **Verify cache hit rates before adding/keeping caches** - Measure actual hit rates with real workloads. A cache with 0% hit rate is pure overhead (e.g., `pm_cache` was removed because the algorithm guarantees each UOp is only passed to `pm_rewrite` once).
7. **Use `TRACK_MATCH_STATS=2` to profile pattern matching** - This shows match rates and time per pattern. Look for patterns with 0% match rate that still cost significant time - these are pure overhead for that workload.
8. **Cached properties beat manual traversal** - `backward_slice` uses `@functools.cached_property`. A DFS with early-exit sounds faster but is actually slower because it doesn't benefit from caching. The cache hit benefit often outweighs algorithmic improvements.
9. **Avoid creating intermediate objects in hot paths** - For example, `any(x.op in ops for x in self.backward_slice)` is faster than `any(x.op in ops for x in {self:None, **self.backward_slice})` because it avoids dict creation.
## Pattern Matching Profiling
Use `TRACK_MATCH_STATS=2` to identify expensive patterns:
```bash
TRACK_MATCH_STATS=2 PYTHONPATH="." python3 test/external/external_benchmark_schedule.py
```
Output format: `matches / attempts -- match_time / total_time ms -- location`
Key patterns to watch (from ResNet50 benchmark):
- `split_load_store`: ~146ms, 31% match rate - does real work
- `simplify_valid`: ~75ms, 0% match rate in this workload - checks AND ops for INDEX in backward slice
- `vmin==vmax folding`: ~55ms, 0.33% match rate - checks 52K ops but rarely matches
Patterns with 0% match rate are workload-specific overhead. They may be useful in other workloads, so don't remove them without understanding their purpose.

View file

@ -1,135 +0,0 @@
# tinygrad is a tensor library, and as a tensor library it has multiple parts
# 1. a "runtime". this allows buffer management, compilation, and running programs
# 2. a "Device" that uses the runtime but specifies compute in an abstract way for all
# 3. a "UOp" that fuses the compute into kernels, using memory only when needed
# 4. a "Tensor" that provides an easy to use frontend with autograd ".backward()"
# Section 1: drive the CPU runtime directly: allocate raw buffers, compile C source, run it.
print("******** first, the runtime ***********")
# NOTE(review): CPUProgram is imported here but not referenced in this section.
from tinygrad.runtime.ops_cpu import ClangJITCompiler, CPUDevice, CPUProgram
cpu = CPUDevice()
# allocate some buffers
# the argument 4 is presumably a size in bytes (each copy below writes exactly 4 bytes) — TODO confirm
out = cpu.allocator.alloc(4)
a = cpu.allocator.alloc(4)
b = cpu.allocator.alloc(4)
# load in some values (little endian)
cpu.allocator._copyin(a, memoryview(bytearray([2,0,0,0])))
cpu.allocator._copyin(b, memoryview(bytearray([3,0,0,0])))
# compile a program to a binary
lib = ClangJITCompiler().compile("void add(int *out, int *a, int *b) { out[0] = a[0] + b[0]; }")
# create a runtime for the program
# "add" matches the name of the C function in the source compiled above
fxn = cpu.runtime("add", lib)
# run the program
fxn(out, a, b)
# check the data out
# the output buffer is reinterpreted as unsigned 32-bit ints ("I") and its first element is read
print(val := cpu.allocator._as_buffer(out).cast("I").tolist()[0])
assert val == 5
# Section 2: the same 2 + 3 addition, described as UOps and run through the Device abstraction.
print("******** second, the Device ***********")
DEVICE = "CPU" # NOTE: you can change this!
import struct
from tinygrad.dtype import dtypes
from tinygrad.device import Buffer, Device
from tinygrad.uop.ops import UOp, Ops
# allocate some buffers + load in values
# struct.pack("I", n) yields the raw bytes of n as an unsigned int in native byte order
out = Buffer(DEVICE, 1, dtypes.int32).allocate()
a = Buffer(DEVICE, 1, dtypes.int32).allocate().copyin(memoryview(bytearray(struct.pack("I", 2))))
b = Buffer(DEVICE, 1, dtypes.int32).allocate().copyin(memoryview(bytearray(struct.pack("I", 3))))
# NOTE: a._buf is the same as the return from cpu.allocator.alloc
# describe the computation
# the DEFINE_GLOBAL args (0, 1, 2) presumably line up with the [out, a, b] order passed to exec below — TODO confirm
idx = UOp.const(dtypes.index, 0)
buf_1 = UOp(Ops.DEFINE_GLOBAL, dtypes.int32.ptr(), (), 1)
buf_2 = UOp(Ops.DEFINE_GLOBAL, dtypes.int32.ptr(), (), 2)
alu = buf_1.index(idx) + buf_2.index(idx)
output_buf = UOp(Ops.DEFINE_GLOBAL, dtypes.int32.ptr(), (), 0)
st_0 = UOp(Ops.STORE, dtypes.void, (output_buf.index(idx), alu))
s = UOp(Ops.SINK, dtypes.void, (st_0,))
# convert the computation to a "linearized" format (print the format)
from tinygrad.engine.realize import get_program, CompiledRunner
program = get_program(s, Device[DEVICE].renderer)
# compile a program (and print the source)
fxn = CompiledRunner(program)
print(fxn.p.src)
# NOTE: fxn.clprg is the CPUProgram
# run the program
fxn.exec([out, a, b])
# check the data out
assert out.as_buffer().cast('I')[0] == 5
# Section 3: let the scheduler turn a plain UOp expression into a kernel, then run it.
# Relies on DEVICE, dtypes, struct, UOp and Ops defined earlier in this script.
print("******** third, the UOp ***********")
from tinygrad.engine.realize import run_schedule
from tinygrad.engine.schedule import create_schedule_with_vars
from tinygrad.schedule.rangeify import get_rangeify_map
# allocate some values + load in values
a = UOp.new_buffer(DEVICE, 1, dtypes.int32)
b = UOp.new_buffer(DEVICE, 1, dtypes.int32)
a.buffer.allocate().copyin(memoryview(bytearray(struct.pack("I", 2))))
b.buffer.allocate().copyin(memoryview(bytearray(struct.pack("I", 3))))
# describe the computation
out = a + b
s = UOp(Ops.SINK, dtypes.void, (out,))
# group the computation into kernels
becomes_map = get_rangeify_map(s)
# the compute maps to an assign
# the key is rebuilt as `a+b` rather than reusing `out`; presumably equivalent because identical UOps are deduplicated — TODO confirm
assign = becomes_map[a+b].base
# the first source is the output buffer (data)
assert assign.src[0].op is Ops.BUFFER
# the second source is the kernel (compute)
assert assign.src[1].op is Ops.KERNEL
# schedule the kernel graph in a linear list
s = UOp(Ops.SINK, dtypes.void, (assign,))
sched, _ = create_schedule_with_vars(s)
assert len(sched) == 1
# DEBUGGING: print the compute ast
print(sched[-1].ast)
# NOTE: sched[-1].ast is the same as st_0 above
# the output will be stored in a new buffer
# `out` is rebound here from the a + b expression to the assign's destination buffer UOp
out = assign.buf_uop
assert out.op is Ops.BUFFER and not out.buffer.is_allocated()
print(out)
# run that schedule
run_schedule(sched)
# check the data out
assert out.is_realized and out.buffer.as_buffer().cast('I')[0] == 5
# Section 4: the same addition through the high-level Tensor frontend.
print("******** fourth, the Tensor ***********")
from tinygrad import Tensor
# two single-element int32 tensors on the device chosen earlier in the script
lhs = Tensor([2], dtype=dtypes.int32, device=DEVICE)
rhs = Tensor([3], dtype=dtypes.int32, device=DEVICE)
total = lhs + rhs
# read the scalar result back into Python, show it, and verify it
val = total.item()
print(val)
assert val == 5

View file

@ -38,25 +38,19 @@ optim.schedule_step() # this will step the optimizer without running realize
# The weight Tensors have been assigned to, but not yet realized. Everything is still lazy at this point
# l1.uop and l2.uop define a computation graph
from tinygrad.engine.schedule import ScheduleItem
schedule: List[ScheduleItem] = Tensor.schedule(l1, l2)
from tinygrad.engine.schedule import ExecItem
schedule: List[ExecItem] = Tensor.schedule(l1, l2)
print(f"The schedule contains {len(schedule)} items.")
for si in schedule: print(str(si)[:80])
# *****
# 4. Lower a schedule.
# 4. Lower and run the schedule.
from tinygrad.engine.realize import lower_schedule_item, ExecItem
lowered: List[ExecItem] = [lower_schedule_item(si) for si in tqdm(schedule)]
for si in tqdm(schedule): si.run()
# *****
# 5. Run the schedule
for ei in tqdm(lowered): ei.run()
# *****
# 6. Print the weight change
# 5. Print the weight change
print("first weight change\n", l1.numpy()-l1n)
print("second weight change\n", l2.numpy()-l2n)

View file

@ -17,15 +17,15 @@ The `UOp` graph specifies the compute in terms of low level tinygrad ops. Not al
## Scheduling
The [scheduler](https://github.com/tinygrad/tinygrad/tree/master/tinygrad/engine/schedule.py) converts the graph of UOps into a list of `ScheduleItem`. One `ScheduleItem` is one kernel on the GPU, and the scheduler is responsible for breaking the large compute graph into subgraphs that can fit in a kernel. `ast` specifies what compute to run, and `bufs` specifies what buffers to run it on.
The [scheduler](https://github.com/tinygrad/tinygrad/tree/master/tinygrad/engine/schedule.py) converts the graph of UOps into a list of `ExecItem`. One `ExecItem` is one kernel on the GPU, and the scheduler is responsible for breaking the large compute graph into subgraphs that can fit in a kernel. `ast` specifies what compute to run, and `bufs` specifies what buffers to run it on.
::: tinygrad.engine.schedule.ScheduleItem
::: tinygrad.engine.schedule.ExecItem
## Lowering
The code in [realize](https://github.com/tinygrad/tinygrad/tree/master/tinygrad/engine/realize.py) lowers `ScheduleItem` to `ExecItem` with
The code in [realize](https://github.com/tinygrad/tinygrad/tree/master/tinygrad/engine/realize.py) lowers `ExecItem` by populating its `prg` field with
::: tinygrad.engine.realize.lower_schedule
::: tinygrad.engine.realize.run_schedule
There's a ton of complexity hidden behind this, see the `codegen/` directory.

View file

@ -131,7 +131,7 @@ timeit.repeat(jit_step, repeat=5, number=1)
1.0 ms is 75x faster! Note that we aren't syncing the GPU, so GPU time may be slower.
The slowness the first two times is the JIT capturing the kernels. And this JIT will not run any Python in the function, it will just replay the tinygrad kernels that were run, so be aware that non tinygrad Python operations won't work. Randomness functions work as expected.
The first two runs of the function execute normally, with the JIT capturing the kernels. Starting from the third run, only the tinygrad operations are replayed, removing the overhead by skipping Python code execution. So be aware that any non-tinygrad Python values affecting the kernels will be "frozen" from the second run. Note that `Tensor` randomness functions work as expected.
Unlike other JITs, we JIT everything, including the optimizer. Think of it as a dumb replay on different data.

View file

@ -1,293 +0,0 @@
#!/usr/bin/env python3
# this file is a "ramp" for people new to tinygrad to think about how to approach it
# it is runnable and editable.
# whenever you see stuff like DEBUG=2 or CPU=1 discussed, these are environment variables
# in a unix shell like bash `DEBUG=2 CPU=1 python docs/ramp.py`
# this pip installs tinygrad master for the system
# the -e allows you to edit the tinygrad folder and update system tinygrad
# tinygrad is pure Python, so you are encouraged to do this
# git pull in the tinygrad directory will also get you the latest
"""
git clone https://github.com/tinygrad/tinygrad.git
cd tinygrad
python3 -m pip install -e .
"""
# %% ********
print("******* PART 1 *******")
# we start with a Device.
# a Device is where Tensors are stored and compute is run
# tinygrad autodetects the best device on your system and makes it the DEFAULT
from tinygrad import Device
print(Device.DEFAULT) # on Mac, you can see this prints METAL
# now, let's create a Tensor
from tinygrad import Tensor, dtypes
t = Tensor([1,2,3,4])
# you can see this Tensor is on the DEFAULT device with int dtype and shape (4,)
assert t.device == Device.DEFAULT
assert t.dtype == dtypes.int
assert t.shape == (4,)
# unlike in torch, if we print it, it doesn't print the contents
# this is because tinygrad is lazy
# this Tensor has not been computed yet
print(t)
# <Tensor <UOp METAL (4,) int (<Ops.COPY: 7>, None)> on METAL with grad None>
# the ".uop" property on Tensor contains the specification of how to compute it
print(t.uop)
"""
UOp(Ops.COPY, dtypes.int, arg=None, src=(
UOp(Ops.BUFFER, dtypes.int, arg=4, src=(
UOp(Ops.UNIQUE, dtypes.void, arg=0, src=()),
UOp(Ops.DEVICE, dtypes.void, arg='PYTHON', src=()),)),
UOp(Ops.DEVICE, dtypes.void, arg='METAL', src=()),))
"""
# as you can see, it's specifying a copy from PYTHON device
# which is where the [1,2,3,4] array lives
# UOps are the specification language in tinygrad
# they are immutable and form a DAG
# they have an "Ops", a "dtype", a tuple of srcs (parents), and an arg
t.realize()
# if we want to "realize" a tensor, we can with the "realize" method
# now when we look at the uop, it's changed
print(t.uop)
"""
UOp(Ops.BUFFER, dtypes.int, arg=4, src=(
UOp(Ops.UNIQUE, dtypes.void, arg=1, src=()),
UOp(Ops.DEVICE, dtypes.void, arg='METAL', src=()),))
"""
# the copy was actually run, and now the "uop" of the Tensor is just a BUFFER
# if you run this script with DEBUG=2 in the environment, you can see the copy happen
# *** METAL 1 copy 16, METAL <- PYTHON ...
# now let's do some compute
# we look at the uop to see the specification of the compute
t_times_2 = t * 2
print(t_times_2.uop)
"""
UOp(Ops.MUL, dtypes.int, arg=None, src=(
UOp(Ops.BUFFER, dtypes.int, arg=4, src=(
UOp(Ops.UNIQUE, dtypes.void, arg=1, src=()),
x2:=UOp(Ops.DEVICE, dtypes.void, arg='METAL', src=()),)),
UOp(Ops.EXPAND, dtypes.int, arg=(4,), src=(
UOp(Ops.RESHAPE, dtypes.int, arg=(1,), src=(
UOp(Ops.CONST, dtypes.int, arg=2, src=(
UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(), strides=(), offset=0, mask=None, contiguous=True),)), src=(
x2,)),)),)),)),))
"""
# the BUFFER from above is being multiplied by a CONST 2
# it's RESHAPEd and EXPANDed to broadcast the CONST to the BUFFER
# we can check the result with
assert t_times_2.tolist() == [2, 4, 6, 8]
# UOps are both immutable and globally unique
# if I multiply the Tensor by 4 twice, these result Tensors will have the same uop specification
t_times_4_try_1 = t * 4
t_times_4_try_2 = t * 4
assert t_times_4_try_1.uop is t_times_4_try_2.uop
# the specification isn't just the same, it's the exact same Python object
assert t_times_4_try_1 is not t_times_4_try_2
# the Tensor is a different Python object
# if we realize `t_times_4_try_1` ...
t_times_4_try_1.realize()
print(t_times_4_try_2.uop)
"""
UOp(Ops.BUFFER, dtypes.int, arg=4, src=(
UOp(Ops.UNIQUE, dtypes.void, arg=4, src=()),
UOp(Ops.DEVICE, dtypes.void, arg='METAL', src=()),))
"""
# ... `t_times_4_try_2` also becomes the same BUFFER
assert t_times_4_try_1.uop is t_times_4_try_2.uop
# so this print doesn't require any computation, just a copy back to the CPU so we can print it
print("** only the copy start")
print(t_times_4_try_2.tolist()) # [4, 8, 12, 16]
print("** only the copy end")
# you can confirm this with DEBUG=2, seeing what's printed in between the "**" prints
# tinygrad has an auto differentiation engine that operates according to these same principles
# the derivative of "log(x)" is "1/x", and you can see this on line 20 of gradient.py
t_float = Tensor([3.0])
t_log = t_float.log()
# .gradient returns one gradient per requested tensor; the trailing comma unpacks the single result
t_log_grad, = t_log.sum().gradient(t_float)
# due to how log is implemented, this gradient contains a lot of UOps
print(t_log_grad.uop)
# ...not shown here...
# but if you run with DEBUG=4 (CPU=1 used here for simpler code), you can see the generated code
"""
void E_(float* restrict data0, float* restrict data1) {
  float val0 = *(data1+0);
  *(data0+0) = (1/val0);
}
"""
# the derivative is close to 1/3
# compare the absolute error: without abs() any value below 1/3 (even a wildly wrong one) would pass
assert abs(t_log_grad.item() - 1/3) < 1e-6
# %% ********
print("******* PART 2 *******")
# we redefine the same t here so this cell can run on its own
from tinygrad import Tensor
t = Tensor([1,2,3,4])
# what's above gives you enough of an understanding to go use tinygrad as a library
# however, a lot of the beauty of tinygrad is in how easy it is to interact with the internals
# NOTE: the APIs here are subject to change
t_plus_3_plus_4 = t + 3 + 4
print(t_plus_3_plus_4.uop)
"""
UOp(Ops.ADD, dtypes.int, arg=None, src=(
UOp(Ops.ADD, dtypes.int, arg=None, src=(
UOp(Ops.BUFFER, dtypes.int, arg=4, src=(
UOp(Ops.UNIQUE, dtypes.void, arg=1, src=()),
x3:=UOp(Ops.DEVICE, dtypes.void, arg='CPU', src=()),)),
UOp(Ops.EXPAND, dtypes.int, arg=(4,), src=(
UOp(Ops.RESHAPE, dtypes.int, arg=(1,), src=(
UOp(Ops.CONST, dtypes.int, arg=3, src=(
x7:=UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(), strides=(), offset=0, mask=None, contiguous=True),)), src=(
x3,)),)),)),)),)),
UOp(Ops.EXPAND, dtypes.int, arg=(4,), src=(
UOp(Ops.RESHAPE, dtypes.int, arg=(1,), src=(
UOp(Ops.CONST, dtypes.int, arg=4, src=(
x7,)),)),)),))
"""
# you can see it's adding both 3 and 4
# but by the time we are actually running the code, it's adding 7
# `kernelize` will simplify and group the operations in the graph into kernels
t_plus_3_plus_4.kernelize()
print(t_plus_3_plus_4.uop)
"""
UOp(Ops.ASSIGN, dtypes.int, arg=None, src=(
x0:=UOp(Ops.BUFFER, dtypes.int, arg=4, src=(
UOp(Ops.UNIQUE, dtypes.void, arg=7, src=()),
x2:=UOp(Ops.DEVICE, dtypes.void, arg='CPU', src=()),)),
UOp(Ops.KERNEL, dtypes.void, arg=<Kernel 12 SINK(<Ops.STORE: 48>,) (__add__,)>, src=(
x0,
UOp(Ops.BUFFER, dtypes.int, arg=4, src=(
UOp(Ops.UNIQUE, dtypes.void, arg=1, src=()),
x2,)),)),))
"""
# ASSIGN has two srcs, src[0] is the BUFFER that's assigned to, and src[1] is the thing to assign
# src[1] is the GPU Kernel that's going to be run
# we can get the ast of the Kernel as follows
kernel_ast = t_plus_3_plus_4.uop.src[1].arg.ast
# almost everything in tinygrad functions as a rewrite of the UOps
# the codegen rewrites the ast to a simplified form ready for "rendering"
from tinygrad.codegen import full_rewrite_to_sink
rewritten_ast = full_rewrite_to_sink(kernel_ast)
print(rewritten_ast)
"""
UOp(Ops.SINK, dtypes.void, arg=None, src=(
UOp(Ops.STORE, dtypes.void, arg=None, src=(
UOp(Ops.INDEX, dtypes.int.ptr(4), arg=None, src=(
UOp(Ops.DEFINE_GLOBAL, dtypes.int.ptr(4), arg=0, src=()),
x3:=UOp(Ops.SPECIAL, dtypes.int, arg=('gidx0', 4), src=()),)),
UOp(Ops.ADD, dtypes.int, arg=None, src=(
UOp(Ops.LOAD, dtypes.int, arg=None, src=(
UOp(Ops.INDEX, dtypes.int.ptr(4), arg=None, src=(
UOp(Ops.DEFINE_GLOBAL, dtypes.int.ptr(4), arg=1, src=()),
x3,)),)),
UOp(Ops.CONST, dtypes.int, arg=7, src=()),)),)),))
"""
# you can see at this point we are adding 7, not 3 and 4
# with DEBUG=4, we can see the code.
# since optimizations are on, it UPCASTed the operation, explicitly writing out all 4 +7s
t_plus_3_plus_4.realize()
"""
void E_4n2(int* restrict data0, int* restrict data1) {
int val0 = *(data1+0);
int val1 = *(data1+1);
int val2 = *(data1+2);
int val3 = *(data1+3);
*(data0+0) = (val0+7);
*(data0+1) = (val1+7);
*(data0+2) = (val2+7);
*(data0+3) = (val3+7);
}
"""
# the function name E_4n2 is "E" for elementwise op (as opposed to "r" for reduce op)
# "4" for the size, and "n2" for name deduping (it's the 3rd function with the same E and 4 in this session)
# when you print the name with DEBUG=2, you'll see the 4 is yellow, meaning that it's upcasted
# if you run with NOOPT=1 ...
"""
void E_4n2(int* restrict data0, int* restrict data1) {
for (int ridx0 = 0; ridx0 < 4; ridx0++) {
int val0 = *(data1+ridx0);
*(data0+ridx0) = (val0+7);
}
}
"""
# ... you get this unoptimized code with a loop and the 4 is blue (for global). the color code is in kernel.py
# %% ********
print("******* PART 3 *******")
# now, we go even lower and understand UOps better and how the graph rewrite engine works.
# it's much simpler than what's in LLVM or MLIR
from tinygrad import dtypes
from tinygrad.uop.ops import UOp, Ops
# first, we'll construct some const UOps
a = UOp(Ops.CONST, dtypes.int, arg=2)
b = UOp(Ops.CONST, dtypes.int, arg=2)
# if you have been paying attention, you should know these are the same Python object
assert a is b
# UOps support normal Python math operations, so a_plus_b expresses the spec for 2 + 2
a_plus_b = a + b
print(a_plus_b)
"""
UOp(Ops.ADD, dtypes.int, arg=None, src=(
x0:=UOp(Ops.CONST, dtypes.int, arg=2, src=()),
x0,))
"""
# we could actually render this 2+2 into a language like c and run it
# or, we can use tinygrad's graph rewrite engine to "constant fold"
from tinygrad.uop.ops import graph_rewrite, UPat, PatternMatcher
# a `PatternMatcher` is a list of tuples. for each element in the list:
# [0] is the pattern to match, and [1] is the function to run.
# this function can return either a UOp to replace the pattern with, or None to not replace
simple_pm = PatternMatcher([
(UPat(Ops.ADD, src=(UPat(Ops.CONST, name="c1"), UPat(Ops.CONST, name="c2"))),
lambda c1,c2: UOp(Ops.CONST, dtype=c1.dtype, arg=c1.arg+c2.arg)),
])
# this pattern matches the addition of two CONST and rewrites it into a single CONST UOp
# to actually apply the pattern to a_plus_b, we use graph_rewrite
a_plus_b_simplified = graph_rewrite(a_plus_b, simple_pm)
print(a_plus_b_simplified)
"""
UOp(Ops.CONST, dtypes.int, arg=4, src=())
"""
# 2+2 is, in fact, 4
# we can also use syntactic sugar to write the pattern nicer
simpler_pm = PatternMatcher([
(UPat.cvar("c1")+UPat.cvar("c2"), lambda c1,c2: c1.const_like(c1.arg+c2.arg))
])
assert graph_rewrite(a_plus_b, simple_pm) is graph_rewrite(a_plus_b, simpler_pm)
# note again the use of `is`: UOps are immutable and globally unique
# %% ********
# that brings you to an understanding of the most core concepts in tinygrad
# you can run this with VIZ=1 to use the web based graph rewrite explorer
# hopefully now you understand it. the nodes in the graph are just UOps

View file

@ -41,7 +41,7 @@ The BMC also has a web interface you can use if you find that easier.
It is recommended that you change the BMC password after setting up the box, as the password on the screen is only the initial password.
If you do decide to change the BMC password and no longer want the initial password to be displayed, remove the `/root/.bmc_password` file.
Reboot after making these changes or restart the `displayservice.service` service.
Reboot after making these changes or restart the `tinybox-display.service` service.
## What do I use it for?

View file

@ -21,7 +21,7 @@ if __name__ == "__main__":
X_train, Y_train, X_test, Y_test = mnist(fashion=getenv("FASHION"))
model = Model()
opt = (nn.optim.Adam if not getenv("MUON") else nn.optim.Muon)(nn.state.get_parameters(model))
opt = (nn.optim.Muon if getenv("MUON") else nn.optim.SGD if getenv("SGD") else nn.optim.Adam)(nn.state.get_parameters(model))
@TinyJit
@Tensor.train()

View file

@ -1,93 +0,0 @@
#!/usr/bin/env python3
import os, sys, traceback
sys.path.append(os.getcwd())
from io import StringIO
from contextlib import redirect_stdout
from tinygrad import Tensor, nn
from tinygrad.helpers import Timing, colored, getenv, fetch
from extra.models.llama import Transformer, convert_from_huggingface, fix_bf16
from sentencepiece import SentencePieceProcessor
def create_fixed_tokenizer(output_file):
  """Download the OpenHermes-2.5 tokenizer, append the two ChatML pieces and write it to `output_file`."""
  print("creating fixed tokenizer")
  import extra.junk.sentencepiece_model_pb2 as spb2
  proto = spb2.ModelProto()
  tokenizer_bytes = fetch("https://huggingface.co/teknium/OpenHermes-2.5-Mistral-7B/resolve/main/tokenizer.model?download=true").read_bytes()
  proto.ParseFromString(tokenizer_bytes)
  # the pieces are appended after the existing ones, so this order decides which piece comes first
  for extra_piece in ("<|im_end|>", "<|im_start|>"):
    proto.pieces.append(spb2.ModelProto.SentencePiece(piece=extra_piece, score=0))
  with open(output_file, "wb") as out:
    out.write(proto.SerializeToString())
# example:
# echo -en "write 2+2\nwrite hello world\ny\n" | TEMP=0 python3 examples/coder.py
if __name__ == "__main__":
  # https://huggingface.co/teknium/OpenHermes-2.5-Mistral-7B/blob/main/config.json
  with Timing("create model: "):
    model = Transformer(4096, 14336, n_heads=32, n_layers=32, norm_eps=1e-5, vocab_size=32002, n_kv_heads=8, max_context=4096, jit=getenv("JIT", 1))

  with Timing("download weights: "):
    part1 = nn.state.torch_load(fetch("https://huggingface.co/teknium/OpenHermes-2.5-Mistral-7B/resolve/main/pytorch_model-00001-of-00002.bin?download=true"))
    part2 = nn.state.torch_load(fetch("https://huggingface.co/teknium/OpenHermes-2.5-Mistral-7B/resolve/main/pytorch_model-00002-of-00002.bin?download=true"))

  with Timing("weights -> model: "):
    nn.state.load_state_dict(model, fix_bf16(convert_from_huggingface(part1, 32, 32, 8)), strict=False)
    nn.state.load_state_dict(model, fix_bf16(convert_from_huggingface(part2, 32, 32, 8)), strict=False)

  if not os.path.isfile("/tmp/tokenizer.model"): create_fixed_tokenizer("/tmp/tokenizer.model")
  spp = SentencePieceProcessor(model_file="/tmp/tokenizer.model")

  # https://huggingface.co/teknium/OpenHermes-2.5-Mistral-7B/blob/main/tokenizer_config.json
  #   "chat_template": "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
  # ids follow the order in which create_fixed_tokenizer appends the pieces to the tokenizer
  IM_END = 32000
  IM_START = 32001
  def encode_prompt(k, v): return [IM_START]+spp.encode(f"{k}\n{v}")+[IM_END]+spp.encode("\n")
  def start_prompt(k): return [IM_START]+spp.encode(f"{k}\n")
  def output(outputted, toks, color):
    # print only the text that was not printed yet and return everything printed so far
    cur = spp.decode(toks)[len(outputted):]
    sys.stdout.write(colored(cur, color))
    sys.stdout.flush()
    outputted += cur
    return outputted

  # *** app below this line ***

  toks = [spp.bos_id()] + encode_prompt("system", "You are Quentin. Quentin is a useful assistant who writes Python code to answer questions. He keeps the code as short as possible and doesn't read from user input")

  PROMPT = getenv("PROMPT", 1)
  temperature = getenv("TEMP", 0.7)

  # generated code runs in its own namespace so it cannot overwrite this script's variables (toks, model, spp, ...).
  # the dict is reused across snippets, so names defined by one snippet remain visible to later ones.
  exec_globals = {"__name__": "__main__"}

  start_pos = 0
  outputted = output("", toks, "green")
  turn = True
  while 1:
    if PROMPT:
      toks += encode_prompt("user", input("Q: ")) + start_prompt("assistant")
    else:
      toks += start_prompt("user" if turn else "assistant")
      turn = not turn
    old_output_len = len(outputted)
    while 1:
      tok = model(Tensor([toks[start_pos:]]), start_pos, temperature).item()
      start_pos = len(toks)
      toks.append(tok)
      outputted = output(outputted, toks, "blue" if not turn else "cyan")
      if tok == IM_END: break
      if tok == spp.eos_id(): break
      new_output = outputted[old_output_len:]

      if new_output.endswith("```") and '```python\n' in new_output:
        python_code = new_output.split('```python\n')[1].split("```")[0]
        # AI safety. Warning to user. Do not press y if the AI is trying to do unsafe things.
        # NOTE: this exec()s model-generated code; the y/n prompt is the only safeguard.
        if input(colored(" <-- PYTHON DETECTED, RUN IT? ", "red")).lower() == 'y':
          my_stdout = StringIO()
          try:
            with redirect_stdout(my_stdout): exec(python_code, exec_globals)
            result = my_stdout.getvalue()
          except Exception as e:
            # keep whatever was printed before the failure, then append the error.
            # the two-argument form also works on Python versions older than 3.10.
            result = my_stdout.getvalue() + ''.join(traceback.format_exception_only(type(e), e))
          toks += spp.encode(f"\nOutput:\n```\n{result}```")
          outputted = output(outputted, toks, "yellow")
          old_output_len = len(outputted)
    print("")

View file

@ -1,341 +0,0 @@
import argparse
import multiprocessing as mp
import os
import re
import sys
import time
from contextlib import contextmanager
from pathlib import Path
import numpy as np
import pyaudio
import yaml
from llama import LLaMa
from vits import MODELS as VITS_MODELS
from vits import Y_LENGTH_ESTIMATE_SCALARS, HParams, Synthesizer, TextMapper, get_hparams_from_file, load_model
from whisper import init_whisper, transcribe_waveform
from sentencepiece import SentencePieceProcessor
from tinygrad.helpers import Timing, fetch
from tinygrad import Tensor, dtypes
# Whisper constants
RATE = 16000
CHUNK = 1600
# LLaMa constants
IM_START = 32001
IM_END = 32002

# Functions for encoding prompts to chatml md
def encode_prompt(spp, k, v):
  """Token ids for one complete ChatML message: IM_START, "<k>\\n<v>", IM_END, then an encoded newline."""
  message_ids = spp.encode(f"{k}\n{v}")
  newline_ids = spp.encode("\n")
  return [IM_START] + message_ids + [IM_END] + newline_ids

def start_prompt(spp, k):
  """Token ids that open a message for role `k` and leave it unterminated."""
  role_ids = spp.encode(f"{k}\n")
  return [IM_START] + role_ids
def chunks(lst, n):
  """Yield consecutive slices of `lst` of length `n`; the final slice may be shorter."""
  yield from (lst[start:start + n] for start in range(0, len(lst), n))
def create_fixed_tokenizer():
  """Fetch the TinyLlama chat tokenizer and extend it with the additional chat tokens.

  The fetched file is rewritten in place unless it already reports a vocabulary of 32003.
  Returns the path of the tokenizer file.
  """
  import extra.junk.sentencepiece_model_pb2 as spb2
  tokenizer_path = fetch("https://huggingface.co/TinyLlama/TinyLlama-1.1B-Chat-v0.4/resolve/main/tokenizer.model")
  vocab_size = SentencePieceProcessor(model_file=str(tokenizer_path)).vocab_size()
  if vocab_size == 32003: return tokenizer_path

  print("creating fixed tokenizer")
  proto = spb2.ModelProto()
  proto.ParseFromString(tokenizer_path.read_bytes())
  # https://huggingface.co/TinyLlama/TinyLlama-1.1B-Chat-v0.4/blob/main/added_tokens.json
  for extra_piece in ("[PAD]", "<|im_start|>", "<|im_end|>"):
    proto.pieces.append(spb2.ModelProto.SentencePiece(piece=extra_piece, score=0))
  tokenizer_path.write_bytes(proto.SerializeToString())
  return tokenizer_path
def llama_prepare(llama: LLaMa, temperature: float, pre_prompt_path: Path) -> tuple[list[int], str, str, int, str]:
  """Prepares a llama model from a specified pre-prompt file.

  The YAML file must provide `pre_prompt`, `user_delim`, `resp_delim` and `examples`
  (each example having `user_prompt` and `resp_prompt`). The resulting prompt is run
  through the model once; the model output of that run is discarded.

  Returns (toks, user_delim, resp_delim, number of prompt tokens, decoded prompt text).
  """
  with open(str(pre_prompt_path)) as f:
    config = yaml.safe_load(f.read())
  toks = [llama.tokenizer.bos_id()] + encode_prompt(llama.tokenizer, "system", config["pre_prompt"].replace("\n", " "))
  for i in config["examples"]:
    toks += encode_prompt(llama.tokenizer, config["user_delim"], i["user_prompt"])
    toks += encode_prompt(llama.tokenizer, config["resp_delim"], i["resp_prompt"])
  llama.model(Tensor([toks]), 0, temperature).realize()  # NOTE: outputs are not used
  return toks, config["user_delim"], config["resp_delim"], len(toks), llama.tokenizer.decode(toks)
def llama_generate(
  llama: LLaMa,
  toks: list[int],
  outputted: str,
  prompt: str,
  start_pos: int,
  user_delim: str,
  resp_delim: str,
  temperature=0.7,
  max_tokens=1000
):
  """Generates an output for the specified prompt.

  `toks` is extended in place with the prompt and the generated tokens. The incoming
  `outputted` value is not read; it is recomputed from `toks`.

  Returns (full decoded text, new start_pos, text of the generated response only).
  """
  tokenizer = llama.tokenizer
  toks += encode_prompt(tokenizer, user_delim, prompt)
  toks += start_prompt(tokenizer, resp_delim)

  outputted = tokenizer.decode(toks)
  init_length = len(outputted)
  saw_end_token = False
  for _ in range(max_tokens):
    token = llama.model(Tensor([toks[start_pos:]]), start_pos, temperature).item()
    start_pos = len(toks)
    toks.append(token)

    decoded = tokenizer.decode(toks)

    # Print is just for debugging
    sys.stdout.write(decoded[len(outputted):])
    sys.stdout.flush()
    outputted = decoded
    if token == IM_END:
      saw_end_token = True
      break
  # the token budget ran out before the model ended its message: close the message ourselves
  if not saw_end_token: toks.append(IM_END)
  print()  # because the output is flushed
  response = outputted[init_length:].replace("<|im_end|>", "")
  return outputted, start_pos, response
def tts(
  text_to_synthesize: str,
  synth: Synthesizer,
  hps: HParams,
  emotion_embedding: Path,
  speaker_id: int,
  model_to_use: str,
  noise_scale: float,
  noise_scale_w: float,
  length_scale: float,
  estimate_max_y_length: bool,
  text_mapper: TextMapper,
  model_has_multiple_speakers: bool,
  pad_length=600,
  vits_pad_length=1000
):
  """Synthesize `text_to_synthesize` with the VITS model and return the audio as int16 numpy samples."""
  if model_to_use == "mmts-tts":
    text_to_synthesize = text_mapper.filter_oov(text_to_synthesize.lower())

  # Convert the input text to a tensor.
  stn_tst = text_mapper.get_text(text_to_synthesize, hps.data.add_blank, hps.data.text_cleaners)
  text_len = stn_tst.shape[0]
  assert text_len < pad_length, "text is too long"
  # pad the text to a fixed length and pass the real length separately
  x_tst = stn_tst.pad(((0, pad_length - text_len),), value=1).unsqueeze(0)
  x_tst_lengths = Tensor([text_len], dtype=dtypes.int64)
  sid = None
  if model_has_multiple_speakers: sid = Tensor([speaker_id], dtype=dtypes.int64)

  # Perform inference.
  y_length_scale = Y_LENGTH_ESTIMATE_SCALARS[model_to_use] if estimate_max_y_length else None
  audio_tensor = synth.infer(x_tst, x_tst_lengths, sid, noise_scale, length_scale, noise_scale_w, emotion_embedding=emotion_embedding,
                             max_y_length_estimate_scale=y_length_scale, pad_length=vits_pad_length)[0, 0]

  # Clip to [-1, 1] and scale to the int16 range.
  clipped = np.clip(audio_tensor.numpy(), -1.0, 1.0)
  return (clipped * 32767).astype(np.int16)
def init_vits(
  model_to_use: str,
  emotion_path: Path,
  speaker_id: int,
  seed: int,
):
  """Load the VITS model named `model_to_use` together with everything `tts` needs.

  `emotion_path` may be None (no emotion embedding) or a path to a `.npy` file.
  `seed` may be None, in which case no seed is set.

  Returns (net_g, emotion_embedding, text_mapper, hps, model_has_multiple_speakers).
  Raises ValueError for an out-of-range speaker id or a non-.npy emotion path.
  """
  model_config = VITS_MODELS[model_to_use]

  # Load the hyperparameters from the config file.
  hps = get_hparams_from_file(fetch(model_config[0]))

  # If model has multiple speakers, validate speaker id.
  model_has_multiple_speakers = hps.data.n_speakers > 0
  if model_has_multiple_speakers and speaker_id >= hps.data.n_speakers:
    raise ValueError(f"Speaker ID {speaker_id} is invalid for this model.")

  # Load emotions if any. TODO: find an english model with emotions, this is untested atm.
  emotion_embedding = None
  if emotion_path is not None:
    # the CLI passes a pathlib.Path, which has no .endswith(); compare on the string form so str and Path both work
    if str(emotion_path).endswith(".npy"): emotion_embedding = Tensor(np.load(emotion_path), dtype=dtypes.int64).unsqueeze(0)
    else: raise ValueError("Emotion path must be a .npy file.")

  # Load symbols, instantiate TextMapper and clean the text.
  if "symbols" in hps: symbols = hps.symbols
  elif model_to_use == "mmts-tts": symbols = [x.replace("\n", "") for x in fetch("https://huggingface.co/facebook/mms-tts/raw/main/full_models/eng/vocab.txt").open(encoding="utf-8").readlines()]
  else: symbols = ['_'] + list(';:,.!?¡¿—…"«»“” ') + list('ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz') + list("ɑɐɒæɓʙβɔɕçɗɖðʤəɘɚɛɜɝɞɟʄɡɠɢʛɦɧħɥʜɨɪʝɭɬɫɮʟɱɯɰŋɳɲɴøɵɸθœɶʘɹɺɾɻʀʁɽʂʃʈʧʉʊʋⱱʌɣɤʍχʎʏʑʐʒʔʡʕʢǀǁǂǃˈˌːˑʼʴʰʱʲʷˠˤ˞↓↑→↗↘'̩'")
  text_mapper = TextMapper(apply_cleaners=True, symbols=symbols)

  # Load the model.
  if seed is not None:
    Tensor.manual_seed(seed)
    np.random.seed(seed)
  net_g = load_model(text_mapper.symbols, hps, model_config)

  return net_g, emotion_embedding, text_mapper, hps, model_has_multiple_speakers
@contextmanager
def output_stream(num_channels: int, sample_rate: int):
  """Context manager yielding a PyAudio int16 output stream; the stream and PyAudio are released on exit.

  A KeyboardInterrupt raised inside the `with` body is swallowed so the stream is shut down quietly.
  """
  # acquire each resource before the try that releases it, so a failing PyAudio()/open() surfaces as its own
  # error instead of an unbound-name error from the cleanup code
  p = pyaudio.PyAudio()
  try:
    stream = p.open(format=pyaudio.paInt16, channels=num_channels, rate=sample_rate, output=True)
    try:
      yield stream
    except KeyboardInterrupt: pass
    finally:
      stream.stop_stream()
      stream.close()
  finally:
    p.terminate()
@contextmanager
def log_writer():
  """Context manager yielding a list to collect chat log lines; on exit the collected lines are printed between separators."""
  # local import: shutil.get_terminal_size falls back to a default width instead of raising when stdout is not a terminal
  import shutil
  logs = []
  try:
    yield logs
  finally:
    # the separator spans the terminal width (columns); index 1 of the terminal size is the number of rows
    sep = "="*shutil.get_terminal_size().columns
    print(f"{sep[:-1]}\nCHAT LOG")
    print(*logs, sep="\n")
    print(sep)
def listener(q: mp.Queue, event: mp.Event):
  """Read microphone audio forever; while `event` is set, push each chunk to `q` as a float32 numpy array.

  Chunks are always read (even when not listening) so the input buffer does not overflow.
  """
  # acquire each resource before the try that releases it, so a failing PyAudio()/open() is reported as itself
  # rather than as an unbound-name error raised by the cleanup code
  p = pyaudio.PyAudio()
  try:
    stream = p.open(format=pyaudio.paInt16, channels=1, rate=RATE, input=True, frames_per_buffer=CHUNK)
    try:
      did_print = False
      while True:
        data = stream.read(CHUNK)  # read data to avoid overflow
        if event.is_set():
          if not did_print:
            print("listening")
            did_print = True
          # int16 samples scaled by 1/32768 and then multiplied by 3
          q.put(((np.frombuffer(data, np.int16)/32768).astype(np.float32)*3))
        else:
          did_print = False
    finally:
      stream.stop_stream()
      stream.close()
  finally:
    p.terminate()
def mp_output_stream(q: mp.Queue, counter: mp.Value, num_channels: int, sample_rate: int):
  """Play every audio buffer received on `q`, incrementing `counter` after each one, until interrupted."""
  with output_stream(num_channels, sample_rate) as speaker:
    try:
      while True:
        speaker.write(q.get())
        counter.value += 1
    except KeyboardInterrupt:
      # Ctrl-C ends playback; leaving the with-block closes the stream
      pass
if __name__ == "__main__":
  import nltk
  nltk.download("punkt")

  def _parse_bool(value: str) -> bool:
    """argparse `type` for boolean options: common true-ish spellings give True, anything else False."""
    return value.strip().lower() in ("1", "true", "t", "yes", "y", "on")

  # Parse CLI arguments
  parser = argparse.ArgumentParser("Have a tiny conversation with tinygrad")

  # Whisper args
  parser.add_argument("--whisper_model_name", type=str, default="tiny.en")

  # LLAMA args
  parser.add_argument("--llama_pre_prompt_path", type=Path, default=Path(__file__).parent / "conversation_data" / "pre_prompt_stacy.yaml", help="Path to yaml file which contains all pre-prompt data needed. ")
  parser.add_argument("--llama_count", type=int, default=1000, help="Max number of tokens to generate")
  parser.add_argument("--llama_temperature", type=float, default=0.7, help="Temperature in the softmax")
  parser.add_argument("--llama_quantize", type=str, default=None, help="Quantize the weights to int8 or nf4 in memory")
  parser.add_argument("--llama_model", type=Path, default=None, help="Folder with the original weights to load, or single .index.json, .safetensors or .bin file")
  parser.add_argument("--llama_gen", type=str, default="tiny", required=False, help="Generation of the model to use")
  parser.add_argument("--llama_size", type=str, default="1B-Chat", required=False, help="Size of model to use")
  parser.add_argument("--llama_tokenizer", type=Path, default=None, required=False, help="Path to llama tokenizer.model")

  # vits args
  parser.add_argument("--vits_model_to_use", default="vctk", help="Specify the model to use. Default is 'vctk'.")
  parser.add_argument("--vits_speaker_id", type=int, default=12, help="Specify the speaker ID. Default is 12.")
  parser.add_argument("--vits_noise_scale", type=float, default=0.667, help="Specify the noise scale. Default is 0.667.")
  parser.add_argument("--vits_noise_scale_w", type=float, default=0.8, help="Specify the noise scale w. Default is 0.8.")
  parser.add_argument("--vits_length_scale", type=float, default=1, help="Specify the length scale. Default is 1.")
  parser.add_argument("--vits_seed", type=int, default=None, help="Specify the seed. Default is None (no seed).")
  parser.add_argument("--vits_num_channels", type=int, default=1, help="Specify the number of audio output channels. Default is 1.")
  parser.add_argument("--vits_sample_width", type=int, default=2, help="Specify the number of bytes per sample, adjust if necessary. Default is 2.")
  parser.add_argument("--vits_emotion_path", type=Path, default=None, help="Specify the path to emotion reference.")
  # parsed with _parse_bool: with type=str every non-empty value, including "False", would be truthy
  parser.add_argument("--vits_estimate_max_y_length", type=_parse_bool, default=False, help="If true, overestimate the output length and then trim it to the correct length, to prevent premature realization, much more performant for larger inputs, for smaller inputs not so much. Default is False.")
  parser.add_argument("--vits_vocab_path", type=Path, default=None, help="Path to the TTS vocabulary.")

  # conversation args
  parser.add_argument("--max_sentence_length", type=int, default=20, help="Max words in one sentence to pass to vits")

  args = parser.parse_args()

  # Init models
  model, enc = init_whisper(args.whisper_model_name)
  synth, emotion_embedding, text_mapper, hps, model_has_multiple_speakers = init_vits(args.vits_model_to_use, args.vits_emotion_path, args.vits_speaker_id, args.vits_seed)

  # Download tinyllama chat as a default model
  if args.llama_model is None:
    args.llama_model = fetch("https://huggingface.co/TinyLlama/TinyLlama-1.1B-Chat-v0.4/resolve/main/model.safetensors", "tinyllamachat.safetensors")
    args.llama_gen = "tiny"
    args.llama_size = "1B-Chat"
  # Add 3 more tokens to the tokenizer
  if args.llama_gen == "tiny" and args.llama_size.endswith("Chat"): args.llama_tokenizer = create_fixed_tokenizer()
  tokenizer_path = args.llama_tokenizer or args.llama_model.parent / "tokenizer.model"
  llama = LLaMa.build(args.llama_model, tokenizer_path, args.llama_gen, args.llama_size, args.llama_quantize)
  toks, user_delim, resp_delim, start_pos, outputted = llama_prepare(llama, args.llama_temperature, args.llama_pre_prompt_path)

  # Start child process for mic input
  q = mp.Queue()
  is_listening_event = mp.Event()
  p = mp.Process(target=listener, args=(q, is_listening_event,))
  p.daemon = True
  p.start()

  # Start child process for speaker output
  out_q = mp.Queue()
  out_counter = mp.Value("i", 0)
  out_p = mp.Process(target=mp_output_stream, args=(out_q, out_counter, args.vits_num_channels, hps.data.sampling_rate,))
  out_p.daemon = True
  out_p.start()

  # JIT tts
  for i in ["Hello, I'm a chat bot", "I am capable of doing a lot of things"]:
    tts(
      i, synth, hps, emotion_embedding,
      args.vits_speaker_id, args.vits_model_to_use, args.vits_noise_scale,
      args.vits_noise_scale_w, args.vits_length_scale,
      args.vits_estimate_max_y_length, text_mapper, model_has_multiple_speakers
    )

  # Start the pipeline
  with log_writer() as log:
    while True:
      total = np.array([])
      out_counter.value = 0

      s = time.perf_counter()
      is_listening_event.set()
      prev_text = None
      while True:
        # collect RATE // CHUNK chunks of microphone audio before transcribing again
        for _ in range(RATE // CHUNK): total = np.concatenate([total, q.get()])
        txt = transcribe_waveform(model, enc, [total], truncate=True)
        print(txt, end="\r")
        if txt == "[BLANK_AUDIO]" or re.match(r"^\([\w+ ]+\)$", txt.strip()): continue
        # the same transcription twice in a row is taken to mean the user stopped talking
        if prev_text is not None and prev_text == txt:
          is_listening_event.clear()
          break
        prev_text = txt
      print()  # to avoid llama printing on the same line
      log.append(f"{user_delim.capitalize()}: {txt}")

      # Generate with llama
      with Timing("llama generation: "):
        outputted, start_pos, response = llama_generate(
          llama, toks, outputted, txt, start_pos,
          user_delim=user_delim, resp_delim=resp_delim, temperature=args.llama_temperature,
          max_tokens=args.llama_count
        )
        log.append(f"{resp_delim.capitalize()}: {response}")

      # Convert to voice
      with Timing("tts: "):
        sentences = nltk.sent_tokenize(response.replace('"', ""))
        for i in sentences:
          total = np.array([], dtype=np.int16)
          for j in chunks(i.split(), args.max_sentence_length):
            audio_data = tts(
              " ".join(j), synth, hps, emotion_embedding,
              args.vits_speaker_id, args.vits_model_to_use, args.vits_noise_scale,
              args.vits_noise_scale_w, args.vits_length_scale,
              args.vits_estimate_max_y_length, text_mapper, model_has_multiple_speakers
            )
            total = np.concatenate([total, audio_data])
          out_q.put(total.tobytes())
      # wait until the output process has played every sentence; sleep briefly instead of spinning a CPU core
      while out_counter.value < len(sentences): time.sleep(0.01)
      log.append(f"Total: {time.perf_counter() - s}")

View file

@ -1,89 +0,0 @@
# load weights from
# https://github.com/lukemelas/EfficientNet-PyTorch/releases/download/1.0/efficientnet-b0-355c32eb.pth
# a rough copy of
# https://github.com/lukemelas/EfficientNet-PyTorch/blob/master/efficientnet_pytorch/model.py
import sys
import ast
import time
import numpy as np
from PIL import Image
from tinygrad.tensor import Tensor
from tinygrad.helpers import getenv, fetch, Timing
from tinygrad.engine.jit import TinyJit
from extra.models.efficientnet import EfficientNet
np.set_printoptions(suppress=True)
# TODO: you should be able to put these in the jitted function
bias = Tensor([0.485, 0.456, 0.406])
scale = Tensor([0.229, 0.224, 0.225])
@TinyJit
def _infer(model, img):
  """Jitted forward pass: move channels first, scale by 1/255, normalize with `bias`/`scale` and run the model."""
  chw = img.permute((2,0,1)) / 255.0
  # bias and scale are reshaped so they broadcast along the channel axis
  normalized = (chw - bias.reshape((1,-1,1,1))) / scale.reshape((1,-1,1,1))
  return model.forward(normalized).realize()
def infer(model, img):
  """Resize and center-crop `img` to 224x224, run the net on it and return (model output, cropped image array)."""
  # preprocess image: resize so the shorter side becomes 224, keeping the aspect ratio
  ratio = img.size[0] / img.size[1]
  target_size = (int(224*max(ratio,1.0)), int(224*max(1.0/ratio,1.0)))
  pixels = np.array(img.resize(target_size))

  # center crop to 224x224
  top, left = (np.asarray(pixels.shape)[:2]-224)//2
  cropped = pixels[top:top+224, left:left+224]

  # if you want to look at the image:
  #   import matplotlib.pyplot as plt
  #   plt.imshow(cropped)
  #   plt.show()

  # run the net
  out = _infer(model, Tensor(cropped.astype("float32"))).numpy()

  # if you want to look at the outputs:
  #   import matplotlib.pyplot as plt
  #   plt.plot(out[0])
  #   plt.show()
  return out, cropped
if __name__ == "__main__":
  # instantiate my net
  model = EfficientNet(getenv("NUM", 0))
  model.load_from_pretrained()

  # category labels
  lbls = ast.literal_eval(fetch("https://gist.githubusercontent.com/yrevar/942d3a0ac09ec9e5eb3a/raw/238f720ff059c1f82f368259d1ca4ffa5dd8f9f5/imagenet1000_clsidx_to_labels.txt").read_text())

  # load image and preprocess
  url = sys.argv[1] if len(sys.argv) >= 2 else "https://raw.githubusercontent.com/tinygrad/tinygrad/master/docs/showcase/stable_diffusion_by_tinygrad.jpg"
  if url == 'webcam':
    import cv2
    cap = cv2.VideoCapture(0)
    cap.set(cv2.CAP_PROP_BUFFERSIZE, 1)
    # release the camera and close the window even if the loop exits with an exception
    try:
      while 1:
        _ = cap.grab()  # discard one frame to circumvent capture buffering
        ret, frame = cap.read()
        # no frame was delivered (camera missing or stream ended); indexing the frame below would crash
        if not ret:
          print("could not read a frame from the webcam")
          break
        img = Image.fromarray(frame[:, :, [2,1,0]])  # reverse the channel order of the captured frame
        lt = time.monotonic_ns()
        out, retimg = infer(model, img)
        print(f"{(time.monotonic_ns()-lt)*1e-6:7.2f} ms", np.argmax(out), np.max(out), lbls[np.argmax(out)])
        SCALE = 3
        simg = cv2.resize(retimg, (224*SCALE, 224*SCALE))
        retimg = cv2.cvtColor(simg, cv2.COLOR_RGB2BGR)
        cv2.imshow('capture', retimg)
        if cv2.waitKey(1) & 0xFF == ord('q'):
          break
    finally:
      cap.release()
      cv2.destroyAllWindows()
  else:
    img = Image.open(fetch(url))
    for i in range(getenv("CNT", 1)):
      with Timing("did inference in "):
        out, _ = infer(model, img)
        print(np.argmax(out), np.max(out), lbls[np.argmax(out)])

View file

@ -1,498 +0,0 @@
# pip3 install sentencepiece
# This file incorporates code from the following:
# Github Name | License | Link
# black-forest-labs/flux | Apache | https://github.com/black-forest-labs/flux/tree/main/model_licenses
from tinygrad import Tensor, nn, dtypes, TinyJit
from tinygrad.nn.state import safe_load, load_state_dict
from tinygrad.helpers import fetch, tqdm, colored
from sdxl import FirstStage
from extra.models.clip import FrozenClosedClipEmbedder
from extra.models.t5 import T5Embedder
import numpy as np
import math, time, argparse, tempfile
from typing import List, Dict, Optional, Union, Tuple, Callable
from dataclasses import dataclass
from pathlib import Path
from PIL import Image
# Hugging Face download URLs for the FLUX checkpoints, the autoencoder, the two T5 weight shards,
# the T5 tokenizer and the CLIP text encoder, keyed by a short name.
urls:dict = {
"flux-schnell": "https://huggingface.co/black-forest-labs/FLUX.1-schnell/resolve/main/flux1-schnell.safetensors",
"flux-dev": "https://huggingface.co/camenduru/FLUX.1-dev/resolve/main/flux1-dev.sft",
"ae": "https://huggingface.co/black-forest-labs/FLUX.1-schnell/resolve/main/ae.safetensors",
"T5_1_of_2": "https://huggingface.co/black-forest-labs/FLUX.1-schnell/resolve/main/text_encoder_2/model-00001-of-00002.safetensors",
"T5_2_of_2": "https://huggingface.co/black-forest-labs/FLUX.1-schnell/resolve/main/text_encoder_2/model-00002-of-00002.safetensors",
"T5_tokenizer": "https://huggingface.co/black-forest-labs/FLUX.1-schnell/resolve/main/tokenizer_2/spiece.model",
"clip": "https://huggingface.co/black-forest-labs/FLUX.1-schnell/resolve/main/text_encoder/model.safetensors"
}
def tensor_identity(x:Tensor) -> Tensor:
  """Return `x` unchanged."""
  return x
class AutoEncoder:
  """Decoder half of the autoencoder: un-scales/un-shifts a latent and decodes it."""
  def __init__(self, scale_factor:float, shift_factor:float):
    self.decoder = FirstStage.Decoder(128, 3, 3, 16, [1, 2, 4, 4], 2, 256)
    self.scale_factor, self.shift_factor = scale_factor, shift_factor

  def decode(self, z:Tensor) -> Tensor:
    """Divide `z` by the scale factor, add the shift factor and run the decoder on the result."""
    return self.decoder(z / self.scale_factor + self.shift_factor)
# Conditioner
class ClipEmbedder(FrozenClosedClipEmbedder):
  """CLIP text embedder that accepts a single string or a list/tuple of strings."""
  def __call__(self, texts:Union[str, List[str], Tensor]) -> Tensor:
    if isinstance(texts, str): texts = [texts]
    assert isinstance(texts, (list,tuple)), f"expected list of strings, got {type(texts).__name__}"
    per_text_tokens = [Tensor(self.tokenizer.encode(text)) for text in texts]
    tokens = Tensor.cat(*per_text_tokens, dim=0)
    hidden = self.transformer.text_model(tokens.reshape(len(texts),-1))
    # keep the hidden state at the position of the largest token id
    # NOTE(review): presumably that is the end-of-text token - confirm against the tokenizer
    return hidden[:, tokens.argmax(-1)]
# https://github.com/black-forest-labs/flux/blob/main/src/flux/math.py
def attention(q:Tensor, k:Tensor, v:Tensor, pe:Tensor) -> Tensor:
  """Apply the rotary embedding `pe` to q and k, run scaled dot product attention and merge the heads."""
  q_rot, k_rot = apply_rope(q, k, pe)
  attended = Tensor.scaled_dot_product_attention(q_rot, k_rot, v)
  return attended.rearrange("B H L D -> B L (H D)")
def rope(pos:Tensor, dim:int, theta:int) -> Tensor:
  """Build 2x2 rotation matrices [[cos, -sin], [sin, cos]] for every position and each of the dim/2 frequencies."""
  assert dim % 2 == 0
  exponent = Tensor.arange(0, dim, 2, dtype=dtypes.float32, device=pos.device) / dim # NOTE: this is torch.float64 in reference implementation
  omega = 1.0 / (theta**exponent)
  angles = Tensor.einsum("...n,d->...nd", pos, omega)
  cos, sin = angles.cos(), angles.sin()
  flat_rotation = Tensor.stack(cos, -sin, sin, cos, dim=-1)
  return flat_rotation.rearrange("b n d (i j) -> b n d i j", i=2, j=2).float()
def apply_rope(xq:Tensor, xk:Tensor, freqs_cis:Tensor) -> Tuple[Tensor, Tensor]:
  """Rotate the query and key tensors with `freqs_cis`; both results keep their input shape and dtype."""
  def rotate(x:Tensor) -> Tensor:
    # view the last axis as pairs, combine each pair with freqs_cis in float, then restore shape and dtype
    pairs = x.float().reshape(*x.shape[:-1], -1, 1, 2)
    rotated = freqs_cis[..., 0] * pairs[..., 0] + freqs_cis[..., 1] * pairs[..., 1]
    return rotated.reshape(*x.shape).cast(x.dtype)
  return rotate(xq), rotate(xk)
# https://github.com/black-forest-labs/flux/blob/main/src/flux/modules/layers.py
class EmbedND:
  """Positional embedding: one `rope` per id axis, concatenated."""
  def __init__(self, dim:int, theta:int, axes_dim:List[int]):
    self.dim, self.theta, self.axes_dim = dim, theta, axes_dim

  def __call__(self, ids:Tensor) -> Tensor:
    # the last axis of ids indexes the position axes; axis i uses axes_dim[i]
    per_axis = [rope(ids[..., axis], self.axes_dim[axis], self.theta) for axis in range(ids.shape[-1])]
    return Tensor.cat(*per_axis, dim=-3).unsqueeze(1)
class MLPEmbedder:
  """Two linear layers with a SiLU in between."""
  def __init__(self, in_dim:int, hidden_dim:int):
    self.in_layer = nn.Linear(in_dim, hidden_dim, bias=True)
    self.out_layer = nn.Linear(hidden_dim, hidden_dim, bias=True)

  def __call__(self, x:Tensor) -> Tensor:
    hidden = self.in_layer(x).silu()
    return self.out_layer(hidden)
class QKNorm:
  """Separate RMSNorm layers for the query and the key."""
  def __init__(self, dim:int):
    self.query_norm = nn.RMSNorm(dim)
    self.key_norm = nn.RMSNorm(dim)

  def __call__(self, q:Tensor, k:Tensor) -> Tuple[Tensor, Tensor]:
    normed_q = self.query_norm(q)
    normed_k = self.key_norm(k)
    return normed_q, normed_k
class SelfAttention:
  """Multi-head self attention with a fused qkv projection, q/k normalization and rotary embedding."""
  def __init__(self, dim:int, num_heads:int = 8, qkv_bias:bool = False):
    self.num_heads = num_heads
    self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
    self.norm = QKNorm(dim // num_heads)  # normalizes per head
    self.proj = nn.Linear(dim, dim)

  def __call__(self, x:Tensor, pe:Tensor) -> Tensor:
    # split the fused projection into q, k, v with the heads on their own axis
    q, k, v = self.qkv(x).rearrange("B L (K H D) -> K B H L D", K=3, H=self.num_heads)
    q, k = self.norm(q, k)
    return self.proj(attention(q, k, v, pe=pe))
@dataclass
class ModulationOut:
  # shift: added after the scaling; scale: applied as (1 + scale); gate: multiplies the block output before the residual add
  shift:Tensor
  scale:Tensor
  gate:Tensor
class Modulation:
  """Projects the conditioning vector into one (or, when `double`, two) shift/scale/gate triples."""
  def __init__(self, dim:int, double:bool):
    self.is_double = double
    self.multiplier = 6 if double else 3
    self.lin = nn.Linear(dim, self.multiplier * dim, bias=True)

  def __call__(self, vec:Tensor) -> Tuple[ModulationOut, Optional[ModulationOut]]:
    projected = self.lin(vec.silu())[:, None, :]
    parts = projected.chunk(self.multiplier, dim=-1)
    first = ModulationOut(*parts[:3])
    # the second triple only exists for double modulation
    second = ModulationOut(*parts[3:]) if self.is_double else None
    return first, second
class DoubleStreamBlock:
  """Block with separate image and text streams that share a single joint attention."""
  def __init__(self, hidden_size:int, num_heads:int, mlp_ratio:float, qkv_bias:bool = False):
    mlp_hidden_dim = int(hidden_size * mlp_ratio)
    self.num_heads = num_heads
    self.hidden_size = hidden_size

    # image stream
    self.img_mod = Modulation(hidden_size, double=True)
    self.img_norm1 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
    self.img_attn = SelfAttention(dim=hidden_size, num_heads=num_heads, qkv_bias=qkv_bias)
    self.img_norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
    self.img_mlp = [nn.Linear(hidden_size, mlp_hidden_dim, bias=True), Tensor.gelu, nn.Linear(mlp_hidden_dim, hidden_size, bias=True)]

    # text stream
    self.txt_mod = Modulation(hidden_size, double=True)
    self.txt_norm1 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
    self.txt_attn = SelfAttention(dim=hidden_size, num_heads=num_heads, qkv_bias=qkv_bias)
    self.txt_norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
    self.txt_mlp = [nn.Linear(hidden_size, mlp_hidden_dim, bias=True), Tensor.gelu, nn.Linear(mlp_hidden_dim, hidden_size, bias=True)]

  def _stream_qkv(self, x:Tensor, norm, attn, mod) -> Tuple[Tensor, Tensor, Tensor]:
    # normalize and modulate one stream, then project it to per-head q, k, v with q/k normalization
    modulated = (1 + mod.scale) * norm(x) + mod.shift
    q, k, v = attn.qkv(modulated).rearrange("B L (K H D) -> K B H L D", K=3, H=self.num_heads)
    q, k = attn.norm(q, k)
    return q, k, v

  def __call__(self, img:Tensor, txt:Tensor, vec:Tensor, pe:Tensor) -> tuple[Tensor, Tensor]:
    img_mod1, img_mod2 = self.img_mod(vec)
    txt_mod1, txt_mod2 = self.txt_mod(vec)
    assert img_mod2 is not None and txt_mod2 is not None

    # prepare image and txt for attention
    img_q, img_k, img_v = self._stream_qkv(img, self.img_norm1, self.img_attn, img_mod1)
    txt_q, txt_k, txt_v = self._stream_qkv(txt, self.txt_norm1, self.txt_attn, txt_mod1)

    # run actual attention over the concatenated sequence (text first, then image)
    joint_q = Tensor.cat(txt_q, img_q, dim=2)
    joint_k = Tensor.cat(txt_k, img_k, dim=2)
    joint_v = Tensor.cat(txt_v, img_v, dim=2)
    attn = attention(joint_q, joint_k, joint_v, pe=pe)
    txt_len = txt.shape[1]
    txt_attn, img_attn = attn[:, :txt_len], attn[:, txt_len:]

    # calculate the img blocks
    img = img + img_mod1.gate * self.img_attn.proj(img_attn)
    img_mlp_in = (1 + img_mod2.scale) * self.img_norm2(img) + img_mod2.shift
    img = img + img_mod2.gate * img_mlp_in.sequential(self.img_mlp)

    # calculate the txt blocks
    txt = txt + txt_mod1.gate * self.txt_attn.proj(txt_attn)
    txt_mlp_in = (1 + txt_mod2.scale) * self.txt_norm2(txt) + txt_mod2.shift
    txt = txt + txt_mod2.gate * txt_mlp_in.sequential(self.txt_mlp)
    return img, txt
class SingleStreamBlock:
  """
  Single-stream DiT block: attention and MLP run in parallel off one fused input projection
  (see https://arxiv.org/abs/2302.05442), conditioned through a single-set Modulation.
  """
  def __init__(self, hidden_size:int, num_heads:int, mlp_ratio:float=4.0, qk_scale:Optional[float]=None):
    per_head = hidden_size // num_heads
    mlp_width = int(hidden_size * mlp_ratio)
    # plain bookkeeping values
    self.hidden_dim = hidden_size
    self.hidden_size = hidden_size
    self.num_heads = num_heads
    self.scale = qk_scale or per_head**-0.5
    self.mlp_hidden_dim = mlp_width
    # fused input projection: produces q, k, v and the MLP hidden activations in one matmul
    self.linear1 = nn.Linear(hidden_size, hidden_size * 3 + mlp_width)
    # fused output projection: consumes attention output concatenated with the MLP activations
    self.linear2 = nn.Linear(hidden_size + mlp_width, hidden_size)
    self.norm = QKNorm(per_head)
    self.pre_norm = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
    self.mlp_act = Tensor.gelu
    self.modulation = Modulation(hidden_size, double=False)

  def __call__(self, x:Tensor, vec:Tensor, pe:Tensor) -> Tensor:
    """Return x plus the gated output of the parallel attention/MLP branch."""
    mod, _ = self.modulation(vec)
    modulated = (1 + mod.scale) * self.pre_norm(x) + mod.shift
    fused = self.linear1(modulated)
    qkv, mlp = fused.split([3 * self.hidden_size, self.mlp_hidden_dim], dim=-1)
    q, k, v = qkv.rearrange("B L (K H D) -> K B H L D", K=3, H=self.num_heads)
    q, k = self.norm(q, k)
    attn = attention(q, k, v, pe=pe)
    # activation on the MLP stream, re-join with attention on the feature axis, then project
    joined = attn.cat(self.mlp_act(mlp), dim=2)
    return x + mod.gate * self.linear2(joined)
class LastLayer:
  """Final adaLN-modulated LayerNorm followed by a linear projection to patch outputs."""
  def __init__(self, hidden_size:int, patch_size:int, out_channels:int):
    self.norm_final = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
    self.linear = nn.Linear(hidden_size, patch_size * patch_size * out_channels, bias=True)
    self.adaLN_modulation:List[Callable[[Tensor], Tensor]] = [Tensor.silu, nn.Linear(hidden_size, 2 * hidden_size, bias=True)]

  def __call__(self, x:Tensor, vec:Tensor) -> Tensor:
    """Modulate the normalised x with shift/scale derived from vec, then project."""
    # one projection of vec, halved along dim 1 into (shift, scale)
    shift, scale = vec.sequential(self.adaLN_modulation).chunk(2, dim=1)
    # insert a singleton axis at position 1 so both broadcast against x
    shift, scale = shift[:, None, :], scale[:, None, :]
    modulated = (1 + scale) * self.norm_final(x) + shift
    return self.linear(modulated)
def timestep_embedding(t:Tensor, dim:int, max_period:int=10000, time_factor:float=1000.0) -> Tensor:
  """
  Create sinusoidal timestep embeddings.
  :param t: a 1-D Tensor of N indices, one per batch element.
            These may be fractional.
  :param dim: the dimension of the output.
  :param max_period: controls the minimum frequency of the embeddings.
  :param time_factor: multiplier applied to t before the embedding is computed.
  :return: an (N, D) Tensor of positional embeddings.
  """
  scaled = time_factor * t
  half = dim // 2
  exponents = -math.log(max_period) * Tensor.arange(0, stop=half, dtype=dtypes.float32) / half
  freqs = exponents.exp().to(scaled.device)
  args = scaled[:, None].float() * freqs[None]
  # cosine half first, sine half second
  embedding = args.cos().cat(args.sin(), dim=-1)
  if dim % 2:
    # odd dim: append one zero column so the width equals dim
    embedding = embedding.cat(Tensor.zeros_like(embedding[:, :1]), dim=-1)
  if scaled.is_floating_point():
    embedding = embedding.cast(scaled.dtype)
  return embedding
# https://github.com/black-forest-labs/flux/blob/main/src/flux/model.py
class Flux:
"""
Transformer model for flow matching on sequences.

Image and text tokens first pass through `depth` DoubleStreamBlocks as two streams, are then
concatenated (text first) and passed through `depth_single_blocks` SingleStreamBlocks; only the
image positions are handed to the final layer.
"""
def __init__(
self,
guidance_embed:bool,
in_channels:int = 64,
vec_in_dim:int = 768,
context_in_dim:int = 4096,
hidden_size:int = 3072,
mlp_ratio:float = 4.0,
num_heads:int = 24,
depth:int = 19,
depth_single_blocks:int = 38,
axes_dim:Optional[List[int]] = None,
theta:int = 10_000,
qkv_bias:bool = True,
):
# None stands in for the default list so no mutable default argument is shared between instances
axes_dim = axes_dim or [16, 56, 56]
self.guidance_embed = guidance_embed
self.in_channels = in_channels
self.out_channels = self.in_channels
if hidden_size % num_heads != 0:
raise ValueError(f"Hidden size {hidden_size} must be divisible by num_heads {num_heads}")
# the positional embedding covers exactly one attention head's width
pe_dim = hidden_size // num_heads
if sum(axes_dim) != pe_dim:
raise ValueError(f"Got {axes_dim} but expected positional dim {pe_dim}")
self.hidden_size = hidden_size
self.num_heads = num_heads
self.pe_embedder = EmbedND(dim=pe_dim, theta=theta, axes_dim=axes_dim)
self.img_in = nn.Linear(self.in_channels, self.hidden_size, bias=True)
# 256 matches the timestep_embedding width requested in __call__
self.time_in = MLPEmbedder(in_dim=256, hidden_dim=self.hidden_size)
self.vector_in = MLPEmbedder(vec_in_dim, self.hidden_size)
# without guidance embedding the attribute is an identity function; __call__ only invokes it when guidance_embed is set
self.guidance_in:Callable[[Tensor], Tensor] = MLPEmbedder(in_dim=256, hidden_dim=self.hidden_size) if guidance_embed else tensor_identity
self.txt_in = nn.Linear(context_in_dim, self.hidden_size)
self.double_blocks = [DoubleStreamBlock(self.hidden_size, self.num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias) for _ in range(depth)]
self.single_blocks = [SingleStreamBlock(self.hidden_size, self.num_heads, mlp_ratio=mlp_ratio) for _ in range(depth_single_blocks)]
# patch_size is fixed to 1 here, so the final projection width equals out_channels
self.final_layer = LastLayer(self.hidden_size, 1, self.out_channels)
def __call__(self, img:Tensor, img_ids:Tensor, txt:Tensor, txt_ids:Tensor, timesteps:Tensor, y:Tensor, guidance:Optional[Tensor] = None) -> Tensor:
# Raises ValueError for non-3D img/txt, or when guidance is missing on a guidance-embedding model.
if img.ndim != 3 or txt.ndim != 3:
raise ValueError("Input img and txt tensors must have 3 dimensions.")
# running on sequences img
img = self.img_in(img)
# conditioning vector: timestep embedding (+ guidance embedding) + embedded y
vec = self.time_in(timestep_embedding(timesteps, 256))
if self.guidance_embed:
if guidance is None:
raise ValueError("Didn't get guidance strength for guidance distilled model.")
vec = vec + self.guidance_in(timestep_embedding(guidance, 256))
vec = vec + self.vector_in(y)
txt = self.txt_in(txt)
# ids are joined text-first, the same order the blocks use for the tokens
ids = Tensor.cat(txt_ids, img_ids, dim=1)
pe = self.pe_embedder(ids)
for double_block in self.double_blocks:
img, txt = double_block(img=img, txt=txt, vec=vec, pe=pe)
# from here on `img` holds the joint sequence: text tokens followed by image tokens
img = Tensor.cat(txt, img, dim=1)
for single_block in self.single_blocks:
img = single_block(img, vec=vec, pe=pe)
# drop the leading text positions again, keeping only the image tokens
img = img[:, txt.shape[1] :, ...]
return self.final_layer(img, vec) # (N, T, patch_size ** 2 * out_channels)
# https://github.com/black-forest-labs/flux/blob/main/src/flux/util.py
def load_flow_model(name:str, model_path:str):
  """
  Build a Flux model and load its weights.

  :param name: model name; guidance embedding is enabled for everything except "flux-schnell".
  :param model_path: weights file to load; when empty the file is fetched from urls[name].
  :return: the Flux model with the state dict loaded.
  """
  # Loading Flux
  print("Init model")
  model = Flux(guidance_embed=(name != "flux-schnell"))
  weights_file = model_path if model_path else fetch(urls[name])
  # checkpoint keys containing "scale" are renamed to "weight" before loading
  renamed = {}
  for key, tensor in safe_load(weights_file).items():
    renamed[key.replace("scale", "weight")] = tensor
  load_state_dict(model, renamed)
  return model
def load_T5(max_length:int=512):
  """
  Build the T5 text embedder and load its two-part encoder checkpoint (non-strict).

  :param max_length: passed to T5Embedder; 64, 128, 256 and 512 should work if the sequence is short enough.
  :return: the T5Embedder instance.
  """
  print("Init T5")
  tokenizer_file = fetch(urls["T5_tokenizer"])
  embedder = T5Embedder(max_length, tokenizer_file)
  # both shards are fetched before either one is parsed
  shard_files = [fetch(urls[part]) for part in ("T5_1_of_2", "T5_2_of_2")]
  weights = safe_load(shard_files[0]) | safe_load(shard_files[1])
  load_state_dict(embedder.encoder, weights, strict=False)
  return embedder
def load_clip():
  """Build the CLIP embedder and load the fetched checkpoint into its transformer."""
  print("Init Clip")
  embedder = ClipEmbedder()
  weights = safe_load(fetch(urls["clip"]))
  load_state_dict(embedder.transformer, weights)
  return embedder
def load_ae() -> AutoEncoder:
  """Build the autoencoder with its two fixed constructor constants and load the fetched weights."""
  # Loading the autoencoder
  print("Init AE")
  autoencoder = AutoEncoder(0.3611, 0.1159)
  weights = safe_load(fetch(urls["ae"]))
  load_state_dict(autoencoder, weights)
  return autoencoder
# https://github.com/black-forest-labs/flux/blob/main/src/flux/sampling.py
def prepare(T5:T5Embedder, clip:ClipEmbedder, img:Tensor, prompt:Union[str, List[str]]) -> Dict[str, Tensor]:
"""
Pack the latent image into patch tokens and embed the prompt(s), producing the model inputs.

:param T5: embedder whose output becomes "txt".
:param clip: embedder whose output becomes "vec".
:param img: 4-D latent tensor (b, c, h, w); h and w are split into 2x2 patches.
:param prompt: one prompt or a list of prompts.
:return: dict with keys "img", "img_ids", "txt", "txt_ids", "vec", all moved to img's device.
"""
bs, _, h, w = img.shape
# a single latent combined with a list of prompts: the batch size follows the prompt count
if bs == 1 and not isinstance(prompt, str):
bs = len(prompt)
# fold every 2x2 spatial patch into the feature axis: (b, c, h, w) -> (b, h/2 * w/2, c*4)
img = img.rearrange("b c (h ph) (w pw) -> b (h w) (c ph pw)", ph=2, pw=2)
if img.shape[0] == 1 and bs > 1:
img = img.expand((bs, *img.shape[1:]))
# per-patch position ids: channel 0 stays zero, channel 1 is the patch row, channel 2 the patch column
img_ids = Tensor.zeros(h // 2, w // 2, 3).contiguous()
img_ids[..., 1] = img_ids[..., 1] + Tensor.arange(h // 2)[:, None]
img_ids[..., 2] = img_ids[..., 2] + Tensor.arange(w // 2)[None, :]
img_ids = img_ids.rearrange("h w c -> 1 (h w) c")
img_ids = img_ids.expand((bs, *img_ids.shape[1:]))
if isinstance(prompt, str):
prompt = [prompt]
txt = T5(prompt).realize()
if txt.shape[0] == 1 and bs > 1:
txt = txt.expand((bs, *txt.shape[1:]))
# text tokens all get zero position ids
txt_ids = Tensor.zeros(bs, txt.shape[1], 3)
vec = clip(prompt).realize()
if vec.shape[0] == 1 and bs > 1:
vec = vec.expand((bs, *vec.shape[1:]))
return {"img": img, "img_ids": img_ids.to(img.device), "txt": txt.to(img.device), "txt_ids": txt_ids.to(img.device), "vec": vec.to(img.device)}
def get_schedule(num_steps:int, image_seq_len:int, base_shift:float=0.5, max_shift:float=1.15, shift:bool=True) -> List[float]:
  """
  Build the descending list of sampling timesteps, from 1 down to 0.

  :param num_steps: number of denoising steps; the list has one extra entry for the final zero.
  :param image_seq_len: number of image tokens, used to pick the shift strength.
  :param base_shift: value of mu at a sequence length of 256.
  :param max_shift: value of mu at a sequence length of 4096.
  :param shift: when True, warp the schedule towards high timesteps.
  :return: the timesteps as a list of floats.
  """
  # extra step for zero
  step_size = -1.0 / num_steps
  timesteps = Tensor.arange(1, 0 + step_size, step_size)
  # shifting the schedule to favor high timesteps for higher signal images
  if shift:
    # mu is the line through (256, base_shift) and (4096, max_shift); the intercept must be
    # base_shift itself rather than a literal 0.5, or a non-default base_shift only changes the slope
    mu = base_shift + (max_shift - base_shift) * (image_seq_len - 256) / (4096 - 256)
    timesteps = math.exp(mu) / (math.exp(mu) + (1 / timesteps - 1))
  return timesteps.tolist()
@TinyJit
def run(model, *args):
  """Call the model on args and realize the result; wrapped in TinyJit."""
  output = model(*args)
  return output.realize()
def denoise(model, img:Tensor, img_ids:Tensor, txt:Tensor, txt_ids:Tensor, vec:Tensor, timesteps:List[float], guidance:float=4.0) -> Tensor:
  """
  Step img through consecutive pairs of timesteps, adding (t_prev - t_curr) * prediction each time.

  :param timesteps: schedule values; each adjacent pair (t_curr, t_prev) is one step.
  :param guidance: guidance strength, broadcast over the batch (ignored for schnell).
  :return: the image tokens after the last step.
  """
  def per_sample(value:float) -> Tensor:
    # reads the current img on every call, so device, dtype and batch size follow the running value
    return Tensor((value,), device=img.device, dtype=img.dtype).expand((img.shape[0],))

  # this is ignored for schnell
  guidance_vec = per_sample(guidance)
  steps = list(zip(timesteps[:-1], timesteps[1:]))
  for t_curr, t_prev in tqdm(steps, "Denoising"):
    t_vec = per_sample(t_curr)
    pred = run(model, img, img_ids, txt, txt_ids, t_vec, vec, guidance_vec)
    img = img + (t_prev - t_curr) * pred
  return img
def unpack(x:Tensor, height:int, width:int) -> Tensor:
  """Rearrange patch tokens (b, h*w, c*2*2) back into a (b, c, 2h, 2w) grid, with h = ceil(height/16) and w = ceil(width/16)."""
  rows = math.ceil(height / 16)
  cols = math.ceil(width / 16)
  return x.rearrange("b (h w) (c ph pw) -> b c (h ph) (w pw)", h=rows, w=cols, ph=2, pw=2)
# https://github.com/black-forest-labs/flux/blob/main/src/flux/cli.py
# Command-line entry point: parse options, embed the prompt, denoise, decode and save the image.
if __name__ == "__main__":
default_prompt = "bananas and a can of coke"
parser = argparse.ArgumentParser(description="Run Flux.1", formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument("--name", type=str, default="flux-schnell", help="Name of the model to load")
parser.add_argument("--model_path", type=str, default="", help="path of the model file")
parser.add_argument("--width", type=int, default=512, help="width of the sample in pixels (should be a multiple of 16)")
parser.add_argument("--height", type=int, default=512, help="height of the sample in pixels (should be a multiple of 16)")
parser.add_argument("--seed", type=int, default=None, help="Set a seed for sampling")
parser.add_argument("--prompt", type=str, default=default_prompt, help="Prompt used for sampling")
parser.add_argument('--out', type=str, default=Path(tempfile.gettempdir()) / "rendered.png", help="Output filename")
parser.add_argument("--num_steps", type=int, default=None, help="number of sampling steps (default 4 for schnell, 50 for guidance distilled)") #noqa:E501
parser.add_argument("--guidance", type=float, default=3.5, help="guidance value used for guidance distillation")
# NOTE(review): --output_dir is parsed but not referenced anywhere else in this block
parser.add_argument("--output_dir", type=str, default="output", help="output directory")
args = parser.parse_args()
if args.name not in ["flux-schnell", "flux-dev"]:
raise ValueError(f"Got unknown model name: {args.name}, chose from flux-schnell and flux-dev")
if args.num_steps is None:
args.num_steps = 4 if args.name == "flux-schnell" else 50
# allow for packing and conversion to latent space
# (both sides are rounded down to a multiple of 16)
height = 16 * (args.height // 16)
width = 16 * (args.width // 16)
# no seed given: report the seed Tensor already holds instead of setting one
if args.seed is None: args.seed = Tensor._seed
else: Tensor.manual_seed(args.seed)
print(f"Generating with seed {args.seed}:\n{args.prompt}")
t0 = time.perf_counter()
# prepare input noise
x = Tensor.randn(1, 16, 2 * math.ceil(height / 16), 2 * math.ceil(width / 16), dtype="bfloat16")
# load text embedders
T5 = load_T5(max_length=256 if args.name == "flux-schnell" else 512)
clip = load_clip()
# embed text to get inputs for model
inp = prepare(T5, clip, x, prompt=args.prompt)
# the schedule shift is disabled for flux-schnell
timesteps = get_schedule(args.num_steps, inp["img"].shape[1], shift=(args.name != "flux-schnell"))
# done with text embedders
del T5, clip
# load model
model = load_flow_model(args.name, args.model_path)
# denoise initial noise
x = denoise(model, **inp, timesteps=timesteps, guidance=args.guidance)
# done with model
del model, run
# load autoencoder
ae = load_ae()
# decode latents to pixel space
x = unpack(x.float(), height, width)
x = ae.decode(x).realize()
t1 = time.perf_counter()
print(f"Done in {t1 - t0:.1f}s. Saving {args.out}")
# bring into PIL format and save
# clamp to [-1, 1], move channels last, then map linearly onto 0..255
x = x.clamp(-1, 1)
x = x[0].rearrange("c h w -> h w c")
x = (127.5 * (x + 1.0)).cast("uint8")
img = Image.fromarray(x.numpy())
img.save(args.out)
# validation!
# only runs for the default prompt, flux-schnell, seed 0 and a 512x512 image; compares against a stored reference image
if args.prompt == default_prompt and args.name=="flux-schnell" and args.seed == 0 and args.width == args.height == 512:
ref_image = Tensor(np.array(Image.open("examples/flux1_seed0.png")))
# mean squared difference, with each difference divided by the reference image's maximum value
distance = (((x.cast(dtypes.float) - ref_image.cast(dtypes.float)) / ref_image.max())**2).mean().item()
assert distance < 4e-3, colored(f"validation failed with {distance=}", "red")
print(colored(f"output validated with {distance=}", "green"))

108
examples/gradaccum_mnist.py Normal file
View file

@ -0,0 +1,108 @@
import itertools
from typing import Callable
from tinygrad import nn, Tensor, dtypes, Device, TinyJit
from tinygrad.helpers import getenv, trange, partition
class Model:
  """Small convolutional classifier: two conv/conv/batchnorm/pool stages followed by a linear head with 10 outputs."""
  def __init__(self):
    stage1 = [nn.Conv2d(1, 32, 5), Tensor.relu, nn.Conv2d(32, 32, 5), Tensor.relu, nn.BatchNorm(32), Tensor.max_pool2d]
    stage2 = [nn.Conv2d(32, 64, 3), Tensor.relu, nn.Conv2d(64, 64, 3), Tensor.relu, nn.BatchNorm(64), Tensor.max_pool2d]
    head = [lambda x: x.flatten(1), nn.Linear(576, 10)]
    self.layers: list[Callable[[Tensor], Tensor]] = stage1 + stage2 + head

  def __call__(self, x:Tensor) -> Tensor:
    """Apply every layer in order."""
    return x.sequential(self.layers)
# TODO: refactor this into optim/onnx
def functional_adam(g:Tensor, m:Tensor, v:Tensor, b1_t:Tensor, b2_t:Tensor, lr=0.001, b1=0.9, b2=0.999, eps=1e-6) -> Tensor:
"""
One Adam step written as a function over explicit state tensors.

:param g: gradient.
:param m: first-moment state, overwritten through assign.
:param v: second-moment state, overwritten through assign.
:param b1_t: running product of b1, advanced by one factor here.
:param b2_t: running product of b2, advanced by one factor here.
:return: the update lr * m_hat / (sqrt(v_hat) + eps); the caller subtracts it from the parameters.
"""
# NOTE(review): relies on `*=` updating the caller's tensors in place, not just rebinding the local names; confirm
b1_t *= b1
b2_t *= b2
m.assign(b1 * m + (1.0 - b1) * g)
v.assign(b2 * v + (1.0 - b2) * (g * g))
# bias correction using the running beta products
m_hat = m / (1.0 - b1_t)
v_hat = v / (1.0 - b2_t)
return lr * (m_hat / (v_hat.sqrt() + eps))
# Script: MNIST training with gradient accumulation over ACC_STEPS microbatches and Adam state kept on "CPU".
if __name__ == "__main__":
BS = getenv("BS", 512)
ACC_STEPS = getenv("ACC_STEPS", 8)
X_train, Y_train, X_test, Y_test = nn.datasets.mnist()
model = Model()
params = nn.state.get_parameters(model)
# init params, set requires grad on the ones we need gradients of
for x in params:
if x.requires_grad is None: x.requires_grad_()
x.replace(x.contiguous())
Tensor.realize(*params)
# split params (with grads) and buffers (without)
params, buffers = partition(params, lambda x: x.requires_grad)
print(f"params: {len(params)} buffers: {len(buffers)}")
# optim params
# pos_params[j]:pos_params[j+1] is the range of parameter j inside the flat vectors; the last entry is the total element count
pos_params = list(itertools.accumulate(params, lambda x,y: x+y.numel(), initial=0))
adam_m = Tensor.zeros(pos_params[-1], device="CPU").contiguous()
adam_v = Tensor.zeros(pos_params[-1], device="CPU").contiguous()
adam_b1_t = Tensor.ones((1,), dtype=dtypes.float32, device="CPU", requires_grad=False).contiguous()
adam_b2_t = Tensor.ones((1,), dtype=dtypes.float32, device="CPU", requires_grad=False).contiguous()
adam_params = [adam_m, adam_v, adam_b1_t, adam_b2_t]
# create loss and grads. init all state so the JIT works on microbatch
for x in params: x.assign(x.detach())
loss = Tensor.zeros(tuple()).contiguous()
grads = Tensor.zeros(pos_params[-1]).contiguous()
Tensor.realize(*params, *buffers, *adam_params, loss, grads)
# one microbatch: sample BS // ACC_STEPS examples, backprop, and add loss and flat gradients to the accumulators
@TinyJit
@Tensor.train()
def microbatch():
samples = Tensor.randint(BS // ACC_STEPS, high=X_train.shape[0])
for t in params: t.grad = None
# divide by ACC_STEPS at the loss
uloss = (model(X_train[samples]).sparse_categorical_crossentropy(Y_train[samples]) / ACC_STEPS).backward()
# flatten every parameter gradient into one vector laid out like pos_params
ugrads = Tensor.cat(*[t.grad.contiguous().flatten() for t in params], dim=0)
for t in params: t.grad = None
# concat the grads and assign them
loss.assign(loss + uloss)
grads.assign(grads + ugrads)
Tensor.realize(*params, *buffers, loss, grads)
# optimizer step: apply Adam to the accumulated gradients, update parameters, reset the accumulators
@TinyJit
def optimizer():
# run optimizer (on CPU, where adam params live)
delta = functional_adam(grads.to("CPU"), adam_m, adam_v, adam_b1_t, adam_b2_t)
# update the params, copying back the delta one at a time to avoid OOM
# NOTE: the scheduler is ordering things poorly, all the copies are happening before the adds
for j,tt in enumerate(params):
tt.assign(tt.detach() - delta[pos_params[j]:pos_params[j+1]].reshape(tt.shape).to(Device.DEFAULT))
# realize everything, zero out loss and grads
loss.assign(Tensor.zeros_like(loss))
grads.assign(Tensor.zeros_like(grads))
Tensor.realize(*params, *adam_params, loss, grads)
# test accuracy in percent over the whole test set
@TinyJit
def get_test_acc() -> Tensor: return (model(X_test).argmax(axis=1) == Y_test).mean()*100
test_acc = float('nan')
for i in (t:=trange(getenv("STEPS", 70))):
# microbatch sets the gradients
for _ in range(ACC_STEPS): microbatch()
# get the loss before the optimizer clears it
# this is already realized so this isn't a schedule
loss_item = loss.item()
# run the optimizer
optimizer()
# eval
# (every 10th step; the previous value is shown in between)
if i%10 == 9: test_acc = get_test_acc().item()
t.set_description(f"loss: {loss_item:6.2f} test_accuracy: {test_acc:5.2f}%")

View file

@ -1,299 +0,0 @@
from extra.models.mask_rcnn import MaskRCNN
from extra.models.resnet import ResNet
from extra.models.mask_rcnn import BoxList
from torch.nn import functional as F
from torchvision import transforms as T
from torchvision.transforms import functional as Ft
import random
from tinygrad.tensor import Tensor
from PIL import Image
import numpy as np
import torch
import argparse
import cv2
class Resize:
  """Resize an image so its short side equals a chosen size, capped so the long side stays within max_size."""
  def __init__(self, min_size, max_size):
    # a scalar min_size becomes a one-element tuple so get_size can always draw from a sequence
    self.min_size = min_size if isinstance(min_size, (list, tuple)) else (min_size,)
    self.max_size = max_size

  # modified from torchvision to add support for max size
  def get_size(self, image_size):
    """Return the target (height, width) for an image whose size is given as (width, height)."""
    w, h = image_size
    size = random.choice(self.min_size)
    limit = self.max_size
    if limit is not None:
      short_side = float(min((w, h)))
      long_side = float(max((w, h)))
      # shrink the target when scaling the short side to `size` would push the long side past the limit
      if long_side / short_side * size > limit:
        size = int(round(limit * short_side / long_side))
    # the short side already has the target size: keep the image dimensions
    if (w <= h and w == size) or (h <= w and h == size):
      return (h, w)
    if w < h:
      return (int(size * h / w), size)
    return (size, int(size * w / h))

  def __call__(self, image):
    """Resize image to the size computed from image.size."""
    return Ft.resize(image, self.get_size(image.size))
class Normalize:
  """Scale an image tensor by 255, optionally reverse its first-axis channel order, then normalise with mean/std."""
  def __init__(self, mean, std, to_bgr255=True):
    self.mean, self.std = mean, std
    self.to_bgr255 = to_bgr255

  def __call__(self, image):
    # [2, 1, 0] reverses the three channels; [0, 1, 2] keeps them as they are
    channel_order = [2, 1, 0] if self.to_bgr255 else [0, 1, 2]
    scaled = image[channel_order] * 255
    return Ft.normalize(scaled, mean=self.mean, std=self.std)
def transforms(size_scale):
  """Build the preprocessing pipeline: resize (800/1333 limits scaled by size_scale), to tensor, BGR*255 mean subtraction."""
  steps = [
    Resize(int(800*size_scale), int(1333*size_scale)),
    T.ToTensor(),
    Normalize(mean=[102.9801, 115.9465, 122.7717], std=[1., 1., 1.], to_bgr255=True),
  ]
  return T.Compose(steps)
def expand_boxes(boxes, scale):
  """
  Scale every box about its centre.

  :param boxes: (N, 4) tensor of x1, y1, x2, y2 rows.
  :param scale: factor applied to each box's half-width and half-height.
  :return: a new tensor shaped and typed like boxes holding the scaled boxes.
  """
  x1, y1, x2, y2 = boxes[:, 0], boxes[:, 1], boxes[:, 2], boxes[:, 3]
  half_w = (x2 - x1) * .5
  half_h = (y2 - y1) * .5
  centre_x = (x2 + x1) * .5
  centre_y = (y2 + y1) * .5
  half_w *= scale
  half_h *= scale
  expanded = torch.zeros_like(boxes)
  columns = ((0, centre_x - half_w), (2, centre_x + half_w), (1, centre_y - half_h), (3, centre_y + half_h))
  for col, values in columns:
    expanded[:, col] = values
  return expanded
def expand_masks(mask, padding):
  """
  Zero-pad square masks by `padding` pixels on every side.

  :param mask: tensor whose last two axes are M x M; it must broadcast into (N, 1, M, M), N being its first axis.
  :param padding: number of zero pixels added on each side; 0 is allowed.
  :return: (padded_mask, scale), where scale = (M + 2*padding) / M is the size ratio after padding.
  """
  N = mask.shape[0]
  M = mask.shape[-1]
  pad2 = 2 * padding
  scale = float(M + pad2) / M
  padded_mask = mask.new_zeros((N, 1, M + pad2, M + pad2))
  # explicit end index: a `padding:-padding` slice would be empty when padding == 0
  padded_mask[:, :, padding:padding + M, padding:padding + M] = mask
  return padded_mask, scale
def paste_mask_in_image(mask, box, im_h, im_w, thresh=0.5, padding=1):
"""
Resize one mask to its (expanded) box and paste it into an empty image-sized canvas.

:param mask: object exposing .numpy(); converted to a torch tensor and passed to expand_masks.
:param box: object exposing .numpy(); converted to a torch tensor and read as x1, y1, x2, y2.
:param im_h: canvas height.
:param im_w: canvas width.
:param thresh: if >= 0 the resized mask is binarised with `> thresh`; otherwise it is scaled by 255 and cast to uint8.
:param padding: zero padding added around the mask before resizing.
:return: a uint8 torch tensor of shape (im_h, im_w).
"""
# TODO: remove torch
mask = torch.tensor(mask.numpy())
box = torch.tensor(box.numpy())
# pad the mask and grow the box by the same ratio
padded_mask, scale = expand_masks(mask[None], padding=padding)
mask = padded_mask[0, 0]
box = expand_boxes(box[None], scale)[0]
box = box.to(dtype=torch.int32)
# box width/height are counted inclusively (x2 - x1 + 1), with a minimum of one pixel
TO_REMOVE = 1
w = int(box[2] - box[0] + TO_REMOVE)
h = int(box[3] - box[1] + TO_REMOVE)
w = max(w, 1)
h = max(h, 1)
# F.interpolate is given a 4-D input, so add two leading singleton axes
mask = mask.expand((1, 1, -1, -1))
mask = mask.to(torch.float32)
mask = F.interpolate(mask, size=(h, w), mode='bilinear', align_corners=False)
mask = mask[0][0]
if thresh >= 0:
mask = mask > thresh
else:
mask = (mask * 255).to(torch.uint8)
im_mask = torch.zeros((im_h, im_w), dtype=torch.uint8)
# clip the box to the canvas; the same offsets select the matching part of the resized mask
x_0 = max(box[0], 0)
x_1 = min(box[2] + 1, im_w)
y_0 = max(box[1], 0)
y_1 = min(box[3] + 1, im_h)
im_mask[y_0:y_1, x_0:x_1] = mask[
(y_0 - box[1]): (y_1 - box[1]), (x_0 - box[0]): (x_1 - box[0])
]
return im_mask
class Masker:
  """Paste per-detection masks into full-image canvases using paste_mask_in_image."""
  def __init__(self, threshold=0.5, padding=1):
    # both values are forwarded unchanged to paste_mask_in_image
    self.threshold = threshold
    self.padding = padding

  def forward_single_image(self, masks, boxes):
    """
    Paste every mask of one image into an image-sized canvas.

    :param masks: iterable of per-detection masks; element [0] of each is pasted.
    :param boxes: box container providing convert("xyxy"), .size as (width, height) and .bbox.
    :return: a Tensor built from the stacked canvases, shape (num_detections, 1, im_h, im_w).
    """
    boxes = boxes.convert("xyxy")
    im_w, im_h = boxes.size
    res = [
      paste_mask_in_image(mask[0], box, im_h, im_w, self.threshold, self.padding)
      for mask, box in zip(masks, boxes.bbox)
    ]
    if len(res) > 0:
      # torch.stack takes the sequence itself; unpacking it would pass each canvas as a separate positional argument
      res = torch.stack(res, dim=0)[:, None]
    else:
      res = masks.new_empty((0, 1, masks.shape[-2], masks.shape[-1]))
    return Tensor(res.numpy())

  def __call__(self, masks, boxes):
    """Run forward_single_image on each (masks, boxes) pair; a single BoxList is wrapped in a list first."""
    if isinstance(boxes, BoxList):
      boxes = [boxes]
    results = []
    for mask, box in zip(masks, boxes):
      result = self.forward_single_image(mask, box)
      results.append(result)
    return results

# module-level instance used by compute_prediction
masker = Masker(threshold=0.5, padding=1)
def select_top_predictions(predictions, confidence_threshold=0.9):
  """Return predictions indexed by the positions whose "scores" value is strictly above confidence_threshold."""
  scores = predictions.get_field("scores").numpy()
  keep = []
  for idx, score in enumerate(scores):
    if score > confidence_threshold:
      keep.append(idx)
  return predictions[keep]
def compute_prediction(original_image, model, confidence_threshold, size_scale=1.0):
  """
  Run the model on one image and return its filtered prediction, rescaled to the original image size.

  :param original_image: image exposing .size as (width, height); passed through transforms(size_scale).
  :param model: callable returning a sequence of predictions; the first one is used.
  :param confidence_threshold: score cut-off handed to select_top_predictions.
  :param size_scale: scale factor for the preprocessing resize.
  """
  preprocess = transforms(size_scale)
  image = Tensor(preprocess(original_image).numpy(), requires_grad=False)
  prediction = select_top_predictions(model(image)[0], confidence_threshold)
  width, height = original_image.size
  prediction = prediction.resize((width, height))
  if prediction.has_field("mask"):
    # replace the raw masks with canvases pasted at full image size
    pasted = masker([prediction.get_field("mask")], [prediction])[0]
    prediction.add_field("mask", pasted)
  return prediction
def compute_prediction_batched(batch, model, size_scale=1.0):
  """Preprocess every image in batch, run the model on the list of tensors and return its raw predictions."""
  arrays = [transforms(size_scale)(img).numpy() for img in batch]
  tensors = [Tensor(arr, requires_grad=False) for arr in arrays]
  predictions = model(tensors)
  # drop the local reference to the input tensors before returning
  del tensors
  return predictions
# per-channel multipliers; compute_colors_for_labels multiplies label ids by these and reduces modulo 255
palette = np.array([2 ** 25 - 1, 2 ** 15 - 1, 2 ** 21 - 1])
def findContours(*args, **kwargs):
  """
  Version-independent wrapper around cv2.findContours.

  Arguments are forwarded unchanged.
  :return: (contours, hierarchy).
  """
  # OpenCV 3.x returns (image, contours, hierarchy); other major versions return (contours, hierarchy).
  # Taking the last two values covers both shapes and never leaves the names unbound.
  contours, hierarchy = cv2.findContours(*args, **kwargs)[-2:]
  return contours, hierarchy
def compute_colors_for_labels(labels):
  """Map a 1-D array of label ids to an (N, 3) uint8 colour array: (label * palette) % 255."""
  scaled = labels[:, None] * palette
  return (scaled % 255).astype("uint8")
def overlay_mask(image, predictions):
"""
Draw the contour of every predicted mask onto the image, coloured by label.

:param image: anything np.asarray accepts.
:param predictions: object whose "mask" and "labels" fields expose .numpy().
:return: the image array with contours drawn.
"""
image = np.asarray(image)
masks = predictions.get_field("mask").numpy()
labels = predictions.get_field("labels").numpy()
colors = compute_colors_for_labels(labels).tolist()
for mask, color in zip(masks, colors):
# first channel of the mask with a trailing singleton axis added
thresh = mask[0, :, :, None]
# hierarchy is returned by the wrapper but not used here
contours, hierarchy = findContours(
thresh, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE
)
# -1 draws all contours; 3 is the line thickness
image = cv2.drawContours(image, contours, -1, color, 3)
composite = image
return composite
# class names indexed by integer label id (overlay_class_names does CATEGORIES[int(label)]); index 0 is the background entry
CATEGORIES = [
"__background", "person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat", "traffic light",
"fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", "cow", "elephant",
"bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee", "skis", "snowboard",
"sports ball", "kite", "baseball bat", "baseball glove", "skateboard", "surfboard", "tennis racket", "bottle",
"wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple", "sandwich", "orange", "broccoli",
"carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch", "potted plant", "bed", "dining table",
"toilet", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone", "microwave", "oven", "toaster",
"sink", "refrigerator", "book", "clock", "vase", "scissors", "teddy bear", "hair drier", "toothbrush",
]
def overlay_boxes(image, predictions):
  """
  Draw every predicted box on the image as a 1-pixel rectangle coloured by label.

  :param image: anything np.asarray accepts.
  :param predictions: object with a "labels" field and a .bbox iterable of boxes exposing .numpy().
  :return: the image array with rectangles drawn.
  """
  labels = predictions.get_field("labels").numpy()
  boxes = predictions.bbox
  image = np.asarray(image)
  colors = compute_colors_for_labels(labels).tolist()
  for box, color in zip(boxes, colors):
    # integer pixel coordinates: first two values are the top-left corner, the rest the bottom-right
    coords = torch.tensor(box.numpy()).to(torch.int64)
    top_left = tuple(coords[:2].tolist())
    bottom_right = tuple(coords[2:].tolist())
    image = cv2.rectangle(image, top_left, bottom_right, tuple(color), 1)
  return image
def overlay_class_names(image, predictions):
  """
  Write "<class name>: <score>" at the first two coordinates of every predicted box.

  :param image: anything np.asarray accepts.
  :param predictions: object with "scores" and "labels" fields and a .bbox exposing .numpy().
  :return: the image array with the captions drawn.
  """
  scores = predictions.get_field("scores").numpy().tolist()
  label_ids = predictions.get_field("labels").numpy().tolist()
  names = [CATEGORIES[int(i)] for i in label_ids]
  boxes = predictions.bbox.numpy()
  image = np.asarray(image)
  for box, score, name in zip(boxes, scores, names):
    x, y = box[:2]
    caption = "{}: {:.2f}".format(name, score)
    cv2.putText(image, caption, (int(x), int(y)), cv2.FONT_HERSHEY_SIMPLEX, .5, (255, 255, 255), 1)
  return image
# Command-line entry point: run Mask R-CNN on one image and save/show the annotated result.
if __name__ == '__main__':
parser = argparse.ArgumentParser(description='Run MaskRCNN', formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('--image', type=str, help="Path of the image to run")
parser.add_argument('--threshold', type=float, default=0.7, help="Detector threshold")
parser.add_argument('--size_scale', type=float, default=1.0, help="Image resize multiplier")
parser.add_argument('--out', type=str, default="/tmp/rendered.png", help="Output filename")
args = parser.parse_args()
resnet = ResNet(50, num_classes=None, stride_in_1x1=True)
model_tiny = MaskRCNN(resnet)
model_tiny.load_from_pretrained()
img = Image.open(args.image)
top_result_tiny = compute_prediction(img, model_tiny, confidence_threshold=args.threshold, size_scale=args.size_scale)
# each overlay step draws on the output of the previous one: boxes, then mask contours, then captions
bbox_image = overlay_boxes(img, top_result_tiny)
mask_image = overlay_mask(bbox_image, top_result_tiny)
final_image = overlay_class_names(mask_image, top_result_tiny)
im = Image.fromarray(final_image)
print(f"saving {args.out}")
im.save(args.out)
im.show()

View file

@ -223,13 +223,13 @@ def get_mlperf_bert_model():
def get_fake_data_bert(BS:int):
return {
"input_ids": Tensor.empty((BS, 512), dtype=dtypes.int32, device="CPU"),
"input_mask": Tensor.empty((BS, 512), dtype=dtypes.int32, device="CPU"),
"segment_ids": Tensor.empty((BS, 512), dtype=dtypes.int32, device="CPU"),
"masked_lm_positions": Tensor.empty((BS, 76), dtype=dtypes.int32, device="CPU"),
"masked_lm_ids": Tensor.empty((BS, 76), dtype=dtypes.int32, device="CPU"),
"masked_lm_weights": Tensor.empty((BS, 76), dtype=dtypes.float32, device="CPU"),
"next_sentence_labels": Tensor.empty((BS, 1), dtype=dtypes.int32, device="CPU"),
"input_ids": Tensor.zeros((BS, 512), dtype=dtypes.int32, device="CPU").contiguous(),
"input_mask": Tensor.zeros((BS, 512), dtype=dtypes.int32, device="CPU").contiguous(),
"segment_ids": Tensor.zeros((BS, 512), dtype=dtypes.int32, device="CPU").contiguous(),
"masked_lm_positions": Tensor.zeros((BS, 76), dtype=dtypes.int32, device="CPU").contiguous(),
"masked_lm_ids": Tensor.zeros((BS, 76), dtype=dtypes.int32, device="CPU").contiguous(),
"masked_lm_weights": Tensor.zeros((BS, 76), dtype=dtypes.float32, device="CPU").contiguous(),
"next_sentence_labels": Tensor.zeros((BS, 1), dtype=dtypes.int32, device="CPU").contiguous(),
}
def find_matches(match_quality_matrix:np.ndarray, high_threshold:float=0.5, low_threshold:float=0.4, allow_low_quality_matches:bool=False) -> np.ndarray:

View file

@ -59,9 +59,7 @@ class EmbeddingBert(nn.Embedding):
arange_shp, weight_shp, big_shp = (1, 1, self.vocab_sz, 1), (1, 1, self.vocab_sz, self.embed_sz), idx.shape+(self.vocab_sz, self.embed_sz,)
if not hasattr(self, 'arange'): self.arange = Tensor.arange(self.vocab_sz, requires_grad=False, device=self.weight.device).reshape(arange_shp)
arange, idx, vals = self.arange.expand(big_shp), idx.reshape(idx.shape+(1, 1,)).expand(big_shp), self.weight.cast(dtypes.default_float).reshape(weight_shp).expand(big_shp)
# TODO: contiguous() here because the embedding dropout creates different asts on each device, and search becomes very slow.
# Should fix with fixing random ast on multi device, and fuse arange to make embedding fast.
return (arange == idx).mul(vals).sum(2, dtype=vals.dtype).contiguous()
return (arange == idx).where(vals, 0).sum(2, dtype=vals.dtype)
class LayerNormBert:
def __init__(self, normalized_shape:Union[int, tuple[int, ...]], eps:float=1e-12, elementwise_affine:bool=True):

View file

@ -204,43 +204,6 @@ def eval_bert():
st = time.perf_counter()
def eval_mrcnn():
"""
Run the pretrained Mask R-CNN over the COCO `images` list and evaluate bbox and segmentation results.

Predictions are written to two JSON files under /tmp and then scored with evaluate_predictions_on_coco.
"""
from tqdm import tqdm
from extra.models.mask_rcnn import MaskRCNN
from extra.models.resnet import ResNet
from extra.datasets.coco import BASEDIR, images, convert_prediction_to_coco_bbox, convert_prediction_to_coco_mask, accumulate_predictions_for_coco, evaluate_predictions_on_coco, iterate
from examples.mask_rcnn import compute_prediction_batched, Image
mdl = MaskRCNN(ResNet(50, num_classes=None, stride_in_1x1=True))
mdl.load_from_pretrained()
bbox_output = '/tmp/results_bbox.json'
mask_output = '/tmp/results_mask.json'
# called with an empty list and rm=True before the loop; presumably this clears earlier result files (confirm)
accumulate_predictions_for_coco([], bbox_output, rm=True)
accumulate_predictions_for_coco([], mask_output, rm=True)
#TODO: bs > 1 not as accurate
bs = 1
for batch in tqdm(iterate(images, bs=bs), total=len(images)//bs):
batch_imgs = []
for image_row in batch:
image_name = image_row['file_name']
img = Image.open(BASEDIR/f'val2017/{image_name}').convert("RGB")
batch_imgs.append(img)
batch_result = compute_prediction_batched(batch_imgs, mdl)
# convert each image's prediction to COCO-format box and mask entries and add them to the result files
for image_row, result in zip(batch, batch_result):
image_name = image_row['file_name']
box_pred = convert_prediction_to_coco_bbox(image_name, result)
mask_pred = convert_prediction_to_coco_mask(image_name, result)
accumulate_predictions_for_coco(box_pred, bbox_output)
accumulate_predictions_for_coco(mask_pred, mask_output)
del batch_imgs
del batch_result
evaluate_predictions_on_coco(bbox_output, iou_type='bbox')
evaluate_predictions_on_coco(mask_output, iou_type='segm')
def eval_llama3():
from extra.models.llama import Transformer
from examples.llama3 import MODEL_PARAMS, load, convert_from_huggingface
@ -541,7 +504,7 @@ if __name__ == "__main__":
# inference only
Tensor.training = False
models = getenv("MODEL", "resnet,retinanet,unet3d,rnnt,bert,mrcnn").split(",")
models = getenv("MODEL", "resnet,retinanet,unet3d,rnnt,bert").split(",")
for m in models:
nm = f"eval_{m}"
if nm in globals():

View file

@ -918,40 +918,6 @@ def train_rnnt():
# TODO: RNN-T
pass
@TinyJit
def train_step_bert(model, optimizer, scheduler, loss_scaler:float, GPUS, grad_acc:int, **kwargs):
"""
One BERT training step with gradient accumulation over grad_acc microbatches.

:param optimizer: must expose zero_grad, params, step, indexing ([0]) and .optimizers[0].lr.
:param loss_scaler: the loss is multiplied by this before backward and the gradients divided by it afterwards.
:param GPUS: device list; inputs are sharded on axis 0 across them, or moved to GPUS[0] when there is only one.
:param grad_acc: number of microbatches; inputs for microbatch i are read from kwargs under "<name>{i}".
:return: (loss of the last microbatch, global gradient norm, learning rate of the first optimizer).
"""
optimizer.zero_grad()
for i in range(grad_acc):
input_ids, segment_ids = kwargs[f"input_ids{i}"], kwargs[f"segment_ids{i}"]
# NOTE: these two have different names
attention_mask, masked_positions = kwargs[f"input_mask{i}"], kwargs[f"masked_lm_positions{i}"]
masked_lm_ids, masked_lm_weights, next_sentence_labels = kwargs[f"masked_lm_ids{i}"], kwargs[f"masked_lm_weights{i}"], kwargs[f"next_sentence_labels{i}"]
for t in [input_ids, segment_ids, attention_mask, masked_positions, masked_lm_ids, masked_lm_weights, next_sentence_labels]:
if len(GPUS) > 1: t.shard_(GPUS, axis=0)
else: t.to_(GPUS[0])
lm_logits, seq_relationship_logits = model(input_ids, attention_mask, masked_positions, segment_ids)
loss = model.loss(lm_logits, seq_relationship_logits, masked_lm_ids, masked_lm_weights, next_sentence_labels)
(loss * loss_scaler).backward()
# TODO: OOM without this realize with large grad_acc
Tensor.realize(*[p.grad for p in optimizer.params])
# undo the loss scaling and accumulate the squared L2 norm of all gradients in float32
global_norm = Tensor(0.0, dtype=dtypes.float32, device=optimizer[0].device)
for p in optimizer.params:
p.grad = p.grad / loss_scaler
global_norm += p.grad.float().square().sum()
global_norm = global_norm.sqrt().contiguous()
# clip: when the global norm exceeds 1.0 every gradient is divided by it, otherwise left as is
for p in optimizer.params:
p.grad = (global_norm > 1.0).where((p.grad/global_norm).cast(p.grad.dtype), p.grad)
optimizer.step()
scheduler.step()
# TODO: no to("CPU") here because it blocks and messes the python time
Tensor.realize(loss, global_norm, optimizer.optimizers[0].lr)
return loss, global_norm, optimizer.optimizers[0].lr
@TinyJit
def eval_step_bert(model, input_ids:Tensor, segment_ids:Tensor, attention_mask:Tensor, masked_positions:Tensor, masked_lm_ids:Tensor,
masked_lm_weights:Tensor, next_sentence_labels:Tensor, GPUS):
@ -1014,7 +980,8 @@ def train_bert():
# ** hyperparameters **
BS = config["BS"] = getenv("BS", 11 * len(GPUS) if dtypes.default_float in (dtypes.float16, dtypes.bfloat16) else 8 * len(GPUS))
grad_acc = config["GRADIENT_ACC_STEPS"] = getenv("GRADIENT_ACC_STEPS", 1)
# TODO: mlperf logging
# TODO: implement grad accumulation + mlperf logging
assert grad_acc == 1
GBS = config["GLOBAL_BATCH_SIZE"] = BS * grad_acc
EVAL_BS = config["EVAL_BS"] = getenv("EVAL_BS", 1 * len(GPUS))
max_lr = config["OPT_BASE_LEARNING_RATE"] = getenv("OPT_BASE_LEARNING_RATE", 0.000175 * math.sqrt(GBS/96))
@ -1073,8 +1040,8 @@ def train_bert():
# ** Optimizer **
parameters_no_wd = [v for k, v in get_state_dict(model).items() if "bias" in k or "LayerNorm" in k]
parameters = [x for x in parameters if x not in set(parameters_no_wd)]
optimizer_wd = LAMB(parameters, lr=max_lr, b1=opt_lamb_beta_1, b2=opt_lamb_beta_2, eps=epsilon, weight_decay=decay, adam=False)
parameters_wd = [x for x in parameters if x not in set(parameters_no_wd)]
optimizer_wd = LAMB(parameters_wd, lr=max_lr, b1=opt_lamb_beta_1, b2=opt_lamb_beta_2, eps=epsilon, weight_decay=decay, adam=False)
optimizer_no_wd = LAMB(parameters_no_wd, lr=max_lr, b1=opt_lamb_beta_1, b2=opt_lamb_beta_2, eps=epsilon, weight_decay=0.0, adam=False)
optimizer_group = OptimizerGroup(optimizer_wd, optimizer_no_wd)
@ -1131,12 +1098,38 @@ def train_bert():
# ** train loop **
wc_start = time.perf_counter()
i, train_data = start_step, [next(train_it) for _ in range(grad_acc)]
i, train_data = start_step, next(train_it)
if RUNMLPERF:
if MLLOGGER:
MLLOGGER.start(key=mllog_constants.EPOCH_START, value=i*GBS, metadata={"epoch_num": i*GBS})
@TinyJit
def train_step_bert(input_ids:Tensor, segment_ids:Tensor, attention_mask:Tensor,
masked_positions:Tensor, masked_lm_ids:Tensor, masked_lm_weights:Tensor, next_sentence_labels:Tensor):
for t in [input_ids, segment_ids, attention_mask, masked_positions, masked_lm_ids, masked_lm_weights, next_sentence_labels]:
if len(GPUS) > 1: t.shard_(GPUS, axis=0)
else: t.to_(GPUS[0])
optimizer_group.zero_grad()
lm_logits, seq_relationship_logits = model(input_ids, attention_mask, masked_positions, segment_ids)
loss = model.loss(lm_logits, seq_relationship_logits, masked_lm_ids, masked_lm_weights, next_sentence_labels)
(loss * loss_scaler).backward()
global_norm = Tensor(0.0, dtype=dtypes.float32, device=optimizer_group[0].device)
for p in optimizer_group.params:
p.grad = p.grad / loss_scaler
global_norm += p.grad.float().square().sum()
global_norm = global_norm.sqrt().contiguous()
for p in optimizer_group.params:
p.grad = (global_norm > 1.0).where((p.grad/global_norm).cast(p.grad.dtype), p.grad)
optimizer_group.step()
scheduler_group.step()
# TODO: no to("CPU") here because it blocks and messes the python time
Tensor.realize(loss, global_norm, optimizer_group.optimizers[0].lr)
return loss, global_norm, optimizer_group.optimizers[0].lr
while train_data is not None and i < train_steps and not achieved:
if getenv("TRAIN", 1):
Tensor.training = True
@ -1144,16 +1137,12 @@ def train_bert():
st = time.perf_counter()
GlobalCounters.reset()
with WallTimeEvent(BenchEvent.STEP):
data = {f"{k}{i}":v for i,d in enumerate(train_data) for k,v in d.items()}
loss, global_norm, lr = train_step_bert(model, optimizer_group, scheduler_group, loss_scaler, GPUS, grad_acc, **data)
loss, global_norm, lr = train_step_bert(
train_data["input_ids"], train_data["segment_ids"], train_data["input_mask"], train_data["masked_lm_positions"], \
train_data["masked_lm_ids"], train_data["masked_lm_weights"], train_data["next_sentence_labels"])
pt = time.perf_counter()
try:
next_data = [next(train_it) for _ in range(grad_acc)]
except StopIteration:
next_data = None
next_data = next(train_it)
dt = time.perf_counter()
device_str = parameters[0].device if isinstance(parameters[0].device, str) else f"{parameters[0].device[0]} * {len(parameters[0].device)}"
@ -1188,8 +1177,8 @@ def train_bert():
if MLLOGGER and RUNMLPERF:
MLLOGGER.start(key=mllog_constants.EVAL_START, value=None, metadata={"epoch_num": i*GBS, "step_num": i})
if getenv("RESET_STEP"): train_step_bert.reset()
elif getenv("FREE_INTERMEDIATE", 0) and train_step_bert.captured is not None:
# TODO: FREE_INTERMEDIATE nan'ed after jit step 2
elif getenv("FREE_INTERMEDIATE") and train_step_bert.captured is not None:
# TODO: this hangs on tiny green after 90 minutes of training
train_step_bert.captured.free_intermediates()
eval_lm_losses = []
eval_clsf_losses = []
@ -1224,7 +1213,7 @@ def train_bert():
return
if getenv("RESET_STEP"): eval_step_bert.reset()
elif getenv("FREE_INTERMEDIATE", 0) and eval_step_bert.captured is not None: eval_step_bert.captured.free_intermediates()
elif getenv("FREE_INTERMEDIATE") and eval_step_bert.captured is not None: eval_step_bert.captured.free_intermediates()
del eval_data
avg_lm_loss = sum(eval_lm_losses) / len(eval_lm_losses)
@ -1300,6 +1289,7 @@ def train_llama3():
BASEDIR = config["BASEDIR"] = Path(getenv("BASEDIR", "/raid/datasets/c4/"))
BS = config["BS"] = getenv("BS", 16)
grad_acc = config["GRADIENT_ACC_STEPS"] = getenv("GRADIENT_ACC_STEPS", 1)
assert grad_acc == 1, f"{grad_acc=} is not supported"
GBS = config["GLOBAL_BATCH_SIZE"] = BS * grad_acc
SEED = config["SEED"] = getenv("SEED", 5760)
SEQLEN = config["SEQLEN"] = getenv("SEQLEN", 8192)
@ -1324,12 +1314,14 @@ def train_llama3():
opt_base_learning_rate = getenv("LR", 8e-5 * GBS / 1152) # NOTE: cannot change for benchmark
opt_end_learning_rate = getenv("END_LR", 8e-7)
# TODO: confirm weights are in bf16
model_params = MODEL_PARAMS[getenv("LLAMA3_SIZE", "8B")]["args"]
# vocab_size from the mixtral tokenizer
params = MODEL_PARAMS[getenv("LLAMA3_SIZE", "8B")]["args"]
params = params | {"vocab_size": 32000} if not SMALL else params
if (llama_layers:=getenv("LLAMA_LAYERS")) != 0: params['n_layers'] = llama_layers
model = Transformer(**params, max_context=SEQLEN, jit=False, disable_kv_cache=True)
if not SMALL: model_params |= {"vocab_size": 32000}
if (llama_layers:=getenv("LLAMA_LAYERS")) != 0: model_params['n_layers'] = llama_layers
model = Transformer(**model_params, max_context=SEQLEN, jit=False, disable_kv_cache=True)
params = get_parameters(model)
# weights are all bfloat16 for now
assert params and all(p.dtype == dtypes.bfloat16 for p in params)
if getenv("FAKEDATA"):
for v in get_parameters(model):
@ -1374,20 +1366,17 @@ def train_llama3():
@TinyJit
@Tensor.train()
def train_step(model, tokens:Tensor, grad_acc:int):
def train_step(model, tokens:Tensor):
optim.zero_grad()
# grad acc
for batch in tokens.split(tokens.shape[0]//grad_acc):
if (DP := getenv("DP", 1)) > 1:
device = tuple(f"{Device.DEFAULT}:{i}" for i in range(DP))
batch = batch.shard(device, 0)
if (MP := getenv("MP", 1)) > 1:
device = tuple(f"{Device.DEFAULT}:{i}" for i in range(MP))
batch = batch.shard(device)
logits:Tensor = model(batch[:, :-1], start_pos=0, temperature=math.nan)
loss = logits.sparse_categorical_crossentropy(batch[:, 1:])
loss.backward()
Tensor.realize(*[p.grad for p in optim.params])
if (DP := getenv("DP", 1)) > 1:
device = tuple(f"{Device.DEFAULT}:{i}" for i in range(DP))
tokens = tokens.shard(device, 0)
if (MP := getenv("MP", 1)) > 1:
device = tuple(f"{Device.DEFAULT}:{i}" for i in range(MP))
tokens = tokens.shard(device)
logits:Tensor = model(tokens[:, :-1], start_pos=0, temperature=math.nan)
loss = logits.sparse_categorical_crossentropy(tokens[:, 1:])
loss.backward()
# L2 norm grad clip
# https://github.com/NVIDIA/NeMo/blob/3368c3fc0b4a186ab33a1d68a504315100c0b2a6/nemo/collections/nlp/modules/common/megatron/clip_grads.py#L57
# https://docs.pytorch.org/docs/stable/generated/torch.nn.utils.clip_grad_norm_.html
@ -1422,18 +1411,18 @@ def train_llama3():
# ** data iters **
def fake_data(bs, samples):
for _ in range(samples // bs):
yield Tensor.randint(bs, SEQLEN + 1, low=0, high=params["vocab_size"], dtype=dtypes.int32, device=Device.DEFAULT)
yield Tensor.randint(bs, SEQLEN + 1, low=0, high=model_params["vocab_size"], dtype=dtypes.int32, device=Device.DEFAULT)
def get_train_iter():
if getenv("FAKEDATA", 0):
return fake_data(GBS, SAMPLES)
return fake_data(BS, SAMPLES)
else:
if SMALL:
from examples.mlperf.dataloader import batch_load_llama3_small
return batch_load_llama3_small(GBS, SAMPLES, SEQLEN, BASEDIR, seed=SEED, val=bool(TRAIN_ON_VAL))
return batch_load_llama3_small(BS, SAMPLES, SEQLEN, BASEDIR, seed=SEED, val=bool(TRAIN_ON_VAL))
else:
from examples.mlperf.dataloader import batch_load_llama3
return batch_load_llama3(GBS, SAMPLES, SEQLEN, BASEDIR, seed=SEED, val=bool(TRAIN_ON_VAL))
return batch_load_llama3(BS, SAMPLES, SEQLEN, BASEDIR, seed=SEED, val=bool(TRAIN_ON_VAL))
def get_eval_iter():
if getenv("FAKEDATA", 0):
@ -1451,7 +1440,7 @@ def train_llama3():
for tokens in tqdm(iter, total=SAMPLES//GBS):
t = time.perf_counter()
GlobalCounters.reset()
loss, lr = train_step(model, tokens, grad_acc)
loss, lr = train_step(model, tokens)
loss = loss.float().item()
i += 1

View file

@ -0,0 +1,31 @@
#!/bin/bash
# MLPerf BERT run script for the "tinybox_8xMI350X" submission platform.
# Abort on the first failing command; a failure anywhere in a pipeline counts.
set -eo pipefail

# backend, model and platform selection
export PYTHONPATH="."
export AMD=1
export MODEL="bert"
export SUBMISSION_PLATFORM="tinybox_8xMI350X"

# precision, device count and batch sizes
export DEFAULT_FLOAT="HALF"
export GPUS=8
export BS=1024
export EVAL_BS=1024

# optimizer hyperparameters
# similar to https://github.com/mlcommons/training_results_v3.1/blob/d06288b2bd675a9d88e0e6181f5bb5626b71ec19/Quanta_Cloud_Technology/results/D54U-3U/bert/result_1.txt#L54
export OPT_BASE_LEARNING_RATE=0.0011
export OPT_LAMB_BETA_1=0.60466
export OPT_LAMB_BETA_2=0.85437
export DECAY=0.1
export TRAIN_STEPS=3900

export IGNORE_OOB=1
export REWRITE_STACK_LIMIT=5000000

# BEAM* tuning knobs
export BEAM=3
export BEAM_UOPS_MAX=6000
export BEAM_UPCAST_MAX=256
export BEAM_LOCAL_MAX=1024
export BEAM_MIN_PROGRESS=5
export IGNORE_JIT_FIRST_BEAM=1
export FREE_INTERMEDIATE=0

export BASEDIR="/raid/datasets/wiki"

# pip install -e ".[mlperf]"
export LOGMLPERF=1

# one random seed per invocation; it is also embedded in the log file name
export SEED=$RANDOM
DATETIME=$(date "+%m%d%H%M")
LOGFILE="bert_8xMI350x_${DATETIME}_${SEED}.log"

# first pass (BENCHMARK=10 INITMLPERF=1 BERT_LAYERS=2); tee creates/overwrites the log
BENCHMARK=10 INITMLPERF=1 BERT_LAYERS=2 python3 examples/mlperf/model_train.py | tee "$LOGFILE"

# run (appends to the same log)
PARALLEL=0 RUNMLPERF=1 python3 examples/mlperf/model_train.py | tee -a "$LOGFILE"

View file

@ -2,7 +2,7 @@
export PYTHONPATH="." NV=1
export MODEL="bert"
export DEFAULT_FLOAT="HALF" SUM_DTYPE="HALF" GPUS=6 BS=96 EVAL_BS=96
export DEFAULT_FLOAT="HALF" SUM_DTYPE="HALF" GPUS=6 BS=72 EVAL_BS=72
export IGNORE_OOB=1
export REWRITE_STACK_LIMIT=500000

View file

@ -2,7 +2,7 @@
export PYTHONPATH="." NV=1
export MODEL="bert"
export DEFAULT_FLOAT="HALF" SUM_DTYPE="HALF" GPUS=6 BS=96 EVAL_BS=96
export DEFAULT_FLOAT="HALF" SUM_DTYPE="HALF" GPUS=6 BS=72 EVAL_BS=72
export IGNORE_OOB=1
export REWRITE_STACK_LIMIT=500000

View file

@ -5,7 +5,7 @@ set -o pipefail # Make pipeline fail if any command fails
export PYTHONPATH="." NV=1
export MODEL="bert"
export SUBMISSION_PLATFORM="tinybox_green"
export DEFAULT_FLOAT="HALF" SUM_DTYPE="HALF" GPUS=6 BS=96 EVAL_BS=96
export DEFAULT_FLOAT="HALF" SUM_DTYPE="HALF" GPUS=6 BS=72 EVAL_BS=72
export IGNORE_OOB=1
export REWRITE_STACK_LIMIT=500000

View file

@ -1,118 +0,0 @@
import json, pprint
from tinygrad import fetch, nn, Tensor
from tinygrad.helpers import DEBUG
class FeedForward:
  """Gated feed-forward block: a fused up-projection is split in two, one half goes through SiLU and gates the other."""
  def __init__(self, model_dim, intermediate_dim):
    # proj_1 emits both halves in a single matmul, hence the doubled output width
    self.proj_1 = nn.Linear(model_dim, intermediate_dim * 2, bias=False)
    self.proj_2 = nn.Linear(intermediate_dim, model_dim, bias=False)

  def __call__(self, x):
    gate, value = self.proj_1(x).chunk(2, dim=-1)
    return self.proj_2(gate.silu() * value)
# NOTE: this RoPE doesn't match LLaMA's?
def _rotate_half(x: Tensor) -> Tensor:
  """Split the last axis into two halves (a, b) and return them concatenated as (-b, a)."""
  halves = x.chunk(2, dim=-1)
  front, back = halves
  return Tensor.cat(-back, front, dim=-1)
def _apply_rotary_pos_emb(x: Tensor, pos_sin: Tensor, pos_cos: Tensor) -> Tensor:
  """Return x*pos_cos + rotate_half(x)*pos_sin. Note the argument order: sin first, then cos."""
  cos_part = x * pos_cos
  sin_part = _rotate_half(x) * pos_sin
  return cos_part + sin_part
class Attention:
  """Causal grouped-query self-attention with RMSNorm on q/k and rotary position embedding."""
  def __init__(self, model_dim, num_query_heads, num_kv_heads, head_dim):
    total_heads = num_query_heads + num_kv_heads*2
    # one fused projection produces the query, key and value heads
    self.qkv_proj = nn.Linear(model_dim, total_heads * head_dim, bias=False)
    self.num_query_heads, self.num_kv_heads, self.head_dim = num_query_heads, num_kv_heads, head_dim
    self.q_norm = nn.RMSNorm(head_dim)
    self.k_norm = nn.RMSNorm(head_dim)
    self.out_proj = nn.Linear(num_query_heads * head_dim, model_dim, bias=False)

  def __call__(self, x:Tensor) -> Tensor:
    bsz, seqlen, _ = x.shape
    n_q, n_kv, hd = self.num_query_heads, self.num_kv_heads, self.head_dim

    # project, expose the head axis and move it in front of the sequence axis
    fused = self.qkv_proj(x).reshape(bsz, seqlen, n_q+n_kv*2, hd).transpose(1, 2)
    q, k, v = fused.split([n_q, n_kv, n_kv], dim=1)
    q = self.q_norm(q)
    k = self.k_norm(k)

    # add positional embedding (how many kernels is this?)
    freq_constant = 10000
    inv_freq = 1.0 / (freq_constant ** (Tensor.arange(0, hd, 2) / hd))
    angles = Tensor.einsum("i,j->ij", Tensor.arange(seqlen), inv_freq)
    angles = Tensor.cat(angles, angles, dim=-1)
    cos_emb = angles.cos()[None, None, :, :]
    sin_emb = angles.sin()[None, None, :, :]
    q = _apply_rotary_pos_emb(q, sin_emb, cos_emb)
    k = _apply_rotary_pos_emb(k, sin_emb, cos_emb)

    # grouped-query attention: repeat each kv head so it lines up with its query heads
    group_size = n_q // n_kv
    k = k.repeat_interleave(group_size, dim=1)
    v = v.repeat_interleave(group_size, dim=1)

    # causal is fine, no explicit mask tensor needed
    ctx = q.scaled_dot_product_attention(k, v, is_causal=True).transpose(1, 2)
    return self.out_proj(ctx.reshape(bsz, seqlen, n_q * hd))
class Layer:
  """One pre-norm transformer layer: attention and feed-forward, each wrapped in a residual connection."""
  def __init__(self, model_dim, intermediate_dim, num_query_heads, num_kv_heads, head_dim):
    self.ffn = FeedForward(model_dim, intermediate_dim)
    self.attn = Attention(model_dim, num_query_heads, num_kv_heads, head_dim)
    self.ffn_norm = nn.RMSNorm(model_dim)
    self.attn_norm = nn.RMSNorm(model_dim)

  def __call__(self, x:Tensor) -> Tensor: # (batch, seq_len, embed_dim)
    after_attn = x + self.attn(self.attn_norm(x))
    return after_attn + self.ffn(self.ffn_norm(after_attn))
# stupidly complex
def make_divisible(v, divisor):
  """Round v (half-up) to a multiple of divisor, at least divisor, bumping one multiple up if the result is under 90% of v."""
  rounded = int(v + divisor / 2) // divisor * divisor
  result = rounded if rounded > divisor else divisor
  # rounding down lost more than 10% of v: go one multiple higher
  return result + divisor if result < 0.9 * v else result
class Transformer:
  """Decoder stack built from a config dict; the output projection reuses the token embedding matrix."""
  def __init__(self, cfg):
    if DEBUG >= 3: pprint.pp(cfg)
    self.layers = []
    for idx in range(cfg['num_transformer_layers']):
      # per-layer FFN width: model_dim scaled by this layer's multiplier, then snapped by make_divisible
      ffn_dim = make_divisible(int(cfg["model_dim"] * cfg['ffn_multipliers'][idx]), cfg['ffn_dim_divisor'])
      self.layers.append(Layer(cfg['model_dim'], ffn_dim, cfg['num_query_heads'][idx], cfg['num_kv_heads'][idx], cfg['head_dim']))
    self.norm = nn.RMSNorm(cfg['model_dim'])
    self.token_embeddings = nn.Embedding(cfg['vocab_size'], cfg['model_dim'])

  def __call__(self, tokens:Tensor):
    # _bsz, seqlen = tokens.shape
    hidden = self.token_embeddings(tokens)
    for block in self.layers:
      hidden = block(hidden)
    # logits come from the transposed embedding weights (no separate output matrix)
    return self.norm(hidden) @ self.token_embeddings.weight.T
if __name__ == "__main__":
  # Download OpenELM config + weights, load them into the model, then greedily decode 100 tokens.
  #model_name = "OpenELM-270M-Instruct"
  model_name = "OpenELM-270M" # this is fp32
  config = json.loads(fetch(f"https://huggingface.co/apple/{model_name}/resolve/main/config.json?download=true").read_bytes())
  model = Transformer(config)
  weights = nn.state.safe_load(fetch(f"https://huggingface.co/apple/{model_name}/resolve/main/model.safetensors?download=true"))
  if DEBUG >= 3:
    for name, tensor in weights.items():
      print(name, tensor.shape)
  # checkpoint keys carry a "transformer." prefix that the local model does not use
  stripped = {name.removeprefix("transformer."): tensor for name, tensor in weights.items()}
  nn.state.load_state_dict(model, stripped)

  from sentencepiece import SentencePieceProcessor
  tokenizer = SentencePieceProcessor(fetch("https://github.com/karpathy/llama2.c/raw/master/tokenizer.model").as_posix())
  toks = [tokenizer.bos_id()] + tokenizer.encode("Some car brands include")
  for _ in range(100):
    # no kv cache: the whole sequence is fed again on every step
    logits = model(Tensor([toks])).realize()
    greedy = logits[0].argmax(axis=-1).tolist()
    toks.append(greedy[-1])

    # hmmm...passthrough still doesn't match (it shouldn't, it outputs the most likely)
    print(tokenizer.decode(toks))
    #print(toks)
    #print(tokenizer.decode(greedy))
    #print(greedy)

View file

@ -1,55 +0,0 @@
from tinygrad.helpers import trange
from tinygrad.nn.datasets import mnist
import mlx.core as mx
import mlx.nn as nn
import mlx.optimizers as optim
from functools import partial
class Model(nn.Module):
  """Small conv net: two (conv, conv, batchnorm, maxpool) stages followed by a linear layer with 10 outputs."""
  def __init__(self):
    super().__init__()
    # stage 1
    self.c1 = nn.Conv2d(1, 32, 5)
    self.c2 = nn.Conv2d(32, 32, 5)
    self.bn1 = nn.BatchNorm(32)
    self.m1 = nn.MaxPool2d(2)
    # stage 2
    self.c3 = nn.Conv2d(32, 64, 3)
    self.c4 = nn.Conv2d(64, 64, 3)
    self.bn2 = nn.BatchNorm(64)
    self.m2 = nn.MaxPool2d(2)
    self.lin = nn.Linear(576, 10)

  def __call__(self, x):
    stages = ((self.c1, self.c2, self.bn1, self.m1), (self.c3, self.c4, self.bn2, self.m2))
    for conv_a, conv_b, norm, pool in stages:
      # mx.maximum(., 0) is the ReLU after each convolution
      x = mx.maximum(conv_a(x), 0)
      x = mx.maximum(conv_b(x), 0)
      x = pool(norm(x))
    return self.lin(mx.flatten(x, 1))
if __name__ == "__main__":
  # MNIST is loaded through tinygrad and handed to MLX via numpy.
  X_train, Y_train, X_test, Y_test = mnist()
  # images are permuted with (0,2,3,1) before conversion
  X_train, X_test = (mx.array(imgs.float().permute((0,2,3,1)).numpy()) for imgs in (X_train, X_test))
  Y_train, Y_test = (mx.array(labels.numpy()) for labels in (Y_train, Y_test))

  model = Model()
  optimizer = optim.Adam(1e-3)

  def loss_fn(model, x, y):
    return nn.losses.cross_entropy(model(x), y).mean()

  # model and optimizer state are declared as both inputs and outputs of the compiled step
  state = [model.state, optimizer.state]

  @partial(mx.compile, inputs=state, outputs=state)
  def step(samples):
    # Compiled functions will also treat any inputs not in the parameter list as constants.
    batch_x, batch_y = X_train[samples], Y_train[samples]
    loss, grads = nn.value_and_grad(model, loss_fn)(model, batch_x, batch_y)
    optimizer.update(model, grads)
    return loss

  test_acc = float('nan')
  progress = trange(70)
  for i in progress:
    samples = mx.random.randint(0, X_train.shape[0], (512,)) # putting this in JIT didn't work well
    loss = step(samples)
    # refresh test accuracy on every 10th step
    if i%10 == 9:
      correct = (model(X_test).argmax(axis=-1) == Y_test).sum()
      test_acc = (correct * 100 / X_test.shape[0]).item()
    progress.set_description(f"loss: {loss.item():6.2f} test_accuracy: {test_acc:5.2f}%")

View file

@ -1,45 +0,0 @@
import gymnasium as gym
import numpy as np
from gymnasium.envs.registration import register
# a very simple game
# one of <size> lights will light up
# take the action of the lit up light
# in <hard_mode>, you act differently based on the step number and need to track this
class PressTheLightUpButton(gym.Env):
  """
  A very simple game: one of `size` lights is lit on each step and the agent must press the matching button.

  The observation is a one-hot float32 vector of length `size`. The reward is 1 for a correct press and 0
  otherwise; a wrong press ends the episode. In `hard_mode` the pressed button is shifted by the step number
  modulo `size`, so the agent has to keep track of the step.
  """
  metadata = {"render_modes": []}

  def __init__(self, render_mode=None, size=2, game_length=10, hard_mode=False):
    self.size, self.game_length = size, game_length
    self.observation_space = gym.spaces.Box(0, 1, shape=(self.size,), dtype=np.float32)
    self.action_space = gym.spaces.Discrete(self.size)
    self.step_num = 0
    # the env starts out "done": reset() has to be called before the first step()
    self.done = True
    self.hard_mode = hard_mode

  def _get_obs(self):
    """Return the one-hot observation for the current step (all zeros once the sequence is exhausted)."""
    obs = [0]*self.size
    if self.step_num < len(self.state):
      obs[self.state[self.step_num]] = 1
    return np.array(obs, dtype=np.float32)

  def reset(self, seed=None, options=None):
    """Start a new episode and return (observation, info)."""
    super().reset(seed=seed)
    # draw from the env's own generator so that reset(seed=...) actually makes the episode reproducible;
    # the global np.random is not affected by the seed passed to super().reset()
    self.state = self.np_random.integers(0, self.size, size=self.game_length)
    self.step_num = 0
    self.done = False
    return self._get_obs(), {}

  def step(self, action):
    """Apply `action`; returns (observation, reward, terminated, truncated, info)."""
    target = ((action + self.step_num) % self.size) if self.hard_mode else action
    reward = int(target == self.state[self.step_num])
    self.step_num += 1
    # a single wrong press terminates the episode
    if not reward:
      self.done = True
    # truncation is reported once game_length steps have been taken
    return self._get_obs(), reward, self.done, self.step_num >= self.game_length, {}
# Register the environment with gymnasium under the id "PressTheLightUpButton-v0".
# max_episode_steps=None: the registration itself adds no step limit.
register(
  id="PressTheLightUpButton-v0",
  entry_point="examples.rl.lightupbutton:PressTheLightUpButton",
  max_episode_steps=None,
)

View file

@ -115,7 +115,7 @@ if __name__ == "__main__":
with WallTimeEvent(BenchEvent.LOAD_WEIGHTS):
if not args.fakeweights:
default_weights_url = 'https://huggingface.co/stabilityai/stable-diffusion-2-1/resolve/main/v2-1_768-ema-pruned.safetensors'
default_weights_url = 'https://huggingface.co/sd2-community/stable-diffusion-2-1/resolve/main/v2-1_768-ema-pruned.safetensors'
weights_fn = args.weights_fn
if not weights_fn:
weights_url = args.weights_url if args.weights_url else default_weights_url

View file

@ -1,136 +0,0 @@
#!/usr/bin/env python
#inspired by https://github.com/Matuzas77/MNIST-0.17/blob/master/MNIST_final_solution.ipynb
import sys
import numpy as np
from tinygrad.nn.state import get_parameters
from tinygrad.tensor import Tensor
from tinygrad.nn import BatchNorm2d, optim
from tinygrad.helpers import getenv
from extra.datasets import fetch_mnist
from extra.augment import augment_img
from extra.training import train, evaluate
GPU = getenv("GPU")
QUICK = getenv("QUICK")
DEBUG = getenv("DEBUG")
class SqueezeExciteBlock2D:
  """Squeeze-and-excite: globally pooled channels pass through a filters -> filters//32 -> filters bottleneck and rescale the input."""
  def __init__(self, filters):
    self.filters = filters
    bottleneck = self.filters//32
    self.weight1 = Tensor.scaled_uniform(self.filters, bottleneck)
    self.bias1 = Tensor.scaled_uniform(1, bottleneck)
    self.weight2 = Tensor.scaled_uniform(bottleneck, self.filters)
    self.bias2 = Tensor.scaled_uniform(1, self.filters)

  def __call__(self, input):
    # GlobalAveragePool2D: the pooling kernel covers the whole spatial extent
    pooled = input.avg_pool2d(kernel_size=(input.shape[2], input.shape[3])).reshape(shape=(-1, self.filters))
    hidden = (pooled.dot(self.weight1) + self.bias1).relu()
    # reshaped to (-1, filters, 1, 1) for broadcasting against the input
    gate = (hidden.dot(self.weight2) + self.bias2).sigmoid().reshape(shape=(-1,self.filters,1,1))
    return input.mul(gate)
class ConvBlock:
  """
  Three padded conv+bias+ReLU layers followed by batch norm and a squeeze-excite block.

  h, w:    spatial size the input is reshaped to
  inp:     number of input channels
  filters: number of output channels of every convolution
  conv:    convolution kernel size
  """
  def __init__(self, h, w, inp, filters=128, conv=3):
    self.h, self.w = h, w
    self.inp = inp
    #init weights
    self.cweights = [Tensor.scaled_uniform(filters, inp if i==0 else filters, conv, conv) for i in range(3)]
    self.cbiases = [Tensor.scaled_uniform(1, filters, 1, 1) for i in range(3)]
    #init layers
    # the norm must match the conv output channels; this was hard-coded to 128, which only worked for the default
    self._bn = BatchNorm2d(filters)
    self._seb = SqueezeExciteBlock2D(filters)

  def __call__(self, input):
    x = input.reshape(shape=(-1, self.inp, self.w, self.h))
    for cweight, cbias in zip(self.cweights, self.cbiases):
      # NOTE(review): the padding of 1 keeps the spatial size only for conv=3 -- confirm before using other kernel sizes
      x = x.pad(padding=[1,1,1,1]).conv2d(cweight).add(cbias).relu()
    x = self._bn(x)
    x = self._seb(x)
    return x
class BigConvNet:
  """Three ConvBlocks followed by global average- and max-pooled features, each with its own 128x10 linear head."""
  def __init__(self):
    self.conv = [ConvBlock(28,28,1), ConvBlock(28,28,128), ConvBlock(14,14,128)]
    self.weight1 = Tensor.scaled_uniform(128,10)
    self.weight2 = Tensor.scaled_uniform(128,10)

  def parameters(self):
    """Return the model parameters; with DEBUG set, only those with requires_grad, after printing their shapes and total count."""
    if DEBUG: #keeping this for a moment
      pars = [par for par in get_parameters(self) if par.requires_grad]
      no_pars = 0
      for par in pars:
        print(par.shape)
        no_pars += np.prod(par.shape)
      print('no of parameters', no_pars)
      return pars
    else:
      return get_parameters(self)

  def save(self, filename):
    """Write every parameter, in get_parameters order, as consecutive np.save records into `filename`.npy."""
    with open(filename+'.npy', 'wb') as f:
      for par in get_parameters(self):
        #if par.requires_grad:
        np.save(f, par.numpy())

  def load(self, filename):
    """Best-effort load of parameters written by save(); a parameter that fails to load is reported and skipped."""
    with open(filename+'.npy', 'rb') as f:
      for par in get_parameters(self):
        #if par.requires_grad:
        try:
          # NOTE(review): this writes into the array returned by par.numpy() -- confirm that it aliases the tensor's storage
          par.numpy()[:] = np.load(f)
          if GPU:
            # NOTE(review): the return value of par.gpu() is discarded -- confirm it moves the tensor in place
            par.gpu()
        except Exception:
          # was a bare `except:`, which also swallowed KeyboardInterrupt and SystemExit
          print('Could not load parameter')

  def forward(self, x):
    x = self.conv[0](x)
    x = self.conv[1](x)
    x = x.avg_pool2d(kernel_size=(2,2))
    x = self.conv[2](x)
    x1 = x.avg_pool2d(kernel_size=(14,14)).reshape(shape=(-1,128)) #global
    x2 = x.max_pool2d(kernel_size=(14,14)).reshape(shape=(-1,128)) #global
    # the two pooled views are projected separately and summed
    xo = x1.dot(self.weight1) + x2.dot(self.weight2)
    return xo
if __name__ == "__main__":
  # Train BigConvNet on MNIST over a schedule of (learning rate, epochs) phases.
  # QUICK shortens the schedule, runs a single step per epoch and evaluates on one batch only.
  lrs = [1e-4, 1e-5] if QUICK else [1e-3, 1e-4, 1e-5, 1e-5]
  epochss = [2, 1] if QUICK else [13, 3, 3, 1]
  BS = 32

  lmbd = 0.00025
  # cross entropy plus an L1 penalty on the two output heads; `model` is looked up when the loss is called
  lossfn = lambda out,y: out.sparse_categorical_crossentropy(y) + lmbd*(model.weight1.abs() + model.weight2.abs()).sum()
  X_train, Y_train, X_test, Y_test = fetch_mnist()
  X_train = X_train.reshape(-1, 28, 28).astype(np.uint8)
  X_test = X_test.reshape(-1, 28, 28).astype(np.uint8)
  steps = len(X_train)//BS
  np.random.seed(1337)
  if QUICK:
    steps = 1
    X_test, Y_test = X_test[:BS], Y_test[:BS]

  model = BigConvNet()

  # optional first CLI argument: checkpoint path prefix to load and evaluate before training
  if len(sys.argv) > 1:
    try:
      model.load(sys.argv[1])
      print('Loaded weights "'+sys.argv[1]+'", evaluating...')
      evaluate(model, X_test, Y_test, BS=BS)
    except Exception:
      # was a bare `except:`; Ctrl-C and SystemExit now propagate instead of being reported as a load failure
      print('could not load weights "'+sys.argv[1]+'".')

  if GPU:
    params = get_parameters(model)
    [x.gpu_() for x in params]

  for lr, epochs in zip(lrs, epochss):
    optimizer = optim.Adam(model.parameters(), lr=lr)
    for epoch in range(1,epochs+1):
      #first epoch without augmentation
      X_aug = X_train if epoch == 1 else augment_img(X_train)
      train(model, X_aug, Y_train, optimizer, steps=steps, lossfn=lossfn, BS=BS)
      accuracy = evaluate(model, X_test, Y_test, BS=BS)
      # the checkpoint file name encodes the accuracy scaled by 1e6
      model.save(f'examples/checkpoint{accuracy * 1e6:.0f}')

View file

@ -1,17 +0,0 @@
from tinygrad.tensor import Tensor
from tinygrad.nn import Conv2d, BatchNorm2d
from tinygrad.nn.state import get_parameters
if __name__ == "__main__":
  # Run a single Conv2d -> BatchNorm2d forward pass on random input under Tensor.train().
  with Tensor.train():
    batch, in_ch, height, width = 4, 16, 224, 224
    out_ch, kernel, stride, pad = 64, 7, 2, 1

    x = Tensor.uniform(batch, in_ch, height, width)
    conv = Conv2d(in_ch, out_ch, kernel_size=kernel, stride=stride, padding=pad)
    bn = BatchNorm2d(out_ch, track_running_stats=False)
    # realize the input and all layer parameters before the network itself runs
    for param in get_parameters([x, conv, bn]):
      param.realize()

    print("running network")
    x.sequential([conv, bn]).numpy()

View file

@ -1,669 +0,0 @@
# original implementation: https://github.com/svc-develop-team/so-vits-svc
from __future__ import annotations
import sys, logging, time, io, math, argparse, operator, numpy as np
from functools import partial, reduce
from pathlib import Path
from typing import Tuple, Optional, Type
from tinygrad import nn, dtypes, Tensor
from tinygrad.helpers import getenv, fetch
from tinygrad.nn.state import torch_load
from examples.vits import ResidualCouplingBlock, PosteriorEncoder, Encoder, ResBlock1, ResBlock2, LRELU_SLOPE, sequence_mask, split, get_hparams_from_file, load_checkpoint, weight_norm, HParams
from examples.sovits_helpers import preprocess
import soundfile
DEBUG = getenv("DEBUG")
F0_BIN = 256
F0_MAX = 1100.0
F0_MIN = 50.0
F0_MEL_MIN = 1127 * np.log(1 + F0_MIN / 700)
F0_MEL_MAX = 1127 * np.log(1 + F0_MAX / 700)
class SpeechEncoder:
  """Base class for speech encoders that wrap a ContentVec model; subclasses implement encode()."""
  def __init__(self, hidden_dim, model:ContentVec): self.hidden_dim, self.model = hidden_dim, model
  def encode(self, ): raise NotImplementedError("implement me")
  @classmethod
  def load_from_pretrained(cls, checkpoint_path:str, checkpoint_url:str) -> SpeechEncoder:
    """
    Load a ContentVec checkpoint and wrap it in `cls`.

    `cls` is called with the model as its only argument, so this works for subclasses whose __init__ takes
    just `model`; calling it on SpeechEncoder itself raises TypeError because `hidden_dim` is also required.
    The return annotation used to say ContentVec, but the value returned is the encoder wrapper.
    """
    contentvec = ContentVec.load_from_pretrained(checkpoint_path, checkpoint_url)
    return cls(contentvec)
class ContentVec256L9(SpeechEncoder):
  """Encoder with hidden_dim=256: features are taken at output_layer=9 and passed through the model's final_proj."""
  def __init__(self, model:ContentVec): super().__init__(hidden_dim=256, model=model)

  def encode(self, wav: Tensor):
    # double channels: average over the last axis
    mono = wav.mean(-1) if len(wav.shape) == 2 else wav
    assert len(mono.shape) == 1, mono.dim()
    batch = mono.reshape(1, -1)
    # all-False padding mask
    mask = Tensor.zeros_like(batch).cast(dtypes.bool)
    extracted = self.model.extract_features(batch.to(wav.device), padding_mask=mask.to(wav.device), output_layer=9)
    projected = self.model.final_proj(extracted[0])
    return projected.transpose(1,2)
class ContentVec768L12(SpeechEncoder):
  """Encoder with hidden_dim=768: features are taken at output_layer=12, without the final projection."""
  def __init__(self, model:ContentVec): super().__init__(hidden_dim=768, model=model)

  def encode(self, wav: Tensor):
    # double channels: average over the last axis
    mono = wav.mean(-1) if len(wav.shape) == 2 else wav
    assert len(mono.shape) == 1, mono.dim()
    batch = mono.reshape(1, -1)
    # all-False padding mask
    mask = Tensor.zeros_like(batch).cast(dtypes.bool)
    extracted = self.model.extract_features(batch.to(wav.device), padding_mask=mask.to(wav.device), output_layer=12)
    return extracted[0].transpose(1,2)
# original code for contentvec: https://github.com/auspicious3000/contentvec/
class ContentVec:
  """Conv feature extractor + transformer encoder, built from a fairseq-style config (inference only)."""
  # self.final_proj dims are hardcoded and depend on fairseq.data.dictionary Dictionary in the checkpoint. This param can't yet be loaded since there is no pickle for it. See with DEBUG=2.
  # This means that the ContentVec only works with the hubert weights used in all SVC models
  def __init__(self, cfg: HParams):
    self.feature_grad_mult, self.untie_final_proj = cfg.feature_grad_mult, cfg.untie_final_proj
    # NOTE(review): eval() runs a string taken from the checkpoint config -- only load checkpoints you trust
    feature_enc_layers = eval(cfg.conv_feature_layers)
    # channel count of the last conv layer spec
    self.embed = feature_enc_layers[-1][0]
    final_dim = cfg.final_dim if cfg.final_dim > 0 else cfg.encoder_embed_dim
    self.feature_extractor = ConvFeatureExtractionModel(conv_layers=feature_enc_layers, dropout=0.0, mode=cfg.extractor_mode, conv_bias=cfg.conv_bias)
    # a projection is only needed when the conv output width differs from the encoder width
    if self.embed != cfg.encoder_embed_dim:
      self.post_extract_proj = nn.Linear(self.embed, cfg.encoder_embed_dim)
    else:
      self.post_extract_proj = None
    self.encoder = TransformerEncoder(cfg)
    self.layer_norm = nn.LayerNorm(self.embed)
    proj_out = final_dim * 1 if self.untie_final_proj else final_dim
    self.final_proj = nn.Linear(cfg.encoder_embed_dim, proj_out)
    self.mask_emb = Tensor.uniform(cfg.encoder_embed_dim, dtype=dtypes.float32)
    self.label_embs_concat = Tensor.uniform(504, final_dim, dtype=dtypes.float32)

  def forward_features(self, source, padding_mask):
    """Run the conv feature extractor; both branches do the same thing because gradient scaling is training-only."""
    if self.feature_grad_mult > 0:
      features = self.feature_extractor(source, padding_mask)
      # training: GradMultiply.forward(features, self.feature_grad_mult) when feature_grad_mult != 1.0
    else:
      features = self.feature_extractor(source, padding_mask)
    return features

  def forward_padding_mask(self, features, padding_mask): # replaces original forward_padding_mask for batch inference
    """Rebuild the padding mask from per-row counts of unpadded samples, mapped through (n - 400) / 320 floored, plus 1."""
    as_bool = padding_mask.cast(dtypes.bool) # ensure its bool for tilde
    valid_samples = tilde(as_bool).cast(dtypes.int64).sum(1)
    # intermediate float to divide
    valid_frames = (valid_samples - 400).float().div(320).floor().cast(dtypes.int64) + 1
    return lengths_to_padding_mask(valid_frames)

  def extract_features(self, source: Tensor, spk_emb:Tensor=None, padding_mask=None, ret_conv=False, output_layer=None, tap=False):
    """Return (features, padding_mask): encoder output, or the pre-encoder features when ret_conv is set."""
    features = self.forward_features(source, padding_mask)
    if padding_mask is not None:
      padding_mask = self.forward_padding_mask(features, padding_mask)
    features = self.layer_norm(features.transpose(1, 2))
    if self.post_extract_proj is not None:
      features = self.post_extract_proj(features)
    # output_layer is 1-based here; the encoder receives a 0-based layer index
    target = None if output_layer is None else output_layer - 1
    encoded, _ = self.encoder(features, spk_emb, padding_mask=padding_mask, layer=target, tap=tap)
    return (features if ret_conv else encoded), padding_mask

  @classmethod
  def load_from_pretrained(cls, checkpoint_path:str, checkpoint_url:str) -> ContentVec:
    """Fetch the checkpoint to checkpoint_path, build the model from its config and load the weights."""
    fetch(checkpoint_url, checkpoint_path)
    cfg = load_fairseq_cfg(checkpoint_path)
    enc = cls(cfg.model)
    _ = load_checkpoint_enc(checkpoint_path, enc, None)
    logging.debug(f"{cls.__name__}: Loaded model with cfg={cfg}")
    return enc
class TransformerEncoder:
  """Positional-conv front end followed by encoder_layers plain layers and encoder_layers_1 speaker-conditioned layers."""
  def __init__(self, cfg: HParams):
    self.dropout, self.embedding_dim, self.layer_norm_first = cfg.dropout, cfg.encoder_embed_dim, cfg.layer_norm_first
    self.layerdrop, self.num_layers, self.num_layers_1 = cfg.encoder_layerdrop, cfg.encoder_layers, cfg.encoder_layers_1

    # positional convolution with normal-initialized weights and zero bias
    # for training: layer.weights need to be weight_normed
    pos_conv = nn.Conv1d(self.embedding_dim, self.embedding_dim, kernel_size=cfg.conv_pos, padding=cfg.conv_pos // 2, groups=cfg.conv_pos_groups)
    init_std = math.sqrt(4 / (cfg.conv_pos * self.embedding_dim))
    new_weight = Tensor.normal(*pos_conv.weight.shape, std=init_std)
    new_bias = Tensor.zeros(*pos_conv.bias.shape)
    pos_conv.weight, pos_conv.bias = new_weight, new_bias
    self.pos_conv = [pos_conv]
    # an even kernel yields one extra output step, which is trimmed in extract_features
    self.pos_conv_remove = 1 if cfg.conv_pos % 2 == 0 else 0

    # layers with index >= encoder_layers use conditional layer norm
    self.layers = []
    for idx in range(cfg.encoder_layers + cfg.encoder_layers_1):
      self.layers.append(TransformerEncoderLayer(self.embedding_dim, cfg.encoder_ffn_embed_dim, cfg.encoder_attention_heads, self.dropout, cfg.attention_dropout, cfg.activation_dropout, cfg.activation_fn, self.layer_norm_first, cond_layer_norm=(idx >= cfg.encoder_layers)))
    self.layer_norm = nn.LayerNorm(self.embedding_dim)
    self.cond_layer_norm = CondLayerNorm(self.embedding_dim) if cfg.encoder_layers_1 > 0 else None
    # training: apply init_bert_params

  def __call__(self, x, spk_emb, padding_mask=None, layer=None, tap=False):
    """Return (x, layer_results); the final norm is applied only for layer_norm_first models run through all layers."""
    x, layer_results = self.extract_features(x, spk_emb, padding_mask, layer, tap)
    if self.layer_norm_first and layer is None:
      if self.num_layers_1 > 0:
        x = self.cond_layer_norm(x, spk_emb)
      else:
        x = self.layer_norm(x)
    return x, layer_results

  def extract_features(self, x: Tensor, spk_emb: Tensor, padding_mask=None, tgt_layer=None, tap=False):
    """Run the layer stack, stopping after tgt_layer when given; returns (x as B x T x C, collected layer outputs)."""
    if tgt_layer is not None: # and not self.training
      assert tgt_layer >= 0 and tgt_layer < len(self.layers)
    if padding_mask is not None:
      # x[padding_mask] = 0
      assert padding_mask.shape == x.shape[:len(padding_mask.shape)] # first few dims of x must match padding_mask
      keep = tilde(padding_mask.unsqueeze(-1).repeat((1, 1, x.shape[-1])).cast(dtypes.bool))
      x = keep.where(x, 0)

    # add the convolutional position signal
    pos = self.pos_conv[0](x.transpose(1,2))
    if self.pos_conv_remove > 0:
      pos = pos[:, :, : -self.pos_conv_remove]
    pos = pos.gelu().transpose(1, 2)
    x = (x + pos).transpose(0, 1) # B x T x C -> T x B x C
    if not self.layer_norm_first:
      x = self.layer_norm(x)
    x = x.dropout(p=self.dropout)

    layer_results = []
    stopped_at = None
    for idx, enc_layer in enumerate(self.layers):
      if idx < self.num_layers: # if (not self.training or (dropout_probability > self.layerdrop)) and (i < self.num_layers):
        assert enc_layer.cond_layer_norm == False
        x = enc_layer(x, self_attn_padding_mask=padding_mask, need_weights=False)
        if tgt_layer is not None or tap:
          layer_results.append(x.transpose(0, 1))
      else:
        # speaker-conditioned layers additionally receive the speaker embedding
        assert enc_layer.cond_layer_norm == True
        x = enc_layer(x, emb=spk_emb, self_attn_padding_mask=padding_mask, need_weights=False)
      if idx == tgt_layer:
        stopped_at = x
        break
    if stopped_at is not None:
      x = stopped_at
    return x.transpose(0, 1), layer_results # T x B x C -> B x T x C
class TransformerEncoderLayer:
  """One transformer encoder layer: self-attention followed by a two-layer feed-forward block.

  With `layer_norm_first` the normalizations run before each sub-block
  (pre-norm); otherwise they run after each residual sum (post-norm).
  With `cond_layer_norm` both norms are `CondLayerNorm` and receive the
  `emb` argument passed to `__call__`.

  `attention_dropout` is accepted for signature compatibility but is not used.
  """
  def __init__(self, embedding_dim=768.0, ffn_embedding_dim=3072.0, num_attention_heads=8.0, dropout=0.1, attention_dropout=0.1, activation_dropout=0.1, activation_fn="relu", layer_norm_first=False, cond_layer_norm=False):
    def get_activation_fn(activation):
      if activation == "relu": return Tensor.relu
      if activation == "gelu": return Tensor.gelu
      raise RuntimeError(f"activation function={activation} is not forseen")
    self.embedding_dim, self.dropout, self.activation_dropout = embedding_dim, dropout, activation_dropout
    self.layer_norm_first, self.num_attention_heads, self.cond_layer_norm = layer_norm_first, num_attention_heads, cond_layer_norm
    self.activation_fn = get_activation_fn(activation_fn)
    self.self_attn = MultiHeadAttention(self.embedding_dim, self.num_attention_heads)
    self.self_attn_layer_norm = nn.LayerNorm(self.embedding_dim) if not cond_layer_norm else CondLayerNorm(self.embedding_dim)
    self.fc1 = nn.Linear(self.embedding_dim, ffn_embedding_dim)
    self.fc2 = nn.Linear(ffn_embedding_dim, self.embedding_dim)
    self.final_layer_norm = nn.LayerNorm(self.embedding_dim) if not cond_layer_norm else CondLayerNorm(self.embedding_dim)

  def _norm(self, norm, x:Tensor, emb):
    """Apply `norm`, passing `emb` only when the layer is speaker-conditioned."""
    return norm(x, emb) if self.cond_layer_norm else norm(x)

  def _attention(self, x:Tensor, padding_mask):
    """Self-attention sub-block followed by dropout."""
    return self.self_attn(x=x, mask=padding_mask).dropout(self.dropout)

  def _feed_forward(self, x:Tensor) -> Tensor:
    """fc1 -> activation -> dropout -> fc2 -> dropout."""
    x = self.activation_fn(self.fc1(x)).dropout(self.activation_dropout)
    return self.fc2(x).dropout(self.dropout)

  def __call__(self, x:Tensor, self_attn_mask:Tensor=None, self_attn_padding_mask:Tensor=None, emb:Tensor=None, need_weights=False):
    """Run the layer on `x` and return a tensor of the same layout.

    Args:
      x: layer input (callers in this file pass it as T x B x C).
      self_attn_mask: must be None (asserted).
      self_attn_padding_mask: must not be None (asserted); forwarded to the attention module.
      emb: conditioning embedding, used only when `cond_layer_norm` is set.
      need_weights: accepted for signature compatibility; unused.
    """
    assert self_attn_mask is None and self_attn_padding_mask is not None
    if self.layer_norm_first:
      residual = x
      x = self._norm(self.self_attn_layer_norm, x, emb)
      x = residual + self._attention(x, self_attn_padding_mask)
      # The feed-forward residual must start from the attention output.
      # Previously `residual` still pointed at the layer input here, so the
      # attention contribution was dropped from the final sum.
      residual = x
      x = self._norm(self.final_layer_norm, x, emb)
      x = residual + self._feed_forward(x)
    else:
      x = x + self._attention(x, self_attn_padding_mask)
      x = self._norm(self.self_attn_layer_norm, x, emb)
      x = x + self._feed_forward(x)
      x = self._norm(self.final_layer_norm, x, emb)
    return x
class MultiHeadAttention:
  """Multi-head scaled-dot-product attention with separate q/k/v/out projections."""
  def __init__(self, n_state, n_head):
    self.n_state, self.n_head = n_state, n_head
    self.q_proj, self.k_proj, self.v_proj, self.out_proj = [nn.Linear(n_state, n_state) for _ in range(4)]

  def __call__(self, x:Tensor, xa:Optional[Tensor]=None, mask:Optional[Tensor]=None):
    """Attend over `x` (self-attention) or over `xa` when it is given.

    Args:
      x: query input in T x B x C layout; the result uses the same layout.
      xa: optional key/value source. It is used as given, without a transpose
        (NOTE(review): presumably B x T x C -- confirm with callers).
      mask: accepted for interface compatibility; it is NOT forwarded to the
        attention call (None is passed), matching the previous behavior.
    """
    x = x.transpose(0,1) # TxBxC -> BxTxC
    # `xa or x` would take the truth value of a Tensor whenever xa is supplied;
    # an explicit None test selects the key/value source without doing so.
    kv_src = x if xa is None else xa
    q, k, v = self.q_proj(x), self.k_proj(kv_src), self.v_proj(kv_src)
    # Split heads using each tensor's own (batch, time) dims, so a key/value
    # source whose length differs from the query's reshapes correctly.
    q, k, v = [t.reshape(*t.shape[:2], self.n_head, -1).transpose(1, 2) for t in (q, k, v)]
    wv = Tensor.scaled_dot_product_attention(q, k, v, None).transpose(1, 2).reshape(*x.shape[:2], -1)
    return self.out_proj(wv).transpose(0,1) # BxTxC -> TxBxC
class ConvFeatureExtractionModel:
  """Stack of 1-D convolution blocks applied to a raw waveform.

  `conv_layers` is a sequence of `(dim, kernel, stride)` triples. `mode` selects
  the normalization: "default" puts a GroupNorm in the first block only,
  "group_norm_masked" puts a `GroupNormMasked` there, and "layer_norm" puts a
  transposed LayerNorm in every block.
  """
  def __init__(self, conv_layers, dropout=.0, mode="default", conv_bias=False):
    assert mode in {"default", "group_norm_masked", "layer_norm"}
    def block(n_in, n_out, k, stride, is_layer_norm=False, is_group_norm=False, conv_bias=False):
      def make_conv():
        conv = nn.Conv1d(n_in, n_out, k, stride=stride, bias=conv_bias)
        conv.weight = Tensor.kaiming_normal(*conv.weight.shape)
        return conv
      assert (is_layer_norm and is_group_norm) == False, "layer norm and group norm are exclusive"
      # The norm width is this block's own output width (`n_out`); the previous
      # code read the caller's loop variable `dim` through the closure, which
      # held the same value.
      layers = [make_conv(), partial(Tensor.dropout, p=dropout)]
      if is_layer_norm:
        layers.append([partial(Tensor.transpose, dim0=-2, dim1=-1), nn.LayerNorm(n_out, elementwise_affine=True), partial(Tensor.transpose, dim0=-2, dim1=-1)])
      elif is_group_norm and mode == "default":
        layers.append(nn.GroupNorm(n_out, n_out, affine=True))
      elif is_group_norm and mode == "group_norm_masked":
        layers.append(GroupNormMasked(n_out, n_out, affine=True))
      layers.append(Tensor.gelu)
      return layers
    in_d, self.conv_layers, self.mode = 1, [], mode
    for i, cl in enumerate(conv_layers):
      assert len(cl) == 3, "invalid conv definition: " + str(cl)
      (dim, k, stride) = cl
      if i == 0: self.cl = cl # first block's (dim, kernel, stride), needed to shrink the padding mask
      self.conv_layers.append(block(in_d, dim, k, stride, is_layer_norm=(mode == "layer_norm"), is_group_norm=((mode == "default" or mode == "group_norm_masked") and i == 0), conv_bias=conv_bias))
      in_d = dim

  @staticmethod
  def _flatten(layer):
    """Expand one level of nested lists so every entry is directly callable."""
    flat = []
    for item in layer:
      flat.extend(item if isinstance(item, list) else [item])
    return flat

  def __call__(self, x:Tensor, padding_mask:Tensor):
    """Run every conv block on `x` (B x T) and return the resulting feature map.

    `padding_mask` is only consulted in "group_norm_masked" mode, where it is
    converted to the time resolution after the first convolution and handed
    to the masked group norm.
    """
    x = x.unsqueeze(1) # BxT -> BxCxT
    if self.mode == "group_norm_masked":
      if padding_mask is not None:
        _, k, stride = self.cl
        lengths_org = tilde(padding_mask.cast(dtypes.bool)).cast(dtypes.int64).sum(1) # ensure padding_mask is bool for tilde
        lengths = (((lengths_org - k) / stride) + 1).floor().cast(dtypes.int64)
        padding_mask = tilde(lengths_to_padding_mask(lengths)).cast(dtypes.int64) # lengths_to_padding_mask returns bool tensor
      conv, drop, norm, act = self.conv_layers[0]
      x = act(norm(drop(conv(x)), padding_mask)) # padding_mask is numeric (or None)
    else:
      # In "layer_norm" mode the first block also holds a nested
      # [transpose, norm, transpose] list; it has to be flattened like the
      # later blocks, since `sequential` calls every entry.
      x = x.sequential(self._flatten(self.conv_layers[0]))
    for layer in self.conv_layers[1:]:
      x = x.sequential(self._flatten(layer))
    return x
class CondLayerNorm: # https://github.com/auspicious3000/contentvec/blob/main/contentvec/modules/cond_layer_norm.py#L10
  """Layer norm whose scale and shift are linear projections of a speaker embedding."""
  def __init__(self, dim_last, eps=1e-5, dim_spk=256, elementwise_affine=True):
    self.dim_last = dim_last
    self.eps = eps
    self.dim_spk = dim_spk
    self.elementwise_affine = elementwise_affine
    if not self.elementwise_affine:
      return
    # bias-free projections from the speaker embedding to per-feature scale and shift
    self.weight_ln = nn.Linear(self.dim_spk, self.dim_last, bias=False)
    self.bias_ln = nn.Linear(self.dim_spk, self.dim_last, bias=False)
    # projection weights start as all-ones (scale) and all-zeros (shift)
    self.weight_ln.weight = Tensor.ones(*self.weight_ln.weight.shape)
    self.bias_ln.weight = Tensor.zeros(*self.bias_ln.weight.shape)

  def __call__(self, x: Tensor, spk_emb: Tensor):
    """Normalize `x` over every axis except the first, then scale and shift it from `spk_emb`."""
    n_norm_axes = len(x.shape[1:])
    normed = x.layernorm(axis=tuple(range(-1, -n_norm_axes - 1, -1)), eps=self.eps)
    if not self.elementwise_affine:
      return normed
    scale = self.weight_ln(spk_emb)
    shift = self.bias_ln(spk_emb)
    return scale * normed + shift
class GroupNormMasked: # https://github.com/auspicious3000/contentvec/blob/d746688a32940f4bee410ed7c87ec9cf8ff04f74/contentvec/modules/fp32_group_norm.py#L16
  """Group norm whose statistics only count the positions enabled in a mask.

  The implementation asserts one channel per group (`num_groups == num_channels`).
  """
  def __init__(self, num_groups, num_channels, eps=1e-5, affine=True):
    self.num_groups, self.num_channels, self.eps, self.affine = num_groups, num_channels, eps, affine
    # Parenthesize the pair: without it the conditional bound only to the second
    # element, so affine=False produced weight=ones and bias=(None, None).
    self.weight, self.bias = (Tensor.ones(num_channels), Tensor.zeros(num_channels)) if self.affine else (None, None)

  def __call__(self, x:Tensor, mask:Tensor):
    """Normalize `x` (bsz x channels x length) along the length axis.

    Args:
      x: input tensor with exactly three dimensions.
      mask: numeric tensor reshapeable to (bsz, 1, 1, length), non-zero at valid
        positions, or None to treat every position as valid.
    """
    bsz, n_c, length = x.shape
    assert n_c % self.num_groups == 0
    x = x.reshape(bsz, self.num_groups, n_c // self.num_groups, length)
    if mask is None: mask = Tensor.ones_like(x)
    else: mask = mask.reshape(bsz, 1, 1, length)
    x = x * mask
    lengths = mask.sum(axis=3, keepdim=True)
    assert x.shape[2] == 1
    # `axis=` matches the keyword used by the sum/std calls in this method
    mean_ = x.mean(axis=3, keepdim=True)
    # rescale the full-length statistics to the number of valid positions
    mean = mean_ * length / lengths
    var = (((x.std(axis=3, keepdim=True) ** 2) + mean_**2) * length / lengths - mean**2) + self.eps
    out = x.add(-mean).div(var.sqrt()).reshape(bsz, n_c, length)
    if not self.affine:
      return out # no learnable scale/shift to apply
    return out.mul(self.weight.reshape(1,-1,1)).add(self.bias.reshape(1,-1,1))
class Synthesizer:
  """Top-level voice-conversion network: prior encoder, reverse flow and waveform generator."""
  def __init__(self, spec_channels, segment_size, inter_channels, hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout, resblock, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates, upsample_initial_channel, upsample_kernel_sizes, gin_channels, ssl_dim, n_speakers, sampling_rate=44100, vol_embedding=False, n_flow_layer=4, **kwargs):
    # keep the hyper-parameters on the instance
    self.spec_channels = spec_channels
    self.inter_channels = inter_channels
    self.hidden_channels = hidden_channels
    self.filter_channels = filter_channels
    self.n_heads = n_heads
    self.n_layers = n_layers
    self.kernel_size = kernel_size
    self.p_dropout = p_dropout
    self.resblock = resblock
    self.resblock_kernel_sizes = resblock_kernel_sizes
    self.resblock_dilation_sizes = resblock_dilation_sizes
    self.upsample_rates = upsample_rates
    self.upsample_initial_channel = upsample_initial_channel
    self.upsample_kernel_sizes = upsample_kernel_sizes
    self.segment_size = segment_size
    self.n_speakers = n_speakers
    self.gin_channels = gin_channels
    self.vol_embedding = vol_embedding
    # sub-modules, created in the same order as before
    self.emb_g = nn.Embedding(n_speakers, gin_channels)
    if vol_embedding:
      self.emb_vol = nn.Linear(1, hidden_channels)
    self.pre = nn.Conv1d(ssl_dim, hidden_channels, kernel_size=5, padding=2)
    self.enc_p = TextEncoder(inter_channels, hidden_channels, kernel_size, n_layers, filter_channels=filter_channels, n_heads=n_heads, p_dropout=p_dropout)
    self.dec = Generator(sampling_rate, inter_channels, resblock, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates, upsample_initial_channel, upsample_kernel_sizes, gin_channels)
    self.enc_q = PosteriorEncoder(spec_channels, inter_channels, hidden_channels, 5, 1, 16, gin_channels=gin_channels)
    self.flow = ResidualCouplingBlock(inter_channels, hidden_channels, 5, 1, n_flow_layer, gin_channels=gin_channels)
    self.emb_uv = nn.Embedding(vocab_size=2, embed_size=hidden_channels)

  def infer(self, c:Tensor, f0:Tensor, uv:Tensor, g:Tensor=None, noise_scale=0.35, seed=52468, vol=None) -> Tuple[Tensor, Tensor]:
    """Synthesize audio from content features `c`, pitch `f0`, voicing flags `uv` and speaker id `g`.

    The random seed is taken from the SEED environment variable when set,
    else from `seed`. `vol` is only used when the model was built with
    `vol_embedding`. Returns `(audio, f0)`.
    """
    Tensor.manual_seed(getenv('SEED', seed))
    c_lengths = (Tensor.ones([c.shape[0]]) * c.shape[-1]).to(c.device)
    if len(g.shape) == 1:
      g = g.unsqueeze(0)
    g = self.emb_g(g).transpose(1, 2)
    x_mask = sequence_mask(c_lengths, c.shape[2]).unsqueeze(1).cast(c.dtype)
    if vol is not None and self.vol_embedding:
      vol = self.emb_vol(vol[:,:,None]).transpose(1,2)
    else:
      vol = 0
    uv_emb = self.emb_uv(uv.cast(dtypes.int64)).transpose(1, 2)
    x = self.pre(c) * x_mask + uv_emb + vol
    z_p, _, _, c_mask = self.enc_p.forward(x, x_mask, f0=self._f0_to_coarse(f0), noise_scale=noise_scale)
    z = self.flow.forward(z_p, c_mask, g=g, reverse=True)
    audio = self.dec.forward(z * c_mask, g=g, f0=f0)
    return audio, f0

  def _f0_to_coarse(self, f0 : Tensor):
    """Quantize `f0` onto integer mel-scale bins using F0_BIN, F0_MEL_MIN and F0_MEL_MAX."""
    mel = 1127 * (1 + f0 / 700).log()
    a = (F0_BIN - 2) / (F0_MEL_MAX - F0_MEL_MIN)
    b = F0_MEL_MIN * a - 1.
    mel = (mel > 0).where(mel * a - b, mel)
    coarse = mel.ceil().cast(dtype=dtypes.int64)
    coarse = coarse * (coarse > 0)
    coarse = coarse + ((coarse < 1) * 1)
    coarse = coarse * (coarse < F0_BIN)
    coarse = coarse + ((coarse >= F0_BIN) * (F0_BIN - 1))
    return coarse

  @classmethod
  def load_from_pretrained(cls, config_path:str, config_url:str, weights_path:str, weights_url:str) -> Synthesizer:
    """Fetch config and weights, build the model and load the checkpoint.

    Returns the pair `(net_g, hps)`: the loaded model and its parsed hyper-parameters.
    """
    fetch(config_url, config_path)
    hps = get_hparams_from_file(config_path)
    fetch(weights_url, weights_path)
    n_spec = hps.data.filter_length // 2 + 1
    n_segment = hps.train.segment_size // hps.data.hop_length
    net_g = cls(n_spec, n_segment, **hps.model)
    _ = load_checkpoint(weights_path, net_g, None, skip_list=["f0_decoder"])
    logging.debug(f"{cls.__name__}:Loaded model with hps: {hps}")
    return net_g, hps
class TextEncoder:
  """Prior encoder: adds a pitch embedding, runs the attention encoder and samples a latent."""
  def __init__(self, out_channels, hidden_channels, kernel_size, n_layers, gin_channels=0, filter_channels=None, n_heads=None, p_dropout=None):
    self.out_channels = out_channels
    self.hidden_channels = hidden_channels
    self.kernel_size = kernel_size
    self.n_layers = n_layers
    self.gin_channels = gin_channels
    self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
    self.f0_emb = nn.Embedding(256, hidden_channels) # n_vocab = 256
    self.enc_ = Encoder(hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout)

  def forward(self, x, x_mask, f0=None, noise_scale=1):
    """Return `(z, m, logs, x_mask)`.

    `m` and `logs` are the two halves of the projected encoder output;
    `z = (m + noise * exp(logs) * noise_scale) * x_mask`.
    """
    hidden = x + self.f0_emb(f0).transpose(1, 2)
    hidden = self.enc_.forward(hidden * x_mask, x_mask)
    m, logs = split(self.proj(hidden) * x_mask, self.out_channels, dim=1)
    noise = randn_like(m)
    z = (m + noise * logs.exp() * noise_scale) * x_mask
    return z, m, logs, x_mask
class Upsample:
  """Upsampling along the last axis by repeating each element an integer number of times."""
  def __init__(self, scale_factor):
    assert scale_factor % 1 == 0, "Only integer scale factor allowed."
    self.scale = int(scale_factor)

  def forward(self, x:Tensor):
    """Repeat every element of the last axis `self.scale` times, keeping all other axes."""
    n_dims = len(x.shape)
    # append a unit axis, tile only that axis, then fold it back into the last axis
    tiled = x.unsqueeze(-1).repeat((1,) * n_dims + (self.scale,))
    return tiled.reshape((*x.shape[:-1], x.shape[-1] * self.scale))
class SineGen:
  """Sine-wave source generator: turns an f0 track into harmonic sine waves plus noise."""
  def __init__(self, samp_rate, harmonic_num=0, sine_amp=0.1, noise_std=0.003, voice_threshold=0, flag_for_pulse=False):
    self.sine_amp, self.noise_std, self.harmonic_num, self.sampling_rate, self.voiced_threshold, self.flag_for_pulse = sine_amp, noise_std, harmonic_num, samp_rate, voice_threshold, flag_for_pulse
    self.dim = self.harmonic_num + 1 # fundamental + overtones

  def _f02uv(self, f0):
    """Voiced/unvoiced flag: 1.0 where f0 exceeds the voicing threshold, else 0.0."""
    return (f0 > self.voiced_threshold).float()

  def _f02sine(self, f0_values):
    """Convert per-sample frequencies into sine waves by accumulating phase along axis 1."""
    def padDiff(x : Tensor): return (x.pad((0,0,-1,1)) - x).pad((0,0,0,-1)) # x[t+1] - x[t] along axis -2
    def mod(x: Tensor, n: int) -> Tensor: return x - n * x.div(n).floor() # this is what the % operator does in pytorch.
    rad_values = mod((f0_values / self.sampling_rate) , 1) # convert to F0 in rad
    rand_ini = Tensor.rand(f0_values.shape[0], f0_values.shape[2], device=f0_values.device) # initial phase noise
    # tensor-op equivalent of `rand_ini[:, 0] = 0`
    m = Tensor.ones(f0_values.shape[0]).unsqueeze(1).pad((0,f0_values.shape[2]-1,0,0)).cast(dtypes.bool)
    m = tilde(m)
    rand_ini = m.where(rand_ini, 0)
    # tensor-op equivalent of `rad_values[:, 0, :] = rad_values[:, 0, :] + rand_ini`
    tmp = rad_values[:, 0, :] + rand_ini
    # The flat padding specs below are (left, right) pairs. The previous tuples
    # carried a dangling fifth element with no partner; it is dropped so the
    # spec is well-formed and the padded axes stay the same.
    m = Tensor.ones(tmp.shape).pad((0,0,0,rad_values.shape[1]-1)).cast(dtypes.bool)
    m = tilde(m)
    tmp = tmp.unsqueeze(1).pad((0,0,0,rad_values.shape[1]-1))
    rad_values = m.where(rad_values, tmp)
    tmp_over_one = mod(rad_values.cumsum(1), 1)
    tmp_over_one_idx = padDiff(tmp_over_one) < 0 # phase wrapped between consecutive steps
    # tensor-op equivalent of `cumsum_shift[:, 1:, :] = tmp_over_one_idx * -1.0`
    cumsum_shift = (tmp_over_one_idx * -1.0).pad((0,0,1,0))
    sines = ((rad_values + cumsum_shift).cumsum(1) * 2 * np.pi).sin()
    return sines

  def forward(self, f0, upp=None):
    """Return `(sine_waves, uv, noise)` for the given f0 track.

    `upp` is accepted for interface compatibility and is not used.
    """
    # frequencies of the fundamental and its overtones: f0 * [1, 2, ..., harmonic_num + 1]
    fn = f0.mul(Tensor([[range(1, self.harmonic_num + 2)]], dtype=dtypes.float32).to(f0.device))
    sine_waves = self._f02sine(fn) * self.sine_amp # generate sine waveforms
    uv = self._f02uv(f0) # generate uv signal
    # voiced positions get noise_std, unvoiced positions get sine_amp / 3
    noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3
    noise = noise_amp * randn_like(sine_waves)
    sine_waves = sine_waves * uv + noise
    return sine_waves, uv, noise
class SourceHnNSF:
  """Harmonic-plus-noise source: merges the SineGen harmonics into one excitation channel."""
  def __init__(self, sampling_rate, harmonic_num=0, sine_amp=0.1, add_noise_std=0.003, voiced_threshold=0):
    self.sine_amp = sine_amp
    self.noise_std = add_noise_std
    self.l_sin_gen = SineGen(sampling_rate, harmonic_num, sine_amp, add_noise_std, voiced_threshold)
    self.l_linear = nn.Linear(harmonic_num + 1, 1)

  def forward(self, x, upp=None):
    """Return `(sine_merge, noise, uv)` for the f0 input `x`."""
    harmonics, uv, _ = self.l_sin_gen.forward(x, upp)
    # cast to the linear layer's dtype, merge the harmonics, squash with tanh
    merged = self.l_linear(harmonics.cast(self.l_linear.weight.dtype)).tanh()
    noise = randn_like(uv) * self.sine_amp / 3
    return merged, noise, uv
# most of the hifigan in standard vits is reused here, but need to upsample and construct harmonic source from f0
class Generator:
  """Waveform decoder: upsampling conv stack driven by a harmonic source built from f0."""
  def __init__(self, sampling_rate, inter_channels, resblock, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates, upsample_initial_channel, upsample_kernel_sizes, gin_channels):
    self.sampling_rate = sampling_rate
    self.inter_channels = inter_channels
    self.resblock = resblock
    self.resblock_kernel_sizes = resblock_kernel_sizes
    self.resblock_dilation_sizes = resblock_dilation_sizes
    self.upsample_rates = upsample_rates
    self.upsample_initial_channel = upsample_initial_channel
    self.upsample_kernel_sizes = upsample_kernel_sizes
    self.gin_channels = gin_channels
    self.num_kernels = len(resblock_kernel_sizes)
    self.num_upsamples = len(upsample_rates)
    self.conv_pre = nn.Conv1d(inter_channels, upsample_initial_channel, 7, 1, padding=3)
    self.f0_upsamp = Upsample(scale_factor=np.prod(upsample_rates))
    self.m_source = SourceHnNSF(sampling_rate, harmonic_num=8)
    resblock = ResBlock1 if resblock == '1' else ResBlock2
    self.ups, self.noise_convs, self.resblocks = [], [], []
    n_stages = len(upsample_rates)
    for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
      c_cur = upsample_initial_channel//(2**(i+1)) # channels halve at every stage
      self.ups.append(nn.ConvTranspose1d(upsample_initial_channel//(2**i), c_cur, k, u, padding=(k-u)//2))
      stride_f0 = int(np.prod(upsample_rates[i + 1:])) # upsampling still to come after this stage
      if i + 1 < n_stages:
        source_conv = nn.Conv1d(1, c_cur, kernel_size=stride_f0 * 2, stride=stride_f0, padding=(stride_f0+1) // 2)
      else:
        source_conv = nn.Conv1d(1, c_cur, kernel_size=1) # last stage: source is already at output rate
      self.noise_convs.append(source_conv)
    for i in range(len(self.ups)):
      ch = upsample_initial_channel // (2 ** (i + 1))
      for k, d in zip(resblock_kernel_sizes, resblock_dilation_sizes):
        self.resblocks.append(resblock(ch, k, d))
    self.conv_post = nn.Conv1d(ch, 1, 7, 1, padding=3) # `ch` is the last stage's channel count
    if gin_channels != 0:
      self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1)
    self.upp = np.prod(upsample_rates)

  def forward(self, x, f0, g=None):
    """Decode latent `x` to a waveform, conditioned on `f0` and optionally on `g`."""
    f0 = self.f0_upsamp.forward(f0[:, None]).transpose(1, 2) # bs,n,t
    har_source, _, _ = self.m_source.forward(f0, self.upp)
    har_source = har_source.transpose(1, 2)
    x = self.conv_pre(x)
    if g is not None:
      x = x + self.cond(g)
    for stage in range(self.num_upsamples):
      x = self.ups[stage](x.leaky_relu(LRELU_SLOPE))
      x = x + self.noise_convs[stage](har_source)
      # average the outputs of this stage's residual blocks
      xs = None
      first = stage * self.num_kernels
      for j in range(self.num_kernels):
        if xs is None: xs = self.resblocks[first + j].forward(x)
        else: xs += self.resblocks[first + j].forward(x)
      x = xs / self.num_kernels
    return self.conv_post(x.leaky_relu()).tanh()
# **** helpers ****
def randn_like(x:Tensor) -> Tensor:
  """Standard-normal noise with the shape, dtype and device of `x`."""
  noise = Tensor.randn(*x.shape, dtype=x.dtype)
  return noise.to(device=x.device)
def tilde(x: Tensor) -> Tensor:
  """Emulate the `~` operator: logical not for bool tensors, `-(x + 1)` for other dtypes."""
  if x.dtype != dtypes.bool:
    return (x + 1) * -1 # this seems to be what the ~ operator does in pytorch for non bool
  return (1 - x).cast(dtypes.bool)
def lengths_to_padding_mask(lens:Tensor) -> Tensor:
  """Build a (batch, max_len) bool mask that is True at positions >= each row's length."""
  batch = lens.shape[0]
  longest = lens.max().numpy().item()
  positions = Tensor.arange(longest).to(lens.device).reshape(1, longest).expand(batch, -1)
  limits = lens.reshape(batch, 1).expand(-1, longest)
  return (positions >= limits).cast(dtypes.bool)
def repeat_expand_2d_left(content, target_len): # content : [h, t]
  """Stretch `content` along its last axis to `target_len` columns by repeating source columns.

  The source index advances by at most one per output column, whenever the
  output position reaches the next source boundary.
  """
  src_len = content.shape[-1]
  boundaries = np.arange(src_len+1) * target_len / src_len # output position where each source column starts
  src_idx, picked = 0, []
  for dst_idx in range(target_len):
    src_idx += int(dst_idx >= boundaries[src_idx+1])
    picked.append(content[:, src_idx])
  return Tensor.stack(*picked).transpose(0, 1)
def load_fairseq_cfg(checkpoint_path):
  """Load a checkpoint and return its "cfg" entry wrapped in HParams.

  Raises:
    RuntimeError: if the checkpoint has no "cfg" entry or the entry is None.
  """
  assert Path(checkpoint_path).is_file()
  state = torch_load(checkpoint_path)
  if "cfg" not in state or state["cfg"] is None:
    raise RuntimeError(f"No cfg exist in state keys = {state.keys()}")
  return HParams(**state["cfg"])
def load_checkpoint_enc(checkpoint_path, model: ContentVec, optimizer=None, skip_list=[]):
  """Copy the tensors stored under the checkpoint's 'model' key into `model`.

  Every state-dict key is resolved by walking its dotted path through the
  model: numeric parts index into sequences, other parts are dict keys or
  attributes. A `weight_g`/`weight_v` pair is held back until both halves
  have been seen, then merged through `weight_norm` into the parent's
  `weight`. Shape mismatches are logged and skipped.

  Args:
    checkpoint_path: path of an existing checkpoint file (asserted).
    model: object to populate in place.
    optimizer: returned unchanged.
    skip_list: substrings; keys containing any of them are ignored. Never mutated.

  Returns:
    (model, optimizer)
  """
  assert Path(checkpoint_path).is_file()
  start_time = time.time()
  checkpoint_dict = torch_load(checkpoint_path)
  saved_state_dict = checkpoint_dict['model']
  weight_g, weight_v, parent = None, None, None
  for key, v in saved_state_dict.items():
    if any(layer in key for layer in skip_list): continue
    obj, skip = model, False
    for k in key.split('.'):
      if k.isnumeric(): obj = obj[int(k)]
      elif isinstance(obj, dict): obj = obj[k]
      else:
        if k in ["weight_g", "weight_v"]:
          # weight-norm halves are not attributes of the target; stash them until both arrived
          parent, skip = obj, True
          if k == "weight_g": weight_g = v
          else: weight_v = v
        if not skip:
          parent = obj
          obj = getattr(obj, k)
    # Compare against None explicitly: the previous `weight_g and weight_v`
    # took the truth value of the tensors themselves.
    if weight_g is not None and weight_v is not None:
      setattr(obj, "weight_g", weight_g.numpy())
      setattr(obj, "weight_v", weight_v.numpy())
      obj, v = getattr(parent, "weight"), weight_norm(weight_v, weight_g, 0)
      weight_g, weight_v, parent, skip = None, None, None, False
    if not skip and obj.shape == v.shape:
      if "feature_extractor" in key and (isinstance(parent, (nn.GroupNorm, nn.LayerNorm))): # cast
        obj.assign(v.to(obj.device).float())
      else:
        obj.assign(v.to(obj.device))
    elif not skip: logging.error(f"MISMATCH SHAPE IN {key}, {obj.shape} {v.shape}")
  logging.info(f"Loaded checkpoint '{checkpoint_path}' in {time.time() - start_time:.4f}s")
  return model, optimizer
def pad_array(arr, target_length):
  """Zero-pad a 1-D array to `target_length`, centred, with any odd extra sample on the right.

  Arrays that are already long enough are returned unchanged (same object).
  """
  missing = target_length - arr.shape[0]
  if missing <= 0:
    return arr
  left = missing // 2
  return np.pad(arr, (left, missing - left), 'constant', constant_values=(0, 0))
def split_list_by_n(list_collection, n, pre=0):
  """Yield consecutive chunks of `n` items, each extended backwards by `pre` items of overlap.

  The overlap is skipped for a chunk when it would reach before the start of the sequence.
  """
  for begin in range(0, len(list_collection), n):
    overlap_begin = begin - pre
    if overlap_begin < 0:
      overlap_begin = begin
    yield list_collection[overlap_begin: begin + n]
def get_sid(spk2id:HParams, speaker:str) -> Tensor:
  """Resolve `speaker` to its id and return it as an int64 tensor of shape (1, 1).

  An integer `speaker` is used directly as the id when the lookup yields a
  falsy value and the number of entries in `spk2id` is at least `speaker`.

  Raises:
    RuntimeError: if the resolved id is None.
  """
  sid = spk2id[speaker]
  if not sid and type(speaker) is int and len(spk2id.__dict__) >= speaker:
    sid = speaker
  if sid is None:
    raise RuntimeError(f"speaker={speaker} not in the speaker list")
  return Tensor([int(sid)], dtype=dtypes.int64).unsqueeze(0)
def get_encoder(ssl_dim) -> Type[SpeechEncoder]:
  """Pick the speech-encoder class for the given feature width (256 or 768).

  Any other width yields None.
  """
  encoder_cls = None
  if ssl_dim == 256:
    encoder_cls = ContentVec256L9
  elif ssl_dim == 768:
    encoder_cls = ContentVec768L12
  return encoder_cls
#########################################################################################
# CODE: https://github.com/svc-develop-team/so-vits-svc
#########################################################################################
# CONTENTVEC:
# CODE: https://github.com/auspicious3000/contentvec
# PAPER: https://arxiv.org/abs/2204.09224
#########################################################################################
# INSTALLATION: dependencies are for preprocessing and loading/saving audio.
# pip3 install soundfile librosa praat-parselmouth
#########################################################################################
# EXAMPLE USAGE:
# python3 examples/so_vits_svc.py --model tf2spy --file ~/recording.wav
#########################################################################################
# DEMO USAGE (uses audio sample from LJ-Speech):
# python3 examples/so_vits_svc.py --model saul_goodman
#########################################################################################
# Local directory holding downloaded So-VITS-SVC configs and checkpoints.
SO_VITS_SVC_PATH = Path(__file__).parents[1] / "weights/So-VITS-SVC"

# model name -> (config_path, weights_path, config_url, weights_url)
VITS_MODELS = {
  "saul_goodman": (
    SO_VITS_SVC_PATH / "config_saul_gman.json",
    SO_VITS_SVC_PATH / "pretrained_saul_gman.pth",
    "https://huggingface.co/Amo/so-vits-svc-4.0_GA/resolve/main/ModelsFolder/Saul_Goodman_80000/config.json",
    "https://huggingface.co/Amo/so-vits-svc-4.0_GA/resolve/main/ModelsFolder/Saul_Goodman_80000/G_80000.pth",
  ),
  "drake": (
    SO_VITS_SVC_PATH / "config_drake.json",
    SO_VITS_SVC_PATH / "pretrained_drake.pth",
    "https://huggingface.co/jaspa/so-vits-svc/resolve/main/aubrey/config_aubrey.json",
    "https://huggingface.co/jaspa/so-vits-svc/resolve/main/aubrey/pretrained_aubrey.pth",
  ),
  "cartman": (
    SO_VITS_SVC_PATH / "config_cartman.json",
    SO_VITS_SVC_PATH / "pretrained_cartman.pth",
    "https://huggingface.co/marcoc2/so-vits-svc-4.0-models/resolve/main/EricCartman/config.json",
    "https://huggingface.co/marcoc2/so-vits-svc-4.0-models/resolve/main/EricCartman/G_10200.pth",
  ),
  "tf2spy": (
    SO_VITS_SVC_PATH / "config_tf2spy.json",
    SO_VITS_SVC_PATH / "pretrained_tf2spy.pth",
    "https://huggingface.co/Amo/so-vits-svc-4.0_GA/resolve/main/ModelsFolder/TF2_spy_60k/config.json",
    "https://huggingface.co/Amo/so-vits-svc-4.0_GA/resolve/main/ModelsFolder/TF2_spy_60k/G_60000.pth",
  ),
  "tf2heavy": (
    SO_VITS_SVC_PATH / "config_tf2heavy.json",
    SO_VITS_SVC_PATH / "pretrained_tf2heavy.pth",
    "https://huggingface.co/Amo/so-vits-svc-4.0_GA/resolve/main/ModelsFolder/TF2_heavy_100k/config.json",
    "https://huggingface.co/Amo/so-vits-svc-4.0_GA/resolve/main/ModelsFolder/TF2_heavy_100k/G_100000.pth",
  ),
  "lady_gaga": (
    SO_VITS_SVC_PATH / "config_gaga.json",
    SO_VITS_SVC_PATH / "pretrained_gaga.pth",
    "https://huggingface.co/marcoc2/so-vits-svc-4.0-models/resolve/main/LadyGaga/config.json",
    "https://huggingface.co/marcoc2/so-vits-svc-4.0-models/resolve/main/LadyGaga/G_14400.pth",
  ),
}

# encoder name -> (weights_path, weights_url)
ENCODER_MODELS = {
  "contentvec": (
    SO_VITS_SVC_PATH / "contentvec_checkpoint.pt",
    "https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/hubert_base.pt",
  ),
}
ENCODER_MODEL = "contentvec"

# Input used when no --file is given.
DEMO_PATH = Path(__file__).parents[1] / "temp/LJ037-0171.wav"
DEMO_URL = "https://keithito.com/LJ-Speech-Dataset/LJ037-0171.wav"
if __name__=="__main__":
  # Command-line entry point: convert the voice in a .wav file with a pretrained model.
  logging.basicConfig(stream=sys.stdout, level=(logging.INFO if DEBUG < 1 else logging.DEBUG))
  parser = argparse.ArgumentParser()
  parser.add_argument("-m", "--model", default=None, help=f"Specify the model to use. All supported models: {VITS_MODELS.keys()}", required=True)
  parser.add_argument("-f", "--file", default=DEMO_PATH, help="Specify the path of the input file")
  parser.add_argument("--out_dir", default=str(Path(__file__).parents[1] / "temp"), help="Specify the output path.")
  parser.add_argument("--out_path", default=None, help="Specify the full output path. Overrides the --out_dir and --name parameter.")
  parser.add_argument("--base_name", default="test", help="Specify the base of the output file name. Default is 'test'.")
  parser.add_argument("--speaker", default=None, help="If not specified, the first available speaker is chosen. Usually there is only one speaker per model.")
  # The numeric options declare type=float: without it a value given on the
  # command line arrives as a string and breaks the arithmetic below. Defaults
  # are unchanged (argparse does not run non-string defaults through `type`).
  parser.add_argument("--noise_scale", default=0.4, type=float)
  parser.add_argument("--tran", default=0.0, type=float, help="Pitch shift, supports positive and negative (semitone) values. Default 0.0")
  parser.add_argument("--pad_seconds", default=0.5, type=float)
  parser.add_argument("--lg_num", default=0.0, type=float)
  parser.add_argument("--clip_seconds", default=0.0, type=float)
  parser.add_argument("--slice_db", default=-40, type=float)
  args = parser.parse_args()

  vits_model = args.model
  encoder_location, vits_location = ENCODER_MODELS[ENCODER_MODEL], VITS_MODELS[vits_model]

  Tensor.training = False
  # Get Synthesizer and ContentVec
  net_g, hps = Synthesizer.load_from_pretrained(vits_location[0], vits_location[2], vits_location[1], vits_location[3])
  Encoder = get_encoder(hps.model.ssl_dim)
  encoder = Encoder.load_from_pretrained(encoder_location[0], encoder_location[1])

  # model config args
  target_sample, spk2id, hop_length = hps.data.sampling_rate, hps.spk, hps.data.hop_length
  # Read the flag from the section that may define it (hps.model); the previous
  # check tested hps.data but then read hps.model.
  vol_embedding = getattr(hps.model, "vol_embedding", None)
  if vol_embedding is None: vol_embedding = False

  # args
  slice_db, clip_seconds, lg_num, pad_seconds, tran, noise_scale, audio_path = args.slice_db, args.clip_seconds, args.lg_num, args.pad_seconds, args.tran, args.noise_scale, args.file
  speaker = args.speaker if args.speaker is not None else list(hps.spk.__dict__.keys())[0]

  ### Loading audio and slicing ###
  if audio_path == DEMO_PATH: fetch(DEMO_URL, DEMO_PATH)
  assert Path(audio_path).is_file() and Path(audio_path).suffix == ".wav"
  chunks = preprocess.cut(audio_path, db_thresh=slice_db)
  audio_data, audio_sr = preprocess.chunks2audio(audio_path, chunks)

  per_size = int(clip_seconds * audio_sr) # samples per clip; 0 disables clipping
  lg_size = int(lg_num * audio_sr) # samples of overlap between consecutive clips

  ### Infer per slice ###
  global_frame = 0
  audio = []
  for (slice_tag, data) in audio_data:
    print(f"\n====segment start, {round(len(data) / audio_sr, 3)}s====")
    length = int(np.ceil(len(data) / audio_sr * target_sample))

    if slice_tag:
      # tagged segments are not synthesized; emit zeros of the matching length
      print("empty segment")
      _audio = np.zeros(length)
      audio.extend(list(pad_array(_audio, length)))
      global_frame += length // hop_length
      continue

    datas = [data] if per_size == 0 else split_list_by_n(data, per_size, lg_size)
    for k, dat in enumerate(datas):
      per_length = int(np.ceil(len(dat) / audio_sr * target_sample)) if clip_seconds!=0 else length
      pad_len = int(audio_sr * pad_seconds)
      dat = np.concatenate([np.zeros([pad_len]), dat, np.zeros([pad_len])])
      raw_path = io.BytesIO()
      soundfile.write(raw_path, dat, audio_sr, format="wav")
      raw_path.seek(0)

      ### Infer START ###
      wav, sr = preprocess.load_audiofile(raw_path)
      wav = preprocess.sinc_interp_resample(wav, sr, target_sample)[0]
      wav16k, f0, uv = preprocess.get_unit_f0(wav, tran, hop_length, target_sample)
      sid = get_sid(spk2id, speaker)
      n_frames = f0.shape[1]

      # ContentVec infer
      start = time.time()
      c = encoder.encode(wav16k)
      c = repeat_expand_2d_left(c.squeeze(0).realize(), f0.shape[1]) # interpolate speech encoding to match f0
      c = c.unsqueeze(0).realize()
      enc_time = time.time() - start

      # VITS infer
      vits_start = time.time()
      out_audio, f0 = net_g.infer(c, f0=f0, uv=uv, g=sid, noise_scale=noise_scale, vol=None)
      out_audio = out_audio[0,0].float().realize()
      vits_time = time.time() - vits_start

      infer_time = time.time() - start
      logging.info("total infer time:{:.2f}s, speech_enc time:{:.2f}s, vits time:{:.2f}s".format(infer_time, enc_time, vits_time))
      ### Infer END ###

      global_frame += n_frames
      _audio = out_audio.numpy()
      pad_len = int(target_sample * pad_seconds)
      # Strip the padding added before inference. An explicit end index is
      # needed because `[pad_len:-pad_len]` is empty when pad_len == 0.
      _audio = _audio[pad_len:len(_audio) - pad_len]
      _audio = pad_array(_audio, per_length)
      audio.extend(list(_audio))

  audio = np.array(audio)
  out_path = Path(args.out_path or Path(args.out_dir)/f"{args.model}{f'_spk_{speaker}'}_{args.base_name}.wav")
  out_path.parent.mkdir(parents=True, exist_ok=True)
  # NOTE(review): the data is written in FLAC format although the default file
  # name ends in ".wav" -- confirm which of the two is intended.
  soundfile.write(out_path, audio, target_sample, format="flac")
  logging.info(f"Saved audio output to {out_path}")

View file

@ -1,204 +0,0 @@
import math
from typing import Optional, Tuple
from tinygrad import Tensor, dtypes
import librosa
import soundfile
import numpy as np
import parselmouth
class PMF0Predictor: # from https://github.com/svc-develop-team/so-vits-svc/
  """F0 (pitch) predictor built on parselmouth's autocorrelation pitch tracker."""
  def __init__(self,hop_length=512,f0_min=50,f0_max=1100,sampling_rate=44100):
    self.hop_length, self.f0_min, self.f0_max, self.sampling_rate, self.name = hop_length, f0_min, f0_max, sampling_rate, "pm"

  def interpolate_f0(self,f0):
    """Fill unvoiced (zero) frames of `f0` by linear interpolation between voiced frames.

    Args:
      f0: 1-D numpy array of per-frame frequencies, 0 where unvoiced.

    Returns:
      (f0_filled, vuv): the interpolated track and a float32 vector that is
      1.0 where the input was > 0 and 0.0 elsewhere.
    """
    vuv_vector = (f0 > 0.0).astype(np.float32)
    nzindex = np.nonzero(f0)[0]
    data = f0[nzindex]
    if data.shape[0] <= 0: return np.zeros(f0.shape[0], dtype=np.float32),vuv_vector
    # Exactly one voiced frame: hold its value everywhere. The previous code
    # used f0[0], which is 0 unless the voiced frame happens to be the first.
    if data.shape[0] == 1: return np.ones(f0.shape[0], dtype=np.float32) * data[0],vuv_vector
    frame_period = self.hop_length / self.sampling_rate # seconds per frame
    time_org = frame_period * nzindex.astype(np.float32)
    time_frame = np.arange(f0.shape[0]) * self.hop_length / self.sampling_rate
    f0 = np.interp(time_frame, time_org, data, left=data[0], right=data[-1])
    return f0,vuv_vector

  def compute_f0(self,wav,p_len=None):
    """Return only the interpolated f0 track for `wav` (see `compute_f0_uv`)."""
    return self.compute_f0_uv(wav, p_len)[0]

  def compute_f0_uv(self,wav,p_len=None):
    """Estimate f0 for `wav` and return `(f0, uv)` padded to `p_len` frames.

    Args:
      wav: 1-D waveform sampled at `self.sampling_rate`.
      p_len: expected number of frames; defaults to `len(wav) // hop_length`
        and must be within 4 frames of that value (asserted).
    """
    x = wav
    if p_len is None: p_len = x.shape[0]//self.hop_length
    else: assert abs(p_len-x.shape[0]//self.hop_length) < 4, "pad length error"
    time_step = self.hop_length / self.sampling_rate * 1000 # frame period in ms
    f0 = parselmouth.Sound(x, self.sampling_rate).to_pitch_ac(
      time_step=time_step / 1000, voicing_threshold=0.6,
      pitch_floor=self.f0_min, pitch_ceiling=self.f0_max).selected_array['frequency']
    # centre the tracker output inside p_len frames by zero-padding both sides
    pad_size=(p_len - len(f0) + 1) // 2
    if(pad_size>0 or p_len - len(f0) - pad_size>0):
      f0 = np.pad(f0,[[pad_size,p_len - len(f0) - pad_size]], mode='constant')
    f0,uv = self.interpolate_f0(f0)
    return f0,uv
# Silence-based audio slicer. slice() finds low-RMS frame ranges and returns a dict that
# describes consecutive segments of the input as "start,end" sample-offset strings.
class Slicer: # from https://github.com/svc-develop-team/so-vits-svc/
# sr: sample rate. threshold: silence threshold in dB. min_length, min_interval, hop_size and
# max_sil_kept are durations in milliseconds (each is scaled by sr / 1000 below).
# Raises ValueError when min_length >= min_interval >= hop_size or max_sil_kept >= hop_size does not hold.
def __init__(self, sr: int, threshold: float = -40., min_length: int = 5000, min_interval: int = 300, hop_size: int = 20, max_sil_kept: int = 5000):
if not min_length >= min_interval >= hop_size:
raise ValueError('The following condition must be satisfied: min_length >= min_interval >= hop_size')
if not max_sil_kept >= hop_size:
raise ValueError('The following condition must be satisfied: max_sil_kept >= hop_size')
# min_interval in samples (still a float here; rounded where it is used).
min_interval = sr * min_interval / 1000
# dB -> linear amplitude, compared against per-frame RMS in slice().
self.threshold = 10 ** (threshold / 20.)
# hop_size and win_size are stored in samples ...
self.hop_size = round(sr * hop_size / 1000)
self.win_size = min(round(min_interval), 4 * self.hop_size)
# ... while min_length, min_interval and max_sil_kept are stored in hop-sized frames.
self.min_length = round(sr * min_length / 1000 / self.hop_size)
self.min_interval = round(min_interval / self.hop_size)
self.max_sil_kept = round(sr * max_sil_kept / 1000 / self.hop_size)
# Cut frames [begin, end) out of waveform; for inputs with more than one axis the second axis is sliced.
# Not called by slice() in this class.
def _apply_slice(self, waveform, begin, end):
if len(waveform.shape) > 1: return waveform[:, begin * self.hop_size: min(waveform.shape[1], end * self.hop_size)]
else: return waveform[begin * self.hop_size: min(waveform.shape[0], end * self.hop_size)]
# Returns {"0": {...}, "1": {...}, ...}; each value is {"slice": bool, "split_time": "start,end"}.
# "slice": True marks a tagged silent range, False marks the audio before/between/after those ranges.
# NOTE(review): len(waveform) / waveform.shape[0] are used as the sample count below, which is the
# first axis - for an input with more than one axis that is not the axis _apply_slice treats as samples; confirm callers pass 1-D audio.
def slice(self, waveform):
samples = librosa.to_mono(waveform) if len(waveform.shape) > 1 else waveform
# NOTE(review): samples.shape[0] is a sample count while self.min_length was converted to frames in __init__ - confirm the intended unit.
if samples.shape[0] <= self.min_length: return {"0": {"slice": False, "split_time": f"0,{len(waveform)}"}}
rms_list = librosa.feature.rms(y=samples, frame_length=self.win_size, hop_length=self.hop_size).squeeze(0)
# sil_tags: (start_frame, end_frame) silent ranges to cut; silence_start: first frame of the
# current silent run (None when not in one); clip_start: frame where the current clip began.
sil_tags, silence_start, clip_start = [], None, 0
for i, rms in enumerate(rms_list):
if rms < self.threshold: # Keep looping while frame is silent.
if silence_start is None: # Record start of silent frames.
silence_start = i
continue
if silence_start is None: continue # Keep looping while frame is not silent and silence start has not been recorded.
# Clear recorded silence start if interval is not enough or clip is too short
is_leading_silence = silence_start == 0 and i > self.max_sil_kept
need_slice_middle = i - silence_start >= self.min_interval and i - clip_start >= self.min_length
if not is_leading_silence and not need_slice_middle:
silence_start = None
continue
# Silent run no longer than max_sil_kept: cut at its quietest frame.
if i - silence_start <= self.max_sil_kept: # Need slicing. Record the range of silent frames to be removed.
pos = rms_list[silence_start: i + 1].argmin() + silence_start
sil_tags.append((0, pos) if silence_start == 0 else (pos, pos))
clip_start = pos
# Run of up to 2 * max_sil_kept frames: combine the quietest frame of the overlap window (pos)
# with the quietest frames of the left (pos_l) and right (pos_r) max_sil_kept windows.
elif i - silence_start <= self.max_sil_kept * 2:
pos = rms_list[i - self.max_sil_kept: silence_start + self.max_sil_kept + 1].argmin()
pos += i - self.max_sil_kept
pos_l = rms_list[silence_start: silence_start + self.max_sil_kept + 1].argmin() + silence_start
pos_r = rms_list[i - self.max_sil_kept: i + 1].argmin() + i - self.max_sil_kept
if silence_start == 0:
sil_tags.append((0, pos_r))
clip_start = pos_r
else:
sil_tags.append((min(pos_l, pos), max(pos_r, pos)))
clip_start = max(pos_r, pos)
# Longer run: cut from the quietest frame of the left window to the quietest frame of the right window.
else:
pos_l = rms_list[silence_start: silence_start + self.max_sil_kept + 1].argmin() + silence_start
pos_r = rms_list[i - self.max_sil_kept: i + 1].argmin() + i - self.max_sil_kept
sil_tags.append((0, pos_r) if silence_start == 0 else (pos_l, pos_r))
clip_start = pos_r
silence_start = None
total_frames = rms_list.shape[0]
if silence_start is not None and total_frames - silence_start >= self.min_interval: # Deal with trailing silence.
silence_end = min(total_frames, silence_start + self.max_sil_kept)
pos = rms_list[silence_start: silence_end + 1].argmin() + silence_start
sil_tags.append((pos, total_frames + 1))
if len(sil_tags) == 0: return {"0": {"slice": False, "split_time": f"0,{len(waveform)}"}} # Apply and return slices.
# Frame indices are converted to sample offsets by multiplying with hop_size; end offsets are clamped to waveform.shape[0].
chunks = []
# Audio before the first tagged range (skipped when that range starts at frame 0).
if sil_tags[0][0]:
chunks.append({"slice": False, "split_time": f"0,{min(waveform.shape[0], sil_tags[0][0] * self.hop_size)}"})
for i in range(0, len(sil_tags)):
# Audio between the previous tagged range and this one, then the tagged range itself.
if i: chunks.append({"slice": False, "split_time": f"{sil_tags[i - 1][1] * self.hop_size},{min(waveform.shape[0], sil_tags[i][0] * self.hop_size)}"})
chunks.append({"slice": True, "split_time": f"{sil_tags[i][0] * self.hop_size},{min(waveform.shape[0], sil_tags[i][1] * self.hop_size)}"})
# Audio after the last tagged range, if any remains.
if sil_tags[-1][1] * self.hop_size < len(waveform):
chunks.append({"slice": False, "split_time": f"{sil_tags[-1][1] * self.hop_size},{len(waveform)}"})
# Re-key the ordered chunk list by its stringified position.
chunk_dict = {}
for i in range(len(chunks)): chunk_dict[str(i)] = chunks[i]
return chunk_dict
# sinc_interp_hann audio resampling
class Resample:
  # Callable resampler: converts waveforms from orig_freq to new_freq by convolving with a
  # windowed-sinc kernel that is built once in the constructor. When both rates are equal
  # no kernel is built and the input is returned unchanged.
  def __init__(self, orig_freq:int=16000, new_freq:int=16000, lowpass_filter_width:int=6, rolloff:float=0.99, beta:Optional[float]=None, dtype:Optional[dtypes]=None):
    self.orig_freq = orig_freq
    self.new_freq = new_freq
    self.lowpass_filter_width = lowpass_filter_width
    self.rolloff = rolloff
    self.beta = beta  # stored only; not read by any method of this class
    self.gcd = math.gcd(int(self.orig_freq), int(self.new_freq))
    if self.orig_freq != self.new_freq:
      self.kernel, self.width = self._get_sinc_resample_kernel(dtype)
    else:
      self.kernel, self.width = None, None

  def __call__(self, waveform:Tensor) -> Tensor:
    return waveform if self.orig_freq == self.new_freq else self._apply_sinc_resample_kernel(waveform)

  def _apply_sinc_resample_kernel(self, waveform:Tensor):
    # Flattens all leading axes into a batch, runs a strided convolution with the cached
    # kernel, trims to ceil(new/orig * length) samples and restores the leading axes.
    if not waveform.is_floating_point(): raise TypeError(f"Waveform tensor expected to be of type float, but received {waveform.dtype}.")
    src_step = int(self.orig_freq) // self.gcd
    dst_step = int(self.new_freq) // self.gcd
    in_shape = waveform.shape
    flat = waveform.reshape(-1, in_shape[-1]) # pack batch
    num_wavs, length = flat.shape
    target_length = int(math.ceil(dst_step * length / src_step))
    padded = flat.pad((self.width, self.width + src_step))
    out = padded[:, None].conv2d(self.kernel, stride=src_step).transpose(1, 2).reshape(num_wavs, -1)
    out = out[..., :target_length]
    return out.reshape(in_shape[:-1] + out.shape[-1:]) # unpack batch

  def _get_sinc_resample_kernel(self, dtype=None):
    # Builds the (kernels, width) pair: one sinc filter per output phase, tapered by a
    # squared-cosine window. Raises ValueError for a non-positive lowpass_filter_width.
    src_step, dst_step = int(self.orig_freq) // self.gcd, int(self.new_freq) // self.gcd
    if self.lowpass_filter_width <= 0: raise ValueError("Low pass filter width should be positive.")
    cutoff = min(src_step, dst_step) * self.rolloff
    width = math.ceil(self.lowpass_filter_width * src_step / cutoff)
    idx_dtype = dtypes.float32 if dtype is None else dtype
    idx = Tensor.arange(-width, width + src_step, dtype=idx_dtype)[None, None] / src_step
    t = Tensor.arange(0, -dst_step, -1, dtype=dtype)[:, None, None] / dst_step + idx
    t *= cutoff
    t = t.clip(-self.lowpass_filter_width, self.lowpass_filter_width)
    window = (t * math.pi / self.lowpass_filter_width / 2).cos() ** 2
    t *= math.pi
    # sin(t)/t with the t == 0 entries replaced by 1.
    kernels = Tensor.where(t == 0, Tensor(1.0, dtype=t.dtype).to(t.device), t.sin() / t)
    kernels *= window * (cutoff / src_step)
    if dtype is None: kernels = kernels.cast(dtype=dtypes.float32)
    return kernels, width
def sinc_interp_resample(x:Tensor, orig_freq:int=16000, new_freq:int=1600, lowpass_filter_width:int=6, rolloff:float=0.99, beta:Optional[float]=None):
  # One-shot helper: builds a Resample for x's dtype and applies it to x.
  # NOTE(review): new_freq defaults to 1600 here while Resample defaults to 16000 - confirm this is intended.
  return Resample(orig_freq, new_freq, lowpass_filter_width, rolloff, beta, x.dtype)(x)
def cut(audio_path, db_thresh=-30, min_len=5000):
  # Load the file at its native sample rate (sr=None) and return Slicer.slice()'s chunk dict for it.
  audio, sr = librosa.load(audio_path, sr=None)
  return Slicer(sr=sr, threshold=db_thresh, min_length=min_len).slice(audio)
def chunks2audio(audio_path, chunks):
  # Load the audio file and cut it according to `chunks` (a mapping whose values carry
  # "slice" and a "start,end" "split_time" string). Returns ([(slice_flag, samples), ...], sample_rate);
  # chunks whose start and end strings are equal are dropped.
  chunk_map = dict(chunks)
  audio, sr = load_audiofile(audio_path)
  if len(audio.shape) == 2 and audio.shape[1] >= 2:
    # Average over the first axis and keep a leading axis of size 1.
    audio = audio.mean(0).unsqueeze(0)
  samples = audio.numpy()[0]
  segments = []
  for info in chunk_map.values():
    bounds = info["split_time"].split(",")
    if bounds[0] == bounds[1]: continue
    segments.append((info["slice"], samples[int(bounds[0]):int(bounds[1])]))
  return segments, sr
def load_audiofile(filepath:str, frame_offset:int=0, num_frames:int=-1, channels_first:bool=True):
  # Read float32 frames from `filepath` with soundfile and return (waveform Tensor, sample_rate).
  # The data is read as 2-D (always_2d=True); with channels_first the two axes are swapped.
  with soundfile.SoundFile(filepath, "r") as snd:
    # relies on soundfile's private _prepare_read to resolve the frame range
    n_frames = snd._prepare_read(frame_offset, None, num_frames)
    data = snd.read(n_frames, "float32", always_2d=True)
    rate = snd.samplerate
  tensor = Tensor(data)
  return (tensor.transpose(0, 1) if channels_first else tensor), rate
def get_unit_f0(wav:Tensor, tran, hop_length, target_sample, f0_filter=False) -> Tuple[Tensor,Tensor,Tensor]:
  # Returns (wav resampled to 16 kHz, f0 scaled by 2 ** (tran / 12), voiced/unvoiced mask);
  # f0 and the mask get a leading axis of size 1, and all three tensors are realized.
  # Raises RuntimeError when f0_filter is set and the F0 track sums to zero.
  predictor = PMF0Predictor(hop_length, sampling_rate=target_sample)
  f0_np, uv_np = predictor.compute_f0_uv(wav.numpy())
  if f0_filter and sum(f0_np) == 0:
    raise RuntimeError("No voice detected")
  f0 = (Tensor(f0_np.astype(np.float32)).float() * 2 ** (tran / 12)).unsqueeze(0)
  uv = Tensor(uv_np.astype(np.float32)).float().unsqueeze(0)
  wav16k = sinc_interp_resample(wav[None,:], target_sample, 16000)[0]
  return wav16k.realize(), f0.realize(), uv.realize()

View file

@ -9,7 +9,7 @@ from typing import Dict, Any
from PIL import Image
import numpy as np
from tinygrad import Device, GlobalCounters, dtypes, Tensor, TinyJit
from tinygrad.helpers import Timing, Context, getenv, fetch, colored, tqdm, flatten
from tinygrad.helpers import Timing, Context, getenv, fetch, colored, tqdm, flatten, profile_marker
from tinygrad.nn import Conv2d, GroupNorm
from tinygrad.nn.state import torch_load, load_state_dict, get_state_dict
from extra.models.clip import Closed, Tokenizer, FrozenOpenClipEmbedder
@ -266,13 +266,16 @@ if __name__ == "__main__":
parser.add_argument('--fakeweights', action='store_true', help="Skip loading checkpoints and use fake weights")
args = parser.parse_args()
profile_marker("create model")
model = StableDiffusion()
# load in weights
profile_marker("load in weights")
with WallTimeEvent(BenchEvent.LOAD_WEIGHTS):
if not args.fakeweights:
model_bin = fetch('https://huggingface.co/CompVis/stable-diffusion-v-1-4-original/resolve/main/sd-v1-4.ckpt', 'sd-v1-4.ckpt')
load_state_dict(model, torch_load(model_bin)['state_dict'], verbose=False, strict=False, realize=False)
state_dict = torch_load(model_bin)['state_dict']
profile_marker("state dict loaded")
load_state_dict(model, state_dict, verbose=False, strict=False, realize=False)
if args.fp16:
for k,v in get_state_dict(model).items():
@ -281,12 +284,13 @@ if __name__ == "__main__":
Tensor.realize(*get_state_dict(model).values())
# run through CLIP to get context
profile_marker("run clip (conditional)")
tokenizer = Tokenizer.ClipTokenizer()
prompt = Tensor([tokenizer.encode(args.prompt)])
context = model.cond_stage_model.transformer.text_model(prompt).realize()
print("got CLIP context", context.shape)
profile_marker("run clip (unconditional)")
prompt = Tensor([tokenizer.encode("")])
unconditional_context = model.cond_stage_model.transformer.text_model(prompt).realize()
print("got unconditional CLIP context", unconditional_context.shape)
@ -310,6 +314,7 @@ if __name__ == "__main__":
step_times = []
with Context(BEAM=getenv("LATEBEAM")):
for index, timestep in (t:=tqdm(list(enumerate(timesteps))[::-1])):
profile_marker(f"step {len(timesteps)-index-1}")
GlobalCounters.reset()
st = time.perf_counter_ns()
t.set_description("%3d %3d" % (index, timestep))
@ -319,24 +324,26 @@ if __name__ == "__main__":
latent = run(model, unconditional_context, context, latent, Tensor([timestep]), alphas[tid], alphas_prev[tid], Tensor([args.guidance]))
if args.timing: Device[Device.DEFAULT].synchronize()
step_times.append((time.perf_counter_ns() - st)*1e-6)
# done with diffusion model
del run
del model.model
if (assert_time:=getenv("ASSERT_MIN_STEP_TIME")):
min_time = min(step_times)
assert min_time < assert_time, f"Speed regression, expected min step time of < {assert_time} ms but took: {min_time} ms"
# upsample latent space to image with autoencoder
x = model.decode(latent)
profile_marker("run decoder") # upsample latent space to image with autoencoder
x = model.decode(latent).realize()
print(x.shape)
# save image
profile_marker("save image")
im = Image.fromarray(x.numpy())
print(f"saving {args.out}")
im.save(args.out)
# Open image.
if not args.noshow: im.show()
# validation!
if args.prompt == default_prompt and args.steps == 6 and args.seed == 0 and args.guidance == 7.5:
profile_marker("validate")
ref_image = Tensor(np.array(Image.open(Path(__file__).parent / "stable_diffusion_seed0.png")))
distance = (((x.cast(dtypes.float) - ref_image.cast(dtypes.float)) / ref_image.max())**2).mean().item()
assert distance < 3e-3, colored(f"validation failed with {distance=}", "red") # higher distance with WINO

View file

@ -1,104 +0,0 @@
import traceback
import time
from multiprocessing import Process, Queue
import numpy as np
from tinygrad.nn.state import get_parameters
from tinygrad.nn import optim
from tinygrad.helpers import getenv, trange
from tinygrad.tensor import Tensor
from extra.datasets import fetch_cifar
from extra.models.efficientnet import EfficientNet
class TinyConvNet:
  # Two 3x3 conv + relu + max-pool stages followed by a single linear layer.
  def __init__(self, classes=10):
    ksize = 3
    mid_ch, last_ch = 8, 16 # for speed
    self.c1 = Tensor.uniform(mid_ch, 3, ksize, ksize)
    self.c2 = Tensor.uniform(last_ch, mid_ch, ksize, ksize)
    self.l1 = Tensor.uniform(last_ch*6*6, classes)

  def forward(self, x):
    for kernel in (self.c1, self.c2):
      x = x.conv2d(kernel).relu().max_pool2d()
    # flatten everything but the batch axis before the linear layer
    return x.reshape(shape=[x.shape[0], -1]).dot(self.l1)
if __name__ == "__main__":
  # Training script. Environment switches: IMAGENET (dataset / class count), TINY (use TinyConvNet),
  # TRANSFER (EfficientNet with has_se=True and pretrained weights), NUM, BS, STEPS.
  IMAGENET = getenv("IMAGENET")
  classes = 1000 if IMAGENET else 10

  TINY = getenv("TINY")
  TRANSFER = getenv("TRANSFER")
  if TINY:
    model = TinyConvNet(classes)
  elif TRANSFER:
    model = EfficientNet(getenv("NUM", 0), classes, has_se=True)
    model.load_from_pretrained()
  else:
    model = EfficientNet(getenv("NUM", 0), classes, has_se=False)

  parameters = get_parameters(model)
  print("parameter count", len(parameters))
  optimizer = optim.Adam(parameters, lr=0.001)

  BS, steps = getenv("BS", 64 if TINY else 16), getenv("STEPS", 2048)
  print(f"training with batch size {BS} for {steps} steps")

  if IMAGENET:
    from extra.datasets.imagenet import fetch_batch
    def loader(q):
      # Producer loop for the daemon worker processes; a failed fetch is logged and retried.
      while 1:
        try:
          q.put(fetch_batch(BS))
        except Exception:
          traceback.print_exc()
    q = Queue(16)
    for _ in range(2):
      p = Process(target=loader, args=(q,))
      p.daemon = True
      p.start()
  else:
    X_train, Y_train, _, _ = fetch_cifar()
    X_train = X_train.reshape((-1, 3, 32, 32))
    Y_train = Y_train.reshape((-1,))
    # Convert the dataset to numpy once; the training loop only indexes into these arrays,
    # so redoing the conversion on every step was pure overhead.
    X_train_np, Y_train_np = X_train.numpy(), Y_train.numpy()

  with Tensor.train():
    for i in (t := trange(steps)):
      if IMAGENET:
        X, Y = q.get(True)
      else:
        samp = np.random.randint(0, X_train.shape[0], size=(BS))
        X, Y = X_train_np[samp], Y_train_np[samp]

      # forward pass (timed)
      st = time.time()
      out = model.forward(Tensor(X.astype(np.float32), requires_grad=False))
      fp_time = (time.time()-st)*1000.0

      # target matrix: -classes at the label column, 0 elsewhere
      y = np.zeros((BS,classes), np.float32)
      y[range(y.shape[0]),Y] = -classes
      y = Tensor(y, requires_grad=False)
      loss = out.log_softmax().mul(y).mean()

      optimizer.zero_grad()

      # backward pass (timed)
      st = time.time()
      loss.backward()
      bp_time = (time.time()-st)*1000.0

      # optimizer step (timed)
      st = time.time()
      optimizer.step()
      opt_time = (time.time()-st)*1000.0

      # pull results back to numpy (timed)
      st = time.time()
      loss = loss.numpy()
      cat = out.argmax(axis=1).numpy()
      accuracy = (cat == Y).mean()
      finish_time = (time.time()-st)*1000.0

      # printing
      t.set_description("loss %.2f accuracy %.2f -- %.2f + %.2f + %.2f + %.2f = %.2f" %
        (loss, accuracy,
        fp_time, bp_time, opt_time, finish_time,
        fp_time + bp_time + opt_time + finish_time))

      del out, y, loss

View file

@ -1,46 +0,0 @@
import ast
import numpy as np
from PIL import Image
from tinygrad.tensor import Tensor
from tinygrad.helpers import getenv, fetch
from extra.models.vit import ViT
"""
fn = "gs://vit_models/augreg/Ti_16-i21k-300ep-lr_0.001-aug_none-wd_0.03-do_0.0-sd_0.0.npz"
import tensorflow as tf
with tf.io.gfile.GFile(fn, "rb") as f:
dat = f.read()
with open("cache/"+ fn.rsplit("/", 1)[1], "wb") as g:
g.write(dat)
"""
Tensor.training = False
# LARGE=1 selects the 768-dim / 12-head configuration, anything else the tiny one.
vit_kwargs = dict(embed_dim=768, num_heads=12) if getenv("LARGE", 0) == 1 else dict(embed_dim=192, num_heads=3)
m = ViT(**vit_kwargs)
m.load_from_pretrained()

# category labels
lbls = ast.literal_eval(fetch("https://gist.githubusercontent.com/yrevar/942d3a0ac09ec9e5eb3a/raw/238f720ff059c1f82f368259d1ca4ffa5dd8f9f5/imagenet1000_clsidx_to_labels.txt").read_text())

#url = "https://upload.wikimedia.org/wikipedia/commons/4/41/Chicken.jpg"
url = "https://repository-images.githubusercontent.com/296744635/39ba6700-082d-11eb-98b8-cb29fb7369c0"

# junk
img = Image.open(fetch(url))
# scale so that the shorter side becomes 224 while keeping the aspect ratio
aspect_ratio = img.size[0] / img.size[1]
target_size = (int(224*max(aspect_ratio,1.0)), int(224*max(1.0/aspect_ratio,1.0)))
img = np.array(img.resize(target_size))
# centre crop to 224x224
y0,x0=(np.asarray(img.shape)[:2]-224)//2
img = np.moveaxis(img[y0:y0+224, x0:x0+224], [2,0,1], [0,1,2])
# keep the first three channels, add a batch axis and map 0..255 to -1..1
img = img.astype(np.float32)[:3].reshape(1,3,224,224)
img = (img / 255.0 - 0.5) / 0.5

out = m.forward(Tensor(img))
outnp = out.numpy().ravel()
choice = outnp.argmax()
print(out.shape, choice, outnp[choice], lbls[choice])

View file

@ -1,740 +0,0 @@
import json, logging, math, re, sys, time, wave, argparse, numpy as np
from phonemizer.phonemize import default_separator, _phonemize
from phonemizer.backend import EspeakBackend
from phonemizer.punctuation import Punctuation
from functools import reduce
from pathlib import Path
from typing import List
from tinygrad import nn, dtypes
from tinygrad.helpers import fetch
from tinygrad.nn.state import torch_load
from tinygrad.tensor import Tensor
from tinygrad.engine.jit import TinyJit
from unidecode import unidecode
LRELU_SLOPE = 0.1
class Synthesizer:
  # Top-level model: text encoder (enc_p), duration predictor (dp), flow and waveform generator (dec),
  # plus a posterior encoder (enc_q) and, for n_speakers > 1, a speaker embedding table (emb_g).
  def __init__(self, n_vocab, spec_channels, segment_size, inter_channels, hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout, resblock, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates, upsample_initial_channel, upsample_kernel_sizes, n_speakers=0, gin_channels=0, use_sdp=True, emotion_embedding=False, **kwargs):
    self.n_vocab, self.spec_channels, self.inter_channels, self.hidden_channels, self.filter_channels, self.n_heads, self.n_layers, self.kernel_size, self.p_dropout, self.resblock, self.resblock_kernel_sizes, self.resblock_dilation_sizes, self.upsample_rates, self.upsample_initial_channel, self.upsample_kernel_sizes, self.segment_size, self.n_speakers, self.gin_channels, self.use_sdp = n_vocab, spec_channels, inter_channels, hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout, resblock, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates, upsample_initial_channel, upsample_kernel_sizes, segment_size, n_speakers, gin_channels, use_sdp
    self.enc_p = TextEncoder(n_vocab, inter_channels, hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout, emotion_embedding)
    self.dec = Generator(inter_channels, resblock, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates, upsample_initial_channel, upsample_kernel_sizes, gin_channels=gin_channels)
    self.enc_q = PosteriorEncoder(spec_channels, inter_channels, hidden_channels, 5, 1, 16, gin_channels=gin_channels)
    self.flow = ResidualCouplingBlock(inter_channels, hidden_channels, 5, 1, 4, gin_channels=gin_channels)
    self.dp = StochasticDurationPredictor(hidden_channels, 192, 3, 0.5, 4, gin_channels=gin_channels) if use_sdp else DurationPredictor(hidden_channels, 256, 3, 0.5, gin_channels=gin_channels)
    if n_speakers > 1: self.emb_g = nn.Embedding(n_speakers, gin_channels)

  def infer(self, x, x_lengths, sid=None, noise_scale=1.0, length_scale=1, noise_scale_w=1., max_len=None, emotion_embedding=None, max_y_length_estimate_scale=None, pad_length=-1):
    # Encode the text, predict per-token durations and hand over to generate() for the audio.
    x, m_p, logs_p, x_mask = self.enc_p.forward(x.realize(), x_lengths.realize(), emotion_embedding.realize() if emotion_embedding is not None else emotion_embedding)
    # emb_g is only created for n_speakers > 1, so the same condition decides whether a speaker embedding is used.
    g = self.emb_g(sid.reshape(1, 1)).squeeze(1).unsqueeze(-1) if self.n_speakers > 1 else None
    g_in = g.realize() if g is not None else None
    # Only the stochastic predictor takes reverse/noise_scale; the plain DurationPredictor.forward does not accept them.
    if self.use_sdp: logw = self.dp.forward(x, x_mask.realize(), g=g_in, reverse=True, noise_scale=noise_scale_w)
    else: logw = self.dp.forward(x, x_mask.realize(), g=g_in)
    w_ceil = Tensor.ceil(logw.exp() * x_mask * length_scale)
    y_lengths = Tensor.maximum(w_ceil.sum([1, 2]), 1).cast(dtypes.int64)
    return self.generate(g, logs_p, m_p, max_len, max_y_length_estimate_scale, noise_scale, w_ceil, x, x_mask, y_lengths, pad_length)

  def generate(self, g, logs_p, m_p, max_len, max_y_length_estimate_scale, noise_scale, w_ceil, x, x_mask, y_lengths, pad_length):
    # Expand the encoder statistics along the predicted durations, sample z_p, run the reversed
    # flow and decode to a waveform. With max_y_length_estimate_scale set, the output length is
    # estimated up front (no host sync on y_lengths) and checked afterwards; an underestimate
    # triggers one recomputation without the estimate.
    max_y_length = y_lengths.max().item() if max_y_length_estimate_scale is None else max(15, x.shape[-1]) * max_y_length_estimate_scale
    y_mask = sequence_mask(y_lengths, max_y_length).unsqueeze(1).cast(x_mask.dtype)
    attn_mask = x_mask.unsqueeze(2) * y_mask.unsqueeze(-1)
    attn = generate_path(w_ceil, attn_mask)
    m_p_2 = attn.squeeze(1).matmul(m_p.transpose(1, 2)).transpose(1, 2)         # [b, t', t], [b, t, d] -> [b, d, t']
    logs_p_2 = attn.squeeze(1).matmul(logs_p.transpose(1, 2)).transpose(1, 2)   # [b, t', t], [b, t, d] -> [b, d, t']
    z_p = m_p_2 + Tensor.randn(*m_p_2.shape, dtype=m_p_2.dtype) * logs_p_2.exp() * noise_scale
    row_len = y_mask.shape[2]
    if pad_length > -1:
      # Pad flow forward inputs to enable JIT
      assert pad_length > row_len, "pad length is too small"
      y_mask = y_mask.pad(((0, 0), (0, 0), (0, pad_length - row_len))).cast(z_p.dtype)
      # New y_mask tensor to remove sts mask
      y_mask = Tensor(y_mask.numpy(), device=y_mask.device, dtype=y_mask.dtype, requires_grad=y_mask.requires_grad)
      z_p = z_p.squeeze(0).pad(((0, 0), (0, pad_length - z_p.shape[2])), value=1).unsqueeze(0)
    # g is None when no speaker embedding is in use, so only realize it when present.
    z = self.flow.forward(z_p.realize(), y_mask.realize(), g=g.realize() if g is not None else None, reverse=True)
    result_length = reduce(lambda x, y: x * y, self.dec.upsample_rates, row_len)
    o = self.dec.forward((z * y_mask)[:, :, :max_len], g=g)[:, :, :result_length]
    if max_y_length_estimate_scale is not None:
      length_scaler = o.shape[-1] / max_y_length
      o.realize()
      real_max_y_length = y_lengths.max().numpy()
      if real_max_y_length > max_y_length:
        logging.warning(f"Underestimated max length by {(((real_max_y_length / max_y_length) * 100) - 100):.2f}%, recomputing inference without estimate...")
        # pad_length is a required parameter of generate(); forward it on the retry.
        return self.generate(g, logs_p, m_p, max_len, None, noise_scale, w_ceil, x, x_mask, y_lengths, pad_length)
      if real_max_y_length < max_y_length:
        overestimation = ((max_y_length / real_max_y_length) * 100) - 100
        logging.info(f"Overestimated max length by {overestimation:.2f}%")
        if overestimation > 10: logging.warning("Warning: max length overestimated by more than 10%")
        # Trim the samples that correspond to the overestimated frames.
        o = o[:, :, :(real_max_y_length * length_scaler).astype(np.int32)]
    return o
# Flow-based duration predictor. forward() returns a per-example loss ([b]) when reverse is False
# and the first channel of the flowed noise (z0) when reverse is True.
class StochasticDurationPredictor:
def __init__(self, in_channels, filter_channels, kernel_size, p_dropout, n_flows=4, gin_channels=0):
filter_channels = in_channels # it needs to be removed from future version.
self.in_channels, self.filter_channels, self.kernel_size, self.p_dropout, self.n_flows, self.gin_channels = in_channels, filter_channels, kernel_size, p_dropout, n_flows, gin_channels
# Main flow stack: one ElementwiseAffine followed by n_flows x (ConvFlow, Flip).
self.log_flow, self.flows = Log(), [ElementwiseAffine(2)]
for _ in range(n_flows):
self.flows.append(ConvFlow(2, filter_channels, kernel_size, n_layers=3))
self.flows.append(Flip())
# Posterior branch (only used when reverse is False in forward).
self.post_pre, self.post_proj = nn.Conv1d(1, filter_channels, 1), nn.Conv1d(filter_channels, filter_channels, 1)
self.post_convs = DDSConv(filter_channels, kernel_size, n_layers=3, p_dropout=p_dropout)
self.post_flows = [ElementwiseAffine(2)]
# NOTE: the posterior stack always has 4 (ConvFlow, Flip) pairs, independent of n_flows.
for _ in range(4):
self.post_flows.append(ConvFlow(2, filter_channels, kernel_size, n_layers=3))
self.post_flows.append(Flip())
self.pre, self.proj = nn.Conv1d(in_channels, filter_channels, 1), nn.Conv1d(filter_channels, filter_channels, 1)
self.convs = DDSConv(filter_channels, kernel_size, n_layers=3, p_dropout=p_dropout)
# Conditioning projection exists only when gin_channels != 0.
if gin_channels != 0: self.cond = nn.Conv1d(gin_channels, filter_channels, 1)
# x: encoder output, x_mask: mask multiplied onto the activations, w: durations (required when
# reverse is False), g: optional conditioning added after self.pre, noise_scale: scales the
# sampled noise in the reverse path. x and g are detached before use.
@TinyJit
def forward(self, x: Tensor, x_mask, w=None, g=None, reverse=False, noise_scale=1.0):
x = self.pre(x.detach())
if g is not None: x = x + self.cond(g.detach())
x = self.convs.forward(x, x_mask)
x = self.proj(x) * x_mask
# Forward direction: run the posterior flows on noise conditioned on w, then the main flows,
# and return nll + log_q.
if not reverse:
flows = self.flows
assert w is not None
log_det_tot_q = 0
h_w = self.post_proj(self.post_convs.forward(self.post_pre(w), x_mask)) * x_mask
e_q = Tensor.randn(w.size(0), 2, w.size(2), dtype=x.dtype).to(device=x.device) * x_mask
z_q = e_q
for flow in self.post_flows:
z_q, log_det_q = flow.forward(z_q, x_mask, g=(x + h_w))
log_det_tot_q += log_det_q
z_u, z1 = z_q.split([1, 1], 1)
u = z_u.sigmoid() * x_mask
z0 = (w - u) * x_mask
log_det_tot_q += Tensor.sum((z_u.logsigmoid() + (-z_u).logsigmoid()) * x_mask, [1,2])
log_q = Tensor.sum(-0.5 * (math.log(2*math.pi) + (e_q**2)) * x_mask, [1,2]) - log_det_tot_q
log_det_tot = 0
z0, log_det = self.log_flow.forward(z0, x_mask)
log_det_tot += log_det
z = z0.cat(z1, 1)
for flow in flows:
z, log_det = flow.forward(z, x_mask, g=x, reverse=reverse)
log_det_tot = log_det_tot + log_det
nll = Tensor.sum(0.5 * (math.log(2*math.pi) + (z**2)) * x_mask, [1,2]) - log_det_tot
return (nll + log_q).realize() # [b]
# Reverse direction: push scaled noise through the main flows in reverse order.
flows = list(reversed(self.flows))
flows = flows[:-2] + [flows[-1]] # remove a useless vflow
z = Tensor.randn(x.shape[0], 2, x.shape[2], dtype=x.dtype).to(device=x.device) * noise_scale
for flow in flows: z = flow.forward(z, x_mask, g=x, reverse=reverse)
# Only the first of the two channels is returned.
z0, z1 = z.split([1, 1], 1)
return z0.realize()
class DurationPredictor:
  # Deterministic duration predictor: two conv + relu + layer-norm + dropout stages and a 1x1 projection to one channel.
  def __init__(self, in_channels, filter_channels, kernel_size, p_dropout, gin_channels=0):
    self.in_channels, self.filter_channels, self.kernel_size, self.p_dropout, self.gin_channels = in_channels, filter_channels, kernel_size, p_dropout, gin_channels
    self.conv_1, self.norm_1 = nn.Conv1d(in_channels, filter_channels, kernel_size, padding=kernel_size//2), LayerNorm(filter_channels)
    self.conv_2, self.norm_2 = nn.Conv1d(filter_channels, filter_channels, kernel_size, padding=kernel_size//2), LayerNorm(filter_channels)
    self.proj = nn.Conv1d(filter_channels, 1, 1)
    # Conditioning projection exists only when gin_channels != 0.
    if gin_channels != 0: self.cond = nn.Conv1d(gin_channels, in_channels, 1)

  def forward(self, x: Tensor, x_mask, g=None):
    # x and g are detached before use; x_mask is multiplied onto the activations before every conv and on the output.
    x = x.detach()
    if g is not None: x = x + self.cond(g.detach())
    x = self.conv_1(x * x_mask).relu()
    # Use LayerNorm.forward (as DDSConv does): it moves the channel axis last before normalising,
    # whereas calling the module directly normalises the last (time) axis.
    x = self.norm_1.forward(x).dropout(self.p_dropout)
    # relu() takes no argument; the previous relu(x) call could not succeed.
    x = self.conv_2(x * x_mask).relu()
    x = self.norm_2.forward(x).dropout(self.p_dropout)
    return self.proj(x * x_mask) * x_mask
class TextEncoder:
  # Embeds token ids (when n_vocab != 0), optionally adds a projected emotion embedding,
  # runs the Encoder and projects to 2 * out_channels statistics.
  def __init__(self, n_vocab, out_channels, hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout, emotion_embedding):
    self.n_vocab, self.out_channels, self.hidden_channels, self.filter_channels, self.n_heads, self.n_layers, self.kernel_size, self.p_dropout = n_vocab, out_channels, hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout
    if n_vocab!=0:self.emb = nn.Embedding(n_vocab, hidden_channels)
    # emotion_embedding is a flag here; the projection layer only exists when it is truthy.
    if emotion_embedding: self.emo_proj = nn.Linear(1024, hidden_channels)
    self.encoder = Encoder(hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout)
    self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)

  @TinyJit
  def forward(self, x: Tensor, x_lengths: Tensor, emotion_embedding=None):
    # Returns (encoder output, m, logs, x_mask), all realized; m and logs are the two halves of the projection.
    if self.n_vocab!=0: x = (self.emb(x) * math.sqrt(self.hidden_channels))
    # Compare against None instead of testing the tensor's truth value.
    if emotion_embedding is not None: x = x + self.emo_proj(emotion_embedding).unsqueeze(1)
    x = x.transpose(1, -1)  # [b, t, h] -transpose-> [b, h, t]
    x_mask = sequence_mask(x_lengths, x.shape[2]).unsqueeze(1).cast(x.dtype)
    x = self.encoder.forward(x * x_mask, x_mask)
    m, logs = (self.proj(x) * x_mask).split(self.out_channels, dim=1)
    return x.realize(), m.realize(), logs.realize(), x_mask.realize()
class ResidualCouplingBlock:
  # A stack of n_flows (ResidualCouplingLayer, Flip) pairs, applied in order or, for reverse=True, in reverse order.
  def __init__(self, channels, hidden_channels, kernel_size, dilation_rate, n_layers, n_flows=4, gin_channels=0):
    self.channels = channels
    self.hidden_channels = hidden_channels
    self.kernel_size = kernel_size
    self.dilation_rate = dilation_rate
    self.n_layers = n_layers
    self.n_flows = n_flows
    self.gin_channels = gin_channels
    self.flows = []
    for _ in range(n_flows):
      self.flows += [
        ResidualCouplingLayer(channels, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=gin_channels, mean_only=True),
        Flip(),
      ]

  @TinyJit
  def forward(self, x, x_mask, g=None, reverse=False):
    ordered = self.flows[::-1] if reverse else self.flows
    for flow in ordered:
      x = flow.forward(x, x_mask, g=g, reverse=reverse)
    return x.realize()
class PosteriorEncoder:
  # Encodes its input through a 1x1 conv, a WN stack and a 1x1 projection to 2 * out_channels
  # statistics, then samples z = m + noise * exp(logs).
  def __init__(self, in_channels, out_channels, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=0):
    self.in_channels, self.out_channels, self.hidden_channels, self.kernel_size, self.dilation_rate, self.n_layers, self.gin_channels = in_channels, out_channels, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels
    self.pre, self.proj = nn.Conv1d(in_channels, hidden_channels, 1), nn.Conv1d(hidden_channels, out_channels * 2, 1)
    self.enc = WN(hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=gin_channels)

  def forward(self, x, x_lengths, g=None):
    # Returns (z, m, logs, x_mask); the mask is built from x_lengths over axis 2 of x.
    x_mask = sequence_mask(x_lengths, x.size(2)).unsqueeze(1).cast(x.dtype)
    stats = self.proj(self.enc.forward(self.pre(x) * x_mask, x_mask, g=g)) * x_mask
    m, logs = stats.split(self.out_channels, dim=1)
    # randn takes the shape as positional args and the dtype as a keyword (same call form as Synthesizer.generate).
    z = (m + Tensor.randn(*m.shape, dtype=m.dtype) * logs.exp()) * x_mask
    return z, m, logs, x_mask
class Generator:
  # Waveform decoder: conv_pre, then for each upsample stage a transposed conv followed by the
  # average of num_kernels residual blocks, and finally conv_post + tanh.
  def __init__(self, initial_channel, resblock, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates, upsample_initial_channel, upsample_kernel_sizes, gin_channels=0):
    self.num_kernels = len(resblock_kernel_sizes)
    self.num_upsamples = len(upsample_rates)
    self.conv_pre = nn.Conv1d(initial_channel, upsample_initial_channel, 7, 1, padding=3)
    block_cls = ResBlock1 if resblock == '1' else ResBlock2
    # Each upsample stage halves the channel count.
    self.ups = []
    for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
      self.ups.append(nn.ConvTranspose1d(upsample_initial_channel//(2**i), upsample_initial_channel//(2**(i+1)), k, u, padding=(k-u)//2))
    self.resblocks = []
    self.upsample_rates = upsample_rates
    for i in range(len(self.ups)):
      ch = upsample_initial_channel // (2 ** (i + 1))
      self.resblocks.extend(block_cls(ch, k, d) for k, d in zip(resblock_kernel_sizes, resblock_dilation_sizes))
    # ch is the channel count of the last upsample stage.
    self.conv_post = nn.Conv1d(ch, 1, 7, 1, padding=3, bias=False)
    if gin_channels != 0: self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1)

  @TinyJit
  def forward(self, x: Tensor, g=None):
    x = self.conv_pre(x)
    if g is not None: x = x + self.cond(g)
    for i in range(self.num_upsamples):
      x = self.ups[i](x.leaky_relu(LRELU_SLOPE))
      first = i * self.num_kernels
      xs = sum(self.resblocks[first + j].forward(x) for j in range(self.num_kernels))
      x = (xs / self.num_kernels).realize()
    # The final activation uses leaky_relu() with its default slope, not LRELU_SLOPE.
    return self.conv_post(x.leaky_relu()).tanh().realize()
class LayerNorm(nn.LayerNorm):
  # nn.LayerNorm with elementwise affine; forward() normalises axis 1 by swapping it with the last axis around the call.
  def __init__(self, channels, eps=1e-5):
    super().__init__(channels, eps, elementwise_affine=True)

  def forward(self, x: Tensor):
    swapped = x.transpose(1, -1)
    normed = self.__call__(swapped)
    return normed.transpose(1, -1)
class WN:
  # Stack of n_layers dilated convs with gated activations; every layer but the last feeds a
  # residual back into x, and all layers add a skip contribution to the output.
  def __init__(self, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=0, p_dropout=0):
    assert (kernel_size % 2 == 1)
    self.hidden_channels = hidden_channels
    self.kernel_size = kernel_size
    self.dilation_rate = dilation_rate
    self.n_layers = n_layers
    self.gin_channels = gin_channels
    self.p_dropout = p_dropout
    self.in_layers, self.res_skip_layers = [], []
    # One conditioning conv produces the 2 * hidden_channels slices for all layers at once.
    if gin_channels != 0: self.cond_layer = nn.Conv1d(gin_channels, 2 * hidden_channels * n_layers, 1)
    for i in range(n_layers):
      dilation = dilation_rate ** i
      pad = int((kernel_size * dilation - dilation) / 2)
      self.in_layers.append(nn.Conv1d(hidden_channels, 2 * hidden_channels, kernel_size, dilation=dilation, padding=pad))
      # The last layer only produces the skip half.
      res_skip_out = 2 * hidden_channels if i < n_layers - 1 else hidden_channels
      self.res_skip_layers.append(nn.Conv1d(hidden_channels, res_skip_out, 1))

  def forward(self, x, x_mask, g=None, **kwargs):
    hc = self.hidden_channels
    output = Tensor.zeros_like(x)
    if g is not None: g = self.cond_layer(g)
    for i, (in_layer, res_skip_layer) in enumerate(zip(self.in_layers, self.res_skip_layers)):
      x_in = in_layer(x)
      # This layer's slice of the conditioning, or zeros when unconditioned.
      g_l = g[:, i * 2 * hc:i * 2 * hc + 2 * hc, :] if g is not None else Tensor.zeros_like(x_in)
      acts = fused_add_tanh_sigmoid_multiply(x_in, g_l, hc)
      res_skip_acts = res_skip_layer(acts)
      if i == self.n_layers - 1:
        output = output + res_skip_acts
      else:
        x = (x + res_skip_acts[:, :hc, :]) * x_mask
        output = output + res_skip_acts[:, hc:, :]
    return output * x_mask
class ResBlock1:
  # Three residual steps, each a dilated conv followed by a dilation-1 conv, with leaky_relu before both.
  def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5)):
    self.convs1 = []
    for i in range(3):
      self.convs1.append(nn.Conv1d(channels, channels, kernel_size, 1, dilation=dilation[i], padding=get_padding(kernel_size, dilation[i])))
    self.convs2 = []
    for _ in range(3):
      self.convs2.append(nn.Conv1d(channels, channels, kernel_size, 1, dilation=1, padding=get_padding(kernel_size, 1)))

  def forward(self, x: Tensor, x_mask=None):
    # The mask, when given, is multiplied in before each conv and on the result.
    masked = (lambda t: t) if x_mask is None else (lambda t: t * x_mask)
    for dilated_conv, plain_conv in zip(self.convs1, self.convs2):
      hidden = dilated_conv(masked(x.leaky_relu(LRELU_SLOPE))).leaky_relu(LRELU_SLOPE)
      x = plain_conv(masked(hidden)) + x
    return masked(x)
class ResBlock2:
  """Residual block with two dilated convolutions, each preceded by leaky-ReLU and added back to its input."""
  def __init__(self, channels, kernel_size=3, dilation=(1, 3)):
    # Only the first two dilation entries are read.
    rates = (dilation[0], dilation[1])
    self.convs = [nn.Conv1d(channels, channels, kernel_size, 1, dilation=rate, padding=get_padding(kernel_size, rate)) for rate in rates]
  def forward(self, x, x_mask=None):
    """Apply both residual convolutions; x_mask (if given) is multiplied in before each conv and on the result."""
    for conv in self.convs:
      activated = x.leaky_relu(LRELU_SLOPE)
      if x_mask is not None: activated = activated * x_mask
      x = conv(activated) + x
    if x_mask is None: return x
    return x * x_mask
class DDSConv: # Dilated and Depth-Separable Convolution
  """Stack of n_layers residual units: grouped dilated conv -> LayerNorm -> GELU -> 1x1 conv -> LayerNorm -> GELU -> dropout."""
  def __init__(self, channels, kernel_size, n_layers, p_dropout=0.):
    self.channels = channels
    self.kernel_size = kernel_size
    self.n_layers = n_layers
    self.p_dropout = p_dropout
    self.convs_sep, self.convs_1x1, self.norms_1, self.norms_2 = [], [], [], []
    for layer_idx in range(n_layers):
      # dilation grows as kernel_size ** layer index; padding is derived from it
      dilation = kernel_size ** layer_idx
      padding = (kernel_size * dilation - dilation) // 2
      self.convs_sep.append(nn.Conv1d(channels, channels, kernel_size, groups=channels, dilation=dilation, padding=padding))
      self.convs_1x1.append(nn.Conv1d(channels, channels, 1))
      self.norms_1.append(LayerNorm(channels))
      self.norms_2.append(LayerNorm(channels))
  def forward(self, x, x_mask, g=None):
    """Add the optional conditioning g to x, run every residual unit and return the masked result."""
    if g is not None:
      x = x + g
    for layer_idx in range(self.n_layers):
      y = self.norms_1[layer_idx].forward(self.convs_sep[layer_idx](x * x_mask)).gelu()
      y = self.norms_2[layer_idx].forward(self.convs_1x1[layer_idx](y)).gelu()
      x = x + y.dropout(self.p_dropout)
    return x * x_mask
class ConvFlow:
  """Coupling flow: the first half of the channels parameterises a rational-quadratic spline applied to the second half."""
  def __init__(self, in_channels, filter_channels, kernel_size, n_layers, num_bins=10, tail_bound=5.0):
    self.in_channels = in_channels
    self.filter_channels = filter_channels
    self.kernel_size = kernel_size
    self.n_layers = n_layers
    self.num_bins = num_bins
    self.tail_bound = tail_bound
    self.half_channels = in_channels // 2
    self.pre = nn.Conv1d(self.half_channels, filter_channels, 1)
    self.convs = DDSConv(filter_channels, kernel_size, n_layers, p_dropout=0.)
    # per half-channel: num_bins widths + num_bins heights + (num_bins - 1) derivatives
    self.proj = nn.Conv1d(filter_channels, self.half_channels * (num_bins * 3 - 1), 1)
  def forward(self, x, x_mask, g=None, reverse=False):
    """Transform x; returns x when reverse is set, otherwise (x, summed masked log-abs-det)."""
    x0, x1 = x.split([self.half_channels] * 2, 1)
    params = self.proj(self.convs.forward(self.pre(x0), x_mask, g=g)) * x_mask
    b, c, t = x0.shape
    params = params.reshape(b, c, -1, t).permute(0, 1, 3, 2) # [b, cx?, t] -> [b, c, t, ?]
    scale = math.sqrt(self.filter_channels)
    widths = params[..., :self.num_bins] / scale
    heights = params[..., self.num_bins:2*self.num_bins] / scale
    derivatives = params[..., 2 * self.num_bins:]
    x1, log_abs_det = piecewise_rational_quadratic_transform(x1, widths, heights, derivatives, inverse=reverse, tails='linear', tail_bound=self.tail_bound)
    x = x0.cat(x1, dim=1) * x_mask
    if reverse:
      return x
    return x, Tensor.sum(log_abs_det * x_mask, [1,2])
class ResidualCouplingLayer:
  """Affine coupling layer: a WN encoder over the first channel half predicts mean (and log-scale) for the second half."""
  def __init__(self, channels, hidden_channels, kernel_size, dilation_rate, n_layers, p_dropout=0, gin_channels=0, mean_only=False):
    assert channels % 2 == 0, "channels should be divisible by 2"
    self.channels = channels
    self.hidden_channels = hidden_channels
    self.kernel_size = kernel_size
    self.dilation_rate = dilation_rate
    self.n_layers = n_layers
    self.mean_only = mean_only
    self.half_channels = channels // 2
    self.pre = nn.Conv1d(self.half_channels, hidden_channels, 1)
    self.enc = WN(hidden_channels, kernel_size, dilation_rate, n_layers, p_dropout=p_dropout, gin_channels=gin_channels)
    # with mean_only the projection emits only the mean, otherwise mean and log-scale
    self.post = nn.Conv1d(hidden_channels, self.half_channels * (2 - mean_only), 1)
  def forward(self, x, x_mask, g=None, reverse=False):
    """Apply the coupling (or its inverse when reverse is set) and return the re-concatenated tensor."""
    x0, x1 = x.split([self.half_channels] * 2, 1)
    encoded = self.enc.forward(self.pre(x0) * x_mask, x_mask, g=g)
    stats = self.post(encoded) * x_mask
    if self.mean_only:
      m, logs = stats, Tensor.zeros_like(stats)
    else:
      m, logs = stats.split([self.half_channels] * 2, 1)
    if reverse:
      transformed = (x1 - m) * (-logs).exp() * x_mask
    else:
      transformed = m + x1 * logs.exp() * x_mask
    return x0.cat(transformed, dim=1)
class Log:
  """Flow step taking a clamped logarithm in the forward direction and an exponential in reverse."""
  def forward(self, x : Tensor, x_mask, reverse=False):
    """Return exp(x) * x_mask when reverse is set, else (y, logdet) with y = log(max(x, 1e-5)) * x_mask."""
    if reverse:
      return x.exp() * x_mask
    y = x.maximum(1e-5).log() * x_mask
    return y, (-y).sum([1, 2])
class Flip:
  """Flow step that reverses dimension 1 of its input."""
  def forward(self, x: Tensor, *args, reverse=False, **kwargs):
    """Return the flipped tensor in reverse mode, else (flipped tensor, zero tensor of length x.shape[0])."""
    flipped = x.flip([1])
    if reverse:
      return flipped
    return flipped, Tensor.zeros(x.shape[0], dtype=x.dtype).to(device=x.device)
class ElementwiseAffine:
  """Per-channel affine flow step with shift m and log-scale logs, both initialised to zeros of shape (channels, 1)."""
  def __init__(self, channels):
    self.m = Tensor.zeros(channels, 1)
    self.logs = Tensor.zeros(channels, 1)
  def forward(self, x, x_mask, reverse=False, **kwargs): # x if reverse else y, logdet
    """Return the inverted input in reverse mode, else (transformed input, summed masked log-scale)."""
    if reverse:
      return (x - self.m) * Tensor.exp(-self.logs) * x_mask
    y = (self.m + Tensor.exp(self.logs) * x) * x_mask
    return y, Tensor.sum(self.logs * x_mask, [1, 2])
class MultiHeadAttention:
  """Multi-head attention over [b, d, t] tensors with optional windowed relative-position embeddings."""
  def __init__(self, channels, out_channels, n_heads, p_dropout=0., window_size=None, heads_share=True, block_length=None, proximal_bias=False, proximal_init=False):
    assert channels % n_heads == 0
    self.channels, self.out_channels, self.n_heads = channels, out_channels, n_heads
    self.p_dropout, self.window_size, self.heads_share = p_dropout, window_size, heads_share
    self.block_length, self.proximal_bias, self.proximal_init = block_length, proximal_bias, proximal_init
    self.attn = None
    self.k_channels = channels // n_heads
    self.conv_q = nn.Conv1d(channels, channels, 1)
    self.conv_k = nn.Conv1d(channels, channels, 1)
    self.conv_v = nn.Conv1d(channels, channels, 1)
    self.conv_o = nn.Conv1d(channels, out_channels, 1)
    if window_size is not None:
      # one embedding table shared by all heads, or one table per head
      rel_heads = 1 if heads_share else n_heads
      self.emb_rel_k = Tensor.randn(rel_heads, window_size * 2 + 1, self.k_channels) * (self.k_channels ** -0.5)
      self.emb_rel_v = Tensor.randn(rel_heads, window_size * 2 + 1, self.k_channels) * (self.k_channels ** -0.5)
  def forward(self, x, c, attn_mask=None):
    """Attend from x (queries) to c (keys/values); the attention weights are kept in self.attn."""
    q = self.conv_q(x)
    k = self.conv_k(c)
    v = self.conv_v(c)
    attended, self.attn = self.attention(q, k, v, mask=attn_mask)
    return self.conv_o(attended)
  def attention(self, query: Tensor, key: Tensor, value: Tensor, mask=None):# reshape [b, d, t] -> [b, n_h, t, d_k]
    """Scaled dot-product attention; returns (output [b, d, t_t], attention weights [b, n_h, t_t, t_s])."""
    b, d, t_s = key.shape[0], key.shape[1], key.shape[2]
    t_t = query.shape[2]
    def split_heads(t, length): return t.reshape(b, self.n_heads, self.k_channels, length).transpose(2, 3)
    query, key, value = split_heads(query, t_t), split_heads(key, t_s), split_heads(value, t_s)
    scaled_query = query / math.sqrt(self.k_channels)
    scores = scaled_query @ key.transpose(-2, -1)
    relative = self.window_size is not None
    if relative:
      assert t_s == t_t, "Relative attention is only available for self-attention."
      rel_keys = self._get_relative_embeddings(self.emb_rel_k, t_s)
      rel_logits = self._matmul_with_relative_keys(scaled_query, rel_keys)
      scores = scores + self._relative_position_to_absolute_position(rel_logits)
    if mask is not None:
      scores = Tensor.where(mask, scores, -1e4)
      if self.block_length is not None:
        assert t_s == t_t, "Local attention is only available for self-attention."
        band = Tensor.ones_like(scores).triu(-self.block_length).tril(self.block_length)
        scores = Tensor.where(band, scores, -1e4)
    p_attn = scores.softmax(axis=-1) # [b, n_h, t_t, t_s]
    output = p_attn.matmul(value)
    if relative:
      rel_weights = self._absolute_position_to_relative_position(p_attn)
      rel_values = self._get_relative_embeddings(self.emb_rel_v, t_s)
      output = output + self._matmul_with_relative_values(rel_weights, rel_values)
    output = output.transpose(2, 3).contiguous().reshape(b, d, t_t) # [b, n_h, t_t, d_k] -> [b, d, t_t]
    return output, p_attn
  def _matmul_with_relative_values(self, x, y):
    # x: [b, h, l, m], y: [h or 1, m, d], ret: [b, h, l, d]
    return x.matmul(y.unsqueeze(0))
  def _matmul_with_relative_keys(self, x, y):
    # x: [b, h, l, d], y: [h or 1, m, d], ret: [b, h, l, m]
    return x.matmul(y.unsqueeze(0).transpose(-2, -1))
  def _get_relative_embeddings(self, relative_embeddings, length):
    """Pad (when length exceeds the window) and slice the table down to 2*length-1 relative positions."""
    overflow = length - (self.window_size + 1)
    pad_length, slice_start = max(overflow, 0), max(-overflow, 0)
    if pad_length <= 0:
      padded = relative_embeddings
    else:
      padded = relative_embeddings.pad(convert_pad_shape([[0, 0], [pad_length, pad_length], [0, 0]]))
    return padded[:, slice_start:(slice_start + 2 * length - 1)] #used_relative_embeddings
  def _relative_position_to_absolute_position(self, x: Tensor): # x: [b, h, l, 2*l-1] -> [b, h, l, l]
    batch, heads, length, _ = x.shape
    # pad one column, flatten, pad length-1 more, then re-view so the rows shift into absolute positions
    widened = x.pad(convert_pad_shape([[0,0],[0,0],[0,0],[0,1]]))
    flat = widened.reshape([batch, heads, length * 2 * length])
    flat = flat.pad(convert_pad_shape([[0,0],[0,0],[0,length-1]]))
    return flat.reshape([batch, heads, length+1, 2*length-1])[:, :, :length, length-1:]
  def _absolute_position_to_relative_position(self, x: Tensor): # x: [b, h, l, l] -> [b, h, l, 2*l-1]
    batch, heads, length, _ = x.shape
    widened = x.pad(convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, length-1]]))
    flat = widened.reshape([batch, heads, length**2 + length*(length -1)])
    flat = flat.pad(convert_pad_shape([[0, 0], [0, 0], [length, 0]]))
    return flat.reshape([batch, heads, length, 2*length])[:,:,:,1:]
class FFN:
  """Two-convolution feed-forward block with explicit same/causal padding and relu or sigmoid-approximated gelu."""
  def __init__(self, in_channels, out_channels, filter_channels, kernel_size, p_dropout=0., activation=None, causal=False):
    self.in_channels, self.out_channels, self.filter_channels = in_channels, out_channels, filter_channels
    self.kernel_size, self.p_dropout, self.activation, self.causal = kernel_size, p_dropout, activation, causal
    self.padding = self._causal_padding if causal else self._same_padding
    self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size)
    self.conv_2 = nn.Conv1d(filter_channels, out_channels, kernel_size)
  def forward(self, x, x_mask):
    """Mask, pad and convolve twice with activation and dropout in between; the result is masked again."""
    x = self.conv_1(self.padding(x * x_mask))
    if self.activation == "gelu":
      x = x * (1.702 * x).sigmoid()
    else:
      x = x.relu()
    x = x.dropout(self.p_dropout) * x_mask
    return self.conv_2(self.padding(x)) * x_mask
  def _causal_padding(self, x):
    # all kernel_size - 1 padding goes on the left of the last dimension
    if self.kernel_size == 1: return x
    return x.pad(convert_pad_shape([[0, 0], [0, 0], [self.kernel_size - 1, 0]]))
  def _same_padding(self, x):
    # padding is split between both sides of the last dimension
    if self.kernel_size == 1: return x
    left, right = (self.kernel_size - 1) // 2, self.kernel_size // 2
    return x.pad(convert_pad_shape([[0, 0], [0, 0], [left, right]]))
class Encoder:
  """Stack of n_layers blocks: self-attention -> add & LayerNorm -> FFN -> add & LayerNorm."""
  def __init__(self, hidden_channels, filter_channels, n_heads, n_layers, kernel_size=1, p_dropout=0., window_size=4, **kwargs):
    self.hidden_channels, self.filter_channels, self.n_heads = hidden_channels, filter_channels, n_heads
    self.n_layers, self.kernel_size, self.p_dropout, self.window_size = n_layers, kernel_size, p_dropout, window_size
    self.attn_layers, self.norm_layers_1, self.ffn_layers, self.norm_layers_2 = [], [], [], []
    for _ in range(n_layers):
      self.attn_layers.append(MultiHeadAttention(hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout, window_size=window_size))
      self.norm_layers_1.append(LayerNorm(hidden_channels))
      self.ffn_layers.append(FFN(hidden_channels, hidden_channels, filter_channels, kernel_size, p_dropout=p_dropout))
      self.norm_layers_2.append(LayerNorm(hidden_channels))
  def forward(self, x, x_mask):
    """Encode x; the attention mask is the outer product of x_mask with itself."""
    attn_mask = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1)
    x = x * x_mask
    for layer_idx in range(self.n_layers):
      attended = self.attn_layers[layer_idx].forward(x, x, attn_mask).dropout(self.p_dropout)
      x = self.norm_layers_1[layer_idx].forward(x + attended)
      fed_forward = self.ffn_layers[layer_idx].forward(x, x_mask).dropout(self.p_dropout)
      x = self.norm_layers_2[layer_idx].forward(x + fed_forward)
    return x * x_mask
# Lower bounds used as defaults by the rational-quadratic spline functions below.
DEFAULT_MIN_BIN_WIDTH = 1e-3
DEFAULT_MIN_BIN_HEIGHT = 1e-3
DEFAULT_MIN_DERIVATIVE = 1e-3
def piecewise_rational_quadratic_transform(inputs, un_normalized_widths, un_normalized_heights, un_normalized_derivatives, inverse=False, tails=None, tail_bound=1., min_bin_width=DEFAULT_MIN_BIN_WIDTH, min_bin_height=DEFAULT_MIN_BIN_HEIGHT, min_derivative=DEFAULT_MIN_DERIVATIVE):
  """Dispatch to the constrained or unconstrained rational-quadratic spline.

  With tails=None the plain rational_quadratic_spline is used and tail_bound is
  ignored; otherwise unconstrained_rational_quadratic_spline is called with the
  given tails and tail_bound. Returns whatever the selected spline returns
  (a pair of outputs and log-abs-determinant).
  """
  shared = dict(inverse=inverse, min_bin_width=min_bin_width, min_bin_height=min_bin_height, min_derivative=min_derivative)
  if tails is None:
    # rational_quadratic_spline spells its parameters `unnormalized_*` (no underscore after "un"),
    # so the keywords have to be translated instead of being forwarded verbatim.
    return rational_quadratic_spline(inputs=inputs, unnormalized_widths=un_normalized_widths, unnormalized_heights=un_normalized_heights,
                                     unnormalized_derivatives=un_normalized_derivatives, **shared)
  return unconstrained_rational_quadratic_spline(inputs=inputs, un_normalized_widths=un_normalized_widths, un_normalized_heights=un_normalized_heights,
                                                 un_normalized_derivatives=un_normalized_derivatives, tails=tails, tail_bound=tail_bound, **shared)
def unconstrained_rational_quadratic_spline(inputs, un_normalized_widths, un_normalized_heights, un_normalized_derivatives, inverse=False, tails='linear', tail_bound=1., min_bin_width=DEFAULT_MIN_BIN_WIDTH, min_bin_height=DEFAULT_MIN_BIN_HEIGHT, min_derivative=DEFAULT_MIN_DERIVATIVE):
  """Evaluate the spline on [-tail_bound, tail_bound] with boundary derivatives padded for linear tails.

  Only tails='linear' is supported. Every tensor has its two leading dimensions
  squeezed before the spline call and the results get them unsqueezed again.
  """
  if tails != 'linear':
    raise RuntimeError('{} tails are not implemented.'.format(tails))
  # value appended on both ends of the derivative parameters (softplus-inverse of 1 - min_derivative)
  edge_value = np.log(np.exp(1 - min_derivative) - 1).item()
  padded_derivatives = cat_lr(un_normalized_derivatives, edge_value, edge_value)
  def drop_leading(t): return t.squeeze(dim=0).squeeze(dim=0)
  def restore_leading(t): return t.unsqueeze(dim=0).unsqueeze(dim=0)
  output, log_abs_det = rational_quadratic_spline(
    inputs=drop_leading(inputs), unnormalized_widths=drop_leading(un_normalized_widths), unnormalized_heights=drop_leading(un_normalized_heights),
    unnormalized_derivatives=drop_leading(padded_derivatives), inverse=inverse, left=-tail_bound, right=tail_bound, bottom=-tail_bound, top=tail_bound,
    min_bin_width=min_bin_width, min_bin_height=min_bin_height, min_derivative=min_derivative)
  return restore_leading(output), restore_leading(log_abs_det)
def rational_quadratic_spline(inputs: Tensor, unnormalized_widths: Tensor, unnormalized_heights: Tensor, unnormalized_derivatives: Tensor, inverse=False, left=0., right=1., bottom=0., top=1., min_bin_width=DEFAULT_MIN_BIN_WIDTH, min_bin_height=DEFAULT_MIN_BIN_HEIGHT, min_derivative=DEFAULT_MIN_DERIVATIVE):
  """Monotonic rational-quadratic spline on [left, right] x [bottom, top].

  The last axis of the width/height tensors indexes the bins. Returns
  (outputs, log_abs_det): the forward transform, or the inverse transform with
  negated log-determinant when inverse is set.
  Raises ValueError when the minimum bin width/height cannot fit num_bins bins.
  """
  num_bins = unnormalized_widths.shape[-1]
  if min_bin_width * num_bins > 1.0: raise ValueError('Minimal bin width too large for the number of bins')
  if min_bin_height * num_bins > 1.0: raise ValueError('Minimal bin height too large for the number of bins')
  # Bin edges: cumulative sums run over the bin axis. That is the LAST axis (like the softmax and the
  # slicing around it), so use axis=-1 rather than a hard-coded axis=1 which only matches 2-D inputs.
  widths = min_bin_width + (1 - min_bin_width * num_bins) * unnormalized_widths.softmax(axis=-1)
  cum_widths = cat_lr(((right - left) * widths[..., :-1].cumsum(axis=-1) + left), left, right + 1e-6 if not inverse else right)
  widths = cum_widths[..., 1:] - cum_widths[..., :-1]
  derivatives = min_derivative + (unnormalized_derivatives.exp()+1).log()
  heights = min_bin_height + (1 - min_bin_height * num_bins) * unnormalized_heights.softmax(axis=-1)
  cum_heights = cat_lr(((top - bottom) * heights[..., :-1].cumsum(axis=-1) + bottom), bottom, top + 1e-6 if inverse else top)
  heights = cum_heights[..., 1:] - cum_heights[..., :-1]
  # Bin lookup: count how many edges each input has passed (height edges when inverting).
  bin_idx = ((inputs[..., None] >= (cum_heights if inverse else cum_widths)).sum(axis=-1) - 1)[..., None]
  input_cum_widths = gather(cum_widths, bin_idx, axis=-1)[..., 0]
  input_bin_widths = gather(widths, bin_idx, axis=-1)[..., 0]
  input_cum_heights = gather(cum_heights, bin_idx, axis=-1)[..., 0]
  input_delta = gather(heights / widths, bin_idx, axis=-1)[..., 0]
  input_derivatives = gather(derivatives, bin_idx, axis=-1)[..., 0]
  input_derivatives_plus_one = gather(derivatives[..., 1:], bin_idx, axis=-1)[..., 0]
  input_heights = gather(heights, bin_idx, axis=-1)[..., 0]
  if inverse:
    # Solve the quadratic a*theta^2 + b*theta + c = 0 for the position inside the bin.
    a = ((inputs - input_cum_heights) * (input_derivatives + input_derivatives_plus_one - 2 * input_delta) + input_heights * (input_delta - input_derivatives))
    b = (input_heights * input_derivatives - (inputs - input_cum_heights) * (input_derivatives + input_derivatives_plus_one - 2 * input_delta))
    c = - input_delta * (inputs - input_cum_heights)
    discriminant = b.square() - 4 * a * c
    # assert (discriminant.numpy() >= 0).all()
    root = (2 * c) / (-b - discriminant.sqrt())
    theta_one_minus_theta = root * (1 - root)
    denominator = input_delta + ((input_derivatives + input_derivatives_plus_one - 2 * input_delta) * theta_one_minus_theta)
    derivative_numerator = input_delta.square() * (input_derivatives_plus_one * root.square() + 2 * input_delta * theta_one_minus_theta + input_derivatives * (1 - root).square())
    return root * input_bin_widths + input_cum_widths, -(derivative_numerator.log() - 2 * denominator.log())
  theta = (inputs - input_cum_widths) / input_bin_widths
  theta_one_minus_theta = theta * (1 - theta)
  numerator = input_heights * (input_delta * theta.pow(2) + input_derivatives * theta_one_minus_theta)
  denominator = input_delta + ((input_derivatives + input_derivatives_plus_one - 2 * input_delta) * theta_one_minus_theta)
  derivative_numerator = input_delta.pow(2) * (input_derivatives_plus_one * theta.pow(2) + 2 * input_delta * theta_one_minus_theta + input_derivatives * (1 - theta).pow(2))
  return input_cum_heights + numerator / denominator, derivative_numerator.log() - 2 * denominator.log()
def sequence_mask(length: Tensor, max_length):
  """Return a boolean mask whose row i is True at positions smaller than length[i]."""
  positions = Tensor.arange(max_length, dtype=length.dtype, device=length.device)
  return positions.unsqueeze(0) < length.unsqueeze(1)
def generate_path(duration: Tensor, mask: Tensor): # duration: [b, 1, t_x], mask: [b, 1, t_y, t_x]
  """Expand per-token durations into a masked alignment path of shape [b, 1, t_y, t_x]."""
  b, _, t_y, t_x = mask.shape
  cumulative = duration.cumsum(axis=2).reshape(b * t_x)
  path = sequence_mask(cumulative, t_y).cast(mask.dtype).reshape(b, t_x, t_y)
  # subtracting the mask shifted by one token keeps only the frames each token newly covers
  shifted = path.pad(convert_pad_shape([[0, 0], [1, 0], [0, 0]]))[:, :-1]
  return (path - shifted).unsqueeze(1).transpose(2, 3) * mask
def fused_add_tanh_sigmoid_multiply(input_a: Tensor, input_b: Tensor, n_channels: int):
  """Gated activation: tanh of the first n_channels of (a + b) times sigmoid of the remaining channels."""
  summed = input_a + input_b
  tanh_part = summed[:, :n_channels, :].tanh()
  sigmoid_part = summed[:, n_channels:, :].sigmoid()
  return tanh_part * sigmoid_part
def cat_lr(t, left, right):
  """Concatenate one constant column of `left` before and one of `right` after t along the last axis."""
  edge_shape = get_shape(t)
  left_col = Tensor.full(edge_shape, left)
  right_col = Tensor.full(edge_shape, right)
  return left_col.cat(t, dim=-1).cat(right_col, dim=-1)
def get_shape(tensor):
  """Return tensor.shape as a tuple with its last entry replaced by 1."""
  dims = list(tensor.shape)
  dims[-1] = 1
  return tuple(dims)
def convert_pad_shape(pad_shape):
  """Turn a nested sequence of pad pairs into a tuple of tuples."""
  return tuple(map(tuple, pad_shape))
def get_padding(kernel_size, dilation=1):
  """Return int((kernel_size * dilation - dilation) / 2), the padding used by the dilated convolutions."""
  span = kernel_size*dilation - dilation
  return int(span/2)
def gather(x, indices, axis):
  """Pick entries of x along `axis` at `indices` by comparing against an arange and summing the matches.

  Negative indices are wrapped by adding x.shape[axis].
  """
  wrapped = (indices < 0).where(indices + x.shape[axis], indices).transpose(0, axis)
  # swap `axis` to the front, then rotate it to the end so the gathered axis is last
  order = list(range(x.ndim))
  order[0], order[axis] = order[axis], order[0]
  order = order[1:] + order[:1]
  source = x.permute(*order)
  size = source.shape[-1]
  positions = Tensor.arange(size).reshape(*([1] * source.ndim + [size])).expand(*wrapped.shape, size)
  one_hot = wrapped.unsqueeze(wrapped.ndim).expand(*wrapped.shape, size) == positions
  return (one_hot * source).sum(wrapped.ndim).transpose(0, axis)
def norm_except_dim(v, dim):
  """L2 norm of the numpy array v over every axis except `dim`.

  dim == -1 returns the norm of the whole array. Otherwise the result has the
  same number of dimensions as v, with size v.shape[dim] at position `dim` and
  size 1 everywhere else, so it broadcasts against v.
  """
  if dim == -1: return np.linalg.norm(v)
  if dim == 0:
    (output_shape := [1] * v.ndim)[0] = v.shape[0]
    return np.linalg.norm(v.reshape(v.shape[0], -1), axis=1).reshape(output_shape)
  if dim == v.ndim - 1:
    (output_shape := [1] * v.ndim)[-1] = v.shape[-1]
    return np.linalg.norm(v.reshape(-1, v.shape[-1]), axis=0).reshape(output_shape)
  # Generic case: bring `dim` to the front, reduce the rest, then place the result directly at
  # position `dim`. (Re-applying the forward permutation to undo it is only correct when that
  # permutation is its own inverse, i.e. for dim == 1.)
  (output_shape := [1] * v.ndim)[dim] = v.shape[dim]
  return np.linalg.norm(np.moveaxis(v, dim, 0).reshape(v.shape[dim], -1), axis=1).reshape(output_shape)
def weight_norm(v: Tensor, g: Tensor, dim):
  """Rebuild a weight tensor from direction v and magnitude g: v * (g / norm of v except `dim`)."""
  direction, magnitude = v.numpy(), g.numpy()
  scale = magnitude / norm_except_dim(direction, dim)
  return Tensor(direction * scale)
# HPARAMS LOADING
def get_hparams_from_file(path):
  """Read the JSON config at `path` and wrap its top-level keys in an HParams object."""
  # JSON is UTF-8; relying on the platform default encoding breaks configs that
  # contain non-ASCII text on systems whose locale is not UTF-8.
  with open(path, "r", encoding="utf-8") as f:
    data = f.read()
  return HParams(**json.loads(data))
class HParams:
  """Attribute bag with a dict-like interface; nested plain dicts are wrapped recursively."""
  def __init__(self, **kwargs):
    for name, value in kwargs.items():
      # only exact dict instances are wrapped, everything else is stored as-is
      self[name] = HParams(**value) if type(value) == dict else value
  def keys(self):
    return self.__dict__.keys()
  def items(self):
    return self.__dict__.items()
  def values(self):
    return self.__dict__.values()
  def __len__(self):
    return len(self.__dict__)
  def __getitem__(self, key):
    return getattr(self, key)
  def __setitem__(self, key, value):
    return setattr(self, key, value)
  def __contains__(self, key):
    return key in self.__dict__
  def __repr__(self):
    return self.__dict__.__repr__()
# MODEL LOADING
def load_model(symbols, hps, model) -> Synthesizer:
  """Build a Synthesizer from the hyperparameters and load the checkpoint fetched from model[1] into it."""
  spec_channels = hps.data.filter_length // 2 + 1
  segment_size = hps.train.segment_size // hps.data.hop_length
  net_g = Synthesizer(len(symbols), spec_channels, segment_size, n_speakers = hps.data.n_speakers, **hps.model)
  load_checkpoint(fetch(model[1]), net_g, None)
  return net_g
def load_checkpoint(checkpoint_path, model: Synthesizer, optimizer=None, skip_list=[]):
  """Copy the weights stored in a checkpoint file into `model`.

  checkpoint_path must point to an existing file readable by torch_load. The
  checkpoint dict is expected to hold 'iteration', 'learning_rate', 'model' and
  (when an optimizer is passed) 'optimizer'. State-dict keys containing any
  substring from skip_list are ignored. Shape mismatches are logged, not raised.

  Returns (model, optimizer, learning_rate, iteration).
  """
  assert Path(checkpoint_path).is_file()
  start_time = time.time()
  checkpoint_dict = torch_load(checkpoint_path)
  iteration, learning_rate = checkpoint_dict['iteration'], checkpoint_dict['learning_rate']
  if optimizer: optimizer.load_state_dict(checkpoint_dict['optimizer'])
  saved_state_dict = checkpoint_dict['model']
  # weight_g / weight_v arrive as two separate keys; they are held here until both halves of a
  # pair have been seen, then combined into the parent's `weight`.
  weight_g, weight_v, parent = None, None, None
  for key, v in saved_state_dict.items():
    if any(layer in key for layer in skip_list): continue
    try:
      # Walk the dotted key from the model root down to the target attribute.
      obj, skip = model, False
      for k in key.split('.'):
        if k.isnumeric(): obj = obj[int(k)]
        elif isinstance(obj, dict): obj = obj[k]
        else:
          # checkpoints name LayerNorm parameters gamma/beta; the model uses weight/bias
          if isinstance(obj, (LayerNorm, nn.LayerNorm)) and k in ["gamma", "beta"]:
            k = "weight" if k == "gamma" else "bias"
          elif k in ["weight_g", "weight_v"]:
            # remember the owning module and postpone assignment until the pair is complete
            parent, skip = obj, True
            if k == "weight_g": weight_g = v
            else: weight_v = v
          if not skip: obj = getattr(obj, k)
      if weight_g is not None and weight_v is not None:
        # both halves collected: keep the raw arrays on the module and target parent.weight
        # with the recombined tensor (norm taken over every axis except 0)
        setattr(obj, "weight_g", weight_g.numpy())
        setattr(obj, "weight_v", weight_v.numpy())
        obj, v = getattr(parent, "weight"), weight_norm(weight_v, weight_g, 0)
        weight_g, weight_v, parent, skip = None, None, None, False
      if not skip and obj.shape == v.shape: obj.assign(v.to(obj.device))
      elif not skip: logging.error(f"MISMATCH SHAPE IN {key}, {obj.shape} {v.shape}")
    except Exception as e: raise e
  logging.info(f"Loaded checkpoint '{checkpoint_path}' (iteration {iteration}) in {time.time() - start_time:.4f}s")
  return model, optimizer, learning_rate, iteration
# Used for cleaning input text and mapping to symbols
class TextMapper: # Based on https://github.com/keithito/tacotron
  """Cleans input text with the named cleaner methods and maps the resulting characters to symbol ids."""
  def __init__(self, symbols, apply_cleaners=True):
    self.apply_cleaners, self.symbols, self._inflect = apply_cleaners, symbols, None
    self._symbol_to_id = {s: i for i, s in enumerate(symbols)}
    # kept on the instance (it used to be computed and thrown away as a local)
    self._id_to_symbol = {i: s for i, s in enumerate(symbols)}
    self._whitespace_re = re.compile(r'\s+')
    self._abbreviations = [(re.compile('\\b%s\\.' % x[0], re.IGNORECASE), x[1]) for x in [('mrs', 'misess'), ('mr', 'mister'), ('dr', 'doctor'), ('st', 'saint'), ('co', 'company'), ('jr', 'junior'), ('maj', 'major'), ('gen', 'general'), ('drs', 'doctors'), ('rev', 'reverend'), ('lt', 'lieutenant'), ('hon', 'honorable'), ('sgt', 'sergeant'), ('capt', 'captain'), ('esq', 'esquire'), ('ltd', 'limited'), ('col', 'colonel'), ('ft', 'fort'), ]]
    self.phonemizer = EspeakBackend(
      language="en-us", punctuation_marks=Punctuation.default_marks(), preserve_punctuation=True, with_stress=True,
    )
  def text_to_sequence(self, text, cleaner_names):
    """Run the named cleaners (or just strip when cleaning is disabled) and return the list of symbol ids.

    Raises ModuleNotFoundError for a cleaner name that is not a method of this class and
    KeyError for a character that is not in the symbol table.
    """
    if self.apply_cleaners:
      for name in cleaner_names:
        # default of None so that an unknown name reaches the explicit error below
        cleaner = getattr(self, name, None)
        if not cleaner: raise ModuleNotFoundError('Unknown cleaner: %s' % name)
        text = cleaner(text)
    else: text = text.strip()
    return [self._symbol_to_id[symbol] for symbol in text]
  def get_text(self, text, add_blank=False, cleaners=('english_cleaners2',)):
    """Return the int64 Tensor of symbol ids for text, optionally with id 0 interspersed between symbols."""
    text_norm = self.text_to_sequence(text, cleaners)
    return Tensor(self.intersperse(text_norm, 0) if add_blank else text_norm, dtype=dtypes.int64)
  def intersperse(self, lst, item):
    """Return lst with `item` placed before, between and after its elements."""
    (result := [item] * (len(lst) * 2 + 1))[1::2] = lst
    return result
  def phonemize(self, text, strip=True): return _phonemize(self.phonemizer, text, default_separator, strip, 1, False, False)
  def filter_oov(self, text): return "".join(list(filter(lambda x: x in self._symbol_to_id, text)))
  def base_english_cleaners(self, text): return self.collapse_whitespace(self.phonemize(self.expand_abbreviations(unidecode(text.lower()))))
  def english_cleaners2(self, text): return self.base_english_cleaners(text)
  def transliteration_cleaners(self, text): return self.collapse_whitespace(unidecode(text.lower()))
  def cjke_cleaners(self, text): return re.sub(r'([^\.,!\?\-…~])$', r'\1.', re.sub(r'\s+$', '', self.english_to_ipa2(text).replace('ɑ', 'a').replace('ɔ', 'o').replace('ɛ', 'e').replace('ɪ', 'i').replace('ʊ', 'u')))
  def cjke_cleaners2(self, text): return re.sub(r'([^\.,!\?\-…~])$', r'\1.', re.sub(r'\s+$', '', self.english_to_ipa2(text)))
  def cjks_cleaners(self, text): return re.sub(r'([^\.,!\?\-…~])$', r'\1.', re.sub(r'\s+$', '', self.english_to_lazy_ipa(text)))
  def english_to_ipa2(self, text):
    _ipa_to_ipa2 = [(re.compile('%s' % x[0]), x[1]) for x in [ ('r', 'ɹ'), ('ʤ', ''), ('ʧ', '')]]
    return reduce(lambda t, rx: re.sub(rx[0], rx[1], t), _ipa_to_ipa2, self.mark_dark_l(self.english_to_ipa(text))).replace('...', '')
  def mark_dark_l(self, text): return re.sub(r'l([^aeiouæɑɔəɛɪʊ ]*(?: |$))', lambda x: 'ɫ' + x.group(1), text)
  def english_to_ipa(self, text):
    import eng_to_ipa as ipa
    return self.collapse_whitespace(ipa.convert(self.normalize_numbers(self.expand_abbreviations(unidecode(text).lower()))))
  def english_to_lazy_ipa(self, text):
    _lazy_ipa = [(re.compile('%s' % x[0]), x[1]) for x in [('r', 'ɹ'), ('æ', 'e'), ('ɑ', 'a'), ('ɔ', 'o'), ('ð', 'z'), ('θ', 's'), ('ɛ', 'e'), ('ɪ', 'i'), ('ʊ', 'u'), ('ʒ', 'ʥ'), ('ʤ', 'ʥ'), ('ˈ', '')]]
    return reduce(lambda t, rx: re.sub(rx[0], rx[1], t), _lazy_ipa, self.english_to_ipa(text))
  def expand_abbreviations(self, text): return reduce(lambda t, abbr: re.sub(abbr[0], abbr[1], t), self._abbreviations, text)
  def collapse_whitespace(self, text): return re.sub(self._whitespace_re, ' ', text)
  def normalize_numbers(self, text):
    """Spell out numbers, currency amounts, decimals and ordinals found in text."""
    import inflect
    # build the engine once and reuse it across calls
    if self._inflect is None: self._inflect = inflect.engine()
    text = re.sub(re.compile(r'([0-9][0-9\,]+[0-9])'), self._remove_commas, text)
    text = re.sub(re.compile(r'£([0-9\,]*[0-9]+)'), r'\1 pounds', text)
    text = re.sub(re.compile(r'\$([0-9\.\,]*[0-9]+)'), self._expand_dollars, text)
    text = re.sub(re.compile(r'([0-9]+\.[0-9]+)'), self._expand_decimal_point, text)
    text = re.sub(re.compile(r'[0-9]+(st|nd|rd|th)'), self._expand_ordinal, text)
    text = re.sub(re.compile(r'[0-9]+'), self._expand_number, text)
    return text
  def _remove_commas(self, m): return m.group(1).replace(',', '') # george won't like this
  def _expand_dollars(self, m):
    """re.sub callback: turn the matched dollar amount into '<n> dollars, <n> cents' style text."""
    match = m.group(1)
    parts = match.split('.')
    if len(parts) > 2: return match + ' dollars' # Unexpected format
    dollars, cents = int(parts[0]) if parts[0] else 0, int(parts[1]) if len(parts) > 1 and parts[1] else 0
    if dollars and cents: return '%s %s, %s %s' % (dollars, 'dollar' if dollars == 1 else 'dollars', cents, 'cent' if cents == 1 else 'cents')
    if dollars: return '%s %s' % (dollars, 'dollar' if dollars == 1 else 'dollars')
    if cents: return '%s %s' % (cents, 'cent' if cents == 1 else 'cents')
    return 'zero dollars'
  def _expand_decimal_point(self, m): return m.group(1).replace('.', ' point ')
  def _expand_ordinal(self, m): return self._inflect.number_to_words(m.group(0))
  def _expand_number(self, _inflect, m=None):
    """re.sub callback: spell out the matched integer, reading 1001..2999 year-style.

    re.sub invokes the callback with the match as its only argument, so in that
    form the match arrives in the `_inflect` slot and self._inflect is used as the
    engine. The older explicit form `_expand_number(engine, match)` keeps working.
    """
    if m is None: m, pair_engine = _inflect, self._inflect
    else: pair_engine = _inflect
    num = int(m.group(0))
    if 1000 < num < 3000:
      if num == 2000: return 'two thousand'
      if 2000 < num < 2010: return 'two thousand ' + self._inflect.number_to_words(num % 100)
      if num % 100 == 0: return self._inflect.number_to_words(num // 100) + ' hundred'
      return pair_engine.number_to_words(num, andword='', zero='oh', group=2).replace(', ', ' ')
    return self._inflect.number_to_words(num, andword='')
#########################################################################################
# PAPER: https://arxiv.org/abs/2106.06103
# CODE: https://github.com/jaywalnut310/vits/tree/main
#########################################################################################
# INSTALLATION: this is based on default config, dependencies are for preprocessing.
# vctk, ljs | pip3 install unidecode phonemizer | phonemizer requires [eSpeak](https://espeak.sourceforge.net) backend to be installed on your system
# mmts-tts | pip3 install unidecode |
# uma_trilingual, cjks, voistock | pip3 install unidecode inflect eng_to_ipa |
#########################################################################################
# Some good speakers to try out, there may be much better ones, I only tried out a few:
# male vctk 1 | --model_to_use vctk --speaker_id 2
# male vctk 2 | --model_to_use vctk --speaker_id 6
# anime lady 1 | --model_to_use uma_trilingual --speaker_id 36
# anime lady 2 | --model_to_use uma_trilingual --speaker_id 121
#########################################################################################
# "weights/VITS/" under the parent of the directory that contains this file.
VITS_PATH = Path(__file__).parents[1] / "weights/VITS/"
# Selectable models, keyed by the value passed to --model_to_use.
# Index 0 of each tuple is fetched as the JSON hyperparameter config, index 1 as the checkpoint.
MODELS = { # config_url, weights_url
  "ljs": ("https://raw.githubusercontent.com/jaywalnut310/vits/main/configs/ljs_base.json", "https://drive.google.com/uc?export=download&id=1q86w74Ygw2hNzYP9cWkeClGT5X25PvBT&confirm=t"),
  "vctk": ("https://huggingface.co/csukuangfj/vits-vctk/resolve/main/vctk_base.json", "https://huggingface.co/csukuangfj/vits-vctk/resolve/main/pretrained_vctk.pth"),
  "mmts-tts": ("https://huggingface.co/facebook/mms-tts/raw/main/full_models/eng/config.json", "https://huggingface.co/facebook/mms-tts/resolve/main/full_models/eng/G_100000.pth"),
  "uma_trilingual": ("https://huggingface.co/spaces/Plachta/VITS-Umamusume-voice-synthesizer/raw/main/configs/uma_trilingual.json", "https://huggingface.co/spaces/Plachta/VITS-Umamusume-voice-synthesizer/resolve/main/pretrained_models/G_trilingual.pth"),
  "cjks": ("https://huggingface.co/spaces/skytnt/moe-tts/resolve/main/saved_model/14/config.json", "https://huggingface.co/spaces/skytnt/moe-tts/resolve/main/saved_model/14/model.pth"),
  "voistock": ("https://huggingface.co/spaces/skytnt/moe-tts/resolve/main/saved_model/15/config.json", "https://huggingface.co/spaces/skytnt/moe-tts/resolve/main/saved_model/15/model.pth"),
}
# Per-model scale handed to infer() as max_y_length_estimate_scale when --estimate_max_y_length is set.
Y_LENGTH_ESTIMATE_SCALARS = {"ljs": 2.8, "vctk": 1.74, "mmts-tts": 1.9, "uma_trilingual": 2.3, "cjks": 3.3, "voistock": 3.1}
if __name__ == '__main__':
  def _str_to_bool(value):
    # argparse passes the raw string; with type=str any non-empty value (even "False") was truthy
    return value.strip().lower() in ("1", "true", "t", "yes", "y", "on")
  def _optional_int(value):
    # lets "--seed None" disable seeding, as the help text promises
    return None if value.strip().lower() == "none" else int(value)

  logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
  parser = argparse.ArgumentParser()
  # choices turns an unknown model name into a usage error instead of a KeyError further down
  parser.add_argument("--model_to_use", default="vctk", choices=list(MODELS), help="Specify the model to use. Default is 'vctk'.")
  parser.add_argument("--speaker_id", type=int, default=6, help="Specify the speaker ID. Default is 6.")
  parser.add_argument("--out_path", default=None, help="Specify the full output path. Overrides the --out_dir and --base_name parameter.")
  parser.add_argument("--out_dir", default=str(Path(__file__).parents[1] / "temp"), help="Specify the output path.")
  parser.add_argument("--base_name", default="test", help="Specify the base of the output file name. Default is 'test'.")
  parser.add_argument("--text_to_synthesize", default="""Hello person. If the code you are contributing isn't some of the highest quality code you've written in your life, either put in the effort to make it great, or don't bother.""", help="Specify the text to synthesize. Default is a greeting message.")
  parser.add_argument("--noise_scale", type=float, default=0.667, help="Specify the noise scale. Default is 0.667.")
  parser.add_argument("--noise_scale_w", type=float, default=0.8, help="Specify the noise scale w. Default is 0.8.")
  parser.add_argument("--length_scale", type=float, default=1, help="Specify the length scale. Default is 1.")
  parser.add_argument("--seed", type=_optional_int, default=1337, help="Specify the seed (set to None if no seed). Default is 1337.")
  parser.add_argument("--num_channels", type=int, default=1, help="Specify the number of audio output channels. Default is 1.")
  parser.add_argument("--sample_width", type=int, default=2, help="Specify the number of bytes per sample, adjust if necessary. Default is 2.")
  parser.add_argument("--emotion_path", type=str, default=None, help="Specify the path to emotion reference.")
  parser.add_argument("--estimate_max_y_length", type=_str_to_bool, default=False, help="If true, overestimate the output length and then trim it to the correct length, to prevent premature realization, much more performant for larger inputs, for smaller inputs not so much. Default is False.")
  args = parser.parse_args()

  model_config = MODELS[args.model_to_use]

  # Load the hyperparameters from the config file.
  hps = get_hparams_from_file(fetch(model_config[0]))

  # If model has multiple speakers, validate speaker id and retrieve name if available.
  model_has_multiple_speakers = hps.data.n_speakers > 0
  if model_has_multiple_speakers:
    logging.info(f"Model has {hps.data.n_speakers} speakers")
    # negative ids are rejected as well as ids past the end
    if not 0 <= args.speaker_id < hps.data.n_speakers: raise ValueError(f"Speaker ID {args.speaker_id} is invalid for this model.")
    speaker_name = "?"
    if "speakers" in hps: # maps speaker ids to names
      speakers = hps.speakers
      if isinstance(speakers, list): speakers = {speaker: i for i, speaker in enumerate(speakers)}
      speaker_name = next((key for key, value in speakers.items() if value == args.speaker_id), None)
    logging.info(f"You selected speaker {args.speaker_id} (name: {speaker_name})")

  # Load emotions if any. TODO: find an english model with emotions, this is untested atm.
  emotion_embedding = None
  if args.emotion_path is not None:
    if args.emotion_path.endswith(".npy"): emotion_embedding = Tensor(np.load(args.emotion_path), dtype=dtypes.int64).unsqueeze(0)
    else: raise ValueError("Emotion path must be a .npy file.")

  # Load symbols, instantiate TextMapper and clean the text.
  if "symbols" in hps: symbols = hps.symbols
  elif args.model_to_use == "mmts-tts": symbols = [x.replace("\n", "") for x in fetch("https://huggingface.co/facebook/mms-tts/raw/main/full_models/eng/vocab.txt").open(encoding="utf-8").readlines()]
  else: symbols = ['_'] + list(';:,.!?¡¿—…"«»“” ') + list('ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz') + list("ɑɐɒæɓʙβɔɕçɗɖðʤəɘɚɛɜɝɞɟʄɡɠɢʛɦɧħɥʜɨɪʝɭɬɫɮʟɱɯɰŋɳɲɴøɵɸθœɶʘɹɺɾɻʀʁɽʂʃʈʧʉʊʋⱱʌɣɤʍχʎʏʑʐʒʔʡʕʢǀǁǂǃˈˌːˑʼʴʰʱʲʷˠˤ˞↓↑→↗↘'̩'")
  text_mapper = TextMapper(apply_cleaners=True, symbols=symbols)

  # Load the model.
  if args.seed is not None:
    Tensor.manual_seed(args.seed)
    np.random.seed(args.seed)
  net_g = load_model(text_mapper.symbols, hps, model_config)
  logging.debug(f"Loaded model with hps: {hps}")

  # Convert the input text to a tensor.
  text_to_synthesize = args.text_to_synthesize
  if args.model_to_use == "mmts-tts": text_to_synthesize = text_mapper.filter_oov(text_to_synthesize.lower())
  stn_tst = text_mapper.get_text(text_to_synthesize, hps.data.add_blank, hps.data.text_cleaners)
  logging.debug(f"Converted input text to tensor \"{text_to_synthesize}\" -> Tensor({stn_tst.shape}): {stn_tst.numpy()}")
  x_tst, x_tst_lengths = stn_tst.unsqueeze(0), Tensor([stn_tst.shape[0]], dtype=dtypes.int64)
  sid = Tensor([args.speaker_id], dtype=dtypes.int64) if model_has_multiple_speakers else None

  # Perform inference.
  start_time = time.time()
  audio_tensor = net_g.infer(x_tst, x_tst_lengths, sid, args.noise_scale, args.length_scale, args.noise_scale_w, emotion_embedding=emotion_embedding,
                             max_y_length_estimate_scale=Y_LENGTH_ESTIMATE_SCALARS[args.model_to_use] if args.estimate_max_y_length else None)[0, 0].realize()
  logging.info(f"Inference took {(time.time() - start_time):.2f}s")

  # Save the audio output. Samples are clipped to [-1, 1] and written as 16-bit integers.
  audio_data = (np.clip(audio_tensor.numpy(), -1.0, 1.0) * 32767).astype(np.int16)
  out_path = Path(args.out_path or Path(args.out_dir)/f"{args.model_to_use}{f'_sid_{args.speaker_id}' if model_has_multiple_speakers else ''}_{args.base_name}.wav")
  out_path.parent.mkdir(parents=True, exist_ok=True)
  with wave.open(str(out_path), 'wb') as wav_file:
    wav_file.setnchannels(args.num_channels)
    wav_file.setsampwidth(args.sample_width)
    wav_file.setframerate(hps.data.sampling_rate)
    wav_file.setnframes(len(audio_data))
    wav_file.writeframes(audio_data.tobytes())
  logging.info(f"Saved audio output to {out_path}")

View file

@ -26,11 +26,13 @@ def color_temp(temp):
def color_voltage(voltage):
  """Format a voltage given in thousandths of a volt as a cyan, 3-decimal 'N.NNNV' string."""
  volts = voltage / 1000
  return colored(f"{volts:>5.3f}V", "cyan")
def draw_bar(percentage, width=40, fill='|', empty=' ', opt_text='', color='cyan'):
  """Render a fixed-width text progress bar wrapped in square brackets.

  percentage is a fraction: NaN is treated as 0.0 and the value is clamped to [0.0, 1.0].
  opt_text is overlaid right-aligned on the bar; when it is empty the percentage is shown instead.
  The first filled_width characters of the bar are passed through colored() with `color`.
  """
  percentage = 0.0 if percentage != percentage else percentage  # NaN guard (NaN is the only value unequal to itself)
  percentage = max(0.0, min(1.0, float(percentage)))
  filled_width = int(width * percentage)
  if not opt_text: opt_text = f'{percentage*100:.1f}%'
  bar = fill * filled_width + empty * (width - filled_width)
  # Overlay the label only when it fits. An unconditional overlay replaces the whole bar with the
  # label when the label is longer than the bar, so the result grows past `width`.
  if opt_text and len(opt_text) <= len(bar): bar = bar[:-len(opt_text)] + opt_text
  bar = colored(bar[:filled_width], color) + bar[filled_width:]
  return f'[{bar}]'
@ -88,6 +90,7 @@ class SMICtx:
self.opened_pci_resources = {}
self.prev_lines_cnt = 0
self.prev_terminal_width = 0
self.prev_terminal_height = 0
remove_parts = ["Advanced Micro Devices, Inc. [AMD/ATI]", "VGA compatible controller:"]
lspci = subprocess.check_output(["lspci"]).decode("utf-8").splitlines()
@ -95,6 +98,20 @@ class SMICtx:
for k,v in self.lspci.items():
for part in remove_parts: self.lspci[k] = self.lspci[k].replace(part, "").strip().rstrip()
def _smuq10_round(self, v:int) -> int:
v = int(v)
return (v + 512) >> 10 # SMUQ10_ROUND
def _fmt_kb(self, kb:int) -> str:
kb = int(kb)
if kb < 1024: return f"{kb}KB"
mb = kb / 1024.0
if mb < 1024: return f"{mb:.1f}MB"
gb = mb / 1024.0
if gb < 1024: return f"{gb:.2f}GB"
tb = gb / 1024.0
return f"{tb:.2f}TB"
def _open_am_device(self, pcibus):
if pcibus not in self.opened_pci_resources:
bar_fds = {bar: os.open(f"/sys/bus/pci/devices/{pcibus}/resource{bar}", os.O_RDWR | os.O_SYNC) for bar in [0, 2, 5]}
@ -116,6 +133,7 @@ class SMICtx:
def rescan_devs(self):
pattern = os.path.join('/tmp', 'am_*.lock')
for d in [f[8:-5] for f in glob.glob(pattern)]:
if d.startswith("usb"): continue
if d not in self.opened_pcidevs:
self._open_am_device(d)
@ -131,21 +149,53 @@ class SMICtx:
os.system('clear')
if DEBUG >= 2: print(f"Removed AM device {d.pcibus}")
def collect(self): return {d: d.smu.read_metrics() if d.pci_state == "D0" else None for d in self.devs}
def collect(self):
tables = {}
for dev in self.devs:
match dev.ip_ver[am.MP1_HWIP]:
case (13,0,6): table_t = dev.smu.smu_mod.MetricsTableX_t
case (13,0,12): table_t = dev.smu.smu_mod.MetricsTableV2_t
case _: table_t = dev.smu.smu_mod.SmuMetricsExternal_t
tables[dev] = dev.smu.read_table(table_t, dev.smu.smu_mod.SMU_TABLE_SMU_METRICS) if dev.pci_state == "D0" else None
return tables
def get_gfx_activity(self, dev, metrics): return metrics.SmuMetrics.AverageGfxActivity
def get_mem_activity(self, dev, metrics): return metrics.SmuMetrics.AverageUclkActivity
def _pick_nonzero_avg(self, vals) -> int:
xs = [x for x in vals if x > 0]
return int(sum(xs) / len(xs)) if xs else 0
def get_gfx_activity(self, dev, metrics):
match dev.ip_ver[am.MP1_HWIP]:
case (13,0,6): return max(0, min(100, self._smuq10_round(metrics.SocketGfxBusy)))
case _: return metrics.SmuMetrics.AverageGfxActivity
def get_mem_activity(self, dev, metrics):
match dev.ip_ver[am.MP1_HWIP]:
case (13,0,6): return max(0, min(100, self._smuq10_round(metrics.DramBandwidthUtilization)))
case _: return metrics.SmuMetrics.AverageUclkActivity
def get_temps(self, dev, metrics, compact=False):
temps_keys = [(k, name) for k, name in dev.smu.smu_mod.c__EA_TEMP_e__enumvalues.items()
if k < dev.smu.smu_mod.TEMP_COUNT and metrics.SmuMetrics.AvgTemperature[k] != 0]
if compact: temps_keys = [(k, name) for k, name in temps_keys if k in (dev.smu.smu_mod.TEMP_HOTSPOT, dev.smu.smu_mod.TEMP_MEM)]
return {name: metrics.SmuMetrics.AvgTemperature[k] for k, name in temps_keys}
match dev.ip_ver[am.MP1_HWIP]:
case (13,0,6):
temps = {
"Hotspot": self._smuq10_round(metrics.MaxSocketTemperature),
"HBM": self._smuq10_round(metrics.MaxHbmTemperature),
"VR": self._smuq10_round(metrics.MaxVrTemperature),
}
if compact: return {k: temps[k] for k in ("Hotspot", "HBM") if temps.get(k, 0) != 0}
return {k: v for k, v in temps.items() if v != 0}
case _:
temps_keys = [(k, name) for k, name in dev.smu.smu_mod.c__EA_TEMP_e__enumvalues.items()
if k < dev.smu.smu_mod.TEMP_COUNT and metrics.SmuMetrics.AvgTemperature[k] != 0]
if compact: temps_keys = [(k, name) for k, name in temps_keys if k in (dev.smu.smu_mod.TEMP_HOTSPOT, dev.smu.smu_mod.TEMP_MEM)]
return {name: metrics.SmuMetrics.AvgTemperature[k] for k, name in temps_keys}
def get_voltage(self, dev, metrics, compact=False):
voltage_keys = [(k, name) for k, name in dev.smu.smu_mod.c__EA_SVI_PLANE_e__enumvalues.items()
match dev.ip_ver[am.MP1_HWIP]:
case (13,0,6): return {}
case _:
voltage_keys = [(k, name) for k, name in dev.smu.smu_mod.c__EA_SVI_PLANE_e__enumvalues.items()
if k < dev.smu.smu_mod.SVI_PLANE_COUNT and metrics.SmuMetrics.AvgVoltage[k] != 0]
return {name: metrics.SmuMetrics.AvgVoltage[k] for k, name in voltage_keys}
return {name: metrics.SmuMetrics.AvgVoltage[k] for k, name in voltage_keys}
def get_busy_threshold(self, dev):
match dev.ip_ver[am.MP1_HWIP]:
@ -153,22 +203,40 @@ class SMICtx:
case _: return 15
def get_gfx_freq(self, dev, metrics):
return metrics.SmuMetrics.AverageGfxclkFrequencyPostDs if self.get_gfx_activity(dev, metrics) <= self.get_busy_threshold(dev) else \
metrics.SmuMetrics.AverageGfxclkFrequencyPreDs
if metrics is None: return 0
match dev.ip_ver[am.MP1_HWIP]:
case (13,0,6): return self._smuq10_round(metrics.GfxclkFrequency[0])
case _:
return metrics.SmuMetrics.AverageGfxclkFrequencyPostDs if self.get_gfx_activity(dev, metrics) <= self.get_busy_threshold(dev) else \
metrics.SmuMetrics.AverageGfxclkFrequencyPreDs
def get_mem_freq(self, dev, metrics):
return metrics.SmuMetrics.AverageMemclkFrequencyPostDs if self.get_mem_activity(dev, metrics) <= self.get_busy_threshold(dev) else \
metrics.SmuMetrics.AverageMemclkFrequencyPreDs
match dev.ip_ver[am.MP1_HWIP]:
case (13,0,6): return self._smuq10_round(metrics.UclkFrequency)
case _:
return metrics.SmuMetrics.AverageMemclkFrequencyPostDs if self.get_mem_activity(dev, metrics) <= self.get_busy_threshold(dev) else \
metrics.SmuMetrics.AverageMemclkFrequencyPreDs
def get_fckl_freq(self, dev, metrics):
return metrics.SmuMetrics.AverageFclkFrequencyPostDs if self.get_mem_activity(dev, metrics) <= self.get_busy_threshold(dev) else \
metrics.SmuMetrics.AverageFclkFrequencyPreDs
match dev.ip_ver[am.MP1_HWIP]:
case (13,0,6): return self._smuq10_round(metrics.FclkFrequency)
case _:
return metrics.SmuMetrics.AverageFclkFrequencyPostDs if self.get_mem_activity(dev, metrics) <= self.get_busy_threshold(dev) else \
metrics.SmuMetrics.AverageFclkFrequencyPreDs
def get_fan_rpm_pwm(self, dev, metrics): return metrics.SmuMetrics.AvgFanRpm, metrics.SmuMetrics.AvgFanPwm
def get_fan_rpm_pwm(self, dev, metrics):
match dev.ip_ver[am.MP1_HWIP]:
case (13,0,6): return None, None
case _: return metrics.SmuMetrics.AvgFanRpm, metrics.SmuMetrics.AvgFanPwm
def get_power(self, dev, metrics): return metrics.SmuMetrics.AverageSocketPower, metrics.SmuMetrics.dGPU_W_MAX
def get_power(self, dev, metrics):
match dev.ip_ver[am.MP1_HWIP]:
case (13,0,6): return self._smuq10_round(metrics.SocketPower), self._smuq10_round(metrics.MaxSocketPowerLimit)
case _: return metrics.SmuMetrics.AverageSocketPower, metrics.SmuMetrics.dGPU_W_MAX
def get_mem_usage(self, dev):
return 0
usage = 0
pt_stack = [dev.mm.root_page_table]
while len(pt_stack) > 0:
@ -177,7 +245,7 @@ class SMICtx:
entry = pt.entries[i]
if (entry & am.AMDGPU_PTE_VALID) == 0: continue
if pt.lv!=am.AMDGPU_VM_PTB and not dev.gmc.is_pte_huge_page(entry):
if pt.lv!=am.AMDGPU_VM_PTB and not dev.gmc.is_pte_huge_page(pt.lv, entry):
pt_stack.append(AMPageTableEntry(dev, entry & 0x0000FFFFFFFFF000, lv=pt.lv+1))
continue
if (entry & am.AMDGPU_PTE_SYSTEM) != 0: continue
@ -219,23 +287,28 @@ class SMICtx:
temps_table_compact = ["Temps (°C):" + '/'.join([f"{color_temp(val)} {name}" for name, val in temps_data_compact.items()])]
fan_rpm, fan_pwm = self.get_fan_rpm_pwm(dev, metrics)
power_table = ["=== Power ==="] + [f"Fan Speed: {fan_rpm} RPM"] + [f"Fan Power: {fan_pwm}%"]
power_table = ["=== Power ==="]
power_table += ["Fan: N/A"] if fan_rpm is None or fan_pwm is None else [f"Fan Speed: {fan_rpm} RPM", f"Fan Power: {fan_pwm}%"]
total_power, max_power = self.get_power(dev, metrics)
power_line = [f"Power: " + draw_bar(total_power / max_power, 16, opt_text=f"{total_power}/{max_power}W")]
power_line_compact = [f"Power: " + draw_bar(total_power / max_power, activity_line_width, opt_text=f"{total_power}/{max_power}W")]
if max_power > 0:
power_line = [f"Power: " + draw_bar(total_power / max_power, 16, opt_text=f"{total_power}/{max_power}W")]
power_line_compact = [f"Power: " + draw_bar(total_power / max_power, activity_line_width, opt_text=f"{total_power}/{max_power}W")]
else:
power_line = ["Power: N/A"]
power_line_compact = ["Power: N/A"]
voltage_data = self.get_voltage(dev, metrics)
voltage_table = ["=== Voltages ==="] + [f"{name:<20}: {color_voltage(voltage)}" for name, voltage in voltage_data.items()]
voltage_table = None if not voltage_data else (["=== Voltages ==="] + [f"{name:<20}: {color_voltage(voltage)}" for name, voltage in voltage_data.items()])
gfx_freq = self.get_gfx_freq(dev, metrics)
mclk_freq = self.get_mem_freq(dev, metrics)
fclk_freq = self.get_fckl_freq(dev, metrics)
frequency_table = ["=== Frequencies ===", f"GFXCLK: {gfx_freq:>4} MHz", f"FCLK : {fclk_freq:>4} MHz", f"MCLK : {mclk_freq:>4} MHz"]
if self.prev_terminal_width >= 231:
power_table += power_line + [""] + voltage_table
power_table += power_line
if voltage_table is not None: power_table += [""] + voltage_table
activity_line += [""]
elif self.prev_terminal_width >= 171:
power_table += power_line + [""] + frequency_table
@ -307,4 +380,5 @@ if __name__ == "__main__":
smi_ctx.draw(args.list)
if args.list: break
time.sleep(1)
except KeyboardInterrupt: print("Exiting...")
except KeyboardInterrupt:
print("Exiting...")

14
extra/amdpci/hive_reset.py Executable file
View file

@ -0,0 +1,14 @@
#!/usr/bin/env python3
from tinygrad.helpers import Context
from tinygrad.runtime.support.system import System, PCIDevice, PCIDevImplBase
from tinygrad.runtime.support.am.amdev import AMDev
# Script entry point: open every matching GPU as an AMDev with reset_mode=True, then call smu.mode1_reset() on each one.
if __name__ == "__main__":
# Scan for PCI vendor 0x1002 with device IDs 0x74a1 / 0x75a0.
# NOTE(review): the meaning of the 0xffff element is defined by System.pci_scan_bus, which is not visible here - confirm.
gpus = System.pci_scan_bus(0x1002, [(0xffff, [0x74a1, 0x75a0])])
# One PCIDevice per GPU, named "reset:<gpu>", requesting BARs 0, 2 and 5.
pcidevs = [PCIDevice(f"reset:{gpu}", gpu, bars=[0, 2, 5]) for gpu in gpus]
amdevs = []
# DEBUG=2 is applied through Context while the devices are constructed.
with Context(DEBUG=2):
for pcidev in pcidevs:
amdevs.append(AMDev(pcidev, reset_mode=True))
# Every device is constructed before the first reset is issued.
# NOTE(review): indentation is lost in this view, so it is unclear whether this loop runs inside the Context block - confirm.
for amdev in amdevs: amdev.smu.mode1_reset()

View file

@ -1,48 +1,65 @@
import re, ctypes, sys, importlib
from tinygrad.helpers import getenv
from tinygrad.runtime.support.am.amdev import AMDev, AMRegister
class GFXFake:
  """Minimal stand-in for a gfx object: it only carries an xccs attribute, fixed at 8."""
  def __init__(self): self.xccs = 8
class AMDFake(AMDev):
  """AMDev subclass whose constructor does not call the parent initialiser.

  It maps BARs 0, 2 and 5 of pci_dev, runs discovery, builds the register table and installs a
  GFXFake as self.gfx.
  """
  def __init__(self, pci_dev, dma_regions=None):
    self.pci_dev, self.devfmt, self.dma_regions = pci_dev, pci_dev.pcibus, dma_regions
    # BAR 0 -> vram (default format), BAR 2 -> doorbells (fmt 'Q'), BAR 5 -> mmio (fmt 'I')
    self.vram, self.doorbell64, self.mmio = self.pci_dev.map_bar(0), self.pci_dev.map_bar(2, fmt='Q'), self.pci_dev.map_bar(5, fmt='I')
    self._run_discovery()
    self._build_regs()
    self.gfx = GFXFake()
amdev = importlib.import_module("tinygrad.runtime.support.am.amdev")
amdev.AMDev = AMDFake
from tinygrad.runtime.ops_amd import PCIIface
def parse_amdgpu_logs(log_content, register_names=None, *, only_xcc0: bool = False):
  """Rewrite raw register addresses in an amdgpu log to symbolic names.

  Args:
    log_content: full log text.
    register_names: optional mapping from integer register address to display name. Addresses
      missing from it (all of them when it is None or empty) keep their original hex text.
    only_xcc0: when true, drop every line that contains "xcc=" unless it also contains "xcc=0";
      lines without "xcc=" are always kept. The filter runs after names are substituted.

  Returns:
    The processed log text, with a leading "[<number>]" prefix stripped from each line.
  """
  register_map = register_names or {}

  def named(verb):
    # Build the re.sub callback for one verb; unknown addresses fall back to the matched hex text.
    def repl(match):
      reg = match.group(1)
      return f"{verb} register {register_map.get(int(reg, 16), reg)}"
    return repl

  processed_log = log_content
  for verb in ("Reading", "Writing"):
    processed_log = re.sub(rf'{verb} register (0x[0-9a-fA-F]+)', named(verb), processed_log)

  # remove timing prefix
  processed_log = re.sub(r'^\[\s*\d+(?:\.\d+)?\]\s*', '', processed_log, flags=re.MULTILINE)

  # keep only xcc=0 lines (but keep lines with no xcc at all)
  if only_xcc0:
    processed_log = "".join(line for line in processed_log.splitlines(True)
                            if "xcc=" not in line or re.search(r'\bxcc=0\b', line))
  return processed_log
def main():
  """Annotate the log at sys.argv[1] with register names and write the result to sys.argv[2].

  The address -> "<attribute name>, xcc=<n>" map is built from every AMRegister attribute of the
  device implementation behind PCIIface(None, 0). The ONLY_XCC0 environment setting (read via
  getenv, default 0) is forwarded to parse_amdgpu_logs as only_xcc0.
  """
  only_xcc0 = bool(getenv("ONLY_XCC0", 0))

  reg_names = {}
  dev = PCIIface(None, 0)
  for x, y in dev.dev_impl.__dict__.items():
    if isinstance(y, AMRegister):
      for xcc, addr in y.addr.items():
        reg_names[addr] = f"{x}, xcc={xcc}"

  with open(sys.argv[1], 'r') as f:
    log_content = f.read()

  processed_log = parse_amdgpu_logs(log_content, reg_names, only_xcc0=only_xcc0)

  with open(sys.argv[2], 'w') as f:
    f.write(processed_log)
@ -51,5 +68,4 @@ if __name__ == '__main__':
if len(sys.argv) != 3:
print("Usage: <input_file_path> <output_file_path>")
sys.exit(1)
main()
main()

View file

@ -1,189 +0,0 @@
from typing import Tuple, List, NamedTuple, Any, Dict, Optional, Union, DefaultDict, cast
from tinygrad.codegen.opt.kernel import Ops, MemOp, UOp
from tinygrad.uop.ops import BinaryOps, UnaryOps
from tinygrad.dtype import DType, dtypes
from tinygrad.helpers import DEBUG
from tinygrad.uop.ops import Variable, NumNode, MulNode, DivNode, ModNode, LtNode, SumNode, AndNode
import functools
import math
from collections import defaultdict
_type_to_letter = {dtypes.float32: 'f', dtypes.bool: 'p', dtypes.int32: 'i', dtypes.int64: 'a', dtypes.uint32: 'u', dtypes.uint64: 'b', dtypes.float.vec(4): 'x', dtypes.uint8: 'uc', dtypes.float16: 'h',
dtypes.int8: 'c', dtypes.uint16: 'us', dtypes.float64: 'd'}
class Register(NamedTuple):
  """A virtual register: name, dtype, scalar flag and an optional sub-register offset."""
  nm:str
  dtype:DType
  scalar:bool
  off:Optional[int] = None
  # Printed as "name", or "name:off" when a sub-register offset is set.
  def __repr__(self): return self.nm if self.off is None else f"{self.nm}:{self.off}"
  def subregs(self):
    """Return the four float sub-registers (off 0..3) of a float vec(4) register, or [] for any other dtype."""
    if self.dtype == dtypes.float.vec(4):
      return [Register(self.nm, dtypes.float, False, off=off) for off in range(4)]
    return []
class AssemblyInstruction(NamedTuple):
  """One lowered instruction: operation, optional output register, inputs (registers or immediates) and an op-specific argument."""
  op: Ops
  out: Optional[Register]
  vin: List[Union[Register, int, float]]
  arg: Any = None
# warp size of 32, s registers are shared across the warp, v are 32-wide vectors
# Shared base for the assembly backends: hands out virtual registers (tor), counts them per
# (dtype, scalar) pair (cnts) and accumulates AssemblyInstruction records (ins).
class AssemblyLanguage:
# Feature flags read elsewhere in this file: supports_load3 in addr_w_offset, sin_is_sin2pi and no_div in uops_to_asmstyle.
supports_load3: bool = False
sin_is_sin2pi: bool = False
no_div: bool = False
#TODO: these should be global vars
# cnts, tor and ins are class-level mutable objects, so every instance and subclass that does not
# rebind them shares the same three containers; uops_to_asmstyle clears them at the start of each run.
cnts:DefaultDict[Tuple[DType, bool], int] = defaultdict(int)
tor: Dict[Any, Register] = {}
ins: List[AssemblyInstruction] = []
# x is a (dtype, scalar) pair; the type letter is upper-cased when scalar is true.
def type_to_letter(self,x): return _type_to_letter[x[0]].upper() if x[1] else _type_to_letter[x[0]]
# Create a register named "%<letter><count>" for key tok, store it in tor and bump the (dtype, scalar) counter.
def newreg(self, tok, dtype=dtypes.float32, scalar=False) -> Register:
self.tor[tok] = ret = Register(f"%{self.type_to_letter((dtype, scalar))}{self.cnts[(dtype, scalar)]}", dtype, scalar)
if dtype == dtypes.float.vec(4):
# NOTE(review): each pass assigns the same key tor[tok], so only the off=3 sub-register stays in tor - confirm intent.
for off in range(4):
self.tor[tok] = Register(ret.nm, dtypes.float, ret.scalar, off)
self.cnts[(dtype, scalar)] += 1
return ret
# Return the scalar int32 register holding constant b, emitting its LOAD only the first time b is seen.
def render_numnode(self, b) -> Register:
key = ("num", b)
if key not in self.tor: self.ins.append(AssemblyInstruction(Ops.LOAD, self.newreg(key, scalar=True, dtype=dtypes.int32), [], b))
return self.tor[key]
# Return the register for op(a, b), emitting the ALU instruction only the first time this (op, a, b) is seen.
# The result is scalar only when a is scalar and b is either an immediate or a scalar register.
def render_alu(self, op, a:Register, b:Union[Register, int, float], dtype=dtypes.int32) -> Register:
key = (op, a, b)
if key not in self.tor:
#if not isinstance(b, Register): b = render_numnode(b)
self.ins.append(AssemblyInstruction(Ops.ALU, self.newreg(key, dtype=dtype, scalar=a.scalar and (not isinstance(b, Register) or b.scalar)), [a, b], op))
return self.tor[key]
# Return a as new_dtype: a itself when the dtype already matches, otherwise a CAST result cached per (a, new_dtype).
def render_cast(self, a:Register, new_dtype:DType) -> Register:
if a.dtype == new_dtype: return a
key = (a, new_dtype)
if key not in self.tor:
self.ins.append(AssemblyInstruction(Ops.CAST, self.newreg(key, dtype=new_dtype), [a]))
return self.tor[key]
# Dispatch table from symbolic node class to a renderer taking (node, ops, ctx), where ctx is the AssemblyLanguage
# instance. SumNode folds its terms with ADD; AndNode folds its terms with MUL producing a bool.
render_ops: Any = { Variable: lambda self, ops, ctx: ctx.tor[self], NumNode: lambda self, ops, ctx: ctx.render_numnode(self.b),
MulNode: lambda self, ops, ctx: ctx.render_alu(BinaryOps.MUL, self.a.render(ops, ctx), self.b),
DivNode: lambda self, ops, ctx: ctx.render_alu(BinaryOps.DIV, self.a.render(ops, ctx), self.b),
ModNode: lambda self, ops, ctx: ctx.render_alu(BinaryOps.MOD, self.a.render(ops, ctx), self.b),
LtNode: lambda self, ops, ctx: ctx.render_alu(BinaryOps.CMPLT, self.a.render(ops, ctx), self.b, dtype=dtypes.bool),
SumNode: lambda self,ops,ctx: functools.reduce(lambda a,b: ctx.render_alu(BinaryOps.ADD, a, b.render(ops,ctx)), self.nodes[1:], self.nodes[0].render(ops,ctx)),
AndNode: lambda self,ops,ctx: functools.reduce(lambda a,b: ctx.render_alu(BinaryOps.MUL, a, b.render(ops,ctx), dtype=dtypes.bool), self.nodes[1:], self.nodes[0].render(ops,ctx)) }
# Turn a MemOp into address operands. The index is scaled by the memory dtype's itemsize, and the first constant
# term of a sum is split off as an immediate offset when it is below 4096 and the remainder cannot go negative.
# Returns (buffer register, index register, off) when supports_load3 is set, otherwise (uint64 address register, None, off).
def addr_w_offset(self, args):
assert isinstance(args, MemOp)
idx = args.idx*args.memory_dtype.itemsize
off = 0 # TODO: should this be None?
if isinstance(idx, SumNode):
nums = [n.b for n in idx.nodes if isinstance(n, NumNode)]
if nums and nums[0] < 4096 and (idx-nums[0]).min >= 0: # TODO: different for each GPU?
idx -= nums[0]
off = cast(int, nums[0])
reg = idx.render(self.render_ops, self)
if self.supports_load3:
if reg.scalar:
# A scalar index is copied into a fresh non-scalar register with a NOOP ALU instruction.
new_reg = self.newreg((reg.nm, 'vec'), dtype=reg.dtype)
self.ins.append(AssemblyInstruction(Ops.ALU, new_reg, [reg], UnaryOps.NOOP))
reg = new_reg
return self.tor[args.name], reg, off
reg = self.render_alu(BinaryOps.ADD, self.render_cast(reg, dtypes.uint64), self.tor[args.name], dtype=dtypes.uint64)
return reg, None, off
# Lower a list of uops into lang.ins (AssemblyInstruction records), allocating registers through lang.
# Returns (global_size, local_size), collected from the "global" / "local" LOOP uops as var.max+1 per variable.
# function_name is accepted but not read anywhere in this body.
def uops_to_asmstyle(lang, function_name:str, uops:List[UOp]):
#TODO: Do not use clear()
# Reset the containers on lang before lowering (they are declared at class level on AssemblyLanguage).
lang.ins.clear()
lang.tor.clear()
lang.cnts.clear()
buf_to_dtype = {args:dtype for uop,dtype,_,args,_ in uops if uop == Ops.DEFINE_GLOBAL}
global_size, local_size = [], []
# Counter used to give every $skipload_<n> label a unique name.
skipload_branch = 0
# Every DEFINE_GLOBAL buffer gets a scalar uint64 register through a SPECIAL instruction.
lang.ins += [AssemblyInstruction(Ops.SPECIAL, lang.newreg(buf, dtype=dtypes.uint64, scalar=True), [], buf) for buf in buf_to_dtype]
for u in uops:
uop,dtype,vin,args,_ = u
if uop == Ops.DEFINE_LOCAL:
lang.ins.append(AssemblyInstruction(Ops.DEFINE_LOCAL, None, [], args))
lang.ins.append(AssemblyInstruction(Ops.ALU, lang.newreg(args[0], dtype=dtypes.uint64), [args[0]], UnaryOps.NOOP))
elif uop == Ops.LOOP:
# "global" / "local" loops become gid<n> / lid<n> SPECIAL registers, numbered in reverse variable order;
# any other loop kind becomes a counter initialised to 0 followed by a $loop_<expr> label.
if args[1] == "global":
for i,var in enumerate(args[0]):
global_size.append(var.max+1)
lang.ins.append(AssemblyInstruction(Ops.SPECIAL, lang.newreg(var, dtype=dtypes.int32), [], f"gid{len(args[0])-1-i}"))
elif args[1] == "local":
for i,var in enumerate(args[0]):
local_size.append(var.max+1)
lang.ins.append(AssemblyInstruction(Ops.SPECIAL, lang.newreg(var, dtype=dtypes.int32), [], f"lid{len(args[0])-1-i}"))
else:
for var in args[0]:
if not isinstance(var, NumNode): # TODO: why is this coming through?
lang.ins.append(AssemblyInstruction(Ops.LOAD, lang.newreg(var, dtype=dtypes.int32, scalar=True), [], 0))
lang.ins.append(AssemblyInstruction(Ops.LABEL, None, [], "$loop_"+var.expr))
elif uop == Ops.ENDLOOP:
# Loops that are not global/local: increment the counter, compare against var.max+1 and branch back to the label.
if args[1] not in ["global", "local", "global+local"]:
for var in reversed(args[0]):
if not isinstance(var, NumNode): # TODO: why is this coming through?
lang.ins.append(AssemblyInstruction(Ops.ALU, lang.tor[var], [lang.tor[var], 1], BinaryOps.ADD))
pred = lang.render_alu(BinaryOps.CMPLT, lang.tor[var], var.max+1, dtypes.bool)
lang.ins.append(AssemblyInstruction(Ops.COND_BRANCH, None, [pred], ("$loop_"+var.expr, True)))
elif args[1] == "global+local":
for i, var in enumerate(reversed(args[0])):
lang.ins.append(AssemblyInstruction(Ops.ENDLOOP, None, [lang.tor[var]], (var.max+1, f"gid{i}")))
elif args[1] == 'local':
for i, var in enumerate(reversed(args[0])):
lang.ins.append(AssemblyInstruction(Ops.ENDLOOP, None, [lang.tor[var]], (var.max+1, f"lid{i}")))
elif uop == Ops.CAST:
# TODO: we should reconsider outputting CAST in the linearizer. these are needless copies
out = lang.newreg(u, dtype)
for i,sr in enumerate(out.subregs()):
lang.ins.append(AssemblyInstruction(Ops.ALU, sr, [lang.tor[vin[i]]], UnaryOps.NOOP))
elif uop == Ops.ALU:
out = lang.newreg(u, dtype) if u not in lang.tor else lang.tor[u]
# this is the only thing that can violate SSA
# CMPLT is computed into a bool predicate register and then cast into out.
if args in [BinaryOps.CMPLT]:
pred_reg = lang.newreg((u, 'pred'), dtype=dtypes.bool)
lang.ins.append(AssemblyInstruction(Ops.ALU, pred_reg, [lang.tor[x] for x in vin], args))
lang.ins.append(AssemblyInstruction(Ops.CAST, out, [pred_reg], args))
# With lang.no_div set, a / b is emitted as a * RECIP(b).
elif args == BinaryOps.DIV and lang.no_div:
tmp = lang.newreg((u, "rcp"))
lang.ins.append(AssemblyInstruction(Ops.ALU, tmp, [lang.tor[vin[1]]], UnaryOps.RECIP))
lang.ins.append(AssemblyInstruction(Ops.ALU, out, [lang.tor[vin[0]], tmp], BinaryOps.MUL))
# With lang.sin_is_sin2pi set, the operand is multiplied by 1/(2*pi) before SIN is emitted.
elif args == UnaryOps.SIN and lang.sin_is_sin2pi:
tmp = lang.newreg((u, "2pi"))
lang.ins.append(AssemblyInstruction(Ops.ALU, tmp, [lang.tor[vin[0]], 1/(math.pi*2)], BinaryOps.MUL))
lang.ins.append(AssemblyInstruction(Ops.ALU, out, [tmp], args))
else:
lang.ins.append(AssemblyInstruction(Ops.ALU, out, [lang.tor[x] for x in vin], args))
elif uop == Ops.DEFINE_REG:
reg = lang.newreg(u, dtype=dtype)
lang.ins.append(AssemblyInstruction(Ops.LOAD, reg, [], args))
elif uop == Ops.SPECIAL:
lang.tor[u] = lang.tor[args]
elif uop == Ops.CONST:
lang.ins.append(AssemblyInstruction(Ops.LOAD, lang.newreg(u, dtype=dtype), [], args))
elif uop == Ops.LOAD:
idx, treg, off = lang.addr_w_offset(args)
reg = lang.newreg(u, dtype=dtype, scalar=(idx.scalar and (not isinstance(treg, Register) or treg.scalar)))
# When the validity expression can be 0, the register is first loaded with 0; when it can also be 1, a
# conditional branch to a $skipload label is emitted around the real load.
# NOTE(review): the nesting of the checks below is not recoverable from this flattened view - confirm against the original file.
if args.valid.min == 0:
lang.ins.append(AssemblyInstruction(Ops.LOAD, reg, [], 0))
if args.valid.max == 1:
pred = args.valid.render(lang.render_ops, lang)
lang.ins.append(AssemblyInstruction(Ops.COND_BRANCH, None, [pred], (f"$skipload_{skipload_branch}", False)))
if args.valid.max == 1:
# NOTE: you can't compute the index in here, because it assumes it's all available later
lang.ins.append(AssemblyInstruction(Ops.LOAD, reg, [idx] + ([treg] if treg is not None else []), (off, 'global' if not args.local else 'shared', args.memory_dtype if args.memory_dtype != dtypes.float else None)))
if args.valid.min == 0 and args.valid.max == 1:
lang.ins.append(AssemblyInstruction(Ops.LABEL, None, [], f"$skipload_{skipload_branch}"))
skipload_branch += 1
elif uop == Ops.STORE:
# A STORE without args is a register-to-register copy; otherwise it is a memory store through addr_w_offset.
if args is None:
lang.ins.append(AssemblyInstruction(Ops.ALU, lang.tor[vin[0]], [lang.tor[vin[1]]], UnaryOps.NOOP))
else:
idx, treg, off = lang.addr_w_offset(args)
lang.ins.append(AssemblyInstruction(Ops.STORE, None, [idx, lang.tor[vin[0]]] + ([treg] if treg is not None else []), (off, 'global' if not args.local else 'shared', args.memory_dtype if args.memory_dtype != dtypes.float else None)))
# At DEBUG >= 4 every emitted instruction is printed.
if DEBUG >= 4:
for tins in lang.ins: print(tins)
return global_size, local_size

View file

@ -1,177 +0,0 @@
import struct
from platform import system
from typing import Tuple, Dict, List, Optional
from tinygrad import dtypes
from tinygrad.uop.ops import BinaryOps, UnaryOps, TernaryOps
from tinygrad.codegen.opt.kernel import Ops, UOp
from tinygrad.helpers import CI
from tinygrad.codegen.assembly import uops_to_asmstyle, AssemblyLanguage
def float_to_hex(x):
  """Return the single-precision encoding of x as 8 upper-case hex digits, most significant byte first."""
  # ">f" fixes the byte order explicitly; native-order "f" followed by a byte reversal only
  # produces this layout on little-endian hosts.
  return struct.pack(">f", x).hex().upper()
def compute_offsets(total):
  """Split total into chunks of at most 4096, e.g. 10000 -> [4096, 4096, 1808]; a zero remainder adds no trailing chunk."""
  quotient, remainder = divmod(total, 4096)
  return [4096]*quotient + ([remainder] if remainder else [])
#NOTE: Darwin needs names to start with a "_"
def get_name(name):
  """Return the symbol to emit for name: prefixed with '_' when platform.system() is 'Darwin', unchanged otherwise."""
  if system() == 'Darwin': return '_' + name
  return name
# ARM64 flavour of AssemblyLanguage: adds nothing and overrides nothing, so every flag keeps the base-class default.
class ARM64Language(AssemblyLanguage): pass
# Translate the AssemblyInstruction list `asm` into ARM64 assembly text for function fn_nm.
# Registers are assigned from live ranges; values that do not get a register are kept in stack slots.
# Returns the whole assembly source as a single newline-joined string.
def specialize_to_arm64(fn_nm, asm):
# Stack bytes reserved for the function; grows by 16 for every spilled variable.
var_size = 16
prev_uop:Optional[Ops] = None
ins = []
# Free-register pools; registers are handed out with pop() and returned with append().
x_regs = ['x' + str(i) for i in reversed(range(12))]
s_regs = ['s' + str(i) for i in reversed(range(3,32)) if i <= 7 or i >= 16]
type_to_reg = {dtypes.double: "d", dtypes.half: 'h', dtypes.float32: 's', dtypes.bool: 'w', dtypes.int8:'w', dtypes.int32: 'w', dtypes.int64: 'x', dtypes.uint8:'w', dtypes.uint32: 'w', dtypes.uint64: 'x'}
alu = {BinaryOps.ADD: "add", BinaryOps.SUB: "sub", BinaryOps.MUL: "mul", BinaryOps.DIV: "div", BinaryOps.MAX: "max",
BinaryOps.MOD: "", BinaryOps.CMPLT: "subs",
UnaryOps.NOOP: "mov", UnaryOps.NEG: "neg",
UnaryOps.SIN:'bl ' + get_name('sinf'), UnaryOps.LOG2: 'bl ' + get_name("log2f"), UnaryOps.EXP2: 'bl ' + get_name("exp2f"), UnaryOps.SQRT: 'bl ' + get_name("sqrtf"),
TernaryOps.MULACC: "madd", TernaryOps.WHERE: "fcsel"}
# Emit the instructions that place an immediate in reg: non-float values with abs(value) > 65535 are built in w15
# from two 16-bit halves, 's' registers are loaded through x15 and the stack slot at [sp, 16], anything else is a plain mov.
def mov_imm(value, reg):
# Manually move value into reg if value can't fit
if value.__class__ is not float and abs(value) > abs(65535):
ins.append(f"movz w15, #{value & 0xffff}")
ins.append(f"movk w15, #{(value >> 16) & 0xffff}, lsl #16")
ins.append(f"sxtw {reg}, w15")
elif reg[0] == 's':
ins.append(f"movz x15, 0x{float_to_hex(value)[4:]}")
ins.append(f"movk x15, 0x{float_to_hex(value)[:4]}, lsl #16")
ins.append("str x15, [sp, 16]")
ins.append(f"ldr {reg}, [sp, 16]")
else:
ins.append(f"mov {reg}, #{value}")
# Get variables intervals
# live_range[name] = [first, last] index of the instructions that mention that register name.
live_range:Dict[str, List[int]] = {}
for i, (uop, out, vin, arg) in enumerate(asm):
for var in ([v for v in [out] + vin if v is not None and v.__class__ is not int]):
live_range[var.nm] = [i,i] if var.nm not in live_range else [live_range[var.nm][0], i]
# mem_vars: register name -> stack offset of its spill slot; rtor: register name -> physical register.
mem_vars:Dict[str, int] = {}
rtor:Dict[str, str] = {}
# Give every not-yet-mapped register in mvars a physical register; when the pool is empty the variable gets a
# stack slot and a temporary register instead.
def allocate_regs(mvars):
nonlocal var_size
for v in [v for v in mvars if v is not None and v.__class__ is not int and v.nm not in rtor]:
available_regs = s_regs if dtypes.is_float(v[1]) else x_regs
#NOTE: Very simple spill, everything that don't fit in regs goes to mem
if not available_regs:
# ARM needs the stack 16-byte aligned
var_size += 16
# NOTE(review): this reads `out` from the enclosing loop, while the pool above was chosen from `v` - confirm intent.
available_regs.append('s0' if dtypes.is_float(out[1]) else 'x12')
mem_vars[v.nm] = var_size
rtor[v.nm] = available_regs.pop()
temp_floats = ['s0', 's1', 's2']
temp_ints = ['x12', 'x13', 'x16']
for i, (uop, out, vin, arg) in enumerate(asm):
# Clear regs out of interval
for var, reg in list(rtor.items()):
available_regs = s_regs if reg[0] == 's' else x_regs
if var[1] not in 'B' and var not in mem_vars and i > live_range[var][1]:
available_regs.append(rtor.pop(var))
# Assign a registers to the variables using live ranges.
allocate_regs([out] + vin)
# Assign temp regs to vin and load them before direct use
# NOTE(review): this inner loop rebinds `i`, the instruction index of the enclosing loop.
for i, v in enumerate([v for v in vin if v.__class__ is not int and v.nm in mem_vars]):
rtor[v.nm] = temp_floats[i] if dtypes.is_float(v[1]) else temp_ints[i]
# ARM64 addressing constraints https://devblogs.microsoft.com/oldnewthing/20220728-00/?p=106912
ins.append(f"mov x15, {mem_vars[v.nm]}")
ins.append(f"ldr {rtor[v.nm]}, [sp, x15]")
# Emit the instruction sequence for this uop.
if uop == Ops.SPECIAL:
if arg.startswith('data'):
# data 8 to n into the stack
if int(arg[4:]) >= 8:
ins.append(f"ldr x15, [x17, #{(int(arg[4:]) - 8) * 8}]")
ins.append(f"mov {rtor[out.nm]}, x15")
else:
ins.append(f"mov {rtor[out.nm]}, #0")
ins.append(f"loop_{arg}:")
elif uop == Ops.CAST:
if arg == BinaryOps.CMPLT:
if rtor[out.nm][0] == 's':
mov_imm(0.0, 's0')
mov_imm(1.0, 's1')
ins.append(f"fcsel {rtor[out.nm]}, s1, s0, lt")
if rtor[out.nm][0] == 'x':
mov_imm(0, 'x14')
mov_imm(1, 'x15')
ins.append(f"csel {rtor[out.nm]}, x15, x14, lt")
else:
ins.append(f"sxtw {rtor[out.nm]}, w{rtor[vin[0].nm][1:]}")
elif uop == Ops.ALU:
# An integer immediate as second operand is first placed in x15.
if len(vin)==2 and vin[1].__class__ is int: mov_imm(vin[1], 'x15')
if arg == BinaryOps.MUL and out.dtype == dtypes.bool:
ins.append(f"ands {','.join('x15' if v.__class__ is int else rtor[v.nm] for v in [out] + vin)}")
elif arg == TernaryOps.WHERE:
ins.append(f"fcmp {rtor[vin[0].nm]}, #0.0" if rtor[vin[0].nm][0] == 's' else f"cmp {rtor[vin[0].nm]}, #0")
ins.append(f"{alu[arg]} {rtor[out.nm]}, {rtor[vin[1].nm]}, {rtor[vin[2].nm]}, ne")
elif arg in [UnaryOps.LOG2, UnaryOps.SIN, UnaryOps.EXP2, UnaryOps.SQRT]:
#NOTE: Not a real instruction, use to emulate a ext call in unicorn
if CI: ins.append(f"{alu[arg]} {rtor[out.nm]} {rtor[vin[0].nm]}")
else:
# Outside CI: save the live registers on the stack, pass the argument in s0, call the function, then restore.
save_regs = [k for k in rtor.keys() if k != out.nm and k not in mem_vars]
ins.append(f"sub sp, sp, #{(len(save_regs))*16}")
# Save the registers before they are cleared by func call
# NOTE(review): this loop (and the restore loop below) also rebinds `i`.
for i,k in enumerate(save_regs,1):
ins.append(f"str {rtor[k]}, [sp, #{16*i}]")
ins.append("stp x29, x30, [sp, #0]!")
ins.append("mov x29, sp")
ins.append(f"fmov s0, {rtor[vin[0].nm]}")
ins.append(alu[arg])
ins.append(f"fmov {rtor[out.nm]}, s0")
ins.append("mov sp, x29")
ins.append("ldp x29, x30, [sp], #0")
for i,k in enumerate(save_regs,1):
ins.append(f"ldr {rtor[k]}, [sp, #{16*i}]")
ins.append(f"add sp, sp, #{len(save_regs)*16}")
elif arg == BinaryOps.CMPLT:
ins.append(f"{alu[arg]} {','.join('x15' if v.__class__ is int else rtor[v.nm] for v in [out] + vin)}" if not dtypes.is_float(vin[0][1]) else f"fcmp {rtor[vin[0].nm]}, {rtor[vin[1].nm]}")
elif arg == BinaryOps.MOD:
# a % b is emitted as udiv into x14 followed by msub (a - x14 * b).
rhs = 'x15' if vin[1].__class__ is int else rtor[vin[1].nm]
ins.append(f"udiv x14, {rtor[vin[0].nm]}, {rhs}")
ins.append(f"msub {rtor[out.nm]}, x14, {rhs}, {rtor[vin[0].nm]}")
else:
ins.append(f"{'f' if dtypes.is_float(vin[0][1]) else 's' if arg == BinaryOps.DIV else ''}{alu[arg]} {', '.join('x15' if v.__class__ is int else rtor[v.nm] for v in [out] + vin)}")
elif uop == Ops.LOAD:
if arg.__class__ in (int, float):
mov_imm(arg, rtor[out.nm])
else:
#NOTE: if need casting load var in s/h0 or x/w12 temp regs
reg_in = type_to_reg[arg[2]] + ('0' if dtypes.is_float(arg[2]) else '12') if arg[2] is not None else rtor[out.nm]
mov_imm(arg[0], "x15")
ins.append(f"add x15, {rtor[vin[0].nm]}, x15")
ins.append(f"ldr{'sb' if arg[2] is not None and arg[2] in (dtypes.int8, dtypes.uint8, dtypes.bool) else ''} {reg_in}, [x15]")
if arg[2] is not None: ins.append(f"{'fcvt' if arg[2] in [dtypes.half, dtypes.double] else 'scvtf'} {rtor[out.nm]}, {reg_in}")
elif uop == Ops.STORE:
#NOTE: if need casting load var in s/h0 or x/w12 temp regs
reg_out = (type_to_reg[arg[2]] + ('0' if dtypes.is_float(arg[2]) else '12') if arg[2] is not None else rtor[vin[1].nm])
if arg[2] is not None: ins.append(f"fcvt{'zs' if arg[2] not in [dtypes.half, dtypes.double] else '' } {reg_out}, {rtor[vin[1].nm]}")
ins.append(f"mov x15, #{arg[0]}")
ins.append(f"str {reg_out}, [{rtor[vin[0].nm]}, x15, lsl #0]")
elif uop == Ops.COND_BRANCH:
#TODO: this is a hack it shouldn't always be a cmp before a cond branch?
if prev_uop == Ops.LOAD:
ins.append(f"cmp {rtor[vin[0].nm]}, #0")
ins.append(f"b.{'lt' if arg[1] else 'ge'} {arg[0][1:]}")
elif uop == Ops.LABEL:
ins.append(f"{arg[1:]}:")
elif uop == Ops.ENDLOOP:
mov_imm(arg[0], "x15")
ins.append(f"add {rtor[vin[0].nm]}, {rtor[vin[0].nm]}, #1")
ins.append(f"cmp {rtor[vin[0].nm]}, x15")
ins.append(f"b.lt loop_{arg[1]}")
prev_uop = uop
# store regs into memory if needed
if out is not None and out.nm in mem_vars:
ins.append(f"mov x15, {mem_vars[out.nm]}")
ins.append(f"str {rtor[out.nm]}, [sp, x15]")
# Wrap the body with the header directives, the stack reservation (in steps given by compute_offsets), the matching release and ret.
return "\n".join([f"//varsize {var_size}",".arch armv8-a",".text", f".global {get_name(fn_nm)}",".p2align 2", f"{get_name(fn_nm)}:", "mov x17, sp"] + [f"sub sp, sp, #{offset}" for offset in compute_offsets(var_size)]+ ins + [f"add sp, sp, #{offset}" for offset in compute_offsets(var_size)] +["ret", "\n"])
def uops_to_arm64_asm(fn_nm:str, uops:List[UOp]) -> Tuple[str, List[int], List[int], bool]:
  """Lower uops to ARM64 assembly.

  Returns (assembly text, global sizes reversed, local sizes reversed, True).
  """
  lang = ARM64Language()
  global_size, local_size = uops_to_asmstyle(lang, fn_nm, uops)
  return specialize_to_arm64(fn_nm, lang.ins), global_size[::-1], local_size[::-1], True

View file

@ -1,105 +0,0 @@
from typing import List
import struct
from tinygrad.codegen.assembly import uops_to_asmstyle, AssemblyLanguage
from tinygrad.codegen.opt.kernel import Ops, UOp
from tinygrad import dtypes
from tinygrad.uop.ops import BinaryOps, UnaryOps, TernaryOps
from tinygrad.runtime.ops_cuda import arch
dtype_to_nvtype = {dtypes.float32: "f32", dtypes.float16: "f16", dtypes.int64: "s64", dtypes.int32: "s32", dtypes.int8: "s8", dtypes.bool: "pred", dtypes.uint64: "u64", dtypes.uint32: "u32", dtypes.uint16: "u16", dtypes.uint8: "u8", "bits16": "b16", dtypes.float64: "f64"}
def float_to_hex(x):
  """Return the IEEE-754 single-precision bit pattern of `x` as 8 uppercase hex digits.

  The most significant byte comes first, e.g. 1.0 -> "3F800000". The value is packed
  explicitly as big-endian so the result does not depend on the byte order of the
  machine running the code generator (reversing a native-order pack is only correct
  on little-endian hosts).
  """
  return struct.pack(">f", x).hex().upper()
def ptx_needs_cast(dest_dtype, src_dtype):
  """Tell whether a value of `src_dtype` must be converted before it is used as `dest_dtype`.

  A cast is needed for int -> float, for float -> int, and for float -> float when the
  two float types have different item sizes.
  """
  int_to_float = dtypes.is_float(dest_dtype) and dtypes.is_int(src_dtype)
  if int_to_float: return int_to_float
  float_to_int = dtypes.is_int(dest_dtype) and dtypes.is_float(src_dtype)
  if float_to_int: return float_to_int
  # both floats: only a width change needs a conversion
  return dtypes.is_float(src_dtype) and dtypes.is_float(dest_dtype) and dest_dtype.itemsize != src_dtype.itemsize
def render_cast(ins, inp, out):
  """Append to `ins` the PTX instruction that converts register `inp` into register `out`.

  bool -> float/int is rendered as `selp` between one and zero, anything -> bool as
  `mov.pred` (bool source) or `setp.ne` against zero, and every other pair as `cvt`
  with a rounding modifier chosen from the two dtypes.
  """
  src_is_bool = inp.dtype == dtypes.bool
  if src_is_bool and (dtypes.is_float(out.dtype) or dtypes.is_int(out.dtype)):
    # pick between the "one" and "zero" immediates using the predicate
    one_zero = '0f3F800000, 0f00000000' if dtypes.is_float(out.dtype) else '1, 0'
    ins.append(f"selp.{dtype_to_nvtype[out.dtype]} {out}, {one_zero}, {inp};")
    return
  if out.dtype == dtypes.bool:
    if src_is_bool:
      ins.append(f"mov.pred {out}, {inp};")
      return
    # non-zero test produces the predicate
    zero = '0f00000000' if dtypes.is_float(inp.dtype) else '0'
    ins.append(f"setp.ne.{dtype_to_nvtype[inp.dtype]} {out}, {zero}, {inp};")
    return
  if dtypes.is_int(out.dtype) and dtypes.is_float(inp.dtype):
    round_mod = ".rzi"
  elif dtypes.is_float(out.dtype) and (dtypes.is_int(inp.dtype) or dtypes.is_float(inp.dtype) and inp.dtype.itemsize > out.dtype.itemsize):
    round_mod = ".rz"
  else:
    round_mod = ""
  ins.append(f"cvt{round_mod}.{dtype_to_nvtype[out.dtype]}.{dtype_to_nvtype[inp.dtype]} {out}, {inp};")
# https://docs.nvidia.com/cuda/parallel-thread-execution/#
# AssemblyLanguage subclass used when generating PTX; the only thing it overrides is the
# supports_constant_folding flag, which it sets to True.
class PTXLanguage(AssemblyLanguage):
supports_constant_folding: bool = True
# Render the instruction tuples collected in `lang.ins` as PTX source text.
#   lang:          language object providing `ins` (tuples of uop, out, vin, arg), `newreg`,
#                  `type_to_letter` and `cnts` (register counts used for the .reg declarations)
#   function_name: name given to the `.visible .entry` kernel
# Returns the whole PTX program as one newline-joined string.
def specialize_to_ptx(lang, function_name):
# number of 'data*' SPECIAL args seen; becomes the number of .u64 kernel parameters
param_cnt = 0
ins = []
# op enum -> PTX mnemonic (the type suffix and modifiers are appended when rendering)
alu = {BinaryOps.ADD: "add", BinaryOps.SUB: "sub", BinaryOps.MUL: "mul", BinaryOps.DIV: "div", BinaryOps.MAX: "max",
BinaryOps.MOD: "rem", BinaryOps.CMPLT: "setp.lt", UnaryOps.SQRT: "sqrt.approx",
UnaryOps.NOOP: "mov", UnaryOps.NEG: "neg",
UnaryOps.SIN: "sin.approx", UnaryOps.LOG2: "lg2.approx", UnaryOps.EXP2: "ex2.approx.ftz",
TernaryOps.MULACC: "fma.rn", TernaryOps.WHERE: "selp"}
for uop, out, vin, arg in lang.ins:
if uop == Ops.ENDLOOP:
ins.append("bar.sync 0;")
elif uop == Ops.DEFINE_LOCAL:
# shared buffer declared in bytes: arg[1] elements times 4
ins.append(f".shared .align 4 .b8 {arg[0]}[{arg[1]*4}];")
elif uop == Ops.SPECIAL:
if arg.startswith('data'):
param_cnt += 1
ins.append(f"ld.param.u64 {out}, [{arg}];")
# TODO: we sometimes want this to be local, nvcc converts to global most of the time, not sure when we would need to?
# ins.append(f"cvta.to.global.u64 {out}, {out};")
elif arg.startswith('gid'):
# the digits after 'gid' pick the x/y/z component of the block index
ins.append(f"mov.u32 {out}, %ctaid.{'xyz'[int(arg[3:])]};")
elif arg.startswith('lid'):
# the digits after 'lid' pick the x/y/z component of the thread index
ins.append(f"mov.u32 {out}, %tid.{'xyz'[int(arg[3:])]};")
elif uop == Ops.ALU:
# a MUL producing a bool is rendered as a predicate AND
if arg == BinaryOps.MUL and out.dtype == dtypes.bool:
ins.append(f"and.pred {out}, {', '.join(str(x) for x in vin)};")
else:
# comparisons are typed by their input, everything else by the output
otype = vin[0].dtype if arg in [BinaryOps.CMPLT] else out.dtype
if arg == TernaryOps.WHERE:
# the condition operand has to be a predicate: reuse it if it already is a bool,
# otherwise compare it against zero into a fresh bool register
if vin[0].dtype == dtypes.bool:
reg = vin[0]
else:
reg = lang.newreg((vin[0], 'bool'), dtypes.bool)
ins.append(f"setp.ne.{dtype_to_nvtype[vin[0].dtype]} {reg}, {'0f00000000' if dtypes.is_float(vin[0].dtype) else '0'}, {vin[0]};")
# reorder operands so the predicate comes last
vin = vin[1:] + [reg]
ins.append(f"{alu[arg]}{'.lo' if arg == BinaryOps.MUL and out.dtype != dtypes.float32 else ''}{'.rn' if arg == BinaryOps.DIV and out.dtype == dtypes.float32 else ''}.{dtype_to_nvtype[otype]} {out}, {', '.join(str(x) for x in vin)};")
elif uop == Ops.LOAD:
# a plain int/float arg is an immediate: move it into the register
if arg.__class__ in (int, float):
ins.append(f"mov.{dtype_to_nvtype[out.dtype]} {out}, {'0f'+float_to_hex(arg) if dtypes.is_float(out.dtype) else int(arg)};")
# otherwise arg is indexed as (offset, state-space string, memory dtype or None);
# when the memory dtype is bool or differs from the register dtype, load into a temporary and cast
elif arg[2] is not None and (arg[2] == dtypes.bool or arg[2] != out.dtype):
dt = ('u16', dtypes.uint16) if arg[2] == dtypes.bool == out.dtype else ('u8', dtypes.uint8) if arg[2] == dtypes.bool else ('b16', dtypes.float16) if arg[2] == dtypes.half else (dtype_to_nvtype[arg[2]], arg[2])
reg = lang.newreg((out, dt[0]), dtype=dt[1])
ins.append(f"ld.{arg[1]}.{dt[0]} {reg}, [{vin[0]}{f'+{arg[0]}' if arg[0] is not None else ''}];")
render_cast(ins, reg, out)
else:
# a memory dtype of None is treated as float
ins.append(f"ld.{arg[1]}.{dtype_to_nvtype[dtypes.float if arg[2] is None else arg[2]]} {out}, [{vin[0]}{f'+{arg[0]}' if arg[0] is not None else ''}];")
elif uop == Ops.STORE:
# vin[0] is the address register and vin[1] the value; cast first when the memory dtype requires it
if ptx_needs_cast(dtypes.float if arg[2] is None else arg[2], vin[1].dtype) or arg[2] == dtypes.bool:
if arg[2] == dtypes.bool != vin[1].dtype:
# storing a non-bool value as bool: reduce it to a predicate first
prereg = lang.newreg((vin[1],'bool'), dtype=dtypes.bool)
render_cast(ins, vin[1], prereg)
else: prereg = vin[1]
reg = lang.newreg((prereg, dtypes.uint16 if arg[2] == dtypes.bool else arg[2]), dtype=dtypes.uint16 if arg[2] == dtypes.bool else dtypes.float if arg[2] is None else arg[2])
render_cast(ins, prereg, reg)
ins.append(f"st.{arg[1]}.{dtype_to_nvtype['bits16' if arg[2] == dtypes.float16 else dtypes.uint8 if arg[2] == dtypes.bool else dtypes.float if arg[2] is None else arg[2]]} [{vin[0]}{f'+{arg[0]}' if arg[0] is not None else ''}], {reg};")
else:
ins.append(f"st.{arg[1]}.{dtype_to_nvtype[dtypes.float if arg[2] is None else arg[2]]} [{vin[0]}{f'+{arg[0]}' if arg[0] is not None else ''}], {vin[1]};")
elif uop == Ops.CAST:
render_cast(ins, vin[0], out)
elif uop == Ops.LABEL:
ins.append(f"{arg}:")
elif uop == Ops.COND_BRANCH:
# arg is (label, flag): branch on the predicate, negated when the flag is falsy
ins.append(f"@{'!' if not arg[1] else ''}{vin[0]} bra {arg[0]};")
# header: PTX version, target from arch(), and the entry signature with one .u64 param per data buffer
ins_prefix = [".version 7.8", ".target " + arch(), ".address_size 64",
f".visible .entry {function_name}({', '.join(f'.param .u64 data{i}' for i in range(param_cnt))}) {{"]
# one .reg declaration per entry of lang.cnts
for arg in [(dtype, lang.type_to_letter(dtype), c) for dtype,c in lang.cnts.items()]: ins_prefix.append(f".reg .{dtype_to_nvtype[arg[0][0]]} %{arg[1]}<{arg[2]}>;",)
ins = ins_prefix + ins
ins += ["ret;", "}"]
return '\n'.join(ins)
def uops_to_ptx_asm(function_name:str, uops:List[UOp]):
  """Lower `uops` to PTX source for a kernel named `function_name`.

  Returns a 4-tuple: the PTX text, the global size reversed, the local size reversed,
  and the constant True.
  """
  ptx_lang = PTXLanguage()
  # uops_to_asmstyle fills ptx_lang with instructions and reports the launch dimensions
  global_size, local_size = uops_to_asmstyle(ptx_lang, function_name, uops)
  ptx_source = specialize_to_ptx(ptx_lang, function_name)
  return ptx_source, global_size[::-1], local_size[::-1], True

View file

@ -1,203 +0,0 @@
import yaml
from typing import Tuple, Set, Dict
from tinygrad import dtypes
from tinygrad.codegen.assembly import AssemblyCodegen, Register
from tinygrad.codegen.opt.kernel import Ops
from tinygrad.uop.ops import BinaryOps, UnaryOps, TernaryOps
from tinygrad.runtime.ops_cl import ROCM_LLVM_PATH
# ugh, is this really needed?
from extra.helpers import enable_early_exec
early_exec = enable_early_exec()
# Assembler text placed at the very start of the generated source; RDNACodegen.assemble
# appends the kernel-descriptor key/value lines right after the `.amdhsa_kernel code` line.
boilerplate_start = """
.global _start
_start:
.rodata
.align 0x10
.global code.kd
.type code.kd,STT_OBJECT
.amdhsa_kernel code"""
# Closes the kernel descriptor and opens the `.text` section; the `code:` label is
# immediately followed by the generated instructions.
code_start = """.end_amdhsa_kernel
.text
code:
"""
# https://github.com/RadeonOpenCompute/ROCm_Documentation/blob/master/ROCm_Compiler_SDK/ROCm-Codeobj-format.rst
# https://github.com/ROCm-Developer-Tools/ROCm-ComputeABI-Doc/blob/master/AMDGPU-ABI.md#initial-kernel-register-state
# RDNA3 is actually a SIMD machine!
# Code generator that turns assembly-style instruction tuples into an RDNA3 (gfx1100) binary.
# `specialize` renders the instruction text; `assemble` wraps it in the kernel descriptor
# and metadata and runs it through llvm-mc and ld.lld.
class RDNACodegen(AssemblyCodegen):
# capability flags; they are not read inside this class (presumably consumed by AssemblyCodegen — confirm)
supports_float4: bool = True
supports_float4_alu: bool = True
supports_load3: bool = True
sin_is_sin2pi: bool = True
no_div: bool = True
# Render `asm` (an iterable of (uop, out, vin, arg) tuples) and assemble it.
# Returns the tuple ('code', <result of self.assemble>).
def specialize(self, asm) -> Tuple[str, str]:
# kernel-argument metadata: one 8-byte global-buffer pointer per entry of self.bufs
args = []
for i,b in enumerate(self.bufs): args.append({'.address_space': 'global', '.name': f'buf_{i}', '.offset': i*8, '.size': 8, '.type_name': b.dtype.name+"*", '.value_kind': 'global_buffer'})
ins = []
v_cnt = 3 # v[0:2] is local_xyz
s_cnt = 5 # s[0:1] is the address, s[2:4] is global_xyz
dtype_to_rdnatype = {dtypes.float32: "f32", dtypes.int64: "i64", dtypes.int32: "i32", dtypes.uint64: "u64", dtypes.bool: "i32"}
alu = {BinaryOps.ADD: "add", BinaryOps.SUB: "sub", BinaryOps.MUL: "mul", TernaryOps.MULACC: "fma",
BinaryOps.MAX: "max", UnaryOps.RECIP: "rcp",
UnaryOps.NOOP: "mov", UnaryOps.SIN: "sin", UnaryOps.LOG2: "log", UnaryOps.EXP2: "exp",
BinaryOps.CMPLT: "cmp_lt"}
# registers written by a load that has not been waited on yet
pend_regs:Set[Register] = set()
# virtual Register -> hardware register name
rtor:Dict[Register, str] = {}
# Name of register `x` for use as an input; emits a wait first if `x` is still pending from a load.
def reg_in(x):
nonlocal pend_regs
#print("reg_in", x, rtor[x], pend_regs)
if x in pend_regs:
#print("clear")
ins.append('s_waitcnt lgkmcnt(0), vmcnt(0)')
pend_regs.clear()
return rtor[x]
# Name of register `x` for use as an output (no wait is emitted).
def reg_out(x):
return rtor[x]
for uop, out, vin, arg in asm:
if uop == Ops.DEFINE_REGISTER:
# arg[0] is unpacked into Register(...) after the name; arg[0][0] is the dtype and
# arg[0][1] selects scalar (s) vs vector (v) registers; arg[1] is the name prefix, arg[2] the count
if arg[0][0] in [dtypes.uint32, dtypes.uint64, dtypes.int64, dtypes.int32, dtypes.float32, dtypes.float.vec(4)]:
for i in range(arg[2]):
# TODO: Re-use gaps created by this to avoid wasting registers
# number of 32-bit registers one value occupies
align = int(arg[0][0].itemsize / 4)
# NOTE(review): `cnt += cnt % align` only lands on a multiple of `align` for align <= 2
# (e.g. 5 with align 4 becomes 6) — confirm whether 4-register alignment is actually required
if arg[0][1]:
s_cnt += s_cnt % align
reg_name = f"s[{s_cnt}:{s_cnt + align - 1}]" if align > 1 else f"s{s_cnt}"
s_cnt += align
else:
v_cnt += v_cnt % align
reg_name = f"v[{v_cnt}:{v_cnt + align - 1}]" if align > 1 else f"v{v_cnt}"
v_cnt += align
rtor[Register(f"%{arg[1]}{i}", *arg[0])] = reg_name
# a float4 register also gets one entry per lane so each component can be addressed on its own
if arg[0][0] == dtypes.float.vec(4):
for off in range(4):
reg_name = f"s{s_cnt-align+off}" if arg[0][1] else f"v{v_cnt-align+off}"
rtor[Register(f"%{arg[1]}{i}", dtypes.float, False, off=off)] = reg_name
elif arg[0][0] == dtypes.bool:
# every bool register maps onto the same condition register (scc or vcc_lo)
for i in range(arg[2]):
reg_name = "scc" if arg[0][1] else "vcc_lo" # `_lo` suffix since we're running wavefront_size=32
rtor[Register(f"%{arg[1]}{i}", *arg[0])] = reg_name
else:
raise NotImplementedError("DEFINE_REGISTER not implemented for arg: ", arg)
elif uop == Ops.SPECIAL:
if arg.startswith('buf'):
# load the 64-bit buffer pointer from the kernarg block addressed by s[0:1]
i = int(arg[3:])
ins.append(f's_load_b64 {reg_out(out)}, s[0:1], {i*8}')
pend_regs.add(out)
for r in out.subregs(): pend_regs.add(r)
elif arg.startswith('gid'):
# computes out = s{2+dim} * (loaded hidden_group_size value) + v{dim}
ins.append(f'v_mov_b32 {reg_out(out)}, s{2+int(arg[3])}')
# the docs lied, this is actually y
if int(arg[3]) == 2: ins.append("v_bfe_u32 v2, v0, 20, 10") # untested
if int(arg[3]) == 1: ins.append("v_bfe_u32 v1, v0, 10, 10")
elif int(arg[3]) == 0: ins.append("v_and_b32_e32 v0, 0x3ff, v0")
# get local size
offset = len(args)*8
args.append({".offset": offset, ".value_kind": f"hidden_group_size_{'xyz'[int(arg[3])]}", ".size": 8})
ins.append(f's_load_b32 s{2+int(arg[3])}, s[0:1], {offset}')
ins.append('s_waitcnt vmcnt(0) lgkmcnt(0)')
pend_regs.clear()
ins.append(f'v_mul_i32_i24 {reg_out(out)}, {reg_out(out)}, s{2+int(arg[3])}')
ins.append(f'v_add_nc_u32 {reg_out(out)}, v{int(arg[3])}, {reg_out(out)}')
elif uop == Ops.CONST:
# infinities are written as their float32 bit patterns
if arg == float('inf'): arg = "0x7f800000"
elif arg == float('-inf'): arg = "0xff800000"
if out.dtype == dtypes.float.vec(4):
# broadcast the constant into each of the four lanes
for off in range(4):
ins.append(f"{'s_' if out.scalar else 'v_'}mov_b32 {reg_out(Register(out.nm, dtypes.float, False, off=off))}, {arg}")
else:
ins.append(f"{'s_' if out.scalar else 'v_'}mov_b32 {reg_out(out)}, {arg}")
elif uop == Ops.ALU:
if arg in [BinaryOps.CMPLT]:
# compares name no destination operand here; only the inputs are rendered
ins.append(f"{'s' if out.scalar else 'v'}_{alu[arg]}_{dtype_to_rdnatype[out.dtype]} {', '.join(reg_in(x) if x.__class__ is Register else str(x) for x in vin)}")
else:
alu_arg = alu[arg]
# a multiply-accumulate whose accumulator is also the output is emitted as fmac with two sources
if arg == TernaryOps.MULACC and out == vin[2]:
alu_arg = "fmac"
vin = vin[0:2]
if out.dtype == dtypes.float.vec(4):
# float4 ALU ops are expanded lane by lane; non-float4 operands are repeated for every lane
for rr in zip(*[x.subregs() if x.dtype == dtypes.float.vec(4) else [x,x,x,x] for x in [out]+vin]):
ins.append(f"{'s_' if rr[0].scalar else 'v_'}{alu_arg}_{dtype_to_rdnatype[rr[0].dtype]} {reg_out(rr[0])}, {', '.join(reg_in(x) if x.__class__ is Register else str(x) for x in rr[1:])}")
else:
ins.append(f"{'s_' if out.scalar else 'v_'}{alu_arg}_{dtype_to_rdnatype[out.dtype] if arg != UnaryOps.NOOP else 'b32'}{'_i24' if arg == BinaryOps.MUL and out.dtype != dtypes.float32 and not out.scalar else ''} {reg_out(out)}, {', '.join(reg_in(x) if x.__class__ is Register else str(x) for x in vin)}")
elif uop == Ops.LOAD:
if out.scalar:
# swap arg order
ins.append(f's_load_b32 {reg_out(out)}, {reg_in(vin[0])}, {reg_in(vin[1])} offset:{arg[0]}')
else:
ins.append(f'global_load_{"b128" if out.dtype == dtypes.float.vec(4) else "b32"} {reg_out(out)}, {reg_in(vin[1])}, {reg_in(vin[0])} offset:{arg[0]}')
# the loaded register (and its lanes) must be waited on before its first read
pend_regs.add(out)
for r in out.subregs(): pend_regs.add(r)
elif uop == Ops.STORE:
ins.append(f'global_store_{"b128" if vin[1].dtype == dtypes.float.vec(4) else "b32"} {reg_in(vin[2])}, {reg_in(vin[1])}, {reg_in(vin[0])} offset:{arg[0]}')
elif uop == Ops.LABEL:
ins.append(f"{arg}:")
elif uop == Ops.COND_BRANCH:
# arg is (label, flag): branch on scc being 1 when the flag is truthy, else on 0
ins.append(f"s_cbranch_scc{'1' if arg[1] else '0'} {arg[0]}")
elif uop == Ops.CAST:
# only bool -> float32 is implemented
if vin[0].dtype == dtypes.bool:
if out.dtype == dtypes.float32:
ins.append(f"v_cndmask_b32 {reg_out(out)}, 0.0, 1.0, {reg_in(vin[0])}")
else:
raise NotImplementedError(f"cast {vin[0].dtype} -> {out.dtype}")
else:
raise NotImplementedError(uop)
ins += ['s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)', 's_endpgm', 's_code_end']
# dual alu group
# pair each v_fmac_f32 with a later v_fmac_f32 (searched from the end backwards) whose three
# register numbers all differ in parity, and fuse the two into one `v_dual_... :: v_dual_...` line.
# `seen` holds instruction text, so identical instruction strings are treated as one instruction.
seen = set()
new_ins = []
for i,tins in enumerate(ins):
if tins in seen: continue
if tins.startswith("v_fmac_f32"):
for gins in reversed(ins[i+1:]):
if gins in seen: continue
if gins.startswith("v_fmac_f32"):
# operand register numbers, parsed by dropping the leading letter and trailing comma
r0 = [int(x[1:].strip(',')) for x in tins.split(" ")[1:]]
r1 = [int(x[1:].strip(',')) for x in gins.split(" ")[1:]]
if r0[0]%2 == r1[0]%2: continue
if r0[1]%2 == r1[1]%2: continue
if r0[2]%2 == r1[2]%2: continue
new_ins.append(tins.replace("v_", "v_dual_")+" :: " + gins.replace("v_", "v_dual_"))
seen.add(tins)
seen.add(gins)
break
if tins not in seen:
new_ins.append(tins)
ins = new_ins
return 'code', self.assemble(args, ins, v_cnt, s_cnt)
# Build the full assembler input (kernel descriptor, instructions, YAML metadata), assemble it
# with llvm-mc and link it with ld.lld, both run through early_exec. Returns the linker output.
#   args:  kernel-argument metadata dicts; the last entry determines the kernarg segment size
#   ins:   instruction lines
#   v_cnt, s_cnt: next free vector / scalar register numbers
# NOTE(review): `args[-1]` raises IndexError when `args` is empty.
def assemble(self, args, ins, v_cnt, s_cnt):
kernel_desc = {'.amdhsa_group_segment_fixed_size': 0, '.amdhsa_private_segment_fixed_size': 0, '.amdhsa_kernarg_size': 0,
'.amdhsa_next_free_vgpr': v_cnt, # this matters!
'.amdhsa_reserve_vcc': 0, '.amdhsa_reserve_xnack_mask': 0,
'.amdhsa_next_free_sgpr': s_cnt,
'.amdhsa_float_round_mode_32': 0, '.amdhsa_float_round_mode_16_64': 0, '.amdhsa_float_denorm_mode_32': 3, '.amdhsa_float_denorm_mode_16_64': 3, '.amdhsa_dx10_clamp': 1, '.amdhsa_ieee_mode': 1,
'.amdhsa_fp16_overflow': 0, '.amdhsa_workgroup_processor_mode': 1, '.amdhsa_memory_ordered': 1, '.amdhsa_forward_progress': 0, '.amdhsa_enable_private_segment': 0,
'.amdhsa_system_sgpr_workgroup_id_x': 1, '.amdhsa_system_sgpr_workgroup_id_y': 1, '.amdhsa_system_sgpr_workgroup_id_z': 1,
'.amdhsa_system_sgpr_workgroup_info': 0, '.amdhsa_system_vgpr_workitem_id': 2, # is amdhsa_system_vgpr_workitem_id real?
'.amdhsa_exception_fp_ieee_invalid_op': 0, '.amdhsa_exception_fp_denorm_src': 0, '.amdhsa_exception_fp_ieee_div_zero': 0, '.amdhsa_exception_fp_ieee_overflow': 0, '.amdhsa_exception_fp_ieee_underflow': 0,
'.amdhsa_exception_fp_ieee_inexact': 0, '.amdhsa_exception_int_div_zero': 0, '.amdhsa_user_sgpr_dispatch_ptr': 0, '.amdhsa_user_sgpr_queue_ptr': 0, '.amdhsa_user_sgpr_kernarg_segment_ptr': 1,
'.amdhsa_user_sgpr_dispatch_id': 0, '.amdhsa_user_sgpr_private_segment_size': 0, '.amdhsa_wavefront_size32': 1, '.amdhsa_uses_dynamic_stack': 0}
metadata = {'amdhsa.kernels': [{'.args': args,
'.group_segment_fixed_size': 0, '.kernarg_segment_align': 8, '.kernarg_segment_size': args[-1][".offset"] + args[-1][".size"],
'.language': 'OpenCL C', '.language_version': [1, 2], '.max_flat_workgroup_size': 256,
'.name': 'code', '.private_segment_fixed_size': 0, '.sgpr_count': s_cnt, '.sgpr_spill_count': 0,
'.symbol': 'code.kd', '.uses_dynamic_stack': False, '.vgpr_count': v_cnt, '.vgpr_spill_count': 0,
'.wavefront_size': 32}],
'amdhsa.target': 'amdgcn-amd-amdhsa--gfx1100', 'amdhsa.version': [1, 2]}
# layout: boilerplate, one "key value" line per descriptor entry, code label, instructions, YAML metadata
code = boilerplate_start + "\n" + '\n'.join("%s %d" % x for x in kernel_desc.items()) + "\n" + code_start + '\n'.join(ins) + "\n.amdgpu_metadata\n" + yaml.dump(metadata) + ".end_amdgpu_metadata"
# assemble to an object, then link; each tool reads stdin and its output is fed to the next step
obj = early_exec(([ROCM_LLVM_PATH / "llvm-mc", '--arch=amdgcn', '--mcpu=gfx1100', '--triple=amdgcn-amd-amdhsa', '--filetype=obj', '-'], code.encode("utf-8")))
asm = early_exec(([ROCM_LLVM_PATH / "ld.lld", "/dev/stdin", "-o", "/dev/stdout", "--pie"], obj))
return asm

View file

@ -1,23 +0,0 @@
#!/usr/bin/env python3
import numpy as np
from tinygrad.runtime.ops_cuda import CUDAProgram, RawCUDABuffer
# Smoke test: build a hand-written PTX kernel, run it on a 10-element float32 buffer and print the result.
# The kernel stores the 32-bit pattern 0x40000000 (2.0 as a float) at the address passed as parameter x.
if __name__ == "__main__":
# zero-initialised buffer the kernel writes into
test = RawCUDABuffer.fromCPU(np.zeros(10, np.float32))
prg = CUDAProgram("test", """
.version 7.8
.target sm_86
.address_size 64
.visible .entry test(.param .u64 x) {
.reg .b32 %r<2>;
.reg .b64 %rd<3>;
ld.param.u64 %rd1, [x];
cvta.to.global.u64 %rd2, %rd1;
mov.u32 %r1, 0x40000000; // 2.0 in float
st.global.u32 [%rd2], %r1;
ret;
}""", binary=True)
# launched with [1] and [1] as the two size arguments, i.e. a single invocation
prg([1], [1], test)
print(test.toCPU())

View file

@ -0,0 +1,136 @@
import os, sys, struct
sys.path.append(os.getcwd())
# PROFILE=1 to use
#os.environ["PROFILE"] = "1"
os.environ["SQTT"] = "1"
os.environ["SQTT_ITRACE_SE_MASK"] = "1"
os.environ["SQTT_LIMIT_SE"] = "1"
import xml.etree.ElementTree as ET
from tinygrad import nn, Tensor, Device
from tinygrad.helpers import get_single_element
from tinygrad.runtime.support.elf import elf_loader
from tinygrad.runtime.ops_amd import ProfileSQTTEvent
from extra.sqtt.attempt_sqtt_parse import parse_sqtt_print_packets
# Decode the machine code in `text` using the machine-readable ISA description `root`
# and print one line per instruction (offset, raw bits, mnemonic, operands, leftover fields).
#   text: bytes-like machine code; read 4 bytes at a time with struct format "I" (native byte order)
#   root: parsed ISA XML (expects ./ISA/Encodings, ./ISA/Instructions and ./ISA/OperandTypes)
# Raises RuntimeError when no encoding matches the first word of an instruction.
def disassemble(text, root:ET.Element):
i = 0
while i < len(text):
ins = struct.unpack("I", text[i:i+4])[0]
# 1. Get the encoding
# the first encoding whose masked identifier equals the masked word wins; `enc_el` and `mask`
# deliberately keep their values from the matching iteration after the loop breaks
did_match = False
for enc_el in root.findall("./ISA/Encodings/Encoding"):
mask = enc_el.findtext("EncodingIdentifierMask")
assert len(mask)%32 == 0
bit_mask = int(mask, 2)
iden = [int(x.text, 2) for x in enc_el.find("EncodingIdentifiers").findall("EncodingIdentifier")]
for ide in iden:
if ins&bit_mask == ide:
did_match = True
break
if did_match: break
if not did_match: raise RuntimeError(f"unknown instruction {ins:08X}")
# the mask string has one character per bit, so its length gives the instruction width;
# wider encodings pull in the following word(s) as the higher bits
if len(mask) >= 64: ins = (struct.unpack("I", text[i+4:i+8])[0]<<32) | ins
if len(mask) >= 96: ins = (struct.unpack("I", text[i+8:i+12])[0]<<64) | ins
encoding_name = enc_el.findtext("EncodingName")
#print(ET.tostring(enc_el).decode())
# 2. Parse the Fields for this Encoding
field_data = {}
for field in enc_el.findall("MicrocodeFormat/BitMap/Field"):
# Fields can be split into multiple ranges (RangeCount > 1)
# ranges are concatenated in 'Order', the first range providing the lowest bits
ranges = sorted(field.findall("BitLayout/Range"), key=lambda x: int(x.attrib.get('Order')))
val = 0
current_shift = 0
for rng in ranges:
width = int(rng.find("BitCount").text)
chunk = (ins >> int(rng.find("BitOffset").text)) & ((1 << width) - 1)
val |= (chunk << current_shift)
current_shift += width
field_data[field.find("FieldName").text] = val
# this is already used
del field_data["ENCODING"]
# 3. Extract the instruction
# look for an instruction that has this encoding and whose opcode equals the OP field;
# `ins_el`, `ins_name` and `ins_enc` keep the values of the match after the loops break
did_match = False
for ins_el in root.findall("./ISA/Instructions/Instruction"):
ins_name = ins_el.findtext("InstructionName")
for ins_enc in ins_el.findall("InstructionEncodings/InstructionEncoding"):
if ins_enc.findtext("EncodingName") == encoding_name:
opcode = int(ins_enc.findtext("Opcode"))
if "OP" in field_data and opcode == field_data["OP"]:
did_match = True
del field_data["OP"]
break
if did_match: break
if did_match: break
#print(ET.tostring(ins_enc).decode())
#print()
#print(field_data)
if not did_match:
# unknown opcode for a known encoding: print only the raw bits and the encoding name
print(f"{i:4X} : {ins:16x} -- {encoding_name}")
elif did_match:
params = []
#print(ET.tostring(ins_el).decode())
# 4. Extract the opcodes
# each operand that names a field is rendered from that field's value and the field is then removed
for op_ins in ins_enc.findall("Operands/Operand"):
op_type = op_ins.findtext("OperandType")
op_size = op_ins.findtext("OperandSize")
op_fmt = op_ins.findtext("DataFormatName")
op_field_name = op_ins.findtext("FieldName")
if op_field_name is None: continue
assert op_field_name in field_data
# loop through operands for compare
for op_el in root.findall("./ISA/OperandTypes/OperandType"):
test_op_type = op_el.findtext("OperandTypeName")
val_dict = {}
for op_val in op_el.findall("OperandPredefinedValues/PredefinedValue"):
val_dict[int(op_val.findtext("Value"))] = op_val.findtext("Name")
if op_type == test_op_type:
# use the predefined name when the value has one, otherwise show "<type>(<value>)"
if field_data[op_field_name] in val_dict:
# NOTE(review): this print looks like leftover debug output
print(op_type, op_size, op_fmt)
params.append(val_dict[field_data[op_field_name]])
else:
params.append(f"{op_type}({field_data[op_field_name]})")
del field_data[op_field_name]
#print(op_type, op_size, op_fmt, op_el, op_field_name,
# field_data[op_field_name],
# val_dict.get(field_data[op_field_name], "<UNK>"))
#print(ET.tostring(op_el).decode())
# whatever is still in field_data was not consumed as an operand and is printed as-is
print(f"{i:4X} : {ins:16x} -- {ins_name.lower()} {', '.join(params)}", field_data)
# advance
# one mask character per bit, so this is the instruction size in bytes
i += len(mask) // 8
#print(ET.tostring(root).decode())
# Script entry point: fetch the machine-readable RDNA3.5 ISA XML, lower and run a tiny kernel
# on the AMD device, print the first SQTT trace, and disassemble the start of the kernel's .text.
if __name__ == "__main__":
# human readable manual at https://docs.amd.com/v/u/en-US/rdna35_instruction_set_architecture
fns = nn.state.zip_extract(Tensor.from_url("https://gpuopen.com/download/machine-readable-isa/latest/"))
xml_str = fns['amdgpu_isa_rdna3_5.xml'].to("CPU").data()
# keep a copy of the XML on disk
with open("/tmp/rdna35.xml", "wb") as f: f.write(bytes(xml_str))
root = ET.fromstring(xml_str)
# small elementwise kernel used as the thing to trace
a = Tensor.empty(16)+1
for ei in a.schedule():
ei.lower()
# get text
_, hdr, _ = elf_loader(ei.prg.lib)
text = get_single_element([x for x in hdr if x.name==".text"]).content
# llvm disassembler
Device["AMD"].compiler.disassemble(ei.prg.lib)
# run program
ei.run()
# collect the SQTT profile events recorded on the AMD device
sqtt_events = [e for e in Device["AMD"].profile_events if isinstance(e, ProfileSQTTEvent)]
for e in sqtt_events[0:1]: # only the first SE
parse_sqtt_print_packets(e.blob)
# only the first 0x40 bytes of the kernel are decoded
disassemble(text[:0x40], root)

View file

@ -0,0 +1,15 @@
from tinygrad import Tensor, nn
import xml.etree.ElementTree as ET
if __name__ == "__main__":
  # human readable manual at https://docs.amd.com/v/u/en-US/rdna35_instruction_set_architecture
  # fetch the machine-readable ISA archive and parse the RDNA3.5 description
  fns = nn.state.zip_extract(Tensor.from_url("https://gpuopen.com/download/machine-readable-isa/latest/"))
  xml_str = fns['amdgpu_isa_rdna3_5.xml'].to("CPU").data()
  root = ET.fromstring(xml_str)
  # print each operand type next to its {encoded value: predefined name} table
  for op_el in root.findall("./ISA/OperandTypes/OperandType"):
    op_name = op_el.findtext("OperandTypeName")
    predefined = op_el.findall("OperandPredefinedValues/PredefinedValue")
    val_dict = {int(op_val.findtext("Value")): op_val.findtext("Name") for op_val in predefined}
    print(op_name, val_dict)

View file

@ -1,42 +0,0 @@
import numpy as np
from PIL import Image
from pathlib import Path
import sys
cwd = Path.cwd()
sys.path.append(cwd.as_posix())
sys.path.append((cwd / 'test').as_posix())
from extra.datasets import fetch_mnist
from tqdm import trange
def augment_img(X, rotate=10, px=3):
  """Return an array shaped like `X` holding a randomly distorted copy of every image.

  Each image is rotated by a random integer angle drawn from [-rotate, rotate) and then
  warped with a quad transform whose corner coordinates are each shifted by a random
  integer drawn from [-px, px). Randomness comes from the global numpy RNG.
  """
  augmented = np.zeros_like(X)
  for idx in trange(len(X)):
    img = Image.fromarray(X[idx])
    angle = np.random.randint(-rotate, rotate)
    img = img.rotate(angle, resample=Image.BICUBIC)
    w, h = X.shape[1:]
    # corner order: upper left, lower left, lower right, upper right
    jitter = np.random.randint(-px, px, size=(8))
    corners = np.array([0, 0, 0, h, w, h, w, 0])
    augmented[idx] = img.transform((w, h), Image.QUAD, jitter + corners, resample=Image.BICUBIC)
  return augmented
# Demo entry point: show original vs. augmented MNIST digits side by side, then write
# animated GIFs of augmented variants for ten training images.
if __name__ == "__main__":
import matplotlib.pyplot as plt
X_train, Y_train, X_test, Y_test = fetch_mnist()
# reshape the flat rows into 28x28 uint8 images
X_train = X_train.reshape(-1, 28, 28).astype(np.uint8)
X_test = X_test.reshape(-1, 28, 28).astype(np.uint8)
# ten copies of the first training image followed by ten copies of the second
X = np.vstack([X_train[:1]]*10+[X_train[1:2]]*10)
# top row: originals, bottom row: augmented versions
fig, a = plt.subplots(2,len(X))
Xaug = augment_img(X)
for i in range(len(X)):
a[0][i].imshow(X[i], cmap='gray')
a[1][i].imshow(Xaug[i],cmap='gray')
a[0][i].axis('off')
a[1][i].axis('off')
plt.show()
# create some GIFs for the docs: each file holds the original frame followed by 100 augmented frames
for i in range(10):
im = Image.fromarray(X_train[7353+i])
im_aug = [Image.fromarray(x) for x in augment_img(np.array([X_train[7353+i]]*100))]
im.save(f"aug{i}.gif", save_all=True, append_images=im_aug, duration=100, loop=0)

View file

@ -1,39 +0,0 @@
from typing import List, Dict, cast
import ctypes
from tinygrad.helpers import dedup, cpu_time_execution, DEBUG
from tinygrad.engine.jit import GraphRunner, GraphException
from tinygrad.device import Buffer, Device
from tinygrad.engine.realize import ExecItem, CompiledRunner
from tinygrad.uop.ops import Variable
from tinygrad.runtime.ops_cpu import ClangProgram
from tinygrad.renderer.cstyle import ClangRenderer
render_dtype = ClangRenderer().render_dtype
# Graph runner that compiles every kernel of a JIT cache into one C function named "batched",
# so a whole captured graph runs with a single call.
class ClangGraph(GraphRunner):
# Build and compile the batched program.
# Raises GraphException unless every cached item's program is a CompiledRunner.
def __init__(self, jit_cache: List[ExecItem], input_rawbuffers: List[Buffer], var_vals: Dict[str, int]):
super().__init__(jit_cache, input_rawbuffers, var_vals)
if not all(isinstance(ji.prg, CompiledRunner) for ji in jit_cache): raise GraphException
# source of every distinct kernel, each included once
prgs = '\n'.join(dedup([cast(CompiledRunner, ji.prg).p.src for ji in jit_cache]))
# signature of batched(): one pointer per input buffer, then the variables as ints in sorted order
args = [f"{render_dtype(x.dtype)}* arg{i}" for i,x in enumerate(input_rawbuffers)]
args += sorted([f"int {v}" for v in var_vals])
code = ["void batched("+','.join(args)+") {"]
for ji in jit_cache:
args = []
for buf in ji.bufs:
assert buf is not None
if buf in input_rawbuffers:
# input buffers are forwarded through the matching batched() parameter
args.append(f"arg{input_rawbuffers.index(buf)}")
else:
# every other buffer is baked into the source as a literal address
args.append(f"({render_dtype(buf.dtype)}*)0x{ctypes.addressof(buf._buf):X}")
args += [x.expr for x in cast(CompiledRunner, ji.prg).p.vars]
code.append(f" {cast(CompiledRunner, ji.prg).p.function_name}({','.join(args)});")
code.append("}")
if DEBUG >= 4: print("\n".join(code))
compiler = Device["CPU"].compiler
assert compiler is not None
self._prg = ClangProgram("batched", compiler.compile(prgs+"\n"+"\n".join(code))) # no point in caching the pointers
# Run the batched program with the raw buffers followed by the variable values ordered by
# variable name (the same order used for the int parameters above).
# Returns the result of cpu_time_execution, with timing enabled when `wait` is true.
def __call__(self, rawbufs: List[Buffer], var_vals: Dict[str, int], wait=False):
return cpu_time_execution(
lambda: self._prg(*[x._buf for x in rawbufs], *[x[1] for x in sorted(var_vals.items(), key=lambda x: x[0])]), enable=wait)

View file

@ -1,27 +0,0 @@
import ctypes
from typing import Tuple
import tinygrad.runtime.autogen.hip as hip
from tinygrad.helpers import init_c_var, time_execution_cuda_style
from tinygrad.runtime.ops_hip import check, hip_set_device
from tinygrad.runtime.graph.cuda import CUDAGraph
# TODO: this is only used in graph
def hip_time_execution(cb, enable=False):
  """Run `cb` through time_execution_cuda_style using the HIP event API.

  `enable` is forwarded unchanged; the return value is whatever
  time_execution_cuda_style returns.
  """
  # event type followed by the create/record/synchronize/destroy/elapsed-time functions
  hip_event_api = (hip.hipEvent_t, hip.hipEventCreate, hip.hipEventRecord,
                   hip.hipEventSynchronize, hip.hipEventDestroy, hip.hipEventElapsedTime)
  return time_execution_cuda_style(cb, *hip_event_api, enable=enable)
# CUDAGraph subclass in which every overridden method calls the corresponding HIP graph API.
class HIPGraph(CUDAGraph):
# destroy the graph and its executable instance if they were created
def __del__(self):
if hasattr(self, 'graph'): check(hip.hipGraphDestroy(self.graph))
if hasattr(self, 'instance'): check(hip.hipGraphExecDestroy(self.instance))
def set_device(self): hip_set_device(self.dev)
# returns the device pointer type and the tuple (1,2,3); how the tuple is interpreted is not shown in this class
def encode_args_info(self): return (hip.hipDeviceptr_t, (1,2,3))
def graph_create(self): return init_c_var(hip.hipGraph_t(), lambda x: check(hip.hipGraphCreate(ctypes.byref(x), 0)))
def graph_instantiate(self, graph):
return init_c_var(hip.hipGraphExec_t(), lambda x: check(hip.hipGraphInstantiate(ctypes.byref(x), graph, None, None, 0)))
# the dependency count is derived from the byte size of c_deps assuming 8 bytes per entry
def graph_add_kernel_node(self, graph, c_deps, c_params):
return init_c_var(hip.hipGraphNode_t(), lambda x: check(hip.hipGraphAddKernelNode(ctypes.byref(x), graph, c_deps, ctypes.sizeof(c_deps)//8 if c_deps else 0, ctypes.byref(c_params)))) # noqa: E501
# launch goes through hip_time_execution, with timing enabled when `wait` is true
def graph_launch(self, *args, wait=False): return hip_time_execution(lambda: check(hip.hipGraphLaunch(*args)), enable=wait)
def graph_exec_kernel_node_set_params(self, *args): return check(hip.hipGraphExecKernelNodeSetParams(*args))
# positional order passed to hipKernelNodeParams: block dim, config, function pointer, grid dim, None, 0
def build_kernel_node_params(self, prg, global_size, local_size, c_config):
return hip.hipKernelNodeParams(hip.dim3(*local_size), c_config, ctypes.cast(prg.clprg.prg, ctypes.c_void_p), hip.dim3(*global_size), None, 0)
# local_size fills blockDim.x/y/z and global_size fills gridDim.x/y/z
def set_kernel_node_launch_dims(self, node, global_size: Tuple[int, int, int], local_size: Tuple[int, int, int]):
node.blockDim.x, node.blockDim.y, node.blockDim.z, node.gridDim.x, node.gridDim.y, node.gridDim.z = *local_size, *global_size

View file

@ -1,143 +0,0 @@
import ctypes, collections
import tinygrad.runtime.autogen.hsa as hsa
from tinygrad.helpers import init_c_var
def check(status):
  """Raise RuntimeError if `status` is a non-zero HSA status code; return None otherwise.

  The message contains the numeric status and the description filled in by
  hsa.hsa_status_string.
  """
  if status == 0: return
  # hsa_status_string points this at the description text
  status_str = ctypes.POINTER(ctypes.c_char)()
  hsa.hsa_status_string(status, ctypes.byref(status_str))
  raise RuntimeError(f"HSA Error {status}: {ctypes.string_at(status_str).decode()}")
# Precalculated AQL info
# size in bytes of one packet, taken from the kernel dispatch packet struct
AQL_PACKET_SIZE = ctypes.sizeof(hsa.hsa_kernel_dispatch_packet_t)
# default-constructed signal, used wherever a packet has no signal to reference
EMPTY_SIGNAL = hsa.hsa_signal_t()
# setup word of a dispatch packet: the value 3 placed in the dimensions field
DISPATCH_KERNEL_SETUP = 3 << hsa.HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS
# header of a kernel dispatch packet: barrier bit set, system-scope acquire and release fences
DISPATCH_KERNEL_HEADER = 1 << hsa.HSA_PACKET_HEADER_BARRIER
DISPATCH_KERNEL_HEADER |= hsa.HSA_FENCE_SCOPE_SYSTEM << hsa.HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE
DISPATCH_KERNEL_HEADER |= hsa.HSA_FENCE_SCOPE_SYSTEM << hsa.HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE
DISPATCH_KERNEL_HEADER |= hsa.HSA_PACKET_TYPE_KERNEL_DISPATCH << hsa.HSA_PACKET_HEADER_TYPE
# header of a barrier-AND packet: same barrier bit and fences, different packet type
BARRIER_HEADER = 1 << hsa.HSA_PACKET_HEADER_BARRIER
BARRIER_HEADER |= hsa.HSA_FENCE_SCOPE_SYSTEM << hsa.HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE
BARRIER_HEADER |= hsa.HSA_FENCE_SCOPE_SYSTEM << hsa.HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE
BARRIER_HEADER |= hsa.HSA_PACKET_TYPE_BARRIER_AND << hsa.HSA_PACKET_HEADER_TYPE
# Wrapper around an HSA hardware queue that writes packets directly into the queue's ring
# buffer and tracks the write position and the number of free packet slots itself.
class AQLQueue:
# Create the hardware queue on `device.agent`.
#   sz: requested queue size; -1 uses the agent's maximum, otherwise the smaller of `sz` and the maximum
def __init__(self, device, sz=-1):
self.device = device
check(hsa.hsa_agent_get_info(self.device.agent, hsa.HSA_AGENT_INFO_QUEUE_MAX_SIZE, ctypes.byref(max_queue_size := ctypes.c_uint32())))
queue_size = min(max_queue_size.value, sz) if sz != -1 else max_queue_size.value
# null function pointer passed as the queue's callback argument
null_func = ctypes.CFUNCTYPE(None, hsa.hsa_status_t, ctypes.POINTER(hsa.struct_hsa_queue_s), ctypes.c_void_p)()
self.hw_queue = init_c_var(ctypes.POINTER(hsa.hsa_queue_t)(), lambda x: check(
hsa.hsa_queue_create(self.device.agent, queue_size, hsa.HSA_QUEUE_TYPE_SINGLE, null_func, None, (1<<32)-1, (1<<32)-1, ctypes.byref(x))))
# total number of packets submitted so far; also the next write index stored to the queue
self.next_doorbell_index = 0
self.queue_base = self.hw_queue.contents.base_address
self.queue_size = self.hw_queue.contents.size * AQL_PACKET_SIZE # in bytes
# address where the next packet will be written
self.write_addr = self.queue_base
self.write_addr_end = self.queue_base + self.queue_size - 1 # precalc saves some time
self.available_packet_slots = self.hw_queue.contents.size
check(hsa.hsa_amd_queue_set_priority(self.hw_queue, hsa.HSA_AMD_QUEUE_PRIORITY_HIGH))
check(hsa.hsa_amd_profiling_set_profiler_enabled(self.hw_queue, 1))
# destroy the hardware queue if __init__ got far enough to create it
def __del__(self):
if hasattr(self, 'hw_queue'): check(hsa.hsa_queue_destroy(self.hw_queue))
# Write one kernel dispatch packet at the current write position and submit it.
# The grid size is global_size * local_size per dimension.
def submit_kernel(self, prg, global_size, local_size, kernargs, completion_signal=None):
if self.available_packet_slots == 0: self._wait_queue()
packet = hsa.hsa_kernel_dispatch_packet_t.from_address(self.write_addr)
packet.workgroup_size_x = local_size[0]
packet.workgroup_size_y = local_size[1]
packet.workgroup_size_z = local_size[2]
packet.reserved0 = 0
packet.grid_size_x = global_size[0] * local_size[0]
packet.grid_size_y = global_size[1] * local_size[1]
packet.grid_size_z = global_size[2] * local_size[2]
packet.private_segment_size = prg.private_segment_size
packet.group_segment_size = prg.group_segment_size
packet.kernel_object = prg.handle
packet.kernarg_address = kernargs
packet.reserved2 = 0
packet.completion_signal = completion_signal if completion_signal else EMPTY_SIGNAL
# setup and header are written after all other fields (presumably so the packet only becomes valid once complete — confirm)
packet.setup = DISPATCH_KERNEL_SETUP
packet.header = DISPATCH_KERNEL_HEADER
self._submit_packet()
# Write one barrier-AND packet depending on up to 5 `wait_signals` and submit it.
def submit_barrier(self, wait_signals=None, completion_signal=None):
assert wait_signals is None or len(wait_signals) <= 5
if self.available_packet_slots == 0: self._wait_queue()
packet = hsa.hsa_barrier_and_packet_t.from_address(self.write_addr)
packet.reserved0 = 0
packet.reserved1 = 0
# unused dependency slots are filled with the empty signal
for i in range(5):
packet.dep_signal[i] = wait_signals[i] if wait_signals and len(wait_signals) > i else EMPTY_SIGNAL
packet.reserved2 = 0
packet.completion_signal = completion_signal if completion_signal else EMPTY_SIGNAL
packet.header = BARRIER_HEADER
self._submit_packet()
# Copy `packet_cnt` already-built packets from `packet_addr` into the ring buffer and submit them.
def blit_packets(self, packet_addr, packet_cnt):
if self.available_packet_slots < packet_cnt: self._wait_queue(packet_cnt)
# packets that fit before the end of the ring; any remainder wraps around to the base
tail_blit_packets = min((self.queue_base + self.queue_size - self.write_addr) // AQL_PACKET_SIZE, packet_cnt)
rem_packet_cnt = packet_cnt - tail_blit_packets
ctypes.memmove(self.write_addr, packet_addr, AQL_PACKET_SIZE * tail_blit_packets)
if rem_packet_cnt > 0: ctypes.memmove(self.queue_base, packet_addr + AQL_PACKET_SIZE * tail_blit_packets, AQL_PACKET_SIZE * rem_packet_cnt)
self._submit_packet(packet_cnt)
# Submit a barrier carrying a fresh completion signal and block until that signal drops below 1;
# afterwards every slot is counted as free again.
def wait(self):
self.submit_barrier([], finish_signal := self.device.alloc_signal(reusable=True))
hsa.hsa_signal_wait_scacquire(finish_signal, hsa.HSA_SIGNAL_CONDITION_LT, 1, (1 << 64) - 1, hsa.HSA_WAIT_STATE_ACTIVE)
self.available_packet_slots = self.queue_size // AQL_PACKET_SIZE
# Spin until at least `need_packets` slots are free, recomputing the free count from the queue's read index.
def _wait_queue(self, need_packets=1):
while self.available_packet_slots < need_packets:
rindex = hsa.hsa_queue_load_read_index_relaxed(self.hw_queue)
self.available_packet_slots = self.queue_size // AQL_PACKET_SIZE - (self.next_doorbell_index - rindex)
# Account for `cnt` newly written packets: publish the new write index, store the index of the
# last written packet to the doorbell signal, and advance the write address, wrapping at the end of the ring.
def _submit_packet(self, cnt=1):
self.available_packet_slots -= cnt
self.next_doorbell_index += cnt
hsa.hsa_queue_store_write_index_relaxed(self.hw_queue, self.next_doorbell_index)
hsa.hsa_signal_store_screlease(self.hw_queue.contents.doorbell_signal, self.next_doorbell_index-1)
self.write_addr += AQL_PACKET_SIZE * cnt
if self.write_addr > self.write_addr_end:
self.write_addr = self.queue_base + (self.write_addr - self.queue_base) % self.queue_size
# Enumerate HSA agents and return a defaultdict(list) mapping device-type value -> list of agents.
# Agents whose device type query returns a non-zero status are left out without raising.
def scan_agents():
agents = collections.defaultdict(list)
# callback invoked by hsa_iterate_agents for each agent; always reports success
@ctypes.CFUNCTYPE(hsa.hsa_status_t, hsa.hsa_agent_t, ctypes.c_void_p)
def __scan_agents(agent, data):
status = hsa.hsa_agent_get_info(agent, hsa.HSA_AGENT_INFO_DEVICE, ctypes.byref(device_type := hsa.hsa_device_type_t()))
if status == 0: agents[device_type.value].append(agent)
return hsa.HSA_STATUS_SUCCESS
# NOTE(review): the status returned by hsa_iterate_agents itself is not checked
hsa.hsa_iterate_agents(__scan_agents, None)
return agents
def find_memory_pool(agent, segtyp=-1, location=-1):
  # Returns the first memory pool of `agent` with non-zero size that matches the requested segment type
  # and location. A negative segtyp/location disables that filter. If nothing matches, the
  # default-constructed hsa_amd_memory_pool_t is returned unchanged.
  def _pick_first_match(mem_pool, data):
    segment = hsa.hsa_amd_segment_t()
    check(hsa.hsa_amd_memory_pool_get_info(mem_pool, hsa.HSA_AMD_MEMORY_POOL_INFO_SEGMENT, ctypes.byref(segment)))
    if segtyp >= 0 and segment.value != segtyp:
      return hsa.HSA_STATUS_SUCCESS

    loc = hsa.hsa_amd_memory_pool_location_t()
    check(hsa.hsa_amd_memory_pool_get_info(mem_pool, hsa.HSA_AMD_MEMORY_POOL_INFO_LOCATION, ctypes.byref(loc)))
    if location >= 0 and loc.value != location:
      return hsa.HSA_STATUS_SUCCESS

    sz = ctypes.c_size_t()
    check(hsa.hsa_amd_memory_pool_get_info(mem_pool, hsa.HSA_AMD_MEMORY_POOL_INFO_SIZE, ctypes.byref(sz)))
    if sz.value == 0:
      return hsa.HSA_STATUS_SUCCESS

    # Write the match through the user-data pointer and stop the iteration.
    out = ctypes.cast(data, ctypes.POINTER(hsa.hsa_amd_memory_pool_t))
    out[0] = mem_pool
    return hsa.HSA_STATUS_INFO_BREAK

  callback = ctypes.CFUNCTYPE(hsa.hsa_status_t, hsa.hsa_amd_memory_pool_t, ctypes.c_void_p)(_pick_first_match)
  region = hsa.hsa_amd_memory_pool_t()
  hsa.hsa_amd_agent_iterate_memory_pools(agent, callback, ctypes.byref(region))
  return region

View file

@ -1,171 +0,0 @@
import ctypes, collections, time, itertools
from typing import List, Any, Dict, cast, Optional, Tuple
from tinygrad.helpers import init_c_var, round_up
from tinygrad.device import Buffer, BufferSpec
from tinygrad.device import Compiled, Device
from tinygrad.uop.ops import Variable
from tinygrad.runtime.ops_hsa import HSADevice, PROFILE, Profiler
from tinygrad.engine.realize import ExecItem, BufferXfer, CompiledRunner
from tinygrad.engine.jit import MultiGraphRunner, GraphException
import tinygrad.runtime.autogen.hsa as hsa
from tinygrad.runtime.support.hsa import check, AQLQueue, AQL_PACKET_SIZE, EMPTY_SIGNAL
def dedup_signals(signals):
  # Drops everything that is not an hsa_signal_t and returns one fresh hsa_signal_t per distinct handle.
  unique_handles = {sig.handle for sig in signals if isinstance(sig, hsa.hsa_signal_t)}
  return [hsa.hsa_signal_t(handle) for handle in unique_handles]
class VirtAQLQueue(AQLQueue):
  # A queue backed by a plain ctypes array of dispatch packets instead of a hardware queue.
  # Packets are recorded here by the inherited submit_* methods and later copied into a real queue.
  def __init__(self, device, sz):
    # NOTE(review): AQLQueue.__init__ is not called here — presumably to avoid creating a hardware queue; confirm.
    self.device = device
    self.virt_queue = (hsa.hsa_kernel_dispatch_packet_t * sz)()
    self.queue_base = self.write_addr = ctypes.addressof(self.virt_queue)
    self.packets_count = 0
    self.available_packet_slots = sz

  # The backing array has a fixed size and nothing ever drains it, so running out of slots is a sizing error.
  def _wait_queue(self, need_packets=1): assert False, f"VirtQueue is too small to handle {self.packets_count+need_packets} packets!"

  def _submit_packet(self, cnt=1):
    # Accepts cnt like AQLQueue._submit_packet so inherited code that submits several packets at once
    # does not fail with a TypeError; the default keeps single-packet callers unchanged.
    self.write_addr += AQL_PACKET_SIZE * cnt
    self.packets_count += cnt
    self.available_packet_slots -= cnt
# Graph runner for HSA devices. __init__ pre-records every jit item: kernel launches become packets in a
# per-device VirtAQLQueue, buffer transfers become argument lists for hsa_amd_memory_async_copy_on_engine.
# __call__ patches the recorded state (buffers, variables, launch dims) and replays it.
class HSAGraph(MultiGraphRunner):
def __init__(self, jit_cache: List[ExecItem], input_rawbuffers: List[Buffer], var_vals: Dict[str, int]):
super().__init__(jit_cache, input_rawbuffers, var_vals)
# Check all jit items are compatible.
# Only CompiledRunner and BufferXfer items are accepted, and every device involved must be an HSADevice.
compiled_devices = set()
for ji in self.jit_cache:
if isinstance(ji.prg, CompiledRunner): compiled_devices.add(ji.prg.dev)
elif isinstance(ji.prg, BufferXfer):
for x in ji.bufs[0:2]: compiled_devices.add(Device[cast(Buffer, x).device])
else: raise GraphException
if any(not isinstance(d, HSADevice) for d in compiled_devices): raise GraphException
self.devices: List[HSADevice] = list(compiled_devices) #type:ignore
# Allocate kernel args.
# One allocation per device, sized as the sum of all args structs on that device, each rounded up to 16 bytes.
kernargs_size: Dict[Compiled, int] = collections.defaultdict(int)
for ji in self.jit_cache:
if isinstance(ji.prg, CompiledRunner): kernargs_size[ji.prg.dev] += round_up(ctypes.sizeof(ji.prg._prg.args_struct_t), 16)
kernargs_ptrs: Dict[Compiled, int] = {dev:dev.allocator._alloc(sz, BufferSpec()) for dev,sz in kernargs_size.items()}
# Fill initial arguments.
# Each args struct is placed at the current offset of its device's allocation; kernargs_ptrs is advanced as a bump pointer.
self.ji_kargs_structs: Dict[int, ctypes.Structure] = {}
for j,ji in enumerate(self.jit_cache):
if not isinstance(ji.prg, CompiledRunner): continue
self.ji_kargs_structs[j] = ji.prg._prg.args_struct_t.from_address(kernargs_ptrs[ji.prg.dev])
kernargs_ptrs[ji.prg.dev] += round_up(ctypes.sizeof(ji.prg._prg.args_struct_t), 16)
for i in range(len(ji.bufs)): self.ji_kargs_structs[j].__setattr__(f'f{i}', cast(Buffer, ji.bufs[i])._buf)
for i in range(len(ji.prg.p.vars)): self.ji_kargs_structs[j].__setattr__(f'v{i}', var_vals[ji.prg.p.vars[i].expr])
# Build queues.
# Each virtual queue has room for 2 packets per jit item plus 16 extra slots.
self.virt_aql_queues: Dict[Compiled, VirtAQLQueue] = {dev:VirtAQLQueue(dev, 2*len(self.jit_cache)+16) for dev in self.devices}
self.packets = {}
self.transfers = []
self.ji_to_transfer: Dict[int, int] = {} # faster to store transfers as list and update using this mapping table.
self.signals_to_reset: List[hsa.hsa_signal_t] = []
self.signals_to_devices: Dict[ctypes.c_uint64, List[HSADevice]] = {}
self.profile_info: Dict[Compiled, List[Tuple[Any, ...]]] = collections.defaultdict(list)
# Special packet to wait for the world.
# Every device queue starts with a dependency-free barrier whose completion signal is that device's kickoff signal.
self.kickoff_signals: Dict[HSADevice, hsa.hsa_signal_t] = {dev:self.alloc_signal(reset_on_start=True) for dev in self.devices}
for dev in self.devices: self.virt_aql_queues[dev].submit_barrier([], self.kickoff_signals[dev])
for j,ji in enumerate(self.jit_cache):
if isinstance(ji.prg, CompiledRunner):
wait_signals = self.access_resources(ji.bufs, ji.prg.p.outs, new_dependency=j, sync_with_aql_packets=False)
# Dependencies are submitted in chunks of at most 5 signals per barrier packet.
for i in range(0, len(wait_signals), 5):
self.virt_aql_queues[ji.prg.dev].submit_barrier(wait_signals[i:i+5])
# Keep a view of the packet about to be written so __call__ can patch its launch dims.
self.packets[j] = hsa.hsa_kernel_dispatch_packet_t.from_address(self.virt_aql_queues[ji.prg.dev].write_addr)
sync_signal = self.alloc_signal(reset_on_start=True) if PROFILE else None
self.virt_aql_queues[ji.prg.dev].submit_kernel(ji.prg._prg, *ji.prg.p.launch_dims(var_vals), #type:ignore
ctypes.addressof(self.ji_kargs_structs[j]), completion_signal=sync_signal)
if PROFILE: self.profile_info[ji.prg.dev].append((sync_signal, ji.prg._prg.name, False))
elif isinstance(ji.prg, BufferXfer):
dest, src = [cast(Buffer, x) for x in ji.bufs[0:2]]
dest_dev, src_dev = cast(HSADevice, Device[dest.device]), cast(HSADevice, Device[src.device])
sync_signal = self.alloc_signal(reset_on_start=True, wait_on=[dest_dev, src_dev])
wait_signals = self.access_resources([dest, src], write=[0], new_dependency=sync_signal, sync_with_aql_packets=True)
# Stored as a mutable list (the positional args of hsa_amd_memory_async_copy_on_engine) so __call__ can swap dest/src.
self.transfers.append([dest._buf, dest_dev.agent, src._buf, src_dev.agent, dest.nbytes, len(wait_signals),
(hsa.hsa_signal_t*len(wait_signals))(*wait_signals), sync_signal, hsa.HSA_AMD_SDMA_ENGINE_0, True])
self.ji_to_transfer[j] = len(self.transfers) - 1
if PROFILE: self.profile_info[src_dev].append((sync_signal, f"transfer: HSA:{src_dev.device_id} -> HSA:{dest_dev.device_id}", True))
# Wait for all active signals to finish the graph
wait_signals_to_finish: Dict[HSADevice, List[hsa.hsa_signal_t]] = collections.defaultdict(list)
for v in dedup_signals(list(self.w_dependency_map.values()) + list(itertools.chain.from_iterable(self.r_dependency_map.values()))):
for dev in self.signals_to_devices[v.handle]:
wait_signals_to_finish[dev].append(v)
self.finish_signal = init_c_var(hsa.hsa_signal_t(), lambda x: check(hsa.hsa_amd_signal_create(1, 0, None, 0, ctypes.byref(x))))
# Each device gets at least one final barrier; only the last chunk on a device carries finish_signal as its completion signal.
for dev in self.devices:
wait_signals = wait_signals_to_finish[dev]
for i in range(0, max(1, len(wait_signals)), 5):
self.virt_aql_queues[dev].submit_barrier(wait_signals[i:i+5], completion_signal=self.finish_signal if i+5>=len(wait_signals) else None)
# Zero signals to allow graph to start and execute.
for sig in self.signals_to_reset: hsa.hsa_signal_silent_store_relaxed(sig, 0)
hsa.hsa_signal_silent_store_relaxed(self.finish_signal, 0)
# Replays the recorded graph with the given input buffers and variable values.
# Returns the wall-clock time spent waiting for completion when wait is true, otherwise None.
def __call__(self, input_rawbuffers: List[Buffer], var_vals: Dict[str, int], wait=False) -> Optional[float]:
# Wait and restore signals
# Blocks until the previous run has finished, then re-arms every resettable signal to 1
# and finish_signal to the number of devices.
hsa.hsa_signal_wait_scacquire(self.finish_signal, hsa.HSA_SIGNAL_CONDITION_LT, 1, (1 << 64) - 1, hsa.HSA_WAIT_STATE_ACTIVE)
for sig in self.signals_to_reset: hsa.hsa_signal_silent_store_relaxed(sig, 1)
hsa.hsa_signal_silent_store_relaxed(self.finish_signal, len(self.devices))
# Update rawbuffers
# Kernel items are patched through their args struct; transfer items through slot 0 (dest) or 2 (src) of the stored arg list.
for (j,i),input_idx in self.input_replace.items():
if j in self.ji_kargs_structs:
self.ji_kargs_structs[j].__setattr__(f'f{i}', input_rawbuffers[input_idx]._buf)
else:
if i == 0: self.transfers[self.ji_to_transfer[j]][0] = input_rawbuffers[input_idx]._buf # dest
elif i == 1: self.transfers[self.ji_to_transfer[j]][2] = input_rawbuffers[input_idx]._buf # src
# Update var_vals
for j in self.jc_idx_with_updatable_var_vals:
for i,v in enumerate(cast(CompiledRunner, self.jit_cache[j].prg).p.vars):
self.ji_kargs_structs[j].__setattr__(f'v{i}', var_vals[v.expr])
# Update launch dims
# The packet's grid size is written as global size times local size per axis.
for j in self.jc_idx_with_updatable_launch_dims:
gl, lc = cast(CompiledRunner, self.jit_cache[j].prg).p.launch_dims(var_vals)
self.packets[j].workgroup_size_x = lc[0]
self.packets[j].workgroup_size_y = lc[1]
self.packets[j].workgroup_size_z = lc[2]
self.packets[j].grid_size_x = gl[0] * lc[0]
self.packets[j].grid_size_y = gl[1] * lc[1]
self.packets[j].grid_size_z = gl[2] * lc[2]
# Copy each device's recorded packets into its real hardware queue, then issue the recorded copies.
for dev in self.devices:
dev.flush_hdp()
dev.hw_queue.blit_packets(self.virt_aql_queues[dev].queue_base, self.virt_aql_queues[dev].packets_count)
for transfer_data in self.transfers:
check(hsa.hsa_amd_memory_async_copy_on_engine(*transfer_data))
et = None
if wait:
st = time.perf_counter()
hsa.hsa_signal_wait_scacquire(self.finish_signal, hsa.HSA_SIGNAL_CONDITION_LT, 1, (1 << 64) - 1, hsa.HSA_WAIT_STATE_ACTIVE)
et = time.perf_counter() - st
# Hand the recorded (signal, name, is_copy) tuples to the global profiler on every call; empty unless PROFILE was set at build time.
for profdev,profdata in self.profile_info.items(): Profiler.tracked_signals[profdev] += profdata
return et
# Creates a new signal with initial value 1. reset_on_start registers it for re-arming in __call__;
# wait_on records which devices must wait on it before the graph finishes.
def alloc_signal(self, reset_on_start=False, wait_on=None):
sync_signal = init_c_var(hsa.hsa_signal_t(), lambda x: check(hsa.hsa_amd_signal_create(1, 0, None, 0, ctypes.byref(x))))
if reset_on_start: self.signals_to_reset.append(sync_signal)
if wait_on is not None: self.signals_to_devices[sync_signal.handle] = wait_on
return sync_signal
# Converts a dependency into a signal. Signals pass through unchanged. When sync_with_aql_packets is set, a
# jit index resolves to its recorded packet's completion signal (allocated on demand). Anything else gives None.
def dependency_as_signal(self, dep, sync_with_aql_packets) -> Optional[hsa.hsa_signal_t]:
if isinstance(dep, hsa.hsa_signal_t): return dep
elif sync_with_aql_packets and isinstance(packet := self.packets.get(dep), hsa.hsa_kernel_dispatch_packet_t):
if packet.completion_signal.handle == EMPTY_SIGNAL.handle: packet.completion_signal = self.alloc_signal(reset_on_start=True)
return packet.completion_signal
return None
# Registers new_dependency for the given buffers via _access_resources and returns the deduplicated signals to wait on.
# With sync_with_aql_packets the kickoff signal of every involved buffer's device is added as well.
def access_resources(self, rawbufs, write, new_dependency, sync_with_aql_packets=False):
rdeps = self._access_resources(rawbufs, write, new_dependency)
wait_signals = [self.dependency_as_signal(dep, sync_with_aql_packets=sync_with_aql_packets) for dep in rdeps]
if sync_with_aql_packets: wait_signals += [self.kickoff_signals[cast(HSADevice, Device[rawbuf.device])] for rawbuf in rawbufs]
return dedup_signals(wait_signals)

View file

@ -1,275 +0,0 @@
from __future__ import annotations
import ctypes, functools, subprocess, io, atexit, collections, json
from typing import Tuple, TypeVar, List, Dict, Any
import tinygrad.runtime.autogen.hsa as hsa
from tinygrad.helpers import DEBUG, init_c_var, from_mv, round_up, to_mv, init_c_struct_t, getenv, PROFILE
from tinygrad.device import Compiled, Compiler, CompileError, BufferSpec, LRUAllocator
from tinygrad.renderer.cstyle import HIPRenderer
from tinygrad.runtime.support.hsa import check, scan_agents, find_memory_pool, AQLQueue
from tinygrad.runtime.support.hip_comgr import compile_hip
if getenv("IOCTL"): import extra.hip_gpu_driver.hip_ioctl # noqa: F401
class HSAProfiler:
  # Collects start/end timestamps of tracked signals and dumps them as a JSON list of trace events.
  def __init__(self):
    # device -> list of (signal, name, is_copy) waiting for process()
    self.tracked_signals = collections.defaultdict(list)
    # (device_id, queue_id, name, start, end) tuples produced by process()
    self.collected_events: List[Tuple[Any, ...]] = []
    # Timing structs reused for every query.
    self.copy_timings = hsa.hsa_amd_profiling_async_copy_time_t()
    self.disp_timings = hsa.hsa_amd_profiling_dispatch_time_t()

  def track(self, signal, device, name, is_copy=False):
    entry = (signal, name, is_copy)
    self.tracked_signals[device].append(entry)

  def process(self, device):
    # Process all tracked signals, should be called before any of tracked signals are reused.
    pending = self.tracked_signals[device]
    for sig, name, is_copy in pending:
      if is_copy:
        timings = self.copy_timings
        check(hsa.hsa_amd_profiling_get_async_copy_time(sig, ctypes.byref(timings)))
      else:
        timings = self.disp_timings
        check(hsa.hsa_amd_profiling_get_dispatch_time(device.agent, sig, ctypes.byref(timings))) #type:ignore
      queue_id = 1 if is_copy else 0
      self.collected_events.append((device.device_id, queue_id, name, timings.start, timings.end))
    self.tracked_signals.pop(device)

  def save(self, path):
    trace = []
    # Metadata entries: one process per device, with thread 0 named AQL and thread 1 named SDMA.
    for pid in range(len(HSADevice.devices)):
      trace.extend([
        {"name": "process_name", "ph": "M", "pid": pid, "args": {"name": "HSA"}},
        {"name": "thread_name", "ph": "M", "pid": pid, "tid": 0, "args": {"name": "AQL"}},
        {"name": "thread_name", "ph": "M", "pid": pid, "tid": 1, "args": {"name": "SDMA"}},
      ])
    # One begin/end pair per collected event.
    for dev_id, queue_id, name, st, et in self.collected_events:
      for phase, stamp in (("B", st), ("E", et)):
        trace.append({"name": name, "ph": phase, "pid": dev_id, "tid": queue_id, "ts": stamp*1e-3})
    with open(path, "w") as f:
      f.write(json.dumps({"traceEvents": trace}))
    print(f"Saved HSA profile to {path}")

Profiler = HSAProfiler()
class HSACompiler(Compiler):
  # Compiler that delegates to compile_hip for a fixed architecture string.
  def __init__(self, arch:str):
    self.arch = arch
    ident = f"compile_hip_{self.arch}"
    super().__init__(ident)

  def compile(self, src:str) -> bytes:
    # RuntimeError from compile_hip is re-raised as CompileError.
    try:
      return compile_hip(src, self.arch)
    except RuntimeError as err:
      raise CompileError(err)
# A compiled kernel loaded onto one HSADevice. Construction loads the code object into an HSA executable and
# looks up the kernel symbol; calling the instance fills kernel arguments and submits a dispatch to the device queue.
class HSAProgram:
def __init__(self, device:HSADevice, name:str, lib:bytes):
self.device, self.name, self.lib = device, name, lib
# At DEBUG >= 6 the disassembly is printed, with lines containing 's_code_end' filtered out.
if DEBUG >= 6:
asm = subprocess.check_output(["/opt/rocm/llvm/bin/llvm-objdump", '-d', '-'], input=lib)
print('\n'.join([x for x in asm.decode('utf-8').split("\n") if 's_code_end' not in x]))
# Create the executable, load the code object for this device's agent, then freeze it.
self.exec = init_c_var(hsa.hsa_executable_t(), lambda x: check(hsa.hsa_executable_create_alt(hsa.HSA_PROFILE_FULL, hsa.HSA_DEFAULT_FLOAT_ROUNDING_MODE_DEFAULT, None, ctypes.byref(x)))) # noqa: E501
self.code_reader = init_c_var(hsa.hsa_code_object_reader_t(),
lambda x: check(hsa.hsa_code_object_reader_create_from_memory(lib, len(lib), ctypes.byref(x))))
check(hsa.hsa_executable_load_agent_code_object(self.exec, self.device.agent, self.code_reader, None, None))
check(hsa.hsa_executable_freeze(self.exec, None))
# The kernel symbol is looked up as "<name>.kd"; its object handle and segment sizes are queried from the symbol.
self.kernel = init_c_var(hsa.hsa_executable_symbol_t(), lambda x: check(hsa.hsa_executable_get_symbol_by_name(self.exec, (name+".kd").encode("utf-8"), ctypes.byref(self.device.agent), ctypes.byref(x)))) # noqa: E501
self.handle = init_c_var(ctypes.c_uint64(), lambda x: check(hsa.hsa_executable_symbol_get_info(self.kernel, hsa.HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, ctypes.byref(x)))) # noqa: E501
self.kernargs_segment_size = init_c_var(ctypes.c_uint32(), lambda x: check(hsa.hsa_executable_symbol_get_info(self.kernel, hsa.HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE, ctypes.byref(x)))).value # noqa: E501
self.group_segment_size = init_c_var(ctypes.c_uint32(), lambda x: check(hsa.hsa_executable_symbol_get_info(self.kernel, hsa.HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE, ctypes.byref(x)))).value # noqa: E501
self.private_segment_size = init_c_var(ctypes.c_uint32(), lambda x: check(hsa.hsa_executable_symbol_get_info(self.kernel, hsa.HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE, ctypes.byref(x)))).value # noqa: E501
# Synchronizes the device before destroying the reader and executable.
# The hasattr guards cover instances whose __init__ failed part-way.
def __del__(self):
self.device.synchronize()
if hasattr(self, 'code_reader'): check(hsa.hsa_code_object_reader_destroy(self.code_reader))
if hasattr(self, 'exec'): check(hsa.hsa_executable_destroy(self.exec))
# Launches the kernel. args become pointer fields f0..fN and vals become int fields v0..vM of the args struct.
# Returns the dispatch time ((end - start) * clocks_to_time) when wait is true, otherwise None.
def __call__(self, *args, global_size:Tuple[int,int,int]=(1,1,1), local_size:Tuple[int,int,int]=(1,1,1), vals:Tuple[int, ...]=(), wait=False):
# The args struct type is built once, on the first call, and must match the size reported by the kernel symbol.
if not hasattr(self, "args_struct_t"):
self.args_struct_t = init_c_struct_t(tuple([(f'f{i}', ctypes.c_void_p) for i in range(len(args))] +
[(f'v{i}', ctypes.c_int) for i in range(len(vals))]))
if ctypes.sizeof(self.args_struct_t) != self.kernargs_segment_size:
raise RuntimeError(f"HSAProgram.__call__: incorrect args struct size {ctypes.sizeof(self.args_struct_t)} != {self.kernargs_segment_size}")
kernargs = None
if self.kernargs_segment_size > 0:
kernargs = self.device.alloc_kernargs(self.kernargs_segment_size)
args_st = self.args_struct_t.from_address(kernargs)
for i in range(len(args)): args_st.__setattr__(f'f{i}', args[i])
for i in range(len(vals)): args_st.__setattr__(f'v{i}', vals[i])
self.device.flush_hdp()
# A completion signal is only allocated when the caller waits or profiling is enabled.
signal = self.device.alloc_signal(reusable=True) if wait or PROFILE else None
self.device.hw_queue.submit_kernel(self, global_size, local_size, kernargs, completion_signal=signal)
if PROFILE: Profiler.track(signal, self.device, self.name)
if wait:
hsa.hsa_signal_wait_scacquire(signal, hsa.HSA_SIGNAL_CONDITION_LT, 1, (1 << 64) - 1, hsa.HSA_WAIT_STATE_ACTIVE)
check(hsa.hsa_amd_profiling_get_dispatch_time(self.device.agent, signal, ctypes.byref(timings := hsa.hsa_amd_profiling_dispatch_time_t())))
return (timings.end - timings.start) * self.device.clocks_to_time
# Type variable used to annotate the opaque buffer handles in HSAAllocator method signatures.
T = TypeVar("T")
# CHUNK_SIZE: byte size of each host staging buffer allocated by HSAAllocator.copy_from_fd (256 MiB).
# PAGE_SIZE: granularity to which copy_from_fd aligns the file offset down before reading (0x1000 bytes).
CHUNK_SIZE, PAGE_SIZE = 256*1024*1024, 0x1000
# Allocator for one HSADevice: allocates from the CPU or GPU memory pool and implements host<->device,
# file->device and device<->device copies with hsa_amd_memory_async_copy* plus barrier packets on the device queues.
class HSAAllocator(LRUAllocator):
def __init__(self, device:HSADevice):
self.device = device
super().__init__()
# Host allocations come from the shared CPU pool and are made accessible to the CPU agent and this device's agent.
# All other allocations come from this device's GPU pool and are made accessible to every GPU agent.
def _alloc(self, size:int, options:BufferSpec):
if options.host:
check(hsa.hsa_amd_memory_pool_allocate(HSADevice.cpu_mempool, size, 0, ctypes.byref(mem := ctypes.c_void_p())))
check(hsa.hsa_amd_agents_allow_access(2, (hsa.hsa_agent_t*2)(HSADevice.cpu_agent, self.device.agent), None, mem))
return mem.value
c_agents = (hsa.hsa_agent_t * len(HSADevice.agents[hsa.HSA_DEVICE_TYPE_GPU]))(*HSADevice.agents[hsa.HSA_DEVICE_TYPE_GPU])
check(hsa.hsa_amd_memory_pool_allocate(self.device.gpu_mempool, size, 0, ctypes.byref(buf := ctypes.c_void_p())))
check(hsa.hsa_amd_agents_allow_access(len(HSADevice.agents[hsa.HSA_DEVICE_TYPE_GPU]), c_agents, None, buf))
return buf.value
# Every device is synchronized before the memory is returned to its pool.
def _free(self, opaque:T, options:BufferSpec):
HSADevice.synchronize_system()
check(hsa.hsa_amd_memory_pool_free(opaque))
def _copyin(self, dest:T, src: memoryview):
# Async copyin sync model uses barriers on the main hw queue, since barriers are guaranteed to execute in order with all other packets.
self.device.hw_queue.submit_barrier([], sync_signal := self.device.alloc_signal(reusable=True))
# The source is first copied into a temporary host allocation, which is freed later through delayed_free.
mem = self._alloc(src.nbytes, BufferSpec(host=True))
ctypes.memmove(mem, from_mv(src), src.nbytes)
check(hsa.hsa_amd_memory_async_copy_on_engine(dest, self.device.agent, mem, HSADevice.cpu_agent, src.nbytes, 1, ctypes.byref(sync_signal),
copy_signal := self.device.alloc_signal(reusable=True), hsa.HSA_AMD_SDMA_ENGINE_0, True))
# Later packets on the queue wait for the copy to complete.
self.device.hw_queue.submit_barrier([copy_signal])
self.device.delayed_free.append(mem)
if PROFILE: Profiler.track(copy_signal, self.device, f"copyin: CPU -> HSA:{self.device.device_id}", is_copy=True)
# Copies `size` bytes starting at `offset` of file descriptor `fd` into `dest`, in chunks of up to CHUNK_SIZE,
# alternating between two host staging buffers that each have their own signal and SDMA engine.
def copy_from_fd(self, dest, fd, offset, size):
self.device.hw_queue.submit_barrier([], sync_signal := self.device.alloc_signal(reusable=True))
# Staging buffers and their signals are created lazily on first use and kept on the allocator.
if not hasattr(self, 'hb'):
self.hb = [self._alloc(CHUNK_SIZE, BufferSpec(host=True)) for _ in range(2)]
self.hb_signals = [self.device.alloc_signal(reusable=False) for _ in range(2)]
self.hb_polarity = 0
self.sdma = [hsa.HSA_AMD_SDMA_ENGINE_0, hsa.HSA_AMD_SDMA_ENGINE_1]
for sig in self.hb_signals: hsa.hsa_signal_store_relaxed(sig, 0)
fo = io.FileIO(fd, "a+b", closefd=False)
# Reading starts at the offset rounded down to PAGE_SIZE; minor_offset is the number of leading bytes to skip.
fo.seek(offset - (minor_offset:=offset % PAGE_SIZE))
copies_called = 0
copied_in = 0
for local_offset in range(0, size+minor_offset, CHUNK_SIZE):
local_size = min(round_up(size+minor_offset, PAGE_SIZE)-local_offset, CHUNK_SIZE)
copy_size = min(local_size-minor_offset, size-copied_in)
if copy_size == 0: break
# Wait until the previous copy out of this staging buffer has finished before overwriting the buffer.
hsa.hsa_signal_wait_scacquire(self.hb_signals[self.hb_polarity], hsa.HSA_SIGNAL_CONDITION_LT, 1, (1 << 64) - 1, hsa.HSA_WAIT_STATE_ACTIVE)
self.device.reusable_signals.append(self.hb_signals[self.hb_polarity]) # it's free now and can be reused
self.hb_signals[self.hb_polarity] = self.device.alloc_signal(reusable=False)
fo.readinto(to_mv(self.hb[self.hb_polarity], local_size))
check(hsa.hsa_amd_memory_async_copy_on_engine(dest+copied_in, self.device.agent, self.hb[self.hb_polarity]+minor_offset, HSADevice.cpu_agent,
copy_size, 1, ctypes.byref(sync_signal), self.hb_signals[self.hb_polarity],
self.sdma[self.hb_polarity], True))
copied_in += copy_size
self.hb_polarity = (self.hb_polarity + 1) % len(self.hb)
minor_offset = 0 # only on the first
copies_called += 1
# The queue waits on the most recently used buffer's signal, and on the other buffer's signal if more than one copy was issued.
wait_signals = [self.hb_signals[self.hb_polarity - 1]]
if copies_called > 1: wait_signals.append(self.hb_signals[self.hb_polarity])
self.device.hw_queue.submit_barrier(wait_signals)
# Synchronous device -> host copy: synchronizes all devices, locks the destination memory for the copy,
# waits for the copy signal and unlocks the destination again.
def _copyout(self, dest:memoryview, src:T):
HSADevice.synchronize_system()
copy_signal = self.device.alloc_signal(reusable=True)
c_agents = (hsa.hsa_agent_t*2)(self.device.agent, HSADevice.cpu_agent)
check(hsa.hsa_amd_memory_lock_to_pool(from_mv(dest), dest.nbytes, c_agents, 2, HSADevice.cpu_mempool, 0, ctypes.byref(addr:=ctypes.c_void_p())))
check(hsa.hsa_amd_memory_async_copy(addr, HSADevice.cpu_agent, src, self.device.agent, dest.nbytes, 0, None, copy_signal))
hsa.hsa_signal_wait_scacquire(copy_signal, hsa.HSA_SIGNAL_CONDITION_LT, 1, (1 << 64) - 1, hsa.HSA_WAIT_STATE_ACTIVE)
check(hsa.hsa_amd_memory_unlock(from_mv(dest)))
if PROFILE: Profiler.track(copy_signal, self.device, f"copyout: HSA:{self.device.device_id} -> CPU", is_copy=True)
# Device -> device copy. The copy waits on a barrier signal from each device's queue, and afterwards
# both queues get a barrier that waits on the copy's completion signal.
def transfer(self, dest:T, src:T, sz:int, src_dev=None, dest_dev=None):
src_dev.hw_queue.submit_barrier([], sync_signal_1 := src_dev.alloc_signal(reusable=True))
dest_dev.hw_queue.submit_barrier([], sync_signal_2 := dest_dev.alloc_signal(reusable=True))
c_wait_signal = (hsa.hsa_signal_t*2)(sync_signal_1, sync_signal_2)
check(hsa.hsa_amd_memory_async_copy_on_engine(dest, dest_dev.agent, src, src_dev.agent, sz, 2, c_wait_signal,
copy_signal := dest_dev.alloc_signal(reusable=False), hsa.HSA_AMD_SDMA_ENGINE_0, True))
src_dev.hw_queue.submit_barrier([copy_signal])
dest_dev.hw_queue.submit_barrier([copy_signal])
if PROFILE: Profiler.track(copy_signal, src_dev, f"transfer: HSA:{src_dev.device_id} -> HSA:{dest_dev.device_id}", is_copy=True)
# One HSA GPU agent. Owns the hardware AQL queue, a pool of signals, a bump-allocated kernargs region and
# a list of allocations whose free is deferred until synchronize().
class HSADevice(Compiled):
# Class-level state shared by all devices; agents/cpu_agent/cpu_mempool are filled in by the first __init__.
devices: List[HSADevice] = []
agents: Dict[int, List[hsa.hsa_agent_t]] = {}
cpu_agent: hsa.hsa_agent_t
cpu_mempool: hsa.hsa_amd_memory_pool_t
def __init__(self, device:str=""):
# First device created: initialize HSA, register shutdown at exit and discover agents and the CPU pool.
if not HSADevice.agents:
check(hsa.hsa_init())
atexit.register(hsa_terminate)
HSADevice.agents = scan_agents()
HSADevice.cpu_agent = HSADevice.agents[hsa.HSA_DEVICE_TYPE_CPU][0]
HSADevice.cpu_mempool = find_memory_pool(HSADevice.cpu_agent, segtyp=hsa.HSA_AMD_SEGMENT_GLOBAL, location=hsa.HSA_AMD_MEMORY_POOL_LOCATION_CPU)
if PROFILE: check(hsa.hsa_amd_profiling_async_copy_enable(1))
# The index after ":" in the device string selects the GPU agent; no ":" means index 0.
self.device_id = int(device.split(":")[1]) if ":" in device else 0
self.agent = HSADevice.agents[hsa.HSA_DEVICE_TYPE_GPU][self.device_id]
self.gpu_mempool = find_memory_pool(self.agent, segtyp=hsa.HSA_AMD_SEGMENT_GLOBAL, location=hsa.HSA_AMD_MEMORY_POOL_LOCATION_GPU)
self.hw_queue = AQLQueue(self)
HSADevice.devices.append(self)
# The agent name is used as the architecture string for the compiler.
check(hsa.hsa_agent_get_info(self.agent, hsa.HSA_AGENT_INFO_NAME, ctypes.byref(agent_name_buf := ctypes.create_string_buffer(256))))
self.arch = ctypes.string_at(agent_name_buf).decode()
# Reciprocal of the system timestamp frequency, used to convert timestamp deltas to time.
check(hsa.hsa_system_get_info(hsa.HSA_SYSTEM_INFO_TIMESTAMP_FREQUENCY, ctypes.byref(gpu_freq := ctypes.c_uint64())))
self.clocks_to_time: float = 1 / gpu_freq.value
check(hsa.hsa_agent_get_info(self.agent, hsa.HSA_AMD_AGENT_INFO_HDP_FLUSH, ctypes.byref(hdp_flush := hsa.hsa_amd_hdp_flush_t())))
self.hdp_flush = hdp_flush
self.delayed_free: List[int] = []
self.reusable_signals: List[hsa.hsa_signal_t] = []
# NOTE(review): HSAGraph is imported here rather than at module top — presumably to avoid a circular import; confirm.
from tinygrad.runtime.graph.hsa import HSAGraph
super().__init__(device, HSAAllocator(self), HIPRenderer(), HSACompiler(self.arch), functools.partial(HSAProgram, self), HSAGraph)
# Finish init: preallocate some signals + space for kernargs
self.signal_pool = [init_c_var(hsa.hsa_signal_t(), lambda x: check(hsa.hsa_signal_create(1, 0, None, ctypes.byref(x)))) for _ in range(4096)]
self._new_kernargs_region(16 << 20) # initial region size is 16mb
# Waits for the hardware queue to drain, then recycles everything that was only needed until now:
# reusable signals are reset to 1 and returned to the pool, delayed frees are performed,
# the kernargs bump pointer is rewound and tracked profiling signals are processed.
def synchronize(self):
self.hw_queue.wait()
for sig in self.reusable_signals: hsa.hsa_signal_silent_store_relaxed(sig, 1)
self.signal_pool.extend(self.reusable_signals)
self.reusable_signals.clear()
for opaque_to_free in self.delayed_free: check(hsa.hsa_amd_memory_pool_free(opaque_to_free))
self.delayed_free.clear()
self.kernarg_next_addr = self.kernarg_start_addr
Profiler.process(self)
@staticmethod
def synchronize_system():
for d in HSADevice.devices: d.synchronize()
# Takes a signal from the pool, or creates a new one (initial value 1) when the pool is empty.
def alloc_signal(self, reusable=False):
if len(self.signal_pool): signal = self.signal_pool.pop()
else: check(hsa.hsa_amd_signal_create(1, 0, None, 0, ctypes.byref(signal := hsa.hsa_signal_t())))
# reusable means a signal could be reused after synchronize for the device it's allocated from is called.
if reusable: self.reusable_signals.append(signal)
return signal
# Bump-allocates sz bytes from the current kernargs region; the next address is rounded up to 16 bytes.
# When the region cannot hold the request a new region of twice the current size replaces it.
# NOTE(review): a single request larger than twice the current region would still not fit — confirm this cannot happen.
def alloc_kernargs(self, sz):
if self.kernarg_next_addr + sz >= self.kernarg_start_addr + self.kernarg_pool_sz: self._new_kernargs_region(int(self.kernarg_pool_sz * 2))
result = self.kernarg_next_addr
self.kernarg_next_addr = round_up(self.kernarg_next_addr + sz, 16)
return result
# Allocates a new kernargs region of sz bytes; a previous region is queued in delayed_free rather than freed immediately.
def _new_kernargs_region(self, sz:int):
if hasattr(self, 'kernarg_start_addr'): self.delayed_free.append(self.kernarg_start_addr)
self.kernarg_start_addr: int = self.allocator._alloc(sz, BufferSpec())
self.kernarg_next_addr = self.kernarg_start_addr
self.kernarg_pool_sz: int = sz
# Writes 1 to the first element of HDP_MEM_FLUSH_CNTL obtained from HSA_AMD_AGENT_INFO_HDP_FLUSH.
def flush_hdp(self): self.hdp_flush.HDP_MEM_FLUSH_CNTL[0] = 1
def hsa_terminate():
  # Shutdown hook (registered with atexit by HSADevice.__init__): collects remaining profiling data,
  # tears down the AQL queues, shuts HSA down and neutralizes methods that would touch HSA afterwards.

  # The aql queues need to be stopped/deleted before hsa shut down, otherwise this leads to gpu hangs.
  for dev in HSADevice.devices:
    Profiler.process(dev)
    del dev.hw_queue

  # hsa_shut_down cleans up all hsa-related resources.
  hsa.hsa_shut_down()

  # After shutdown these must become no-ops. Both stubs are installed on the class and are therefore called
  # with the instance as first argument: HSADevice.synchronize_system() calls d.synchronize(), so the stub has
  # to accept `self` (a zero-argument lambda would raise TypeError). Accepting *args also keeps a bare
  # HSADevice.synchronize() call working.
  HSADevice.synchronize = lambda *_: None #type:ignore
  HSAProgram.__del__ = lambda _: None #type:ignore

  if Profiler.collected_events: Profiler.save("/tmp/profile.json")

View file

@ -1,127 +0,0 @@
from typing import Dict, Set
import yaml
from tinygrad.codegen.uops import UOpGraph, UOps, UOp
from tinygrad.uop.ops import BinaryOps
from tinygrad.dtype import dtypes
# Renders a uop graph as assembly text for the 'amdgcn-amd-amdhsa--gfx1100' target: an .amdgpu_metadata YAML section,
# an .amdhsa_kernel descriptor and the instruction stream. Only SPECIAL, CONST, ALU (ADD/MUL), LOAD, STORE and
# DEFINE_GLOBAL uops are handled; anything else raises NotImplementedError.
def uops_to_rdna(function_name:str, uops:UOpGraph) -> str:
# NOTE(review): `replace` is never populated in this function, so the substitution loop below currently does nothing.
replace: Dict[UOp, UOp] = {}
seen: Set[UOp] = set()
for u in uops:
if u in seen: continue
seen.add(u)
for o,n in replace.items():
if o in u.vin and u is not n:
u.vin = tuple(n if x == o else x for x in u.vin)
# pointer indexing
# For LOAD/STORE on buffers whose element is wider than one byte, the index operand is replaced by index * itemsize.
if u.uop in {UOps.LOAD, UOps.STORE} and u.vin[0].dtype.itemsize > 1:
val = UOp(UOps.CONST, dtypes.int, tuple(), arg=u.vin[0].dtype.itemsize, insert_at=uops.uops.index(u))
ptr = UOp(UOps.ALU, dtypes.int, (u.vin[1], val), arg=BinaryOps.MUL, insert_at=uops.uops.index(u))
u.vin = (u.vin[0], ptr) + u.vin[2:]
#uops.print()
# args: kernel argument metadata entries; ins: emitted instructions; r: uop -> register name.
args = []
ins = []
v_cnt = 3 # v[0:2] is local_xyz
s_cnt = 5 # s[0:1] is the address, s[2:4] is global_xyz
r: Dict[UOp, str] = {}
for u in uops:
if u.uop == UOps.SPECIAL:
# NOTE(review): this branch reads u.src while every other branch reads u.vin — confirm which attribute UOp exposes.
if u.arg.startswith("lidx"):
r[u] = f'v{u.src[0].arg}'
elif u.arg.startswith("gidx"):
r[u] = f's{2+u.src[0].arg}'
else:
raise NotImplementedError
elif u.uop == UOps.CONST:
#r[u] = u.arg
# TODO: sometimes we can use s
#r[u] = f"s{s_cnt}"
#s_cnt += 1
#ins.append(f"s_mov_b32 {r[u]}, {u.arg}")
r[u] = f"v{v_cnt}"
v_cnt += 1
ins.append(f"v_mov_b32 {r[u]}, {u.arg}")
elif u.uop == UOps.ALU:
# ADD always emits the f32 instruction; MUL picks f32 or u32_u24 depending on whether the dtype is float.
if u.arg == BinaryOps.ADD:
r[u] = f"v{v_cnt}"
v_cnt += 1
ins.append(f"v_add_f32_e32 {r[u]}, {r[u.vin[0]]}, {r[u.vin[1]]}")
elif u.arg == BinaryOps.MUL:
r[u] = f"v{v_cnt}"
v_cnt += 1
if dtypes.is_float(u.dtype):
ins.append(f"v_mul_f32_e32 {r[u]}, {r[u.vin[0]]}, {r[u.vin[1]]}")
else:
ins.append(f"v_mul_u32_u24 {r[u]}, {r[u.vin[0]]}, {r[u.vin[1]]}")
else:
raise NotImplementedError
elif u.uop == UOps.LOAD:
# Every load is followed by a wait for all outstanding vector memory operations.
r[u] = f"v{v_cnt}"
v_cnt += 1
ins.append(f"global_load_b32 {r[u]}, {r[u.vin[1]]}, {r[u.vin[0]]}")
ins.append("s_waitcnt vmcnt(0)")
elif u.uop == UOps.STORE:
ins.append(f"global_store_b32 {r[u.vin[1]]}, {r[u.vin[2]]}, {r[u.vin[0]]}")
elif u.uop == UOps.DEFINE_GLOBAL:
# Each global buffer is an 8-byte pointer at offset i*8 in the kernarg segment, loaded into an even-aligned scalar register pair.
i = u.arg[0]
args.append({'.address_space': 'global', '.name': f'buf_{i}', '.offset': i*8, '.size': 8,
'.type_name': u.dtype.name+"*", '.value_kind': 'global_buffer'})
s_cnt += s_cnt%2 # skip
r[u] = f"s[{s_cnt}:{s_cnt+1}]"
s_cnt += 2
ins.append(f"s_load_b64 {r[u]}, s[0:1], {i*8}")
ins.append("s_waitcnt lgkmcnt(0)")
else:
raise NotImplementedError(f"can't render {u.uop}")
# *** boilerplate rendering ***
# NOTE(review): args[-1] raises IndexError when the graph defines no global buffer — confirm that cannot happen.
metadata = {
'amdhsa.kernels': [{'.args': args,
'.group_segment_fixed_size': 0, '.kernarg_segment_align': 8, '.kernarg_segment_size': args[-1][".offset"] + args[-1][".size"],
'.language': 'OpenCL C', '.language_version': [1, 2], '.max_flat_workgroup_size': 256,
'.name': function_name, '.private_segment_fixed_size': 0, '.sgpr_count': s_cnt, '.sgpr_spill_count': 0,
'.symbol': f'{function_name}.kd', '.uses_dynamic_stack': False, '.vgpr_count': v_cnt, '.vgpr_spill_count': 0,
'.wavefront_size': 32}],
'amdhsa.target': 'amdgcn-amd-amdhsa--gfx1100', 'amdhsa.version': [1, 2]}
boilerplate_start = f"""
.rodata
.global {function_name}.kd
.type {function_name}.kd,STT_OBJECT
.align 0x10
.amdhsa_kernel {function_name}"""
# Kernel descriptor directives, rendered below as one "<directive> <int>" line each.
kernel_desc = {
'.amdhsa_group_segment_fixed_size': 0, '.amdhsa_private_segment_fixed_size': 0, '.amdhsa_kernarg_size': 0,
'.amdhsa_next_free_vgpr': v_cnt, # this matters!
'.amdhsa_reserve_vcc': 0, '.amdhsa_reserve_xnack_mask': 0,
'.amdhsa_next_free_sgpr': s_cnt,
'.amdhsa_float_round_mode_32': 0, '.amdhsa_float_round_mode_16_64': 0, '.amdhsa_float_denorm_mode_32': 3, '.amdhsa_float_denorm_mode_16_64': 3,
'.amdhsa_dx10_clamp': 1, '.amdhsa_ieee_mode': 1, '.amdhsa_fp16_overflow': 0,
'.amdhsa_workgroup_processor_mode': 1, '.amdhsa_memory_ordered': 1, '.amdhsa_forward_progress': 0, '.amdhsa_enable_private_segment': 0,
'.amdhsa_system_sgpr_workgroup_id_x': 1, '.amdhsa_system_sgpr_workgroup_id_y': 1, '.amdhsa_system_sgpr_workgroup_id_z': 1,
'.amdhsa_system_sgpr_workgroup_info': 0, '.amdhsa_system_vgpr_workitem_id': 2, # is amdhsa_system_vgpr_workitem_id real?
'.amdhsa_exception_fp_ieee_invalid_op': 0, '.amdhsa_exception_fp_denorm_src': 0,
'.amdhsa_exception_fp_ieee_div_zero': 0, '.amdhsa_exception_fp_ieee_overflow': 0, '.amdhsa_exception_fp_ieee_underflow': 0,
'.amdhsa_exception_fp_ieee_inexact': 0, '.amdhsa_exception_int_div_zero': 0,
'.amdhsa_user_sgpr_dispatch_ptr': 0, '.amdhsa_user_sgpr_queue_ptr': 0, '.amdhsa_user_sgpr_kernarg_segment_ptr': 1,
'.amdhsa_user_sgpr_dispatch_id': 0, '.amdhsa_user_sgpr_private_segment_size': 0, '.amdhsa_wavefront_size32': 1, '.amdhsa_uses_dynamic_stack': 0}
code_start = f""".end_amdhsa_kernel
.text
.global {function_name}
.type {function_name},@function
.p2align 8
{function_name}:
"""
# Every kernel ends with the VGPR dealloc message, s_endpgm and s_code_end.
ins += ['s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)', 's_endpgm', 's_code_end']
return ".amdgpu_metadata\n" + yaml.dump(metadata) + ".end_amdgpu_metadata" + \
boilerplate_start + "\n" + '\n'.join("%s %d" % x for x in kernel_desc.items()) + "\n" + code_start + \
'\n'.join(ins) + f"\n.size {function_name}, .-{function_name}"

View file

@ -1,131 +0,0 @@
from typing import Dict, List, Final, Callable, DefaultDict
from collections import defaultdict
from tinygrad.uop.ops import UnaryOps, BinaryOps, TernaryOps, Op
from tinygrad.helpers import DType, PtrDType, dtypes, ImageDType, DEBUG, getenv
from tinygrad.codegen.opt.kernel import UOp, Ops
from triton.compiler import compile as triton_compile
import linecache
import math
import re
# Maps tinygrad dtypes to the Triton type expressions inserted into rendered code (see render_cast and define_scalar).
triton_dtypes = {dtypes.double: "tl.float64", dtypes.float32: "tl.float32", dtypes.float16: "tl.float16", dtypes.bool: "tl.int1", dtypes.int8: "tl.int8", dtypes.uint8: "tl.uint8", dtypes.int32: "tl.int32", dtypes.int64: "tl.int64", dtypes.uint32: "tl.uint32", dtypes.uint64: "tl.uint64", dtypes.int16: "tl.int16", dtypes.uint16: "tl.uint16"}
# Maps tinygrad dtypes to short type codes; presumably the strings used in a Triton kernel signature — usage not visible here, TODO confirm.
# NOTE(review): bool -> "i8" and int8 -> "i1" look swapped compared with the other entries — confirm before relying on them.
signature_dtypes = {dtypes.double: "fp64",dtypes.float32: "fp32", dtypes.float16: "fp16", dtypes.bool: "i8", dtypes.int8: "i1", dtypes.uint8: "u8", dtypes.int32: "i32", dtypes.int64: "i64", dtypes.uint32: "u32", dtypes.uint64: "u64", dtypes.int16: "i16", dtypes.uint16: "u16"}
def next_power_of_2(x):
  # Smallest power of two that is >= x, for x >= 1 (an exact power of two maps to itself).
  shift = (x - 1).bit_length()
  return 1 << shift
def render_valid(valid):
  # Joins the conditions with 'and', nesting them left to right in parentheses; no conditions renders as 'True'.
  if not len(valid):
    return 'True'
  opening = '(' * (len(valid) -1)
  return opening + ') and '.join(valid)
#NOTE Triton requires matching dimensions for load/store, disable this and see TestOps::test_output_padded_conv_transpose2d fail to compile
def fill_dims_for_idx(idx, dims):
  # Appends a zero-valued term that mentions every dim to idx; with no dims idx is returned unchanged.
  if not len(dims):
    return idx
  zero_term = "0*(" + '+'.join(dims) + ")"
  return "(" + idx + "+ (" + zero_term + "))"
def get_max(var):
  """Return var itself when it is an int, else its string form with bracketed parts and the outer characters removed."""
  if isinstance(var, int): return var
  # Drop every "[...]" group (non-greedy), then strip the first and last character of what is left.
  without_brackets = re.compile(r'\[(.*?)\]').sub('', str(var))
  return without_brackets[1:-1]
# NOTE: can be removed after https://github.com/gpuocelot/gpuocelot/issues/8 gets resolved
def remove_single_scalar_curly_braces(ptx_code):
  """Rewrite every `{%reg}` (a single register wrapped in braces) to `%reg`, line by line."""
  unwrapped = []
  for ptx_line in ptx_code.split('\n'):
    unwrapped.append(re.sub(r'\{\s*(%\w+)\s*\}', r'\1', ptx_line))
  return '\n'.join(unwrapped)
def render_const(args,dtype:DType):
  """Render the constant args as Triton source text.

  Infinities and NaN become tl.where(...) expressions; otherwise the value is printed as an int
  when dtype is an integer dtype and via str() for everything else.
  """
  if math.isinf(args):
    sign = '-' if args<0 else ''
    return sign + 'tl.where(1,float("inf"),0)'
  if math.isnan(args): return 'tl.where(1,float("nan"),0)'
  if dtypes.is_int(dtype): return f"{int(args)}"
  return str(args)
def render_cast(x:str, dtype:DType, bitcast=False):
  """Render a Triton `.to(...)` conversion of expression x to dtype, passing the bitcast flag through."""
  target = triton_dtypes[dtype]
  return "{}.to({}, bitcast={})".format(x, target, bitcast)
def define_scalar(local_size, dtype, args):
  """Render the constant args, as a tl.full(...) block when local sizes are given, else as a bare constant."""
  if len(local_size) == 0: return render_const(args,dtype)
  # Every local dimension is rounded up to a power of two for the block shape.
  shape = ','.join(str(next_power_of_2(dim)) for dim in local_size)
  return f"tl.full(({shape},),{render_const(args,dtype)}, dtype={triton_dtypes[dtype]})"
# Render a list of UOps as the source of a @triton.jit function, compile it with Triton and return
# a tuple (ptx_source, {"shared": ..., "local_size": ...}) taken from the compiled result.
def uops_to_triton(function_name:str, uops:List[UOp]):
# Sizes of the "l"-prefixed SPECIAL dimensions, in the order they are encountered.
local_size: List[int] = []
# Current nesting level of the emitted code; kk() indents each line by this many spaces.
depth = 1
signatures, dims, bufs, kernel, valid = [], [], [], [], [] #type: ignore
# c: per-prefix counters used to number generated names; r: the rendered text/name for each UOp.
c: DefaultDict[str, int] = defaultdict(int)
r: Dict[UOp, str] = {}
# Allocate the next name "<prefix><n>" for u, record it in r and return it.
def ssa(u, prefix="t"):
nonlocal c, r
c[prefix] += 1
r[u]=f"{prefix}{c[prefix]-1}"
return r[u]
# Count how many times each UOp is used as an input of another UOp.
child_count: DefaultDict[UOp, int] = defaultdict(int)
for ru in uops:
for v in ru.vin:
child_count[v] += 1
# Append one line of kernel source at the current depth.
def kk(s): kernel.append(" "*depth+s)
# Source templates for each ALU op; the callables receive the already-rendered operand strings.
code_for_op: Final[Dict[Op, Callable]] = {
UnaryOps.EXP2: lambda x,dtype,: f"tl.math.exp2({x})",
UnaryOps.LOG2: lambda x,dtype,: f"tl.math.log2({x})",
UnaryOps.SIN: lambda x,dtype: f"tl.sin({x})",
UnaryOps.SQRT: lambda x,dtype: f"tl.sqrt({x})",
UnaryOps.NEG: lambda x,dtype: f"-{x}",
BinaryOps.ADD: lambda x,y,dtype: f"({x}+{y})", BinaryOps.SUB: lambda x,y,: f"({x}-{y})",
BinaryOps.MUL: lambda x,y,dtype: f"({x}*{y})", BinaryOps.DIV: lambda x,y,: f"({x}/{y})" if y != '0.0' else f"{x}*tl.where({x}==0.0, float('nan'), float('inf'))",
BinaryOps.MAX: lambda x,y,dtype: f"tl.maximum({x},{y})",
BinaryOps.CMPLT: lambda x,y,dtype: f"({x}<{y})",
BinaryOps.MOD: lambda x,y,dtype: f"tl.abs({x})%tl.abs({y})*tl.where({x}<0,-1,1)",
TernaryOps.MULACC: lambda x,y,z,dtype: f"(({x}*{y})+{z})",
TernaryOps.WHERE: lambda x,y,z,dtype: f"tl.where({x},{y},{z})",
}
# Integer division template, used instead of the DIV entry above when the result dtype is an integer.
def int_div(x,y): return f"({x}//{y})" if y != '0' else f"{x}*tl.where({x}==0, float('nan'), float('inf'))"
# Main pass: translate each UOp, either by emitting a source line via kk() or by recording an
# inline expression for it in r.
for u in uops:
uop,dtype,vin,args = u.uop,u.dtype,u.vin,u.arg
if uop == Ops.LOOP:
kk(f"for {ssa(u, 'ridx')} in range({vin[0].arg}, {r[vin[1]]}):")
depth += 1
elif uop == Ops.END: depth -= 1
elif uop == Ops.ALU:
assert dtype is not None
val = code_for_op[args](*[r[x] for x in vin])
# Results used at most once, and all integer results, are kept as inline expressions;
# anything else is assigned to a fresh "alu" variable.
if child_count[u] <=1 or dtypes.is_int(dtype): r[u] = int_div(*[r[x] for x in vin]) if args == BinaryOps.DIV and dtypes.is_int(dtype) else val
else: kk(f"{ssa(u, 'alu')} = ({val})")
elif uop == Ops.LOAD:
assert dtype is not None
# Two inputs: plain masked load. Otherwise vin[2] is an extra condition that is added to the
# mask and selects between the loaded value and 0.0.
if len(vin) == 2: kk(f"{ssa(u, 'val')} = {render_cast(f'tl.load({r[vin[0]]} + { fill_dims_for_idx(r[vin[1]], dims)}, mask = {render_valid(valid)})', dtype)}")
else: kk(f"{ssa(u, 'val')} = {render_cast(f'tl.where({r[vin[2]]}, tl.load({r[vin[0]]}+{fill_dims_for_idx(r[vin[1]],dims)} , mask={render_valid(valid+[r[vin[2]]])}), 0.0)', dtype)}")
# In DEFINE_REG, ASSIGN and STORE every '//' in the rendered value text is replaced by '/'.
elif uop == Ops.DEFINE_REG: kk(f"{ssa(u, 'acc')} = {define_scalar(local_size, dtype, args).replace('//', '/')}")
elif uop == Ops.CONST: r[u] = define_scalar([], dtype, args)
elif uop == Ops.ASSIGN:
kk(f"{r[vin[0]]} = {r[vin[1]].replace('//', '/')}")
r[u] = r[vin[0]]
elif uop == Ops.STORE:
assert not isinstance(dtype, ImageDType), "unimplemented: image store"
# A fourth input, when present, guards the store with an `if`.
kk(f"{'if '+r[vin[3]]+': ' if len(vin)>3 else ''}tl.store({r[vin[0]]} + {r[vin[1]]}, {r[vin[2]].replace('//', '/')}, mask = {render_valid(valid)}) ")
elif uop == Ops.DEFINE_GLOBAL:
bufs.append(args)
# NOTE(review): the conditional expression binds looser than '+', so this appends just "*"
# for PtrDType arguments and the bare type name otherwise. It was presumably meant to be
# ("*" if isinstance(dtype, PtrDType) else "") + signature_dtypes[dtype] — confirm.
signatures.append("*" if isinstance(dtype, PtrDType) else "" + signature_dtypes[dtype])
r[u] = args
elif uop == Ops.SPECIAL:
dims.append(args[1])
valid.append(f"{args[1]}<{get_max(args[2])}")
# Names starting with "g" read tl.program_id; names starting with "l" become a tl.arange over
# the size rounded up to a power of two.
if args[1].startswith("g"): kk(f"{args[1]} = tl.program_id({args[0]}) # {args[2]}")
elif args[1].startswith("l"):
kk(f"{args[1]} = tl.arange({0}, {next_power_of_2(args[2])})")
local_size.append(args[2])
r[u] = args[1]
elif uop == Ops.CAST and dtype is not None: r[u] = render_cast(r[vin[0]], dtype, isinstance(args, tuple) and args[1])
else: raise NotImplementedError(f"unimplemented: {uop}")
prg = f"import triton\nimport triton.language as tl\ntl.core.TRITON_MAX_TENSOR_NUMEL = float('inf')\n@triton.jit\ndef {function_name}("+','.join(bufs)+"):\n"
# Append an index suffix to each tl.arange line: the i-th one gets ':' at position i and None
# everywhere else. kernel.index() finds the first equal line, so identical lines would collide.
for i, line in enumerate(list(filter(lambda line: "tl.arange" in line, kernel))): kernel[kernel.index(line)] += f"[{', '.join([':' if i == j else 'None' for j in range(len(local_size))])}]"
prg += "\n".join(kernel)
# Fold all (power-of-two rounded) local sizes into the first dimension; the rest become 1.
acc_local_size = 1
for x in local_size: acc_local_size *= next_power_of_2(x)
local_size = [acc_local_size] + [1] * (len(local_size) - 1)
if DEBUG >= 4: print(prg)
# Patch linecache.getlines so the file name "<triton>" resolves to the generated source.
# The patch is process-wide and is not undone here.
getlines = linecache.getlines
linecache.getlines = lambda filename, module_globals=None: prg.splitlines(keepends=True) if "<triton>" == filename else getlines(filename, module_globals)
# Define the generated function in this module's globals so it can be looked up by name below.
exec(compile(prg, "<triton>", "exec"), globals()) # pylint: disable=W0122\
compiled = triton_compile(globals()[function_name], signature=",".join(signatures), device_type="cuda", debug=False, cc=(35 if getenv("CUDACPU", 0) else None))
# Keep only the PTX text before the first ".file" and before the first ".visible .func".
prg = remove_single_scalar_curly_braces(compiled.asm["ptx"].split(".file")[0].split(".visible .func")[0])
# Clamp each local size to the limits listed after ".maxntid" in the PTX.
max_local_size = [int(x) for x in prg.split(".maxntid ")[1].split("\n")[0].split(", ")]
for i in range(len(local_size)): local_size[i] = min(local_size[i], max_local_size[i])
return prg, {"shared":compiled.metadata["shared"], "local_size":local_size + [1]*(3-len(local_size))}

View file

@ -1,199 +0,0 @@
import json
import pathlib
import zipfile
import numpy as np
from tinygrad.helpers import fetch
import pycocotools._mask as _mask
from examples.mask_rcnn import Masker
from pycocotools.coco import COCO
from pycocotools.cocoeval import COCOeval
# Re-export the pycocotools._mask helpers that need no wrapping.
iou, merge, frPyObjects = _mask.iou, _mask.merge, _mask.frPyObjects
# Dataset root: a "COCO" directory next to this file, created at import time if missing.
BASEDIR = pathlib.Path(__file__).parent.joinpath("COCO")
BASEDIR.mkdir(exist_ok=True)
def create_dict(key_row, val_row, rows):
  """Build a lookup from row[key_row] to row[val_row] over rows; later rows win on duplicate keys."""
  table = {}
  for row in rows:
    table[row[key_row]] = row[val_row]
  return table
# On first import, fetch and unpack the COCO 2017 validation images and the annotations archive
# into BASEDIR; each downloaded zip is deleted once it has been extracted.
for _subdir, _url in (
  ('val2017', 'http://images.cocodataset.org/zips/val2017.zip'),
  ('annotations', 'http://images.cocodataset.org/annotations/annotations_trainval2017.zip'),
):
  if pathlib.Path(BASEDIR/_subdir).is_dir(): continue
  fn = fetch(_url)
  with zipfile.ZipFile(fn, 'r') as zip_ref:
    zip_ref.extractall(BASEDIR)
  fn.unlink()

# Load the validation-set instance annotations and derive the lookup tables used below.
with open(BASEDIR/'annotations/instances_val2017.json', 'r') as f:
  annotations_raw = json.load(f)

images = annotations_raw['images']
categories = annotations_raw['categories']
annotations = annotations_raw['annotations']

file_name_to_id = {img['file_name']: img['id'] for img in images}
id_to_width = {img['id']: img['width'] for img in images}
id_to_height = {img['id']: img['height'] for img in images}

# Contiguous category ids start at 1 and follow the order of the categories list.
json_category_id_to_contiguous_id = {cat['id']: idx for idx, cat in enumerate(categories, start=1)}
contiguous_category_id_to_json_id = {contig: json_id for json_id, contig in json_category_id_to_contiguous_id.items()}
def encode(bimask):
  """Encode a binary mask with _mask.encode.

  A 3-D input is passed through and the full result returned; a 2-D (h, w) input is reshaped to
  (h, w, 1) in Fortran order and the single resulting entry returned. Any other rank returns None.
  """
  ndim = len(bimask.shape)
  if ndim == 2:
    h, w = bimask.shape
    return _mask.encode(bimask.reshape((h, w, 1), order='F'))[0]
  if ndim == 3: return _mask.encode(bimask)
  return None
def decode(rleObjs):
  """Decode RLE data with _mask.decode: a list returns the full result, a single object returns its [:, :, 0] slice."""
  if type(rleObjs) != list:
    # Wrap the lone object, decode, then take the only channel.
    return _mask.decode([rleObjs])[:,:,0]
  return _mask.decode(rleObjs)
def area(rleObjs):
  """Return _mask.area for a list of RLE objects, or the single value when given one object."""
  is_batch = type(rleObjs) == list
  areas = _mask.area(rleObjs if is_batch else [rleObjs])
  return areas if is_batch else areas[0]
def toBbox(rleObjs):
  """Return _mask.toBbox for a list of RLE objects, or the first (only) box when given one object."""
  if type(rleObjs) == list: return _mask.toBbox(rleObjs)
  only_box, *_ = _mask.toBbox([rleObjs])[:1]
  return only_box
def convert_prediction_to_coco_bbox(file_name, prediction):
  """Convert one image's box predictions into a list of COCO result dicts.

  Each dict has the keys image_id, category_id, bbox and score. Any exception is printed together
  with the file name and whatever was collected so far (normally nothing) is returned.
  """
  coco_results = []
  try:
    original_id = file_name_to_id[file_name]
    if len(prediction) == 0: return coco_results

    # Scale the prediction to the original image size, then switch to "xywh" boxes.
    size = (id_to_width[original_id], id_to_height[original_id])
    prediction = prediction.resize(size).convert("xywh")

    boxes = prediction.bbox.numpy().tolist()
    scores = prediction.get_field("scores").numpy().tolist()
    labels = prediction.get_field("labels").numpy().tolist()
    mapped_labels = [contiguous_category_id_to_json_id[int(label)] for label in labels]

    # Build the whole batch first so a failure part-way through adds nothing to the result.
    entries = []
    for k, box in enumerate(boxes):
      entries.append({
        "image_id": original_id,
        "category_id": mapped_labels[k],
        "bbox": box,
        "score": scores[k],
      })
    coco_results.extend(entries)
  except Exception as e:
    print(file_name, e)
  return coco_results
# Module-level Masker instance; convert_prediction_to_coco_mask calls it on the predicted masks.
# NOTE(review): the meaning of threshold/padding is defined by examples.mask_rcnn.Masker — confirm there.
masker = Masker(threshold=0.5, padding=1)
def convert_prediction_to_coco_mask(file_name, prediction):
  """Convert one image's mask predictions into a list of COCO result dicts.

  Each dict has the keys image_id, category_id, segmentation (an RLE dict whose "counts" is a str)
  and score. Any exception is printed together with the file name and whatever was collected so far
  (normally nothing) is returned.
  """
  coco_results = []
  try:
    original_id = file_name_to_id[file_name]
    if len(prediction) == 0: return coco_results

    size = (id_to_width[original_id], id_to_height[original_id])
    prediction = prediction.resize(size)
    masks = prediction.get_field("mask")
    scores = prediction.get_field("scores").numpy().tolist()
    labels = prediction.get_field("labels").numpy().tolist()
    masks = masker([masks], [prediction])[0].numpy()

    # Encode each mask as a Fortran-ordered array with a trailing axis of length one.
    rles = []
    for mask in masks:
      as_fortran = np.array(mask[0, :, :, np.newaxis], order="F")
      rles.append(encode(as_fortran)[0])
    # "counts" comes back as bytes; it is decoded to str here.
    for rle in rles:
      rle["counts"] = rle["counts"].decode("utf-8")

    mapped_labels = [contiguous_category_id_to_json_id[int(label)] for label in labels]

    # Build the whole batch first so a failure part-way through adds nothing to the result.
    entries = []
    for k, rle in enumerate(rles):
      entries.append({
        "image_id": original_id,
        "category_id": mapped_labels[k],
        "segmentation": rle,
        "score": scores[k],
      })
    coco_results.extend(entries)
  except Exception as e:
    print(file_name, e)
  return coco_results
def accumulate_predictions_for_coco(coco_results, json_result_file, rm=False):
  """Append each result to json_result_file as one JSON document per line.

  With rm=True an existing file is deleted first, so the file ends up holding only this batch.
  """
  out_path = pathlib.Path(json_result_file)
  if rm and out_path.exists(): out_path.unlink()
  with open(out_path, "a") as out:
    for entry in coco_results:
      out.write(json.dumps(entry))
      out.write('\n')
def remove_dup(l):
  """Return the items of l with duplicates removed, keeping the first occurrence of each in order."""
  # dict keys are unique and keep insertion order, which is exactly the de-duplication wanted.
  return list(dict.fromkeys(l))
class NpEncoder(json.JSONEncoder):
  """JSON encoder that serialises NumPy integers, floats and arrays as plain Python values."""
  def default(self, obj):
    # Checked in order; anything unmatched falls through to the base class, which raises TypeError.
    converters = (
      (np.integer, int),
      (np.floating, float),
      (np.ndarray, lambda arr: arr.tolist()),
    )
    for np_type, convert in converters:
      if isinstance(obj, np_type): return convert(obj)
    return super().default(obj)
def evaluate_predictions_on_coco(json_result_file, iou_type="bbox"):
  """Run COCOeval over the line-delimited results in json_result_file and return the COCOeval object.

  Duplicate result lines are dropped, the remaining results are written as one JSON list to
  '<json_result_file>.flattend', and that file is loaded against the val2017 instance annotations.
  """
  with open(json_result_file, "r") as f:
    coco_results = [json.loads(line) for line in f]

  coco_gt = COCO(str(BASEDIR/'annotations/instances_val2017.json'))

  # De-duplicate on the serialised form, since the result dicts themselves are not hashable.
  serialised = [json.dumps(d, cls=NpEncoder) for d in coco_results]
  unique_list = [json.loads(s) for s in remove_dup(serialised)]

  flattened_path = f'{json_result_file}.flattend'
  with open(flattened_path, "w") as f:
    json.dump(unique_list, f)

  coco_dt = coco_gt.loadRes(str(flattened_path))
  coco_eval = COCOeval(coco_gt, coco_dt, iou_type)
  for step in (coco_eval.evaluate, coco_eval.accumulate, coco_eval.summarize):
    step()
  return coco_eval
def iterate(files, bs=1):
  """Yield the items of files in lists of bs items; the final list may be shorter."""
  pending = []
  for item in files:
    pending.append(item)
    if len(pending) < bs: continue
    yield pending
    pending = []
  # Flush whatever is left over after the last full batch.
  if len(pending) > 0: yield pending

View file

@ -1 +0,0 @@
disasm.so

View file

@ -1,5 +0,0 @@
From the Freedreno project
https://gallium.readthedocs.io/en/latest/gallium/drivers/freedreno.html
In Mesa3D, so licensed MIT.

View file

@ -1,22 +0,0 @@
import ctypes
import os
import pathlib
import struct
from hexdump import hexdump
# Cached ctypes handle to the `disasm` symbol of disasm.so; loaded on first use by disasm_raw.
fxn = None
def disasm_raw(buf):
  """Pass buf and its length to the native `disasm` function, building and loading disasm.so first if needed.

  The shared library lives next to this file and is compiled from disasm-a3xx.c with gcc when it
  does not exist yet.

  Raises:
    OSError: if gcc cannot be run, the build fails, or the library cannot be loaded.
  """
  global fxn
  if fxn is None:
    import subprocess  # local import: only needed for the one-off build
    src_dir = pathlib.Path(__file__).parent
    shared = src_dir / "disasm.so"
    if not shared.is_file():
      # An argument list plus cwd avoids interpolating the directory into a shell string, which
      # broke on paths containing spaces or shell metacharacters.
      build = subprocess.run(["gcc", "-shared", "disasm-a3xx.c", "-o", "disasm.so"], cwd=src_dir)
      # Report a failed build directly instead of as a confusing load error from CDLL below.
      if build.returncode != 0:
        raise OSError(f"failed to build {shared} (gcc exited with status {build.returncode})")
    fxn = ctypes.CDLL(shared.as_posix())['disasm']
  fxn(buf, len(buf))
def disasm(buf):
  """Disassemble the image embedded in buf.

  The image offset is read as a native unsigned int at byte 0xc0 and its size at byte 0x100; that
  slice of buf is handed to disasm_raw.
  """
  image_offset, image_size = (struct.unpack("I", buf[off:off+4])[0] for off in (0xc0, 0x100))
  image_end = image_offset + image_size
  disasm_raw(buf[image_offset:image_end])

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

View file

@ -1,906 +0,0 @@
/*
* Mesa 3-D graphics library
*
* Copyright (C) 1999-2008 Brian Paul All Rights Reserved.
* Copyright (C) 2009 VMware, Inc. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef SHADER_ENUMS_H
#define SHADER_ENUMS_H
#include <stdbool.h>
/* Project-wide (GL and Vulkan) maximum. */
#define MAX_DRAW_BUFFERS 8
#ifdef __cplusplus
extern "C" {
#endif
/**
* Shader stages.
*
* The order must match how shaders are ordered in the pipeline.
* The GLSL linker assumes that if i<j, then the j-th shader is
* executed later than the i-th shader.
*/
typedef enum
{
   MESA_SHADER_NONE = -1,
   MESA_SHADER_VERTEX = 0,
   MESA_SHADER_TESS_CTRL = 1,
   MESA_SHADER_TESS_EVAL = 2,
   MESA_SHADER_GEOMETRY = 3,
   MESA_SHADER_FRAGMENT = 4,
   MESA_SHADER_COMPUTE = 5,
   /* must be last so it doesn't affect the GL pipeline */
   MESA_SHADER_KERNEL = 6,
} gl_shader_stage;

/**
 * Return true when \p stage is MESA_SHADER_COMPUTE or MESA_SHADER_KERNEL,
 * false for every other stage.
 */
static inline bool
gl_shader_stage_is_compute(gl_shader_stage stage)
{
   switch (stage) {
   case MESA_SHADER_COMPUTE:
   case MESA_SHADER_KERNEL:
      return true;
   default:
      return false;
   }
}
/**
* Number of STATE_* values we need to address any GL state.
* Used to dimension arrays.
*/
#define STATE_LENGTH 5
typedef short gl_state_index16; /* see enum gl_state_index */
const char *gl_shader_stage_name(gl_shader_stage stage);
/**
* Translate a gl_shader_stage to a short shader stage name for debug
* printouts and error messages.
*/
const char *_mesa_shader_stage_to_string(unsigned stage);
/**
* Translate a gl_shader_stage to a shader stage abbreviation (VS, GS, FS)
* for debug printouts and error messages.
*/
const char *_mesa_shader_stage_to_abbrev(unsigned stage);
/**
* GL related stages (not including CL)
*/
#define MESA_SHADER_STAGES (MESA_SHADER_COMPUTE + 1)
/**
* All stages
*/
#define MESA_ALL_SHADER_STAGES (MESA_SHADER_KERNEL + 1)
/**
* Indexes for vertex program attributes.
* GL_NV_vertex_program aliases generic attributes over the conventional
* attributes. In GL_ARB_vertex_program shader the aliasing is optional.
* In GL_ARB_vertex_shader / OpenGL 2.0 the aliasing is disallowed (the
* generic attributes are distinct/separate).
*/
typedef enum
{
VERT_ATTRIB_POS,
VERT_ATTRIB_NORMAL,
VERT_ATTRIB_COLOR0,
VERT_ATTRIB_COLOR1,
VERT_ATTRIB_FOG,
VERT_ATTRIB_COLOR_INDEX,
VERT_ATTRIB_EDGEFLAG,
VERT_ATTRIB_TEX0,
VERT_ATTRIB_TEX1,
VERT_ATTRIB_TEX2,
VERT_ATTRIB_TEX3,
VERT_ATTRIB_TEX4,
VERT_ATTRIB_TEX5,
VERT_ATTRIB_TEX6,
VERT_ATTRIB_TEX7,
VERT_ATTRIB_POINT_SIZE,
VERT_ATTRIB_GENERIC0,
VERT_ATTRIB_GENERIC1,
VERT_ATTRIB_GENERIC2,
VERT_ATTRIB_GENERIC3,
VERT_ATTRIB_GENERIC4,
VERT_ATTRIB_GENERIC5,
VERT_ATTRIB_GENERIC6,
VERT_ATTRIB_GENERIC7,
VERT_ATTRIB_GENERIC8,
VERT_ATTRIB_GENERIC9,
VERT_ATTRIB_GENERIC10,
VERT_ATTRIB_GENERIC11,
VERT_ATTRIB_GENERIC12,
VERT_ATTRIB_GENERIC13,
VERT_ATTRIB_GENERIC14,
VERT_ATTRIB_GENERIC15,
VERT_ATTRIB_MAX
} gl_vert_attrib;
const char *gl_vert_attrib_name(gl_vert_attrib attrib);
/**
* Symbolic constants to help iterating over
* specific blocks of vertex attributes.
*
* VERT_ATTRIB_FF
* includes all fixed function attributes as well as
* the aliased GL_NV_vertex_program shader attributes.
* VERT_ATTRIB_TEX
* include the classic texture coordinate attributes.
* Is a subset of VERT_ATTRIB_FF.
* VERT_ATTRIB_GENERIC
* include the OpenGL 2.0+ GLSL generic shader attributes.
* These alias the generic GL_ARB_vertex_shader attributes.
* VERT_ATTRIB_MAT
* include the generic shader attributes used to alias
* varying material values for the TNL shader programs.
* They are located at the end of the generic attribute
* block not to overlap with the generic 0 attribute.
*/
#define VERT_ATTRIB_FF(i) (VERT_ATTRIB_POS + (i))
#define VERT_ATTRIB_FF_MAX VERT_ATTRIB_GENERIC0
#define VERT_ATTRIB_TEX(i) (VERT_ATTRIB_TEX0 + (i))
#define VERT_ATTRIB_TEX_MAX MAX_TEXTURE_COORD_UNITS
#define VERT_ATTRIB_GENERIC(i) (VERT_ATTRIB_GENERIC0 + (i))
#define VERT_ATTRIB_GENERIC_MAX MAX_VERTEX_GENERIC_ATTRIBS
#define VERT_ATTRIB_MAT0 \
(VERT_ATTRIB_GENERIC_MAX - VERT_ATTRIB_MAT_MAX)
#define VERT_ATTRIB_MAT(i) \
VERT_ATTRIB_GENERIC((i) + VERT_ATTRIB_MAT0)
#define VERT_ATTRIB_MAT_MAX MAT_ATTRIB_MAX
/**
* Bitflags for vertex attributes.
* These are used in bitfields in many places.
*/
/*@{*/
#define VERT_BIT_POS BITFIELD_BIT(VERT_ATTRIB_POS)
#define VERT_BIT_NORMAL BITFIELD_BIT(VERT_ATTRIB_NORMAL)
#define VERT_BIT_COLOR0 BITFIELD_BIT(VERT_ATTRIB_COLOR0)
#define VERT_BIT_COLOR1 BITFIELD_BIT(VERT_ATTRIB_COLOR1)
#define VERT_BIT_FOG BITFIELD_BIT(VERT_ATTRIB_FOG)
#define VERT_BIT_COLOR_INDEX BITFIELD_BIT(VERT_ATTRIB_COLOR_INDEX)
#define VERT_BIT_EDGEFLAG BITFIELD_BIT(VERT_ATTRIB_EDGEFLAG)
#define VERT_BIT_TEX0 BITFIELD_BIT(VERT_ATTRIB_TEX0)
#define VERT_BIT_TEX1 BITFIELD_BIT(VERT_ATTRIB_TEX1)
#define VERT_BIT_TEX2 BITFIELD_BIT(VERT_ATTRIB_TEX2)
#define VERT_BIT_TEX3 BITFIELD_BIT(VERT_ATTRIB_TEX3)
#define VERT_BIT_TEX4 BITFIELD_BIT(VERT_ATTRIB_TEX4)
#define VERT_BIT_TEX5 BITFIELD_BIT(VERT_ATTRIB_TEX5)
#define VERT_BIT_TEX6 BITFIELD_BIT(VERT_ATTRIB_TEX6)
#define VERT_BIT_TEX7 BITFIELD_BIT(VERT_ATTRIB_TEX7)
#define VERT_BIT_POINT_SIZE BITFIELD_BIT(VERT_ATTRIB_POINT_SIZE)
#define VERT_BIT_GENERIC0 BITFIELD_BIT(VERT_ATTRIB_GENERIC0)
#define VERT_BIT(i) BITFIELD_BIT(i)
#define VERT_BIT_ALL BITFIELD_RANGE(0, VERT_ATTRIB_MAX)
#define VERT_BIT_FF(i) VERT_BIT(i)
#define VERT_BIT_FF_ALL BITFIELD_RANGE(0, VERT_ATTRIB_FF_MAX)
#define VERT_BIT_TEX(i) VERT_BIT(VERT_ATTRIB_TEX(i))
#define VERT_BIT_TEX_ALL \
BITFIELD_RANGE(VERT_ATTRIB_TEX(0), VERT_ATTRIB_TEX_MAX)
#define VERT_BIT_GENERIC(i) VERT_BIT(VERT_ATTRIB_GENERIC(i))
#define VERT_BIT_GENERIC_ALL \
BITFIELD_RANGE(VERT_ATTRIB_GENERIC(0), VERT_ATTRIB_GENERIC_MAX)
#define VERT_BIT_MAT(i) VERT_BIT(VERT_ATTRIB_MAT(i))
#define VERT_BIT_MAT_ALL \
BITFIELD_RANGE(VERT_ATTRIB_MAT(0), VERT_ATTRIB_MAT_MAX)
/*@}*/
#define MAX_VARYING 32 /**< number of float[4] vectors */
/**
* Indexes for vertex shader outputs, geometry shader inputs/outputs, and
* fragment shader inputs.
*
* Note that some of these values are not available to all pipeline stages.
*
* When this enum is updated, the following code must be updated too:
* - vertResults (in prog_print.c's arb_output_attrib_string())
* - fragAttribs (in prog_print.c's arb_input_attrib_string())
* - _mesa_varying_slot_in_fs()
*/
typedef enum
{
VARYING_SLOT_POS,
VARYING_SLOT_COL0, /* COL0 and COL1 must be contiguous */
VARYING_SLOT_COL1,
VARYING_SLOT_FOGC,
VARYING_SLOT_TEX0, /* TEX0-TEX7 must be contiguous */
VARYING_SLOT_TEX1,
VARYING_SLOT_TEX2,
VARYING_SLOT_TEX3,
VARYING_SLOT_TEX4,
VARYING_SLOT_TEX5,
VARYING_SLOT_TEX6,
VARYING_SLOT_TEX7,
VARYING_SLOT_PSIZ, /* Does not appear in FS */
VARYING_SLOT_BFC0, /* Does not appear in FS */
VARYING_SLOT_BFC1, /* Does not appear in FS */
VARYING_SLOT_EDGE, /* Does not appear in FS */
VARYING_SLOT_CLIP_VERTEX, /* Does not appear in FS */
VARYING_SLOT_CLIP_DIST0,
VARYING_SLOT_CLIP_DIST1,
VARYING_SLOT_CULL_DIST0,
VARYING_SLOT_CULL_DIST1,
VARYING_SLOT_PRIMITIVE_ID, /* Does not appear in VS */
VARYING_SLOT_LAYER, /* Appears as VS or GS output */
VARYING_SLOT_VIEWPORT, /* Appears as VS or GS output */
VARYING_SLOT_FACE, /* FS only */
VARYING_SLOT_PNTC, /* FS only */
VARYING_SLOT_TESS_LEVEL_OUTER, /* Only appears as TCS output. */
VARYING_SLOT_TESS_LEVEL_INNER, /* Only appears as TCS output. */
VARYING_SLOT_BOUNDING_BOX0, /* Only appears as TCS output. */
VARYING_SLOT_BOUNDING_BOX1, /* Only appears as TCS output. */
VARYING_SLOT_VIEW_INDEX,
VARYING_SLOT_VIEWPORT_MASK, /* Does not appear in FS */
VARYING_SLOT_VAR0, /* First generic varying slot */
/* the remaining are simply for the benefit of gl_varying_slot_name()
* and not to be construed as an upper bound:
*/
VARYING_SLOT_VAR1,
VARYING_SLOT_VAR2,
VARYING_SLOT_VAR3,
VARYING_SLOT_VAR4,
VARYING_SLOT_VAR5,
VARYING_SLOT_VAR6,
VARYING_SLOT_VAR7,
VARYING_SLOT_VAR8,
VARYING_SLOT_VAR9,
VARYING_SLOT_VAR10,
VARYING_SLOT_VAR11,
VARYING_SLOT_VAR12,
VARYING_SLOT_VAR13,
VARYING_SLOT_VAR14,
VARYING_SLOT_VAR15,
VARYING_SLOT_VAR16,
VARYING_SLOT_VAR17,
VARYING_SLOT_VAR18,
VARYING_SLOT_VAR19,
VARYING_SLOT_VAR20,
VARYING_SLOT_VAR21,
VARYING_SLOT_VAR22,
VARYING_SLOT_VAR23,
VARYING_SLOT_VAR24,
VARYING_SLOT_VAR25,
VARYING_SLOT_VAR26,
VARYING_SLOT_VAR27,
VARYING_SLOT_VAR28,
VARYING_SLOT_VAR29,
VARYING_SLOT_VAR30,
VARYING_SLOT_VAR31,
} gl_varying_slot;
#define VARYING_SLOT_MAX (VARYING_SLOT_VAR0 + MAX_VARYING)
#define VARYING_SLOT_PATCH0 (VARYING_SLOT_MAX)
#define VARYING_SLOT_TESS_MAX (VARYING_SLOT_PATCH0 + MAX_VARYING)
#define MAX_VARYINGS_INCL_PATCH (VARYING_SLOT_TESS_MAX - VARYING_SLOT_VAR0)
const char *gl_varying_slot_name(gl_varying_slot slot);
/**
* Bitflags for varying slots.
*/
/*@{*/
#define VARYING_BIT_POS BITFIELD64_BIT(VARYING_SLOT_POS)
#define VARYING_BIT_COL0 BITFIELD64_BIT(VARYING_SLOT_COL0)
#define VARYING_BIT_COL1 BITFIELD64_BIT(VARYING_SLOT_COL1)
#define VARYING_BIT_FOGC BITFIELD64_BIT(VARYING_SLOT_FOGC)
#define VARYING_BIT_TEX0 BITFIELD64_BIT(VARYING_SLOT_TEX0)
#define VARYING_BIT_TEX1 BITFIELD64_BIT(VARYING_SLOT_TEX1)
#define VARYING_BIT_TEX2 BITFIELD64_BIT(VARYING_SLOT_TEX2)
#define VARYING_BIT_TEX3 BITFIELD64_BIT(VARYING_SLOT_TEX3)
#define VARYING_BIT_TEX4 BITFIELD64_BIT(VARYING_SLOT_TEX4)
#define VARYING_BIT_TEX5 BITFIELD64_BIT(VARYING_SLOT_TEX5)
#define VARYING_BIT_TEX6 BITFIELD64_BIT(VARYING_SLOT_TEX6)
#define VARYING_BIT_TEX7 BITFIELD64_BIT(VARYING_SLOT_TEX7)
#define VARYING_BIT_TEX(U) BITFIELD64_BIT(VARYING_SLOT_TEX0 + (U))
#define VARYING_BITS_TEX_ANY BITFIELD64_RANGE(VARYING_SLOT_TEX0, \
MAX_TEXTURE_COORD_UNITS)
#define VARYING_BIT_PSIZ BITFIELD64_BIT(VARYING_SLOT_PSIZ)
#define VARYING_BIT_BFC0 BITFIELD64_BIT(VARYING_SLOT_BFC0)
#define VARYING_BIT_BFC1 BITFIELD64_BIT(VARYING_SLOT_BFC1)
#define VARYING_BITS_COLOR (VARYING_BIT_COL0 | \
VARYING_BIT_COL1 | \
VARYING_BIT_BFC0 | \
VARYING_BIT_BFC1)
#define VARYING_BIT_EDGE BITFIELD64_BIT(VARYING_SLOT_EDGE)
#define VARYING_BIT_CLIP_VERTEX BITFIELD64_BIT(VARYING_SLOT_CLIP_VERTEX)
#define VARYING_BIT_CLIP_DIST0 BITFIELD64_BIT(VARYING_SLOT_CLIP_DIST0)
#define VARYING_BIT_CLIP_DIST1 BITFIELD64_BIT(VARYING_SLOT_CLIP_DIST1)
#define VARYING_BIT_CULL_DIST0 BITFIELD64_BIT(VARYING_SLOT_CULL_DIST0)
#define VARYING_BIT_CULL_DIST1 BITFIELD64_BIT(VARYING_SLOT_CULL_DIST1)
#define VARYING_BIT_PRIMITIVE_ID BITFIELD64_BIT(VARYING_SLOT_PRIMITIVE_ID)
#define VARYING_BIT_LAYER BITFIELD64_BIT(VARYING_SLOT_LAYER)
#define VARYING_BIT_VIEWPORT BITFIELD64_BIT(VARYING_SLOT_VIEWPORT)
#define VARYING_BIT_FACE BITFIELD64_BIT(VARYING_SLOT_FACE)
#define VARYING_BIT_PNTC BITFIELD64_BIT(VARYING_SLOT_PNTC)
#define VARYING_BIT_TESS_LEVEL_OUTER BITFIELD64_BIT(VARYING_SLOT_TESS_LEVEL_OUTER)
#define VARYING_BIT_TESS_LEVEL_INNER BITFIELD64_BIT(VARYING_SLOT_TESS_LEVEL_INNER)
#define VARYING_BIT_BOUNDING_BOX0 BITFIELD64_BIT(VARYING_SLOT_BOUNDING_BOX0)
#define VARYING_BIT_BOUNDING_BOX1 BITFIELD64_BIT(VARYING_SLOT_BOUNDING_BOX1)
#define VARYING_BIT_VIEWPORT_MASK BITFIELD64_BIT(VARYING_SLOT_VIEWPORT_MASK)
#define VARYING_BIT_VAR(V) BITFIELD64_BIT(VARYING_SLOT_VAR0 + (V))
/*@}*/
/**
* Bitflags for system values.
*/
#define SYSTEM_BIT_SAMPLE_ID ((uint64_t)1 << SYSTEM_VALUE_SAMPLE_ID)
#define SYSTEM_BIT_SAMPLE_POS ((uint64_t)1 << SYSTEM_VALUE_SAMPLE_POS)
#define SYSTEM_BIT_SAMPLE_MASK_IN ((uint64_t)1 << SYSTEM_VALUE_SAMPLE_MASK_IN)
#define SYSTEM_BIT_LOCAL_INVOCATION_ID ((uint64_t)1 << SYSTEM_VALUE_LOCAL_INVOCATION_ID)
/**
* If the gl_register_file is PROGRAM_SYSTEM_VALUE, the register index will be
* one of these values. If a NIR variable's mode is nir_var_system_value, it
* will be one of these values.
*/
typedef enum
{
/**
* \name System values applicable to all shaders
*/
/*@{*/
/**
* Builtin variables added by GL_ARB_shader_ballot.
*/
/*@{*/
/**
* From the GL_ARB_shader_ballot spec:
*
* "A sub-group is a collection of invocations which execute in lockstep.
* The variable <gl_SubGroupSizeARB> is the maximum number of
* invocations in a sub-group. The maximum <gl_SubGroupSizeARB>
* supported in this extension is 64."
*
* The spec defines this as a uniform. However, it's highly unlikely that
* implementations actually treat it as a uniform (which is loaded from a
* constant buffer). Most likely, this is an implementation-wide constant,
* or perhaps something that depends on the shader stage.
*/
SYSTEM_VALUE_SUBGROUP_SIZE,
/**
* From the GL_ARB_shader_ballot spec:
*
* "The variable <gl_SubGroupInvocationARB> holds the index of the
* invocation within sub-group. This variable is in the range 0 to
* <gl_SubGroupSizeARB>-1, where <gl_SubGroupSizeARB> is the total
* number of invocations in a sub-group."
*/
SYSTEM_VALUE_SUBGROUP_INVOCATION,
/**
* From the GL_ARB_shader_ballot spec:
*
* "The <gl_SubGroup??MaskARB> variables provide a bitmask for all
* invocations, with one bit per invocation starting with the least
* significant bit, according to the following table,
*
* variable equation for bit values
* -------------------- ------------------------------------
* gl_SubGroupEqMaskARB bit index == gl_SubGroupInvocationARB
* gl_SubGroupGeMaskARB bit index >= gl_SubGroupInvocationARB
* gl_SubGroupGtMaskARB bit index > gl_SubGroupInvocationARB
* gl_SubGroupLeMaskARB bit index <= gl_SubGroupInvocationARB
* gl_SubGroupLtMaskARB bit index < gl_SubGroupInvocationARB
*/
SYSTEM_VALUE_SUBGROUP_EQ_MASK,
SYSTEM_VALUE_SUBGROUP_GE_MASK,
SYSTEM_VALUE_SUBGROUP_GT_MASK,
SYSTEM_VALUE_SUBGROUP_LE_MASK,
SYSTEM_VALUE_SUBGROUP_LT_MASK,
/*@}*/
/**
* Builtin variables added by VK_KHR_subgroups
*/
/*@{*/
SYSTEM_VALUE_NUM_SUBGROUPS,
SYSTEM_VALUE_SUBGROUP_ID,
/*@}*/
/*@}*/
/**
* \name Vertex shader system values
*/
/*@{*/
/**
* OpenGL-style vertex ID.
*
* Section 2.11.7 (Shader Execution), subsection Shader Inputs, of the
* OpenGL 3.3 core profile spec says:
*
* "gl_VertexID holds the integer index i implicitly passed by
* DrawArrays or one of the other drawing commands defined in section
* 2.8.3."
*
* Section 2.8.3 (Drawing Commands) of the same spec says:
*
* "The commands....are equivalent to the commands with the same base
* name (without the BaseVertex suffix), except that the ith element
* transferred by the corresponding draw call will be taken from
* element indices[i] + basevertex of each enabled array."
*
* Additionally, the overview in the GL_ARB_shader_draw_parameters spec
* says:
*
* "In unextended GL, vertex shaders have inputs named gl_VertexID and
* gl_InstanceID, which contain, respectively the index of the vertex
* and instance. The value of gl_VertexID is the implicitly passed
* index of the vertex being processed, which includes the value of
* baseVertex, for those commands that accept it."
*
* gl_VertexID gets basevertex added in. This differs from DirectX where
* SV_VertexID does \b not get basevertex added in.
*
* \note
* If all system values are available, \c SYSTEM_VALUE_VERTEX_ID will be
* equal to \c SYSTEM_VALUE_VERTEX_ID_ZERO_BASE plus
* \c SYSTEM_VALUE_BASE_VERTEX.
*
* \sa SYSTEM_VALUE_VERTEX_ID_ZERO_BASE, SYSTEM_VALUE_BASE_VERTEX
*/
SYSTEM_VALUE_VERTEX_ID,
/**
* Instanced ID as supplied to gl_InstanceID
*
* Values assigned to gl_InstanceID always begin with zero, regardless of
* the value of baseinstance.
*
* Section 11.1.3.9 (Shader Inputs) of the OpenGL 4.4 core profile spec
* says:
*
* "gl_InstanceID holds the integer instance number of the current
* primitive in an instanced draw call (see section 10.5)."
*
* Through a big chain of pseudocode, section 10.5 describes that
* baseinstance is not counted by gl_InstanceID. In that section, notice
*
* "If an enabled vertex attribute array is instanced (it has a
* non-zero divisor as specified by VertexAttribDivisor), the element
* index that is transferred to the GL, for all vertices, is given by
*
* floor(instance/divisor) + baseinstance
*
* If an array corresponding to an attribute required by a vertex
* shader is not enabled, then the corresponding element is taken from
* the current attribute state (see section 10.2)."
*
* Note that baseinstance is \b not included in the value of instance.
*/
SYSTEM_VALUE_INSTANCE_ID,
/**
* Vulkan InstanceIndex.
*
* InstanceIndex = gl_InstanceID + gl_BaseInstance
*/
SYSTEM_VALUE_INSTANCE_INDEX,
/**
* DirectX-style vertex ID.
*
* Unlike \c SYSTEM_VALUE_VERTEX_ID, this system value does \b not include
* the value of basevertex.
*
* \sa SYSTEM_VALUE_VERTEX_ID, SYSTEM_VALUE_BASE_VERTEX
*/
SYSTEM_VALUE_VERTEX_ID_ZERO_BASE,
/**
* Value of \c basevertex passed to \c glDrawElementsBaseVertex and similar
* functions.
*
* \sa SYSTEM_VALUE_VERTEX_ID, SYSTEM_VALUE_VERTEX_ID_ZERO_BASE
*/
SYSTEM_VALUE_BASE_VERTEX,
/**
* Depending on the type of the draw call (indexed or non-indexed),
* is the value of \c basevertex passed to \c glDrawElementsBaseVertex and
* similar, or is the value of \c first passed to \c glDrawArrays and
* similar.
*
* \note
* It can be used to calculate the \c SYSTEM_VALUE_VERTEX_ID as
* \c SYSTEM_VALUE_VERTEX_ID_ZERO_BASE plus \c SYSTEM_VALUE_FIRST_VERTEX.
*
* \sa SYSTEM_VALUE_VERTEX_ID_ZERO_BASE, SYSTEM_VALUE_VERTEX_ID
*/
SYSTEM_VALUE_FIRST_VERTEX,
/**
* If the Draw command used to start the rendering was an indexed draw
* or not (~0/0). Useful to calculate \c SYSTEM_VALUE_BASE_VERTEX as
* \c SYSTEM_VALUE_IS_INDEXED_DRAW & \c SYSTEM_VALUE_FIRST_VERTEX.
*/
SYSTEM_VALUE_IS_INDEXED_DRAW,
/**
* Value of \c baseinstance passed to instanced draw entry points
*
* \sa SYSTEM_VALUE_INSTANCE_ID
*/
SYSTEM_VALUE_BASE_INSTANCE,
/**
* From GL_ARB_shader_draw_parameters:
*
* "Additionally, this extension adds a further built-in variable,
* gl_DrawID to the shading language. This variable contains the index
* of the draw currently being processed by a Multi* variant of a
* drawing command (such as MultiDrawElements or
* MultiDrawArraysIndirect)."
*
* If GL_ARB_multi_draw_indirect is not supported, this is always 0.
*/
SYSTEM_VALUE_DRAW_ID,
/*@}*/
/**
* \name Geometry shader system values
*/
/*@{*/
SYSTEM_VALUE_INVOCATION_ID, /**< (Also in Tessellation Control shader) */
/*@}*/
/**
* \name Fragment shader system values
*/
/*@{*/
SYSTEM_VALUE_FRAG_COORD,
SYSTEM_VALUE_POINT_COORD,
SYSTEM_VALUE_FRONT_FACE,
SYSTEM_VALUE_SAMPLE_ID,
SYSTEM_VALUE_SAMPLE_POS,
SYSTEM_VALUE_SAMPLE_MASK_IN,
SYSTEM_VALUE_HELPER_INVOCATION,
SYSTEM_VALUE_COLOR0,
SYSTEM_VALUE_COLOR1,
/*@}*/
/**
* \name Tessellation Evaluation shader system values
*/
/*@{*/
SYSTEM_VALUE_TESS_COORD,
SYSTEM_VALUE_VERTICES_IN, /**< Tessellation vertices in input patch */
SYSTEM_VALUE_PRIMITIVE_ID,
SYSTEM_VALUE_TESS_LEVEL_OUTER, /**< TES input */
SYSTEM_VALUE_TESS_LEVEL_INNER, /**< TES input */
SYSTEM_VALUE_TESS_LEVEL_OUTER_DEFAULT, /**< TCS input for passthru TCS */
SYSTEM_VALUE_TESS_LEVEL_INNER_DEFAULT, /**< TCS input for passthru TCS */
/*@}*/
/**
* \name Compute shader system values
*/
/*@{*/
SYSTEM_VALUE_LOCAL_INVOCATION_ID,
SYSTEM_VALUE_LOCAL_INVOCATION_INDEX,
SYSTEM_VALUE_GLOBAL_INVOCATION_ID,
SYSTEM_VALUE_GLOBAL_INVOCATION_INDEX,
SYSTEM_VALUE_WORK_GROUP_ID,
SYSTEM_VALUE_NUM_WORK_GROUPS,
SYSTEM_VALUE_LOCAL_GROUP_SIZE,
SYSTEM_VALUE_GLOBAL_GROUP_SIZE,
SYSTEM_VALUE_WORK_DIM,
SYSTEM_VALUE_USER_DATA_AMD,
/*@}*/
/** Required for VK_KHR_device_group */
SYSTEM_VALUE_DEVICE_INDEX,
/** Required for VK_KHX_multiview */
SYSTEM_VALUE_VIEW_INDEX,
/**
* Driver internal vertex-count, used (for example) for drivers to
* calculate stride for stream-out outputs. Not externally visible.
*/
SYSTEM_VALUE_VERTEX_CNT,
/**
* Required for AMD_shader_explicit_vertex_parameter and also used for
* varying-fetch instructions.
*
* The _SIZE value is "primitive size", used to scale i/j in primitive
* space to pixel space.
*/
SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL,
SYSTEM_VALUE_BARYCENTRIC_PERSP_SAMPLE,
SYSTEM_VALUE_BARYCENTRIC_PERSP_CENTROID,
SYSTEM_VALUE_BARYCENTRIC_PERSP_SIZE,
SYSTEM_VALUE_BARYCENTRIC_LINEAR_PIXEL,
SYSTEM_VALUE_BARYCENTRIC_LINEAR_CENTROID,
SYSTEM_VALUE_BARYCENTRIC_LINEAR_SAMPLE,
SYSTEM_VALUE_BARYCENTRIC_PULL_MODEL,
/**
* IR3 specific geometry shader and tessellation control shader system
* values that packs invocation id, thread id and vertex id. Having this
* as a nir level system value lets us do the unpacking in nir.
*/
SYSTEM_VALUE_GS_HEADER_IR3,
SYSTEM_VALUE_TCS_HEADER_IR3,
SYSTEM_VALUE_MAX /**< Number of values */
} gl_system_value;
const char *gl_system_value_name(gl_system_value sysval);
/**
 * Interpolation qualifiers that GLSL allows on a fragment shader input.
 *
 * INTERP_MODE_NONE is deliberately pinned to 0: zero-filling (memset) the
 * ir_variable data structure must select the default behavior.
 */
enum glsl_interp_mode
{
   INTERP_MODE_NONE          = 0,
   INTERP_MODE_SMOOTH        = 1,
   INTERP_MODE_FLAT          = 2,
   INTERP_MODE_NOPERSPECTIVE = 3,
   INTERP_MODE_EXPLICIT      = 4,
   INTERP_MODE_COUNT         = 5  /**< Number of interpolation qualifiers */
};
/**
 * Packing modes for interface blocks.
 *
 * NOTE(review): the names presumably mirror the GLSL std140 / shared /
 * packed / std430 layout qualifiers -- confirm against the users of this
 * enum.
 */
enum glsl_interface_packing {
   GLSL_INTERFACE_PACKING_STD140 = 0,
   GLSL_INTERFACE_PACKING_SHARED = 1,
   GLSL_INTERFACE_PACKING_PACKED = 2,
   GLSL_INTERFACE_PACKING_STD430 = 3
};
const char *glsl_interp_mode_name(enum glsl_interp_mode qual);
/**
 * Fragment program results
 */
typedef enum
{
   FRAG_RESULT_DEPTH       = 0,
   FRAG_RESULT_STENCIL     = 1,

   /* Written when one color goes to every render target.  In that case
    * none of the FRAG_RESULT_DATAn results is written.
    */
   FRAG_RESULT_COLOR       = 2,
   FRAG_RESULT_SAMPLE_MASK = 3,

   /* Per-render-target color results (GLSL gl_FragData[n] or
    * ARB_fragment_program fragment.color[n]).  Writing any of them means
    * FRAG_RESULT_COLOR is not written.
    *
    * FRAG_RESULT_DATA1 and above exist only so gl_frag_result_name() has
    * names to return; they must not be read as an upper bound.
    */
   FRAG_RESULT_DATA0       = 4,
   FRAG_RESULT_DATA1       = 5,
   FRAG_RESULT_DATA2       = 6,
   FRAG_RESULT_DATA3       = 7,
   FRAG_RESULT_DATA4       = 8,
   FRAG_RESULT_DATA5       = 9,
   FRAG_RESULT_DATA6       = 10,
   FRAG_RESULT_DATA7       = 11,
} gl_frag_result;
const char *gl_frag_result_name(gl_frag_result result);
/* Evaluates to FRAG_RESULT_DATA0 + MAX_DRAW_BUFFERS, i.e. presumably one
 * past the last per-render-target result slot.  MAX_DRAW_BUFFERS is not
 * defined in the visible part of this header -- TODO confirm where it
 * comes from.
 */
#define FRAG_RESULT_MAX (FRAG_RESULT_DATA0 + MAX_DRAW_BUFFERS)
/**
 * \brief Layout qualifiers for gl_FragDepth.
 *
 * With extension AMD_conservative_depth, gl_FragDepth may be redeclared
 * carrying one of these layout qualifiers.
 *
 * \see enum ir_depth_layout
 */
enum gl_frag_depth_layout
{
   FRAG_DEPTH_LAYOUT_NONE      = 0, /**< No layout is specified. */
   FRAG_DEPTH_LAYOUT_ANY       = 1,
   FRAG_DEPTH_LAYOUT_GREATER   = 2,
   FRAG_DEPTH_LAYOUT_LESS      = 3,
   FRAG_DEPTH_LAYOUT_UNCHANGED = 4
};
/**
 * \brief Buffer access qualifiers
 *
 * Each enumerator occupies its own bit, so the values can be OR-ed together.
 */
enum gl_access_qualifier
{
   ACCESS_COHERENT            = 0x01,
   ACCESS_RESTRICT            = 0x02,
   ACCESS_VOLATILE            = 0x04,
   ACCESS_NON_READABLE        = 0x08,
   ACCESS_NON_WRITEABLE       = 0x10,

   /** The access may use a non-uniform buffer or image index */
   ACCESS_NON_UNIFORM         = 0x20,

   /* Same semantics as NIR_INTRINSIC_CAN_REORDER, and only meant for
    * loads: the load may be reordered arbitrarily, or merged with other
    * loads from the same address.  ACCESS_NON_WRITEABLE combined with
    * ACCESS_RESTRICT implies it, provided neither ACCESS_COHERENT nor
    * ACCESS_VOLATILE is set.
    */
   ACCESS_CAN_REORDER         = 0x40,

   /** Use as little cache space as possible. */
   ACCESS_STREAM_CACHE_POLICY = 0x80,
};
/**
 * \brief Blend support qualifiers
 *
 * One bit per blend mode; BLEND_ALL is the union of all fifteen mode bits.
 */
enum gl_advanced_blend_mode
{
   BLEND_NONE           = 0,
   BLEND_MULTIPLY       = 1 << 0,
   BLEND_SCREEN         = 1 << 1,
   BLEND_OVERLAY        = 1 << 2,
   BLEND_DARKEN         = 1 << 3,
   BLEND_LIGHTEN        = 1 << 4,
   BLEND_COLORDODGE     = 1 << 5,
   BLEND_COLORBURN      = 1 << 6,
   BLEND_HARDLIGHT      = 1 << 7,
   BLEND_SOFTLIGHT      = 1 << 8,
   BLEND_DIFFERENCE     = 1 << 9,
   BLEND_EXCLUSION      = 1 << 10,
   BLEND_HSL_HUE        = 1 << 11,
   BLEND_HSL_SATURATION = 1 << 12,
   BLEND_HSL_COLOR      = 1 << 13,
   BLEND_HSL_LUMINOSITY = 1 << 14,
   BLEND_ALL            = (1 << 15) - 1,
};
/* Blend functions, numbered consecutively from 0. */
enum blend_func
{
   BLEND_FUNC_ADD              = 0,
   BLEND_FUNC_SUBTRACT         = 1,
   BLEND_FUNC_REVERSE_SUBTRACT = 2,
   BLEND_FUNC_MIN              = 3,
   BLEND_FUNC_MAX              = 4,
};
/* Blend factors, numbered consecutively from 0. */
enum blend_factor
{
   BLEND_FACTOR_ZERO               = 0,
   BLEND_FACTOR_SRC_COLOR          = 1,
   BLEND_FACTOR_DST_COLOR          = 2,
   BLEND_FACTOR_SRC_ALPHA          = 3,
   BLEND_FACTOR_DST_ALPHA          = 4,
   BLEND_FACTOR_CONSTANT_COLOR     = 5,
   BLEND_FACTOR_CONSTANT_ALPHA     = 6,
   BLEND_FACTOR_SRC_ALPHA_SATURATE = 7,
};
/* Tessellation spacing modes; 0 means no spacing was specified. */
enum gl_tess_spacing
{
   TESS_SPACING_UNSPECIFIED     = 0,
   TESS_SPACING_EQUAL           = 1,
   TESS_SPACING_FRACTIONAL_ODD  = 2,
   TESS_SPACING_FRACTIONAL_EVEN = 3,
};
/**
 * Compare functions for compiler lowering passes.
 *
 * The order matches GL's compare functions (shifted down by GL_NEVER) and
 * is exactly the same as gallium's PIPE_FUNC_*, so the numeric values must
 * not be rearranged.
 */
enum compare_func
{
   COMPARE_FUNC_NEVER    = 0,
   COMPARE_FUNC_LESS     = 1,
   COMPARE_FUNC_EQUAL    = 2,
   COMPARE_FUNC_LEQUAL   = 3,
   COMPARE_FUNC_GREATER  = 4,
   COMPARE_FUNC_NOTEQUAL = 5,
   COMPARE_FUNC_GEQUAL   = 6,
   COMPARE_FUNC_ALWAYS   = 7,
};
/**
 * Invocation grouping arrangements from NV_compute_shader_derivatives.
 *
 * The extension adds layout qualifiers selecting one of two arrangements of
 * compute shader invocations for derivative computation.
 *
 * With
 *
 *     layout(derivative_group_quadsNV) in;
 *
 * invocations are grouped into 2x2x1 arrays whose four local invocation ID
 * values follow this pattern (Y increases from bottom to top):
 *
 *     +-----------------+------------------+
 *     | (2x+0, 2y+0, z) |  (2x+1, 2y+0, z) |
 *     +-----------------+------------------+
 *     | (2x+0, 2y+1, z) |  (2x+1, 2y+1, z) |
 *     +-----------------+------------------+
 *
 * With
 *
 *     layout(derivative_group_linearNV) in;
 *
 * invocations are grouped into 2x2x1 arrays whose four local invocation
 * index values follow this pattern:
 *
 *     +------+------+
 *     | 4n+0 | 4n+1 |
 *     +------+------+
 *     | 4n+2 | 4n+3 |
 *     +------+------+
 *
 * Without either qualifier, derivatives in compute shaders return zero,
 * which is consistent with how built-in texture functions such as texture()
 * are handled in GLSL 4.50 compute shaders.
 */
enum gl_derivative_group {
   DERIVATIVE_GROUP_NONE   = 0,
   DERIVATIVE_GROUP_QUADS  = 1,
   DERIVATIVE_GROUP_LINEAR = 2,
};
/* Float-control bit flags: one bit per (mode, float bit size) combination,
 * with 0 standing for the default float control mode.
 */
enum float_controls
{
   FLOAT_CONTROLS_DEFAULT_FLOAT_CONTROL_MODE        = 0,
   FLOAT_CONTROLS_DENORM_PRESERVE_FP16              = 1 << 0,
   FLOAT_CONTROLS_DENORM_PRESERVE_FP32              = 1 << 1,
   FLOAT_CONTROLS_DENORM_PRESERVE_FP64              = 1 << 2,
   FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP16         = 1 << 3,
   FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP32         = 1 << 4,
   FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP64         = 1 << 5,
   FLOAT_CONTROLS_SIGNED_ZERO_INF_NAN_PRESERVE_FP16 = 1 << 6,
   FLOAT_CONTROLS_SIGNED_ZERO_INF_NAN_PRESERVE_FP32 = 1 << 7,
   FLOAT_CONTROLS_SIGNED_ZERO_INF_NAN_PRESERVE_FP64 = 1 << 8,
   FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP16            = 1 << 9,
   FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP32            = 1 << 10,
   FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP64            = 1 << 11,
   FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP16            = 1 << 12,
   FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP32            = 1 << 13,
   FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP64            = 1 << 14,
};
#ifdef __cplusplus
} /* extern "C" */
#endif
#endif /* SHADER_ENUMS_H */

View file

@ -1,326 +0,0 @@
/**************************************************************************
*
* Copyright 2008 VMware, Inc.
* All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sub license, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice (including the
* next paragraph) shall be included in all copies or substantial portions
* of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
* IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
* ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*
**************************************************************************/
#ifndef BITSCAN_H
#define BITSCAN_H
#include <assert.h>
#include <stdint.h>
#include <stdbool.h>
#include <string.h>
#if defined(_MSC_VER)
#include <intrin.h>
#endif
#if defined(__POPCNT__)
#include <popcntintrin.h>
#endif
//#include "c99_compat.h"
#ifdef __cplusplus
extern "C" {
#endif
/**
 * Find first bit set in word.  Least significant bit is 1.
 * Return 0 if no bits set.
 *
 * Three providers, tried in this order:
 *  - the compiler builtin, when the build advertises HAVE___BUILTIN_FFS;
 *  - an inline wrapper around _BitScanForward on MSVC.  _M_ARM64 is listed
 *    alongside the other targets so that MSVC/ARM64 builds get the inline
 *    version too, matching the 64-bit helper ffsll();
 *  - otherwise the platform's own ffs(), which is only declared here.
 */
#ifdef HAVE___BUILTIN_FFS
#define ffs __builtin_ffs
#elif defined(_MSC_VER) && (_M_IX86 || _M_ARM || _M_AMD64 || _M_ARM64 || _M_IA64)
static inline
int ffs(int i)
{
   unsigned long index;
   /* A zero return from _BitScanForward is mapped to "no bits set". */
   if (_BitScanForward(&index, i))
      return index + 1;
   else
      return 0;
}
#else
extern
int ffs(int i);
#endif
/**
 * 64-bit counterpart of ffs().
 *
 * Uses the compiler builtin when HAVE___BUILTIN_FFSLL is defined, an inline
 * _BitScanForward64 wrapper on 64-bit MSVC targets (one-based bit position,
 * or 0 when the intrinsic reports no set bit), and otherwise only declares
 * the platform's ffsll().
 */
#if defined(HAVE___BUILTIN_FFSLL)
#define ffsll __builtin_ffsll
#elif defined(_MSC_VER) && (_M_AMD64 || _M_ARM64 || _M_IA64)
static inline int
ffsll(long long int i)
{
   unsigned long pos;
   return _BitScanForward64(&pos, i) ? (int)(pos + 1) : 0;
}
#else
extern int
ffsll(long long int val);
#endif
/* Destructively loop over all of the bits in a mask as in:
 *
 *    while (mymask) {
 *       int i = u_bit_scan(&mymask);
 *       ... process element i
 *    }
 *
 * Returns the zero-based index of the lowest set bit of *mask and clears
 * that bit in *mask.
 *
 * *mask must be non-zero.  ffs() returns 0 for an empty mask, which would
 * turn the shift count below into -1 (undefined behavior); the assert makes
 * such a contract violation visible in builds with assertions enabled.
 */
static inline int
u_bit_scan(unsigned *mask)
{
   assert(*mask != 0);
   const int i = ffs(*mask) - 1;
   *mask ^= (1u << i);
   return i;
}
/* 64-bit variant of u_bit_scan(): returns the zero-based index of the
 * lowest set bit of *mask and clears that bit in *mask.
 *
 * *mask must be non-zero.  ffsll() returns 0 for an empty mask, which would
 * turn the shift count below into -1 (undefined behavior); the assert makes
 * such a contract violation visible in builds with assertions enabled.
 */
static inline int
u_bit_scan64(uint64_t *mask)
{
   assert(*mask != 0);
   const int i = ffsll(*mask) - 1;
   *mask ^= (((uint64_t)1) << i);
   return i;
}
/* Tell whether an unsigned value has at most one bit set.
 *
 * \note
 * Zero is treated as a power of two.
 */
static inline bool
util_is_power_of_two_or_zero(unsigned v)
{
   /* v - 1 flips the lowest set bit and everything below it, so the AND is
    * empty exactly when no other bit was set. */
   const unsigned without_lowest = v & (v - 1);
   return !without_lowest;
}
/* Tell whether a uint64_t value has at most one bit set.
 *
 * \note
 * Zero is treated as a power of two.
 */
static inline bool
util_is_power_of_two_or_zero64(uint64_t v)
{
   /* v - 1 flips the lowest set bit and everything below it, so the AND is
    * empty exactly when no other bit was set. */
   const uint64_t without_lowest = v & (v - 1);
   return !without_lowest;
}
/* Tell whether an unsigned value has exactly one bit set.
 *
 * \note
 * Zero is \b not treated as a power of two.
 */
static inline bool
util_is_power_of_two_nonzero(unsigned v)
{
   /* __POPCNT__ and HAVE___BUILTIN_POPCOUNT are not the same thing.
    * HAVE___BUILTIN_POPCOUNT only says the __builtin_popcount function
    * exists, while __POPCNT__ says _mm_popcnt_u32 exists and is a native
    * instruction.
    *
    * Keying this on SSE 4.2 compile-time flags instead has two drawbacks:
    * there is currently no build infrastructure for SSE 4.2 (only 4.1), so
    * it would have to be added, and some AMD CPUs support POPCNT but not
    * SSE 4.2 (e.g., Barcelona).
    */
#if defined(__POPCNT__)
   return _mm_popcnt_u32(v) == 1;
#else
   if (v == 0)
      return false;
   return (v & (v - 1)) == 0;
#endif
}
/* For looping over a bitmask when you want to loop over consecutive bits
 * manually, for example:
 *
 *    while (mask) {
 *       int start, count, i;
 *
 *       u_bit_scan_consecutive_range(&mask, &start, &count);
 *
 *       for (i = 0; i < count; i++)
 *          ... process element (start+i)
 *    }
 *
 * On return *start is the index of the lowest set bit, *count the length of
 * the run of set bits beginning there, and that run is cleared in *mask.
 *
 * *mask must be non-zero.  With an empty mask ffs() returns 0, *start would
 * become -1 and the shifts below would use a negative count (undefined
 * behavior); the assert makes such a contract violation visible in builds
 * with assertions enabled.
 */
static inline void
u_bit_scan_consecutive_range(unsigned *mask, int *start, int *count)
{
   assert(*mask != 0);

   /* All 32 bits set: handled separately because the general path would
    * need (1u << 32), which is not a valid shift. */
   if (*mask == 0xffffffff) {
      *start = 0;
      *count = 32;
      *mask = 0;
      return;
   }
   *start = ffs(*mask) - 1;
   /* The first clear bit above *start marks the end of the run. */
   *count = ffs(~(*mask >> *start)) - 1;
   *mask &= ~(((1u << *count) - 1) << *start);
}
/* 64-bit variant of u_bit_scan_consecutive_range().
 *
 * On return *start is the index of the lowest set bit, *count the length of
 * the run of set bits beginning there, and that run is cleared in *mask.
 *
 * *mask must be non-zero.  With an empty mask ffsll() returns 0, *start
 * would become -1 and the shifts below would use a negative count
 * (undefined behavior); the assert makes such a contract violation visible
 * in builds with assertions enabled.
 */
static inline void
u_bit_scan_consecutive_range64(uint64_t *mask, int *start, int *count)
{
   assert(*mask != 0);

   /* All 64 bits set: handled separately because the general path would
    * need a shift by 64, which is not valid. */
   if (*mask == ~0ull) {
      *start = 0;
      *count = 64;
      *mask = 0;
      return;
   }
   *start = ffsll(*mask) - 1;
   /* The first clear bit above *start marks the end of the run. */
   *count = ffsll(~(*mask >> *start)) - 1;
   *mask &= ~(((((uint64_t)1) << *count) - 1) << *start);
}
/**
 * Find last bit set in a word.  The least significant bit is 1.
 * Return 0 if no bits are set.
 * Essentially ffs() in the reverse direction.
 *
 * Uses __builtin_clz when HAVE___BUILTIN_CLZ is defined, _BitScanReverse
 * on MSVC, and a portable shift loop otherwise.  _M_ARM64 is listed with
 * the other MSVC targets so ARM64 builds use the intrinsic as well,
 * matching util_last_bit64().
 */
static inline unsigned
util_last_bit(unsigned u)
{
#if defined(HAVE___BUILTIN_CLZ)
   /* Zero is filtered out before the builtin is called. */
   return u == 0 ? 0 : 32 - __builtin_clz(u);
#elif defined(_MSC_VER) && (_M_IX86 || _M_ARM || _M_AMD64 || _M_ARM64 || _M_IA64)
   unsigned long index;
   if (_BitScanReverse(&index, u))
      return index + 1;
   else
      return 0;
#else
   /* Count how many right shifts it takes to empty the word. */
   unsigned r = 0;
   while (u) {
      r++;
      u >>= 1;
   }
   return r;
#endif
}
/**
 * One-based position of the most significant set bit of a 64-bit word;
 * 0 when no bit is set.
 * Essentially ffsll() in the reverse direction.
 */
static inline unsigned
util_last_bit64(uint64_t u)
{
#if defined(HAVE___BUILTIN_CLZLL)
   if (u == 0)
      return 0;
   return 64 - __builtin_clzll(u);
#elif defined(_MSC_VER) && (_M_AMD64 || _M_ARM64 || _M_IA64)
   unsigned long pos;
   return _BitScanReverse64(&pos, u) ? pos + 1 : 0;
#else
   /* Count how many right shifts it takes to empty the word. */
   unsigned shifts;
   for (shifts = 0; u != 0; u >>= 1)
      shifts++;
   return shifts;
#endif
}
/**
 * Find the last bit in a word that does not match the sign bit.  The least
 * significant bit is 1.
 * Return 0 if no such bit exists.
 */
static inline unsigned
util_last_bit_signed(int i)
{
   /* For a negative value the bits are inverted first, so in both cases
    * the search is for the highest set bit. */
   const unsigned bits = (i >= 0) ? (unsigned)i : ~(unsigned)i;
   return util_last_bit(bits);
}
/* Build a mask with `count` consecutive bits set, the lowest one at bit
 * `start`.  start + count must not exceed 32 (asserted).
 */
static inline unsigned
u_bit_consecutive(unsigned start, unsigned count)
{
   assert(start + count <= 32);
   /* count == 32 cannot go through (1u << count), hence the special case. */
   return (count == 32) ? ~0u : ((1u << count) - 1) << start;
}
/* 64-bit variant of u_bit_consecutive(): a mask with `count` consecutive
 * bits set, the lowest one at bit `start`.  start + count must not exceed
 * 64 (asserted).
 */
static inline uint64_t
u_bit_consecutive64(unsigned start, unsigned count)
{
   assert(start + count <= 64);
   /* count == 64 cannot go through a shift by count, hence the special
    * case. */
   return (count == 64) ? ~(uint64_t)0
                        : ((((uint64_t)1) << count) - 1) << start;
}
/**
 * Count the bits that are set in n.
 */
static inline unsigned
util_bitcount(unsigned n)
{
#if defined(HAVE___BUILTIN_POPCOUNT)
   return __builtin_popcount(n);
#else
   /* K&R classic bitcount: every pass removes the lowest set bit, so the
    * loop runs once per set bit rather than once per bit position below
    * the highest set bit.
    */
   unsigned total = 0;
   while (n != 0) {
      n &= n - 1;
      total++;
   }
   return total;
#endif
}

/**
 * Count the bits that are set in a 64-bit value.
 */
static inline unsigned
util_bitcount64(uint64_t n)
{
#if defined(HAVE___BUILTIN_POPCOUNTLL)
   return __builtin_popcountll(n);
#else
   /* Count the low and the high half with the unsigned helper. */
   const unsigned low_half = util_bitcount((unsigned)n);
   const unsigned high_half = util_bitcount((unsigned)(n >> 32));
   return low_half + high_half;
#endif
}
#ifdef __cplusplus
}
#endif
#endif /* BITSCAN_H */

View file

@ -1,261 +0,0 @@
/*
* Mesa 3-D graphics library
*
* Copyright (C) 2006 Brian Paul All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
/**
* \file bitset.h
* \brief Bitset of arbitrary size definitions.
* \author Michal Krol
*/
#ifndef BITSET_H
#define BITSET_H
//#include "util/bitscan.h"
//#include "util/macros.h"
/****************************************************************************
* generic bitset implementation
*/
/* Storage unit of a bitset, and the number of bits one unit holds
 * (computed as sizeof * 8, i.e. assuming 8-bit bytes).
 */
#define BITSET_WORD unsigned int
#define BITSET_WORDBITS (sizeof (BITSET_WORD) * 8)
/* bitset declarations
 *
 * BITSET_WORDS(bits) is the number of words needed to hold `bits` bits,
 * rounded up; BITSET_DECLARE declares an array `name` of that many words.
 */
#define BITSET_WORDS(bits) (((bits) + BITSET_WORDBITS - 1) / BITSET_WORDBITS)
#define BITSET_DECLARE(name, bits) BITSET_WORD name[BITSET_WORDS(bits)]
/* bitset operations
 *
 * These act on the whole bitset and size the operation with sizeof (x), so
 * x has to be the bitset array itself: with a pointer, sizeof would yield
 * the pointer size rather than the size of the bitset.
 */
#define BITSET_COPY(x, y) memcpy( (x), (y), sizeof (x) )
#define BITSET_EQUAL(x, y) (memcmp( (x), (y), sizeof (x) ) == 0)
#define BITSET_ZERO(x) memset( (x), 0, sizeof (x) )
#define BITSET_ONES(x) memset( (x), 0xff, sizeof (x) )
/* BITSET_BITWORD(b) is the index of the word that contains bit b;
 * BITSET_BIT(b) is the mask selecting bit b within that word.
 */
#define BITSET_BITWORD(b) ((b) / BITSET_WORDBITS)
#define BITSET_BIT(b) (1u << ((b) % BITSET_WORDBITS))
/* single bit operations
 *
 * Test, set or clear bit b of bitset x.
 */
#define BITSET_TEST(x, b) (((x)[BITSET_BITWORD(b)] & BITSET_BIT(b)) != 0)
#define BITSET_SET(x, b) ((x)[BITSET_BITWORD(b)] |= BITSET_BIT(b))
#define BITSET_CLEAR(x, b) ((x)[BITSET_BITWORD(b)] &= ~BITSET_BIT(b))
/* BITSET_MASK(b) is a word mask of the bits below position b % WORDBITS;
 * when b is a multiple of the word size it is all ones instead of empty.
 * BITSET_RANGE(b, e) is the word mask covering bits b through e inclusive;
 * it is only meaningful when b and e fall into the same word.
 */
#define BITSET_MASK(b) (((b) % BITSET_WORDBITS == 0) ? ~0 : BITSET_BIT(b) - 1)
#define BITSET_RANGE(b, e) ((BITSET_MASK((e) + 1)) & ~(BITSET_BIT(b) - 1))
/* bit range operations
 *
 * Test, set or clear bits b through e (inclusive) of bitset x.  All three
 * only support a range that lies within a single word: when b and e map to
 * different words the macro hits a failing assert and evaluates to 0
 * without touching the bitset.
 */
#define BITSET_TEST_RANGE(x, b, e) \
(BITSET_BITWORD(b) == BITSET_BITWORD(e) ? \
(((x)[BITSET_BITWORD(b)] & BITSET_RANGE(b, e)) != 0) : \
(assert (!"BITSET_TEST_RANGE: bit range crosses word boundary"), 0))
#define BITSET_SET_RANGE(x, b, e) \
(BITSET_BITWORD(b) == BITSET_BITWORD(e) ? \
((x)[BITSET_BITWORD(b)] |= BITSET_RANGE(b, e)) : \
(assert (!"BITSET_SET_RANGE: bit range crosses word boundary"), 0))
#define BITSET_CLEAR_RANGE(x, b, e) \
(BITSET_BITWORD(b) == BITSET_BITWORD(e) ? \
((x)[BITSET_BITWORD(b)] &= ~BITSET_RANGE(b, e)) : \
(assert (!"BITSET_CLEAR_RANGE: bit range crosses word boundary"), 0))
/* Get first bit set in a bitset.
 *
 * Walks the n words of x from index 0 upward.  For the first non-zero word
 * it returns that word's ffs() result plus the bit offset of the word; when
 * every word is zero it returns 0.
 */
static inline int
__bitset_ffs(const BITSET_WORD *x, int n)
{
   for (int word = 0; word < n; word++) {
      const BITSET_WORD bits = x[word];
      if (bits != 0)
         return ffs(bits) + BITSET_WORDBITS * word;
   }
   return 0;
}

/* Convenience form for a bitset that is a real array: the word count is
 * taken from ARRAY_SIZE(x). */
#define BITSET_FFS(x) __bitset_ffs(x, ARRAY_SIZE(x))
/* Iteration step behind BITSET_FOREACH_SET.
 *
 * Returns the index of the next set bit, or `size` once the words of the
 * set are exhausted.  *tmp carries the not-yet-reported bits of the word
 * currently being scanned; each reported bit is cleared from it.
 */
static inline unsigned
__bitset_next_set(unsigned i, BITSET_WORD *tmp,
                  const BITSET_WORD *set, unsigned size)
{
   /* NOTE: this function relies on very specific initial conditions.  When
    * the loop starts, *tmp has to hold *set and i has to be 0.  If the
    * first word has a bit set, the i-value is ignored and that bit is
    * simply taken (so 0 is fine, even though 0 may also be returned).  If
    * the first word is 0, `word` starts at 0 and the scan moves on to the
    * second word.
    */
   unsigned word = BITSET_BITWORD(i);

   /* Skip exhausted words, loading the next one into *tmp each time. */
   for (; *tmp == 0; *tmp = set[word]) {
      if (++word >= BITSET_WORDS(size))
         return size;
   }

   /* Lowest set bit of the non-zero word ... */
   const unsigned bit = ffs(*tmp) - 1;
   /* ... which is removed so it is not reported again. */
   *tmp &= ~(1ull << bit);
   return word * BITSET_WORDBITS + bit;
}
/**
 * Iterates over each set bit in a set
 *
 * The outer for statement exists only to declare the scratch word __tmp
 * (seeded with the first word of the set): __foo is non-NULL on entry and
 * set to NULL afterwards, so its body runs exactly once.  The inner for
 * statement does the actual iteration by calling __bitset_next_set() until
 * it returns a value that is not below __size.
 *
 * @param __i iteration variable, bit number
 * @param __set the bitset to iterate (will not be modified)
 * @param __size number of bits in the set to consider
 */
#define BITSET_FOREACH_SET(__i, __set, __size) \
for (BITSET_WORD __tmp = *(__set), *__foo = &__tmp; __foo != NULL; __foo = NULL) \
for (__i = 0; \
(__i = __bitset_next_set(__i, &__tmp, __set, __size)) < __size;)
#ifdef __cplusplus
/**
 * Simple C++ wrapper of a bitset type of static size, with value semantics
 * and basic bitwise arithmetic operators. The operators defined below are
 * expected to have the same semantics as the same operator applied to other
 * fundamental integer types. T is the name of the struct to instantiate
 * it as, and N is the number of bits in the bitset.
 *
 * What the generated struct provides:
 *  - conversion to bool: true when any word is non-zero.  The conversion is
 *    prefixed with EXPLICIT_CONVERSION, which is defined outside this file;
 *  - assignment from and comparison with int: the int initializes the first
 *    word only, the remaining words of the temporary are zero-initialized
 *    by the aggregate initializer;
 *  - ==, != between two bitsets (BITSET_EQUAL over all words);
 *  - ~, |=, |, &=, & applied word by word;
 *  - test(i), set(i), clear(i) forwarding to BITSET_TEST / BITSET_SET /
 *    BITSET_CLEAR; set() and clear() return *this.
 *
 * The storage is the public member words[BITSET_WORDS(N)].
 */
#define DECLARE_BITSET_T(T, N) struct T { \
EXPLICIT_CONVERSION \
operator bool() const \
{ \
for (unsigned i = 0; i < BITSET_WORDS(N); i++) \
if (words[i]) \
return true; \
return false; \
} \
\
T & \
operator=(int x) \
{ \
const T c = {{ (BITSET_WORD)x }}; \
return *this = c; \
} \
\
friend bool \
operator==(const T &b, const T &c) \
{ \
return BITSET_EQUAL(b.words, c.words); \
} \
\
friend bool \
operator!=(const T &b, const T &c) \
{ \
return !(b == c); \
} \
\
friend bool \
operator==(const T &b, int x) \
{ \
const T c = {{ (BITSET_WORD)x }}; \
return b == c; \
} \
\
friend bool \
operator!=(const T &b, int x) \
{ \
return !(b == x); \
} \
\
friend T \
operator~(const T &b) \
{ \
T c; \
for (unsigned i = 0; i < BITSET_WORDS(N); i++) \
c.words[i] = ~b.words[i]; \
return c; \
} \
\
T & \
operator|=(const T &b) \
{ \
for (unsigned i = 0; i < BITSET_WORDS(N); i++) \
words[i] |= b.words[i]; \
return *this; \
} \
\
friend T \
operator|(const T &b, const T &c) \
{ \
T d = b; \
d |= c; \
return d; \
} \
\
T & \
operator&=(const T &b) \
{ \
for (unsigned i = 0; i < BITSET_WORDS(N); i++) \
words[i] &= b.words[i]; \
return *this; \
} \
\
friend T \
operator&(const T &b, const T &c) \
{ \
T d = b; \
d &= c; \
return d; \
} \
\
bool \
test(unsigned i) const \
{ \
return BITSET_TEST(words, i); \
} \
\
T & \
set(unsigned i) \
{ \
BITSET_SET(words, i); \
return *this; \
} \
\
T & \
clear(unsigned i) \
{ \
BITSET_CLEAR(words, i); \
return *this; \
} \
\
BITSET_WORD words[BITSET_WORDS(N)]; \
}
#endif
#endif

View file

@ -1,262 +0,0 @@
/**************************************************************************
*
* Copyright 2006 VMware, Inc., Bismarck, ND. USA.
* All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sub license, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
* THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM,
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
* USE OR OTHER DEALINGS IN THE SOFTWARE.
*
* The above copyright notice and this permission notice (including the
* next paragraph) shall be included in all copies or substantial portions
* of the Software.
*
**************************************************************************/
/**
* \file
* List macros heavily inspired by the Linux kernel
* list handling. No list looping yet.
*
* Is not threadsafe, so common operations need to
* be protected using an external mutex.
*/
#ifndef _UTIL_LIST_H_
#define _UTIL_LIST_H_
#include <stdbool.h>
#include <stddef.h>
#include <assert.h>
/* Assertion helper for the list code.
 *
 * With DEBUG defined it asserts cond; msg is a string literal that is
 * and-ed in only so it shows up in the failed-assertion text.  Both macro
 * arguments are parenthesized so that a condition containing lower
 * precedence operators (||, ?:, =) is not regrouped with the "&& msg".
 *
 * Without DEBUG the condition is never evaluated: it sits behind "0 &&",
 * which keeps it compiled (and its operands "used") without running it.
 */
#ifdef DEBUG
# define list_assert(cond, msg) assert((cond) && (msg))
#else
# define list_assert(cond, msg) (void)(0 && (cond))
#endif
/* Link of a circular doubly-linked list.  The same structure serves as the
 * list head: an empty head points at itself in both directions.
 */
struct list_head
{
   struct list_head *prev;
   struct list_head *next;
};

/* Make item an empty list (both links point back at item). */
static inline void list_inithead(struct list_head *item)
{
   item->next = item;
   item->prev = item;
}

/* Insert item directly after list, i.e. at the front when list is a head. */
static inline void list_add(struct list_head *item, struct list_head *list)
{
   struct list_head *const after = list->next;

   item->next = after;
   item->prev = list;
   after->prev = item;
   list->next = item;
}

/* Insert item directly before list, i.e. at the back when list is a head. */
static inline void list_addtail(struct list_head *item, struct list_head *list)
{
   struct list_head *const before = list->prev;

   item->prev = before;
   item->next = list;
   before->next = item;
   list->prev = item;
}

/* True when the head has no elements (it links to itself). */
static inline bool list_is_empty(const struct list_head *list)
{
   return list == list->next;
}

/* Let `to` take over the place of `from`: the neighbours of `from` are
 * re-pointed at `to`.  When `from` is empty, `to` becomes an empty list.
 * `from` itself is left untouched.
 */
static inline void list_replace(struct list_head *from, struct list_head *to)
{
   if (!list_is_empty(from)) {
      to->next = from->next;
      to->prev = from->prev;
      from->next->prev = to;
      from->prev->next = to;
   } else {
      list_inithead(to);
   }
}

/* Unlink item from its list and clear both of its links to NULL. */
static inline void list_del(struct list_head *item)
{
   struct list_head *const before = item->prev;
   struct list_head *const after = item->next;

   before->next = after;
   after->prev = before;
   item->next = NULL;
   item->prev = NULL;
}

/* Unlink item from its list and leave it as an empty list of its own. */
static inline void list_delinit(struct list_head *item)
{
   struct list_head *const before = item->prev;
   struct list_head *const after = item->next;

   before->next = after;
   after->prev = before;
   item->prev = item;
   item->next = item;
}

/**
 * Returns whether the list has exactly one element.
 */
static inline bool list_is_singular(const struct list_head *list)
{
   const struct list_head *const first = list->next;

   if (first == NULL || first == list)
      return false;
   return first->next == list;
}

/* Count the elements by walking the list once from head to head. */
static inline unsigned list_length(const struct list_head *list)
{
   unsigned count = 0;
   const struct list_head *cursor = list->next;

   while (cursor != list) {
      count++;
      cursor = cursor->next;
   }
   return count;
}

/* Link all elements of src in right after dst.  An empty src is a no-op.
 * The links of the src head itself are not updated.
 */
static inline void list_splice(struct list_head *src, struct list_head *dst)
{
   if (!list_is_empty(src)) {
      src->next->prev = dst;
      src->prev->next = dst->next;
      dst->next->prev = src->prev;
      dst->next = src->next;
   }
}

/* Link all elements of src in right before dst (at the tail when dst is a
 * head).  An empty src is a no-op.  The links of the src head itself are
 * not updated.
 */
static inline void list_splicetail(struct list_head *src, struct list_head *dst)
{
   if (!list_is_empty(src)) {
      src->prev->next = dst;
      src->next->prev = dst->prev;
      dst->prev->next = src->next;
      dst->prev = src->prev;
   }
}

/* Assert that the forward and backward links of the head and of every
 * element agree with each other.
 */
static inline void list_validate(const struct list_head *list)
{
   const struct list_head *cursor;

   assert(list->next->prev == list && list->prev->next == list);
   cursor = list->next;
   while (cursor != list) {
      assert(cursor->next->prev == cursor && cursor->prev->next == cursor);
      cursor = cursor->next;
   }
}
/* Convert a pointer to the list_head member __field embedded in a __type
 * back into a pointer to the containing __type, by subtracting the member
 * offset obtained with offsetof().
 */
#define LIST_ENTRY(__type, __item, __field) \
((__type *)(((char *)(__item)) - offsetof(__type, __field)))
/**
 * Cast from a pointer to a member of a struct back to the containing struct.
 *
 * Unlike LIST_ENTRY this takes no type name: the member offset is computed
 * from the address of `sample`, a pointer of the containing struct type,
 * and the result is a void pointer.
 *
 * 'sample' MUST be initialized, or else the result is undefined!
 */
#ifndef container_of
#define container_of(ptr, sample, member) \
(void *)((char *)(ptr) \
- ((char *)&(sample)->member - (char *)(sample)))
#endif
/* Containing struct of the element right after / right before the head
 * `ptr`.  Neither macro checks for an empty list; on an empty list the
 * result is computed from the head itself.
 */
#define list_first_entry(ptr, type, member) \
LIST_ENTRY(type, (ptr)->next, member)
#define list_last_entry(ptr, type, member) \
LIST_ENTRY(type, (ptr)->prev, member)
/* Iterators that use a caller-declared cursor.
 *
 * `pos` (and `storage` in the _SAFE forms) must already be declared as
 * pointers to the containing struct; `member` is the name of the list_head
 * member inside it and `head` the list head.  Each loop first sets pos to
 * NULL and then derives the member offset from pos through container_of().
 * Iteration ends when the cursor's member is the head itself.
 *
 * - LIST_FOR_EACH_ENTRY: forward walk over the whole list.
 * - LIST_FOR_EACH_ENTRY_SAFE / _SAFE_REV: forward / backward walk that
 *   fetches the following element into `storage` before the body runs and
 *   continues from `storage`.
 * - LIST_FOR_EACH_ENTRY_FROM / _FROM_REV: forward / backward walk that
 *   begins at the list_head `start` instead of at the head's neighbour.
 */
#define LIST_FOR_EACH_ENTRY(pos, head, member) \
for (pos = NULL, pos = container_of((head)->next, pos, member); \
&pos->member != (head); \
pos = container_of(pos->member.next, pos, member))
#define LIST_FOR_EACH_ENTRY_SAFE(pos, storage, head, member) \
for (pos = NULL, pos = container_of((head)->next, pos, member), \
storage = container_of(pos->member.next, pos, member); \
&pos->member != (head); \
pos = storage, storage = container_of(storage->member.next, storage, member))
#define LIST_FOR_EACH_ENTRY_SAFE_REV(pos, storage, head, member) \
for (pos = NULL, pos = container_of((head)->prev, pos, member), \
storage = container_of(pos->member.prev, pos, member); \
&pos->member != (head); \
pos = storage, storage = container_of(storage->member.prev, storage, member))
#define LIST_FOR_EACH_ENTRY_FROM(pos, start, head, member) \
for (pos = NULL, pos = container_of((start), pos, member); \
&pos->member != (head); \
pos = container_of(pos->member.next, pos, member))
#define LIST_FOR_EACH_ENTRY_FROM_REV(pos, start, head, member) \
for (pos = NULL, pos = container_of((start), pos, member); \
&pos->member != (head); \
pos = container_of(pos->member.prev, pos, member))
/* Iterators that declare their own cursor.
 *
 * Each macro declares `type *pos` in the for-init clause, so `pos` only
 * exists inside the loop.  `member` is the name of the list_head member
 * inside `type`, `head` is the list head.  Iteration ends when the cursor's
 * member is the head itself.
 *
 * - list_for_each_entry / list_for_each_entry_rev: forward / backward walk.
 *   The following element is also tracked in a hidden __next / __prev and
 *   compared against the freshly advanced pos through list_assert() with
 *   the message "use _safe iterator".
 * - list_for_each_entry_safe / list_for_each_entry_safe_rev: forward /
 *   backward walk that advances from the pre-fetched __next / __prev rather
 *   than from pos.
 * - list_for_each_entry_from / _from_rev: forward / backward walk that
 *   begins at the list_head `start`.
 * - list_for_each_entry_from_safe: forward walk from `start` that advances
 *   from the pre-fetched __next.
 * - list_pair_for_each_entry: walks two lists in lockstep and stops as soon
 *   as either one reaches its head.
 */
#define list_for_each_entry(type, pos, head, member) \
for (type *pos = LIST_ENTRY(type, (head)->next, member), \
*__next = LIST_ENTRY(type, pos->member.next, member); \
&pos->member != (head); \
pos = LIST_ENTRY(type, pos->member.next, member), \
list_assert(pos == __next, "use _safe iterator"), \
__next = LIST_ENTRY(type, __next->member.next, member))
#define list_for_each_entry_safe(type, pos, head, member) \
for (type *pos = LIST_ENTRY(type, (head)->next, member), \
*__next = LIST_ENTRY(type, pos->member.next, member); \
&pos->member != (head); \
pos = __next, \
__next = LIST_ENTRY(type, __next->member.next, member))
#define list_for_each_entry_rev(type, pos, head, member) \
for (type *pos = LIST_ENTRY(type, (head)->prev, member), \
*__prev = LIST_ENTRY(type, pos->member.prev, member); \
&pos->member != (head); \
pos = LIST_ENTRY(type, pos->member.prev, member), \
list_assert(pos == __prev, "use _safe iterator"), \
__prev = LIST_ENTRY(type, __prev->member.prev, member))
#define list_for_each_entry_safe_rev(type, pos, head, member) \
for (type *pos = LIST_ENTRY(type, (head)->prev, member), \
*__prev = LIST_ENTRY(type, pos->member.prev, member); \
&pos->member != (head); \
pos = __prev, \
__prev = LIST_ENTRY(type, __prev->member.prev, member))
#define list_for_each_entry_from(type, pos, start, head, member) \
for (type *pos = LIST_ENTRY(type, (start), member); \
&pos->member != (head); \
pos = LIST_ENTRY(type, pos->member.next, member))
#define list_for_each_entry_from_safe(type, pos, start, head, member) \
for (type *pos = LIST_ENTRY(type, (start), member), \
*__next = LIST_ENTRY(type, pos->member.next, member); \
&pos->member != (head); \
pos = __next, \
__next = LIST_ENTRY(type, __next->member.next, member))
#define list_for_each_entry_from_rev(type, pos, start, head, member) \
for (type *pos = LIST_ENTRY(type, (start), member); \
&pos->member != (head); \
pos = LIST_ENTRY(type, pos->member.prev, member))
#define list_pair_for_each_entry(type, pos1, pos2, head1, head2, member) \
for (type *pos1 = LIST_ENTRY(type, (head1)->next, member), \
*pos2 = LIST_ENTRY(type, (head2)->next, member); \
&pos1->member != (head1) && &pos2->member != (head2); \
pos1 = LIST_ENTRY(type, pos1->member.next, member), \
pos2 = LIST_ENTRY(type, pos2->member.next, member))
#endif /*_UTIL_LIST_H_*/

View file

@ -1,346 +0,0 @@
/*
* Copyright © 2014 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#ifndef UTIL_MACROS_H
#define UTIL_MACROS_H
#include <assert.h>
/* Element count of an array.  x has to be an actual array: the count is
 * derived from sizeof(x).  An ARRAY_SIZE that is already defined is kept.
 */
#if !defined(ARRAY_SIZE)
# define ARRAY_SIZE(x) (sizeof(x) / sizeof(*(x)))
#endif
/* For compatibility with Clang's __has_builtin()
 *
 * When the compiler does not provide __has_builtin, define it so that every
 * query evaluates to 0.  This lets the #if checks further down use
 * __has_builtin(...) unconditionally.
 */
#ifndef __has_builtin
# define __has_builtin(x) 0
#endif
/**
 * __builtin_expect macros
 *
 * Without HAVE___BUILTIN_EXPECT, __builtin_expect(x, y) is replaced by a
 * macro that simply yields x.  likely()/unlikely() are only defined here if
 * nobody defined likely before: with the builtin they normalize x to 0/1
 * and pass the expected value, without it they expand to x unchanged.
 */
#if !defined(HAVE___BUILTIN_EXPECT)
# define __builtin_expect(x, y) (x)
#endif
#if !defined(likely)
# if defined(HAVE___BUILTIN_EXPECT)
# define likely(x) __builtin_expect(!!(x), 1)
# define unlikely(x) __builtin_expect(!!(x), 0)
# else
# define likely(x) (x)
# define unlikely(x) (x)
# endif
#endif
/**
 * Static (compile-time) assertion.
 * COND selects the dimension of a char array type: 1 when COND holds and
 * -1 when it is false/zero, and an array of size -1 is a compilation error.
 * Only sizeof is applied to the type, so nothing is emitted at run time.
 */
#define STATIC_ASSERT(COND) \
   do { \
      (void) sizeof(char [(COND) ? 1 : -1]); \
   } while (0)
/**
 * Unreachable macro. Useful for suppressing "control reaches end of non-void
 * function" warnings.
 *
 * str is meant to be a string literal describing why the spot cannot be
 * reached; assert(!str) therefore always fails when assertions are enabled.
 * Depending on what is available, the assert is followed by
 * __builtin_unreachable() or, on MSVC, by __assume(0); without either the
 * macro is just the assert.
 */
#if defined(HAVE___BUILTIN_UNREACHABLE) || __has_builtin(__builtin_unreachable)
#define unreachable(str) \
do { \
assert(!str); \
__builtin_unreachable(); \
} while (0)
#elif defined (_MSC_VER)
#define unreachable(str) \
do { \
assert(!str); \
__assume(0); \
} while (0)
#else
#define unreachable(str) assert(!str)
#endif
/**
 * Assume macro. Useful for expressing our assumptions to the compiler,
 * typically for purposes of silencing warnings.
 *
 * Variants, in order of preference:
 *  - __builtin_assume available: assert(expr), then __builtin_assume(expr);
 *  - HAVE___BUILTIN_UNREACHABLE: a false expr hits a failing assert and
 *    then __builtin_unreachable();
 *  - MSVC: expr is handed to __assume() only, without an assert;
 *  - otherwise: a plain assert(expr).
 *
 * Note that expr appears more than once in the first variant.
 */
#if __has_builtin(__builtin_assume)
#define assume(expr) \
do { \
assert(expr); \
__builtin_assume(expr); \
} while (0)
#elif defined HAVE___BUILTIN_UNREACHABLE
#define assume(expr) ((expr) ? ((void) 0) \
: (assert(!"assumption failed"), \
__builtin_unreachable()))
#elif defined (_MSC_VER)
#define assume(expr) __assume(expr)
#else
#define assume(expr) assert(expr)
#endif
/* Attribute const is used for functions that have no effects other than their
* return value, and only rely on the argument values to compute the return
* value. As a result, calls to it can be CSEed. Note that using memory
* pointed to by the arguments is not allowed for const functions.
*/
#ifdef HAVE_FUNC_ATTRIBUTE_CONST
#define ATTRIBUTE_CONST __attribute__((__const__))
#else
#define ATTRIBUTE_CONST
#endif
#ifdef HAVE_FUNC_ATTRIBUTE_FLATTEN
#define FLATTEN __attribute__((__flatten__))
#else
#define FLATTEN
#endif
#ifdef HAVE_FUNC_ATTRIBUTE_FORMAT
#define PRINTFLIKE(f, a) __attribute__ ((format(__printf__, f, a)))
#else
#define PRINTFLIKE(f, a)
#endif
#ifdef HAVE_FUNC_ATTRIBUTE_MALLOC
#define MALLOCLIKE __attribute__((__malloc__))
#else
#define MALLOCLIKE
#endif
/* Forced function inlining */
/* Note: Clang also sets __GNUC__ (see other cases below) */
/* The outer #ifndef keeps any ALWAYS_INLINE definition made before this point. */
#ifndef ALWAYS_INLINE
# if defined(__GNUC__)
# define ALWAYS_INLINE inline __attribute__((always_inline))
# elif defined(_MSC_VER)
# define ALWAYS_INLINE __forceinline
# else
/* Unknown compiler: fall back to a plain `inline` request. */
# define ALWAYS_INLINE inline
# endif
#endif
/* Used to optionally mark structures with misaligned elements or size as
 * packed, to trade off performance for space.
 */
#ifdef HAVE_FUNC_ATTRIBUTE_PACKED
#define PACKED __attribute__((__packed__))
#else
#define PACKED
#endif
/* Attribute pure is used for functions that have no effects other than their
 * return value. As a result, calls to it can be dead code eliminated.
 */
#ifdef HAVE_FUNC_ATTRIBUTE_PURE
#define ATTRIBUTE_PURE __attribute__((__pure__))
#else
#define ATTRIBUTE_PURE
#endif
/* ATTRIBUTE_RETURNS_NONNULL applies the `returns_nonnull` function attribute
 * when supported; otherwise it expands to nothing.
 */
#ifdef HAVE_FUNC_ATTRIBUTE_RETURNS_NONNULL
#define ATTRIBUTE_RETURNS_NONNULL __attribute__((__returns_nonnull__))
#else
#define ATTRIBUTE_RETURNS_NONNULL
#endif
/* NORETURN marks functions that never return to their caller. MSVC is checked
 * first and uses __declspec; an existing NORETURN definition is kept as-is.
 */
#ifndef NORETURN
# ifdef _MSC_VER
# define NORETURN __declspec(noreturn)
# elif defined HAVE_FUNC_ATTRIBUTE_NORETURN
# define NORETURN __attribute__((__noreturn__))
# else
# define NORETURN
# endif
#endif
#ifdef __cplusplus
/**
 * Macro function that evaluates to true if T is a trivially
 * destructible type -- that is, if its (non-virtual) destructor
 * performs no action and all member variables and base classes are
 * trivially destructible themselves.
 *
 * Only defined when compiling as C++.
 */
/* Clang: use the intrinsic only if the feature test reports it. */
# if (defined(__clang__) && defined(__has_feature))
# if __has_feature(has_trivial_destructor)
# define HAS_TRIVIAL_DESTRUCTOR(T) __has_trivial_destructor(T)
# endif
/* GCC: the intrinsic is used from version 4.3 onwards. */
# elif defined(__GNUC__)
# if ((__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ >= 3)))
# define HAS_TRIVIAL_DESTRUCTOR(T) __has_trivial_destructor(T)
# endif
/* MSVC proper (the Intel compiler is excluded here). */
# elif defined(_MSC_VER) && !defined(__INTEL_COMPILER)
# define HAS_TRIVIAL_DESTRUCTOR(T) __has_trivial_destructor(T)
# endif
# ifndef HAS_TRIVIAL_DESTRUCTOR
/* It's always safe (if inefficient) to assume that a
 * destructor is non-trivial.
 */
# define HAS_TRIVIAL_DESTRUCTOR(T) (false)
# endif
#endif
/**
 * PUBLIC/USED macros
 *
 * If we build the library with gcc's -fvisibility=hidden flag, we'll
 * use the PUBLIC macro to mark functions that are to be exported.
 *
 * We also need to define a USED attribute, so the optimizer doesn't
 * inline a static function that we later use in an alias. - ajax
 *
 * Both macros sit under the same `#ifndef PUBLIC` guard: if PUBLIC is already
 * defined when this point is reached, USED is not defined here either.
 */
#ifndef PUBLIC
# if defined(__GNUC__)
# define PUBLIC __attribute__((visibility("default")))
# define USED __attribute__((used))
# elif defined(_MSC_VER)
/* MSVC: export via dllexport; USED expands to nothing. */
# define PUBLIC __declspec(dllexport)
# define USED
# else
/* Unknown compiler: both macros expand to nothing. */
# define PUBLIC
# define USED
# endif
#endif
/**
 * UNUSED marks variables (or sometimes functions) that have to be defined,
 * but are sometimes (or always) unused beyond that. A common case is for
 * a function parameter to be used in some build configurations but not others.
 * Another case is fallback vfuncs that don't do anything with their params.
 *
 * Note that this should not be used for identifiers used in `assert()`;
 * see ASSERTED below.
 */
#ifdef HAVE_FUNC_ATTRIBUTE_UNUSED
#define UNUSED __attribute__((unused))
#else
#define UNUSED
#endif
/**
 * Use ASSERTED to indicate that an identifier is unused outside of an `assert()`,
 * so that assert-free builds don't get "unused variable" warnings.
 *
 * With NDEBUG defined it is the same as UNUSED; otherwise it expands to
 * nothing, so a genuinely unused identifier still produces a warning.
 */
#ifdef NDEBUG
#define ASSERTED UNUSED
#else
#define ASSERTED
#endif
/* MUST_CHECK applies the `warn_unused_result` function attribute when
 * supported, so callers that ignore the return value get a warning.
 */
#ifdef HAVE_FUNC_ATTRIBUTE_WARN_UNUSED_RESULT
#define MUST_CHECK __attribute__((warn_unused_result))
#else
#define MUST_CHECK
#endif
/* ATTRIBUTE_NOINLINE forbids inlining on compilers that define __GNUC__; on
 * every other compiler it expands to nothing.
 */
#if defined(__GNUC__)
#define ATTRIBUTE_NOINLINE __attribute__((noinline))
#else
#define ATTRIBUTE_NOINLINE
#endif
/**
 * Check that STRUCT::FIELD can hold MAXVAL. We use a lot of bitfields
 * in Mesa/gallium. We have to be sure they're of sufficient size to
 * hold the largest expected value.
 * Note that with MSVC, enums are signed and enum bitfields need one extra
 * high bit (always zero) to ensure the max value is handled correctly.
 * This macro will detect that with MSVC, but not GCC.
 *
 * The check is a runtime assert(): a scratch STRUCT is declared, MAXVAL is
 * stored into FIELD and read back through an (int) cast. The scratch variable
 * is marked ASSERTED because it is only used by the assertion.
 */
#define ASSERT_BITFIELD_SIZE(STRUCT, FIELD, MAXVAL) \
do { \
ASSERTED STRUCT s; \
s.FIELD = (MAXVAL); \
assert((int) s.FIELD == (MAXVAL) && "Insufficient bitfield size!"); \
} while (0)
/* NOTE: all macros in this group expand their arguments more than once, so
 * arguments with side effects (e.g. i++) must not be passed.
 */
/** Compute ceiling of integer quotient of A divided by B. */
#define DIV_ROUND_UP( A, B ) ( ((A) + (B) - 1) / (B) )
/** Clamp X to [MIN,MAX]. Turn NaN into MIN, arbitrarily. */
#define CLAMP( X, MIN, MAX ) ( (X)>(MIN) ? ((X)>(MAX) ? (MAX) : (X)) : (MIN) )
/** Minimum of two values: */
#define MIN2( A, B ) ( (A)<(B) ? (A) : (B) )
/** Maximum of two values: */
#define MAX2( A, B ) ( (A)>(B) ? (A) : (B) )
/** Minimum and maximum of three values: */
#define MIN3( A, B, C ) ((A) < (B) ? MIN2(A, C) : MIN2(B, C))
#define MAX3( A, B, C ) ((A) > (B) ? MAX2(A, C) : MAX2(B, C))
/** Align a value up to a multiple of pot_align, which must be a power of two
 * (the mask arithmetic is only correct in that case).
 */
#define ALIGN_POT(x, pot_align) (((x) + (pot_align) - 1) & ~((pot_align) - 1))
/**
 * Macro for declaring an explicit conversion operator. Defaults to an
 * implicit conversion if C++11 is not supported.
 *
 * The macro is left undefined when compiling as C (neither branch applies).
 */
#if __cplusplus >= 201103L
#define EXPLICIT_CONVERSION explicit
#elif defined(__cplusplus)
#define EXPLICIT_CONVERSION
#endif
/** 32-bit value with only bit b set (b must be in 0..31; the shift is on 1u). */
#define BITFIELD_BIT(b) (1u << (b))
/** 32-bit mask of all bits below bit b, i.e. bits 0..b-1. b == 32 is handled
 * explicitly and yields all ones; the `% 32` keeps the unselected shift in
 * range for that case.
 */
#define BITFIELD_MASK(b) \
((b) == 32 ? (~0u) : BITFIELD_BIT((b) % 32) - 1)
/** 32-bit mask of `count` bits starting at bit b, i.e. bits b..b+count-1. */
#define BITFIELD_RANGE(b, count) \
(BITFIELD_MASK((b) + (count)) & ~BITFIELD_MASK(b))
/** 64-bit value with only bit b set (b must be in 0..63). */
#define BITFIELD64_BIT(b) (1ull << (b))
/** 64-bit mask of all bits below bit b; b == 64 yields all ones. */
#define BITFIELD64_MASK(b) \
((b) == 64 ? (~0ull) : BITFIELD64_BIT(b) - 1)
/** 64-bit mask of `count` bits starting at bit b, i.e. bits b..b+count-1. */
#define BITFIELD64_RANGE(b, count) \
(BITFIELD64_MASK((b) + (count)) & ~BITFIELD64_MASK(b))
/* TODO: In future we should try to move this to u_debug.h once header
 * dependencies are reorganised to allow this.
 */
/* Categories of debug message. Numbering starts at 1 and increases by one per
 * entry, so 0 is not a valid type.
 */
enum pipe_debug_type
{
PIPE_DEBUG_TYPE_OUT_OF_MEMORY = 1,
PIPE_DEBUG_TYPE_ERROR,
PIPE_DEBUG_TYPE_SHADER_INFO,
PIPE_DEBUG_TYPE_PERF_INFO,
PIPE_DEBUG_TYPE_INFO,
PIPE_DEBUG_TYPE_FALLBACK,
PIPE_DEBUG_TYPE_CONFORMANCE,
};
#endif /* UTIL_MACROS_H */

View file

@ -1,120 +0,0 @@
#!/usr/bin/env python3
import os, ctypes, ctypes.util, io, mmap, pathlib
from tinygrad import Tensor, dtypes, Device
from tinygrad.helpers import Timing, from_mv
# Handle to the C library; prototypes are declared below so ctypes converts
# pointers and sizes correctly on 64-bit platforms.
libc = ctypes.CDLL(ctypes.util.find_library("c"))
#from extra.hip_gpu_driver import hip_ioctl
# sudo su -c "echo 3 > /proc/sys/vm/drop_caches"
# sudo su -c 'echo 8 > /proc/sys/kernel/printk'
# sudo su -c "echo 'module amdgpu +p' > /sys/kernel/debug/dynamic_debug/control"
libc.memcpy.argtypes = [ctypes.c_void_p, ctypes.c_void_p, ctypes.c_size_t]
libc.read.argtypes = [ctypes.c_int, ctypes.c_void_p, ctypes.c_size_t]
# read(2) returns ssize_t: -1 signals an error. Declaring it unsigned would turn
# that -1 into a huge positive byte count.
libc.read.restype = ctypes.c_ssize_t
libc.malloc.argtypes = [ctypes.c_size_t]
libc.malloc.restype = ctypes.c_void_p
# Benchmark: time the creation of an anonymous, pre-populated shared mapping of
# `sz` bytes, then time one libc read() of `sz` bytes from `fd` into it.
# The "GB/s" label divides bytes by the value Timing passes to the callback;
# presumably that value is nanoseconds -- confirm against Timing.
def read_direct(fd, sz):
with Timing("mmap: ", lambda x: f", {sz/x:.2f} GB/s"):
buf = mmap.mmap(-1, sz, flags=mmap.MAP_SHARED|mmap.MAP_POPULATE)
with Timing("read: ", lambda x: f", {sz/x:.2f} GB/s"):
ret = libc.read(fd, from_mv(buf), sz)
# a short read (or an error return) aborts the benchmark
assert ret == sz
# Benchmark: time mapping `sz` bytes of `fd` (shared, pre-populated), then read
# one byte every 0x1000 bytes of the mapping.
def read_mmap(fd, sz):
with Timing("mmfd: ", lambda x: f", {sz/x:.2f} GB/s"):
buf = mmap.mmap(fd, sz, flags=mmap.MAP_SHARED|mmap.MAP_POPULATE) #|MAP_LOCKED)
# `t` only accumulates the touched bytes; it is not used afterwards
t = 0
for i in range(0, sz, 0x1000): t += buf[i]
# def _copyin_async(self, dest:T, src:T, size:int): check(hip.hipMemcpyAsync(dest, src, size, hip.hipMemcpyHostToDevice, None))
# Benchmark: map `sz` bytes of `fd`, queue an async copy of the mapping into
# `gpubuf`, and wait for the device. Uses the module-level `dev`, which is
# assigned in the __main__ section.
def read_to_gpu_mmap(fd, sz, gpubuf):
with Timing("gpu copyin: ", lambda x: f", {sz/x:.2f} GB/s"):
with Timing("mmfd: ", lambda x: f", {sz/x:.2f} GB/s"):
buf = mmap.mmap(fd, sz, flags=mmap.MAP_SHARED|mmap.MAP_POPULATE) #|MAP_LOCKED)
dev.allocator._copyin_async(gpubuf, from_mv(buf), sz)
dev.synchronize()
# Benchmark: rewind `fd`, allocate one host buffer of `sz` bytes from the
# device allocator, read the whole file into it with libc read(), then copy it
# to `gpubuf` and synchronize. Each phase is timed separately and in total.
def read_to_gpu_single(fd, sz, gpubuf):
os.lseek(fd, 0, os.SEEK_SET)
with Timing("total: ", lambda x: f", {sz/x:.2f} GB/s"):
with Timing("gpu host alloc: ", lambda x: f", {sz/x:.2f} GB/s"):
hst = dev.allocator._hostalloc(sz)
with Timing("read to host: ", lambda x: f", {sz/x:.2f} GB/s"):
# NOTE(review): the read() result is not checked here, unlike read_direct
ret = libc.read(fd, hst, sz)
with Timing("gpu host copy: ", lambda x: f", {sz/x:.2f} GB/s"):
dev.allocator._copyin_async(gpubuf, hst, sz)
dev.synchronize()
# Benchmark: stream the file through two host buffers of `psz` bytes each,
# alternating between them so a read into one buffer can overlap with the
# queued copy from the other.
# NOTE(review): every copy passes `gpubuf` itself as the destination (no
# offset), and the loop runs sz//(psz*2) times, so any tail smaller than
# 2*psz is not read. The read() results are not checked.
def read_to_gpu_pingpong(fd, sz, gpubuf):
psz = 256*1024*1024
print(f"piece size {psz/(1024*1024):.2f} MB")
with Timing("gpu host alloc: ", lambda x: f", {sz/x:.2f} GB/s"):
hst1 = dev.allocator._hostalloc(psz)
hst2 = dev.allocator._hostalloc(psz)
os.lseek(fd, 0, os.SEEK_SET)
with Timing("total: ", lambda x: f", {sz/x:.2f} GB/s"):
for i in range(sz//(psz*2)):
with Timing("tfer(0): ", lambda x: f", {psz/x:.2f} GB/s"):
ret = libc.read(fd, hst1, psz)
# wait for previously queued work before queuing the copy from hst1
dev.synchronize()
dev.allocator._copyin_async(gpubuf, hst1, psz)
with Timing("tfer(1): ", lambda x: f", {psz/x:.2f} GB/s"):
ret = libc.read(fd, hst2, psz)
dev.synchronize()
dev.allocator._copyin_async(gpubuf, hst2, psz)
dev.synchronize()
# mmap flag values kept for experiments; MAP_LOCKED only appears in
# commented-out flag expressions in this file and MAP_HUGETLB is not referenced.
MAP_LOCKED = 0x2000
MAP_HUGETLB = 0x40000
# Script entry point. The first section times a disk -> default-device tensor
# copy and then calls exit(0).
# NOTE(review): that exit(0) and the later one before "read direct" appear to
# leave the code after them unreachable -- confirm against the original indentation.
if __name__ == "__main__":
dev = Device[Device.DEFAULT]
# run one small kernel first so device start-up is not part of the timing
warm = (Tensor.ones(1024, device=Device.DEFAULT).contiguous() + Tensor.ones(1024, device=Device.DEFAULT).contiguous()).realize()
#fn = "/home/tiny/tinygrad/weights/rng"
fn = pathlib.Path(__file__).parents[1] / "weights/LLaMA-2/70B/consolidated.00.pth"
sz = os.stat(fn).st_size
# the whole file exposed as a uint8 tensor on the disk device
t = Tensor.empty(sz, dtype=dtypes.uint8, device=f"disk:{fn}")
with Timing("copy: ", lambda x: f", {sz/x:.2f} GB/s"):
on_dev = t.to(Device.DEFAULT).realize()
exit(0)
# 4GB of random numbers
#fd = os.open("/home/tiny/tinygrad/weights/rng", os.O_RDWR|os.O_DIRECT)
#sz = os.fstat(fd).st_size // 4
fd = os.open("/home/tiny/tinygrad/weights/LLaMA/7B/consolidated.00.pth", os.O_RDWR|os.O_DIRECT)
sz = os.fstat(fd).st_size
print(f"read {sz} from {fd}")
with Timing("gpu alloc: ", lambda x: f", {sz/x:.2f} GB/s"):
gpubuf = dev.allocator._alloc(sz)
# warmup
dev.allocator._copyin_async(gpubuf, from_mv(bytearray(b"\x00\x00\x00\x00"*0x1000)), 0x4000)
print("copying, is warm")
print("****** read to gpu pingpong")
read_to_gpu_pingpong(fd, sz, gpubuf)
exit(0)
print("****** read direct")
read_direct(fd, sz)
print("****** read mmap")
read_mmap(fd, sz)
print("****** read to gpu single")
read_to_gpu_single(fd, sz, gpubuf)
print("****** read to gpu mmap")
read_to_gpu_mmap(fd, sz, gpubuf)
os._exit(0)

View file

@ -1,21 +0,0 @@
import sys, sqlite3, pickle
from tinygrad.helpers import CACHEDB
if __name__ == "__main__":
  # Summarize a cache database: for every table print its row count, then a
  # preview of its first 10 rows. The database path is argv[1] if given,
  # otherwise CACHEDB.
  fn = sys.argv[1] if len(sys.argv) > 1 else CACHEDB
  conn = sqlite3.connect(fn)
  try:
    cur = conn.cursor()
    cur.execute("SELECT name FROM sqlite_master WHERE type='table'")
    for (table,) in cur.fetchall():
      # Identifiers cannot be bound as SQL parameters, so quote the table name
      # (doubling embedded quotes) instead of interpolating it raw.
      quoted = '"' + table.replace('"', '""') + '"'
      cur2 = conn.cursor()
      cur2.execute(f"SELECT COUNT(*) FROM {quoted}")
      cnt = cur2.fetchone()[0]
      print(f"{table:20s} : {cnt}")
      cur3 = conn.cursor()
      cur3.execute(f"SELECT * FROM {quoted} LIMIT 10")
      for row in cur3.fetchall():
        # NOTE(review): pickle.loads can execute arbitrary code; only point this
        # script at cache databases you trust.
        v = pickle.loads(row[-1])
        # first column: print its length when it is a string, else its value;
        # last column: the unpickled value, truncated to 50 characters
        print(" ", len(row[0]) if isinstance(row[0], str) else row[0], row[1:-1], str(v)[0:50])
        #print(f"{len(k):10d}, {sk} -> {v}")
  finally:
    # release the database handle even if a row fails to unpickle
    conn.close()

View file

@ -36,7 +36,7 @@ if __name__ == "__main__":
for _ in range(run_count): tc = (a@b).realize()
GlobalCounters.reset()
ei = ExecItem(runner, [a.uop.buffer, b.uop.buffer, c.uop.buffer])
ei = ExecItem(ast, [a.uop.buffer, b.uop.buffer, c.uop.buffer], prg=runner)
with Context(DEBUG=2):
for _ in range(run_count): ei.run(wait=True)
print(f"custom {(c-tc).square().mean().item()}")

View file

@ -1,3 +1,4 @@
import numpy as np
from tinygrad import Tensor, Device, Context, GlobalCounters, dtypes
from tinygrad.uop.ops import UOp, KernelInfo, sint, AxisType
from tinygrad.engine.realize import ExecItem, get_runner
@ -140,15 +141,14 @@ def hand_spec_kernel3():
return sink.sink(arg=KernelInfo(opts_to_apply=())).simplify()
def test_matmul(sink:UOp, N=N):
with Context(DEBUG=0):
a = Tensor.randn(N, N)
b = Tensor.randn(N, N)
hc = Tensor.empty(N, N)
Tensor.realize(a, b, hc)
rng = np.random.default_rng()
a = Tensor(rng.random((N, N), dtype=np.float32)-0.5)
b = Tensor(rng.random((N, N), dtype=np.float32)-0.5)
hc = Tensor.empty(N, N)
Tensor.realize(a, b, hc)
ei = ExecItem(get_runner(Device.DEFAULT, sink), [t.uop.buffer for t in [hc, a, b]])
ei = ExecItem(sink, [t.uop.buffer for t in [hc, a, b]], prg=get_runner(Device.DEFAULT, sink))
GlobalCounters.reset()
ets = []
with Context(DEBUG=2):
for _ in range(run_count):

View file

@ -1,27 +0,0 @@
#!/usr/bin/env python3
import time
import jax
import jax.numpy as jnp
# Multi-device half-precision matmul benchmark: each device multiplies its own
# (BS, N, N) batch by a shared (N, N) operand, accumulating in float32.
print(jax.devices())
DEVICES = len(jax.devices())
BS, N = 32, 4096
dtype = jnp.float16

# Build the operands, then place one shard of each on every device.
A = jnp.zeros((DEVICES, BS, N, N), dtype)
B = jnp.zeros((1, 1, N, N), dtype)
A = jax.device_put_sharded([A[dev_idx] for dev_idx in range(DEVICES)], jax.devices())
B = jax.device_put_sharded([B] * DEVICES, jax.devices())

# 2*N^3 floating point operations per matrix product, for every batch entry on every device
OPS = DEVICES*BS*N*N*N*2

def matmul(A, B):
  """Per-device product with a float32 result type."""
  return jnp.matmul(A, B, preferred_element_type=jnp.float32)

pmatmul = jax.pmap(matmul)

MAX_TFLOPS = 123*DEVICES # Peak FP16 Tensor TFLOPS with FP32 Acc (7900XTX)
for i in range(10):
  st = time.perf_counter()
  # block_until_ready makes the timing cover the actual computation
  C = pmatmul(A, B).block_until_ready()
  et = time.perf_counter() - st
  tflops = OPS*1e-12/et
  print(f"time {et*1e3:.2f} ms, TFLOPS {tflops:6.2f}, MFU {(tflops/MAX_TFLOPS)*100:4.2f}% out shape {C.shape} dtype {C.dtype}")

View file

@ -1,10 +0,0 @@
import mlx.core as mx
from tinygrad.helpers import Timing
# Time ten N x N matrix products with mlx and print the achieved rate.
N = 4096
x = mx.random.normal((N, N))
w = mx.random.normal((N, N))

# 2*N^3 floating point operations per product
FLOPS = N*N*N*2

def _rate_suffix(elapsed):
  """Text appended to each timing line; `elapsed` is the value Timing passes to its callback."""
  return f" {FLOPS/elapsed:.2f} GFLOPS"

for i in range(10):
  with Timing("", _rate_suffix):
    # mx.eval forces the product to be computed inside the timed region
    mx.eval(x@w)

View file

@ -3,7 +3,6 @@ from tinygrad import dtypes, Tensor
from tinygrad.helpers import getenv, get_single_element
from tinygrad.dtype import _to_np_dtype
from tinygrad.codegen.opt import OptOps
from tinygrad.engine.realize import lower_schedule
dtype_in = (dtypes.half if getenv("HALF") else dtypes.bfloat16 if getenv("BFLOAT16") else
dtypes.fp8e4m3 if getenv("FP8E4M3") else dtypes.fp8e5m2 if getenv("FP8E5M2") else dtypes.float)
@ -40,8 +39,8 @@ if __name__ == "__main__":
if getenv("SHOULD_USE_TC"):
sched = a.matmul(b, dtype=acc_dtype).schedule()
lowered = list(lower_schedule(sched))
ei = get_single_element(lowered)[1]
ei = get_single_element(sched)
ei.lower()
assert any(opt.op is OptOps.TC for opt in ei.prg.p.applied_opts), f"TC not triggered, {ei.prg.p.applied_opts}"
ref = a.numpy().astype(np.float32) @ b.numpy().astype(np.float32)

View file

@ -1,33 +0,0 @@
import time
import tensorflow as tf
# TensorFlow matmul benchmark: for float16 and float32 and a range of square
# sizes, report the best of 20 timed matmul calls.
gpus = tf.config.list_physical_devices('GPU')
if gpus:
try:
# Currently, memory growth needs to be the same across GPUs
for gpu in gpus:
tf.config.experimental.set_memory_growth(gpu, True)
logical_gpus = tf.config.list_logical_devices('GPU')
print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
except RuntimeError as e:
# Memory growth must be set before GPUs have been initialized
print(e)
for dtype in [tf.float16, tf.float32]:
for N in [256, 512, 1024, 2048, 4096, 8192]:
# 2*N^3 floating point operations per N x N x N matmul
FLOPS = N*N*N*2
b = tf.random.uniform((N, N), dtype=dtype)
c = tf.random.uniform((N, N), dtype=dtype)
b = tf.Variable(b)
c = tf.Variable(c)
# returns the wall-clock seconds for one matmul plus the numerics check
def tf_prog(b, c):
st = time.perf_counter()
a = tf.matmul(b, c)
tf.debugging.check_numerics(a, "Nan or Inf in result") # Ensures that the calculation is done.
return time.perf_counter() - st
# keep the fastest of 20 runs
tm = min([tf_prog(b, c) for _ in range(20)])
print(f"{N*N:10d} {tm*1e6:9.2f} us, would be {FLOPS*1e-9/tm:9.2f} GFLOPS {N:4d}x{N:4d}x{N:4d} matmul in {dtype}")

View file

@ -33,5 +33,5 @@ if __name__ == "__main__":
new_src = prg.src
# can mod source here
prg = replace(prg, src=new_src)
ei = ExecItem(CompiledRunner(prg), [x.ensure_allocated() for x in si.bufs], si.metadata)
ei = ExecItem(si.ast, [x.ensure_allocated() for x in si.bufs], si.metadata, prg=CompiledRunner(prg))
for i in range(5): ei.run(wait=True)

View file

@ -88,7 +88,7 @@ if __name__ == "__main__":
prg = ProgramSpec("matmul_kernel", src, device=Device.DEFAULT,
global_size=[M//BLOCK_SIZE_M, N//BLOCK_SIZE_N, 1], local_size=[32*compiled.metadata.num_warps, 1, 1],
mem_estimate=A.nbytes() + B.nbytes() + C.nbytes())
ei = ExecItem(CompiledRunner(prg), [x.ensure_allocated() for x in si.bufs], si.metadata)
ei = ExecItem(si.ast, [x.ensure_allocated() for x in si.bufs], si.metadata, prg=CompiledRunner(prg))
tflops = []
for i in range(5):
tm = ei.run(wait=True)

1
extra/hevc/.gitignore vendored Normal file
View file

@ -0,0 +1 @@
out/

83
extra/hevc/decode.py Normal file
View file

@ -0,0 +1,83 @@
import argparse, os, hashlib
from tinygrad.helpers import getenv, DEBUG, round_up, Timing, tqdm, fetch
from extra.hevc.hevc import parse_hevc_file_headers, untile_nv12, to_bgr, nv_gpu
from tinygrad import Tensor, dtypes, Device, Variable, TinyJit
# Script entry point: decode an HEVC file on the "NV" device, then either
# validate the frames against a pickled reference (VALIDATE=1) or write them
# out as PNG files.
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--input_file", type=str, default="")
parser.add_argument("--output_dir", type=str, default="extra/hevc/out")
args = parser.parse_args()
# no --input_file: download a sample clip; otherwise load the file through the disk device
if args.input_file == "":
url = "https://github.com/haraschax/filedump/raw/09a497959f7fa6fd8dba501a25f2cdb3a41ecb12/comma_video.hevc"
hevc_tensor = Tensor.from_url(url, device="CPU")
else:
hevc_tensor = Tensor.empty(os.stat(args.input_file).st_size, dtype=dtypes.uint8, device=f"disk:{args.input_file}").to("CPU")
dat = bytes(hevc_tensor.data())
# the md5 of the input selects the matching reference frames in VALIDATE mode
dat_hash = hashlib.md5(dat).hexdigest()
with Timing("prep infos: "):
# NOTE(review): dat_nv is not referenced again in this script
dat_nv = hevc_tensor.to("NV")
opaque, frame_info, w, h, luma_w, luma_h, chroma_off = parse_hevc_file_headers(dat)
# MAX_FRAMES optionally limits how many frames are decoded
frame_info = frame_info[:getenv("MAX_FRAMES", len(frame_info))]
# move all needed data to gpu
#all_slices = []
with Timing("copy to gpu: "):
opaque_nv = opaque.to("NV").contiguous().realize()
hevc_tensor = hevc_tensor.to("NV")
# output image shape: luma rows plus half as many rows again, by the luma width rounded up to 64
out_image_size = luma_h + (luma_h + 1) // 2, round_up(luma_w, 64)
# largest history size requested by any frame (4th field of each frame_info entry)
max_hist = max(history_sz for _, _, _, history_sz, _ in frame_info)
# define variables
v_pos = Variable("pos", 0, max_hist + 1)
v_offset = Variable("offset", 0, hevc_tensor.numel()-1)
v_sz = Variable("sz", 0, hevc_tensor.numel())
v_i = Variable("i", 0, len(frame_info)-1)
# jitted single-frame decode; per-frame values are passed as bound Variables
@TinyJit
def decode_jit(pos:Variable, src:Tensor, data:Tensor, *hist:Tensor):
return src.decode_hevc_frame(pos, out_image_size, data, hist).realize()
# warm up
history = [Tensor.empty(*out_image_size, dtype=dtypes.uint8, device="NV") for _ in range(max_hist)]
for i in range(3):
hevc_frame = hevc_tensor.shrink((((bound_offset:=v_offset.bind(frame_info[0][0])), bound_offset+v_sz.bind(frame_info[0][1])),))
decode_jit(v_pos.bind(0), hevc_frame, opaque_nv[v_i.bind(0)], *history)
out_images = []
with Timing("decoding whole file: ", on_exit=(lambda et: f", {len(frame_info)} frames, {len(frame_info)/(et/1e9):.2f} fps")):
for i, (offset, sz, frame_pos, history_sz, is_hist) in enumerate(frame_info):
# keep only the newest max_hist history frames
history = history[-max_hist:] if max_hist > 0 else []
# TODO: this shrink should work as a slice
hevc_frame = hevc_tensor.shrink((((bound_offset:=v_offset.bind(offset)), bound_offset+v_sz.bind(sz)),))
# clone the result before storing it; presumably because later jitted calls reuse the output buffer -- confirm
outimg = decode_jit(v_pos.bind(frame_pos), hevc_frame, opaque_nv[v_i.bind(i)], *history).clone()
out_images.append(outimg)
if is_hist: history.append(outimg)
Device.default.synchronize()
if getenv("VALIDATE", 0):
import pickle
# NOTE(review): pickle.load executes arbitrary code; the reference set is
# either downloaded from the URL below or read from a local file.
if dat_hash == "b813bfdbec194fd17fdf0e3ceb8cea1c":
url = "https://github.com/nimlgen/hevc_validate_set/raw/refs/heads/main/decoded_frames_b813bfdbec194fd17fdf0e3ceb8cea1c.pkl"
decoded_frames = pickle.load(fetch(url).open("rb"))
else: decoded_frames = pickle.load(open(f"extra/hevc/decoded_frames_{dat_hash}.pkl", "rb"))
# cv2 is only needed for writing PNGs
else: import cv2
for i, img in tqdm(enumerate(out_images)):
if getenv("VALIDATE", 0):
# frames with no (or an empty) reference entry are skipped
if i < len(decoded_frames) and len(decoded_frames[i]) > 0:
img = untile_nv12(img, h, w, luma_w, chroma_off).realize()
assert img.data() == decoded_frames[i], f"Frame {i} does not match reference decoder!"
print(f"Frame {i} matches reference decoder!")
else:
# an empty --output_dir disables writing
if len(args.output_dir):
os.makedirs(args.output_dir, exist_ok=True)
img = to_bgr(img, h, w, luma_w, chroma_off).realize()
cv2.imwrite(f"{args.output_dir}/out_frame_{i:04d}.png", img.numpy())

450
extra/hevc/hevc.py Normal file
View file

@ -0,0 +1,450 @@
import dataclasses, enum, argparse, os, itertools, time, ctypes
from typing import Any
from tinygrad import Tensor, dtypes, Device, TinyJit
from tinygrad.helpers import DEBUG, round_up, ceildiv, Timing, prod
from tinygrad.runtime.autogen import avcodec, nv_570 as nv_gpu
class BitReader:
  """MSB-first bit reader over a bytes object, with the Exp-Golomb decoders used by HEVC headers.

  State: `reader` iterates the input bytes; `bits`/`current_bits` hold the
  buffered, not yet consumed bits and their count; `read_bits` counts bits
  pulled from the input so far; `total` is the input size in bits.
  The consumed position is therefore `read_bits - current_bits`.
  """
  def __init__(self, data:bytes): self.reader, self.current_bits, self.bits, self.read_bits, self.total = iter(data), 0, 0, 0, len(data) * 8
  def empty(self):
    """True once every input byte has been fetched and no buffered bits remain."""
    return self.read_bits == self.total and self.current_bits == 0
  def peak_bits(self, n):
    """Return the next `n` bits as an integer without consuming them.

    Fetches whole bytes as needed; running past the end of the input raises
    StopIteration from the underlying iterator.
    """
    while self.current_bits < n:
      self.bits = (self.bits << 8) | next(self.reader)
      self.current_bits += 8
      self.read_bits += 8
    return (self.bits >> (self.current_bits - n)) & ((1 << n) - 1)
  def _next_bits(self, n):
    """Consume and return the next `n` bits."""
    val = self.peak_bits(n)
    # drop the consumed bits from the top of the buffer
    self.bits &= (1 << (self.current_bits - n)) - 1
    self.current_bits -= n
    return val

  def u(self, n):
    """u(n): unsigned integer using `n` bits."""
    return self._next_bits(n)

  # 9.2 Parsing process for 0-th order Exp-Golomb codes
  def ue_v(self):
    """ue(v): unsigned Exp-Golomb code (count leading zeros, then read that many suffix bits)."""
    leading_zero_bits = -1
    while True:
      bit = self.u(1)
      leading_zero_bits += 1
      if bit == 1: break
    part = self.u(leading_zero_bits)
    return (1 << leading_zero_bits) - 1 + part

  # 9.2.2 Mapping process for signed Exp-Golomb codes
  def se_v(self):
    """se(v): signed Exp-Golomb code; codeNum k maps to (-1)^(k+1) * Ceil(k / 2).

    So k = 0, 1, 2, 3, 4 decode to 0, 1, -1, 2, -2.
    """
    k = self.ue_v()
    # (-1) must be parenthesized: `-1 ** n` parses as -(1 ** n). (k + 1) // 2 is Ceil(k / 2).
    return ((-1) ** (k + 1)) * ((k + 1) // 2)
# 7.3.1.1 General NAL unit syntax
def _hevc_get_rbsp(dat:bytes, off=0) -> bytes:
rbsp = bytes()
while off < len(dat):
if off + 2 < len(dat) and dat[off:off+3] == b'\x00\x00\x03':
rbsp += bytes([0, 0])
off += 3
else:
rbsp += bytes([dat[off]])
off += 1
return rbsp
# Base class with the syntax-structure parsers shared by SPS, PPS and
# SliceSegment. Each method consumes bits from the given BitReader in order.
class HevcSlice:
# 7.3.3 Profile, tier and level syntax
# Only the no-sublayer case is supported: the first 88 bits are read and stored
# unparsed, followed by the 8-bit general_level_idc.
def profile_tier_level(self, r:BitReader, enable:bool, max_sub_layers:int):
assert enable and max_sub_layers == 0, "no sublayers supported"
self._notimpl_profile_tier_level = r.u(88)
self.general_level_idc = r.u(8)
# 7.3.7 Short-term reference picture set syntax
# Only num_negative_pics / num_positive_pics are stored on self; the other
# values are read into locals purely to advance the reader.
# NOTE(review): the inter-prediction branch reads sps.num_negative_pics, so it
# needs a non-None `sps`; SPS.__init__ in this file calls this method without one.
def st_ref_pic_set(self, r:BitReader, stRpsIdx:int, num_short_term_ref_pic_sets:int=0, sps=None):
# the prediction flag is only present for sets after the first
inter_ref_pic_set_prediction_flag = r.u(1) if stRpsIdx != 0 else 0
if inter_ref_pic_set_prediction_flag:
if stRpsIdx == num_short_term_ref_pic_sets:
delta_idx_minus1 = r.ue_v()
delta_rps_sign = r.u(1)
abs_delta_rps_minus1 = r.ue_v()
NumDeltaPocs = sps.num_negative_pics + sps.num_positive_pics
for i in range(NumDeltaPocs + 1):
used_by_curr_pic_flag = r.u(1)
if not used_by_curr_pic_flag:
use_delta_flag = r.u(1)
else:
self.num_negative_pics = r.ue_v()
self.num_positive_pics = r.ue_v()
for i in range(self.num_negative_pics):
delta_poc_s0_minus1 = r.ue_v()
used_by_curr_pic_s0_flag = r.u(1)
for i in range(self.num_positive_pics):
delta_poc_s1_minus1 = r.ue_v()
used_by_curr_pic_s1_flag = r.u(1)
# 7.3.2.2 Sequence parameter set RBSP syntax
# Parses a sequence parameter set from a BitReader, storing each syntax element
# as an attribute. Unsupported features (scaling list data, PCM, long-term
# reference pictures) fail an assertion. Parsing stops after
# strong_intra_smoothing_enabled_flag; anything later in the SPS is not read.
class SPS(HevcSlice):
def __init__(self, r:BitReader):
self.sps_video_parameter_set_id = r.u(4)
self.sps_max_sub_layers_minus1 = r.u(3)
self.sps_temporal_id_nesting_flag = r.u(1)
self.profile_tier_level(r, True, self.sps_max_sub_layers_minus1)
self.sps_seq_parameter_set_id = r.ue_v()
self.chroma_format_idc = r.ue_v()
# separate_colour_plane_flag is only present when chroma_format_idc == 3
self.separate_colour_plane_flag = r.u(1) if self.chroma_format_idc == 3 else 0
self.pic_width_in_luma_samples = r.ue_v()
self.pic_height_in_luma_samples = r.ue_v()
self.conformance_window_flag = r.u(1)
# without a conformance window all four offsets default to 0
if self.conformance_window_flag:
self.conf_win_left_offset = r.ue_v()
self.conf_win_right_offset = r.ue_v()
self.conf_win_top_offset = r.ue_v()
self.conf_win_bottom_offset = r.ue_v()
else: self.conf_win_left_offset = self.conf_win_right_offset = self.conf_win_top_offset = self.conf_win_bottom_offset = 0
# stored as actual bit depths: the coded values are offsets from 8
self.bit_depth_luma = r.ue_v() + 8
self.bit_depth_chroma = r.ue_v() + 8
self.log2_max_pic_order_cnt_lsb_minus4 = r.ue_v()
self.sps_sub_layer_ordering_info_present_flag = r.u(1)
self.sps_max_dec_pic_buffering, self.sps_max_num_reorder_pics, self.sps_max_latency_increase_plus1 = [], [], []
# one entry per signalled sub-layer; only the highest sub-layer when the present flag is 0
for i in range((0 if self.sps_sub_layer_ordering_info_present_flag else self.sps_max_sub_layers_minus1), self.sps_max_sub_layers_minus1 + 1):
self.sps_max_dec_pic_buffering.append(r.ue_v() + 1)
self.sps_max_num_reorder_pics.append(r.ue_v())
self.sps_max_latency_increase_plus1.append(r.ue_v())
# block sizes are stored as absolute log2 values (coded offset/delta already applied)
self.log2_min_luma_coding_block_size = r.ue_v() + 3
self.log2_max_luma_coding_block_size = self.log2_min_luma_coding_block_size + r.ue_v()
self.log2_min_transform_block_size = r.ue_v() + 2
self.log2_max_transform_block_size = self.log2_min_transform_block_size + r.ue_v()
self.max_transform_hierarchy_depth_inter = r.ue_v()
self.max_transform_hierarchy_depth_intra = r.ue_v()
if scaling_list_enabled_flag := r.u(1):
if sps_scaling_list_data_present_flag := r.u(1): assert False, "scaling_list_data parsing not implemented"
self.amp_enabled_flag = r.u(1)
self.sample_adaptive_offset_enabled_flag = r.u(1)
self.pcm_enabled_flag = r.u(1)
assert self.pcm_enabled_flag == 0, "pcm not implemented"
self.num_short_term_ref_pic_sets = r.ue_v()
# each set is parsed in turn; st_ref_pic_set overwrites num_negative_pics/num_positive_pics on self
for i in range(self.num_short_term_ref_pic_sets):
self.st_ref_pic_set(r, i, self.num_short_term_ref_pic_sets)
self.long_term_ref_pics_present_flag = r.u(1)
if self.long_term_ref_pics_present_flag: assert False, "long_term_ref_pics parsing not implemented"
self.sps_temporal_mvp_enabled_flag = r.u(1)
self.strong_intra_smoothing_enabled_flag = r.u(1)
# 7.3.2.3 Picture parameter set RBSP syntax
# Parses a picture parameter set from a BitReader, storing each syntax element
# as an attribute. Deblocking filter control and scaling list data are not
# supported and fail an assertion. Parsing stops after
# log2_parallel_merge_level.
class PPS(HevcSlice):
def __init__(self, r:BitReader):
self.pps_pic_parameter_set_id = r.ue_v()
self.pps_seq_parameter_set_id = r.ue_v()
self.dependent_slice_segments_enabled_flag = r.u(1)
self.output_flag_present_flag = r.u(1)
self.num_extra_slice_header_bits = r.u(3)
self.sign_data_hiding_enabled_flag = r.u(1)
self.cabac_init_present_flag = r.u(1)
# stored as actual counts: the coded values are "minus1"
self.num_ref_idx_l0_default_active = r.ue_v() + 1
self.num_ref_idx_l1_default_active = r.ue_v() + 1
# stored as the actual initial QP: the coded value is an offset from 26
self.init_qp = r.se_v() + 26
self.constrained_intra_pred_flag = r.u(1)
self.transform_skip_enabled_flag = r.u(1)
self.cu_qp_delta_enabled_flag = r.u(1)
# diff_cu_qp_delta_depth only exists as an attribute when the flag above is set
if self.cu_qp_delta_enabled_flag: self.diff_cu_qp_delta_depth = r.ue_v()
self.pps_cb_qp_offset = r.se_v()
self.pps_cr_qp_offset = r.se_v()
self.pps_slice_chroma_qp_offsets_present_flag = r.u(1)
self.weighted_pred_flag = r.u(1)
self.weighted_bipred_flag = r.u(1)
self.transquant_bypass_enabled_flag = r.u(1)
self.tiles_enabled_flag = r.u(1)
self.entropy_coding_sync_enabled_flag = r.u(1)
# tile layout is only present when tiles are enabled; explicit column widths
# and row heights are only read when spacing is not uniform
if self.tiles_enabled_flag:
self.num_tile_columns_minus1 = r.ue_v()
self.num_tile_rows_minus1 = r.ue_v()
self.uniform_spacing_flag = r.u(1)
self.column_width_minus1, self.row_height_minus1 = [], []
if not self.uniform_spacing_flag:
for i in range(self.num_tile_columns_minus1): self.column_width_minus1.append(r.ue_v())
for i in range(self.num_tile_rows_minus1): self.row_height_minus1.append(r.ue_v())
self.loop_filter_across_tiles_enabled_flag = r.u(1)
self.loop_filter_across_slices_enabled_flag = r.u(1)
self.deblocking_filter_control_present_flag = r.u(1)
if self.deblocking_filter_control_present_flag: assert False, "deblocking_filter parsing not implemented"
self.scaling_list_data_present_flag = r.u(1)
if self.scaling_list_data_present_flag: assert False, "scaling_list_data parsing not implemented"
self.lists_modification_present_flag = r.u(1)
# stored as the actual level: the coded value is "minus2"
self.log2_parallel_merge_level = r.ue_v() + 2
# 7.3.6 Slice segment header syntax
class SliceSegment(HevcSlice):
  """Parsed prefix of an HEVC slice segment header.

  Besides the syntax elements, records bit positions within the header
  (`sw_skip_start`/`sw_skip_end` and, when an in-slice short-term reference
  picture set is present, `short_term_ref_pics_in_slice_start`/`_end`).
  Positions are `read_bits - current_bits`, i.e. bits consumed so far.
  In-slice selection among several SPS reference picture sets and long-term
  reference pictures are not supported and fail an assertion.
  """
  def __init__(self, r:BitReader, nal_unit_type:int, sps:SPS, pps:PPS):
    self.first_slice_segment_in_pic_flag = r.u(1)
    # only NAL unit types in the BLA_W_LP..RSV_IRAP_VCL23 range carry this flag
    if avcodec.HEVC_NAL_BLA_W_LP <= nal_unit_type <= avcodec.HEVC_NAL_RSV_IRAP_VCL23:
      self.no_output_of_prior_pics_flag = r.u(1)
    self.slice_pic_parameter_set_id = r.ue_v()
    if not self.first_slice_segment_in_pic_flag:
      if pps.dependent_slice_segments_enabled_flag:
        self.dependent_slice_segment_flag = r.u(1)
      # NOTE(review): the HEVC syntax codes slice_segment_address as a fixed-length
      # field, not Exp-Golomb -- verify before relying on non-first slice segments.
      self.slice_segment_address = r.ue_v()
    # NOTE(review): this unconditionally overrides any flag read above, so every
    # segment is parsed as an independent slice segment (behavior kept as-is).
    self.dependent_slice_segment_flag = 0

    if not self.dependent_slice_segment_flag:
      r.u(pps.num_extra_slice_header_bits) # extra bits ignored
      self.slice_type = r.ue_v()

      self.sw_skip_start = r.read_bits - r.current_bits
      self.pic_output_flag = r.u(1) if pps.output_flag_present_flag else 0
      self.colour_plane_id = r.u(2) if sps.separate_colour_plane_flag else 0

      is_idr = nal_unit_type in (avcodec.HEVC_NAL_IDR_W_RADL, avcodec.HEVC_NAL_IDR_N_LP)
      if not is_idr:
        self.slice_pic_order_cnt_lsb = r.u(sps.log2_max_pic_order_cnt_lsb_minus4 + 4)
        self.short_term_ref_pic_set_sps_flag = r.u(1)
        if not self.short_term_ref_pic_set_sps_flag:
          # reference picture set coded in the slice header: remember where it starts and ends
          self.short_term_ref_pics_in_slice_start = r.read_bits - r.current_bits
          self.st_ref_pic_set(r, sps.num_short_term_ref_pic_sets, sps=sps)
          self.short_term_ref_pics_in_slice_end = r.read_bits - r.current_bits
        elif sps.num_short_term_ref_pic_sets > 1: assert False, "short_term_ref_pic_set parsing not implemented"
        if sps.long_term_ref_pics_present_flag: assert False, "long_term_ref_pics parsing not implemented"
        self.sw_skip_end = r.read_bits - r.current_bits
        self.slice_temporal_mvp_enabled_flag = r.u(1) if sps.sps_temporal_mvp_enabled_flag else 0
      else:
        # IDR pictures carry none of the fields above: POC LSB is 0 and the skip range is empty
        self.slice_pic_order_cnt_lsb, self.sw_skip_end = 0, self.sw_skip_start

      # the remaining elements are read only to advance the reader; they are not stored
      if sps.sample_adaptive_offset_enabled_flag:
        slice_sao_luma_flag = r.u(1)
        ChromaArrayType = sps.chroma_format_idc if sps.separate_colour_plane_flag == 0 else 0
        slice_sao_chroma_flag = r.u(1) if ChromaArrayType != 0 else 0

      # both P and B slices carry num_ref_idx_active_override_flag; only B slices have an l1 count
      if self.slice_type in {avcodec.HEVC_SLICE_P, avcodec.HEVC_SLICE_B}:
        if num_ref_idx_active_override_flag := r.u(1):
          num_ref_idx_l0_active_minus1 = r.ue_v()
          num_ref_idx_l1_active_minus1 = r.ue_v() if self.slice_type == avcodec.HEVC_SLICE_B else 0
def fill_sps_into_dev_context(device_ctx, sps:SPS):
  """Copy the SPS fields used by the decoder into `device_ctx`.

  Each listed attribute is read from `sps` and assigned to the attribute of
  the same name on `device_ctx`, in the order given. Returns None.
  """
  mirrored_fields = (
    "chroma_format_idc",
    "pic_width_in_luma_samples",
    "pic_height_in_luma_samples",
    "bit_depth_luma",
    "bit_depth_chroma",
    "log2_max_pic_order_cnt_lsb_minus4",
    "log2_min_luma_coding_block_size",
    "log2_max_luma_coding_block_size",
    "log2_min_transform_block_size",
    "log2_max_transform_block_size",
    "amp_enabled_flag",
    "pcm_enabled_flag",
    "sample_adaptive_offset_enabled_flag",
    "sps_temporal_mvp_enabled_flag",
    "strong_intra_smoothing_enabled_flag",
  )
  for field in mirrored_fields:
    setattr(device_ctx, field, getattr(sps, field))
def fill_pps_into_dev_context(device_ctx, pps:PPS):
  """Copy the PPS fields used by the decoder into `device_ctx`.

  Attributes are assigned under the same name, in the order listed. Two of
  them (`diff_cu_qp_delta_depth`, `loop_filter_across_tiles_enabled_flag`) may
  be absent on `pps` and then default to 0; all others are required. Returns None.
  """
  required = object()  # marker: no fallback, a missing attribute raises AttributeError
  plan = (
    ("sign_data_hiding_enabled_flag", required),
    ("cabac_init_present_flag", required),
    ("num_ref_idx_l0_default_active", required),
    ("num_ref_idx_l1_default_active", required),
    ("init_qp", required),
    ("cu_qp_delta_enabled_flag", required),
    ("diff_cu_qp_delta_depth", 0),
    ("pps_cb_qp_offset", required),
    ("pps_cr_qp_offset", required),
    ("pps_slice_chroma_qp_offsets_present_flag", required),
    ("weighted_pred_flag", required),
    ("weighted_bipred_flag", required),
    ("transquant_bypass_enabled_flag", required),
    ("tiles_enabled_flag", required),
    ("entropy_coding_sync_enabled_flag", required),
    ("loop_filter_across_slices_enabled_flag", required),
    ("deblocking_filter_control_present_flag", required),
    ("scaling_list_data_present_flag", required),
    ("lists_modification_present_flag", required),
    ("log2_parallel_merge_level", required),
    ("loop_filter_across_tiles_enabled_flag", 0),
  )
  for field, fallback in plan:
    if fallback is required: value = getattr(pps, field)
    else: value = getattr(pps, field, fallback)
    setattr(device_ctx, field, value)
def parse_hevc_file_headers(dat:bytes, device="NV"):
  """Walk a start-code-delimited HEVC byte stream and build one decoder context record per picture.

  SPS and PPS NAL units update a single running `nv_gpu.nvdec_hevc_pic_s` structure; slice NAL units
  (IDR_N_LP, IDR_W_RADL, TRAIL_R, TRAIL_N) are grouped into pictures, and each finished picture
  appends a fixed-size (0x300 byte) serialized context record. Other NAL unit types are ignored.

  Args:
    dat: the raw stream. Parsing starts at offset 1 and asserts a 3-byte 00 00 01 start code there
      (presumably the stream begins with a 4-byte 00 00 00 01 start code -- TODO confirm).
    device: device passed to the `Tensor` that holds the serialized context records.

  Returns:
    A tuple `(opaque, res, w, h, pic_width_in_luma_samples, pic_height_in_luma_samples, chroma_off)`:
      opaque: Tensor of all context records, reshaped to (number of pictures, 0x300).
      res: one tuple per picture: (offset of the picture's first NAL unit in `dat`, total length of
        its NAL units, curr_pic_idx, number of history entries when it was emitted, is_hist).
      w, h: luma size reduced by twice the SPS conformance window offsets.
      chroma_off: product of the luma width and height, each rounded up to a multiple of 64.

  NOTE(review): `sps`, `pps` are only bound once the corresponding NAL unit has been seen; a stream
  without them fails with a NameError rather than a descriptive error.
  """
  res = []
  nal_unit_start = 1
  # each entry is (picture index, slice_pic_order_cnt_lsb, None) for a picture kept as a reference
  history:list[tuple[int, int, int]] = []
  device_ctx = nv_gpu.nvdec_hevc_pic_s(gptimer_timeout_value=92720000, tileformat=1, sw_start_code_e=1, pattern_id=2)
  # (hdr, nal_unit_type), start offset, length for every slice NAL unit of the picture being collected
  nal_infos = []
  ctx_bytes = bytes()
  # every per-picture record is padded to this size
  align_ctx_bytes_size = 0x300
  def _flush_picture():
    """Serialize the picture described by `nal_infos` into `ctx_bytes`/`res`, then reset `nal_infos`."""
    nonlocal res, history, device_ctx, nal_infos, ctx_bytes, align_ctx_bytes_size
    if not len(nal_infos): return
    # the first slice of the picture supplies the header values used below
    hdr, nal_unit_type = nal_infos[0][0]
    assert all(nal_unit_type == x[0][1] for x in nal_infos), "all NAL units in a picture must be of the same type"
    # lowest index in 0..15 that no history entry is using (computed before an IDR clears the history)
    device_ctx.curr_pic_idx = next(i for i in range(16) if all(d[0] != i for d in history))
    # an IDR picture drops all previously kept reference pictures
    if nal_unit_type in {avcodec.HEVC_NAL_IDR_W_RADL, avcodec.HEVC_NAL_IDR_N_LP}:
      history = []
    device_ctx.num_ref_frames = len(history)
    device_ctx.IDR_picture_flag = int(nal_unit_type in {avcodec.HEVC_NAL_IDR_W_RADL, avcodec.HEVC_NAL_IDR_N_LP})
    device_ctx.RAP_picture_flag = int(nal_unit_type >= avcodec.HEVC_NAL_BLA_W_LP and nal_unit_type <= avcodec.HEVC_NAL_RSV_IRAP_VCL23)
    device_ctx.RefDiffPicOrderCnts=(ctypes.c_int16 * 16)()
    device_ctx.colMvBuffersize = (round_up(sps.pic_width_in_luma_samples, 64) * round_up(sps.pic_height_in_luma_samples, 64) // 16) // 256
    # both framestride entries are the luma width rounded up to 64
    device_ctx.framestride=(ctypes.c_uint32 * 2)(round_up(sps.pic_width_in_luma_samples, 64), round_up(sps.pic_width_in_luma_samples, 64))
    device_ctx.sw_hdr_skip_length = hdr.sw_skip_end - hdr.sw_skip_start
    # NOTE(review): the constant 9 is not explained in this function -- confirm against SliceSegment
    device_ctx.num_bits_short_term_ref_pics_in_slice = max(0, device_ctx.sw_hdr_skip_length - 9)
    device_ctx.stream_len = sum(x[2] for x in nal_infos)
    if pps.tiles_enabled_flag:
      device_ctx.num_tile_columns = pps.num_tile_columns_minus1 + 1
      device_ctx.num_tile_rows = pps.num_tile_rows_minus1 + 1
    device_ctx.num_short_term_ref_pic_sets = sps.num_short_term_ref_pic_sets
    # the offsets below are chained: each one starts where the previous region ends, and all are
    # stored shifted right by 8 bits
    luma_h_rounded = round_up(sps.pic_height_in_luma_samples, 64)
    device_ctx.HevcSaoBufferOffset = (608 * luma_h_rounded) >> 8
    device_ctx.HevcBsdCtrlOffset = ((device_ctx.HevcSaoBufferOffset<<8) + 4864 * luma_h_rounded) >> 8
    device_ctx.v1.hevc_main10_444_ext.HevcFltAboveOffset = ((device_ctx.HevcBsdCtrlOffset<<8) + 152 * luma_h_rounded) >> 8
    device_ctx.v1.hevc_main10_444_ext.HevcSaoAboveOffset = ((device_ctx.v1.hevc_main10_444_ext.HevcFltAboveOffset<<8) + 2000 * luma_h_rounded) >> 8
    device_ctx.v3.HevcSliceEdgeOffset = device_ctx.v1.hevc_main10_444_ext.HevcSaoAboveOffset
    # split references by whether their POC lsb is above (after) or not above (before) the current
    # one; each list is sorted by distance from the current picture
    before_list, after_list = [], []
    for pic_idx, poc, _ in history:
      device_ctx.RefDiffPicOrderCnts[pic_idx] = hdr.slice_pic_order_cnt_lsb - poc
      if hdr.slice_pic_order_cnt_lsb < poc: after_list.append((poc - hdr.slice_pic_order_cnt_lsb, pic_idx))
      else: before_list.append((hdr.slice_pic_order_cnt_lsb - poc, pic_idx))
    before_list.sort()
    after_list.sort()
    # list 0 puts "before" references first; list 1 (B slices only) puts "after" references first
    device_ctx.initreflistidxl0 = (ctypes.c_uint8 * 16)(*[idx for _,idx in before_list + after_list])
    if hdr.slice_type == avcodec.HEVC_SLICE_B: device_ctx.initreflistidxl1 = (ctypes.c_uint8 * 16)(*[idx for _,idx in after_list + before_list])
    # record layout: serialized struct, 12 extension bytes, zero padding up to 0x200, tile sizes,
    # then zero padding up to align_ctx_bytes_size
    locl_ctx_bytes = bytes(device_ctx)
    locl_ctx_bytes += b'\x00\x00\x00\x00\x00\x00\x00\x00\x10\x00\x00\x00' # blackwell extension
    locl_ctx_bytes += bytes(0x200 - len(locl_ctx_bytes)) # pad to 512 bytes
    pic_width_in_ctbs = ceildiv(sps.pic_width_in_luma_samples, (1 << sps.log2_max_luma_coding_block_size))
    pic_height_in_ctbs = ceildiv(sps.pic_height_in_luma_samples, (1 << sps.log2_max_luma_coding_block_size))
    # append tile sizes 0x200
    if pps.tiles_enabled_flag and pps.uniform_spacing_flag:
      assert device_ctx.num_tile_columns == 1 and device_ctx.num_tile_rows == 1, "not implemented: uniform spacing with multiple tiles"
      locl_ctx_bytes += pic_width_in_ctbs.to_bytes(2, "little") + pic_height_in_ctbs.to_bytes(2, "little")
    else:
      if pps.tiles_enabled_flag and not getattr(pps, 'uniform_spacing_flag', 0):
        # explicit sizes are given for all but the last column/row
        column_width = [cw_minus1 + 1 for cw_minus1 in pps.column_width_minus1[0:pps.num_tile_columns_minus1]]
        row_height = [rh_minus1 + 1 for rh_minus1 in pps.row_height_minus1[0:pps.num_tile_rows_minus1]]
      else:
        column_width = []
        row_height = []
      # the last column/row takes whatever is left of the picture (the whole picture without tiles)
      column_width.append(pic_width_in_ctbs - sum(column_width))
      row_height.append(pic_height_in_ctbs - sum(row_height))
      # one (width, height) pair of 16-bit little-endian values per column/row combination
      for c in column_width:
        for r in row_height: locl_ctx_bytes += c.to_bytes(2, "little") + r.to_bytes(2, "little")
    # NOTE(review): luma_size and chroma_size are not referenced again in this function
    luma_size = round_up(sps.pic_width_in_luma_samples, 64) * round_up(sps.pic_height_in_luma_samples, 64)
    chroma_size = round_up(sps.pic_width_in_luma_samples, 64) * round_up((sps.pic_height_in_luma_samples + 1) // 2, 64)
    # is_hist marks pictures that are added to the reference history below
    is_hist = nal_unit_type in {avcodec.HEVC_NAL_TRAIL_R, avcodec.HEVC_NAL_IDR_N_LP, avcodec.HEVC_NAL_IDR_W_RADL}
    res.append((nal_infos[0][1], device_ctx.stream_len, device_ctx.curr_pic_idx, len(history), is_hist))
    locl_ctx_bytes += (align_ctx_bytes_size - len(locl_ctx_bytes)) * b'\x00'
    ctx_bytes += locl_ctx_bytes
    if nal_unit_type in {avcodec.HEVC_NAL_TRAIL_R, avcodec.HEVC_NAL_IDR_N_LP, avcodec.HEVC_NAL_IDR_W_RADL}:
      history.append((device_ctx.curr_pic_idx, hdr.slice_pic_order_cnt_lsb, None))
    if len(history) >= sps.sps_max_dec_pic_buffering[0]:
      # remove the oldest poc
      history.pop(0)
    nal_infos = []
  # NOTE(review): cnt is not referenced again in this function
  cnt = 0
  while nal_unit_start < len(dat):
    assert dat[nal_unit_start:nal_unit_start+3] == b"\x00\x00\x01", "NAL unit start code not found"
    # the NAL unit runs until the next start code, or to the end of the data
    pos = dat.find(b"\x00\x00\x01", nal_unit_start + 3)
    nal_unit_len = (pos if pos != -1 else len(dat)) - nal_unit_start
    # 7.3.1.1 General NAL unit syntax
    nal_unit_type = (dat[nal_unit_start+3] >> 1) & 0x3F
    # payload starts after the 3-byte start code and the 2 bytes that follow it
    slice_dat = dat[nal_unit_start+5:nal_unit_start+nal_unit_len]
    if nal_unit_type == avcodec.HEVC_NAL_SPS:
      sps = SPS(BitReader(_hevc_get_rbsp(slice_dat)))
      fill_sps_into_dev_context(device_ctx, sps)
    elif nal_unit_type == avcodec.HEVC_NAL_PPS:
      pps = PPS(BitReader(_hevc_get_rbsp(slice_dat)))
      fill_pps_into_dev_context(device_ctx, pps)
    elif nal_unit_type in {avcodec.HEVC_NAL_IDR_N_LP, avcodec.HEVC_NAL_IDR_W_RADL, avcodec.HEVC_NAL_TRAIL_R, avcodec.HEVC_NAL_TRAIL_N}:
      # unlike SPS/PPS, the slice payload is handed to BitReader without going through _hevc_get_rbsp
      hdr = SliceSegment(BitReader(slice_dat), nal_unit_type, sps, pps)
      # a slice that starts a new picture first emits the picture collected so far
      if hdr.first_slice_segment_in_pic_flag == 1: _flush_picture()
      nal_infos.append(((hdr, nal_unit_type), nal_unit_start, nal_unit_len))
    nal_unit_start += nal_unit_len
  # emit the last picture
  _flush_picture()
  # NOTE(review): the factor 2 presumably converts conformance-window offsets from chroma to luma
  # units for 4:2:0 content -- confirm
  w = sps.pic_width_in_luma_samples - 2 * (sps.conf_win_left_offset + sps.conf_win_right_offset)
  h = sps.pic_height_in_luma_samples - 2 * (sps.conf_win_top_offset + sps.conf_win_bottom_offset)
  chroma_off = round_up(sps.pic_width_in_luma_samples, 64) * round_up(sps.pic_height_in_luma_samples, 64)
  opaque = Tensor(ctx_bytes, device=device).reshape(len(res), align_ctx_bytes_size)
  return opaque, res, w, h, sps.pic_width_in_luma_samples, sps.pic_height_in_luma_samples, chroma_off
def _addr_table(h, w, w_aligned):
  """Build a flat table of source offsets for an h x w plane stored in a GOB-tiled layout.

  The layout uses 64x8 GOBs stacked two high into blocks, with `w_aligned // 64` GOBs per row of
  blocks. Entry `y * w + x` of the returned 1-D uint32 tensor is the offset of element (y, x)
  inside the tiled buffer.
  """
  gob_w, gob_h, gobs_per_block = 64, 8, 2
  gob_size = gob_w * gob_h

  col = Tensor.arange(w, dtype=dtypes.uint32).reshape(1, w)
  row = Tensor.arange(h, dtype=dtypes.uint32).reshape(h, 1)

  # which GOB the element falls in, and where that GOB starts
  gob_row = row // gob_h
  block_row, row_in_block = gob_row // gobs_per_block, gob_row % gobs_per_block
  gobs_per_line = w_aligned // gob_w
  gob_start = ((block_row * gobs_per_line + col // gob_w) * gobs_per_block + row_in_block) * gob_size

  # bit interleaving of the in-GOB x/y coordinates
  in_x, in_y = col % gob_w, row % gob_h
  within_gob = (
    (in_x & 0x0F)
    | ((in_y & 0x03) << 4)
    | ((in_x & 0x10) << 2)
    | ((in_y & 0x04) << 5)
    | ((in_x & 0x20) << 3)
  )
  return (gob_start + within_gob).reshape(-1)
def nv12_to_bgr_from_planes(luma: Tensor, chroma: Tensor, h: int, w: int) -> Tensor:
  """Convert separate NV12 planes into an (h, w, 3) uint8 image with channels ordered B, G, R.

  `luma` is reshaped to (h, w); `chroma` is reshaped to (h // 2, w // 2, 2) with U in channel 0 and
  V in channel 1, and each chroma sample is repeated over a 2x2 block of pixels. The result is
  clamped to [0, 255] before the cast to uint8.
  """
  half_h, half_w = h // 2, w // 2

  def _upsample(plane):
    # repeat every sample over a 2x2 block
    return plane.reshape(half_h, 1, half_w, 1).expand(half_h, 2, half_w, 2).reshape(h, w)

  def _clamp(channel):
    return channel.maximum(0.0).minimum(255.0)

  uv = chroma.reshape(half_h, half_w, 2).cast(dtypes.float32)
  y_off = luma.reshape(h, w).cast(dtypes.float32) - 16.0
  u_off = _upsample(uv[..., 0]) - 128.0
  v_off = _upsample(uv[..., 1]) - 128.0

  red = _clamp(1.1643835616438356 * y_off + 1.5960267857142858 * v_off)
  green = _clamp(1.1643835616438356 * y_off - 0.39176229009491365 * u_off - 0.8129676472377708 * v_off)
  blue = _clamp(1.1643835616438356 * y_off + 2.017232142857143 * u_off)
  return Tensor.stack([blue, green, red], dim=2).cast(dtypes.uint8)
def untile_nv12(src:Tensor, h:int, w:int, luma_w:int, chroma_off:int) -> Tensor:
  """Gather the luma and chroma planes out of a tiled NV12 buffer and return them concatenated.

  The luma plane (h rows) is read from the start of the flattened `src`; the chroma plane
  ((h + 1) // 2 rows) is read starting at `chroma_off`. Both use an address table built for a
  row width of `luma_w` rounded up to 64. The result is realized before returning.
  """
  flat = src.reshape(-1)
  aligned_w = round_up(luma_w, 64)
  luma_plane = flat[_addr_table(h, w, aligned_w)]
  chroma_plane = flat[chroma_off:][_addr_table((h + 1) // 2, w, aligned_w)]
  return luma_plane.cat(chroma_plane).realize()
def to_bgr(tensor:Tensor, h:int, w:int, luma_w:int, chroma_off:int) -> Tensor:
  """Untile an NV12 buffer and convert it to a realized BGR image via `nv12_to_bgr_from_planes`.

  Plane extraction matches `untile_nv12`: luma from the start of the flattened buffer, chroma from
  `chroma_off`, both addressed with a row width of `luma_w` rounded up to 64.
  """
  flat = tensor.reshape(-1)
  aligned_w = round_up(luma_w, 64)
  luma_plane = flat[_addr_table(h, w, aligned_w)]
  chroma_plane = flat[chroma_off:][_addr_table((h + 1) // 2, w, aligned_w)]
  bgr = nv12_to_bgr_from_planes(luma_plane, chroma_plane, h, w)
  return bgr.realize()

View file

@ -1,12 +0,0 @@
import ctypes
import tinygrad.runtime.autogen.hip as hip
from tinygrad.runtime.ops_hip import check
from tinygrad.helpers import init_c_var
# Small multi-device script: an event created on device 0 is waited on from device 1 before it is
# recorded on device 0. Every HIP return code goes through check().
if __name__ == "__main__":
  # create the event while device 0 is the current device
  check(hip.hipSetDevice(0))
  evt = init_c_var(hip.hipEvent_t(), lambda x: check(hip.hipEventCreate(ctypes.byref(x))))
  # with device 1 current, queue a wait on the event (stream argument None -- presumably the
  # default stream; confirm against the HIP API)
  check(hip.hipSetDevice(1))
  check(hip.hipStreamWaitEvent(None, evt, 0))
  # back on device 0, record the event (again with stream None)
  check(hip.hipSetDevice(0))
  check(hip.hipEventRecord(evt, None))

View file

@ -66,7 +66,7 @@ def ioctl(fd, request, argp):
print(f"{(st-start)*1000:7.2f} ms +{et*1000.:7.2f} ms : {ret:2d} = {name:40s}", ' '.join(format_struct(s)))
if name == "AMDKFD_IOC_SVM":
out = ctypes.cast(s.attrs, ctypes.POINTER(kfd_ioctl.struct_kfd_ioctl_svm_attribute))
for i in range(s.nattr): print(f"{i}: {kfd_ioctl.kfd_ioctl_svm_attr_type__enumvalues[out[i].type]:40s}: {out[i].value:#x}")
for i in range(s.nattr): print(f"{i}: {kfd_ioctl.enum_kfd_ioctl_svm_attr_type.get(out[i].type):40s}: {out[i].value:#x}")
else:
print(f"{(st-start)*1000:7.2f} ms +{et*1000.:7.2f} ms : ioctl",
f"{idir=} {size=} {itype=} {nr=} {fd=} {ret=}", os.readlink(f"/proc/self/fd/{fd}") if fd >= 0 else "")

View file

@ -1,45 +0,0 @@
# -*- coding: utf-8 -*-
# Generated by the protocol buffer compiler. DO NOT EDIT!
# source: sentencepiece_model.proto
# Protobuf Python Version: 4.25.1
"""Generated protocol buffer code."""
from google.protobuf import descriptor as _descriptor
from google.protobuf import descriptor_pool as _descriptor_pool
from google.protobuf import symbol_database as _symbol_database
from google.protobuf.internal import builder as _builder
# @@protoc_insertion_point(imports)
_sym_db = _symbol_database.Default()
DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x19sentencepiece_model.proto\x12\rsentencepiece\"\x80\x0c\n\x0bTrainerSpec\x12\r\n\x05input\x18\x01 \x03(\t\x12\x14\n\x0cinput_format\x18\x07 \x01(\t\x12\x14\n\x0cmodel_prefix\x18\x02 \x01(\t\x12\x41\n\nmodel_type\x18\x03 \x01(\x0e\x32$.sentencepiece.TrainerSpec.ModelType:\x07UNIGRAM\x12\x18\n\nvocab_size\x18\x04 \x01(\x05:\x04\x38\x30\x30\x30\x12\x17\n\x0f\x61\x63\x63\x65pt_language\x18\x05 \x03(\t\x12 \n\x15self_test_sample_size\x18\x06 \x01(\x05:\x01\x30\x12*\n\x1b\x65nable_differential_privacy\x18\x32 \x01(\x08:\x05\x66\x61lse\x12+\n differential_privacy_noise_level\x18\x33 \x01(\x02:\x01\x30\x12\x32\n\'differential_privacy_clipping_threshold\x18\x34 \x01(\x04:\x01\x30\x12\"\n\x12\x63haracter_coverage\x18\n \x01(\x02:\x06\x30.9995\x12\x1e\n\x13input_sentence_size\x18\x0b \x01(\x04:\x01\x30\x12$\n\x16shuffle_input_sentence\x18\x13 \x01(\x08:\x04true\x12 \n\x14mining_sentence_size\x18\x0c \x01(\x05\x42\x02\x18\x01\x12\"\n\x16training_sentence_size\x18\r \x01(\x05\x42\x02\x18\x01\x12(\n\x17seed_sentencepiece_size\x18\x0e \x01(\x05:\x07\x31\x30\x30\x30\x30\x30\x30\x12\x1e\n\x10shrinking_factor\x18\x0f \x01(\x02:\x04\x30.75\x12!\n\x13max_sentence_length\x18\x12 \x01(\x05:\x04\x34\x31\x39\x32\x12\x17\n\x0bnum_threads\x18\x10 \x01(\x05:\x02\x31\x36\x12\x1d\n\x12num_sub_iterations\x18\x11 \x01(\x05:\x01\x32\x12$\n\x18max_sentencepiece_length\x18\x14 \x01(\x05:\x02\x31\x36\x12%\n\x17split_by_unicode_script\x18\x15 \x01(\x08:\x04true\x12\x1d\n\x0fsplit_by_number\x18\x17 \x01(\x08:\x04true\x12!\n\x13split_by_whitespace\x18\x16 \x01(\x08:\x04true\x12)\n\x1atreat_whitespace_as_suffix\x18\x18 \x01(\x08:\x05\x66\x61lse\x12+\n\x1c\x61llow_whitespace_only_pieces\x18\x1a \x01(\x08:\x05\x66\x61lse\x12\x1b\n\x0csplit_digits\x18\x19 \x01(\x08:\x05\x66\x61lse\x12#\n\x19pretokenization_delimiter\x18\x35 \x01(\t:\x00\x12\x17\n\x0f\x63ontrol_symbols\x18\x1e \x03(\t\x12\x1c\n\x14user_defined_symbols\x18\x1f 
\x03(\t\x12\x16\n\x0erequired_chars\x18$ \x01(\t\x12\x1c\n\rbyte_fallback\x18# \x01(\x08:\x05\x66\x61lse\x12+\n\x1dvocabulary_output_piece_score\x18 \x01(\x08:\x04true\x12\x1e\n\x10hard_vocab_limit\x18! \x01(\x08:\x04true\x12\x1c\n\ruse_all_vocab\x18\" \x01(\x08:\x05\x66\x61lse\x12\x11\n\x06unk_id\x18( \x01(\x05:\x01\x30\x12\x11\n\x06\x62os_id\x18) \x01(\x05:\x01\x31\x12\x11\n\x06\x65os_id\x18* \x01(\x05:\x01\x32\x12\x12\n\x06pad_id\x18+ \x01(\x05:\x02-1\x12\x18\n\tunk_piece\x18- \x01(\t:\x05<unk>\x12\x16\n\tbos_piece\x18. \x01(\t:\x03<s>\x12\x17\n\teos_piece\x18/ \x01(\t:\x04</s>\x12\x18\n\tpad_piece\x18\x30 \x01(\t:\x05<pad>\x12\x1a\n\x0bunk_surface\x18, \x01(\t:\x05 \xe2\x81\x87 \x12+\n\x1ctrain_extremely_large_corpus\x18\x31 \x01(\x08:\x05\x66\x61lse\"5\n\tModelType\x12\x0b\n\x07UNIGRAM\x10\x01\x12\x07\n\x03\x42PE\x10\x02\x12\x08\n\x04WORD\x10\x03\x12\x08\n\x04\x43HAR\x10\x04*\t\x08\xc8\x01\x10\x80\x80\x80\x80\x02\"\xd1\x01\n\x0eNormalizerSpec\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x1c\n\x14precompiled_charsmap\x18\x02 \x01(\x0c\x12\x1e\n\x10\x61\x64\x64_dummy_prefix\x18\x03 \x01(\x08:\x04true\x12&\n\x18remove_extra_whitespaces\x18\x04 \x01(\x08:\x04true\x12 \n\x12\x65scape_whitespaces\x18\x05 \x01(\x08:\x04true\x12\x1e\n\x16normalization_rule_tsv\x18\x06 \x01(\t*\t\x08\xc8\x01\x10\x80\x80\x80\x80\x02\"y\n\x0cSelfTestData\x12\x33\n\x07samples\x18\x01 \x03(\x0b\x32\".sentencepiece.SelfTestData.Sample\x1a)\n\x06Sample\x12\r\n\x05input\x18\x01 \x01(\t\x12\x10\n\x08\x65xpected\x18\x02 \x01(\t*\t\x08\xc8\x01\x10\x80\x80\x80\x80\x02\"\xfe\x03\n\nModelProto\x12\x37\n\x06pieces\x18\x01 \x03(\x0b\x32\'.sentencepiece.ModelProto.SentencePiece\x12\x30\n\x0ctrainer_spec\x18\x02 \x01(\x0b\x32\x1a.sentencepiece.TrainerSpec\x12\x36\n\x0fnormalizer_spec\x18\x03 \x01(\x0b\x32\x1d.sentencepiece.NormalizerSpec\x12\x33\n\x0eself_test_data\x18\x04 \x01(\x0b\x32\x1b.sentencepiece.SelfTestData\x12\x38\n\x11\x64\x65normalizer_spec\x18\x05 
\x01(\x0b\x32\x1d.sentencepiece.NormalizerSpec\x1a\xd2\x01\n\rSentencePiece\x12\r\n\x05piece\x18\x01 \x01(\t\x12\r\n\x05score\x18\x02 \x01(\x02\x12\x42\n\x04type\x18\x03 \x01(\x0e\x32,.sentencepiece.ModelProto.SentencePiece.Type:\x06NORMAL\"T\n\x04Type\x12\n\n\x06NORMAL\x10\x01\x12\x0b\n\x07UNKNOWN\x10\x02\x12\x0b\n\x07\x43ONTROL\x10\x03\x12\x10\n\x0cUSER_DEFINED\x10\x04\x12\x08\n\x04\x42YTE\x10\x06\x12\n\n\x06UNUSED\x10\x05*\t\x08\xc8\x01\x10\x80\x80\x80\x80\x02*\t\x08\xc8\x01\x10\x80\x80\x80\x80\x02\x42\x02H\x03')
_globals = globals()
_builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals)
_builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, 'sentencepiece_model_pb2', _globals)
if _descriptor._USE_C_DESCRIPTORS == False:
_globals['DESCRIPTOR']._options = None
_globals['DESCRIPTOR']._serialized_options = b'H\003'
_globals['_TRAINERSPEC'].fields_by_name['mining_sentence_size']._options = None
_globals['_TRAINERSPEC'].fields_by_name['mining_sentence_size']._serialized_options = b'\030\001'
_globals['_TRAINERSPEC'].fields_by_name['training_sentence_size']._options = None
_globals['_TRAINERSPEC'].fields_by_name['training_sentence_size']._serialized_options = b'\030\001'
_globals['_TRAINERSPEC']._serialized_start=45
_globals['_TRAINERSPEC']._serialized_end=1581
_globals['_TRAINERSPEC_MODELTYPE']._serialized_start=1517
_globals['_TRAINERSPEC_MODELTYPE']._serialized_end=1570
_globals['_NORMALIZERSPEC']._serialized_start=1584
_globals['_NORMALIZERSPEC']._serialized_end=1793
_globals['_SELFTESTDATA']._serialized_start=1795
_globals['_SELFTESTDATA']._serialized_end=1916
_globals['_SELFTESTDATA_SAMPLE']._serialized_start=1864
_globals['_SELFTESTDATA_SAMPLE']._serialized_end=1905
_globals['_MODELPROTO']._serialized_start=1919
_globals['_MODELPROTO']._serialized_end=2429
_globals['_MODELPROTO_SENTENCEPIECE']._serialized_start=2208
_globals['_MODELPROTO_SENTENCEPIECE']._serialized_end=2418
_globals['_MODELPROTO_SENTENCEPIECE_TYPE']._serialized_start=2323
_globals['_MODELPROTO_SENTENCEPIECE_TYPE']._serialized_end=2407
# @@protoc_insertion_point(module_scope)

View file

@ -1,176 +0,0 @@
from __future__ import annotations
from typing import List, Optional, Dict, cast
import numpy as np
np.set_printoptions(suppress=True)
import math, functools, time, random, statistics
from tinygrad.helpers import DEBUG, getenv, CACHELEVEL, diskcache_get, diskcache_put, colored, Profiling
from tinygrad.codegen.opt.kernel import Kernel
from tinygrad.device import Buffer, Device, CompileError
from tinygrad.codegen.opt.search import _ensure_buffer_alloc, get_kernel_actions, _time_program
from tinygrad.engine.realize import get_program
class MCTSNode:
  """One node of the search tree: a kernel plus the statistics gathered for it."""
  def __init__(self, kernel:Kernel, parent=None):
    self.kernel:Kernel = kernel
    # tree links; children stays None until the node is expanded
    self.parents: List[MCTSNode] = [] if parent is None else [parent]
    self.children: Optional[List[MCTSNode]] = None
    self.removed_children: List[MCTSNode] = []
    # t: lowest time backpropagated through this node, tm: time measured for this node itself
    self.t = math.inf
    self.tm = math.inf
    # n: accumulated visit weight, i: search iteration at which the node was explored (-1 = never)
    self.n = 0
    self.i = -1
def expand_node(node:MCTSNode):
  """Give a not-yet-expanded node one child per action returned by `get_kernel_actions`."""
  assert node.children is None
  actions = get_kernel_actions(node.kernel, include_0=False)
  node.children = [MCTSNode(next_kernel, node) for next_kernel in actions.values()]
def remove_node(node:MCTSNode):
  """Move `node` from each parent's `children` list to that parent's `removed_children` list."""
  for owner in node.parents:
    siblings = owner.children
    assert siblings is not None
    siblings.remove(node)
    owner.removed_children.append(node)
C = math.sqrt(2)
TEMP = 0.5
def _sample_tree(node:MCTSNode, best_tm:float) -> MCTSNode:
if node.children is None or len(node.children) == 0: return node
unexplored_children = []
explored_children = []
ucb_explored_children: List[float] = []
for child in node.children:
if child.n == 0: unexplored_children.append(child)
else:
ucb = -child.t/best_tm + C*math.sqrt(math.log(node.n)/child.n)
if not math.isinf(ucb):
explored_children.append(child)
ucb_explored_children.append(ucb)
if len(unexplored_children): return random.choice(unexplored_children)
if not len(explored_children): return node
# safe softmax
ucb_exp = np.exp((np.array(ucb_explored_children)-max(ucb_explored_children))/TEMP)
return _sample_tree(explored_children[np.random.choice(len(ucb_exp), p=ucb_exp/np.sum(ucb_exp))], best_tm)
# picks the next node to evaluate; expands nodes and prunes dead ends along the way
def sample_tree(root:MCTSNode, best_tm:float) -> Optional[MCTSNode]:
  """Return the next node to evaluate, or None once the root has no children left.

  The root is expanded on first use. Each round samples a node with `_sample_tree`; nodes that
  turn out to have an empty child list are removed from the tree and the sampling is retried.
  A sampled node that has never been visited is returned directly; a visited one is expanded
  if needed and one of its children is returned at random.
  """
  if root.children is None:
    expand_node(root)
  while root.children:
    # tree traversal
    picked = _sample_tree(root, best_tm)
    if picked.children is not None and len(picked.children) == 0:
      # expanded earlier and nothing left below it
      remove_node(picked)
      continue
    if picked.n == 0:
      return picked
    # node expansion
    if picked.children is None:
      expand_node(picked)
    assert picked.children is not None
    if len(picked.children) == 0:
      remove_node(picked)
      continue
    return random.choice(picked.children)
  return None
def backprop(bnode:MCTSNode, tm, strength=1.0):
  """Record a measured time on `bnode` and all of its ancestors.

  Each visited node keeps the smaller of its current `t` and `tm`, and its `n` grows by
  `strength`. The strength is divided evenly among a node's parents before recursing.
  """
  if tm < bnode.t:
    bnode.t = tm
  bnode.n = bnode.n + strength
  parent_count = len(bnode.parents)
  for ancestor in bnode.parents:
    backprop(ancestor, tm, strength/parent_count)
# number of search graphs written so far; used to give each MCTSGRAPH dump a distinct file name
graph_mcts_cnt = 0
def mcts_search(lin:Kernel, rawbufs:List[Buffer], amt:int) -> Kernel:
  """Search for a fast set of optimizations for `lin` using Monte Carlo tree search.

  Runs up to `amt` iterations. Each iteration samples a node, compiles and times its kernel
  (unless an identical optimized AST or compiled binary was already seen), and backpropagates
  the time through the tree.

  Args:
    lin: the starting kernel.
    rawbufs: buffers the candidate programs are timed with (passed through `_ensure_buffer_alloc`).
    amt: maximum number of search iterations; also part of the disk-cache key.

  Returns:
    The kernel with the lowest measured time, or `lin` if nothing beat the initial infinite time.
    On a disk-cache hit, a copy of `lin` with the cached opts applied is returned instead and no
    search is run.

  Environment: IGNORE_MCTS_CACHE skips the cache lookup; MCTSGRAPH writes the explored tree to
  /tmp/net.<n>.mcts.dot/.svg (needs networkx and the `dot` executable).
  """
  global graph_mcts_cnt
  # TODO: copied from BEAM
  key = {"ast": lin.ast.key, "amt": amt, "device": lin.opts.device, "suffix": lin.opts.suffix}
  if not getenv("IGNORE_MCTS_CACHE") and CACHELEVEL >= 1 and (val:=diskcache_get("mcts_search", key)) is not None:
    ret = lin.copy()
    # only replay the opts that lin does not already have applied
    for o in val[len(lin.applied_opts):]: ret.apply_opt(o)
    return ret
  rawbufs = _ensure_buffer_alloc(rawbufs)
  # time symbolic kernels with every variable at the midpoint of its range
  var_vals = {k.expr:(k.vmax+k.vmin)//2 for k in lin.ast.variables()}
  dev = Device[lin.opts.device]
  root = MCTSNode(lin)
  st = time.perf_counter()
  best, best_idx, best_tm = lin, 0, math.inf
  # first node seen for each compiled binary / optimized AST key, used to detect duplicates
  seen_libs: Dict[bytes, MCTSNode] = {}
  seen_asts: Dict[bytes, MCTSNode] = {}
  compile_time, runtime_time = 0.0, 0.0
  for i in range(amt):
    node = sample_tree(root, best_tm) # sample and expand
    if node is None: break # finished the whole tree
    node.i = i # when was node explored
    opt_ast = node.kernel.get_optimized_ast()
    if (sibling_node:=seen_asts.get(opt_ast.key, None)) is not None:
      # early check for same optimized AST hit
      remove_node(node)
      tm = sibling_node.t
    else:
      seen_asts[opt_ast.key] = node
      # lowering (50% of the time)
      p = get_program(node.kernel.get_optimized_ast(name_override="test"), node.kernel.opts)
      # rollout
      tm1 = time.perf_counter()
      try:
        lib = dev.compiler.compile(p.src)
      except CompileError:
        # NOTE: many of these "compiler errors" are caused by bad code output from the lowerer
        lib = None
      tm2 = time.perf_counter()
      if lib is None:
        # a kernel that does not compile counts as infinitely slow
        tm = math.inf
      else:
        if (sibling_node:=seen_libs.get(lib, None)) is not None:
          # NOTE: these should all be caught by the AST check, need to canonicalize
          # remove this node, it's a duplicate
          remove_node(node)
          tm = sibling_node.t
        else:
          seen_libs[lib] = node
          # median of 3 runs, scaled by 1e6 (printed below as microseconds); early_stop is 5x the
          # best time so far, scaled back down by 1e6
          try: tm = statistics.median(_time_program(p, lib, var_vals, rawbufs, cnt=3, early_stop=best_tm*5/1e6))*1e6
          except RuntimeError: tm = math.inf
          node.tm = tm
      tm3 = time.perf_counter()
      compile_time += tm2-tm1
      runtime_time += tm3-tm2
    # mock rollout
    #node.tm = tm = random.random() + 0.1
    if tm < best_tm: best, best_idx, best_tm = node.kernel, i, tm
    et = time.perf_counter() - st
    if DEBUG>=2: print(f"\r{et:7.2f}s {colored(f'{compile_time*100/et:3.0f}%', 'cyan')} {colored(f'{runtime_time*100/et:3.0f}%', 'red')}: {tm:12.2f} us best: {best_tm:12.2f} us @ {best_idx+1:4d} {i+1:4d}/{amt:4d} {int(round((i+1)/et)):4d}/s {node.kernel.colored_shape()}\033[K", end="") # noqa: E501
    # backprop
    backprop(node, tm)
  # finish the progress line, which was printed with \r and no newline
  if DEBUG>=2: print()
  if getenv("MCTSGRAPH"):
    import networkx as nx
    import os
    GRAPHPATH = "/tmp/net"
    def save_graph(G, fn, opt=""):
      # write the graph as .dot and render it to .svg with the external dot tool
      print("saving", G, f"to {fn}.svg")
      nx.drawing.nx_pydot.write_dot(G, f'{fn}.dot')
      os.system(f'dot {opt} -Tsvg {fn}.dot -o {fn}.svg')
    G = nx.DiGraph()
    def add_node(node:MCTSNode):
      # only visited nodes are drawn; children removed as duplicates/dead ends are still walked
      if node.n == 0: return
      for parent in node.parents: G.add_edge(parent, node)
      gopts = node.kernel.applied_opts
      # label with the last applied opt, i.e. the step that led to this node
      edge_lbl = f"{str(gopts[-1].op)[7:]} {gopts[-1].axis} {gopts[-1].arg}" if len(gopts) else "ROOT"
      G.add_node(node, label=f"{node.i+1}\n{node.tm:.2f} us\n{edge_lbl}\nt {node.t:.2f}\nn {node.n}",
                 fillcolor="#80ff8080" if node.tm == best_tm else "#ffff8080", style='filled' if node.t == best_tm else '')
      if node.children is not None:
        for child in node.children+node.removed_children: add_node(child)
    add_node(root)
    save_graph(G, f"{GRAPHPATH}.{graph_mcts_cnt}.mcts", '-Grankdir=LR')
    graph_mcts_cnt += 1
  if CACHELEVEL >= 1: diskcache_put("mcts_search", key, best.applied_opts)
  return best

View file

@ -48,7 +48,7 @@ if __name__=="__main__":
COMPILER = HIPCompiler(DEV.arch)
if DEV.arch in {'gfx1100', 'gfx1103', 'gfx1151'}:
if DEV.arch == 'gfx1103': NUM_WORKGROUPS = 8
if DEV.arch == 'gfx1151': NUM_WORKGROUPS = 40
if DEV.arch == 'gfx1151': NUM_WORKGROUPS = 32
launchBenchmark("v_wmma_bf16_16x16x16_bf16", (7,8,15))
launchBenchmark("v_wmma_f16_16x16x16_f16", (7,8,15))
launchBenchmark("v_wmma_f32_16x16x16_bf16", (7,8,15))

View file

@ -242,7 +242,8 @@ class BertIntermediate:
def __call__(self, hidden_states):
x = self.dense(hidden_states)
# tinygrad gelu is openai gelu but we need the original bert gelu
return gelu(x)
# NOTE: contiguous for speed
return gelu(x).contiguous()
class BertAttention:
def __init__(self, hidden_size, num_attention_heads, attention_probs_dropout_prob, hidden_dropout_prob):

View file

@ -0,0 +1,603 @@
/*
* SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: MIT
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
#ifndef clc9b0_h_
#define clc9b0_h_
#include "nvtypes.h"
#ifdef __cplusplus
extern "C" {
#endif
#define NVC9B0_VIDEO_DECODER (0x0000C9B0)
#define NVC9B0_NOP (0x00000100)
#define NVC9B0_NOP_PARAMETER 31:0
#define NVC9B0_PM_TRIGGER (0x00000140)
#define NVC9B0_PM_TRIGGER_V 31:0
#define NVC9B0_SET_APPLICATION_ID (0x00000200)
#define NVC9B0_SET_APPLICATION_ID_ID 31:0
#define NVC9B0_SET_APPLICATION_ID_ID_MPEG12 (0x00000001)
#define NVC9B0_SET_APPLICATION_ID_ID_VC1 (0x00000002)
#define NVC9B0_SET_APPLICATION_ID_ID_H264 (0x00000003)
#define NVC9B0_SET_APPLICATION_ID_ID_MPEG4 (0x00000004)
#define NVC9B0_SET_APPLICATION_ID_ID_VP8 (0x00000005)
#define NVC9B0_SET_APPLICATION_ID_ID_CTR64 (0x00000006)
#define NVC9B0_SET_APPLICATION_ID_ID_HEVC (0x00000007)
#define NVC9B0_SET_APPLICATION_ID_ID_NEW_H264 (0x00000008)
#define NVC9B0_SET_APPLICATION_ID_ID_VP9 (0x00000009)
#define NVC9B0_SET_APPLICATION_ID_ID_PASS1 (0x0000000A)
#define NVC9B0_SET_APPLICATION_ID_ID_HEVC_PARSER (0x0000000C)
#define NVC9B0_SET_APPLICATION_ID_ID_UCODE_TEST (0x0000000D)
#define NVC9B0_SET_APPLICATION_ID_ID_HWDRM_PR_DECRYPTAUDIO (0x0000000E)
#define NVC9B0_SET_APPLICATION_ID_ID_HWDRM_PR_DECRYPTAUDIOMULTIPLE (0x0000000F)
#define NVC9B0_SET_APPLICATION_ID_ID_HWDRM_PR_PREPROCESSENCRYPTEDDATA (0x00000010)
#define NVC9B0_SET_APPLICATION_ID_ID_VP9_WITH_PARSER (0x00000011)
#define NVC9B0_SET_APPLICATION_ID_ID_AVD (0x00000012)
#define NVC9B0_SET_APPLICATION_ID_ID_HW_DRM_PR4_DECRYPTCONTENTMULTIPLE (0x00000013)
#define NVC9B0_SET_APPLICATION_ID_ID_DHKE (0x00000020)
#define NVC9B0_SET_WATCHDOG_TIMER (0x00000204)
#define NVC9B0_SET_WATCHDOG_TIMER_TIMER 31:0
#define NVC9B0_SEMAPHORE_A (0x00000240)
#define NVC9B0_SEMAPHORE_A_UPPER 7:0
#define NVC9B0_SEMAPHORE_B (0x00000244)
#define NVC9B0_SEMAPHORE_B_LOWER 31:0
#define NVC9B0_SEMAPHORE_C (0x00000248)
#define NVC9B0_SEMAPHORE_C_PAYLOAD 31:0
#define NVC9B0_CTX_SAVE_AREA (0x0000024C)
#define NVC9B0_CTX_SAVE_AREA_OFFSET 31:0
#define NVC9B0_CTX_SWITCH (0x00000250)
#define NVC9B0_CTX_SWITCH_OP 1:0
#define NVC9B0_CTX_SWITCH_OP_CTX_UPDATE (0x00000000)
#define NVC9B0_CTX_SWITCH_OP_CTX_SAVE (0x00000001)
#define NVC9B0_CTX_SWITCH_OP_CTX_RESTORE (0x00000002)
#define NVC9B0_CTX_SWITCH_OP_CTX_FORCERESTORE (0x00000003)
#define NVC9B0_CTX_SWITCH_CTXID_VALID 2:2
#define NVC9B0_CTX_SWITCH_CTXID_VALID_FALSE (0x00000000)
#define NVC9B0_CTX_SWITCH_CTXID_VALID_TRUE (0x00000001)
#define NVC9B0_CTX_SWITCH_RESERVED0 7:3
#define NVC9B0_CTX_SWITCH_CTX_ID 23:8
#define NVC9B0_CTX_SWITCH_RESERVED1 31:24
#define NVC9B0_SET_SEMAPHORE_PAYLOAD_LOWER (0x00000254)
#define NVC9B0_SET_SEMAPHORE_PAYLOAD_LOWER_PAYLOAD_LOWER 31:0
#define NVC9B0_SET_SEMAPHORE_PAYLOAD_UPPER (0x00000258)
#define NVC9B0_SET_SEMAPHORE_PAYLOAD_UPPER_PAYLOAD_UPPER 31:0
#define NVC9B0_SET_MONITORED_FENCE_SIGNAL_ADDRESS_BASE_A (0x0000025C)
#define NVC9B0_SET_MONITORED_FENCE_SIGNAL_ADDRESS_BASE_A_LOWER 31:0
#define NVC9B0_SET_MONITORED_FENCE_SIGNAL_ADDRESS_BASE_B (0x00000260)
#define NVC9B0_SET_MONITORED_FENCE_SIGNAL_ADDRESS_BASE_B_UPPER 31:0
#define NVC9B0_EXECUTE (0x00000300)
#define NVC9B0_EXECUTE_NOTIFY 0:0
#define NVC9B0_EXECUTE_NOTIFY_DISABLE (0x00000000)
#define NVC9B0_EXECUTE_NOTIFY_ENABLE (0x00000001)
#define NVC9B0_EXECUTE_NOTIFY_ON 1:1
#define NVC9B0_EXECUTE_NOTIFY_ON_END (0x00000000)
#define NVC9B0_EXECUTE_NOTIFY_ON_BEGIN (0x00000001)
#define NVC9B0_EXECUTE_PREDICATION 2:2
#define NVC9B0_EXECUTE_PREDICATION_DISABLE (0x00000000)
#define NVC9B0_EXECUTE_PREDICATION_ENABLE (0x00000001)
#define NVC9B0_EXECUTE_PREDICATION_OP 3:3
#define NVC9B0_EXECUTE_PREDICATION_OP_EQUAL_ZERO (0x00000000)
#define NVC9B0_EXECUTE_PREDICATION_OP_NOT_EQUAL_ZERO (0x00000001)
#define NVC9B0_EXECUTE_AWAKEN 8:8
#define NVC9B0_EXECUTE_AWAKEN_DISABLE (0x00000000)
#define NVC9B0_EXECUTE_AWAKEN_ENABLE (0x00000001)
#define NVC9B0_SEMAPHORE_D (0x00000304)
#define NVC9B0_SEMAPHORE_D_STRUCTURE_SIZE 1:0
#define NVC9B0_SEMAPHORE_D_STRUCTURE_SIZE_ONE (0x00000000)
#define NVC9B0_SEMAPHORE_D_STRUCTURE_SIZE_FOUR (0x00000001)
#define NVC9B0_SEMAPHORE_D_STRUCTURE_SIZE_TWO (0x00000002)
#define NVC9B0_SEMAPHORE_D_AWAKEN_ENABLE 8:8
#define NVC9B0_SEMAPHORE_D_AWAKEN_ENABLE_FALSE (0x00000000)
#define NVC9B0_SEMAPHORE_D_AWAKEN_ENABLE_TRUE (0x00000001)
#define NVC9B0_SEMAPHORE_D_OPERATION 17:16
#define NVC9B0_SEMAPHORE_D_OPERATION_RELEASE (0x00000000)
#define NVC9B0_SEMAPHORE_D_OPERATION_RESERVED_0 (0x00000001)
#define NVC9B0_SEMAPHORE_D_OPERATION_RESERVED_1 (0x00000002)
#define NVC9B0_SEMAPHORE_D_OPERATION_TRAP (0x00000003)
#define NVC9B0_SEMAPHORE_D_FLUSH_DISABLE 21:21
#define NVC9B0_SEMAPHORE_D_FLUSH_DISABLE_FALSE (0x00000000)
#define NVC9B0_SEMAPHORE_D_FLUSH_DISABLE_TRUE (0x00000001)
#define NVC9B0_SEMAPHORE_D_TRAP_TYPE 23:22
#define NVC9B0_SEMAPHORE_D_TRAP_TYPE_UNCONDITIONAL (0x00000000)
#define NVC9B0_SEMAPHORE_D_TRAP_TYPE_CONDITIONAL (0x00000001)
#define NVC9B0_SEMAPHORE_D_TRAP_TYPE_CONDITIONAL_EXT (0x00000002)
#define NVC9B0_SEMAPHORE_D_PAYLOAD_SIZE 24:24
#define NVC9B0_SEMAPHORE_D_PAYLOAD_SIZE_32BIT (0x00000000)
#define NVC9B0_SEMAPHORE_D_PAYLOAD_SIZE_64BIT (0x00000001)
#define NVC9B0_SET_PREDICATION_OFFSET_UPPER (0x00000308)
#define NVC9B0_SET_PREDICATION_OFFSET_UPPER_OFFSET 7:0
#define NVC9B0_SET_PREDICATION_OFFSET_LOWER (0x0000030C)
#define NVC9B0_SET_PREDICATION_OFFSET_LOWER_OFFSET 31:0
#define NVC9B0_SET_AUXILIARY_DATA_BUFFER (0x00000310)
#define NVC9B0_SET_AUXILIARY_DATA_BUFFER_OFFSET 31:0
#define NVC9B0_SET_CONTROL_PARAMS (0x00000400)
#define NVC9B0_SET_CONTROL_PARAMS_CODEC_TYPE 3:0
#define NVC9B0_SET_CONTROL_PARAMS_CODEC_TYPE_MPEG1 (0x00000000)
#define NVC9B0_SET_CONTROL_PARAMS_CODEC_TYPE_MPEG2 (0x00000001)
#define NVC9B0_SET_CONTROL_PARAMS_CODEC_TYPE_VC1 (0x00000002)
#define NVC9B0_SET_CONTROL_PARAMS_CODEC_TYPE_H264 (0x00000003)
#define NVC9B0_SET_CONTROL_PARAMS_CODEC_TYPE_MPEG4 (0x00000004)
#define NVC9B0_SET_CONTROL_PARAMS_CODEC_TYPE_DIVX3 (0x00000004)
#define NVC9B0_SET_CONTROL_PARAMS_CODEC_TYPE_VP8 (0x00000005)
#define NVC9B0_SET_CONTROL_PARAMS_CODEC_TYPE_HEVC (0x00000007)
#define NVC9B0_SET_CONTROL_PARAMS_CODEC_TYPE_VP9 (0x00000009)
#define NVC9B0_SET_CONTROL_PARAMS_CODEC_TYPE_AV1 (0x0000000A)
#define NVC9B0_SET_CONTROL_PARAMS_GPTIMER_ON 4:4
#define NVC9B0_SET_CONTROL_PARAMS_RET_ERROR 5:5
#define NVC9B0_SET_CONTROL_PARAMS_ERR_CONCEAL_ON 6:6
#define NVC9B0_SET_CONTROL_PARAMS_ERROR_FRM_IDX 12:7
#define NVC9B0_SET_CONTROL_PARAMS_MBTIMER_ON 13:13
#define NVC9B0_SET_CONTROL_PARAMS_EC_INTRA_FRAME_USING_PSLC 14:14
#define NVC9B0_SET_CONTROL_PARAMS_IGNORE_SOME_FIELDS_CRC_CHECK 15:15
#define NVC9B0_SET_CONTROL_PARAMS_EVENT_TRACE_LOGGING_ON 16:16
#define NVC9B0_SET_CONTROL_PARAMS_ALL_INTRA_FRAME 17:17
#define NVC9B0_SET_CONTROL_PARAMS_TESTRUN_ENV 19:18
#define NVC9B0_SET_CONTROL_PARAMS_TESTRUN_ENV_TRACE3D_RUN (0x00000000)
#define NVC9B0_SET_CONTROL_PARAMS_TESTRUN_ENV_PROD_RUN (0x00000001)
#define NVC9B0_SET_CONTROL_PARAMS_HINT_DUMP_EN 20:20
#define NVC9B0_SET_CONTROL_PARAMS_RESERVED 25:21
#define NVC9B0_SET_CONTROL_PARAMS_NVDECSIM_SKIP_SCP 26:26
#define NVC9B0_SET_CONTROL_PARAMS_ENABLE_ENCRYPT 27:27
#define NVC9B0_SET_CONTROL_PARAMS_ENCRYPTMODE 31:28
#define NVC9B0_SET_DRV_PIC_SETUP_OFFSET (0x00000404)
#define NVC9B0_SET_DRV_PIC_SETUP_OFFSET_OFFSET 31:0
#define NVC9B0_SET_IN_BUF_BASE_OFFSET (0x00000408)
#define NVC9B0_SET_IN_BUF_BASE_OFFSET_OFFSET 31:0
#define NVC9B0_SET_PICTURE_INDEX (0x0000040C)
#define NVC9B0_SET_PICTURE_INDEX_INDEX 31:0
#define NVC9B0_SET_SLICE_OFFSETS_BUF_OFFSET (0x00000410)
#define NVC9B0_SET_SLICE_OFFSETS_BUF_OFFSET_OFFSET 31:0
#define NVC9B0_SET_COLOC_DATA_OFFSET (0x00000414)
#define NVC9B0_SET_COLOC_DATA_OFFSET_OFFSET 31:0
#define NVC9B0_SET_HISTORY_OFFSET (0x00000418)
#define NVC9B0_SET_HISTORY_OFFSET_OFFSET 31:0
#define NVC9B0_SET_DISPLAY_BUF_SIZE (0x0000041C)
#define NVC9B0_SET_DISPLAY_BUF_SIZE_SIZE 31:0
#define NVC9B0_SET_HISTOGRAM_OFFSET (0x00000420)
#define NVC9B0_SET_HISTOGRAM_OFFSET_OFFSET 31:0
#define NVC9B0_SET_NVDEC_STATUS_OFFSET (0x00000424)
#define NVC9B0_SET_NVDEC_STATUS_OFFSET_OFFSET 31:0
#define NVC9B0_SET_DISPLAY_BUF_LUMA_OFFSET (0x00000428)
#define NVC9B0_SET_DISPLAY_BUF_LUMA_OFFSET_OFFSET 31:0
#define NVC9B0_SET_DISPLAY_BUF_CHROMA_OFFSET (0x0000042C)
#define NVC9B0_SET_DISPLAY_BUF_CHROMA_OFFSET_OFFSET 31:0
#define NVC9B0_SET_PICTURE_LUMA_OFFSET0 (0x00000430)
#define NVC9B0_SET_PICTURE_LUMA_OFFSET0_OFFSET 31:0
#define NVC9B0_SET_PICTURE_LUMA_OFFSET1 (0x00000434)
#define NVC9B0_SET_PICTURE_LUMA_OFFSET1_OFFSET 31:0
#define NVC9B0_SET_PICTURE_LUMA_OFFSET2 (0x00000438)
#define NVC9B0_SET_PICTURE_LUMA_OFFSET2_OFFSET 31:0
#define NVC9B0_SET_PICTURE_LUMA_OFFSET3 (0x0000043C)
#define NVC9B0_SET_PICTURE_LUMA_OFFSET3_OFFSET 31:0
#define NVC9B0_SET_PICTURE_LUMA_OFFSET4 (0x00000440)
#define NVC9B0_SET_PICTURE_LUMA_OFFSET4_OFFSET 31:0
#define NVC9B0_SET_PICTURE_LUMA_OFFSET5 (0x00000444)
#define NVC9B0_SET_PICTURE_LUMA_OFFSET5_OFFSET 31:0
#define NVC9B0_SET_PICTURE_LUMA_OFFSET6 (0x00000448)
#define NVC9B0_SET_PICTURE_LUMA_OFFSET6_OFFSET 31:0
#define NVC9B0_SET_PICTURE_LUMA_OFFSET7 (0x0000044C)
#define NVC9B0_SET_PICTURE_LUMA_OFFSET7_OFFSET 31:0
#define NVC9B0_SET_PICTURE_LUMA_OFFSET8 (0x00000450)
#define NVC9B0_SET_PICTURE_LUMA_OFFSET8_OFFSET 31:0
#define NVC9B0_SET_PICTURE_LUMA_OFFSET9 (0x00000454)
#define NVC9B0_SET_PICTURE_LUMA_OFFSET9_OFFSET 31:0
#define NVC9B0_SET_PICTURE_LUMA_OFFSET10 (0x00000458)
#define NVC9B0_SET_PICTURE_LUMA_OFFSET10_OFFSET 31:0
#define NVC9B0_SET_PICTURE_LUMA_OFFSET11 (0x0000045C)
#define NVC9B0_SET_PICTURE_LUMA_OFFSET11_OFFSET 31:0
#define NVC9B0_SET_PICTURE_LUMA_OFFSET12 (0x00000460)
#define NVC9B0_SET_PICTURE_LUMA_OFFSET12_OFFSET 31:0
#define NVC9B0_SET_PICTURE_LUMA_OFFSET13 (0x00000464)
#define NVC9B0_SET_PICTURE_LUMA_OFFSET13_OFFSET 31:0
#define NVC9B0_SET_PICTURE_LUMA_OFFSET14 (0x00000468)
#define NVC9B0_SET_PICTURE_LUMA_OFFSET14_OFFSET 31:0
#define NVC9B0_SET_PICTURE_LUMA_OFFSET15 (0x0000046C)
#define NVC9B0_SET_PICTURE_LUMA_OFFSET15_OFFSET 31:0
#define NVC9B0_SET_PICTURE_LUMA_OFFSET16 (0x00000470)
#define NVC9B0_SET_PICTURE_LUMA_OFFSET16_OFFSET 31:0
#define NVC9B0_SET_PICTURE_CHROMA_OFFSET0 (0x00000474)
#define NVC9B0_SET_PICTURE_CHROMA_OFFSET0_OFFSET 31:0
#define NVC9B0_SET_PICTURE_CHROMA_OFFSET1 (0x00000478)
#define NVC9B0_SET_PICTURE_CHROMA_OFFSET1_OFFSET 31:0
#define NVC9B0_SET_PICTURE_CHROMA_OFFSET2 (0x0000047C)
#define NVC9B0_SET_PICTURE_CHROMA_OFFSET2_OFFSET 31:0
#define NVC9B0_SET_PICTURE_CHROMA_OFFSET3 (0x00000480)
#define NVC9B0_SET_PICTURE_CHROMA_OFFSET3_OFFSET 31:0
#define NVC9B0_SET_PICTURE_CHROMA_OFFSET4 (0x00000484)
#define NVC9B0_SET_PICTURE_CHROMA_OFFSET4_OFFSET 31:0
#define NVC9B0_SET_PICTURE_CHROMA_OFFSET5 (0x00000488)
#define NVC9B0_SET_PICTURE_CHROMA_OFFSET5_OFFSET 31:0
#define NVC9B0_SET_PICTURE_CHROMA_OFFSET6 (0x0000048C)
#define NVC9B0_SET_PICTURE_CHROMA_OFFSET6_OFFSET 31:0
#define NVC9B0_SET_PICTURE_CHROMA_OFFSET7 (0x00000490)
#define NVC9B0_SET_PICTURE_CHROMA_OFFSET7_OFFSET 31:0
#define NVC9B0_SET_PICTURE_CHROMA_OFFSET8 (0x00000494)
#define NVC9B0_SET_PICTURE_CHROMA_OFFSET8_OFFSET 31:0
#define NVC9B0_SET_PICTURE_CHROMA_OFFSET9 (0x00000498)
#define NVC9B0_SET_PICTURE_CHROMA_OFFSET9_OFFSET 31:0
#define NVC9B0_SET_PICTURE_CHROMA_OFFSET10 (0x0000049C)
#define NVC9B0_SET_PICTURE_CHROMA_OFFSET10_OFFSET 31:0
#define NVC9B0_SET_PICTURE_CHROMA_OFFSET11 (0x000004A0)
#define NVC9B0_SET_PICTURE_CHROMA_OFFSET11_OFFSET 31:0
#define NVC9B0_SET_PICTURE_CHROMA_OFFSET12 (0x000004A4)
#define NVC9B0_SET_PICTURE_CHROMA_OFFSET12_OFFSET 31:0
#define NVC9B0_SET_PICTURE_CHROMA_OFFSET13 (0x000004A8)
#define NVC9B0_SET_PICTURE_CHROMA_OFFSET13_OFFSET 31:0
#define NVC9B0_SET_PICTURE_CHROMA_OFFSET14 (0x000004AC)
#define NVC9B0_SET_PICTURE_CHROMA_OFFSET14_OFFSET 31:0
#define NVC9B0_SET_PICTURE_CHROMA_OFFSET15 (0x000004B0)
#define NVC9B0_SET_PICTURE_CHROMA_OFFSET15_OFFSET 31:0
#define NVC9B0_SET_PICTURE_CHROMA_OFFSET16 (0x000004B4)
#define NVC9B0_SET_PICTURE_CHROMA_OFFSET16_OFFSET 31:0
#define NVC9B0_SET_PIC_SCRATCH_BUF_OFFSET (0x000004B8)
#define NVC9B0_SET_PIC_SCRATCH_BUF_OFFSET_OFFSET 31:0
#define NVC9B0_SET_EXTERNAL_MVBUFFER_OFFSET (0x000004BC)
#define NVC9B0_SET_EXTERNAL_MVBUFFER_OFFSET_OFFSET 31:0
#define NVC9B0_SET_SUB_SAMPLE_MAP_OFFSET (0x000004C0)
#define NVC9B0_SET_SUB_SAMPLE_MAP_OFFSET_OFFSET 31:0
#define NVC9B0_SET_SUB_SAMPLE_MAP_IV_OFFSET (0x000004C4)
#define NVC9B0_SET_SUB_SAMPLE_MAP_IV_OFFSET_OFFSET 31:0
#define NVC9B0_SET_INTRA_TOP_BUF_OFFSET (0x000004C8)
#define NVC9B0_SET_INTRA_TOP_BUF_OFFSET_OFFSET 31:0
#define NVC9B0_SET_TILE_SIZE_BUF_OFFSET (0x000004CC)
#define NVC9B0_SET_TILE_SIZE_BUF_OFFSET_OFFSET 31:0
#define NVC9B0_SET_FILTER_BUFFER_OFFSET (0x000004D0)
#define NVC9B0_SET_FILTER_BUFFER_OFFSET_OFFSET 31:0
#define NVC9B0_SET_CRC_STRUCT_OFFSET (0x000004D4)
#define NVC9B0_SET_CRC_STRUCT_OFFSET_OFFSET 31:0
#define NVC9B0_SET_PR_SSM_CONTENT_INFO_BUF_OFFSET (0x000004D8)
#define NVC9B0_SET_PR_SSM_CONTENT_INFO_BUF_OFFSET_OFFSET 31:0
#define NVC9B0_H264_SET_MBHIST_BUF_OFFSET (0x00000500)
#define NVC9B0_H264_SET_MBHIST_BUF_OFFSET_OFFSET 31:0
#define NVC9B0_VP8_SET_PROB_DATA_OFFSET (0x00000540)
#define NVC9B0_VP8_SET_PROB_DATA_OFFSET_OFFSET 31:0
#define NVC9B0_VP8_SET_HEADER_PARTITION_BUF_BASE_OFFSET (0x00000544)
#define NVC9B0_VP8_SET_HEADER_PARTITION_BUF_BASE_OFFSET_OFFSET 31:0
#define NVC9B0_HEVC_SET_SCALING_LIST_OFFSET (0x00000580)
#define NVC9B0_HEVC_SET_SCALING_LIST_OFFSET_OFFSET 31:0
#define NVC9B0_HEVC_SET_TILE_SIZES_OFFSET (0x00000584)
#define NVC9B0_HEVC_SET_TILE_SIZES_OFFSET_OFFSET 31:0
#define NVC9B0_HEVC_SET_FILTER_BUFFER_OFFSET (0x00000588)
#define NVC9B0_HEVC_SET_FILTER_BUFFER_OFFSET_OFFSET 31:0
#define NVC9B0_HEVC_SET_SAO_BUFFER_OFFSET (0x0000058C)
#define NVC9B0_HEVC_SET_SAO_BUFFER_OFFSET_OFFSET 31:0
#define NVC9B0_HEVC_SET_SLICE_INFO_BUFFER_OFFSET (0x00000590)
#define NVC9B0_HEVC_SET_SLICE_INFO_BUFFER_OFFSET_OFFSET 31:0
#define NVC9B0_HEVC_SET_SLICE_GROUP_INDEX (0x00000594)
#define NVC9B0_HEVC_SET_SLICE_GROUP_INDEX_OFFSET 31:0
#define NVC9B0_VP9_SET_PROB_TAB_BUF_OFFSET (0x000005C0)
#define NVC9B0_VP9_SET_PROB_TAB_BUF_OFFSET_OFFSET 31:0
#define NVC9B0_VP9_SET_CTX_COUNTER_BUF_OFFSET (0x000005C4)
#define NVC9B0_VP9_SET_CTX_COUNTER_BUF_OFFSET_OFFSET 31:0
#define NVC9B0_VP9_SET_SEGMENT_READ_BUF_OFFSET (0x000005C8)
#define NVC9B0_VP9_SET_SEGMENT_READ_BUF_OFFSET_OFFSET 31:0
#define NVC9B0_VP9_SET_SEGMENT_WRITE_BUF_OFFSET (0x000005CC)
#define NVC9B0_VP9_SET_SEGMENT_WRITE_BUF_OFFSET_OFFSET 31:0
#define NVC9B0_VP9_SET_TILE_SIZE_BUF_OFFSET (0x000005D0)
#define NVC9B0_VP9_SET_TILE_SIZE_BUF_OFFSET_OFFSET 31:0
#define NVC9B0_VP9_SET_COL_MVWRITE_BUF_OFFSET (0x000005D4)
#define NVC9B0_VP9_SET_COL_MVWRITE_BUF_OFFSET_OFFSET 31:0
#define NVC9B0_VP9_SET_COL_MVREAD_BUF_OFFSET (0x000005D8)
#define NVC9B0_VP9_SET_COL_MVREAD_BUF_OFFSET_OFFSET 31:0
#define NVC9B0_VP9_SET_FILTER_BUFFER_OFFSET (0x000005DC)
#define NVC9B0_VP9_SET_FILTER_BUFFER_OFFSET_OFFSET 31:0
#define NVC9B0_VP9_PARSER_SET_PIC_SETUP_OFFSET (0x000005E0)
#define NVC9B0_VP9_PARSER_SET_PIC_SETUP_OFFSET_OFFSET 31:0
#define NVC9B0_VP9_PARSER_SET_PREV_PIC_SETUP_OFFSET (0x000005E4)
#define NVC9B0_VP9_PARSER_SET_PREV_PIC_SETUP_OFFSET_OFFSET 31:0
#define NVC9B0_VP9_PARSER_SET_PROB_TAB_BUF_OFFSET (0x000005E8)
#define NVC9B0_VP9_PARSER_SET_PROB_TAB_BUF_OFFSET_OFFSET 31:0
#define NVC9B0_VP9_SET_HINT_DUMP_BUF_OFFSET (0x000005EC)
#define NVC9B0_VP9_SET_HINT_DUMP_BUF_OFFSET_OFFSET 31:0
#define NVC9B0_PASS1_SET_CLEAR_HEADER_OFFSET (0x00000600)
#define NVC9B0_PASS1_SET_CLEAR_HEADER_OFFSET_OFFSET 31:0
#define NVC9B0_PASS1_SET_RE_ENCRYPT_OFFSET (0x00000604)
#define NVC9B0_PASS1_SET_RE_ENCRYPT_OFFSET_OFFSET 31:0
#define NVC9B0_PASS1_SET_VP8_TOKEN_OFFSET (0x00000608)
#define NVC9B0_PASS1_SET_VP8_TOKEN_OFFSET_OFFSET 31:0
#define NVC9B0_PASS1_SET_INPUT_DATA_OFFSET (0x0000060C)
#define NVC9B0_PASS1_SET_INPUT_DATA_OFFSET_OFFSET 31:0
#define NVC9B0_PASS1_SET_OUTPUT_DATA_SIZE_OFFSET (0x00000610)
#define NVC9B0_PASS1_SET_OUTPUT_DATA_SIZE_OFFSET_OFFSET 31:0
#define NVC9B0_AV1_SET_PROB_TAB_READ_BUF_OFFSET (0x00000640)
#define NVC9B0_AV1_SET_PROB_TAB_READ_BUF_OFFSET_OFFSET 31:0
#define NVC9B0_AV1_SET_PROB_TAB_WRITE_BUF_OFFSET (0x00000644)
#define NVC9B0_AV1_SET_PROB_TAB_WRITE_BUF_OFFSET_OFFSET 31:0
#define NVC9B0_AV1_SET_SEGMENT_READ_BUF_OFFSET (0x00000648)
#define NVC9B0_AV1_SET_SEGMENT_READ_BUF_OFFSET_OFFSET 31:0
#define NVC9B0_AV1_SET_SEGMENT_WRITE_BUF_OFFSET (0x0000064C)
#define NVC9B0_AV1_SET_SEGMENT_WRITE_BUF_OFFSET_OFFSET 31:0
#define NVC9B0_AV1_SET_COL_MV0_READ_BUF_OFFSET (0x00000650)
#define NVC9B0_AV1_SET_COL_MV0_READ_BUF_OFFSET_OFFSET 31:0
#define NVC9B0_AV1_SET_COL_MV1_READ_BUF_OFFSET (0x00000654)
#define NVC9B0_AV1_SET_COL_MV1_READ_BUF_OFFSET_OFFSET 31:0
#define NVC9B0_AV1_SET_COL_MV2_READ_BUF_OFFSET (0x00000658)
#define NVC9B0_AV1_SET_COL_MV2_READ_BUF_OFFSET_OFFSET 31:0
#define NVC9B0_AV1_SET_COL_MVWRITE_BUF_OFFSET (0x0000065C)
#define NVC9B0_AV1_SET_COL_MVWRITE_BUF_OFFSET_OFFSET 31:0
#define NVC9B0_AV1_SET_GLOBAL_MODEL_BUF_OFFSET (0x00000660)
#define NVC9B0_AV1_SET_GLOBAL_MODEL_BUF_OFFSET_OFFSET 31:0
#define NVC9B0_AV1_SET_FILM_GRAIN_BUF_OFFSET (0x00000664)
#define NVC9B0_AV1_SET_FILM_GRAIN_BUF_OFFSET_OFFSET 31:0
#define NVC9B0_AV1_SET_TILE_STREAM_INFO_BUF_OFFSET (0x00000668)
#define NVC9B0_AV1_SET_TILE_STREAM_INFO_BUF_OFFSET_OFFSET 31:0
#define NVC9B0_AV1_SET_SUB_STREAM_ENTRY_BUF_OFFSET (0x0000066C)
#define NVC9B0_AV1_SET_SUB_STREAM_ENTRY_BUF_OFFSET_OFFSET 31:0
#define NVC9B0_AV1_SET_HINT_DUMP_BUF_OFFSET (0x00000670)
#define NVC9B0_AV1_SET_HINT_DUMP_BUF_OFFSET_OFFSET 31:0
#define NVC9B0_H264_SET_SCALING_LIST_OFFSET (0x00000680)
#define NVC9B0_H264_SET_SCALING_LIST_OFFSET_OFFSET 31:0
#define NVC9B0_H264_SET_VLDHIST_BUF_OFFSET (0x00000684)
#define NVC9B0_H264_SET_VLDHIST_BUF_OFFSET_OFFSET 31:0
#define NVC9B0_H264_SET_EDOBOFFSET0 (0x00000688)
#define NVC9B0_H264_SET_EDOBOFFSET0_OFFSET 31:0
#define NVC9B0_H264_SET_EDOBOFFSET1 (0x0000068C)
#define NVC9B0_H264_SET_EDOBOFFSET1_OFFSET 31:0
#define NVC9B0_H264_SET_EDOBOFFSET2 (0x00000690)
#define NVC9B0_H264_SET_EDOBOFFSET2_OFFSET 31:0
#define NVC9B0_H264_SET_EDOBOFFSET3 (0x00000694)
#define NVC9B0_H264_SET_EDOBOFFSET3_OFFSET 31:0
/*
 * Indexed method: one 32-bit slot per index b, spaced 4 bytes apart and
 * starting at 0x0C00. The next method (SET_CTL_COUNT) is at 0x0C10, which
 * leaves room for b = 0..3 -- NOTE(review): presumably a 4-word (128-bit)
 * initial vector; confirm against the class documentation.
 *
 * The "hi:lo" values on the *_VALUE / *_OFFSET lines are bit ranges inside
 * the 32-bit method data. They are not C expressions on their own --
 * NOTE(review): presumably meant to be consumed by DRF-style field macros.
 */
#define NVC9B0_SET_CONTENT_INITIAL_VECTOR(b) (0x00000C00 + (b)*0x00000004)
#define NVC9B0_SET_CONTENT_INITIAL_VECTOR_VALUE 31:0
#define NVC9B0_SET_CTL_COUNT (0x00000C10)
#define NVC9B0_SET_CTL_COUNT_VALUE 31:0
/*
 * Source and destination are each split across two methods: UPPER_* exposes
 * an 8-bit field (7:0) and LOWER_* a full 32-bit field (31:0).
 * NOTE(review): presumably the high and low parts of a 40-bit address -- confirm.
 */
#define NVC9B0_SET_UPPER_SRC (0x00000C14)
#define NVC9B0_SET_UPPER_SRC_OFFSET 7:0
#define NVC9B0_SET_LOWER_SRC (0x00000C18)
#define NVC9B0_SET_LOWER_SRC_OFFSET 31:0
#define NVC9B0_SET_UPPER_DST (0x00000C1C)
#define NVC9B0_SET_UPPER_DST_OFFSET 7:0
#define NVC9B0_SET_LOWER_DST (0x00000C20)
#define NVC9B0_SET_LOWER_DST_OFFSET 31:0
#define NVC9B0_SET_BLOCK_COUNT (0x00000C24)
#define NVC9B0_SET_BLOCK_COUNT_VALUE 31:0
#define NVC9B0_PR_SET_REQUEST_BUF_OFFSET (0x00000D00)
#define NVC9B0_PR_SET_REQUEST_BUF_OFFSET_OFFSET 31:0
#define NVC9B0_PR_SET_REQUEST_BUF_SIZE (0x00000D04)
#define NVC9B0_PR_SET_REQUEST_BUF_SIZE_SIZE 31:0
#define NVC9B0_PR_SET_RESPONSE_BUF_OFFSET (0x00000D08)
#define NVC9B0_PR_SET_RESPONSE_BUF_OFFSET_OFFSET 31:0
#define NVC9B0_PR_SET_RESPONSE_BUF_SIZE (0x00000D0C)
#define NVC9B0_PR_SET_RESPONSE_BUF_SIZE_SIZE 31:0
#define NVC9B0_PR_SET_REQUEST_MESSAGE_BUF_OFFSET (0x00000D10)
#define NVC9B0_PR_SET_REQUEST_MESSAGE_BUF_OFFSET_OFFSET 31:0
#define NVC9B0_PR_SET_RESPONSE_MESSAGE_BUF_OFFSET (0x00000D14)
#define NVC9B0_PR_SET_RESPONSE_MESSAGE_BUF_OFFSET_OFFSET 31:0
#define NVC9B0_PR_SET_LOCAL_DECRYPT_BUF_OFFSET (0x00000D18)
#define NVC9B0_PR_SET_LOCAL_DECRYPT_BUF_OFFSET_OFFSET 31:0
#define NVC9B0_PR_SET_LOCAL_DECRYPT_BUF_SIZE (0x00000D1C)
#define NVC9B0_PR_SET_LOCAL_DECRYPT_BUF_SIZE_SIZE 31:0
#define NVC9B0_PR_SET_CONTENT_DECRYPT_INFO_BUF_OFFSET (0x00000D20)
#define NVC9B0_PR_SET_CONTENT_DECRYPT_INFO_BUF_OFFSET_OFFSET 31:0
#define NVC9B0_PR_SET_REENCRYPTED_BITSTREAM_BUF_OFFSET (0x00000D24)
#define NVC9B0_PR_SET_REENCRYPTED_BITSTREAM_BUF_OFFSET_OFFSET 31:0
#define NVC9B0_DH_KE_SET_CHALLENGE_BUF_OFFSET (0x00000E00)
#define NVC9B0_DH_KE_SET_CHALLENGE_BUF_OFFSET_OFFSET 31:0
#define NVC9B0_DH_KE_SET_RESPONSE_BUF_OFFSET (0x00000E04)
#define NVC9B0_DH_KE_SET_RESPONSE_BUF_OFFSET_OFFSET 31:0
#define NVC9B0_SET_SESSION_KEY(b) (0x00000F00 + (b)*0x00000004)
#define NVC9B0_SET_SESSION_KEY_VALUE 31:0
#define NVC9B0_SET_CONTENT_KEY(b) (0x00000F10 + (b)*0x00000004)
#define NVC9B0_SET_CONTENT_KEY_VALUE 31:0
#define NVC9B0_PM_TRIGGER_END (0x00001114)
#define NVC9B0_PM_TRIGGER_END_V 31:0
/*
 * Generic status codes. ERROR_NONE is 0; the OS_ERROR_* codes occupy the
 * consecutive values 0x1 through 0x9.
 */
#define NVC9B0_ERROR_NONE (0x00000000)
#define NVC9B0_OS_ERROR_EXECUTE_INSUFFICIENT_DATA (0x00000001)
#define NVC9B0_OS_ERROR_SEMAPHORE_INSUFFICIENT_DATA (0x00000002)
#define NVC9B0_OS_ERROR_INVALID_METHOD (0x00000003)
#define NVC9B0_OS_ERROR_INVALID_DMA_PAGE (0x00000004)
#define NVC9B0_OS_ERROR_UNHANDLED_INTERRUPT (0x00000005)
#define NVC9B0_OS_ERROR_EXCEPTION (0x00000006)
#define NVC9B0_OS_ERROR_INVALID_CTXSW_REQUEST (0x00000007)
#define NVC9B0_OS_ERROR_APPLICATION (0x00000008)
#define NVC9B0_OS_ERROR_SW_BREAKPT (0x00000009)
/*
 * OS_INTERRUPT_* codes: 0x100 through 0x700 in steps of 0x100, so they do
 * not collide with the OS_ERROR_* values above.
 */
#define NVC9B0_OS_INTERRUPT_EXECUTE_AWAKEN (0x00000100)
#define NVC9B0_OS_INTERRUPT_BACKEND_SEMAPHORE_AWAKEN (0x00000200)
#define NVC9B0_OS_INTERRUPT_CTX_ERROR_FBIF (0x00000300)
#define NVC9B0_OS_INTERRUPT_LIMIT_VIOLATION (0x00000400)
#define NVC9B0_OS_INTERRUPT_LIMIT_AND_FBIF_CTX_ERROR (0x00000500)
#define NVC9B0_OS_INTERRUPT_HALT_ENGINE (0x00000600)
#define NVC9B0_OS_INTERRUPT_TRAP_NONSTALL (0x00000700)
#define NVC9B0_H264_VLD_ERR_SEQ_DATA_INCONSISTENT (0x00004001)
#define NVC9B0_H264_VLD_ERR_PIC_DATA_INCONSISTENT (0x00004002)
#define NVC9B0_H264_VLD_ERR_SLC_DATA_BUF_ADDR_OUT_OF_BOUNDS (0x00004100)
#define NVC9B0_H264_VLD_ERR_BITSTREAM_ERROR (0x00004101)
#define NVC9B0_H264_VLD_ERR_CTX_DMA_ID_CTRL_IN_INVALID (0x000041F8)
#define NVC9B0_H264_VLD_ERR_SLC_HDR_OUT_SIZE_NOT_MULT256 (0x00004200)
#define NVC9B0_H264_VLD_ERR_SLC_DATA_OUT_SIZE_NOT_MULT256 (0x00004201)
#define NVC9B0_H264_VLD_ERR_CTX_DMA_ID_FLOW_CTRL_INVALID (0x00004203)
#define NVC9B0_H264_VLD_ERR_CTX_DMA_ID_SLC_HDR_OUT_INVALID (0x00004204)
#define NVC9B0_H264_VLD_ERR_SLC_HDR_OUT_BUF_TOO_SMALL (0x00004205)
#define NVC9B0_H264_VLD_ERR_SLC_HDR_OUT_BUF_ALREADY_VALID (0x00004206)
#define NVC9B0_H264_VLD_ERR_SLC_DATA_OUT_BUF_TOO_SMALL (0x00004207)
#define NVC9B0_H264_VLD_ERR_DATA_BUF_CNT_TOO_SMALL (0x00004208)
#define NVC9B0_H264_VLD_ERR_BITSTREAM_EMPTY (0x00004209)
#define NVC9B0_H264_VLD_ERR_FRAME_WIDTH_TOO_LARGE (0x0000420A)
#define NVC9B0_H264_VLD_ERR_FRAME_HEIGHT_TOO_LARGE (0x0000420B)
#define NVC9B0_H264_VLD_ERR_HIST_BUF_TOO_SMALL (0x00004300)
#define NVC9B0_VC1_VLD_ERR_PIC_DATA_BUF_ADDR_OUT_OF_BOUND (0x00005100)
#define NVC9B0_VC1_VLD_ERR_BITSTREAM_ERROR (0x00005101)
#define NVC9B0_VC1_VLD_ERR_PIC_HDR_OUT_SIZE_NOT_MULT256 (0x00005200)
#define NVC9B0_VC1_VLD_ERR_PIC_DATA_OUT_SIZE_NOT_MULT256 (0x00005201)
#define NVC9B0_VC1_VLD_ERR_CTX_DMA_ID_CTRL_IN_INVALID (0x00005202)
#define NVC9B0_VC1_VLD_ERR_CTX_DMA_ID_FLOW_CTRL_INVALID (0x00005203)
#define NVC9B0_VC1_VLD_ERR_CTX_DMA_ID_PIC_HDR_OUT_INVALID (0x00005204)
#define NVC9B0_VC1_VLD_ERR_SLC_HDR_OUT_BUF_TOO_SMALL (0x00005205)
#define NVC9B0_VC1_VLD_ERR_PIC_HDR_OUT_BUF_ALREADY_VALID (0x00005206)
#define NVC9B0_VC1_VLD_ERR_PIC_DATA_OUT_BUF_TOO_SMALL (0x00005207)
#define NVC9B0_VC1_VLD_ERR_DATA_INFO_IN_BUF_TOO_SMALL (0x00005208)
#define NVC9B0_VC1_VLD_ERR_BITSTREAM_EMPTY (0x00005209)
#define NVC9B0_VC1_VLD_ERR_FRAME_WIDTH_TOO_LARGE (0x0000520A)
#define NVC9B0_VC1_VLD_ERR_FRAME_HEIGHT_TOO_LARGE (0x0000520B)
#define NVC9B0_VC1_VLD_ERR_PIC_DATA_OUT_BUF_FULL_TIME_OUT (0x00005300)
#define NVC9B0_MPEG12_VLD_ERR_SLC_DATA_BUF_ADDR_OUT_OF_BOUNDS (0x00006100)
#define NVC9B0_MPEG12_VLD_ERR_BITSTREAM_ERROR (0x00006101)
#define NVC9B0_MPEG12_VLD_ERR_SLC_DATA_OUT_SIZE_NOT_MULT256 (0x00006200)
#define NVC9B0_MPEG12_VLD_ERR_CTX_DMA_ID_CTRL_IN_INVALID (0x00006201)
#define NVC9B0_MPEG12_VLD_ERR_CTX_DMA_ID_FLOW_CTRL_INVALID (0x00006202)
#define NVC9B0_MPEG12_VLD_ERR_SLC_DATA_OUT_BUF_TOO_SMALL (0x00006203)
#define NVC9B0_MPEG12_VLD_ERR_DATA_INFO_IN_BUF_TOO_SMALL (0x00006204)
#define NVC9B0_MPEG12_VLD_ERR_BITSTREAM_EMPTY (0x00006205)
#define NVC9B0_MPEG12_VLD_ERR_INVALID_PIC_STRUCTURE (0x00006206)
#define NVC9B0_MPEG12_VLD_ERR_INVALID_PIC_CODING_TYPE (0x00006207)
#define NVC9B0_MPEG12_VLD_ERR_FRAME_WIDTH_TOO_LARGE (0x00006208)
#define NVC9B0_MPEG12_VLD_ERR_FRAME_HEIGHT_TOO_LARGE (0x00006209)
#define NVC9B0_MPEG12_VLD_ERR_SLC_DATA_OUT_BUF_FULL_TIME_OUT (0x00006300)
#define NVC9B0_CMN_VLD_ERR_PDEC_RETURNED_ERROR (0x00007101)
#define NVC9B0_CMN_VLD_ERR_EDOB_FLUSH_TIME_OUT (0x00007102)
#define NVC9B0_CMN_VLD_ERR_EDOB_REWIND_TIME_OUT (0x00007103)
#define NVC9B0_CMN_VLD_ERR_VLD_WD_TIME_OUT (0x00007104)
#define NVC9B0_CMN_VLD_ERR_NUM_SLICES_ZERO (0x00007105)
#define NVC9B0_MPEG4_VLD_ERR_PIC_DATA_BUF_ADDR_OUT_OF_BOUND (0x00008100)
#define NVC9B0_MPEG4_VLD_ERR_BITSTREAM_ERROR (0x00008101)
#define NVC9B0_MPEG4_VLD_ERR_PIC_HDR_OUT_SIZE_NOT_MULT256 (0x00008200)
#define NVC9B0_MPEG4_VLD_ERR_PIC_DATA_OUT_SIZE_NOT_MULT256 (0x00008201)
#define NVC9B0_MPEG4_VLD_ERR_CTX_DMA_ID_CTRL_IN_INVALID (0x00008202)
#define NVC9B0_MPEG4_VLD_ERR_CTX_DMA_ID_FLOW_CTRL_INVALID (0x00008203)
#define NVC9B0_MPEG4_VLD_ERR_CTX_DMA_ID_PIC_HDR_OUT_INVALID (0x00008204)
#define NVC9B0_MPEG4_VLD_ERR_SLC_HDR_OUT_BUF_TOO_SMALL (0x00008205)
#define NVC9B0_MPEG4_VLD_ERR_PIC_HDR_OUT_BUF_ALREADY_VALID (0x00008206)
#define NVC9B0_MPEG4_VLD_ERR_PIC_DATA_OUT_BUF_TOO_SMALL (0x00008207)
#define NVC9B0_MPEG4_VLD_ERR_DATA_INFO_IN_BUF_TOO_SMALL (0x00008208)
#define NVC9B0_MPEG4_VLD_ERR_BITSTREAM_EMPTY (0x00008209)
#define NVC9B0_MPEG4_VLD_ERR_FRAME_WIDTH_TOO_LARGE (0x0000820A)
#define NVC9B0_MPEG4_VLD_ERR_FRAME_HEIGHT_TOO_LARGE (0x0000820B)
#define NVC9B0_MPEG4_VLD_ERR_PIC_DATA_OUT_BUF_FULL_TIME_OUT (0x00051E01)
#define NVC9B0_DEC_ERROR_MPEG12_APPTIMER_EXPIRED (0xDEC10001)
#define NVC9B0_DEC_ERROR_MPEG12_MVTIMER_EXPIRED (0xDEC10002)
#define NVC9B0_DEC_ERROR_MPEG12_INVALID_TOKEN (0xDEC10003)
#define NVC9B0_DEC_ERROR_MPEG12_SLICEDATA_MISSING (0xDEC10004)
#define NVC9B0_DEC_ERROR_MPEG12_HWERR_INTERRUPT (0xDEC10005)
#define NVC9B0_DEC_ERROR_MPEG12_DETECTED_VLD_FAILURE (0xDEC10006)
#define NVC9B0_DEC_ERROR_MPEG12_PICTURE_INIT (0xDEC10100)
#define NVC9B0_DEC_ERROR_MPEG12_STATEMACHINE_FAILURE (0xDEC10101)
#define NVC9B0_DEC_ERROR_MPEG12_INVALID_CTXID_PIC (0xDEC10901)
#define NVC9B0_DEC_ERROR_MPEG12_INVALID_CTXID_UCODE (0xDEC10902)
#define NVC9B0_DEC_ERROR_MPEG12_INVALID_CTXID_FC (0xDEC10903)
#define NVC9B0_DEC_ERROR_MPEG12_INVALID_CTXID_SLH (0xDEC10904)
#define NVC9B0_DEC_ERROR_MPEG12_INVALID_UCODE_SIZE (0xDEC10905)
#define NVC9B0_DEC_ERROR_MPEG12_INVALID_SLICE_COUNT (0xDEC10906)
#define NVC9B0_DEC_ERROR_VC1_APPTIMER_EXPIRED (0xDEC20001)
#define NVC9B0_DEC_ERROR_VC1_MVTIMER_EXPIRED (0xDEC20002)
#define NVC9B0_DEC_ERROR_VC1_INVALID_TOKEN (0xDEC20003)
#define NVC9B0_DEC_ERROR_VC1_SLICEDATA_MISSING (0xDEC20004)
#define NVC9B0_DEC_ERROR_VC1_HWERR_INTERRUPT (0xDEC20005)
#define NVC9B0_DEC_ERROR_VC1_DETECTED_VLD_FAILURE (0xDEC20006)
#define NVC9B0_DEC_ERROR_VC1_TIMEOUT_POLLING_FOR_DATA (0xDEC20007)
#define NVC9B0_DEC_ERROR_VC1_PDEC_PIC_END_UNALIGNED (0xDEC20008)
#define NVC9B0_DEC_ERROR_VC1_WDTIMER_EXPIRED (0xDEC20009)
#define NVC9B0_DEC_ERROR_VC1_ERRINTSTART (0xDEC20010)
#define NVC9B0_DEC_ERROR_VC1_IQT_ERRINT (0xDEC20011)
#define NVC9B0_DEC_ERROR_VC1_MC_ERRINT (0xDEC20012)
#define NVC9B0_DEC_ERROR_VC1_MC_IQT_ERRINT (0xDEC20013)
#define NVC9B0_DEC_ERROR_VC1_REC_ERRINT (0xDEC20014)
#define NVC9B0_DEC_ERROR_VC1_REC_IQT_ERRINT (0xDEC20015)
#define NVC9B0_DEC_ERROR_VC1_REC_MC_ERRINT (0xDEC20016)
#define NVC9B0_DEC_ERROR_VC1_REC_MC_IQT_ERRINT (0xDEC20017)
#define NVC9B0_DEC_ERROR_VC1_DBF_ERRINT (0xDEC20018)
#define NVC9B0_DEC_ERROR_VC1_DBF_IQT_ERRINT (0xDEC20019)
#define NVC9B0_DEC_ERROR_VC1_DBF_MC_ERRINT (0xDEC2001A)
#define NVC9B0_DEC_ERROR_VC1_DBF_MC_IQT_ERRINT (0xDEC2001B)
#define NVC9B0_DEC_ERROR_VC1_DBF_REC_ERRINT (0xDEC2001C)
#define NVC9B0_DEC_ERROR_VC1_DBF_REC_IQT_ERRINT (0xDEC2001D)
#define NVC9B0_DEC_ERROR_VC1_DBF_REC_MC_ERRINT (0xDEC2001E)
#define NVC9B0_DEC_ERROR_VC1_DBF_REC_MC_IQT_ERRINT (0xDEC2001F)
#define NVC9B0_DEC_ERROR_VC1_PICTURE_INIT (0xDEC20100)
#define NVC9B0_DEC_ERROR_VC1_STATEMACHINE_FAILURE (0xDEC20101)
#define NVC9B0_DEC_ERROR_VC1_INVALID_CTXID_PIC (0xDEC20901)
#define NVC9B0_DEC_ERROR_VC1_INVALID_CTXID_UCODE (0xDEC20902)
#define NVC9B0_DEC_ERROR_VC1_INVALID_CTXID_FC (0xDEC20903)
#define NVC9B0_DEC_ERROR_VC1_INVAILD_CTXID_SLH (0xDEC20904)
#define NVC9B0_DEC_ERROR_VC1_INVALID_UCODE_SIZE (0xDEC20905)
#define NVC9B0_DEC_ERROR_VC1_INVALID_SLICE_COUNT (0xDEC20906)
#define NVC9B0_DEC_ERROR_H264_APPTIMER_EXPIRED (0xDEC30001)
#define NVC9B0_DEC_ERROR_H264_MVTIMER_EXPIRED (0xDEC30002)
#define NVC9B0_DEC_ERROR_H264_INVALID_TOKEN (0xDEC30003)
#define NVC9B0_DEC_ERROR_H264_SLICEDATA_MISSING (0xDEC30004)
#define NVC9B0_DEC_ERROR_H264_HWERR_INTERRUPT (0xDEC30005)
#define NVC9B0_DEC_ERROR_H264_DETECTED_VLD_FAILURE (0xDEC30006)
#define NVC9B0_DEC_ERROR_H264_ERRINTSTART (0xDEC30010)
#define NVC9B0_DEC_ERROR_H264_IQT_ERRINT (0xDEC30011)
#define NVC9B0_DEC_ERROR_H264_MC_ERRINT (0xDEC30012)
#define NVC9B0_DEC_ERROR_H264_MC_IQT_ERRINT (0xDEC30013)
#define NVC9B0_DEC_ERROR_H264_REC_ERRINT (0xDEC30014)
#define NVC9B0_DEC_ERROR_H264_REC_IQT_ERRINT (0xDEC30015)
#define NVC9B0_DEC_ERROR_H264_REC_MC_ERRINT (0xDEC30016)
#define NVC9B0_DEC_ERROR_H264_REC_MC_IQT_ERRINT (0xDEC30017)
#define NVC9B0_DEC_ERROR_H264_DBF_ERRINT (0xDEC30018)
#define NVC9B0_DEC_ERROR_H264_DBF_IQT_ERRINT (0xDEC30019)
#define NVC9B0_DEC_ERROR_H264_DBF_MC_ERRINT (0xDEC3001A)
#define NVC9B0_DEC_ERROR_H264_DBF_MC_IQT_ERRINT (0xDEC3001B)
#define NVC9B0_DEC_ERROR_H264_DBF_REC_ERRINT (0xDEC3001C)
#define NVC9B0_DEC_ERROR_H264_DBF_REC_IQT_ERRINT (0xDEC3001D)
#define NVC9B0_DEC_ERROR_H264_DBF_REC_MC_ERRINT (0xDEC3001E)
#define NVC9B0_DEC_ERROR_H264_DBF_REC_MC_IQT_ERRINT (0xDEC3001F)
#define NVC9B0_DEC_ERROR_H264_PICTURE_INIT (0xDEC30100)
#define NVC9B0_DEC_ERROR_H264_STATEMACHINE_FAILURE (0xDEC30101)
#define NVC9B0_DEC_ERROR_H264_INVALID_CTXID_PIC (0xDEC30901)
#define NVC9B0_DEC_ERROR_H264_INVALID_CTXID_UCODE (0xDEC30902)
#define NVC9B0_DEC_ERROR_H264_INVALID_CTXID_FC (0xDEC30903)
#define NVC9B0_DEC_ERROR_H264_INVALID_CTXID_SLH (0xDEC30904)
#define NVC9B0_DEC_ERROR_H264_INVALID_UCODE_SIZE (0xDEC30905)
#define NVC9B0_DEC_ERROR_H264_INVALID_SLICE_COUNT (0xDEC30906)
#define NVC9B0_DEC_ERROR_MPEG4_APPTIMER_EXPIRED (0xDEC40001)
#define NVC9B0_DEC_ERROR_MPEG4_MVTIMER_EXPIRED (0xDEC40002)
#define NVC9B0_DEC_ERROR_MPEG4_INVALID_TOKEN (0xDEC40003)
#define NVC9B0_DEC_ERROR_MPEG4_SLICEDATA_MISSING (0xDEC40004)
#define NVC9B0_DEC_ERROR_MPEG4_HWERR_INTERRUPT (0xDEC40005)
#define NVC9B0_DEC_ERROR_MPEG4_DETECTED_VLD_FAILURE (0xDEC40006)
#define NVC9B0_DEC_ERROR_MPEG4_TIMEOUT_POLLING_FOR_DATA (0xDEC40007)
#define NVC9B0_DEC_ERROR_MPEG4_PDEC_PIC_END_UNALIGNED (0xDEC40008)
#define NVC9B0_DEC_ERROR_MPEG4_WDTIMER_EXPIRED (0xDEC40009)
#define NVC9B0_DEC_ERROR_MPEG4_ERRINTSTART (0xDEC40010)
#define NVC9B0_DEC_ERROR_MPEG4_IQT_ERRINT (0xDEC40011)
#define NVC9B0_DEC_ERROR_MPEG4_MC_ERRINT (0xDEC40012)
#define NVC9B0_DEC_ERROR_MPEG4_MC_IQT_ERRINT (0xDEC40013)
#define NVC9B0_DEC_ERROR_MPEG4_REC_ERRINT (0xDEC40014)
#define NVC9B0_DEC_ERROR_MPEG4_REC_IQT_ERRINT (0xDEC40015)
#define NVC9B0_DEC_ERROR_MPEG4_REC_MC_ERRINT (0xDEC40016)
#define NVC9B0_DEC_ERROR_MPEG4_REC_MC_IQT_ERRINT (0xDEC40017)
#define NVC9B0_DEC_ERROR_MPEG4_DBF_ERRINT (0xDEC40018)
#define NVC9B0_DEC_ERROR_MPEG4_DBF_IQT_ERRINT (0xDEC40019)
#define NVC9B0_DEC_ERROR_MPEG4_DBF_MC_ERRINT (0xDEC4001A)
#define NVC9B0_DEC_ERROR_MPEG4_DBF_MC_IQT_ERRINT (0xDEC4001B)
#define NVC9B0_DEC_ERROR_MPEG4_DBF_REC_ERRINT (0xDEC4001C)
#define NVC9B0_DEC_ERROR_MPEG4_DBF_REC_IQT_ERRINT (0xDEC4001D)
#define NVC9B0_DEC_ERROR_MPEG4_DBF_REC_MC_ERRINT (0xDEC4001E)
#define NVC9B0_DEC_ERROR_MPEG4_DBF_REC_MC_IQT_ERRINT (0xDEC4001F)
#define NVC9B0_DEC_ERROR_MPEG4_PICTURE_INIT (0xDEC40100)
#define NVC9B0_DEC_ERROR_MPEG4_STATEMACHINE_FAILURE (0xDEC40101)
#define NVC9B0_DEC_ERROR_MPEG4_INVALID_CTXID_PIC (0xDEC40901)
#define NVC9B0_DEC_ERROR_MPEG4_INVALID_CTXID_UCODE (0xDEC40902)
#define NVC9B0_DEC_ERROR_MPEG4_INVALID_CTXID_FC (0xDEC40903)
#define NVC9B0_DEC_ERROR_MPEG4_INVALID_CTXID_SLH (0xDEC40904)
#define NVC9B0_DEC_ERROR_MPEG4_INVALID_UCODE_SIZE (0xDEC40905)
#define NVC9B0_DEC_ERROR_MPEG4_INVALID_SLICE_COUNT (0xDEC40906)
#ifdef __cplusplus
}; /* extern "C" */
#endif
#endif // clc9b0_h

View file

@ -64,14 +64,17 @@ nvcmds = {getattr(nv_gpu, x):(x, getattr(nv_gpu, "struct_"+x+"_PARAMS", getattr(
x.startswith("NV") and x[6:].startswith("_CTRL_") and isinstance(getattr(nv_gpu, x), int)}
def get_classes():
hdrpy = (pathlib.Path(__file__).parent.parent.parent / "tinygrad/runtime/autogen/nv_570.py").read_text()
clss = re.search(r'NV01_ROOT.*?NV_SEMAPHORE_SURFACE = \(0x000000da\) # macro', hdrpy, re.DOTALL).group()
pattern = r'([0-9a-zA-Z_]*) = +\((0x[0-9a-fA-F]+)\)'
matches = re.findall(pattern, clss, re.MULTILINE)
return {int(num, base=16):name for name, num in matches}
res = {}
known_classes = {"NV01_DEVICE_0", "NV01_ROOT", "NV1_MEMORY_SYSTEM", "NV01_MEMORY_VIRTUAL", "NV1_MEMORY_USER", "NV50_MEMORY_VIRTUAL", "NV_FERMI_VASPACE_A",
"NV20_SUBDEVICE_0"}
for nm,val in nv_gpu.__dict__.items():
if not isinstance(val, int): continue
if 0x3000 < val < 0xffff: res[val] = nm
if nm in known_classes: res[val] = nm
return res
nvclasses = get_classes()
nvuvms = {getattr(nv_gpu, x):x for x in dir(nv_gpu) if x.startswith("UVM_") and nv_gpu.__dict__.get(x+"_PARAMS")}
nvqcmds = {int(getattr(nv_gpu, x)):x for x in dir(nv_gpu) if x[:7] in {"NVC6C0_", "NVC56F_", "NVC6B5_"} and isinstance(getattr(nv_gpu, x), int)}
nvqcmds = {int(getattr(nv_gpu, x)):x for x in dir(nv_gpu) if x[:7] in {"NVC9B0_", "NVC6C0_", "NVC56F_", "NVC6B5_"} and isinstance(getattr(nv_gpu, x), int)}
global_ioctl_id = 0
gpus_user_modes = []

File diff suppressed because it is too large Load diff

View file

@ -29,8 +29,9 @@ rawbuf_ptr = to_mv(cl_buf_desc_ptr, 0x100).cast('Q')[20] # offset 0xA0 is a raw
# create QCOM tensor with the externally managed buffer
x = Tensor.from_blob(rawbuf_ptr, (8, 8), dtype=dtypes.int, device='QCOM')
y = (x + 1).numpy()
print(y)
y = (x + 1).reshape(-1).tolist()
print(y[:10])
assert y == [i + 1 for i in range(64)]
# all calculations are done, safe to free the object
cl.clReleaseMemObject(cl_buf)
@ -49,7 +50,7 @@ for i in range(4):
cl_buf_desc_ptr = to_mv(ctypes.addressof(cl_buf), 8).cast('Q')[0]
rawbuf_ptr = to_mv(cl_buf_desc_ptr, 0x100).cast('Q')[20]
y = calc(x = Tensor.from_blob(rawbuf_ptr, (2, 2), dtype=dtypes.int, device='QCOM')).numpy()
y = calc(x = Tensor.from_blob(rawbuf_ptr, (2, 2), dtype=dtypes.int, device='QCOM')).tolist()
print(f'jit {i}\n', y)
# all calculations are done, safe to free the object
@ -80,8 +81,19 @@ rawbuf_ptr = to_mv(cl_buf_desc_ptr, 0x100).cast('Q')[20] # offset 0xA0 is a raw
# dtypes.imageh = cl.cl_image_format(cl.CL_RGBA, cl.CL_HALF_FLOAT)
# dtypes.imagef = cl.cl_image_format(cl.CL_RGBA, cl.CL_FLOAT)
x = Tensor.from_blob(rawbuf_ptr, (h*w*4,), dtype=dtypes.imagef((h,w)), device='QCOM')
y = (x + 1).numpy()
print(y)
y = (x + 1).tolist()
print(y[:10])
# all calculations are done, safe to free the object
cl.clReleaseMemObject(cl_img)
# from numpy
import numpy as np
YUV_SIZE = 50
a_np = (32*np.random.randn(YUV_SIZE).astype(np.float32) + 128).clip(0,255).astype(np.uint8)
a = Tensor.from_blob(a_np.ctypes.data, (YUV_SIZE,), dtype=dtypes.uint8, device='QCOM').realize()
print(a.numpy()[:10], a_np[:10])
assert np.all(a.numpy() == a_np)
assert np.all((a - 1).numpy() == a_np - 1)

View file

@ -1,128 +0,0 @@
import numpy as np
import ctypes
from tinygrad import Tensor, GlobalCounters, Context
from tinygrad.engine.realize import lower_schedule, CompiledRunner
from tinygrad.device import CPUProgram
from dataclasses import replace
from keystone import Ks, KS_ARCH_ARM64, KS_MODE_LITTLE_ENDIAN
# only the memory access, over 100 GB/s! (sometimes)
reduce_asm = """
movi v0.2d, #0000000000000000
mov w9, #0x30
mov w10, #0x20
mov x8, #-0x10
movi v1.2d, #0000000000000000
movk w9, #0x300, lsl #16
movi v2.2d, #0000000000000000
movk w10, #0x200, lsl #16
movi v3.2d, #0000000000000000
mov w11, #0x1000000
mov w12, #0x3ffff0
loop:
ldp q4, q5, [x1]
add x13, x1, x11
add x15, x1, x10
add x14, x1, x9
add x8, x8, #0x10
cmp x8, x12
ldp q6, q7, [x1, #0x20]
add x1, x1, #0x40
ldp q4, q5, [x13]
ldp q6, q7, [x13, #0x20]
ldp q4, q5, [x15, #-0x20]
ldp q6, q7, [x15]
ldp q4, q5, [x14, #-0x30]
ldp q6, q7, [x14, #-0x10]
b.lo loop
fadd v0.4s, v1.4s, v0.4s
fadd v0.4s, v2.4s, v0.4s
fadd v0.4s, v3.4s, v0.4s
dup v1.4s, v0.s[1]
dup v2.4s, v0.s[2]
fadd v1.4s, v0.4s, v1.4s
dup v0.4s, v0.s[3]
fadd v1.4s, v2.4s, v1.4s
fadd v0.4s, v0.4s, v1.4s
str s0, [x0]
ret
"""
ks = Ks(KS_ARCH_ARM64, KS_MODE_LITTLE_ENDIAN)
arm_bytecode, _ = ks.asm(reduce_asm)
arm_bytecode = bytes(arm_bytecode)
reduce_src = """
// data1 is 16M inputs
typedef float float4 __attribute__((aligned(32),vector_size(16)));
void reduce(float* restrict data0, float* restrict data1) {
float4 acc0 = {0.0f, 0.0f, 0.0f, 0.0f};
float4 acc1 = {0.0f, 0.0f, 0.0f, 0.0f};
float4 acc2 = {0.0f, 0.0f, 0.0f, 0.0f};
float4 acc3 = {0.0f, 0.0f, 0.0f, 0.0f};
float4 acc4 = {0.0f, 0.0f, 0.0f, 0.0f};
float4 acc5 = {0.0f, 0.0f, 0.0f, 0.0f};
float4 acc6 = {0.0f, 0.0f, 0.0f, 0.0f};
float4 acc7 = {0.0f, 0.0f, 0.0f, 0.0f};
float* data1_1 = data1+4194304;
float* data1_2 = data1+(4194304*2);
float* data1_3 = data1+(4194304*3);
for (int ridx0 = 0; ridx0 < 16777216/4; ridx0+=16) {
float4 val0 = *(float4*)((data1+(ridx0+0)));
float4 val1 = *(float4*)((data1+(ridx0+4)));
float4 val2 = *(float4*)((data1+(ridx0+8)));
float4 val3 = *(float4*)((data1+(ridx0+12)));
acc0 += val0;
acc1 += val1;
acc2 += val2;
acc3 += val3;
val0 = *(float4*)((data1_1+(ridx0+0)));
val1 = *(float4*)((data1_1+(ridx0+4)));
val2 = *(float4*)((data1_1+(ridx0+8)));
val3 = *(float4*)((data1_1+(ridx0+12)));
acc4 += val0;
acc5 += val1;
acc6 += val2;
acc7 += val3;
val0 = *(float4*)((data1_2+(ridx0+0)));
val1 = *(float4*)((data1_2+(ridx0+4)));
val2 = *(float4*)((data1_2+(ridx0+8)));
val3 = *(float4*)((data1_2+(ridx0+12)));
acc0 += val0;
acc1 += val1;
acc2 += val2;
acc3 += val3;
val0 = *(float4*)((data1_3+(ridx0+0)));
val1 = *(float4*)((data1_3+(ridx0+4)));
val2 = *(float4*)((data1_3+(ridx0+8)));
val3 = *(float4*)((data1_3+(ridx0+12)));
acc4 += val0;
acc5 += val1;
acc6 += val2;
acc7 += val3;
}
float4 out = acc0+acc1+acc2+acc3+acc4+acc5+acc6+acc7;
*(data0+0) = out[0]+out[1]+out[2]+out[3];
}
"""
if __name__ == "__main__":
a = Tensor(np_array:=(np.random.default_rng().random((4096, 4096), dtype=np.float32)-0.5)).realize()
with Context(SPLIT_REDUCEOP=0):
# TODO: make it easy to alter the OptOps for a ScheduleItem
GlobalCounters.reset()
out = a.sum()
sis = out.schedule()
for i,(_,ei) in enumerate(lower_schedule(sis)):
if i == 0:
# change the source code
prg_spec = ei.prg.p
prg_spec = replace(prg_spec, name="reduce", src=reduce_src)
prg = CompiledRunner(prg_spec)
# change the assembly
#prg._prg = CPUProgram(prg_spec.name, arm_bytecode)
print("buffer at:",hex(ctypes.addressof(ei.bufs[1]._buf)))
ei = replace(ei, prg=prg)
ei.run()
print(out.item())
np.testing.assert_allclose(out.item(), np_array.sum(), atol=1, rtol=1e-4)

View file

@ -1,5 +1,5 @@
use half::f16;
use num_traits::{float::FloatCore, PrimInt, Unsigned};
use num_traits::{float::FloatCore, PrimInt, Unsigned, clamp};
pub fn bits<T>(word: T, hi: usize, lo: usize) -> T where T: PrimInt + Unsigned {
assert!(hi >= lo);
@ -48,6 +48,7 @@ impl IEEEClass<u64> for f64 {
/// Input/output modifiers applied to a value by the VOP instruction decoder.
///
/// The decoder calls these on each source operand as
/// `x.negate(i, neg).absolute(i, abs)`, where `i` is the operand index and
/// `neg` / `abs` are 3-bit fields extracted from the instruction word.
pub trait VOPModifier<T> {
/// NOTE(review): presumably negates the value when the bit for operand
/// `pos` is set in `modifier` and returns it unchanged otherwise — the
/// implementation body is not visible here; confirm.
fn negate(&self, pos: usize, modifier: usize) -> T;
/// NOTE(review): presumably takes the absolute value when the bit for
/// operand `pos` is set in `modifier` and returns it unchanged otherwise —
/// confirm against the implementation.
fn absolute(&self, pos: usize, modifier: usize) -> T;
/// Output clamp. With `cm == false` the value is returned unchanged; with
/// `cm == true` the blanket implementation clamps it to the range
/// `[T::zero(), T::one()]`.
fn clmp(&self, cm: bool) -> T;
}
impl<T> VOPModifier<T> for T
where
@ -65,6 +66,11 @@ where
_ => *self,
}
}
/// Applies the instruction's clamp bit to this value.
///
/// * `cm == false` — the value is passed through untouched.
/// * `cm == true`  — the value is clamped to `[T::zero(), T::one()]`.
fn clmp(&self, cm: bool) -> T {
    // Clamp bit not set: nothing to do.
    if !cm {
        return *self;
    }
    let bounded = clamp(*self, T::zero(), T::one());
    // Any result that compares equal to zero is replaced by the canonical
    // `T::zero()`; for floating-point `T` this turns -0.0 into +0.0.
    match bounded == T::zero() {
        true => T::zero(),
        false => bounded,
    }
}
}
pub fn extract_mantissa(x: f64) -> f64 {

View file

@ -1024,7 +1024,7 @@ impl<'a> Thread<'a> {
let vdst = (instr & 0xff) as usize;
let abs = ((instr >> 8) & 0x7) as usize;
let opsel = ((instr >> 11) & 0xf) as usize;
let cm = (instr >> 15) & 0x1;
let cm = ((instr >> 15) & 0x1) != 0;
let s = |n: usize| ((instr >> n) & 0x1ff) as usize;
let src = (s(32), s(41), s(50));
@ -1032,7 +1032,9 @@ impl<'a> Thread<'a> {
let omod = (instr >> 59) & 0x3;
let neg = ((instr >> 61) & 0x7) as usize;
assert_eq!(omod, 0);
assert_eq!(cm, 0);
if op != 272 && cm {
return todo_instr!(op); // TODO: add VOP3 clamp for all ops
}
assert_eq!(opsel, 0);
match op {
@ -1266,7 +1268,7 @@ impl<'a> Thread<'a> {
}
let ret = match op {
257 | 259 | 299 | 260 | 261 | 264 | 272 | 392 | 426 | 430 | 531 | 537 | 540 | 551 | 567 | 796 => {
257 | 259 | 299 | 260 | 261 | 264 | 272 | 392 | 426 | 430 | 531 | 537 | 540 | 543 | 551 | 567 | 606 | 796 => {
let s0 = f32::from_bits(s0).negate(0, neg).absolute(0, abs);
let s1 = f32::from_bits(s1).negate(1, neg).absolute(1, abs);
let s2 = f32::from_bits(s2).negate(2, neg).absolute(2, abs);
@ -1275,12 +1277,26 @@ impl<'a> Thread<'a> {
260 => s0 - s1,
261 => s1 - s0,
264 => s0 * s1,
272 => f32::max(s0, s1),
272 => f32::max(s0, s1).clmp(cm),
299 => f32::mul_add(s0, s1, f32::from_bits(self.vec_reg[vdst])),
426 => s0.recip(),
430 => 1.0 / f32::sqrt(s0),
531 => f32::mul_add(s0, s1, s2),
537 => f32::min(f32::min(s0, s1), s2),
543 => {
if s0.is_nan() || s1.is_nan() || s2.is_nan() {
f32::min(f32::min(s0, s1), s2)
} else {
let max = f32::max(f32::max(s0, s1), s2);
if max == s0 {
f32::max(s1, s2)
} else if max == s1 {
f32::max(s0, s2)
} else {
f32::max(s0, s1)
}
}
},
540 => f32::max(f32::max(s0, s1), s2),
551 => s2 / s1,
567 => {
@ -1290,6 +1306,7 @@ impl<'a> Thread<'a> {
false => ret,
}
}
606 => f32::min(f32::max(s0, s1), s2),
796 => s0 * 2f32.powi(s1.to_bits() as i32),
// cnd_mask isn't a float only ALU but supports neg
257 => {

View file

@ -1,75 +0,0 @@
import pickle, sys
from dataclasses import replace
from tinygrad import Device, Context, Tensor, GlobalCounters
from tinygrad.device import Buffer
from tinygrad.helpers import getenv, BEAM
from tinygrad.engine.jit import TinyJit
from tinygrad.engine.realize import CompiledRunner, ExecItem, ScheduleItem, lower_schedule_item, get_program
from tinygrad.renderer import ProgramSpec
from tinygrad.codegen.opt.kernel import Kernel, Opt, OptOps
from tinygrad.codegen.opt.heuristic import hand_coded_optimizations
import numpy as np
# Re-targets a captured JIT object to `device` (default "DSP"), mutating it in place:
#   * the 4th element of every `expected_st_vars_dtype_device` tuple is replaced by `device`;
#   * every non-None buffer referenced by `jit_cache` gets a new Buffer on `device`;
#   * `jit_cache` is rebuilt with ExecItems pointing at the new buffers.
# Returns the same `captured` object.
def move_jit_captured_to_dev(captured, device="DSP"):
captured.expected_st_vars_dtype_device = [x[:3] + (device,) for x in captured.expected_st_vars_dtype_device]
# Old buffer -> new buffer. Doubles as a memo so each buffer is moved only once.
assign = {}
def move_buffer(b):
if b in assign: return assign[b]
if b._base is not None:
# `b` has a base buffer: move the base first (recursively), then recreate `b` on top of it
# with the same offset.
newbuf = Buffer(device, b.size, b.dtype, base=move_buffer(b._base), offset=b.offset)
else:
newbuf = Buffer(device, b.size, b.dtype)
# Copy contents across when the source buffer is allocated.
# NOTE(review): indentation is lost in this view, so it is unclear whether this copy
# applies only to the `else` (base-less) case -- confirm against the original file.
if b.is_allocated(): newbuf.ensure_allocated().copyin(b.as_buffer())
assign[b] = newbuf
return assign[b]
for item in captured.jit_cache:
for b in item.bufs:
if b is not None: move_buffer(b)
# Buffers missing from `assign` (e.g. None entries) are passed through unchanged.
captured.jit_cache = [ExecItem(item.prg, [assign.get(b,b) for b in item.bufs]) for item in captured.jit_cache]
return captured
# Script entry point: loads a pickled TinyJit from the path in sys.argv[1], moves its buffers
# to the "DSP" device, re-optimises and runs the selected kernels, and optionally replays the
# rebuilt JIT.
# Environment switches read below: KNUM, VALIDATE, NOOPT, RUN_JIT.
if __name__ == "__main__":
with Context(DEBUG=0):
# NOTE(review): pickle.load can execute arbitrary code; only run this on trusted files.
with open(sys.argv[1], "rb") as f:
fxn: TinyJit = pickle.load(f)
print(f"{f.tell()/1e6:.2f}M loaded")
print(type(fxn))
# Move all buffers to DSP device.
fxn.captured = move_jit_captured_to_dev(fxn.captured, "DSP")
new_jit = []
# Counter compared against the KNUM selection below.
knum = 1
for ei in fxn.captured.jit_cache:
# skip the copy and the first kernel
if isinstance(ei.prg, CompiledRunner) and all(x is not None for x in ei.bufs):
# KNUM=0 (the default) selects every kernel; otherwise only the kernel whose counter matches.
if knum == (pknum:=getenv("KNUM", 0)) or pknum == 0:
p: ProgramSpec = ei.prg.p
k = Kernel(p.ast, Device["DSP"].renderer)
if getenv("VALIDATE"):
# Reference run of the same AST under NOOPT=1; its output is kept in `correct`.
with Context(NOOPT=1):
lower_schedule_item(ScheduleItem(p.ast, ei.bufs)).run()
correct = ei.bufs[0].numpy()
# Zero the output buffer so the run below cannot pass on stale reference data.
ei.bufs[0].copyin(memoryview(bytearray(b'\x00'*ei.bufs[0].nbytes)))
# Undo the kernel count added by the reference run.
GlobalCounters.kernel_count -= 1
if not getenv("NOOPT"): k.apply_opts(hand_coded_optimizations(k))
p2 = get_program(k.ast, k.opts, k.applied_opts)
new_ei = replace(ei, prg=CompiledRunner(p2))
new_ei.run()
new_jit.append(new_ei)
test = ei.bufs[0].numpy()
if getenv("VALIDATE"):
# NOTE(review): numpy is already imported at module level; this local import looks redundant.
import numpy as np
np.testing.assert_allclose(correct, test, rtol=1e-3, atol=1e-3)
knum += 1
if getenv("RUN_JIT", 0):
# Replace the captured kernels with the rebuilt ones, then call the JIT on an all-zero
# (1, 3, 224, 224) float32 input.
fxn.captured.free_intermediates()
fxn.captured.jit_cache = new_jit
fxn(input=Tensor(np.zeros((1, 3, 224, 224), dtype=np.float32), device="DSP"))

View file

@ -1,114 +0,0 @@
# code from https://x.com/awnihannun/status/1832511021602500796
from huggingface_hub import snapshot_download
import mlx.core as mx
import mlx.nn as nn
import time
# Residual block: two 3x3 convolutions, each followed by batch norm, with the (optionally
# down-sampled) input added back before the final ReLU.
class Block(nn.Module):
# in_dims: input channel count; dims: output channel count; stride: stride of the first
# convolution (and of the 1x1 down-sample path when it is not 1).
def __init__(self, in_dims, dims, stride=1):
super().__init__()
# Only conv1 applies `stride`; conv2 always uses stride 1.
self.conv1 = nn.Conv2d(
in_dims, dims, kernel_size=3, stride=stride, padding=1, bias=False
)
self.bn1 = nn.BatchNorm(dims)
self.conv2 = nn.Conv2d(
dims, dims, kernel_size=3, stride=1, padding=1, bias=False
)
self.bn2 = nn.BatchNorm(dims)
# Layers applied to the input before the residual add. Stays empty when stride == 1.
self.downsample = []
if stride != 1:
self.downsample = [
nn.Conv2d(in_dims, dims, kernel_size=1, stride=stride, bias=False),
nn.BatchNorm(dims)
]
# Forward pass: conv1 -> bn1 -> relu -> conv2 -> bn2, add the shortcut, relu.
def __call__(self, x):
out = nn.relu(self.bn1(self.conv1(x)))
out = self.bn2(self.conv2(out))
# Run the input through the down-sample layers, if any.
for l in self.downsample:
x = l(x)
# NOTE(review): indentation is lost in this view; the residual add presumably sits after
# the loop rather than inside it -- confirm.
out += x
out = nn.relu(out)
return out
# ResNet-style classifier: 7x7 stem convolution, max-pool, four stages of `block`s with
# 64/128/256/512 channels, a mean over axes 1 and 2, and a final linear layer.
class ResNet(nn.Module):
# block: callable taking (in_dims, dims, stride), as `Block` does; num_blocks: four
# per-stage block counts; num_classes: output size of the final linear layer.
def __init__(self, block, num_blocks, num_classes=10):
super().__init__()
self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, bias=False)
self.bn1 = nn.BatchNorm(64)
self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
# Each layerN is a plain Python list of blocks; stages 2-4 are built with stride 2.
self.layer1 = self._make_layer(block, 64, 64, num_blocks[0], stride=1)
self.layer2 = self._make_layer(block, 64, 128, num_blocks[1], stride=2)
self.layer3 = self._make_layer(block, 128, 256, num_blocks[2], stride=2)
self.layer4 = self._make_layer(block, 256, 512, num_blocks[3], stride=2)
self.fc = nn.Linear(512, num_classes)
# Builds one stage: the first block gets `stride`, the remaining num_blocks-1 get stride 1.
def _make_layer(self, block, in_dims, dims, num_blocks, stride):
strides = [stride] + [1] * (num_blocks - 1)
layers = []
for stride in strides:
layers.append(block(in_dims, dims, stride))
# NOTE(review): presumably inside the loop, so later blocks take `dims` input
# channels -- indentation is lost in this view; confirm.
in_dims = dims
return layers
# Forward pass: stem, max-pool, every block of the four stages in order, mean, classifier.
def __call__(self, x):
x = nn.relu(self.bn1(self.conv1(x)))
x = self.maxpool(x)
# The four stage lists are concatenated and applied in order.
for l in self.layer1 + self.layer2 + self.layer3 + self.layer4:
x = l(x)
# Average over axes 1 and 2.
# NOTE(review): presumably the spatial axes of an (N, H, W, C) input -- confirm.
x = mx.mean(x, axis=[1, 2])
x = self.fc(x)
return x
def load():
    """Build the 1000-class [2, 2, 2, 2] ResNet and load its weights.

    The weights file ("model.safetensors") is fetched with
    huggingface_hub.snapshot_download from the "awni/resnet18-mlx" repo. The
    model is switched to eval mode and passed to mx.eval before being returned.
    """
    net = ResNet(Block, [2, 2, 2, 2], num_classes=1000)
    weights_name = "model.safetensors"
    # Restrict the snapshot to the single weights file.
    snapshot_dir = snapshot_download(repo_id="awni/resnet18-mlx", allow_patterns=[weights_name])
    net.load_weights(snapshot_dir + "/" + weights_name)
    net.eval()
    mx.eval(net)
    return net
# Benchmark entry point: loads the model, wraps its forward pass with mx.compile, and prints a
# table of throughput (images/second) and per-image latency (ms) for several batch sizes.
if __name__ == "__main__":
resnet18 = load()
@mx.compile
def forward(im):
return resnet18(im)
batch_sizes = [1, 2, 4, 8, 16, 32, 64]
#its = 200
#batch_sizes = [64]
# Number of timed forward passes per batch size.
its = 20
# NOTE(review): the two f-strings below contain no placeholders; the `f` prefix is unnecessary.
print(f"Batch Size | Images-per-second | Milliseconds-per-image")
print(f"---- | ---- | ---- ")
for N in batch_sizes:
# Random uniform input of shape (N, 288, 288, 3).
image = mx.random.uniform(shape=(N, 288, 288, 3))
# Warmup
for _ in range(5):
output = forward(image)
mx.eval(output)
tic = time.time()
# Timed section: `its` forward passes issued with async_eval.
# NOTE(review): the blocking mx.eval presumably runs once after the loop so the timer
# includes completion of the last pass -- indentation is lost in this view; confirm.
for _ in range(its):
output = forward(image)
mx.async_eval(output)
mx.eval(output)
toc = time.time()
# Throughput = images processed / elapsed seconds; latency is its reciprocal in milliseconds.
ims_per_sec = N * its / (toc - tic)
ms_per_im = 1e3 / ims_per_sec
print(f"{N} | {ims_per_sec:.3f} | {ms_per_im:.3f}")

Some files were not shown because too many files have changed in this diff Show more