specify renderer in DEV, <dev>_<ren>=1 is deprecated (#15551)

This commit is contained in:
Christopher Milan 2026-03-31 15:35:14 -07:00 committed by GitHub
commit acf239e4d2
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
20 changed files with 204 additions and 173 deletions

View file

@ -101,10 +101,10 @@ jobs:
run: DEV=METAL python3.11 test/opt/test_tensor_cores.py
- name: Test AMX tensor cores
run: |
DEBUG=2 DEV=CPU CPU_LLVM=0 AMX=1 python3.11 test/opt/test_tensor_cores.py
DEBUG=2 DEV=CPU CPU_LLVM=1 AMX=1 python3.11 test/opt/test_tensor_cores.py
DEBUG=2 DEV=CPU CPU_LLVM=0 AMX=1 python3.11 test/opt/test_gen_float4.py TestFloat4.test_float4_multidim_amx TestFloat4.test_float4_multidim_unaligned_load_amx
DEBUG=2 DEV=CPU CPU_LLVM=1 AMX=1 python3.11 test/opt/test_gen_float4.py TestFloat4.test_float4_multidim_amx TestFloat4.test_float4_multidim_unaligned_load_amx
DEBUG=2 DEV=CPU AMX=1 python3.11 test/opt/test_tensor_cores.py
DEBUG=2 DEV=CPU:LLVM AMX=1 python3.11 test/opt/test_tensor_cores.py
DEBUG=2 DEV=CPU AMX=1 python3.11 test/opt/test_gen_float4.py TestFloat4.test_float4_multidim_amx TestFloat4.test_float4_multidim_unaligned_load_amx
DEBUG=2 DEV=CPU:LLVM AMX=1 python3.11 test/opt/test_gen_float4.py TestFloat4.test_float4_multidim_amx TestFloat4.test_float4_multidim_unaligned_load_amx
- name: Run Tensor Core GEMM (float)
run: DEBUG=2 SHOULD_USE_TC=1 python3.11 extra/gemm/simple_matmul.py
- name: Run Tensor Core GEMM (half)
@ -193,9 +193,9 @@ jobs:
#- name: UsbGPU openpilot test
# run: sudo -E PYTHONPATH=. GMMU=0 DEV=AMD AMD_IFACE=USB GRAPH_ONE_KERNEL=1 python3.11 examples/openpilot/compile3.py https://github.com/commaai/openpilot/raw/9118973ed03c1ae1d40cf69a29507ec2cc78efd7/selfdrive/modeld/models/supercombo.onnx
- name: UsbGPU (USB4/TB) boot time
run: PYTHONPATH=. DEBUG=3 DEV=NV NV_IFACE=PCI NV_NAK=1 time python3.11 test/test_tiny.py TestTiny.test_plus
run: PYTHONPATH=. DEBUG=3 DEV=NV:NAK NV_IFACE=PCI time python3.11 test/test_tiny.py TestTiny.test_plus
- name: UsbGPU (USB4/TB) tiny tests
run: PYTHONPATH=. DEV=NV NV_IFACE=PCI NV_NAK=1 python3.11 test/test_tiny.py
run: PYTHONPATH=. DEV=NV:NAK NV_IFACE=PCI python3.11 test/test_tiny.py
testnvidiabenchmark:
name: tinybox green Benchmark
@ -237,7 +237,7 @@ jobs:
- name: Test tensor cores
run: |
DEV=NV ALLOW_TF32=1 python3 test/opt/test_tensor_cores.py
DEV=NV NV_PTX=1 ALLOW_TF32=1 python3 test/opt/test_tensor_cores.py
DEV=NV:PTX ALLOW_TF32=1 python3 test/opt/test_tensor_cores.py
- name: Run Tensor Core GEMM (CUDA)
run: |
DEV=CUDA SHOULD_USE_TC=1 HALF=1 DEBUG=2 python3 extra/gemm/simple_matmul.py
@ -245,7 +245,7 @@ jobs:
DEV=CUDA SHOULD_USE_TC=1 ALLOW_TF32=1 DEBUG=2 ATOL=2e-2 python3 extra/gemm/simple_matmul.py
DEV=CUDA SHOULD_USE_TC=1 FP8E4M3=1 DEBUG=2 python3 extra/gemm/simple_matmul.py
- name: Run Tensor Core GEMM (PTX)
run: DEV=NV NV_PTX=1 SHOULD_USE_TC=1 HALF=1 DEBUG=2 python3 extra/gemm/simple_matmul.py
run: DEV=NV:PTX SHOULD_USE_TC=1 HALF=1 DEBUG=2 python3 extra/gemm/simple_matmul.py
- name: Run Tensor Core GEMM (NV)
run: DEV=NV SHOULD_USE_TC=1 HALF=1 DEBUG=2 python3 extra/gemm/simple_matmul.py
- name: Test DEV=NV
@ -328,7 +328,7 @@ jobs:
# run: DEV=NV M_START=12 M_STOP=20 M_STEP=1 N_START=6 N_STOP=10 N_STEP=1 K_START=28 K_STOP=36 K_STEP=1 HALF=1 TC_OPT=2 python3 ./extra/gemm/fuzz_matmul.py
# TODO: too slow
# - name: Fuzz Padded Tensor Core GEMM (PTX)
# run: DEV=NV NV_PTX=1 M_START=12 M_STOP=20 M_STEP=1 N_START=6 N_STOP=10 N_STEP=1 K_START=28 K_STOP=36 K_STEP=1 HALF=1 TC_OPT=2 python3 ./extra/gemm/fuzz_matmul.py
# run: DEV=NV:PTX M_START=12 M_STOP=20 M_STEP=1 N_START=6 N_STOP=10 N_STEP=1 K_START=28 K_STOP=36 K_STEP=1 HALF=1 TC_OPT=2 python3 ./extra/gemm/fuzz_matmul.py
- name: HEVC Decode Benchmark
run: VALIDATE=1 MAX_FRAMES=100 ASSERT_FPS=1400 JITBEAM=1 DEV=NV PYTHONPATH=. python3 extra/hevc/decode.py
- name: Train MNIST
@ -410,11 +410,11 @@ jobs:
# LD_PRELOAD="/opt/rocm/lib/libhsa-runtime64.so" HSA=1 BIG=2 TORCHCUDA=1 python3 test/speed/external_test_speed_v_torch.py
- name: Test speed vs theoretical
run: DEV=AMD IGNORE_BEAM_CACHE=1 CCACHE=0 BEAM_DEBUG=1 DEBUG=1 python -m pytest -rA test/external/speed_v_theoretical.py --durations=20
- name: Test tensor cores AMD_LLVM=0
run: DEV=AMD AMD_LLVM=0 python3 test/opt/test_tensor_cores.py
- name: Test tensor cores (no LLVM)
run: DEV=AMD python3 test/opt/test_tensor_cores.py
# TODO: this is flaky
# - name: Test tensor cores AMD_LLVM=1
# run: DEV=AMD AMD_LLVM=1 python3 test/opt/test_tensor_cores.py
# - name: Test tensor cores AMD:LLVM
# run: DEV=AMD:LLVM python3 test/opt/test_tensor_cores.py
- name: Run Tensor Core GEMM (AMD)
run: |
DEV=AMD SHOULD_USE_TC=1 BFLOAT16=1 DEBUG=2 python3 extra/gemm/simple_matmul.py
@ -591,7 +591,7 @@ jobs:
- name: openpilot compile3 0.11.0 driving_vision
run: BENCHMARK_LOG=openpilot_0_11_0_vision PYTHONPATH="." ASSERT_MIN_STEP_TIME=17 DEV=QCOM FLOAT16=1 IMAGE=1 NOLOCALS=1 taskset -c 4-7 python3 examples/openpilot/compile3.py https://github.com/commaai/openpilot/raw/v0.11.0/selfdrive/modeld/models/driving_vision.onnx
- name: IR3 openpilot compile3 0.11.0 driving_vision
run: BENCHMARK_LOG=ir3_openpilot_0_11_0_vision PYTHONPATH="." ASSERT_MIN_STEP_TIME=17 DEV=QCOM QCOM_IR3=1 FLOAT16=1 IMAGE=1 NOLOCALS=1 taskset -c 4-7 python3 examples/openpilot/compile3.py https://github.com/commaai/openpilot/raw/v0.11.0/selfdrive/modeld/models/driving_vision.onnx
run: BENCHMARK_LOG=ir3_openpilot_0_11_0_vision PYTHONPATH="." ASSERT_MIN_STEP_TIME=17 DEV=QCOM:IR3 FLOAT16=1 IMAGE=1 NOLOCALS=1 taskset -c 4-7 python3 examples/openpilot/compile3.py https://github.com/commaai/openpilot/raw/v0.11.0/selfdrive/modeld/models/driving_vision.onnx
- name: openpilot compile3 0.11.0 driving_policy
run: BENCHMARK_LOG=openpilot_0_11_0_policy PYTHONPATH="." ASSERT_MIN_STEP_TIME=3 DEV=QCOM FLOAT16=1 IMAGE=1 NOLOCALS=1 taskset -c 4-7 python3 examples/openpilot/compile3.py https://github.com/commaai/openpilot/raw/v0.11.0/selfdrive/modeld/models/driving_policy.onnx
- name: openpilot compile3 0.11.0 dmonitoring
@ -609,7 +609,7 @@ jobs:
# generate quantized weights
ln -s /data/home/tiny/tinygrad/extra/datasets/imagenet extra/datasets/imagenet
ln -s /data/home/tiny/tinygrad/testsig-*.so .
PYTHONPATH=. CC=clang-19 DEV=CPU CPU_LLVM=0 QUANT=1 CNT=0 python3 examples/test_onnx_imagenet.py https://github.com/xamcat/mobcat-samples/raw/refs/heads/master/onnx_runtime/InferencingSample/InferencingSample/mobilenetv2-7.onnx /tmp/model.quant.onnx
PYTHONPATH=. CC=clang-19 DEV=CPU QUANT=1 CNT=0 python3 examples/test_onnx_imagenet.py https://github.com/xamcat/mobcat-samples/raw/refs/heads/master/onnx_runtime/InferencingSample/InferencingSample/mobilenetv2-7.onnx /tmp/model.quant.onnx
# benchmark on DSP with NOOPT=1, the devectorizer has issues
PYTHONPATH=. CC=clang-19 DEV=DSP NOOPT=1 CNT=2 DEBUG=2 python3 examples/test_onnx_imagenet.py /tmp/model.quant.onnx
- name: Run process replay tests
@ -632,7 +632,7 @@ jobs:
echo "CACHEDB=/tmp/staging.db" >> $GITHUB_ENV
rm -f /tmp/staging.db /tmp/staging.db-shm /tmp/staging.db-wal
- name: openpilot compile3 0.10.1 driving_vision
run: BENCHMARK_LOG=usbgpu_openpilot_0_10_1_vision PYTHONPATH="." GMMU=0 DEV=AMD AMD_LLVM=1 AMD_IFACE=USB ASSERT_MIN_STEP_TIME=50 python3 examples/openpilot/compile3.py https://github.com/commaai/openpilot/raw/720392c9a5b986981fdbed1bb8c47a6c5573a50e/selfdrive/modeld/models/driving_vision.onnx
run: BENCHMARK_LOG=usbgpu_openpilot_0_10_1_vision PYTHONPATH="." GMMU=0 DEV=AMD:LLVM AMD_IFACE=USB ASSERT_MIN_STEP_TIME=50 python3 examples/openpilot/compile3.py https://github.com/commaai/openpilot/raw/720392c9a5b986981fdbed1bb8c47a6c5573a50e/selfdrive/modeld/models/driving_vision.onnx
- name: openpilot load_pickle 0.10.1 driving_vision
run: BENCHMARK_LOG=usbgpu_openpilot_0_10_1_vision_load_pickle PYTHONPATH="." GMMU=0 DEV=AMD AMD_IFACE=USB ASSERT_MIN_LOAD_TIME=15 python3 examples/openpilot/load_pickle.py
@ -677,8 +677,8 @@ jobs:
# Fails on 9070
# - name: Test tensor cores
# run: |
# DEV=AMD AMD_LLVM=0 python3 test/test_linearizer.py test/opt/test_tensor_cores.py
# DEV=AMD AMD_LLVM=1 python3 test/test_linearizer.py test/opt/test_tensor_cores.py
# DEV=AMD python3 test/test_linearizer.py test/opt/test_tensor_cores.py
# DEV=AMD:LLVM python3 test/test_linearizer.py test/opt/test_tensor_cores.py
# DEV=AMD SHOULD_USE_TC=1 BFLOAT16=1 DEBUG=2 python3 extra/gemm/simple_matmul.py
- name: Run Tensor Core GEMM (AMD)
run: DEV=AMD SHOULD_USE_TC=1 HALF=1 DEBUG=2 ATOL=2e-2 python3 extra/gemm/simple_matmul.py

View file

@ -29,9 +29,9 @@ jobs:
deps: testing_unit
llvm: 'true'
- name: Speed Test
run: DEV=CPU CPU_LLVM=1 THREADS=0 python3 test/speed/external_test_speed_v_torch.py
run: DEV=CPU:LLVM THREADS=0 python3 test/speed/external_test_speed_v_torch.py
- name: Speed Test (BEAM=2)
run: BEAM=2 DEV=CPU CPU_LLVM=1 THREADS=0 python3 test/speed/external_test_speed_v_torch.py
run: BEAM=2 DEV=CPU:LLVM THREADS=0 python3 test/speed/external_test_speed_v_torch.py
docs:
name: Docs
@ -83,7 +83,7 @@ jobs:
run: DEBUG=100 python3 -c "from tinygrad import Tensor; N = 1024; a, b = Tensor.rand(N, N), Tensor.rand(N, N); c = (a.reshape(N, 1, N) * b.T.reshape(1, N, N)).sum(axis=2); print((c.numpy() - (a.numpy() @ b.numpy())).mean())"
- name: Compile EfficientNet to C and test it
run: |
DEV=CPU CPU_LLVM=0 python examples/compile_efficientnet.py > recognize.c
DEV=CPU python examples/compile_efficientnet.py > recognize.c
clang -O2 recognize.c -lm -o recognize
cat test/models/efficientnet/Chicken.jpg | ./recognize | grep cock
@ -114,11 +114,11 @@ jobs:
- name: Test one op in torch tests
run: DEBUG=2 python3 extra/torch_backend/torch_tests.py TestTinyBackendPRIVATEUSE1.test_unary_log_tiny_float32
- name: Test Ops with TINY_BACKEND
run: DEV=CPU CPU_LLVM=1 LLVMOPT=0 TINY_BACKEND=1 python3 -m pytest -n auto test/backend/test_ops.py --durations=20
run: DEV=CPU:LLVM LLVMOPT=0 TINY_BACKEND=1 python3 -m pytest -n auto test/backend/test_ops.py --durations=20
- name: Test in-place operations on views
run: TORCH_DEBUG=1 python3 extra/torch_backend/test_inplace.py
- name: Test multi-gpu
run: DEV=CPU CPU_LLVM=1 GPUS=4 TORCH_DEBUG=1 python3 extra/torch_backend/test_multigpu.py
run: DEV=CPU:LLVM GPUS=4 TORCH_DEBUG=1 python3 extra/torch_backend/test_multigpu.py
- name: Test kernel fusion
run: python3 extra/torch_backend/test_kernel_fusion.py
@ -423,7 +423,7 @@ jobs:
- name: Test openpilot CL compile fp32 (test correctness)
run: DEV=CL IMAGE=1 SELFTEST=1 python examples/openpilot/compile3.py https://github.com/haraschax/filedump/raw/refs/heads/master/driving_vision_fp32.onnx
- name: Test openpilot LLVM compile fp16
run: IMAGE=1 FLOAT16=1 DEV=CPU CPU_LLVM=1 python examples/openpilot/compile3.py https://gitlab.com/commaai/openpilot-lfs.git/gitlab-lfs/objects/cf6376aa9a090f0da26c280ef69eabf9bbdd51d1faac9ed392919c3db69be916
run: IMAGE=1 FLOAT16=1 DEV=CPU:LLVM python examples/openpilot/compile3.py https://gitlab.com/commaai/openpilot-lfs.git/gitlab-lfs/objects/cf6376aa9a090f0da26c280ef69eabf9bbdd51d1faac9ed392919c3db69be916
- name: Run process replay tests
uses: ./.github/actions/process-replay
@ -445,15 +445,15 @@ jobs:
python-version: '3.12'
llvm: 'true'
- name: Test ONNX (CPU)
run: DEV=CPU CPU_LLVM=0 python -m pytest -n=auto test/external/external_test_onnx_backend.py --durations=20
run: DEV=CPU python -m pytest -n=auto test/external/external_test_onnx_backend.py --durations=20
- name: Test ONNX (LLVM)
run: DEV=CPU CPU_LLVM=1 python -m pytest -n=auto test/external/external_test_onnx_backend.py --durations=20
run: DEV=CPU:LLVM python -m pytest -n=auto test/external/external_test_onnx_backend.py --durations=20
- name: Test ONNX Runner (CPU)
run: DEV=CPU CPU_LLVM=0 python3 test/external/external_test_onnx_runner.py
run: DEV=CPU python3 test/external/external_test_onnx_runner.py
- name: Test Additional ONNX Ops (CPU)
run: DEV=CPU CPU_LLVM=0 python3 test/external/external_test_onnx_ops.py
run: DEV=CPU python3 test/external/external_test_onnx_ops.py
- name: Test Quantize ONNX
run: DEV=CPU CPU_LLVM=0 python3 test/backend/test_quantize_onnx.py
run: DEV=CPU python3 test/backend/test_quantize_onnx.py
- name: Run process replay tests
uses: ./.github/actions/process-replay
@ -529,11 +529,11 @@ jobs:
opencl: 'true'
llvm: 'true'
- name: Test models (llvm)
run: DEV=CPU CPU_LLVM=1 python -m pytest -n=auto test/models --durations=20
run: DEV=CPU:LLVM python -m pytest -n=auto test/models --durations=20
- name: Test models (opencl)
run: DEV=CL python -m pytest -n=auto test/models --durations=20
- name: Test models (cpu)
run: DEV=CPU CPU_LLVM=0 python -m pytest -n=auto test/models --durations=20
run: DEV=CPU python -m pytest -n=auto test/models --durations=20
- name: Run process replay tests
uses: ./.github/actions/process-replay
@ -572,11 +572,11 @@ jobs:
pydeps: "pillow"
llvm: "true"
- name: Test LLVM=1 DEVECTORIZE=0
run: DEV=CPU CPU_LLVM=1 DEVECTORIZE=0 python3 -m pytest -n auto test/test_tiny.py test/backend/test_ops.py
run: DEV=CPU:LLVM DEVECTORIZE=0 python3 -m pytest -n auto test/test_tiny.py test/backend/test_ops.py
- name: Test LLVM=1 DEVECTORIZE=0 for model
run: DEV=CPU CPU_LLVM=1 DEVECTORIZE=0 python3 test/models/test_efficientnet.py
run: DEV=CPU:LLVM DEVECTORIZE=0 python3 test/models/test_efficientnet.py
- name: Test DEV=CPU DEVECTORIZE=0
run: DEV=CPU CPU_LLVM=0 DEVECTORIZE=0 python3 -m pytest -n auto test/test_tiny.py test/backend/test_ops.py
run: DEV=CPU DEVECTORIZE=0 python3 -m pytest -n auto test/test_tiny.py test/backend/test_ops.py
testdsp:
name: Linux (DSP)
@ -667,9 +667,9 @@ jobs:
- name: Install rocprof-trace-decoder
run: sudo PYTHONPATH="." ./extra/sqtt/install_rocprof_decoder.py
- name: Run AMD renderer tests
run: AMD_LLVM=0 python -m pytest -n=auto test/amd/ --durations 20
- name: Run AMD renderer tests (AMD_LLVM=1)
run: AMD_LLVM=1 python -m pytest -n=auto test/amd/ --durations 20
run: python -m pytest -n=auto test/amd/ --durations 20
- name: Run AMD renderer tests (AMD:LLVM)
run: DEV=AMD:LLVM python -m pytest -n=auto test/amd/ --durations 20
- name: Run SQTT profiling tests
run: PROFILE=1 SQTT=1 python3 -m pytest -n=auto test/amd/test_sqtt_profiler.py
- name: Run AMD emulated tests on NULL backend
@ -681,7 +681,7 @@ jobs:
- name: Run ASM matmul on MOCKGPU
run: PYTHONPATH="." DEV=AMD MOCKGPU=1 N=256 python3 extra/gemm/amd_asm_matmul.py
- name: Run LLVM test
run: AMD_LLVM=1 python test/device/test_amd_llvm.py
run: DEV=AMD:LLVM python test/device/test_amd_llvm.py
testmockam:
name: Linux (am)
@ -726,11 +726,10 @@ jobs:
runs-on: ubuntu-22.04
timeout-minutes: 15
env:
DEV: AMD
DEV: AMD${{ matrix.backend == 'amdllvm' && ':LLVM' || '' }}
MOCKGPU: 1
MOCKGPU_ARCH: ${{ matrix.arch }}
SKIP_SLOW_TEST: 1
AMD_LLVM: ${{ matrix.backend == 'amdllvm' && '1' || matrix.backend != 'amdllvm' && '0' }}
steps:
- name: Checkout Code
uses: actions/checkout@v6
@ -777,7 +776,7 @@ jobs:
cuda: 'true'
ocelot: 'true'
- name: Set env
run: printf "${{ matrix.backend == 'ptx' && 'DEV=CUDA\nCUDA_PTX=1' || matrix.backend == 'nv' && 'DEV=NV\nSKIP_SLOW_TEST=1' }}" >> $GITHUB_ENV
run: printf "${{ matrix.backend == 'ptx' && 'DEV=CUDA:PTX' || matrix.backend == 'nv' && 'DEV=NV\nSKIP_SLOW_TEST=1' }}" >> $GITHUB_ENV
- name: Check Device.DEFAULT and print some source
run: |
python3 -c "from tinygrad import Device; assert Device.DEFAULT in ['CUDA','NV'], Device.DEFAULT"
@ -811,7 +810,7 @@ jobs:
llvm: ${{ matrix.backend == 'llvm' || matrix.backend == 'lvp' }}
mesa: ${{ matrix.backend == 'lvp' && 'true' }}
- name: Set env
run: printf "${{ matrix.backend == 'llvm' && 'DEV=CPU\nCPU_LLVM=1' || matrix.backend == 'cpu' && 'DEV=CPU\nCPU_LLVM=0\nCPU_COUNT=2' || matrix.backend == 'opencl' && 'DEV=CL' || matrix.backend == 'lvp' && 'DEV=CPU\nCPU_LVP=1' }}" >> $GITHUB_ENV
run: printf "${{ matrix.backend == 'llvm' && 'DEV=CPU:LLVM' || matrix.backend == 'cpu' && 'DEV=CPU\nCPU_COUNT=2' || matrix.backend == 'opencl' && 'DEV=CL' || matrix.backend == 'lvp' && 'DEV=CPU:LVP' }}" >> $GITHUB_ENV
- name: Check Device.DEFAULT and print some source
run: |
python3 -c "from tinygrad import Device; assert Device.DEFAULT in ['CPU','CL'], Device.DEFAULT"
@ -864,23 +863,20 @@ jobs:
env:
MOCKGPU: 1
DEV: AMD
AMD_LLVM: 0
FORWARD_ONLY: 1
run: |
python3 -m pytest -n=auto test/device/test_hcq.py test/test_tiny.py --durations=20
- name: Run pytest (amd with llvm backend)
env:
MOCKGPU: 1
DEV: AMD
AMD_LLVM: 1
DEV: "AMD:LLVM"
FORWARD_ONLY: 1
run: |
python -m pytest -n=auto test/device/test_hcq.py test/test_tiny.py test/device/test_amd_llvm.py --durations=20
- name: Run pytest (ptx)
env:
MOCKGPU: 1
NV_PTX: 1
DEV: NV
DEV: "NV:PTX"
FORWARD_ONLY: 1
# TODO: failing due to library loading error
CAPTURE_PROCESS_REPLAY: 0
@ -945,7 +941,7 @@ jobs:
llvm: ${{ matrix.backend == 'llvm' || matrix.backend == 'lvp' }}
mesa: ${{ matrix.backend == 'lvp' && 'true' }}
- name: Set env
run: printf "${{ matrix.backend == 'llvm' && 'DEV=CPU\nCPU_LLVM=1' || matrix.backend == 'cpu' && 'DEV=CPU\nCPU_LLVM=0\nCPU_COUNT=2' || matrix.backend == 'metal' && 'DEV=METAL' || matrix.backend == 'lvp' && 'DEV=CPU\nCPU_LVP=1' }}" >> $GITHUB_ENV
run: printf "${{ matrix.backend == 'llvm' && 'DEV=CPU:LLVM' || matrix.backend == 'cpu' && 'DEV=CPU\nCPU_COUNT=2' || matrix.backend == 'metal' && 'DEV=METAL' || matrix.backend == 'lvp' && 'DEV=CPU:LVP' }}" >> $GITHUB_ENV
- name: Check Device.DEFAULT and print some source
run: |
python -c "from tinygrad import Device; assert Device.DEFAULT == {'LLVM':'CPU','LVP':'CPU'}.get(x:='${{ matrix.backend }}'.upper(), x), Device.DEFAULT"
@ -980,7 +976,7 @@ jobs:
pydeps: ${{ matrix.backend == 'webgpu' && 'dawn-python' || '' }}
- name: Set env
shell: bash
run: printf "${{ matrix.backend == 'llvm' && 'DEV=CPU\nCPU_LLVM=1' || matrix.backend == 'cpu' && 'DEV=CPU\nCPU_LLVM=0\nCPU_COUNT=2' || matrix.backend == 'webgpu' && 'DEV=WEBGPU'}}" >> $GITHUB_ENV
run: printf "${{ matrix.backend == 'llvm' && 'DEV=CPU:LLVM' || matrix.backend == 'cpu' && 'DEV=CPU\nCPU_COUNT=2' || matrix.backend == 'webgpu' && 'DEV=WEBGPU'}}" >> $GITHUB_ENV
- name: Run unit tests
if: matrix.backend=='llvm'
# test_newton_schulz hits RecursionError
@ -988,7 +984,7 @@ jobs:
- name: Run NULL backend tests
if: matrix.backend=='llvm'
shell: bash
run: CPU=0 CPU_LLVM=0 DEV=NULL python -m pytest -n=auto test/null/ --ignore=test/null/test_elf.py --durations=20
run: DEV=NULL python -m pytest -n=auto test/null/ --ignore=test/null/test_elf.py --durations=20
- name: Run pytest (${{ matrix.backend }})
shell: bash
run: |
@ -1017,7 +1013,7 @@ jobs:
python-version: '3.12'
- name: Set env
shell: bash
run: printf "DEV=NULL\nNULL_ALLOW_COPYOUT=1\n${{ matrix.backend == 'ir3' && 'NULL_IR3=1' || matrix.backend == 'nak' && 'NULL_NAK=1' }}" >> $GITHUB_ENV
run: printf "NULL_ALLOW_COPYOUT=1\n${{ matrix.backend == 'ir3' && 'DEV=NULL:IR3' || matrix.backend == 'nak' && 'DEV=NULL:NAK' }}" >> $GITHUB_ENV
- name: Run test_ops
shell: bash
run: |
@ -1040,7 +1036,7 @@ jobs:
python-version: '3.12'
- name: Set env
shell: bash
run: printf "DEV=NULL\nNULL_ALLOW_COPYOUT=1\nNULL_QCOMCL=1" >> $GITHUB_ENV
run: printf "DEV=NULL:QCOMCL\nNULL_ALLOW_COPYOUT=1" >> $GITHUB_ENV
- name: Run test_ops
shell: bash
run: |

View file

@ -28,6 +28,13 @@ The columns of this list are: Variable, Possible Value(s) and Description.
These control the behavior of core tinygrad even when used as a library.
### DEV variable
The `DEV` variable deserves special note due to its more nuanced syntax.
`DEV` is used to specify the target device and target renderer for said device, separated by colons.
Specifying the renderer is optional; if it is omitted, tinygrad automatically selects a renderer from those
available on the system. Some example values for `DEV` are: `AMD`, `AMD:LLVM`, and `NV:PTX`.
Variable | Possible Value(s) | Description
---|---|---
DEBUG | [1-7] | enable debugging output (operations, timings, speed, generated code and more)

View file

@ -4,13 +4,13 @@ tinygrad supports various runtimes, enabling your code to scale across a wide ra
| Runtime | Description | Compiler Options | Requirements |
|---------|-------------|------------------|--------------|
| [NV](https://github.com/tinygrad/tinygrad/tree/master/tinygrad/runtime/ops_nv.py) | Provides acceleration for NVIDIA GPUs | nvrtc (default)<br>PTX (`NV_PTX=1`) | Ampere/Ada/Blackwell series GPUs.<br>You can select an interface via `NV_IFACE=(NVK\|PCI)`. See [NV interfaces](#nv-interfaces) for details. |
| [AMD](https://github.com/tinygrad/tinygrad/tree/master/tinygrad/runtime/ops_amd.py) | Provides acceleration for AMD GPUs | LLVM (`AMD_LLVM=1`)<br>HIP/COMGR (`AMD_HIP=1`) | RDNA2 or newer GPUs.<br>You can select an interface via `AMD_IFACE=(KFD\|PCI\|USB)`. See [AMD interfaces](#amd-interfaces) for details. |
| [NV](https://github.com/tinygrad/tinygrad/tree/master/tinygrad/runtime/ops_nv.py) | Provides acceleration for NVIDIA GPUs | nvrtc (default)<br>PTX (`DEV=NV:PTX`) | Ampere/Ada/Blackwell series GPUs.<br>You can select an interface via `NV_IFACE=(NVK\|PCI)`. See [NV interfaces](#nv-interfaces) for details. |
| [AMD](https://github.com/tinygrad/tinygrad/tree/master/tinygrad/runtime/ops_amd.py) | Provides acceleration for AMD GPUs | LLVM (`DEV=AMD:LLVM`)<br>HIP/COMGR (`DEV=AMD:HIP`) | RDNA2 or newer GPUs.<br>You can select an interface via `AMD_IFACE=(KFD\|PCI\|USB)`. See [AMD interfaces](#amd-interfaces) for details. |
| [QCOM](https://github.com/tinygrad/tinygrad/tree/master/tinygrad/runtime/ops_qcom.py) | Provides acceleration for QCOM GPUs | - | 6xx series GPUs |
| [METAL](https://github.com/tinygrad/tinygrad/tree/master/tinygrad/runtime/ops_metal.py) | Utilizes Metal for acceleration on Apple devices | - | M1+ Macs; Metal 3.0+ for `bfloat` support |
| [CUDA](https://github.com/tinygrad/tinygrad/tree/master/tinygrad/runtime/ops_cuda.py) | Utilizes CUDA for acceleration on NVIDIA GPUs | nvrtc (default)<br> PTX (`CUDA_PTX=1`) | NVIDIA GPU with CUDA support |
| [CUDA](https://github.com/tinygrad/tinygrad/tree/master/tinygrad/runtime/ops_cuda.py) | Utilizes CUDA for acceleration on NVIDIA GPUs | nvrtc (default)<br> PTX (`DEV=CUDA:PTX`) | NVIDIA GPU with CUDA support |
| [CL](https://github.com/tinygrad/tinygrad/tree/master/tinygrad/runtime/ops_cl.py) | Accelerates computations using OpenCL on GPUs | - | OpenCL 2.0 compatible device |
| [CPU](https://github.com/tinygrad/tinygrad/tree/master/tinygrad/runtime/ops_cpu.py) | Runs on CPU using the clang or llvm compiler | Clang JIT (default)<br>LLVM IR (`CPU_LLVM=1`) | `clang` compiler in system `PATH` |
| [CPU](https://github.com/tinygrad/tinygrad/tree/master/tinygrad/runtime/ops_cpu.py) | Runs on CPU using the clang or llvm compiler | Clang JIT (default)<br>LLVM IR (`DEV=CPU:LLVM`) | `clang` compiler in system `PATH` |
| [WEBGPU](https://github.com/tinygrad/tinygrad/tree/master/tinygrad/runtime/ops_webgpu.py) | Runs on GPU using the Dawn WebGPU engine (used in Google Chrome) | - | Dawn library installed and discoverable. Binaries: [pydawn v0.3.0](https://github.com/wpmed92/pydawn/releases/tag/v0.3.0) |

View file

@ -22,7 +22,7 @@ This will produce a binary in the `extra/remu/target/release` directory.
The latest binaries are released in https://github.com/Qazalin/remu/releases. Alternatively, you can [build locally](#build-locally).
Tinygrad does not yet output RDNA3 kernels directly. You can either install comgr or use `AMD_LLVM=1` (default) if you have [LLVM@19](https://github.com/tinygrad/tinygrad/blob/e2ed673c946c8f1774d816c75e52a994c2dd8a88/.github/actions/setup-tinygrad/action.yml#L208).
Tinygrad does not yet output RDNA3 kernels directly. You can either install comgr or use `DEV=AMD:LLVM` (default) if you have [LLVM@19](https://github.com/tinygrad/tinygrad/blob/e2ed673c946c8f1774d816c75e52a994c2dd8a88/.github/actions/setup-tinygrad/action.yml#L208).
`PYTHONPATH="." MOCKGPU=1 DEV=AMD python test/test_tiny.py TestTiny.test_plus` runs an emulated RDNA3 kernel with Remu.

View file

@ -83,7 +83,7 @@ class TestFmacE64(unittest.TestCase):
self.assertAlmostEqual(i2f(st.vgpr[0][2]), 7.0, places=5)
def test_v_fmac_f32_e64_with_sgpr_sources(self):
"""V_FMAC_F32_E64 with SGPR sources (common in AMD_LLVM output).
"""V_FMAC_F32_E64 with SGPR sources (common in AMD:LLVM output).
This tests the exact pattern that was failing: v_fmac_f32_e64(v[0], s[4], 0)
where src0 is SGPR and src1 is inline constant 0.
@ -936,7 +936,7 @@ class TestF16Modifiers(unittest.TestCase):
def test_v_fmac_f16_hi_dest(self):
"""v_fmac_f16 with .h destination: dst.h = src0 * src1 + dst.h.
This tests the case from AMD_LLVM sin(0) where V_FMAC_F16 writes to v0.h.
This tests the case from AMD:LLVM sin(0) where V_FMAC_F16 writes to v0.h.
"""
instructions = [
s_mov_b32(s[0], 0x38003c00), # v0 = {hi=0.5, lo=1.0}

View file

@ -504,7 +504,7 @@ class TestTinygradKernels(unittest.TestCase):
x_np = np.random.uniform(-2, 2, (33,)).astype(np.float32)
self._test_kernel(lambda T: (T(x_np.tolist()) > 0.5).unsqueeze(-1).expand(33, 3).flatten().sum())
@unittest.skip("slow and broken with AMD_LLVM=1")
@unittest.skip("slow and broken with AMD:LLVM")
def test_nonzero(self):
"""Test nonzero operation - counts and gathers indices of non-zero elements."""
import numpy as np

View file

@ -60,11 +60,11 @@ class TestTinygradKernelRoundtrip(unittest.TestCase):
from test.amd.test_compare_emulators import get_kernels_from_tinygrad
from tinygrad.runtime.support.elf import elf_loader
from tinygrad.runtime.support.compiler_amd import HIPCompiler, AMDLLVMCompiler
from tinygrad.helpers import AMD_LLVM
from tinygrad.helpers import DEV
kernels, _, _ = get_kernels_from_tinygrad(op_fn)
# rendered source can be C or llvmir
compiler = (AMDLLVMCompiler if AMD_LLVM else HIPCompiler)(get_target(arch))
compiler = (AMDLLVMCompiler if DEV.renderer == "LLVM" else HIPCompiler)(get_target(arch))
# First pass: decode all instructions and collect info
decoded_instrs: list[tuple] = [] # list of (ki, offset, orig_bytes, decoded, our_disasm, decode_ok, decode_err)

View file

@ -2,7 +2,7 @@ import time, math, unittest, functools, platform, warnings
import numpy as np
from typing import List, Callable
import torch
from tinygrad.helpers import getenv, IMAGE, DEBUG, CI, Context, CPU_LLVM, AMD_LLVM, EMULATE
from tinygrad.helpers import getenv, CI, DEBUG, DEV, EMULATE, IMAGE, Context
from tinygrad import Tensor, Device, dtypes
from tinygrad.tensor import _to_np_dtype
from tinygrad.device import is_dtype_supported
@ -560,7 +560,7 @@ class TestOps(unittest.TestCase):
helper_test_op([(45,65), (45,65)], lambda x,y: x/y)
helper_test_op([(), ()], lambda x,y: x/y)
@unittest.skipIf(Device.DEFAULT == "AMD" and AMD_LLVM, "AMD with LLVM backend generate rcp in FP division causes trunc/floor errors")
@unittest.skipIf(Device.DEFAULT == "AMD" and DEV.renderer == "LLVM", "AMD with LLVM backend generate rcp in FP division causes trunc/floor errors")
def test_div_rounding_mode(self):
for denominator in [-10, -5, -3, -2, -1, 1, 2, 3, 5, 10]:
# int numerator
@ -843,7 +843,7 @@ class TestOps(unittest.TestCase):
self.assertEqual(a, b)
self.assertEqual(Tensor(-1).contiguous().idiv(4).item(), 0) # NOTE this is trunc-div behaviour
@unittest.skipIf(getenv("NV_NAK"), "MUFU.SIN is not accurate enough")
@unittest.skipIf(DEV.renderer == "NAK", "MUFU.SIN is not accurate enough")
def test_sin(self):
helper_test_op([(45,65)], lambda x: x.sin())
helper_test_op([()], lambda x: x.sin())
@ -853,7 +853,7 @@ class TestOps(unittest.TestCase):
helper_test_op(None, lambda x: x.sin(), vals=[[1e1, 1e2, 1e3, 1e4, 1e5, 1e6, -1e1, -1e2, -1e3, -1e4, -1e5, -1e6]],
atol=3e-3, rtol=3e-3, grad_atol=3e-3, grad_rtol=3e-3)
@unittest.skipIf(Device.DEFAULT == "WEBGPU" and platform.system() == "Windows", "Not accurate enough with DirectX backend")
@unittest.skipIf(getenv("NV_NAK"), "MUFU.SIN is not accurate enough")
@unittest.skipIf(DEV.renderer == "NAK", "MUFU.SIN is not accurate enough")
def test_cos(self):
helper_test_op([(45,65)], lambda x: x.cos())
helper_test_op([()], lambda x: x.cos())
@ -862,7 +862,7 @@ class TestOps(unittest.TestCase):
helper_test_op(None, lambda x: x.cos(), vals=[[1e1, 1e2, 1e3, 1e4, 1e5, 1e6, -1e1, -1e2, -1e3, -1e4, -1e5, -1e6]],
atol=3e-3, rtol=3e-3, grad_atol=3e-3, grad_rtol=3e-3)
@unittest.skipIf(Device.DEFAULT == "WEBGPU" and platform.system() == "Windows", "Not accurate enough with DirectX backend")
@unittest.skipIf(getenv("NV_NAK"), "MUFU.SIN is not accurate enough")
@unittest.skipIf(DEV.renderer == "NAK", "MUFU.SIN is not accurate enough")
def test_tan(self):
# NOTE: backward has much higher diff with input close to pi/2 and -pi/2
helper_test_op([(45,65)], lambda x: x.tan(), low=-1.5, high=1.5)
@ -1398,7 +1398,7 @@ class TestOps(unittest.TestCase):
np.arange(64,128,dtype=np.float32).reshape(8,8)])
def test_small_gemm_eye(self):
helper_test_op(None, lambda x,y: x.matmul(y), lambda x,y: x@y, vals=[np.eye(8).astype(np.float32), np.eye(8).astype(np.float32)])
@unittest.skipIf(CI and Device.DEFAULT in ["NV", "CL", "CUDA"] or (Device.DEFAULT == "CPU" and CPU_LLVM) or IMAGE
@unittest.skipIf(CI and Device.DEFAULT in ["NV", "CL", "CUDA"] or (Device.DEFAULT == "CPU" and DEV.renderer == "LLVM") or IMAGE
or (Device.DEFAULT == "WEBGPU" and platform.system() == "Windows"), "not supported on these in CI/IMAGE")
@unittest.skipIf(Device.DEFAULT == "QCOM", "not precise enough")
def test_gemm_fp16(self):
@ -2396,7 +2396,7 @@ class TestOps(unittest.TestCase):
lambda x,w: torch.nn.functional.conv2d(x,w,stride=2),
lambda x,w: Tensor.conv2d(x,w,stride=2))
@unittest.skipUnless(Device.DEFAULT == "CPU" and CPU_LLVM, "DEVECTORIZE=0 only for LLVM")
@unittest.skipUnless(Device.DEFAULT == "CPU" and DEV.renderer == "LLVM", "DEVECTORIZE=0 only for LLVM")
def test_strided_conv2d_simple_vec(self):
with Context(DEVECTORIZE=0): self.test_strided_conv2d_simple()
@ -2715,9 +2715,9 @@ class TestOps(unittest.TestCase):
lambda x: Tensor.avg_pool2d(x, kernel_size=(11,28)), rtol=1e-5)
def test_avg_pool3d(self):
# TODO: AMD_LLVM has larger atol
# TODO: AMD:LLVM has larger atol
# TODO: DEV=PYTHON backward hangs?
atol = 1e-2 if AMD_LLVM else 1e-6
atol = 1e-2 if DEV.device == "AMD" and DEV.renderer == "LLVM" else 1e-6
helper_test_op([(1,1,16,16,16)],
lambda x: torch.nn.functional.avg_pool3d(x, kernel_size=(8,8,8), stride=5, padding=1, count_include_pad=False),
lambda x: Tensor.avg_pool2d(x, kernel_size=(8,8,8), stride=5, padding=1, count_include_pad=False), atol=atol, rtol=1e-5, forward_only=True)

View file

@ -1,7 +1,7 @@
import unittest
from tinygrad import Tensor, Device, dtypes
from tinygrad.tensor import _to_np_dtype
from tinygrad.helpers import Context, getenv, CI, OSX
from tinygrad.helpers import Context, getenv, CI, DEV, OSX
from test.backend.test_schedule import check_schedule
from test.backend.test_dtype_alu import ht, dtypes_float
from tinygrad.device import is_dtype_supported
@ -188,7 +188,7 @@ class TestTranscendentalVectorized(unittest.TestCase):
for vec_size in [1,2,3,4,5,127,128]: self._test_vectorized_op(Tensor.log2, np.log2, (0.001, 200), vec_size)
@unittest.skipIf(Device.DEFAULT == "DSP", "requires int division")
@unittest.skipIf(getenv("NV_NAK"), "MUFU.SIN is not accurate enough")
@unittest.skipIf(DEV.renderer == "NAK", "MUFU.SIN is not accurate enough")
@unittest.skipIf(Device.DEFAULT == "WEBGPU" and OSX, "WEBGPU Metal backend is not accurate enough")
def test_sin_vectorized(self):
for vec_size in [1,2,3,4,5,127,128]: self._test_vectorized_op(Tensor.sin, np.sin, (-100, 100), vec_size)

View file

@ -11,8 +11,7 @@ from examples.stable_diffusion import AutoencoderKL
def set_eval_params():
# override these as needed from cli
for k,v in {"MODEL": "stable_diffusion", "GPUS": "8", "EVAL_SAMPLES": "600", "CONTEXT_BS": "816", "DENOISE_BS": "600", "DECODE_BS": "384",
"INCEPTION_BS": "560", "CLIP_BS": "240", "DATADIR": "/raid/datasets/stable_diffusion", "CKPTDIR": "/raid/weights/stable_diffusion",
"AMD_LLVM": "0"}.items():
"INCEPTION_BS": "560", "CLIP_BS": "240", "DATADIR": "/raid/datasets/stable_diffusion", "CKPTDIR": "/raid/weights/stable_diffusion"}.items():
os.environ[k] = getenv(k, v)
class TestEval(unittest.TestCase):

View file

@ -1,7 +1,7 @@
An integrated environment for AMD GPU assembly and emulation
Test with `pytest -n12 test/amd/`
`AMD_LLVM=1 pytest -n12 test/amd/`
`DEV=AMD:LLVM pytest -n12 test/amd/`
* dsl.py -- helpers for the autogen instruction classes in `__init__.py`. should be standalone with init
* test/mockgpu/amd/emu.py -- an emulator for RDNA that runs in tinygrad with `DEV=AMD MOCKGPU=1 PYTHON_REMU=1`
@ -20,13 +20,13 @@ test_llvm.py tests asm/disasm on the LLVM tests, confirming it behaves the same
tinygrad's dtype tests should pass with and without LLVM. They run in about 12 seconds.
`DEV=AMD PYTHON_REMU=1 MOCKGPU=1 AMD_LLVM=0 pytest -n=12 test/backend/test_dtype_alu.py test/backend/test_dtype.py`
`DEV=AMD PYTHON_REMU=1 MOCKGPU=1 AMD_LLVM=1 pytest -n=12 test/backend/test_dtype_alu.py test/backend/test_dtype.py`
`DEV=AMD PYTHON_REMU=1 MOCKGPU=1 pytest -n=12 test/backend/test_dtype_alu.py test/backend/test_dtype.py`
`DEV=AMD:LLVM PYTHON_REMU=1 MOCKGPU=1 pytest -n=12 test/backend/test_dtype_alu.py test/backend/test_dtype.py`
The ops tests also pass, but they are very slow, so you should run them one at a time.
`SKIP_SLOW_TEST=1 DEV=AMD PYTHON_REMU=1 MOCKGPU=1 AMD_LLVM=0 pytest -n=12 test/backend/test_ops.py`
`SKIP_SLOW_TEST=1 DEV=AMD PYTHON_REMU=1 MOCKGPU=1 AMD_LLVM=1 pytest -n=12 test/backend/test_ops.py`
`SKIP_SLOW_TEST=1 DEV=AMD PYTHON_REMU=1 MOCKGPU=1 pytest -n=12 test/backend/test_ops.py`
`SKIP_SLOW_TEST=1 DEV=AMD:LLVM PYTHON_REMU=1 MOCKGPU=1 pytest -n=12 test/backend/test_ops.py`
When something is caught by main tinygrad tests, a local regression test should be added to `test/amd`.
While working with tinygrad, you can dump the assembly with `DEBUG=7`. These tests all pass on real hardware.

View file

@ -1,7 +1,7 @@
import unittest, io
from contextlib import redirect_stdout
from tinygrad import Tensor, dtypes, Device
from tinygrad.helpers import OSX, CPU_LLVM, CPU_LVP
from tinygrad.helpers import OSX, DEV
from tinygrad.device import is_dtype_supported
from tinygrad.engine.realize import get_program
@ -18,7 +18,7 @@ class TestCompileFailures(unittest.TestCase):
class TestDisassembly(unittest.TestCase):
# TODO: fails on llvm. llvm.LLVMGetHostCPUName() returns "generic"
@unittest.skipUnless(Device.DEFAULT in ("CPU",) and not (CPU_LLVM or CPU_LVP) and OSX, "m series cpus support fp16 arithmetic")
@unittest.skipUnless(Device.DEFAULT in ("CPU",) and DEV.renderer not in ("LLVM", "LVP") and OSX, "m series cpus support fp16 arithmetic")
def test_float16_alu(self):
c = Tensor([1], dtype=dtypes.float16) + Tensor([1], dtype=dtypes.float16)
s = c.schedule()[-1]

View file

@ -3,7 +3,7 @@ import unittest, os, subprocess
from unittest.mock import patch
from tinygrad import Tensor
from tinygrad.device import Device, Compiler, enumerate_devices_str
from tinygrad.helpers import diskcache_get, diskcache_put, getenv, Context, WIN, CI, OSX
from tinygrad.helpers import diskcache_get, diskcache_put, getenv, Context, Target, WIN, CI, OSX, DEV
from tinygrad.runtime.support.c import DLL
class TestDevice(unittest.TestCase):
@ -39,6 +39,12 @@ class TestDevice(unittest.TestCase):
self.assertNotEqual(result.returncode, 0)
self.assertIn(b"deprecated", result.stderr)
def test_old_renderer_env_raises(self):
  # the legacy per-renderer flag (CPU_LLVM=1) must make renderer selection exit non-zero with a "deprecated" message on stderr
  child_env = {**os.environ, "DEV": "CPU", "CPU_LLVM": "1"}
  cmd = ['python3', '-c', 'from tinygrad import Device; Device[Device.DEFAULT].renderer']
  proc = subprocess.run(cmd, env=child_env, capture_output=True)
  self.assertNotEqual(proc.returncode, 0)
  self.assertIn(b"deprecated", proc.stderr)
@unittest.skipIf(WIN and CI, "skipping windows test") # TODO: subprocess causes memory violation?
def test_env_overwrite_default_compiler(self):
if Device.DEFAULT == "CPU":
@ -48,13 +54,11 @@ class TestDevice(unittest.TestCase):
imports = "from tinygrad import Device; from tinygrad.runtime.support.compiler_cpu import CPULLVMCompiler, ClangJITCompiler"
subprocess.run([f'python3 -c "{imports}; assert isinstance(Device[Device.DEFAULT].compiler, CPULLVMCompiler)"'],
shell=True, check=True, env={**os.environ, "DEV": "CPU", "CPU_LLVM": "1"})
shell=True, check=True, env={**os.environ, "DEV": "CPU:LLVM"})
subprocess.run([f'python3 -c "{imports}; assert isinstance(Device[Device.DEFAULT].compiler, ClangJITCompiler)"'],
shell=True, check=True, env={**os.environ, "DEV": "CPU", "CPU_LLVM": "0"})
subprocess.run([f'python3 -c "{imports}; assert isinstance(Device[Device.DEFAULT].compiler, CPULLVMCompiler)"'],
shell=True, check=True, env={**os.environ, "DEV": "CPU", "CPU_CC": "LLVM"})
shell=True, check=True, env={**os.environ, "DEV": "CPU"})
subprocess.run([f'python3 -c "{imports}; assert isinstance(Device[Device.DEFAULT].compiler, ClangJITCompiler)"'],
shell=True, check=True, env={**os.environ, "DEV": "CPU", "CPU_CC": "CLANGJIT"})
shell=True, check=True, env={**os.environ, "DEV": "CPU:CLANGJIT"})
elif Device.DEFAULT == "AMD":
from tinygrad.runtime.support.compiler_amd import HIPCompiler, AMDLLVMCompiler
try: _, _ = HIPCompiler(Device[Device.DEFAULT].arch), AMDLLVMCompiler(Device[Device.DEFAULT].arch)
@ -62,27 +66,25 @@ class TestDevice(unittest.TestCase):
imports = "from tinygrad import Device; from tinygrad.runtime.support.compiler_amd import HIPCompiler, AMDLLVMCompiler"
subprocess.run([f'python3 -c "{imports}; assert isinstance(Device[Device.DEFAULT].compiler, AMDLLVMCompiler)"'],
shell=True, check=True, env={**os.environ, "DEV": "AMD", "AMD_LLVM": "1"})
shell=True, check=True, env={**os.environ, "DEV": "AMD:LLVM"})
subprocess.run([f'python3 -c "{imports}; assert isinstance(Device[Device.DEFAULT].compiler, HIPCompiler)"'],
shell=True, check=True, env={**os.environ, "DEV": "AMD", "AMD_LLVM": "0"})
subprocess.run([f'python3 -c "{imports}; assert isinstance(Device[Device.DEFAULT].compiler, AMDLLVMCompiler)"'],
shell=True, check=True, env={**os.environ, "DEV": "AMD", "AMD_CC": "LLVM"})
shell=True, check=True, env={**os.environ, "DEV": "AMD"})
subprocess.run([f'python3 -c "{imports}; assert isinstance(Device[Device.DEFAULT].compiler, HIPCompiler)"'],
shell=True, check=True, env={**os.environ, "DEV": "AMD", "AMD_CC": "HIP"})
shell=True, check=True, env={**os.environ, "DEV": "AMD:HIP"})
else: self.skipTest("only run on CPU/AMD")
@unittest.skipIf((WIN and CI) or (not Device.DEFAULT == "CPU"), "skipping windows test")
@unittest.skipIf(WIN and CI, "skipping windows test")
def test_env_online(self):
from tinygrad.runtime.support.compiler_cpu import CPULLVMCompiler, ClangJITCompiler
try: _, _ = CPULLVMCompiler(), ClangJITCompiler()
except Exception as e: self.skipTest(f"skipping compiler test: not all compilers: {e}")
with Context(CPU_LLVM=1):
with Context(DEV="CPU:LLVM"):
inst = Device["CPU"].compiler
self.assertIsInstance(Device["CPU"].compiler, CPULLVMCompiler)
with Context(CPU_LLVM=0):
with Context(DEV="CPU"):
self.assertIsInstance(Device["CPU"].compiler, ClangJITCompiler)
with Context(CPU_LLVM=1):
with Context(DEV="CPU:LLVM"):
self.assertIsInstance(Device["CPU"].compiler, CPULLVMCompiler)
assert inst is Device["CPU"].compiler # cached
@ -94,7 +96,7 @@ class TestDevice(unittest.TestCase):
except Exception as e: self.skipTest(f"skipping: LLVM not available: {e}")
dev = Device["CPU"]
dev.cached_pair.clear()
dev.cached_renderer.clear()
with patch("tinygrad.renderer.cstyle.ClangJITRenderer.__init__", side_effect=RuntimeError("broken")):
self.assertIsInstance(dev.renderer.compiler, CPULLVMCompiler)
@ -104,6 +106,21 @@ class TestDevice(unittest.TestCase):
with Context(DEV="NULL"): self.assertEqual(Tensor.empty(1).device, "NULL")
self.assertEqual(Tensor.empty(1).device, orig_dev)
class TestDevVar(unittest.TestCase):
  """Checks how the DEV variable parses "DEVICE:RENDERER" strings and resolves a Target for a given device."""
  def test_parse(self):
    # every DEV string must parse to the expected Target and print back as the same string
    expected = {"AMD": Target(device="AMD", renderer=""), "AMD:LLVM": Target(device="AMD", renderer="LLVM"),
                ":LLVM": Target(device="", renderer="LLVM")}
    for dev_str, want in expected.items():
      with Context(DEV=dev_str):
        self.assertEqual(DEV.value, want)
        self.assertEqual(str(DEV.value), dev_str)

  def test_target(self):
    # a renderer in DEV applies to CPU only when DEV names CPU or names no device at all
    cases = [("CPU", Target("CPU")), ("CPU:LLVM", Target("CPU", "LLVM")), (":LLVM", Target("CPU", "LLVM")),
             ("AMD:LLVM", Target("CPU")), ("", Target("CPU"))]
    for dev_str, want in cases:
      with Context(DEV=dev_str): self.assertEqual(DEV.target("CPU"), want)
class MockCompiler(Compiler):
def __init__(self, key): super().__init__(key)
def compile(self, src) -> bytes: return src.encode()

View file

@ -7,7 +7,7 @@ from tinygrad.tensor import _to_np_dtype
from tinygrad.uop.ops import Ops
from tinygrad.dtype import DType
from tinygrad.device import is_dtype_supported
from tinygrad.helpers import AMX, AMD_LLVM, CPU_LLVM, Context
from tinygrad.helpers import AMX, DEV, Context
from test.helpers import slow
from tinygrad.engine.realize import CompiledRunner, get_program
from tinygrad.codegen.opt import Opt, OptOps, KernelOptError
@ -75,9 +75,9 @@ class TestTensorCores(unittest.TestCase):
a, b = Tensor.rand(m, k, dtype=tc.dtype_in), Tensor.rand(k, n, dtype=tc.dtype_in)
r = a.matmul(b, dtype=tc.dtype_out)
prg = get_program(r.schedule()[-1].ast, Device[Device.DEFAULT].renderer, opts=[Opt(op=OptOps.TC, axis=0, arg=(-1, 2, 1))])
if Device.DEFAULT == "CPU" and CPU_LLVM:
if Device.DEFAULT == "CPU" and DEV.renderer == "LLVM":
assert "0x201000" in prg.src
elif Device.DEFAULT == "AMD" and AMD_LLVM:
elif Device.DEFAULT == "AMD" and DEV.renderer == "LLVM":
assert "@llvm.amdgcn.wmma" in prg.src
elif Device[Device.DEFAULT].renderer.suffix == "PTX":
assert "mma.sync.aligned" in prg.src

View file

@ -4,7 +4,7 @@ import numpy as np
from tinygrad import dtypes, Tensor, TinyJit, GlobalCounters, Variable
from tinygrad.uop.ops import Ops
from tinygrad.device import is_dtype_supported
from tinygrad.helpers import temp, CI, CPU_LVP, Context
from tinygrad.helpers import temp, CI, DEV, Context
N = 200 # has to be bigger than the cache to fail
@ -193,7 +193,7 @@ class TestAssign(unittest.TestCase):
new = a + times_a
np.testing.assert_allclose(new.numpy(), 8)
@unittest.skipIf(CI and CPU_LVP, "flaky in CI")
@unittest.skipIf(CI and DEV.renderer == "LVP", "flaky in CI")
def test_double_assign(self):
a = Tensor.ones(4).contiguous().realize()
a += 1

View file

@ -4,9 +4,8 @@ from collections import defaultdict
from typing import Any, Generic, TypeVar, Iterator, Generator, TYPE_CHECKING
import importlib, inspect, functools, pathlib, os, platform, contextlib, sys, re, atexit, pickle, decimal
from tinygrad.helpers import BENCHMARKS, CI, OSX, LRU, getenv, diskcache_get, diskcache_put, DEBUG, GlobalCounters, flat_mv, PROFILE, temp, colored
from tinygrad.helpers import Context, CCACHE, ALLOW_DEVICE_USAGE, MAX_BUFFER_SIZE, cpu_events, ProfileEvent, ProfilePointEvent, ContextVar
from tinygrad.helpers import unwrap_class_type, suppress_finalizing, select_first_inited, DEV, VIZ, CPU_LLVM, CPU_LVP, NV_PTX, CUDA_PTX, NV_NAK
from tinygrad.helpers import EMULATE, EMULATED_DTYPES, NULL_IR3, NULL_QCOMCL, IMAGE, FLOAT16, TracingKey, size_to_str
from tinygrad.helpers import Context, CCACHE, ALLOW_DEVICE_USAGE, MAX_BUFFER_SIZE, cpu_events, ProfileEvent, ProfilePointEvent, unwrap_class_type
from tinygrad.helpers import suppress_finalizing, select_first_inited, DEV, VIZ, EMULATE, EMULATED_DTYPES, IMAGE, FLOAT16, TracingKey, size_to_str
from tinygrad.dtype import DType, PtrDType, dtypes, _to_np_dtype
if TYPE_CHECKING: from tinygrad.renderer import Renderer
@ -40,7 +39,7 @@ class _Device:
for device in ALL_DEVICES:
with contextlib.suppress(Exception): yield self[device].device
@property
def DEFAULT(self) -> str: return DEV.value.upper() if DEV else self._select_device
def DEFAULT(self) -> str: return DEV.device or self._select_device
@DEFAULT.setter
def DEFAULT(self, v): raise AttributeError(f'setting Device.DEFAULT is deprecated, use "with Context(DEV={v!r})" or "DEV.value = {v!r}"')
@functools.cached_property
@ -284,20 +283,11 @@ class Compiled:
def _renderer_name(self, r:type[Renderer]|functools.partial) -> str:
return unwrap_class_type(r).__name__.upper().removesuffix("RENDERER").removeprefix(devname:=self.device.split(':')[0].upper()) or devname
def _renderer_var(self, r:type[Renderer]|functools.partial) -> ContextVar|None:
return ContextVar._cache.get(f"{self.device}_{self._renderer_name(r)}", None)
def _select_renderer(self) -> Renderer:
# select forced compiler from global env var.
forced_comps = set([r for r in self.renderers if self._renderer_name(r) == val] if
(ctrl:=ContextVar._cache.get(f"{self.device}_CC", None)) is not None and (val:=ctrl.value) else [])
# add forced compilers from individual env vars (only if global env var is not set, as it takes precedence).
if not forced_comps: forced_comps |= set(r for r in self.renderers if (en:=self._renderer_var(r)) is not None and en.value == 1)
if len(forced_comps) > 1: raise RuntimeError(f"{self.device}: multiple compilers set in env {forced_comps}")
return select_first_inited(list(forced_comps) if len(forced_comps)>0 else self.renderers, f"No renderer for {self.device} is available",
self.cached_renderer)
assert (rn:=next((self._renderer_name(r) for r in self.renderers if getenv(f"{self.device}_{self._renderer_name(r)}")), None)) is None, \
f"{self.device}_{rn}=1 is deprecated, use DEV={self.device}:{rn} or {self.device}_CC={rn} instead"
renderers = [r for r in self.renderers if self._renderer_name(r) == rn] if (rn:=DEV.target(self.device).renderer) else self.renderers
return select_first_inited(renderers, f"No renderer for {self.device} is available", self.cached_renderer)
def synchronize(self):
"""
@ -320,37 +310,45 @@ class Compiled:
# TODO: move this to each Device
# this only tracks if the dtype is natively supported, it may be supported in the frontend using decomps
def is_dtype_supported(dtype:DType, device:str|None=None, arch:str|None=None) -> bool:
if device is None: device = Device.DEFAULT
target = DEV.target(device or Device.DEFAULT)
if dtype == dtypes.bfloat16:
if device == "METAL": return not CI or BENCHMARKS
if device == "CUDA": return (not CI or BENCHMARKS) and not CUDA_PTX
if device == "NV": return (not CI or BENCHMARKS) and not NV_PTX and not NV_NAK
if device in {"CPU"}: return (not CI or BENCHMARKS) and platform.machine() in {"arm", "arm64", "aarch64", "x86_64", "amd64"} and not CPU_LVP
return device in {"AMD", "CL", "PYTHON", "NULL"}
match target.device:
case "METAL": return not CI or BENCHMARKS
case "CUDA": return (not CI or BENCHMARKS) and target.renderer != "PTX"
case "NV": return (not CI or BENCHMARKS) and target.renderer not in ("PTX", "NAK")
case "CPU": return (not CI or BENCHMARKS) and platform.machine() in {"arm", "arm64", "aarch64", "x86_64", "amd64"} and target.renderer != "LVP"
case "AMD" | "CL" | "PYTHON" | "NULL": return True
case _: return False
if dtype in dtypes.fp8_ocp:
if device == "CUDA": return (not CI or BENCHMARKS) and not CUDA_PTX
if device == "NV": return (not CI or BENCHMARKS) and not NV_PTX and not NV_NAK
if device == "AMD":
# TODO: open the device to get arch of device, will be fixed after triple is in the device string
if arch is None: arch = getattr(Device[device].renderer, "arch", "")
return (not CI or BENCHMARKS) and arch == "gfx950"
return device in {"PYTHON", "NULL"}
if dtype in dtypes.fp8_fnuz: return device in {"PYTHON", "NULL"}
if device == "WEBGPU": return dtype in [dtypes.bool, dtypes.char, dtypes.uchar, dtypes.short,
dtypes.ushort, dtypes.float, dtypes.int32, dtypes.uint32, dtypes.half]
match target.device:
case "CUDA": return (not CI or BENCHMARKS) and target.renderer != "PTX"
case "NV": return (not CI or BENCHMARKS) and target.renderer not in ("PTX", "NAK")
case "AMD":
# TODO: open the device to get arch of device, will be fixed after triple is in the device string
if arch is None: arch = getattr(Device[target.device].renderer, "arch", "")
return (not CI or BENCHMARKS) and arch == "gfx950"
case "PYTHON" | "NULL": return True
case _: return False
if dtype in dtypes.fp8_fnuz: return target.device in {"PYTHON", "NULL"}
if target.device == "WEBGPU": return dtype in [dtypes.bool, dtypes.char, dtypes.uchar, dtypes.short,
dtypes.ushort, dtypes.float, dtypes.int32, dtypes.uint32, dtypes.half]
# for CI GPU and OSX, cl_khr_fp16 isn't supported
# for CI LLVM, it segfaults because it can't link to the casting function
# CI CUDA architecture is sm_35 but we need at least sm_70 to run fp16 ALUs
# PYTHON supports half memoryview in 3.12+ https://github.com/python/cpython/issues/90751
# double can't be bitcast to anything without long support
if dtype == dtypes.half:
if device == "CL": return (not CI or BENCHMARKS) and not OSX
if device == "QCOM": return bool(IMAGE) and bool(FLOAT16) # QCOM compiler is flaky with half
if device in ["CUDA", "NV"]: return (not CI or BENCHMARKS) or "CUDA" in EMULATE.value
if device == "CPU" and CPU_LLVM: return OSX
if device == "PYTHON": return sys.version_info >= (3, 12)
if dtype == dtypes.float64: return (device not in {"METAL", "QCOM"} and not (OSX and device == "CL") and not NULL_IR3 and not NULL_QCOMCL
and dtypes.long not in EMULATED_DTYPES.tolist(dtypes))
match target.device:
case "CL": return (not CI or BENCHMARKS) and not OSX
case "QCOM": return bool(IMAGE) and bool(FLOAT16) # QCOM compiler is flaky with half
case "CUDA" | "NV": return (not CI or BENCHMARKS) or "CUDA" in EMULATE.value
case "CPU" if target.renderer == "LLVM": return OSX
case "PYTHON": return sys.version_info >= (3, 12)
if dtype == dtypes.float64:
match target.device:
case _ if dtypes.long in EMULATED_DTYPES.tolist(dtypes): return False # double can't be bitcast to anything without long support
case "CL": return not OSX
case "NULL": return target.renderer not in ("IR3", "QCOMCL")
case "METAL" | "QCOM": return False
return True
if PROFILE:
@ -373,23 +371,16 @@ def enumerate_devices_str() -> Generator[str, None, None]:
compilers_results, any_works = [], False
try:
d = Device[device]
default_renderers, default_renderer = d.renderers, d.renderer
try:
for r in default_renderers:
d.renderers = [r]
try:
# d.renderer, d.compiler = r(), c()
with Context(CACHELEVEL=0, **({f"{device}_CC": d._renderer_name(r)} if (ctrl:=f"{device}_CC") in ContextVar._cache else {})):
test = (Tensor([1,2,3], device=device) * 2).tolist()
if test != [2,4,6]: raise ValueError(f"got {test} instead of [2, 4, 6]")
set_text = f'({ctrl}={d._renderer_name(r)} to make default)' if (ctrl:=f"{device}_CC") in ContextVar._cache else ''
default_text = '(default)' if type(default_renderer) is type(d.renderer) else set_text
compilers_results.append(f"{colored('+', 'green')} {d._renderer_name(r)} {default_text}")
any_works = True
except Exception as e: compilers_results.append(f"{colored('-', 'yellow')} {d._renderer_name(r)}: {e}")
finally:
# put the defaults back!
d.renderers = default_renderers
default_renderer = d.renderer
for r in d.renderers:
try:
# d.renderer, d.compiler = r(), c()
with Context(CACHELEVEL=0, DEV=f"{device}:{d._renderer_name(r)}"): test = (Tensor([1,2,3], device=device) * 2).tolist()
if test != [2,4,6]: raise ValueError(f"got {test} instead of [2, 4, 6]")
default_text = '(default)' if type(default_renderer) is type(d.renderer) else f'(DEV={device}:{d._renderer_name(r)} to make default)'
compilers_results.append(f"{colored('+', 'green')} {d._renderer_name(r)} {default_text}")
any_works = True
except Exception as e: compilers_results.append(f"{colored('-', 'yellow')} {d._renderer_name(r)}: {e}")
result = (colored('PASS', 'green') if any_works else f"{colored('FAIL', 'yellow')}") + ''.join([f'\n{" "*16} {x}' for x in compilers_results])
except Exception as e:
result = f"{colored('FAIL', 'red')} {e}"

View file

@ -4,7 +4,7 @@ START_TIME = time.perf_counter()
import os, functools, platform, re, contextlib, operator, hashlib, pickle, sqlite3, tempfile, pathlib, string, ctypes, sys, gzip, getpass, gc
from collections import defaultdict
import subprocess, shutil, math, types, copyreg, inspect, importlib, decimal, itertools
from dataclasses import dataclass, field
from dataclasses import dataclass, field, replace
from typing import ClassVar, Iterable, Any, TypeVar, Callable, Sequence, TypeGuard, Iterator, Generic, Generator, cast, overload
T = TypeVar("T")
@ -176,7 +176,32 @@ class ContextVar(Generic[T]):
assert isinstance(self.value, str)
return [getattr(obj, x) if obj else x for x in self.value.split(',') if x]
DEV, DEBUG, BEAM, NOOPT = ContextVar("DEV", ""), ContextVar("DEBUG", 0), ContextVar("BEAM", 0), ContextVar("NOOPT", 0)
@dataclass(frozen=True)
class Target:
  """A device/renderer pair. Either field may be empty; the string form is "DEVICE", "DEVICE:RENDERER" or ":RENDERER"."""
  device: str = ""
  renderer: str = ""
  @staticmethod
  def parse(s:str) -> "Target":
    """
    Parse "DEVICE[:RENDERER]" into a Target, uppercasing both parts.

    Raises ValueError if `s` contains more than one ':' (previously this surfaced as an opaque TypeError from the constructor).
    """
    parts = [x.upper() for x in s.split(':')]
    if len(parts) > 2: raise ValueError(f"invalid target {s!r}, expected DEVICE[:RENDERER]")
    return Target(*parts)
  # the repr doubles as the string form: the ':' separator is only emitted when a renderer is set
  def __repr__(self) -> str: return self.device + (":" + self.renderer if self.renderer else "")
class _DEV(ContextVar):
  """ContextVar for DEV whose value is exposed as a parsed Target rather than a raw string."""
  _value = Target()
  @property
  def value(self) -> Target: return self._value
  @value.setter
  def value(self, v:str|Target): self._value = v if isinstance(v, Target) else Target.parse(v)
  # only reached for names not found normally: forwards e.g. DEV.device / DEV.renderer to the current Target
  def __getattr__(self, k): return getattr(self.value, k)
  def target(self, dev:str) -> Target:
    """
    Return the Target to use for device `dev`.

    DEV's renderer applies only when DEV names `dev` or names no device; if DEV names a different device it is ignored.
    If a `{dev}_CC` variable is registered and set, it supplies the renderer (asserting DEV did not also set one).
    The returned Target always has `device == dev`.
    """
    t = self.value if self.device == dev or not self.device else Target(device=dev)
    # TODO: remove this once DEV supports secondary targets
    if (cv:=ContextVar._cache.get(f"{dev}_CC", None)) is not None and cv.value:
      assert not t.renderer, f"renderer set in DEV and {dev}_CC"
      # device must be filled in here too: t.device is empty when DEV does not name a device
      return replace(t, device=dev, renderer=cv.value.upper())
    return replace(t, device=dev)
DEV, DEBUG, BEAM, NOOPT = _DEV("DEV", ""), ContextVar("DEBUG", 0), ContextVar("BEAM", 0), ContextVar("NOOPT", 0)
IMAGE, FLOAT16, OPENPILOT_HACKS = ContextVar("IMAGE", 0), ContextVar("FLOAT16", 0), ContextVar("OPENPILOT_HACKS", 0)
JIT, JIT_BATCH_SIZE = ContextVar("JIT", 2 if OSX and ARCH_X86 else 1), ContextVar("JIT_BATCH_SIZE", 32)
WINO, CAPTURING, TRACEMETA = ContextVar("WINO", 0), ContextVar("CAPTURING", 1), ContextVar("TRACEMETA", 1)
@ -193,13 +218,9 @@ EMULATE, EMULATED_DTYPES = ContextVar("EMULATE", ""), ContextVar("EMULATED_DTYPE
CAPTURE_PROCESS_REPLAY = ContextVar("CAPTURE_PROCESS_REPLAY", 0)
CPU_COUNT = ContextVar("CPU_COUNT", max(1, len(os.sched_getaffinity(0)) if hasattr(os, "sched_getaffinity") else (os.cpu_count() or 1)))
# Compilers
CPU_CC, CPU_LLVM, CPU_LVP = ContextVar("CPU_CC", ""), ContextVar("CPU_LLVM", 0), ContextVar("CPU_LVP", 0)
NV_CC, NV_PTX, NV_NAK, NV_NVCC = ContextVar("NV_CC", ""), ContextVar("NV_PTX", 0), ContextVar("NV_NAK", 0), ContextVar("NV_NVCC", 0)
CUDA_CC, CUDA_PTX, CUDA_NVCC = ContextVar("CUDA_CC", ""), ContextVar("CUDA_PTX", 0), ContextVar("CUDA_NVCC", 0)
NULL_QCOMCL, NULL_IR3, NULL_NAK = ContextVar("NULL_QCOMCL", 0), ContextVar("NULL_IR3", 0), ContextVar("NULL_NAK", 0)
CPU_CC, NV_CC, CUDA_CC, NULL_CC = ContextVar("CPU_CC", ""), ContextVar("NV_CC", ""), ContextVar("CUDA_CC", ""), ContextVar("NULL_CC", "")
NULL_ALLOW_COPYOUT = ContextVar("NULL_ALLOW_COPYOUT", 0)
AMD_CC, AMD_LLVM, AMD_HIPCC = ContextVar("AMD_CC", ""), ContextVar("AMD_LLVM", 0), ContextVar("AMD_HIPCC", 0)
QCOM_CC, QCOM_IR3 = ContextVar("QCOM_CC", ""), ContextVar("QCOM_IR3", 0)
AMD_CC, QCOM_CC = ContextVar("AMD_CC", ""), ContextVar("QCOM_CC", "")
# VIZ implies PROFILE, but you can run PROFILE without VIZ
VIZ = ContextVar("VIZ", 0)
PROFILE = ContextVar("PROFILE", abs(VIZ.value))

View file

@ -127,7 +127,7 @@ def __getattr__(nm):
return load("rocprof", "['rocprof-trace-decoder', p:='/usr/local/lib/rocprof-trace-decoder.so', p.replace('so','dylib')]",
[f"{{}}/include/{s}.h" for s in ["rocprof_trace_decoder", "trace_decoder_instrument", "trace_decoder_types"]],
srcs="https://github.com/ROCm/rocprof-trace-decoder/archive/dd0485100971522cc4cd8ae136bdda431061a04d.tar.gz")
case "mesa": return load("mesa", "([] if CPU_CC.value == 'LVP' or bool(CPU_LVP) else ['tinymesa']) + ['tinymesa_cpu']", [
case "mesa": return load("mesa", "([] if CPU_CC.value == 'LVP' or DEV.renderer == 'LVP' else ['tinymesa']) + ['tinymesa_cpu']", [
*[f"{{}}/src/compiler/nir/{s}.h" for s in ["nir", "nir_builder", "nir_shader_compiler_options", "nir_serialize"]], "{}/gen/nir_intrinsics.h",
*[f"{{}}/src/nouveau/{s}.h" for s in ["headers/nv_device_info", "compiler/nak"]],
*[f"{{}}/src/gallium/auxiliary/gallivm/lp_bld{s}.h" for s in ["", "_passmgr", "_misc", "_type", "_init", "_nir", "_struct", "_jit_types",
@ -146,7 +146,7 @@ def __getattr__(nm):
*[f"python3 src/compiler/{s}_h.py > gen/{s.split('/')[-1]}.h" for s in ["nir/nir_opcodes", "nir/nir_builder_opcodes"]],
*[f"python3 src/compiler/nir/nir_{s}_h.py --outdir gen" for s in ["intrinsics", "intrinsics_indices"]]]), cwd=path, shell=True, check=True),
srcs="https://gitlab.freedesktop.org/mesa/mesa/-/archive/mesa-25.2.7/mesa-25.2.7.tar.gz",
prolog=["from tinygrad.helpers import CPU_CC, CPU_LVP", "import gzip, base64"],
prolog=["from tinygrad.helpers import CPU_CC, DEV", "import gzip, base64"],
epilog=lambda path: [system(f"{root}/extra/mesa/lvp_nir_options.sh {path}")])
case "libclang":
return load("libclang", clang_lib,

View file

@ -4,9 +4,9 @@ import ctypes
from typing import Annotated, Literal, TypeAlias
from tinygrad.runtime.support.c import _IO, _IOW, _IOR, _IOWR
from tinygrad.runtime.support import c
from tinygrad.helpers import CPU_CC, CPU_LVP
from tinygrad.helpers import CPU_CC, DEV
import gzip, base64
dll = c.DLL('mesa', ([] if CPU_CC.value == 'LVP' or bool(CPU_LVP) else ['tinymesa']) + ['tinymesa_cpu'])
dll = c.DLL('mesa', ([] if CPU_CC.value == 'LVP' or DEV.renderer == 'LVP' else ['tinymesa']) + ['tinymesa_cpu'])
class struct_u_printf_info(ctypes.Structure): pass
u_printf_info: TypeAlias = struct_u_printf_info
uint32_t: TypeAlias = Annotated[int, ctypes.c_uint32]