mirror of
https://github.com/tinygrad/tinygrad.git
synced 2026-06-24 02:14:17 +00:00
specify renderer in DEV, <dev>_<ren>=1 is deprecated (#15551)
This commit is contained in:
parent
5181c8e23a
commit
acf239e4d2
20 changed files with 204 additions and 173 deletions
36
.github/workflows/benchmark.yml
vendored
36
.github/workflows/benchmark.yml
vendored
|
|
@ -101,10 +101,10 @@ jobs:
|
|||
run: DEV=METAL python3.11 test/opt/test_tensor_cores.py
|
||||
- name: Test AMX tensor cores
|
||||
run: |
|
||||
DEBUG=2 DEV=CPU CPU_LLVM=0 AMX=1 python3.11 test/opt/test_tensor_cores.py
|
||||
DEBUG=2 DEV=CPU CPU_LLVM=1 AMX=1 python3.11 test/opt/test_tensor_cores.py
|
||||
DEBUG=2 DEV=CPU CPU_LLVM=0 AMX=1 python3.11 test/opt/test_gen_float4.py TestFloat4.test_float4_multidim_amx TestFloat4.test_float4_multidim_unaligned_load_amx
|
||||
DEBUG=2 DEV=CPU CPU_LLVM=1 AMX=1 python3.11 test/opt/test_gen_float4.py TestFloat4.test_float4_multidim_amx TestFloat4.test_float4_multidim_unaligned_load_amx
|
||||
DEBUG=2 DEV=CPU AMX=1 python3.11 test/opt/test_tensor_cores.py
|
||||
DEBUG=2 DEV=CPU:LLVM AMX=1 python3.11 test/opt/test_tensor_cores.py
|
||||
DEBUG=2 DEV=CPU AMX=1 python3.11 test/opt/test_gen_float4.py TestFloat4.test_float4_multidim_amx TestFloat4.test_float4_multidim_unaligned_load_amx
|
||||
DEBUG=2 DEV=CPU:LLVM AMX=1 python3.11 test/opt/test_gen_float4.py TestFloat4.test_float4_multidim_amx TestFloat4.test_float4_multidim_unaligned_load_amx
|
||||
- name: Run Tensor Core GEMM (float)
|
||||
run: DEBUG=2 SHOULD_USE_TC=1 python3.11 extra/gemm/simple_matmul.py
|
||||
- name: Run Tensor Core GEMM (half)
|
||||
|
|
@ -193,9 +193,9 @@ jobs:
|
|||
#- name: UsbGPU openpilot test
|
||||
# run: sudo -E PYTHONPATH=. GMMU=0 DEV=AMD AMD_IFACE=USB GRAPH_ONE_KERNEL=1 python3.11 examples/openpilot/compile3.py https://github.com/commaai/openpilot/raw/9118973ed03c1ae1d40cf69a29507ec2cc78efd7/selfdrive/modeld/models/supercombo.onnx
|
||||
- name: UsbGPU (USB4/TB) boot time
|
||||
run: PYTHONPATH=. DEBUG=3 DEV=NV NV_IFACE=PCI NV_NAK=1 time python3.11 test/test_tiny.py TestTiny.test_plus
|
||||
run: PYTHONPATH=. DEBUG=3 DEV=NV:NAK NV_IFACE=PCI time python3.11 test/test_tiny.py TestTiny.test_plus
|
||||
- name: UsbGPU (USB4/TB) tiny tests
|
||||
run: PYTHONPATH=. DEV=NV NV_IFACE=PCI NV_NAK=1 python3.11 test/test_tiny.py
|
||||
run: PYTHONPATH=. DEV=NV:NAK NV_IFACE=PCI python3.11 test/test_tiny.py
|
||||
|
||||
testnvidiabenchmark:
|
||||
name: tinybox green Benchmark
|
||||
|
|
@ -237,7 +237,7 @@ jobs:
|
|||
- name: Test tensor cores
|
||||
run: |
|
||||
DEV=NV ALLOW_TF32=1 python3 test/opt/test_tensor_cores.py
|
||||
DEV=NV NV_PTX=1 ALLOW_TF32=1 python3 test/opt/test_tensor_cores.py
|
||||
DEV=NV:PTX ALLOW_TF32=1 python3 test/opt/test_tensor_cores.py
|
||||
- name: Run Tensor Core GEMM (CUDA)
|
||||
run: |
|
||||
DEV=CUDA SHOULD_USE_TC=1 HALF=1 DEBUG=2 python3 extra/gemm/simple_matmul.py
|
||||
|
|
@ -245,7 +245,7 @@ jobs:
|
|||
DEV=CUDA SHOULD_USE_TC=1 ALLOW_TF32=1 DEBUG=2 ATOL=2e-2 python3 extra/gemm/simple_matmul.py
|
||||
DEV=CUDA SHOULD_USE_TC=1 FP8E4M3=1 DEBUG=2 python3 extra/gemm/simple_matmul.py
|
||||
- name: Run Tensor Core GEMM (PTX)
|
||||
run: DEV=NV NV_PTX=1 SHOULD_USE_TC=1 HALF=1 DEBUG=2 python3 extra/gemm/simple_matmul.py
|
||||
run: DEV=NV:PTX SHOULD_USE_TC=1 HALF=1 DEBUG=2 python3 extra/gemm/simple_matmul.py
|
||||
- name: Run Tensor Core GEMM (NV)
|
||||
run: DEV=NV SHOULD_USE_TC=1 HALF=1 DEBUG=2 python3 extra/gemm/simple_matmul.py
|
||||
- name: Test DEV=NV
|
||||
|
|
@ -328,7 +328,7 @@ jobs:
|
|||
# run: DEV=NV M_START=12 M_STOP=20 M_STEP=1 N_START=6 N_STOP=10 N_STEP=1 K_START=28 K_STOP=36 K_STEP=1 HALF=1 TC_OPT=2 python3 ./extra/gemm/fuzz_matmul.py
|
||||
# TODO: too slow
|
||||
# - name: Fuzz Padded Tensor Core GEMM (PTX)
|
||||
# run: DEV=NV NV_PTX=1 M_START=12 M_STOP=20 M_STEP=1 N_START=6 N_STOP=10 N_STEP=1 K_START=28 K_STOP=36 K_STEP=1 HALF=1 TC_OPT=2 python3 ./extra/gemm/fuzz_matmul.py
|
||||
# run: DEV=NV:PTX M_START=12 M_STOP=20 M_STEP=1 N_START=6 N_STOP=10 N_STEP=1 K_START=28 K_STOP=36 K_STEP=1 HALF=1 TC_OPT=2 python3 ./extra/gemm/fuzz_matmul.py
|
||||
- name: HEVC Decode Benchmark
|
||||
run: VALIDATE=1 MAX_FRAMES=100 ASSERT_FPS=1400 JITBEAM=1 DEV=NV PYTHONPATH=. python3 extra/hevc/decode.py
|
||||
- name: Train MNIST
|
||||
|
|
@ -410,11 +410,11 @@ jobs:
|
|||
# LD_PRELOAD="/opt/rocm/lib/libhsa-runtime64.so" HSA=1 BIG=2 TORCHCUDA=1 python3 test/speed/external_test_speed_v_torch.py
|
||||
- name: Test speed vs theoretical
|
||||
run: DEV=AMD IGNORE_BEAM_CACHE=1 CCACHE=0 BEAM_DEBUG=1 DEBUG=1 python -m pytest -rA test/external/speed_v_theoretical.py --durations=20
|
||||
- name: Test tensor cores AMD_LLVM=0
|
||||
run: DEV=AMD AMD_LLVM=0 python3 test/opt/test_tensor_cores.py
|
||||
- name: Test tensor cores (no LLVM)
|
||||
run: DEV=AMD python3 test/opt/test_tensor_cores.py
|
||||
# TODO: this is flaky
|
||||
# - name: Test tensor cores AMD_LLVM=1
|
||||
# run: DEV=AMD AMD_LLVM=1 python3 test/opt/test_tensor_cores.py
|
||||
# - name: Test tensor cores AMD:LLVM
|
||||
# run: DEV=AMD:LLVM python3 test/opt/test_tensor_cores.py
|
||||
- name: Run Tensor Core GEMM (AMD)
|
||||
run: |
|
||||
DEV=AMD SHOULD_USE_TC=1 BFLOAT16=1 DEBUG=2 python3 extra/gemm/simple_matmul.py
|
||||
|
|
@ -591,7 +591,7 @@ jobs:
|
|||
- name: openpilot compile3 0.11.0 driving_vision
|
||||
run: BENCHMARK_LOG=openpilot_0_11_0_vision PYTHONPATH="." ASSERT_MIN_STEP_TIME=17 DEV=QCOM FLOAT16=1 IMAGE=1 NOLOCALS=1 taskset -c 4-7 python3 examples/openpilot/compile3.py https://github.com/commaai/openpilot/raw/v0.11.0/selfdrive/modeld/models/driving_vision.onnx
|
||||
- name: IR3 openpilot compile3 0.11.0 driving_vision
|
||||
run: BENCHMARK_LOG=ir3_openpilot_0_11_0_vision PYTHONPATH="." ASSERT_MIN_STEP_TIME=17 DEV=QCOM QCOM_IR3=1 FLOAT16=1 IMAGE=1 NOLOCALS=1 taskset -c 4-7 python3 examples/openpilot/compile3.py https://github.com/commaai/openpilot/raw/v0.11.0/selfdrive/modeld/models/driving_vision.onnx
|
||||
run: BENCHMARK_LOG=ir3_openpilot_0_11_0_vision PYTHONPATH="." ASSERT_MIN_STEP_TIME=17 DEV=QCOM:IR3 FLOAT16=1 IMAGE=1 NOLOCALS=1 taskset -c 4-7 python3 examples/openpilot/compile3.py https://github.com/commaai/openpilot/raw/v0.11.0/selfdrive/modeld/models/driving_vision.onnx
|
||||
- name: openpilot compile3 0.11.0 driving_policy
|
||||
run: BENCHMARK_LOG=openpilot_0_11_0_policy PYTHONPATH="." ASSERT_MIN_STEP_TIME=3 DEV=QCOM FLOAT16=1 IMAGE=1 NOLOCALS=1 taskset -c 4-7 python3 examples/openpilot/compile3.py https://github.com/commaai/openpilot/raw/v0.11.0/selfdrive/modeld/models/driving_policy.onnx
|
||||
- name: openpilot compile3 0.11.0 dmonitoring
|
||||
|
|
@ -609,7 +609,7 @@ jobs:
|
|||
# generate quantized weights
|
||||
ln -s /data/home/tiny/tinygrad/extra/datasets/imagenet extra/datasets/imagenet
|
||||
ln -s /data/home/tiny/tinygrad/testsig-*.so .
|
||||
PYTHONPATH=. CC=clang-19 DEV=CPU CPU_LLVM=0 QUANT=1 CNT=0 python3 examples/test_onnx_imagenet.py https://github.com/xamcat/mobcat-samples/raw/refs/heads/master/onnx_runtime/InferencingSample/InferencingSample/mobilenetv2-7.onnx /tmp/model.quant.onnx
|
||||
PYTHONPATH=. CC=clang-19 DEV=CPU QUANT=1 CNT=0 python3 examples/test_onnx_imagenet.py https://github.com/xamcat/mobcat-samples/raw/refs/heads/master/onnx_runtime/InferencingSample/InferencingSample/mobilenetv2-7.onnx /tmp/model.quant.onnx
|
||||
# benchmark on DSP with NOOPT=1, the devectorizer has issues
|
||||
PYTHONPATH=. CC=clang-19 DEV=DSP NOOPT=1 CNT=2 DEBUG=2 python3 examples/test_onnx_imagenet.py /tmp/model.quant.onnx
|
||||
- name: Run process replay tests
|
||||
|
|
@ -632,7 +632,7 @@ jobs:
|
|||
echo "CACHEDB=/tmp/staging.db" >> $GITHUB_ENV
|
||||
rm -f /tmp/staging.db /tmp/staging.db-shm /tmp/staging.db-wal
|
||||
- name: openpilot compile3 0.10.1 driving_vision
|
||||
run: BENCHMARK_LOG=usbgpu_openpilot_0_10_1_vision PYTHONPATH="." GMMU=0 DEV=AMD AMD_LLVM=1 AMD_IFACE=USB ASSERT_MIN_STEP_TIME=50 python3 examples/openpilot/compile3.py https://github.com/commaai/openpilot/raw/720392c9a5b986981fdbed1bb8c47a6c5573a50e/selfdrive/modeld/models/driving_vision.onnx
|
||||
run: BENCHMARK_LOG=usbgpu_openpilot_0_10_1_vision PYTHONPATH="." GMMU=0 DEV=AMD:LLVM AMD_IFACE=USB ASSERT_MIN_STEP_TIME=50 python3 examples/openpilot/compile3.py https://github.com/commaai/openpilot/raw/720392c9a5b986981fdbed1bb8c47a6c5573a50e/selfdrive/modeld/models/driving_vision.onnx
|
||||
- name: openpilot load_pickle 0.10.1 driving_vision
|
||||
run: BENCHMARK_LOG=usbgpu_openpilot_0_10_1_vision_load_pickle PYTHONPATH="." GMMU=0 DEV=AMD AMD_IFACE=USB ASSERT_MIN_LOAD_TIME=15 python3 examples/openpilot/load_pickle.py
|
||||
|
||||
|
|
@ -677,8 +677,8 @@ jobs:
|
|||
# Fails on 9070
|
||||
# - name: Test tensor cores
|
||||
# run: |
|
||||
# DEV=AMD AMD_LLVM=0 python3 test/test_linearizer.py test/opt/test_tensor_cores.py
|
||||
# DEV=AMD AMD_LLVM=1 python3 test/test_linearizer.py test/opt/test_tensor_cores.py
|
||||
# DEV=AMD python3 test/test_linearizer.py test/opt/test_tensor_cores.py
|
||||
# DEV=AMD:LLVM python3 test/test_linearizer.py test/opt/test_tensor_cores.py
|
||||
# DEV=AMD SHOULD_USE_TC=1 BFLOAT16=1 DEBUG=2 python3 extra/gemm/simple_matmul.py
|
||||
- name: Run Tensor Core GEMM (AMD)
|
||||
run: DEV=AMD SHOULD_USE_TC=1 HALF=1 DEBUG=2 ATOL=2e-2 python3 extra/gemm/simple_matmul.py
|
||||
|
|
|
|||
64
.github/workflows/test.yml
vendored
64
.github/workflows/test.yml
vendored
|
|
@ -29,9 +29,9 @@ jobs:
|
|||
deps: testing_unit
|
||||
llvm: 'true'
|
||||
- name: Speed Test
|
||||
run: DEV=CPU CPU_LLVM=1 THREADS=0 python3 test/speed/external_test_speed_v_torch.py
|
||||
run: DEV=CPU:LLVM THREADS=0 python3 test/speed/external_test_speed_v_torch.py
|
||||
- name: Speed Test (BEAM=2)
|
||||
run: BEAM=2 DEV=CPU CPU_LLVM=1 THREADS=0 python3 test/speed/external_test_speed_v_torch.py
|
||||
run: BEAM=2 DEV=CPU:LLVM THREADS=0 python3 test/speed/external_test_speed_v_torch.py
|
||||
|
||||
docs:
|
||||
name: Docs
|
||||
|
|
@ -83,7 +83,7 @@ jobs:
|
|||
run: DEBUG=100 python3 -c "from tinygrad import Tensor; N = 1024; a, b = Tensor.rand(N, N), Tensor.rand(N, N); c = (a.reshape(N, 1, N) * b.T.reshape(1, N, N)).sum(axis=2); print((c.numpy() - (a.numpy() @ b.numpy())).mean())"
|
||||
- name: Compile EfficientNet to C and test it
|
||||
run: |
|
||||
DEV=CPU CPU_LLVM=0 python examples/compile_efficientnet.py > recognize.c
|
||||
DEV=CPU python examples/compile_efficientnet.py > recognize.c
|
||||
clang -O2 recognize.c -lm -o recognize
|
||||
cat test/models/efficientnet/Chicken.jpg | ./recognize | grep cock
|
||||
|
||||
|
|
@ -114,11 +114,11 @@ jobs:
|
|||
- name: Test one op in torch tests
|
||||
run: DEBUG=2 python3 extra/torch_backend/torch_tests.py TestTinyBackendPRIVATEUSE1.test_unary_log_tiny_float32
|
||||
- name: Test Ops with TINY_BACKEND
|
||||
run: DEV=CPU CPU_LLVM=1 LLVMOPT=0 TINY_BACKEND=1 python3 -m pytest -n auto test/backend/test_ops.py --durations=20
|
||||
run: DEV=CPU:LLVM LLVMOPT=0 TINY_BACKEND=1 python3 -m pytest -n auto test/backend/test_ops.py --durations=20
|
||||
- name: Test in-place operations on views
|
||||
run: TORCH_DEBUG=1 python3 extra/torch_backend/test_inplace.py
|
||||
- name: Test multi-gpu
|
||||
run: DEV=CPU CPU_LLVM=1 GPUS=4 TORCH_DEBUG=1 python3 extra/torch_backend/test_multigpu.py
|
||||
run: DEV=CPU:LLVM GPUS=4 TORCH_DEBUG=1 python3 extra/torch_backend/test_multigpu.py
|
||||
- name: Test kernel fusion
|
||||
run: python3 extra/torch_backend/test_kernel_fusion.py
|
||||
|
||||
|
|
@ -423,7 +423,7 @@ jobs:
|
|||
- name: Test openpilot CL compile fp32 (test correctness)
|
||||
run: DEV=CL IMAGE=1 SELFTEST=1 python examples/openpilot/compile3.py https://github.com/haraschax/filedump/raw/refs/heads/master/driving_vision_fp32.onnx
|
||||
- name: Test openpilot LLVM compile fp16
|
||||
run: IMAGE=1 FLOAT16=1 DEV=CPU CPU_LLVM=1 python examples/openpilot/compile3.py https://gitlab.com/commaai/openpilot-lfs.git/gitlab-lfs/objects/cf6376aa9a090f0da26c280ef69eabf9bbdd51d1faac9ed392919c3db69be916
|
||||
run: IMAGE=1 FLOAT16=1 DEV=CPU:LLVM python examples/openpilot/compile3.py https://gitlab.com/commaai/openpilot-lfs.git/gitlab-lfs/objects/cf6376aa9a090f0da26c280ef69eabf9bbdd51d1faac9ed392919c3db69be916
|
||||
- name: Run process replay tests
|
||||
uses: ./.github/actions/process-replay
|
||||
|
||||
|
|
@ -445,15 +445,15 @@ jobs:
|
|||
python-version: '3.12'
|
||||
llvm: 'true'
|
||||
- name: Test ONNX (CPU)
|
||||
run: DEV=CPU CPU_LLVM=0 python -m pytest -n=auto test/external/external_test_onnx_backend.py --durations=20
|
||||
run: DEV=CPU python -m pytest -n=auto test/external/external_test_onnx_backend.py --durations=20
|
||||
- name: Test ONNX (LLVM)
|
||||
run: DEV=CPU CPU_LLVM=1 python -m pytest -n=auto test/external/external_test_onnx_backend.py --durations=20
|
||||
run: DEV=CPU:LLVM python -m pytest -n=auto test/external/external_test_onnx_backend.py --durations=20
|
||||
- name: Test ONNX Runner (CPU)
|
||||
run: DEV=CPU CPU_LLVM=0 python3 test/external/external_test_onnx_runner.py
|
||||
run: DEV=CPU python3 test/external/external_test_onnx_runner.py
|
||||
- name: Test Additional ONNX Ops (CPU)
|
||||
run: DEV=CPU CPU_LLVM=0 python3 test/external/external_test_onnx_ops.py
|
||||
run: DEV=CPU python3 test/external/external_test_onnx_ops.py
|
||||
- name: Test Quantize ONNX
|
||||
run: DEV=CPU CPU_LLVM=0 python3 test/backend/test_quantize_onnx.py
|
||||
run: DEV=CPU python3 test/backend/test_quantize_onnx.py
|
||||
- name: Run process replay tests
|
||||
uses: ./.github/actions/process-replay
|
||||
|
||||
|
|
@ -529,11 +529,11 @@ jobs:
|
|||
opencl: 'true'
|
||||
llvm: 'true'
|
||||
- name: Test models (llvm)
|
||||
run: DEV=CPU CPU_LLVM=1 python -m pytest -n=auto test/models --durations=20
|
||||
run: DEV=CPU:LLVM python -m pytest -n=auto test/models --durations=20
|
||||
- name: Test models (opencl)
|
||||
run: DEV=CL python -m pytest -n=auto test/models --durations=20
|
||||
- name: Test models (cpu)
|
||||
run: DEV=CPU CPU_LLVM=0 python -m pytest -n=auto test/models --durations=20
|
||||
run: DEV=CPU python -m pytest -n=auto test/models --durations=20
|
||||
- name: Run process replay tests
|
||||
uses: ./.github/actions/process-replay
|
||||
|
||||
|
|
@ -572,11 +572,11 @@ jobs:
|
|||
pydeps: "pillow"
|
||||
llvm: "true"
|
||||
- name: Test LLVM=1 DEVECTORIZE=0
|
||||
run: DEV=CPU CPU_LLVM=1 DEVECTORIZE=0 python3 -m pytest -n auto test/test_tiny.py test/backend/test_ops.py
|
||||
run: DEV=CPU:LLVM DEVECTORIZE=0 python3 -m pytest -n auto test/test_tiny.py test/backend/test_ops.py
|
||||
- name: Test LLVM=1 DEVECTORIZE=0 for model
|
||||
run: DEV=CPU CPU_LLVM=1 DEVECTORIZE=0 python3 test/models/test_efficientnet.py
|
||||
run: DEV=CPU:LLVM DEVECTORIZE=0 python3 test/models/test_efficientnet.py
|
||||
- name: Test DEV=CPU DEVECTORIZE=0
|
||||
run: DEV=CPU CPU_LLVM=0 DEVECTORIZE=0 python3 -m pytest -n auto test/test_tiny.py test/backend/test_ops.py
|
||||
run: DEV=CPU DEVECTORIZE=0 python3 -m pytest -n auto test/test_tiny.py test/backend/test_ops.py
|
||||
|
||||
testdsp:
|
||||
name: Linux (DSP)
|
||||
|
|
@ -667,9 +667,9 @@ jobs:
|
|||
- name: Install rocprof-trace-decoder
|
||||
run: sudo PYTHONPATH="." ./extra/sqtt/install_rocprof_decoder.py
|
||||
- name: Run AMD renderer tests
|
||||
run: AMD_LLVM=0 python -m pytest -n=auto test/amd/ --durations 20
|
||||
- name: Run AMD renderer tests (AMD_LLVM=1)
|
||||
run: AMD_LLVM=1 python -m pytest -n=auto test/amd/ --durations 20
|
||||
run: python -m pytest -n=auto test/amd/ --durations 20
|
||||
- name: Run AMD renderer tests (AMD:LLVM)
|
||||
run: DEV=AMD:LLVM python -m pytest -n=auto test/amd/ --durations 20
|
||||
- name: Run SQTT profiling tests
|
||||
run: PROFILE=1 SQTT=1 python3 -m pytest -n=auto test/amd/test_sqtt_profiler.py
|
||||
- name: Run AMD emulated tests on NULL backend
|
||||
|
|
@ -681,7 +681,7 @@ jobs:
|
|||
- name: Run ASM matmul on MOCKGPU
|
||||
run: PYTHONPATH="." DEV=AMD MOCKGPU=1 N=256 python3 extra/gemm/amd_asm_matmul.py
|
||||
- name: Run LLVM test
|
||||
run: AMD_LLVM=1 python test/device/test_amd_llvm.py
|
||||
run: DEV=AMD:LLVM python test/device/test_amd_llvm.py
|
||||
|
||||
testmockam:
|
||||
name: Linux (am)
|
||||
|
|
@ -726,11 +726,10 @@ jobs:
|
|||
runs-on: ubuntu-22.04
|
||||
timeout-minutes: 15
|
||||
env:
|
||||
DEV: AMD
|
||||
DEV: AMD${{ matrix.backend == 'amdllvm' && ':LLVM' || '' }}
|
||||
MOCKGPU: 1
|
||||
MOCKGPU_ARCH: ${{ matrix.arch }}
|
||||
SKIP_SLOW_TEST: 1
|
||||
AMD_LLVM: ${{ matrix.backend == 'amdllvm' && '1' || matrix.backend != 'amdllvm' && '0' }}
|
||||
steps:
|
||||
- name: Checkout Code
|
||||
uses: actions/checkout@v6
|
||||
|
|
@ -777,7 +776,7 @@ jobs:
|
|||
cuda: 'true'
|
||||
ocelot: 'true'
|
||||
- name: Set env
|
||||
run: printf "${{ matrix.backend == 'ptx' && 'DEV=CUDA\nCUDA_PTX=1' || matrix.backend == 'nv' && 'DEV=NV\nSKIP_SLOW_TEST=1' }}" >> $GITHUB_ENV
|
||||
run: printf "${{ matrix.backend == 'ptx' && 'DEV=CUDA:PTX' || matrix.backend == 'nv' && 'DEV=NV\nSKIP_SLOW_TEST=1' }}" >> $GITHUB_ENV
|
||||
- name: Check Device.DEFAULT and print some source
|
||||
run: |
|
||||
python3 -c "from tinygrad import Device; assert Device.DEFAULT in ['CUDA','NV'], Device.DEFAULT"
|
||||
|
|
@ -811,7 +810,7 @@ jobs:
|
|||
llvm: ${{ matrix.backend == 'llvm' || matrix.backend == 'lvp' }}
|
||||
mesa: ${{ matrix.backend == 'lvp' && 'true' }}
|
||||
- name: Set env
|
||||
run: printf "${{ matrix.backend == 'llvm' && 'DEV=CPU\nCPU_LLVM=1' || matrix.backend == 'cpu' && 'DEV=CPU\nCPU_LLVM=0\nCPU_COUNT=2' || matrix.backend == 'opencl' && 'DEV=CL' || matrix.backend == 'lvp' && 'DEV=CPU\nCPU_LVP=1' }}" >> $GITHUB_ENV
|
||||
run: printf "${{ matrix.backend == 'llvm' && 'DEV=CPU:LLVM' || matrix.backend == 'cpu' && 'DEV=CPU\nCPU_COUNT=2' || matrix.backend == 'opencl' && 'DEV=CL' || matrix.backend == 'lvp' && 'DEV=CPU:LVP' }}" >> $GITHUB_ENV
|
||||
- name: Check Device.DEFAULT and print some source
|
||||
run: |
|
||||
python3 -c "from tinygrad import Device; assert Device.DEFAULT in ['CPU','CL'], Device.DEFAULT"
|
||||
|
|
@ -864,23 +863,20 @@ jobs:
|
|||
env:
|
||||
MOCKGPU: 1
|
||||
DEV: AMD
|
||||
AMD_LLVM: 0
|
||||
FORWARD_ONLY: 1
|
||||
run: |
|
||||
python3 -m pytest -n=auto test/device/test_hcq.py test/test_tiny.py --durations=20
|
||||
- name: Run pytest (amd with llvm backend)
|
||||
env:
|
||||
MOCKGPU: 1
|
||||
DEV: AMD
|
||||
AMD_LLVM: 1
|
||||
DEV: "AMD:LLVM"
|
||||
FORWARD_ONLY: 1
|
||||
run: |
|
||||
python -m pytest -n=auto test/device/test_hcq.py test/test_tiny.py test/device/test_amd_llvm.py --durations=20
|
||||
- name: Run pytest (ptx)
|
||||
env:
|
||||
MOCKGPU: 1
|
||||
NV_PTX: 1
|
||||
DEV: NV
|
||||
DEV: "NV:PTX"
|
||||
FORWARD_ONLY: 1
|
||||
# TODO: failing due to library loading error
|
||||
CAPTURE_PROCESS_REPLAY: 0
|
||||
|
|
@ -945,7 +941,7 @@ jobs:
|
|||
llvm: ${{ matrix.backend == 'llvm' || matrix.backend == 'lvp' }}
|
||||
mesa: ${{ matrix.backend == 'lvp' && 'true' }}
|
||||
- name: Set env
|
||||
run: printf "${{ matrix.backend == 'llvm' && 'DEV=CPU\nCPU_LLVM=1' || matrix.backend == 'cpu' && 'DEV=CPU\nCPU_LLVM=0\nCPU_COUNT=2' || matrix.backend == 'metal' && 'DEV=METAL' || matrix.backend == 'lvp' && 'DEV=CPU\nCPU_LVP=1' }}" >> $GITHUB_ENV
|
||||
run: printf "${{ matrix.backend == 'llvm' && 'DEV=CPU:LLVM' || matrix.backend == 'cpu' && 'DEV=CPU\nCPU_COUNT=2' || matrix.backend == 'metal' && 'DEV=METAL' || matrix.backend == 'lvp' && 'DEV=CPU:LVP' }}" >> $GITHUB_ENV
|
||||
- name: Check Device.DEFAULT and print some source
|
||||
run: |
|
||||
python -c "from tinygrad import Device; assert Device.DEFAULT == {'LLVM':'CPU','LVP':'CPU'}.get(x:='${{ matrix.backend }}'.upper(), x), Device.DEFAULT"
|
||||
|
|
@ -980,7 +976,7 @@ jobs:
|
|||
pydeps: ${{ matrix.backend == 'webgpu' && 'dawn-python' || '' }}
|
||||
- name: Set env
|
||||
shell: bash
|
||||
run: printf "${{ matrix.backend == 'llvm' && 'DEV=CPU\nCPU_LLVM=1' || matrix.backend == 'cpu' && 'DEV=CPU\nCPU_LLVM=0\nCPU_COUNT=2' || matrix.backend == 'webgpu' && 'DEV=WEBGPU'}}" >> $GITHUB_ENV
|
||||
run: printf "${{ matrix.backend == 'llvm' && 'DEV=CPU:LLVM' || matrix.backend == 'cpu' && 'DEV=CPU\nCPU_COUNT=2' || matrix.backend == 'webgpu' && 'DEV=WEBGPU'}}" >> $GITHUB_ENV
|
||||
- name: Run unit tests
|
||||
if: matrix.backend=='llvm'
|
||||
# test_newton_schulz hits RecursionError
|
||||
|
|
@ -988,7 +984,7 @@ jobs:
|
|||
- name: Run NULL backend tests
|
||||
if: matrix.backend=='llvm'
|
||||
shell: bash
|
||||
run: CPU=0 CPU_LLVM=0 DEV=NULL python -m pytest -n=auto test/null/ --ignore=test/null/test_elf.py --durations=20
|
||||
run: DEV=NULL python -m pytest -n=auto test/null/ --ignore=test/null/test_elf.py --durations=20
|
||||
- name: Run pytest (${{ matrix.backend }})
|
||||
shell: bash
|
||||
run: |
|
||||
|
|
@ -1017,7 +1013,7 @@ jobs:
|
|||
python-version: '3.12'
|
||||
- name: Set env
|
||||
shell: bash
|
||||
run: printf "DEV=NULL\nNULL_ALLOW_COPYOUT=1\n${{ matrix.backend == 'ir3' && 'NULL_IR3=1' || matrix.backend == 'nak' && 'NULL_NAK=1' }}" >> $GITHUB_ENV
|
||||
run: printf "NULL_ALLOW_COPYOUT=1\n${{ matrix.backend == 'ir3' && 'DEV=NULL:IR3' || matrix.backend == 'nak' && 'DEV=NULL:NAK' }}" >> $GITHUB_ENV
|
||||
- name: Run test_ops
|
||||
shell: bash
|
||||
run: |
|
||||
|
|
@ -1040,7 +1036,7 @@ jobs:
|
|||
python-version: '3.12'
|
||||
- name: Set env
|
||||
shell: bash
|
||||
run: printf "DEV=NULL\nNULL_ALLOW_COPYOUT=1\nNULL_QCOMCL=1" >> $GITHUB_ENV
|
||||
run: printf "DEV=NULL:QCOMCL\nNULL_ALLOW_COPYOUT=1" >> $GITHUB_ENV
|
||||
- name: Run test_ops
|
||||
shell: bash
|
||||
run: |
|
||||
|
|
|
|||
|
|
@ -28,6 +28,13 @@ The columns of this list are: Variable, Possible Value(s) and Description.
|
|||
|
||||
These control the behavior of core tinygrad even when used as a library.
|
||||
|
||||
### DEV variable
|
||||
|
||||
The `DEV` variable deserves special note due to its more nuanced syntax.
|
||||
`DEV` is used to specify the target device and target renderer for said device, separated by colons.
|
||||
Specifying the renderer is optional; omitting it causes tinygrad to automatically select a renderer from those
|
||||
available on the system. Some example values for `DEV` are: `AMD`, `AMD:LLVM`, `NV:PTX`, etc.
|
||||
|
||||
Variable | Possible Value(s) | Description
|
||||
---|---|---
|
||||
DEBUG | [1-7] | enable debugging output (operations, timings, speed, generated code and more)
|
||||
|
|
|
|||
|
|
@ -4,13 +4,13 @@ tinygrad supports various runtimes, enabling your code to scale across a wide ra
|
|||
|
||||
| Runtime | Description | Compiler Options | Requirements |
|
||||
|---------|-------------|------------------|--------------|
|
||||
| [NV](https://github.com/tinygrad/tinygrad/tree/master/tinygrad/runtime/ops_nv.py) | Provides acceleration for NVIDIA GPUs | nvrtc (default)<br>PTX (`NV_PTX=1`) | Ampere/Ada/Blackwell series GPUs.<br>You can select an interface via `NV_IFACE=(NVK\|PCI)`. See [NV interfaces](#nv-interfaces) for details. |
|
||||
| [AMD](https://github.com/tinygrad/tinygrad/tree/master/tinygrad/runtime/ops_amd.py) | Provides acceleration for AMD GPUs | LLVM (`AMD_LLVM=1`)<br>HIP/COMGR (`AMD_HIP=1`) | RDNA2 or newer GPUs.<br>You can select an interface via `AMD_IFACE=(KFD\|PCI\|USB)`. See [AMD interfaces](#amd-interfaces) for details. |
|
||||
| [NV](https://github.com/tinygrad/tinygrad/tree/master/tinygrad/runtime/ops_nv.py) | Provides acceleration for NVIDIA GPUs | nvrtc (default)<br>PTX (`DEV=NV:PTX`) | Ampere/Ada/Blackwell series GPUs.<br>You can select an interface via `NV_IFACE=(NVK\|PCI)`. See [NV interfaces](#nv-interfaces) for details. |
|
||||
| [AMD](https://github.com/tinygrad/tinygrad/tree/master/tinygrad/runtime/ops_amd.py) | Provides acceleration for AMD GPUs | LLVM (`DEV=AMD:LLVM`)<br>HIP/COMGR (`DEV=AMD:HIP`) | RDNA2 or newer GPUs.<br>You can select an interface via `AMD_IFACE=(KFD\|PCI\|USB)`. See [AMD interfaces](#amd-interfaces) for details. |
|
||||
| [QCOM](https://github.com/tinygrad/tinygrad/tree/master/tinygrad/runtime/ops_qcom.py) | Provides acceleration for QCOM GPUs | - | 6xx series GPUs |
|
||||
| [METAL](https://github.com/tinygrad/tinygrad/tree/master/tinygrad/runtime/ops_metal.py) | Utilizes Metal for acceleration on Apple devices | - | M1+ Macs; Metal 3.0+ for `bfloat` support |
|
||||
| [CUDA](https://github.com/tinygrad/tinygrad/tree/master/tinygrad/runtime/ops_cuda.py) | Utilizes CUDA for acceleration on NVIDIA GPUs | nvrtc (default)<br> PTX (`CUDA_PTX=1`) | NVIDIA GPU with CUDA support |
|
||||
| [CUDA](https://github.com/tinygrad/tinygrad/tree/master/tinygrad/runtime/ops_cuda.py) | Utilizes CUDA for acceleration on NVIDIA GPUs | nvrtc (default)<br> PTX (`DEV=CUDA:PTX`) | NVIDIA GPU with CUDA support |
|
||||
| [CL](https://github.com/tinygrad/tinygrad/tree/master/tinygrad/runtime/ops_cl.py) | Accelerates computations using OpenCL on GPUs | - | OpenCL 2.0 compatible device |
|
||||
| [CPU](https://github.com/tinygrad/tinygrad/tree/master/tinygrad/runtime/ops_cpu.py) | Runs on CPU using the clang or llvm compiler | Clang JIT (default)<br>LLVM IR (`CPU_LLVM=1`) | `clang` compiler in system `PATH` |
|
||||
| [CPU](https://github.com/tinygrad/tinygrad/tree/master/tinygrad/runtime/ops_cpu.py) | Runs on CPU using the clang or llvm compiler | Clang JIT (default)<br>LLVM IR (`DEV=CPU:LLVM`) | `clang` compiler in system `PATH` |
|
||||
| [WEBGPU](https://github.com/tinygrad/tinygrad/tree/master/tinygrad/runtime/ops_webgpu.py) | Runs on GPU using the Dawn WebGPU engine (used in Google Chrome) | - | Dawn library installed and discoverable. Binaries: [pydawn v0.3.0](https://github.com/wpmed92/pydawn/releases/tag/v0.3.0) |
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -22,7 +22,7 @@ This will produce a binary in the `extra/remu/target/release` directory.
|
|||
|
||||
The latest binaries are released in https://github.com/Qazalin/remu/releases. Alternatively, you can [build locally](#build-locally).
|
||||
|
||||
Tinygrad does not yet output RDNA3 kernels directly. You can either install comgr or use `AMD_LLVM=1` (default) if you have [LLVM@19](https://github.com/tinygrad/tinygrad/blob/e2ed673c946c8f1774d816c75e52a994c2dd8a88/.github/actions/setup-tinygrad/action.yml#L208).
|
||||
Tinygrad does not yet output RDNA3 kernels directly. You can either install comgr or use `DEV=AMD:LLVM` (default) if you have [LLVM@19](https://github.com/tinygrad/tinygrad/blob/e2ed673c946c8f1774d816c75e52a994c2dd8a88/.github/actions/setup-tinygrad/action.yml#L208).
|
||||
|
||||
`PYTHONPATH="." MOCKGPU=1 DEV=AMD python test/test_tiny.py TestTiny.test_plus` runs an emulated RDNA3 kernel with Remu.
|
||||
|
||||
|
|
|
|||
|
|
@ -83,7 +83,7 @@ class TestFmacE64(unittest.TestCase):
|
|||
self.assertAlmostEqual(i2f(st.vgpr[0][2]), 7.0, places=5)
|
||||
|
||||
def test_v_fmac_f32_e64_with_sgpr_sources(self):
|
||||
"""V_FMAC_F32_E64 with SGPR sources (common in AMD_LLVM output).
|
||||
"""V_FMAC_F32_E64 with SGPR sources (common in AMD:LLVM output).
|
||||
|
||||
This tests the exact pattern that was failing: v_fmac_f32_e64(v[0], s[4], 0)
|
||||
where src0 is SGPR and src1 is inline constant 0.
|
||||
|
|
@ -936,7 +936,7 @@ class TestF16Modifiers(unittest.TestCase):
|
|||
def test_v_fmac_f16_hi_dest(self):
|
||||
"""v_fmac_f16 with .h destination: dst.h = src0 * src1 + dst.h.
|
||||
|
||||
This tests the case from AMD_LLVM sin(0) where V_FMAC_F16 writes to v0.h.
|
||||
This tests the case from AMD:LLVM sin(0) where V_FMAC_F16 writes to v0.h.
|
||||
"""
|
||||
instructions = [
|
||||
s_mov_b32(s[0], 0x38003c00), # v0 = {hi=0.5, lo=1.0}
|
||||
|
|
|
|||
|
|
@ -504,7 +504,7 @@ class TestTinygradKernels(unittest.TestCase):
|
|||
x_np = np.random.uniform(-2, 2, (33,)).astype(np.float32)
|
||||
self._test_kernel(lambda T: (T(x_np.tolist()) > 0.5).unsqueeze(-1).expand(33, 3).flatten().sum())
|
||||
|
||||
@unittest.skip("slow and broken with AMD_LLVM=1")
|
||||
@unittest.skip("slow and broken with AMD:LLVM")
|
||||
def test_nonzero(self):
|
||||
"""Test nonzero operation - counts and gathers indices of non-zero elements."""
|
||||
import numpy as np
|
||||
|
|
|
|||
|
|
@ -60,11 +60,11 @@ class TestTinygradKernelRoundtrip(unittest.TestCase):
|
|||
from test.amd.test_compare_emulators import get_kernels_from_tinygrad
|
||||
from tinygrad.runtime.support.elf import elf_loader
|
||||
from tinygrad.runtime.support.compiler_amd import HIPCompiler, AMDLLVMCompiler
|
||||
from tinygrad.helpers import AMD_LLVM
|
||||
from tinygrad.helpers import DEV
|
||||
|
||||
kernels, _, _ = get_kernels_from_tinygrad(op_fn)
|
||||
# rendered source can be C or llvmir
|
||||
compiler = (AMDLLVMCompiler if AMD_LLVM else HIPCompiler)(get_target(arch))
|
||||
compiler = (AMDLLVMCompiler if DEV.renderer == "LLVM" else HIPCompiler)(get_target(arch))
|
||||
|
||||
# First pass: decode all instructions and collect info
|
||||
decoded_instrs: list[tuple] = [] # list of (ki, offset, orig_bytes, decoded, our_disasm, decode_ok, decode_err)
|
||||
|
|
|
|||
|
|
@ -2,7 +2,7 @@ import time, math, unittest, functools, platform, warnings
|
|||
import numpy as np
|
||||
from typing import List, Callable
|
||||
import torch
|
||||
from tinygrad.helpers import getenv, IMAGE, DEBUG, CI, Context, CPU_LLVM, AMD_LLVM, EMULATE
|
||||
from tinygrad.helpers import getenv, CI, DEBUG, DEV, EMULATE, IMAGE, Context
|
||||
from tinygrad import Tensor, Device, dtypes
|
||||
from tinygrad.tensor import _to_np_dtype
|
||||
from tinygrad.device import is_dtype_supported
|
||||
|
|
@ -560,7 +560,7 @@ class TestOps(unittest.TestCase):
|
|||
helper_test_op([(45,65), (45,65)], lambda x,y: x/y)
|
||||
helper_test_op([(), ()], lambda x,y: x/y)
|
||||
|
||||
@unittest.skipIf(Device.DEFAULT == "AMD" and AMD_LLVM, "AMD with LLVM backend generate rcp in FP division causes trunc/floor errors")
|
||||
@unittest.skipIf(Device.DEFAULT == "AMD" and DEV.renderer == "LLVM", "AMD with LLVM backend generate rcp in FP division causes trunc/floor errors")
|
||||
def test_div_rounding_mode(self):
|
||||
for denominator in [-10, -5, -3, -2, -1, 1, 2, 3, 5, 10]:
|
||||
# int numerator
|
||||
|
|
@ -843,7 +843,7 @@ class TestOps(unittest.TestCase):
|
|||
self.assertEqual(a, b)
|
||||
self.assertEqual(Tensor(-1).contiguous().idiv(4).item(), 0) # NOTE this is trunc-div behaviour
|
||||
|
||||
@unittest.skipIf(getenv("NV_NAK"), "MUFU.SIN is not accurate enough")
|
||||
@unittest.skipIf(DEV.renderer == "NAK", "MUFU.SIN is not accurate enough")
|
||||
def test_sin(self):
|
||||
helper_test_op([(45,65)], lambda x: x.sin())
|
||||
helper_test_op([()], lambda x: x.sin())
|
||||
|
|
@ -853,7 +853,7 @@ class TestOps(unittest.TestCase):
|
|||
helper_test_op(None, lambda x: x.sin(), vals=[[1e1, 1e2, 1e3, 1e4, 1e5, 1e6, -1e1, -1e2, -1e3, -1e4, -1e5, -1e6]],
|
||||
atol=3e-3, rtol=3e-3, grad_atol=3e-3, grad_rtol=3e-3)
|
||||
@unittest.skipIf(Device.DEFAULT == "WEBGPU" and platform.system() == "Windows", "Not accurate enough with DirectX backend")
|
||||
@unittest.skipIf(getenv("NV_NAK"), "MUFU.SIN is not accurate enough")
|
||||
@unittest.skipIf(DEV.renderer == "NAK", "MUFU.SIN is not accurate enough")
|
||||
def test_cos(self):
|
||||
helper_test_op([(45,65)], lambda x: x.cos())
|
||||
helper_test_op([()], lambda x: x.cos())
|
||||
|
|
@ -862,7 +862,7 @@ class TestOps(unittest.TestCase):
|
|||
helper_test_op(None, lambda x: x.cos(), vals=[[1e1, 1e2, 1e3, 1e4, 1e5, 1e6, -1e1, -1e2, -1e3, -1e4, -1e5, -1e6]],
|
||||
atol=3e-3, rtol=3e-3, grad_atol=3e-3, grad_rtol=3e-3)
|
||||
@unittest.skipIf(Device.DEFAULT == "WEBGPU" and platform.system() == "Windows", "Not accurate enough with DirectX backend")
|
||||
@unittest.skipIf(getenv("NV_NAK"), "MUFU.SIN is not accurate enough")
|
||||
@unittest.skipIf(DEV.renderer == "NAK", "MUFU.SIN is not accurate enough")
|
||||
def test_tan(self):
|
||||
# NOTE: backward has much higher diff with input close to pi/2 and -pi/2
|
||||
helper_test_op([(45,65)], lambda x: x.tan(), low=-1.5, high=1.5)
|
||||
|
|
@ -1398,7 +1398,7 @@ class TestOps(unittest.TestCase):
|
|||
np.arange(64,128,dtype=np.float32).reshape(8,8)])
|
||||
def test_small_gemm_eye(self):
|
||||
helper_test_op(None, lambda x,y: x.matmul(y), lambda x,y: x@y, vals=[np.eye(8).astype(np.float32), np.eye(8).astype(np.float32)])
|
||||
@unittest.skipIf(CI and Device.DEFAULT in ["NV", "CL", "CUDA"] or (Device.DEFAULT == "CPU" and CPU_LLVM) or IMAGE
|
||||
@unittest.skipIf(CI and Device.DEFAULT in ["NV", "CL", "CUDA"] or (Device.DEFAULT == "CPU" and DEV.renderer == "LLVM") or IMAGE
|
||||
or (Device.DEFAULT == "WEBGPU" and platform.system() == "Windows"), "not supported on these in CI/IMAGE")
|
||||
@unittest.skipIf(Device.DEFAULT == "QCOM", "not precise enough")
|
||||
def test_gemm_fp16(self):
|
||||
|
|
@ -2396,7 +2396,7 @@ class TestOps(unittest.TestCase):
|
|||
lambda x,w: torch.nn.functional.conv2d(x,w,stride=2),
|
||||
lambda x,w: Tensor.conv2d(x,w,stride=2))
|
||||
|
||||
@unittest.skipUnless(Device.DEFAULT == "CPU" and CPU_LLVM, "DEVECTORIZE=0 only for LLVM")
|
||||
@unittest.skipUnless(Device.DEFAULT == "CPU" and DEV.renderer == "LLVM", "DEVECTORIZE=0 only for LLVM")
|
||||
def test_strided_conv2d_simple_vec(self):
|
||||
with Context(DEVECTORIZE=0): self.test_strided_conv2d_simple()
|
||||
|
||||
|
|
@ -2715,9 +2715,9 @@ class TestOps(unittest.TestCase):
|
|||
lambda x: Tensor.avg_pool2d(x, kernel_size=(11,28)), rtol=1e-5)
|
||||
|
||||
def test_avg_pool3d(self):
|
||||
# TODO: AMD_LLVM has larger atol
|
||||
# TODO: AMD:LLVM has larger atol
|
||||
# TODO: DEV=PYTHON backward hangs?
|
||||
atol = 1e-2 if AMD_LLVM else 1e-6
|
||||
atol = 1e-2 if DEV.device == "AMD" and DEV.renderer == "LLVM" else 1e-6
|
||||
helper_test_op([(1,1,16,16,16)],
|
||||
lambda x: torch.nn.functional.avg_pool3d(x, kernel_size=(8,8,8), stride=5, padding=1, count_include_pad=False),
|
||||
lambda x: Tensor.avg_pool2d(x, kernel_size=(8,8,8), stride=5, padding=1, count_include_pad=False), atol=atol, rtol=1e-5, forward_only=True)
|
||||
|
|
|
|||
|
|
@ -1,7 +1,7 @@
|
|||
import unittest
|
||||
from tinygrad import Tensor, Device, dtypes
|
||||
from tinygrad.tensor import _to_np_dtype
|
||||
from tinygrad.helpers import Context, getenv, CI, OSX
|
||||
from tinygrad.helpers import Context, getenv, CI, DEV, OSX
|
||||
from test.backend.test_schedule import check_schedule
|
||||
from test.backend.test_dtype_alu import ht, dtypes_float
|
||||
from tinygrad.device import is_dtype_supported
|
||||
|
|
@ -188,7 +188,7 @@ class TestTranscendentalVectorized(unittest.TestCase):
|
|||
for vec_size in [1,2,3,4,5,127,128]: self._test_vectorized_op(Tensor.log2, np.log2, (0.001, 200), vec_size)
|
||||
|
||||
@unittest.skipIf(Device.DEFAULT == "DSP", "requires int division")
|
||||
@unittest.skipIf(getenv("NV_NAK"), "MUFU.SIN is not accurate enough")
|
||||
@unittest.skipIf(DEV.renderer == "NAK", "MUFU.SIN is not accurate enough")
|
||||
@unittest.skipIf(Device.DEFAULT == "WEBGPU" and OSX, "WEBGPU Metal backend is not accurate enough")
|
||||
def test_sin_vectorized(self):
|
||||
for vec_size in [1,2,3,4,5,127,128]: self._test_vectorized_op(Tensor.sin, np.sin, (-100, 100), vec_size)
|
||||
|
|
|
|||
|
|
@ -11,8 +11,7 @@ from examples.stable_diffusion import AutoencoderKL
|
|||
def set_eval_params():
|
||||
# override these as needed from cli
|
||||
for k,v in {"MODEL": "stable_diffusion", "GPUS": "8", "EVAL_SAMPLES": "600", "CONTEXT_BS": "816", "DENOISE_BS": "600", "DECODE_BS": "384",
|
||||
"INCEPTION_BS": "560", "CLIP_BS": "240", "DATADIR": "/raid/datasets/stable_diffusion", "CKPTDIR": "/raid/weights/stable_diffusion",
|
||||
"AMD_LLVM": "0"}.items():
|
||||
"INCEPTION_BS": "560", "CLIP_BS": "240", "DATADIR": "/raid/datasets/stable_diffusion", "CKPTDIR": "/raid/weights/stable_diffusion"}.items():
|
||||
os.environ[k] = getenv(k, v)
|
||||
|
||||
class TestEval(unittest.TestCase):
|
||||
|
|
|
|||
|
|
@ -1,7 +1,7 @@
|
|||
An integrated environment for AMD GPU assembly and emulation
|
||||
|
||||
Test with `pytest -n12 test/amd/`
|
||||
`AMD_LLVM=1 pytest -n12 test/amd/`
|
||||
`DEV=AMD:LLVM pytest -n12 test/amd/`
|
||||
|
||||
* dsl.py -- helpers for the autogen instruction classes in `__init__.py`. should be standalone with init
|
||||
* test/mockgpu/amd/emu.py -- an emulator for RDNA that runs in tinygrad with `DEV=AMD MOCKGPU=1 PYTHON_REMU=1`
|
||||
|
|
@ -20,13 +20,13 @@ test_llvm.py tests asm/disasm on the LLVM tests, confirming it behaves the same
|
|||
|
||||
tinygrad's dtype tests should pass with and without LLVM. they run in about 12 seconds.
|
||||
|
||||
`DEV=AMD PYTHON_REMU=1 MOCKGPU=1 AMD_LLVM=0 pytest -n=12 test/backend/test_dtype_alu.py test/backend/test_dtype.py`
|
||||
`DEV=AMD PYTHON_REMU=1 MOCKGPU=1 AMD_LLVM=1 pytest -n=12 test/backend/test_dtype_alu.py test/backend/test_dtype.py`
|
||||
`DEV=AMD PYTHON_REMU=1 MOCKGPU=1 pytest -n=12 test/backend/test_dtype_alu.py test/backend/test_dtype.py`
|
||||
`DEV=AMD:LLVM PYTHON_REMU=1 MOCKGPU=1 pytest -n=12 test/backend/test_dtype_alu.py test/backend/test_dtype.py`
|
||||
|
||||
The ops tests also pass, but they are very slow, so you should run them one at a time.
|
||||
|
||||
`SKIP_SLOW_TEST=1 DEV=AMD PYTHON_REMU=1 MOCKGPU=1 AMD_LLVM=0 pytest -n=12 test/backend/test_ops.py`
|
||||
`SKIP_SLOW_TEST=1 DEV=AMD PYTHON_REMU=1 MOCKGPU=1 AMD_LLVM=1 pytest -n=12 test/backend/test_ops.py`
|
||||
`SKIP_SLOW_TEST=1 DEV=AMD PYTHON_REMU=1 MOCKGPU=1 pytest -n=12 test/backend/test_ops.py`
|
||||
`SKIP_SLOW_TEST=1 DEV=AMD:LLVM PYTHON_REMU=1 MOCKGPU=1 pytest -n=12 test/backend/test_ops.py`
|
||||
|
||||
When something is caught by main tinygrad tests, a local regression test should be added to `test/amd`.
|
||||
While working with tinygrad, you can dump the assembly with `DEBUG=7`. These tests all pass on real hardware
|
||||
|
|
|
|||
|
|
@ -1,7 +1,7 @@
|
|||
import unittest, io
|
||||
from contextlib import redirect_stdout
|
||||
from tinygrad import Tensor, dtypes, Device
|
||||
from tinygrad.helpers import OSX, CPU_LLVM, CPU_LVP
|
||||
from tinygrad.helpers import OSX, DEV
|
||||
from tinygrad.device import is_dtype_supported
|
||||
from tinygrad.engine.realize import get_program
|
||||
|
||||
|
|
@ -18,7 +18,7 @@ class TestCompileFailures(unittest.TestCase):
|
|||
|
||||
class TestDisassembly(unittest.TestCase):
|
||||
# TODO: fails on llvm. llvm.LLVMGetHostCPUName() returns "generic"
|
||||
@unittest.skipUnless(Device.DEFAULT in ("CPU",) and not (CPU_LLVM or CPU_LVP) and OSX, "m series cpus support fp16 arithmetic")
|
||||
@unittest.skipUnless(Device.DEFAULT in ("CPU",) and DEV.renderer not in ("LLVM", "LVP") and OSX, "m series cpus support fp16 arithmetic")
|
||||
def test_float16_alu(self):
|
||||
c = Tensor([1], dtype=dtypes.float16) + Tensor([1], dtype=dtypes.float16)
|
||||
s = c.schedule()[-1]
|
||||
|
|
|
|||
|
|
@ -3,7 +3,7 @@ import unittest, os, subprocess
|
|||
from unittest.mock import patch
|
||||
from tinygrad import Tensor
|
||||
from tinygrad.device import Device, Compiler, enumerate_devices_str
|
||||
from tinygrad.helpers import diskcache_get, diskcache_put, getenv, Context, WIN, CI, OSX
|
||||
from tinygrad.helpers import diskcache_get, diskcache_put, getenv, Context, Target, WIN, CI, OSX, DEV
|
||||
from tinygrad.runtime.support.c import DLL
|
||||
|
||||
class TestDevice(unittest.TestCase):
|
||||
|
|
@ -39,6 +39,12 @@ class TestDevice(unittest.TestCase):
|
|||
self.assertNotEqual(result.returncode, 0)
|
||||
self.assertIn(b"deprecated", result.stderr)
|
||||
|
||||
def test_old_renderer_env_raises(self):
|
||||
result = subprocess.run(['python3', '-c', 'from tinygrad import Device; Device[Device.DEFAULT].renderer'],
|
||||
env={**os.environ, "DEV": "CPU", "CPU_LLVM": "1"}, capture_output=True)
|
||||
self.assertNotEqual(result.returncode, 0)
|
||||
self.assertIn(b"deprecated", result.stderr)
|
||||
|
||||
@unittest.skipIf(WIN and CI, "skipping windows test") # TODO: subprocess causes memory violation?
|
||||
def test_env_overwrite_default_compiler(self):
|
||||
if Device.DEFAULT == "CPU":
|
||||
|
|
@ -48,13 +54,11 @@ class TestDevice(unittest.TestCase):
|
|||
|
||||
imports = "from tinygrad import Device; from tinygrad.runtime.support.compiler_cpu import CPULLVMCompiler, ClangJITCompiler"
|
||||
subprocess.run([f'python3 -c "{imports}; assert isinstance(Device[Device.DEFAULT].compiler, CPULLVMCompiler)"'],
|
||||
shell=True, check=True, env={**os.environ, "DEV": "CPU", "CPU_LLVM": "1"})
|
||||
shell=True, check=True, env={**os.environ, "DEV": "CPU:LLVM"})
|
||||
subprocess.run([f'python3 -c "{imports}; assert isinstance(Device[Device.DEFAULT].compiler, ClangJITCompiler)"'],
|
||||
shell=True, check=True, env={**os.environ, "DEV": "CPU", "CPU_LLVM": "0"})
|
||||
subprocess.run([f'python3 -c "{imports}; assert isinstance(Device[Device.DEFAULT].compiler, CPULLVMCompiler)"'],
|
||||
shell=True, check=True, env={**os.environ, "DEV": "CPU", "CPU_CC": "LLVM"})
|
||||
shell=True, check=True, env={**os.environ, "DEV": "CPU"})
|
||||
subprocess.run([f'python3 -c "{imports}; assert isinstance(Device[Device.DEFAULT].compiler, ClangJITCompiler)"'],
|
||||
shell=True, check=True, env={**os.environ, "DEV": "CPU", "CPU_CC": "CLANGJIT"})
|
||||
shell=True, check=True, env={**os.environ, "DEV": "CPU:CLANGJIT"})
|
||||
elif Device.DEFAULT == "AMD":
|
||||
from tinygrad.runtime.support.compiler_amd import HIPCompiler, AMDLLVMCompiler
|
||||
try: _, _ = HIPCompiler(Device[Device.DEFAULT].arch), AMDLLVMCompiler(Device[Device.DEFAULT].arch)
|
||||
|
|
@ -62,27 +66,25 @@ class TestDevice(unittest.TestCase):
|
|||
|
||||
imports = "from tinygrad import Device; from tinygrad.runtime.support.compiler_amd import HIPCompiler, AMDLLVMCompiler"
|
||||
subprocess.run([f'python3 -c "{imports}; assert isinstance(Device[Device.DEFAULT].compiler, AMDLLVMCompiler)"'],
|
||||
shell=True, check=True, env={**os.environ, "DEV": "AMD", "AMD_LLVM": "1"})
|
||||
shell=True, check=True, env={**os.environ, "DEV": "AMD:LLVM"})
|
||||
subprocess.run([f'python3 -c "{imports}; assert isinstance(Device[Device.DEFAULT].compiler, HIPCompiler)"'],
|
||||
shell=True, check=True, env={**os.environ, "DEV": "AMD", "AMD_LLVM": "0"})
|
||||
subprocess.run([f'python3 -c "{imports}; assert isinstance(Device[Device.DEFAULT].compiler, AMDLLVMCompiler)"'],
|
||||
shell=True, check=True, env={**os.environ, "DEV": "AMD", "AMD_CC": "LLVM"})
|
||||
shell=True, check=True, env={**os.environ, "DEV": "AMD"})
|
||||
subprocess.run([f'python3 -c "{imports}; assert isinstance(Device[Device.DEFAULT].compiler, HIPCompiler)"'],
|
||||
shell=True, check=True, env={**os.environ, "DEV": "AMD", "AMD_CC": "HIP"})
|
||||
shell=True, check=True, env={**os.environ, "DEV": "AMD:HIP"})
|
||||
else: self.skipTest("only run on CPU/AMD")
|
||||
|
||||
@unittest.skipIf((WIN and CI) or (not Device.DEFAULT == "CPU"), "skipping windows test")
|
||||
@unittest.skipIf(WIN and CI, "skipping windows test")
|
||||
def test_env_online(self):
|
||||
from tinygrad.runtime.support.compiler_cpu import CPULLVMCompiler, ClangJITCompiler
|
||||
try: _, _ = CPULLVMCompiler(), ClangJITCompiler()
|
||||
except Exception as e: self.skipTest(f"skipping compiler test: not all compilers: {e}")
|
||||
|
||||
with Context(CPU_LLVM=1):
|
||||
with Context(DEV="CPU:LLVM"):
|
||||
inst = Device["CPU"].compiler
|
||||
self.assertIsInstance(Device["CPU"].compiler, CPULLVMCompiler)
|
||||
with Context(CPU_LLVM=0):
|
||||
with Context(DEV="CPU"):
|
||||
self.assertIsInstance(Device["CPU"].compiler, ClangJITCompiler)
|
||||
with Context(CPU_LLVM=1):
|
||||
with Context(DEV="CPU:LLVM"):
|
||||
self.assertIsInstance(Device["CPU"].compiler, CPULLVMCompiler)
|
||||
assert inst is Device["CPU"].compiler # cached
|
||||
|
||||
|
|
@ -94,7 +96,7 @@ class TestDevice(unittest.TestCase):
|
|||
except Exception as e: self.skipTest(f"skipping: LLVM not available: {e}")
|
||||
|
||||
dev = Device["CPU"]
|
||||
dev.cached_pair.clear()
|
||||
dev.cached_renderer.clear()
|
||||
with patch("tinygrad.renderer.cstyle.ClangJITRenderer.__init__", side_effect=RuntimeError("broken")):
|
||||
self.assertIsInstance(dev.renderer.compiler, CPULLVMCompiler)
|
||||
|
||||
|
|
@ -104,6 +106,21 @@ class TestDevice(unittest.TestCase):
|
|||
with Context(DEV="NULL"): self.assertEqual(Tensor.empty(1).device, "NULL")
|
||||
self.assertEqual(Tensor.empty(1).device, orig_dev)
|
||||
|
||||
class TestDevVar(unittest.TestCase):
|
||||
def test_parse(self):
|
||||
for d, t in [("AMD", Target(device="AMD", renderer="")), ("AMD:LLVM", Target(device="AMD", renderer="LLVM")),
|
||||
(":LLVM", Target(device="", renderer="LLVM"))]:
|
||||
with Context(DEV=d):
|
||||
self.assertEqual(DEV.value, t)
|
||||
self.assertEqual(str(DEV.value), d)
|
||||
|
||||
def test_target(self):
|
||||
with Context(DEV="CPU"): self.assertEqual(DEV.target("CPU"), Target("CPU"))
|
||||
with Context(DEV="CPU:LLVM"): self.assertEqual(DEV.target("CPU"), Target("CPU", "LLVM"))
|
||||
with Context(DEV=":LLVM"): self.assertEqual(DEV.target("CPU"), Target("CPU", "LLVM"))
|
||||
with Context(DEV="AMD:LLVM"): self.assertEqual(DEV.target("CPU"), Target("CPU"))
|
||||
with Context(DEV=""): self.assertEqual(DEV.target("CPU"), Target("CPU"))
|
||||
|
||||
class MockCompiler(Compiler):
|
||||
def __init__(self, key): super().__init__(key)
|
||||
def compile(self, src) -> bytes: return src.encode()
|
||||
|
|
|
|||
|
|
@ -7,7 +7,7 @@ from tinygrad.tensor import _to_np_dtype
|
|||
from tinygrad.uop.ops import Ops
|
||||
from tinygrad.dtype import DType
|
||||
from tinygrad.device import is_dtype_supported
|
||||
from tinygrad.helpers import AMX, AMD_LLVM, CPU_LLVM, Context
|
||||
from tinygrad.helpers import AMX, DEV, Context
|
||||
from test.helpers import slow
|
||||
from tinygrad.engine.realize import CompiledRunner, get_program
|
||||
from tinygrad.codegen.opt import Opt, OptOps, KernelOptError
|
||||
|
|
@ -75,9 +75,9 @@ class TestTensorCores(unittest.TestCase):
|
|||
a, b = Tensor.rand(m, k, dtype=tc.dtype_in), Tensor.rand(k, n, dtype=tc.dtype_in)
|
||||
r = a.matmul(b, dtype=tc.dtype_out)
|
||||
prg = get_program(r.schedule()[-1].ast, Device[Device.DEFAULT].renderer, opts=[Opt(op=OptOps.TC, axis=0, arg=(-1, 2, 1))])
|
||||
if Device.DEFAULT == "CPU" and CPU_LLVM:
|
||||
if Device.DEFAULT == "CPU" and DEV.renderer == "LLVM":
|
||||
assert "0x201000" in prg.src
|
||||
elif Device.DEFAULT == "AMD" and AMD_LLVM:
|
||||
elif Device.DEFAULT == "AMD" and DEV.renderer == "LLVM":
|
||||
assert "@llvm.amdgcn.wmma" in prg.src
|
||||
elif Device[Device.DEFAULT].renderer.suffix == "PTX":
|
||||
assert "mma.sync.aligned" in prg.src
|
||||
|
|
|
|||
|
|
@ -4,7 +4,7 @@ import numpy as np
|
|||
from tinygrad import dtypes, Tensor, TinyJit, GlobalCounters, Variable
|
||||
from tinygrad.uop.ops import Ops
|
||||
from tinygrad.device import is_dtype_supported
|
||||
from tinygrad.helpers import temp, CI, CPU_LVP, Context
|
||||
from tinygrad.helpers import temp, CI, DEV, Context
|
||||
|
||||
N = 200 # has to be bigger than the cache to fail
|
||||
|
||||
|
|
@ -193,7 +193,7 @@ class TestAssign(unittest.TestCase):
|
|||
new = a + times_a
|
||||
np.testing.assert_allclose(new.numpy(), 8)
|
||||
|
||||
@unittest.skipIf(CI and CPU_LVP, "flaky in CI")
|
||||
@unittest.skipIf(CI and DEV.renderer == "LVP", "flaky in CI")
|
||||
def test_double_assign(self):
|
||||
a = Tensor.ones(4).contiguous().realize()
|
||||
a += 1
|
||||
|
|
|
|||
|
|
@ -4,9 +4,8 @@ from collections import defaultdict
|
|||
from typing import Any, Generic, TypeVar, Iterator, Generator, TYPE_CHECKING
|
||||
import importlib, inspect, functools, pathlib, os, platform, contextlib, sys, re, atexit, pickle, decimal
|
||||
from tinygrad.helpers import BENCHMARKS, CI, OSX, LRU, getenv, diskcache_get, diskcache_put, DEBUG, GlobalCounters, flat_mv, PROFILE, temp, colored
|
||||
from tinygrad.helpers import Context, CCACHE, ALLOW_DEVICE_USAGE, MAX_BUFFER_SIZE, cpu_events, ProfileEvent, ProfilePointEvent, ContextVar
|
||||
from tinygrad.helpers import unwrap_class_type, suppress_finalizing, select_first_inited, DEV, VIZ, CPU_LLVM, CPU_LVP, NV_PTX, CUDA_PTX, NV_NAK
|
||||
from tinygrad.helpers import EMULATE, EMULATED_DTYPES, NULL_IR3, NULL_QCOMCL, IMAGE, FLOAT16, TracingKey, size_to_str
|
||||
from tinygrad.helpers import Context, CCACHE, ALLOW_DEVICE_USAGE, MAX_BUFFER_SIZE, cpu_events, ProfileEvent, ProfilePointEvent, unwrap_class_type
|
||||
from tinygrad.helpers import suppress_finalizing, select_first_inited, DEV, VIZ, EMULATE, EMULATED_DTYPES, IMAGE, FLOAT16, TracingKey, size_to_str
|
||||
from tinygrad.dtype import DType, PtrDType, dtypes, _to_np_dtype
|
||||
if TYPE_CHECKING: from tinygrad.renderer import Renderer
|
||||
|
||||
|
|
@ -40,7 +39,7 @@ class _Device:
|
|||
for device in ALL_DEVICES:
|
||||
with contextlib.suppress(Exception): yield self[device].device
|
||||
@property
|
||||
def DEFAULT(self) -> str: return DEV.value.upper() if DEV else self._select_device
|
||||
def DEFAULT(self) -> str: return DEV.device or self._select_device
|
||||
@DEFAULT.setter
|
||||
def DEFAULT(self, v): raise AttributeError(f'setting Device.DEFAULT is deprecated, use "with Context(DEV={v!r})" or "DEV.value = {v!r}"')
|
||||
@functools.cached_property
|
||||
|
|
@ -284,20 +283,11 @@ class Compiled:
|
|||
def _renderer_name(self, r:type[Renderer]|functools.partial) -> str:
|
||||
return unwrap_class_type(r).__name__.upper().removesuffix("RENDERER").removeprefix(devname:=self.device.split(':')[0].upper()) or devname
|
||||
|
||||
def _renderer_var(self, r:type[Renderer]|functools.partial) -> ContextVar|None:
|
||||
return ContextVar._cache.get(f"{self.device}_{self._renderer_name(r)}", None)
|
||||
|
||||
def _select_renderer(self) -> Renderer:
|
||||
# select forced compiler from global env var.
|
||||
forced_comps = set([r for r in self.renderers if self._renderer_name(r) == val] if
|
||||
(ctrl:=ContextVar._cache.get(f"{self.device}_CC", None)) is not None and (val:=ctrl.value) else [])
|
||||
|
||||
# add forced compilers from individual env vars (only if global env var is not set, as it takes precedence).
|
||||
if not forced_comps: forced_comps |= set(r for r in self.renderers if (en:=self._renderer_var(r)) is not None and en.value == 1)
|
||||
if len(forced_comps) > 1: raise RuntimeError(f"{self.device}: multiple compilers set in env {forced_comps}")
|
||||
|
||||
return select_first_inited(list(forced_comps) if len(forced_comps)>0 else self.renderers, f"No renderer for {self.device} is available",
|
||||
self.cached_renderer)
|
||||
assert (rn:=next((self._renderer_name(r) for r in self.renderers if getenv(f"{self.device}_{self._renderer_name(r)}")), None)) is None, \
|
||||
f"{self.device}_{rn}=1 is deprecated, use DEV={self.device}:{rn} or {self.device}_CC={rn} instead"
|
||||
renderers = [r for r in self.renderers if self._renderer_name(r) == rn] if (rn:=DEV.target(self.device).renderer) else self.renderers
|
||||
return select_first_inited(renderers, f"No renderer for {self.device} is available", self.cached_renderer)
|
||||
|
||||
def synchronize(self):
|
||||
"""
|
||||
|
|
@ -320,37 +310,45 @@ class Compiled:
|
|||
# TODO: move this to each Device
|
||||
# this only tracks if the dtype is natively supported, it may be supported in the frontend using decomps
|
||||
def is_dtype_supported(dtype:DType, device:str|None=None, arch:str|None=None) -> bool:
|
||||
if device is None: device = Device.DEFAULT
|
||||
target = DEV.target(device or Device.DEFAULT)
|
||||
if dtype == dtypes.bfloat16:
|
||||
if device == "METAL": return not CI or BENCHMARKS
|
||||
if device == "CUDA": return (not CI or BENCHMARKS) and not CUDA_PTX
|
||||
if device == "NV": return (not CI or BENCHMARKS) and not NV_PTX and not NV_NAK
|
||||
if device in {"CPU"}: return (not CI or BENCHMARKS) and platform.machine() in {"arm", "arm64", "aarch64", "x86_64", "amd64"} and not CPU_LVP
|
||||
return device in {"AMD", "CL", "PYTHON", "NULL"}
|
||||
match target.device:
|
||||
case "METAL": return not CI or BENCHMARKS
|
||||
case "CUDA": return (not CI or BENCHMARKS) and target.renderer != "PTX"
|
||||
case "NV": return (not CI or BENCHMARKS) and target.renderer not in ("PTX", "NAK")
|
||||
case "CPU": return (not CI or BENCHMARKS) and platform.machine() in {"arm", "arm64", "aarch64", "x86_64", "amd64"} and target.renderer != "LVP"
|
||||
case "AMD" | "CL" | "PYTHON" | "NULL": return True
|
||||
case _: return False
|
||||
if dtype in dtypes.fp8_ocp:
|
||||
if device == "CUDA": return (not CI or BENCHMARKS) and not CUDA_PTX
|
||||
if device == "NV": return (not CI or BENCHMARKS) and not NV_PTX and not NV_NAK
|
||||
if device == "AMD":
|
||||
# TODO: open the device to get arch of device, will be fixed after triple is in the device string
|
||||
if arch is None: arch = getattr(Device[device].renderer, "arch", "")
|
||||
return (not CI or BENCHMARKS) and arch == "gfx950"
|
||||
return device in {"PYTHON", "NULL"}
|
||||
if dtype in dtypes.fp8_fnuz: return device in {"PYTHON", "NULL"}
|
||||
if device == "WEBGPU": return dtype in [dtypes.bool, dtypes.char, dtypes.uchar, dtypes.short,
|
||||
dtypes.ushort, dtypes.float, dtypes.int32, dtypes.uint32, dtypes.half]
|
||||
match target.device:
|
||||
case "CUDA": return (not CI or BENCHMARKS) and target.renderer != "PTX"
|
||||
case "NV": return (not CI or BENCHMARKS) and target.renderer not in ("PTX", "NAK")
|
||||
case "AMD":
|
||||
# TODO: open the device to get arch of device, will be fixed after triple is in the device string
|
||||
if arch is None: arch = getattr(Device[target.device].renderer, "arch", "")
|
||||
return (not CI or BENCHMARKS) and arch == "gfx950"
|
||||
case "PYTHON" | "NULL": return True
|
||||
case _: return False
|
||||
if dtype in dtypes.fp8_fnuz: return target.device in {"PYTHON", "NULL"}
|
||||
if target.device == "WEBGPU": return dtype in [dtypes.bool, dtypes.char, dtypes.uchar, dtypes.short,
|
||||
dtypes.ushort, dtypes.float, dtypes.int32, dtypes.uint32, dtypes.half]
|
||||
# for CI GPU and OSX, cl_khr_fp16 isn't supported
|
||||
# for CI LLVM, it segfaults because it can't link to the casting function
|
||||
# CI CUDA architecture is sm_35 but we need at least sm_70 to run fp16 ALUs
|
||||
# PYTHON supports half memoryview in 3.12+ https://github.com/python/cpython/issues/90751
|
||||
# double can't be bitcast to anything without long support
|
||||
if dtype == dtypes.half:
|
||||
if device == "CL": return (not CI or BENCHMARKS) and not OSX
|
||||
if device == "QCOM": return bool(IMAGE) and bool(FLOAT16) # QCOM compiler is flaky with half
|
||||
if device in ["CUDA", "NV"]: return (not CI or BENCHMARKS) or "CUDA" in EMULATE.value
|
||||
if device == "CPU" and CPU_LLVM: return OSX
|
||||
if device == "PYTHON": return sys.version_info >= (3, 12)
|
||||
if dtype == dtypes.float64: return (device not in {"METAL", "QCOM"} and not (OSX and device == "CL") and not NULL_IR3 and not NULL_QCOMCL
|
||||
and dtypes.long not in EMULATED_DTYPES.tolist(dtypes))
|
||||
match target.device:
|
||||
case "CL": return (not CI or BENCHMARKS) and not OSX
|
||||
case "QCOM": return bool(IMAGE) and bool(FLOAT16) # QCOM compiler is flaky with half
|
||||
case "CUDA" | "NV": return (not CI or BENCHMARKS) or "CUDA" in EMULATE.value
|
||||
case "CPU" if target.renderer == "LLVM": return OSX
|
||||
case "PYTHON": return sys.version_info >= (3, 12)
|
||||
if dtype == dtypes.float64:
|
||||
match target.device:
|
||||
case _ if dtypes.long in EMULATED_DTYPES.tolist(dtypes): return False # double can't be bitcast to anything without long support
|
||||
case "CL": return not OSX
|
||||
case "NULL": return target.renderer not in ("IR3", "QCOMCL")
|
||||
case "METAL" | "QCOM": return False
|
||||
return True
|
||||
|
||||
if PROFILE:
|
||||
|
|
@ -373,23 +371,16 @@ def enumerate_devices_str() -> Generator[str, None, None]:
|
|||
compilers_results, any_works = [], False
|
||||
try:
|
||||
d = Device[device]
|
||||
default_renderers, default_renderer = d.renderers, d.renderer
|
||||
try:
|
||||
for r in default_renderers:
|
||||
d.renderers = [r]
|
||||
try:
|
||||
# d.renderer, d.compiler = r(), c()
|
||||
with Context(CACHELEVEL=0, **({f"{device}_CC": d._renderer_name(r)} if (ctrl:=f"{device}_CC") in ContextVar._cache else {})):
|
||||
test = (Tensor([1,2,3], device=device) * 2).tolist()
|
||||
if test != [2,4,6]: raise ValueError(f"got {test} instead of [2, 4, 6]")
|
||||
set_text = f'({ctrl}={d._renderer_name(r)} to make default)' if (ctrl:=f"{device}_CC") in ContextVar._cache else ''
|
||||
default_text = '(default)' if type(default_renderer) is type(d.renderer) else set_text
|
||||
compilers_results.append(f"{colored('+', 'green')} {d._renderer_name(r)} {default_text}")
|
||||
any_works = True
|
||||
except Exception as e: compilers_results.append(f"{colored('-', 'yellow')} {d._renderer_name(r)}: {e}")
|
||||
finally:
|
||||
# put the defaults back!
|
||||
d.renderers = default_renderers
|
||||
default_renderer = d.renderer
|
||||
for r in d.renderers:
|
||||
try:
|
||||
# d.renderer, d.compiler = r(), c()
|
||||
with Context(CACHELEVEL=0, DEV=f"{device}:{d._renderer_name(r)}"): test = (Tensor([1,2,3], device=device) * 2).tolist()
|
||||
if test != [2,4,6]: raise ValueError(f"got {test} instead of [2, 4, 6]")
|
||||
default_text = '(default)' if type(default_renderer) is type(d.renderer) else f'(DEV={device}:{d._renderer_name(r)} to make default)'
|
||||
compilers_results.append(f"{colored('+', 'green')} {d._renderer_name(r)} {default_text}")
|
||||
any_works = True
|
||||
except Exception as e: compilers_results.append(f"{colored('-', 'yellow')} {d._renderer_name(r)}: {e}")
|
||||
result = (colored('PASS', 'green') if any_works else f"{colored('FAIL', 'yellow')}") + ''.join([f'\n{" "*16} {x}' for x in compilers_results])
|
||||
except Exception as e:
|
||||
result = f"{colored('FAIL', 'red')} {e}"
|
||||
|
|
|
|||
|
|
@ -4,7 +4,7 @@ START_TIME = time.perf_counter()
|
|||
import os, functools, platform, re, contextlib, operator, hashlib, pickle, sqlite3, tempfile, pathlib, string, ctypes, sys, gzip, getpass, gc
|
||||
from collections import defaultdict
|
||||
import subprocess, shutil, math, types, copyreg, inspect, importlib, decimal, itertools
|
||||
from dataclasses import dataclass, field
|
||||
from dataclasses import dataclass, field, replace
|
||||
from typing import ClassVar, Iterable, Any, TypeVar, Callable, Sequence, TypeGuard, Iterator, Generic, Generator, cast, overload
|
||||
|
||||
T = TypeVar("T")
|
||||
|
|
@ -176,7 +176,32 @@ class ContextVar(Generic[T]):
|
|||
assert isinstance(self.value, str)
|
||||
return [getattr(obj, x) if obj else x for x in self.value.split(',') if x]
|
||||
|
||||
DEV, DEBUG, BEAM, NOOPT = ContextVar("DEV", ""), ContextVar("DEBUG", 0), ContextVar("BEAM", 0), ContextVar("NOOPT", 0)
|
||||
@dataclass(frozen=True)
|
||||
class Target:
|
||||
device: str = ""
|
||||
renderer: str = ""
|
||||
|
||||
@staticmethod
|
||||
def parse(s:str) -> Target: return Target(*(x.upper() for x in s.split(':')))
|
||||
def __repr__(self) -> str: return self.device + (":" + self.renderer if self.renderer else "")
|
||||
|
||||
class _DEV(ContextVar):
|
||||
_value = Target()
|
||||
@property
|
||||
def value(self) -> Target: return self._value
|
||||
@value.setter
|
||||
def value(self, v:str|Target): self._value = v if isinstance(v, Target) else Target.parse(v)
|
||||
def __getattr__(self, k): return getattr(self.value, k)
|
||||
# get target for device string
|
||||
def target(self, dev:str) -> Target:
|
||||
t = self.value if self.device == dev or not self.device else Target(device=dev)
|
||||
# TODO: remove this once DEV supports secondary targets
|
||||
if (cv:=ContextVar._cache.get(f"{dev}_CC", None)) is not None and cv.value:
|
||||
assert not t.renderer, f"renderer set in DEV and {dev}_CC"
|
||||
return replace(t, renderer=cv.value.upper())
|
||||
return replace(t, device=dev)
|
||||
|
||||
DEV, DEBUG, BEAM, NOOPT = _DEV("DEV", ""), ContextVar("DEBUG", 0), ContextVar("BEAM", 0), ContextVar("NOOPT", 0)
|
||||
IMAGE, FLOAT16, OPENPILOT_HACKS = ContextVar("IMAGE", 0), ContextVar("FLOAT16", 0), ContextVar("OPENPILOT_HACKS", 0)
|
||||
JIT, JIT_BATCH_SIZE = ContextVar("JIT", 2 if OSX and ARCH_X86 else 1), ContextVar("JIT_BATCH_SIZE", 32)
|
||||
WINO, CAPTURING, TRACEMETA = ContextVar("WINO", 0), ContextVar("CAPTURING", 1), ContextVar("TRACEMETA", 1)
|
||||
|
|
@ -193,13 +218,9 @@ EMULATE, EMULATED_DTYPES = ContextVar("EMULATE", ""), ContextVar("EMULATED_DTYPE
|
|||
CAPTURE_PROCESS_REPLAY = ContextVar("CAPTURE_PROCESS_REPLAY", 0)
|
||||
CPU_COUNT = ContextVar("CPU_COUNT", max(1, len(os.sched_getaffinity(0)) if hasattr(os, "sched_getaffinity") else (os.cpu_count() or 1)))
|
||||
# Compilers
|
||||
CPU_CC, CPU_LLVM, CPU_LVP = ContextVar("CPU_CC", ""), ContextVar("CPU_LLVM", 0), ContextVar("CPU_LVP", 0)
|
||||
NV_CC, NV_PTX, NV_NAK, NV_NVCC = ContextVar("NV_CC", ""), ContextVar("NV_PTX", 0), ContextVar("NV_NAK", 0), ContextVar("NV_NVCC", 0)
|
||||
CUDA_CC, CUDA_PTX, CUDA_NVCC = ContextVar("CUDA_CC", ""), ContextVar("CUDA_PTX", 0), ContextVar("CUDA_NVCC", 0)
|
||||
NULL_QCOMCL, NULL_IR3, NULL_NAK = ContextVar("NULL_QCOMCL", 0), ContextVar("NULL_IR3", 0), ContextVar("NULL_NAK", 0)
|
||||
CPU_CC, NV_CC, CUDA_CC, NULL_CC = ContextVar("CPU_CC", ""), ContextVar("NV_CC", ""), ContextVar("CUDA_CC", ""), ContextVar("NULL_CC", "")
|
||||
NULL_ALLOW_COPYOUT = ContextVar("NULL_ALLOW_COPYOUT", 0)
|
||||
AMD_CC, AMD_LLVM, AMD_HIPCC = ContextVar("AMD_CC", ""), ContextVar("AMD_LLVM", 0), ContextVar("AMD_HIPCC", 0)
|
||||
QCOM_CC, QCOM_IR3 = ContextVar("QCOM_CC", ""), ContextVar("QCOM_IR3", 0)
|
||||
AMD_CC, QCOM_CC = ContextVar("AMD_CC", ""), ContextVar("QCOM_CC", "")
|
||||
# VIZ implies PROFILE, but you can run PROFILE without VIZ
|
||||
VIZ = ContextVar("VIZ", 0)
|
||||
PROFILE = ContextVar("PROFILE", abs(VIZ.value))
|
||||
|
|
|
|||
|
|
@ -127,7 +127,7 @@ def __getattr__(nm):
|
|||
return load("rocprof", "['rocprof-trace-decoder', p:='/usr/local/lib/rocprof-trace-decoder.so', p.replace('so','dylib')]",
|
||||
[f"{{}}/include/{s}.h" for s in ["rocprof_trace_decoder", "trace_decoder_instrument", "trace_decoder_types"]],
|
||||
srcs="https://github.com/ROCm/rocprof-trace-decoder/archive/dd0485100971522cc4cd8ae136bdda431061a04d.tar.gz")
|
||||
case "mesa": return load("mesa", "([] if CPU_CC.value == 'LVP' or bool(CPU_LVP) else ['tinymesa']) + ['tinymesa_cpu']", [
|
||||
case "mesa": return load("mesa", "([] if CPU_CC.value == 'LVP' or DEV.renderer == 'LVP' else ['tinymesa']) + ['tinymesa_cpu']", [
|
||||
*[f"{{}}/src/compiler/nir/{s}.h" for s in ["nir", "nir_builder", "nir_shader_compiler_options", "nir_serialize"]], "{}/gen/nir_intrinsics.h",
|
||||
*[f"{{}}/src/nouveau/{s}.h" for s in ["headers/nv_device_info", "compiler/nak"]],
|
||||
*[f"{{}}/src/gallium/auxiliary/gallivm/lp_bld{s}.h" for s in ["", "_passmgr", "_misc", "_type", "_init", "_nir", "_struct", "_jit_types",
|
||||
|
|
@ -146,7 +146,7 @@ def __getattr__(nm):
|
|||
*[f"python3 src/compiler/{s}_h.py > gen/{s.split('/')[-1]}.h" for s in ["nir/nir_opcodes", "nir/nir_builder_opcodes"]],
|
||||
*[f"python3 src/compiler/nir/nir_{s}_h.py --outdir gen" for s in ["intrinsics", "intrinsics_indices"]]]), cwd=path, shell=True, check=True),
|
||||
srcs="https://gitlab.freedesktop.org/mesa/mesa/-/archive/mesa-25.2.7/mesa-25.2.7.tar.gz",
|
||||
prolog=["from tinygrad.helpers import CPU_CC, CPU_LVP", "import gzip, base64"],
|
||||
prolog=["from tinygrad.helpers import CPU_CC, DEV", "import gzip, base64"],
|
||||
epilog=lambda path: [system(f"{root}/extra/mesa/lvp_nir_options.sh {path}")])
|
||||
case "libclang":
|
||||
return load("libclang", clang_lib,
|
||||
|
|
|
|||
|
|
@ -4,9 +4,9 @@ import ctypes
|
|||
from typing import Annotated, Literal, TypeAlias
|
||||
from tinygrad.runtime.support.c import _IO, _IOW, _IOR, _IOWR
|
||||
from tinygrad.runtime.support import c
|
||||
from tinygrad.helpers import CPU_CC, CPU_LVP
|
||||
from tinygrad.helpers import CPU_CC, DEV
|
||||
import gzip, base64
|
||||
dll = c.DLL('mesa', ([] if CPU_CC.value == 'LVP' or bool(CPU_LVP) else ['tinymesa']) + ['tinymesa_cpu'])
|
||||
dll = c.DLL('mesa', ([] if CPU_CC.value == 'LVP' or DEV.renderer == 'LVP' else ['tinymesa']) + ['tinymesa_cpu'])
|
||||
class struct_u_printf_info(ctypes.Structure): pass
|
||||
u_printf_info: TypeAlias = struct_u_printf_info
|
||||
uint32_t: TypeAlias = Annotated[int, ctypes.c_uint32]
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue