specify renderer in DEV, <dev>_<ren>=1 is deprecated (#15551)

2026-06-24 02:14:17 +00:00 · 2026-03-31 15:35:14 -07:00 · 2026-03-31 15:35:14 -07:00 · acf239e4d2
commit acf239e4d2
parent 5181c8e23a
20 changed files with 204 additions and 173 deletions
--- a/.github/workflows/benchmark.yml
+++ b/.github/workflows/benchmark.yml
@ -101,10 +101,10 @@ jobs:
      run: DEV=METAL python3.11 test/opt/test_tensor_cores.py
    - name: Test AMX tensor cores
      run: |
-        DEBUG=2 DEV=CPU CPU_LLVM=0 AMX=1 python3.11 test/opt/test_tensor_cores.py
-        DEBUG=2 DEV=CPU CPU_LLVM=1 AMX=1 python3.11 test/opt/test_tensor_cores.py
-        DEBUG=2 DEV=CPU CPU_LLVM=0 AMX=1 python3.11 test/opt/test_gen_float4.py TestFloat4.test_float4_multidim_amx TestFloat4.test_float4_multidim_unaligned_load_amx
-        DEBUG=2 DEV=CPU CPU_LLVM=1 AMX=1 python3.11 test/opt/test_gen_float4.py TestFloat4.test_float4_multidim_amx TestFloat4.test_float4_multidim_unaligned_load_amx
+        DEBUG=2 DEV=CPU AMX=1 python3.11 test/opt/test_tensor_cores.py
+        DEBUG=2 DEV=CPU:LLVM AMX=1 python3.11 test/opt/test_tensor_cores.py
+        DEBUG=2 DEV=CPU AMX=1 python3.11 test/opt/test_gen_float4.py TestFloat4.test_float4_multidim_amx TestFloat4.test_float4_multidim_unaligned_load_amx
+        DEBUG=2 DEV=CPU:LLVM AMX=1 python3.11 test/opt/test_gen_float4.py TestFloat4.test_float4_multidim_amx TestFloat4.test_float4_multidim_unaligned_load_amx
    - name: Run Tensor Core GEMM (float)
      run: DEBUG=2 SHOULD_USE_TC=1 python3.11 extra/gemm/simple_matmul.py
    - name: Run Tensor Core GEMM (half)
@ -193,9 +193,9 @@ jobs:
    #- name: UsbGPU openpilot test
    #  run: sudo -E PYTHONPATH=. GMMU=0 DEV=AMD AMD_IFACE=USB GRAPH_ONE_KERNEL=1 python3.11 examples/openpilot/compile3.py https://github.com/commaai/openpilot/raw/9118973ed03c1ae1d40cf69a29507ec2cc78efd7/selfdrive/modeld/models/supercombo.onnx
    - name: UsbGPU (USB4/TB) boot time
-      run: PYTHONPATH=. DEBUG=3 DEV=NV NV_IFACE=PCI NV_NAK=1 time python3.11 test/test_tiny.py TestTiny.test_plus
+      run: PYTHONPATH=. DEBUG=3 DEV=NV:NAK NV_IFACE=PCI time python3.11 test/test_tiny.py TestTiny.test_plus
    - name: UsbGPU (USB4/TB) tiny tests
-      run: PYTHONPATH=. DEV=NV NV_IFACE=PCI NV_NAK=1 python3.11 test/test_tiny.py
+      run: PYTHONPATH=. DEV=NV:NAK NV_IFACE=PCI python3.11 test/test_tiny.py

  testnvidiabenchmark:
    name: tinybox green Benchmark
@ -237,7 +237,7 @@ jobs:
    - name: Test tensor cores
      run: |
        DEV=NV ALLOW_TF32=1 python3 test/opt/test_tensor_cores.py
-        DEV=NV NV_PTX=1 ALLOW_TF32=1 python3 test/opt/test_tensor_cores.py
+        DEV=NV:PTX ALLOW_TF32=1 python3 test/opt/test_tensor_cores.py
    - name: Run Tensor Core GEMM (CUDA)
      run: |
        DEV=CUDA SHOULD_USE_TC=1 HALF=1 DEBUG=2 python3 extra/gemm/simple_matmul.py
@ -245,7 +245,7 @@ jobs:
        DEV=CUDA SHOULD_USE_TC=1 ALLOW_TF32=1 DEBUG=2 ATOL=2e-2 python3 extra/gemm/simple_matmul.py
        DEV=CUDA SHOULD_USE_TC=1 FP8E4M3=1 DEBUG=2 python3 extra/gemm/simple_matmul.py
    - name: Run Tensor Core GEMM (PTX)
-      run: DEV=NV NV_PTX=1 SHOULD_USE_TC=1 HALF=1 DEBUG=2 python3 extra/gemm/simple_matmul.py
+      run: DEV=NV:PTX SHOULD_USE_TC=1 HALF=1 DEBUG=2 python3 extra/gemm/simple_matmul.py
    - name: Run Tensor Core GEMM (NV)
      run: DEV=NV SHOULD_USE_TC=1 HALF=1 DEBUG=2 python3 extra/gemm/simple_matmul.py
    - name: Test DEV=NV
@ -328,7 +328,7 @@ jobs:
    #   run: DEV=NV M_START=12 M_STOP=20 M_STEP=1 N_START=6 N_STOP=10 N_STEP=1 K_START=28 K_STOP=36 K_STEP=1 HALF=1 TC_OPT=2 python3 ./extra/gemm/fuzz_matmul.py
    # TODO: too slow
    # - name: Fuzz Padded Tensor Core GEMM (PTX)
-    #   run: DEV=NV NV_PTX=1 M_START=12 M_STOP=20 M_STEP=1 N_START=6 N_STOP=10 N_STEP=1 K_START=28 K_STOP=36 K_STEP=1 HALF=1 TC_OPT=2 python3 ./extra/gemm/fuzz_matmul.py
+    #   run: DEV=NV:PTX M_START=12 M_STOP=20 M_STEP=1 N_START=6 N_STOP=10 N_STEP=1 K_START=28 K_STOP=36 K_STEP=1 HALF=1 TC_OPT=2 python3 ./extra/gemm/fuzz_matmul.py
    - name: HEVC Decode Benchmark
      run: VALIDATE=1 MAX_FRAMES=100 ASSERT_FPS=1400 JITBEAM=1 DEV=NV PYTHONPATH=. python3 extra/hevc/decode.py
    - name: Train MNIST
@ -410,11 +410,11 @@ jobs:
    #    LD_PRELOAD="/opt/rocm/lib/libhsa-runtime64.so" HSA=1 BIG=2 TORCHCUDA=1 python3 test/speed/external_test_speed_v_torch.py
    - name: Test speed vs theoretical
      run: DEV=AMD IGNORE_BEAM_CACHE=1 CCACHE=0 BEAM_DEBUG=1 DEBUG=1 python -m pytest -rA test/external/speed_v_theoretical.py --durations=20
-    - name: Test tensor cores AMD_LLVM=0
-      run: DEV=AMD AMD_LLVM=0 python3 test/opt/test_tensor_cores.py
+    - name: Test tensor cores (no LLVM)
+      run: DEV=AMD python3 test/opt/test_tensor_cores.py
    # TODO: this is flaky
-    # - name: Test tensor cores AMD_LLVM=1
-    #   run: DEV=AMD AMD_LLVM=1 python3 test/opt/test_tensor_cores.py
+    # - name: Test tensor cores AMD:LLVM
+    #   run: DEV=AMD:LLVM python3 test/opt/test_tensor_cores.py
    - name: Run Tensor Core GEMM (AMD)
      run: |
        DEV=AMD SHOULD_USE_TC=1 BFLOAT16=1 DEBUG=2 python3 extra/gemm/simple_matmul.py
@ -591,7 +591,7 @@ jobs:
    - name: openpilot compile3 0.11.0 driving_vision
      run: BENCHMARK_LOG=openpilot_0_11_0_vision PYTHONPATH="." ASSERT_MIN_STEP_TIME=17 DEV=QCOM FLOAT16=1 IMAGE=1 NOLOCALS=1 taskset -c 4-7 python3 examples/openpilot/compile3.py https://github.com/commaai/openpilot/raw/v0.11.0/selfdrive/modeld/models/driving_vision.onnx
    - name: IR3 openpilot compile3 0.11.0 driving_vision
-      run: BENCHMARK_LOG=ir3_openpilot_0_11_0_vision PYTHONPATH="." ASSERT_MIN_STEP_TIME=17 DEV=QCOM QCOM_IR3=1 FLOAT16=1 IMAGE=1 NOLOCALS=1 taskset -c 4-7 python3 examples/openpilot/compile3.py https://github.com/commaai/openpilot/raw/v0.11.0/selfdrive/modeld/models/driving_vision.onnx
+      run: BENCHMARK_LOG=ir3_openpilot_0_11_0_vision PYTHONPATH="." ASSERT_MIN_STEP_TIME=17 DEV=QCOM:IR3 FLOAT16=1 IMAGE=1 NOLOCALS=1 taskset -c 4-7 python3 examples/openpilot/compile3.py https://github.com/commaai/openpilot/raw/v0.11.0/selfdrive/modeld/models/driving_vision.onnx
    - name: openpilot compile3 0.11.0 driving_policy
      run: BENCHMARK_LOG=openpilot_0_11_0_policy PYTHONPATH="." ASSERT_MIN_STEP_TIME=3 DEV=QCOM FLOAT16=1 IMAGE=1 NOLOCALS=1 taskset -c 4-7 python3 examples/openpilot/compile3.py https://github.com/commaai/openpilot/raw/v0.11.0/selfdrive/modeld/models/driving_policy.onnx
    - name: openpilot compile3 0.11.0 dmonitoring
@ -609,7 +609,7 @@ jobs:
        # generate quantized weights
        ln -s /data/home/tiny/tinygrad/extra/datasets/imagenet extra/datasets/imagenet
        ln -s /data/home/tiny/tinygrad/testsig-*.so .
-        PYTHONPATH=. CC=clang-19 DEV=CPU CPU_LLVM=0 QUANT=1 CNT=0 python3 examples/test_onnx_imagenet.py https://github.com/xamcat/mobcat-samples/raw/refs/heads/master/onnx_runtime/InferencingSample/InferencingSample/mobilenetv2-7.onnx /tmp/model.quant.onnx
+        PYTHONPATH=. CC=clang-19 DEV=CPU QUANT=1 CNT=0 python3 examples/test_onnx_imagenet.py https://github.com/xamcat/mobcat-samples/raw/refs/heads/master/onnx_runtime/InferencingSample/InferencingSample/mobilenetv2-7.onnx /tmp/model.quant.onnx
        # benchmark on DSP with NOOPT=1, the devectorizer has issues
        PYTHONPATH=. CC=clang-19 DEV=DSP NOOPT=1 CNT=2 DEBUG=2 python3 examples/test_onnx_imagenet.py /tmp/model.quant.onnx
    - name: Run process replay tests
@ -632,7 +632,7 @@ jobs:
        echo "CACHEDB=/tmp/staging.db" >> $GITHUB_ENV
        rm -f /tmp/staging.db /tmp/staging.db-shm /tmp/staging.db-wal
    - name: openpilot compile3 0.10.1 driving_vision
-      run: BENCHMARK_LOG=usbgpu_openpilot_0_10_1_vision PYTHONPATH="." GMMU=0 DEV=AMD AMD_LLVM=1 AMD_IFACE=USB ASSERT_MIN_STEP_TIME=50 python3 examples/openpilot/compile3.py https://github.com/commaai/openpilot/raw/720392c9a5b986981fdbed1bb8c47a6c5573a50e/selfdrive/modeld/models/driving_vision.onnx
+      run: BENCHMARK_LOG=usbgpu_openpilot_0_10_1_vision PYTHONPATH="." GMMU=0 DEV=AMD:LLVM AMD_IFACE=USB ASSERT_MIN_STEP_TIME=50 python3 examples/openpilot/compile3.py https://github.com/commaai/openpilot/raw/720392c9a5b986981fdbed1bb8c47a6c5573a50e/selfdrive/modeld/models/driving_vision.onnx
    - name: openpilot load_pickle 0.10.1 driving_vision
      run: BENCHMARK_LOG=usbgpu_openpilot_0_10_1_vision_load_pickle PYTHONPATH="." GMMU=0 DEV=AMD AMD_IFACE=USB ASSERT_MIN_LOAD_TIME=15 python3 examples/openpilot/load_pickle.py

@ -677,8 +677,8 @@ jobs:
    # Fails on 9070
    # - name: Test tensor cores
    #   run: |
-    #     DEV=AMD AMD_LLVM=0 python3 test/test_linearizer.py test/opt/test_tensor_cores.py
-    #     DEV=AMD AMD_LLVM=1 python3 test/test_linearizer.py test/opt/test_tensor_cores.py
+    #     DEV=AMD python3 test/test_linearizer.py test/opt/test_tensor_cores.py
+    #     DEV=AMD:LLVM python3 test/test_linearizer.py test/opt/test_tensor_cores.py
    #     DEV=AMD SHOULD_USE_TC=1 BFLOAT16=1 DEBUG=2 python3 extra/gemm/simple_matmul.py
    - name: Run Tensor Core GEMM (AMD)
      run: DEV=AMD SHOULD_USE_TC=1 HALF=1 DEBUG=2 ATOL=2e-2 python3 extra/gemm/simple_matmul.py
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@ -29,9 +29,9 @@ jobs:
        deps: testing_unit
        llvm: 'true'
    - name: Speed Test
-      run: DEV=CPU CPU_LLVM=1 THREADS=0 python3 test/speed/external_test_speed_v_torch.py
+      run: DEV=CPU:LLVM THREADS=0 python3 test/speed/external_test_speed_v_torch.py
    - name: Speed Test (BEAM=2)
-      run: BEAM=2 DEV=CPU CPU_LLVM=1 THREADS=0 python3 test/speed/external_test_speed_v_torch.py
+      run: BEAM=2 DEV=CPU:LLVM THREADS=0 python3 test/speed/external_test_speed_v_torch.py

  docs:
    name: Docs
@ -83,7 +83,7 @@ jobs:
      run: DEBUG=100 python3 -c "from tinygrad import Tensor; N = 1024; a, b = Tensor.rand(N, N), Tensor.rand(N, N); c = (a.reshape(N, 1, N) * b.T.reshape(1, N, N)).sum(axis=2); print((c.numpy() - (a.numpy() @ b.numpy())).mean())"
    - name: Compile EfficientNet to C and test it
      run: |
-        DEV=CPU CPU_LLVM=0 python examples/compile_efficientnet.py > recognize.c
+        DEV=CPU python examples/compile_efficientnet.py > recognize.c
        clang -O2 recognize.c -lm -o recognize
        cat test/models/efficientnet/Chicken.jpg | ./recognize | grep cock

@ -114,11 +114,11 @@ jobs:
    - name: Test one op in torch tests
      run: DEBUG=2 python3 extra/torch_backend/torch_tests.py TestTinyBackendPRIVATEUSE1.test_unary_log_tiny_float32
    - name: Test Ops with TINY_BACKEND
-      run: DEV=CPU CPU_LLVM=1 LLVMOPT=0 TINY_BACKEND=1 python3 -m pytest -n auto test/backend/test_ops.py --durations=20
+      run: DEV=CPU:LLVM LLVMOPT=0 TINY_BACKEND=1 python3 -m pytest -n auto test/backend/test_ops.py --durations=20
    - name: Test in-place operations on views
      run: TORCH_DEBUG=1 python3 extra/torch_backend/test_inplace.py
    - name: Test multi-gpu
-      run: DEV=CPU CPU_LLVM=1 GPUS=4 TORCH_DEBUG=1 python3 extra/torch_backend/test_multigpu.py
+      run: DEV=CPU:LLVM GPUS=4 TORCH_DEBUG=1 python3 extra/torch_backend/test_multigpu.py
    - name: Test kernel fusion
      run: python3 extra/torch_backend/test_kernel_fusion.py

@ -423,7 +423,7 @@ jobs:
      - name: Test openpilot CL compile fp32 (test correctness)
        run: DEV=CL IMAGE=1 SELFTEST=1 python examples/openpilot/compile3.py https://github.com/haraschax/filedump/raw/refs/heads/master/driving_vision_fp32.onnx
      - name: Test openpilot LLVM compile fp16
-        run: IMAGE=1 FLOAT16=1 DEV=CPU CPU_LLVM=1 python examples/openpilot/compile3.py https://gitlab.com/commaai/openpilot-lfs.git/gitlab-lfs/objects/cf6376aa9a090f0da26c280ef69eabf9bbdd51d1faac9ed392919c3db69be916
+        run: IMAGE=1 FLOAT16=1 DEV=CPU:LLVM python examples/openpilot/compile3.py https://gitlab.com/commaai/openpilot-lfs.git/gitlab-lfs/objects/cf6376aa9a090f0da26c280ef69eabf9bbdd51d1faac9ed392919c3db69be916
      - name: Run process replay tests
        uses: ./.github/actions/process-replay

@ -445,15 +445,15 @@ jobs:
          python-version: '3.12'
          llvm: 'true'
      - name: Test ONNX (CPU)
-        run: DEV=CPU CPU_LLVM=0 python -m pytest -n=auto test/external/external_test_onnx_backend.py --durations=20
+        run: DEV=CPU python -m pytest -n=auto test/external/external_test_onnx_backend.py --durations=20
      - name: Test ONNX (LLVM)
-        run: DEV=CPU CPU_LLVM=1 python -m pytest -n=auto test/external/external_test_onnx_backend.py --durations=20
+        run: DEV=CPU:LLVM python -m pytest -n=auto test/external/external_test_onnx_backend.py --durations=20
      - name: Test ONNX Runner (CPU)
-        run: DEV=CPU CPU_LLVM=0 python3 test/external/external_test_onnx_runner.py
+        run: DEV=CPU python3 test/external/external_test_onnx_runner.py
      - name: Test Additional ONNX Ops (CPU)
-        run: DEV=CPU CPU_LLVM=0 python3 test/external/external_test_onnx_ops.py
+        run: DEV=CPU python3 test/external/external_test_onnx_ops.py
      - name: Test Quantize ONNX
-        run: DEV=CPU CPU_LLVM=0 python3 test/backend/test_quantize_onnx.py
+        run: DEV=CPU python3 test/backend/test_quantize_onnx.py
      - name: Run process replay tests
        uses: ./.github/actions/process-replay

@ -529,11 +529,11 @@ jobs:
          opencl: 'true'
          llvm: 'true'
      - name: Test models (llvm)
-        run: DEV=CPU CPU_LLVM=1 python -m pytest -n=auto test/models --durations=20
+        run: DEV=CPU:LLVM python -m pytest -n=auto test/models --durations=20
      - name: Test models (opencl)
        run: DEV=CL python -m pytest -n=auto test/models --durations=20
      - name: Test models (cpu)
-        run: DEV=CPU CPU_LLVM=0 python -m pytest -n=auto test/models --durations=20
+        run: DEV=CPU python -m pytest -n=auto test/models --durations=20
      - name: Run process replay tests
        uses: ./.github/actions/process-replay

@ -572,11 +572,11 @@ jobs:
        pydeps: "pillow"
        llvm: "true"
    - name: Test LLVM=1 DEVECTORIZE=0
-      run: DEV=CPU CPU_LLVM=1 DEVECTORIZE=0 python3 -m pytest -n auto test/test_tiny.py test/backend/test_ops.py
+      run: DEV=CPU:LLVM DEVECTORIZE=0 python3 -m pytest -n auto test/test_tiny.py test/backend/test_ops.py
    - name: Test LLVM=1 DEVECTORIZE=0 for model
-      run: DEV=CPU CPU_LLVM=1 DEVECTORIZE=0 python3 test/models/test_efficientnet.py
+      run: DEV=CPU:LLVM DEVECTORIZE=0 python3 test/models/test_efficientnet.py
    - name: Test DEV=CPU DEVECTORIZE=0
-      run: DEV=CPU CPU_LLVM=0 DEVECTORIZE=0 python3 -m pytest -n auto test/test_tiny.py test/backend/test_ops.py
+      run: DEV=CPU DEVECTORIZE=0 python3 -m pytest -n auto test/test_tiny.py test/backend/test_ops.py

  testdsp:
    name: Linux (DSP)
@ -667,9 +667,9 @@ jobs:
      - name: Install rocprof-trace-decoder
        run: sudo PYTHONPATH="." ./extra/sqtt/install_rocprof_decoder.py
      - name: Run AMD renderer tests
-        run: AMD_LLVM=0 python -m pytest -n=auto test/amd/ --durations 20
-      - name: Run AMD renderer tests (AMD_LLVM=1)
-        run: AMD_LLVM=1 python -m pytest -n=auto test/amd/ --durations 20
+        run: python -m pytest -n=auto test/amd/ --durations 20
+      - name: Run AMD renderer tests (AMD:LLVM)
+        run: DEV=AMD:LLVM python -m pytest -n=auto test/amd/ --durations 20
      - name: Run SQTT profiling tests
        run: PROFILE=1 SQTT=1 python3 -m pytest -n=auto test/amd/test_sqtt_profiler.py
      - name: Run AMD emulated tests on NULL backend
@ -681,7 +681,7 @@ jobs:
      - name: Run ASM matmul on MOCKGPU
        run: PYTHONPATH="." DEV=AMD MOCKGPU=1 N=256 python3 extra/gemm/amd_asm_matmul.py
      - name: Run LLVM test
-        run: AMD_LLVM=1 python test/device/test_amd_llvm.py
+        run: DEV=AMD:LLVM python test/device/test_amd_llvm.py

  testmockam:
    name: Linux (am)
@ -726,11 +726,10 @@ jobs:
    runs-on: ubuntu-22.04
    timeout-minutes: 15
    env:
-      DEV: AMD
+      DEV: AMD${{ matrix.backend == 'amdllvm' && ':LLVM' || '' }}
      MOCKGPU: 1
      MOCKGPU_ARCH: ${{ matrix.arch }}
      SKIP_SLOW_TEST: 1
-      AMD_LLVM: ${{ matrix.backend == 'amdllvm' && '1' || matrix.backend != 'amdllvm' && '0' }}
    steps:
      - name: Checkout Code
        uses: actions/checkout@v6
@ -777,7 +776,7 @@ jobs:
          cuda: 'true'
          ocelot: 'true'
      - name: Set env
-        run: printf "${{ matrix.backend == 'ptx' && 'DEV=CUDA\nCUDA_PTX=1' || matrix.backend == 'nv' && 'DEV=NV\nSKIP_SLOW_TEST=1' }}" >> $GITHUB_ENV
+        run: printf "${{ matrix.backend == 'ptx' && 'DEV=CUDA:PTX' || matrix.backend == 'nv' && 'DEV=NV\nSKIP_SLOW_TEST=1' }}" >> $GITHUB_ENV
      - name: Check Device.DEFAULT and print some source
        run: |
          python3 -c "from tinygrad import Device; assert Device.DEFAULT in ['CUDA','NV'], Device.DEFAULT"
@ -811,7 +810,7 @@ jobs:
          llvm: ${{ matrix.backend == 'llvm' || matrix.backend == 'lvp' }}
          mesa: ${{ matrix.backend == 'lvp' && 'true' }}
      - name: Set env
-        run: printf "${{ matrix.backend == 'llvm' && 'DEV=CPU\nCPU_LLVM=1' || matrix.backend == 'cpu' && 'DEV=CPU\nCPU_LLVM=0\nCPU_COUNT=2' || matrix.backend == 'opencl' && 'DEV=CL' || matrix.backend == 'lvp' && 'DEV=CPU\nCPU_LVP=1' }}" >> $GITHUB_ENV
+        run: printf "${{ matrix.backend == 'llvm' && 'DEV=CPU:LLVM' || matrix.backend == 'cpu' && 'DEV=CPU\nCPU_COUNT=2' || matrix.backend == 'opencl' && 'DEV=CL' || matrix.backend == 'lvp' && 'DEV=CPU:LVP' }}" >> $GITHUB_ENV
      - name: Check Device.DEFAULT and print some source
        run: |
          python3 -c "from tinygrad import Device; assert Device.DEFAULT in ['CPU','CL'], Device.DEFAULT"
@ -864,23 +863,20 @@ jobs:
      env:
        MOCKGPU: 1
        DEV: AMD
-        AMD_LLVM: 0
        FORWARD_ONLY: 1
      run: |
        python3 -m pytest -n=auto test/device/test_hcq.py test/test_tiny.py --durations=20
    - name: Run pytest (amd with llvm backend)
      env:
        MOCKGPU: 1
-        DEV: AMD
-        AMD_LLVM: 1
+        DEV: "AMD:LLVM"
        FORWARD_ONLY: 1
      run: |
        python -m pytest -n=auto test/device/test_hcq.py test/test_tiny.py test/device/test_amd_llvm.py --durations=20
    - name: Run pytest (ptx)
      env:
        MOCKGPU: 1
-        NV_PTX: 1
-        DEV: NV
+        DEV: "NV:PTX"
        FORWARD_ONLY: 1
        # TODO: failing due to library loading error
        CAPTURE_PROCESS_REPLAY: 0
@ -945,7 +941,7 @@ jobs:
          llvm: ${{ matrix.backend == 'llvm' || matrix.backend == 'lvp' }}
          mesa: ${{ matrix.backend == 'lvp' && 'true' }}
      - name: Set env
-        run: printf "${{ matrix.backend == 'llvm' && 'DEV=CPU\nCPU_LLVM=1' || matrix.backend == 'cpu' && 'DEV=CPU\nCPU_LLVM=0\nCPU_COUNT=2' || matrix.backend == 'metal' && 'DEV=METAL' || matrix.backend == 'lvp' && 'DEV=CPU\nCPU_LVP=1' }}" >> $GITHUB_ENV
+        run: printf "${{ matrix.backend == 'llvm' && 'DEV=CPU:LLVM' || matrix.backend == 'cpu' && 'DEV=CPU\nCPU_COUNT=2' || matrix.backend == 'metal' && 'DEV=METAL' || matrix.backend == 'lvp' && 'DEV=CPU:LVP' }}" >> $GITHUB_ENV
      - name: Check Device.DEFAULT and print some source
        run: |
          python -c "from tinygrad import Device; assert Device.DEFAULT == {'LLVM':'CPU','LVP':'CPU'}.get(x:='${{ matrix.backend }}'.upper(), x), Device.DEFAULT"
@ -980,7 +976,7 @@ jobs:
          pydeps: ${{ matrix.backend == 'webgpu' && 'dawn-python' || '' }}
      - name: Set env
        shell: bash
-        run:  printf "${{ matrix.backend == 'llvm' && 'DEV=CPU\nCPU_LLVM=1' || matrix.backend == 'cpu' && 'DEV=CPU\nCPU_LLVM=0\nCPU_COUNT=2' || matrix.backend == 'webgpu' && 'DEV=WEBGPU'}}" >> $GITHUB_ENV
+        run:  printf "${{ matrix.backend == 'llvm' && 'DEV=CPU:LLVM' || matrix.backend == 'cpu' && 'DEV=CPU\nCPU_COUNT=2' || matrix.backend == 'webgpu' && 'DEV=WEBGPU'}}" >> $GITHUB_ENV
      - name: Run unit tests
        if: matrix.backend=='llvm'
        # test_newton_schulz hits RecursionError
@ -988,7 +984,7 @@ jobs:
      - name: Run NULL backend tests
        if: matrix.backend=='llvm'
        shell: bash
-        run: CPU=0 CPU_LLVM=0 DEV=NULL python -m pytest -n=auto test/null/ --ignore=test/null/test_elf.py --durations=20
+        run: DEV=NULL python -m pytest -n=auto test/null/ --ignore=test/null/test_elf.py --durations=20
      - name: Run pytest (${{ matrix.backend }})
        shell: bash
        run: |
@ -1017,7 +1013,7 @@ jobs:
          python-version: '3.12'
      - name: Set env
        shell: bash
-        run: printf "DEV=NULL\nNULL_ALLOW_COPYOUT=1\n${{ matrix.backend == 'ir3' && 'NULL_IR3=1' || matrix.backend == 'nak' && 'NULL_NAK=1' }}" >> $GITHUB_ENV
+        run: printf "NULL_ALLOW_COPYOUT=1\n${{ matrix.backend == 'ir3' && 'DEV=NULL:IR3' || matrix.backend == 'nak' && 'DEV=NULL:NAK' }}" >> $GITHUB_ENV
      - name: Run test_ops
        shell: bash
        run: |
@ -1040,7 +1036,7 @@ jobs:
          python-version: '3.12'
      - name: Set env
        shell: bash
-        run: printf "DEV=NULL\nNULL_ALLOW_COPYOUT=1\nNULL_QCOMCL=1" >> $GITHUB_ENV
+        run: printf "DEV=NULL:QCOMCL\nNULL_ALLOW_COPYOUT=1" >> $GITHUB_ENV
      - name: Run test_ops
        shell: bash
        run: |
--- a/docs/env_vars.md
+++ b/docs/env_vars.md
@ -28,6 +28,13 @@ The columns of this list are are: Variable, Possible Value(s) and Description.

 These control the behavior of core tinygrad even when used as a library.

+### DEV variable
+
+The `DEV` variable deserves special note due to its more nuanced syntax.
+`DEV` specifies the target device and, optionally, the renderer to use for that device, separated by a colon.
+The renderer is optional; if it is omitted, tinygrad automatically selects a renderer from those
+available on the system. Example values for `DEV` include `AMD`, `AMD:LLVM`, and `NV:PTX`.
+
 Variable | Possible Value(s) | Description
 ---|---|---
 DEBUG               | [1-7]      | enable debugging output (operations, timings, speed, generated code and more)
--- a/docs/runtime.md
+++ b/docs/runtime.md
@ -4,13 +4,13 @@ tinygrad supports various runtimes, enabling your code to scale across a wide ra

 | Runtime | Description | Compiler Options | Requirements |
 |---------|-------------|------------------|--------------|
-| [NV](https://github.com/tinygrad/tinygrad/tree/master/tinygrad/runtime/ops_nv.py) | Provides acceleration for NVIDIA GPUs | nvrtc (default)<br>PTX (`NV_PTX=1`) | Ampere/Ada/Blackwell series GPUs.<br>You can select an interface via `NV_IFACE=(NVK\|PCI)`. See [NV interfaces](#nv-interfaces) for details. |
-| [AMD](https://github.com/tinygrad/tinygrad/tree/master/tinygrad/runtime/ops_amd.py) | Provides acceleration for AMD GPUs | LLVM (`AMD_LLVM=1`)<br>HIP/COMGR (`AMD_HIP=1`) | RDNA2 or newer GPUs.<br>You can select an interface via `AMD_IFACE=(KFD\|PCI\|USB)`. See [AMD interfaces](#amd-interfaces) for details. |
+| [NV](https://github.com/tinygrad/tinygrad/tree/master/tinygrad/runtime/ops_nv.py) | Provides acceleration for NVIDIA GPUs | nvrtc (default)<br>PTX (`DEV=NV:PTX`) | Ampere/Ada/Blackwell series GPUs.<br>You can select an interface via `NV_IFACE=(NVK\|PCI)`. See [NV interfaces](#nv-interfaces) for details. |
+| [AMD](https://github.com/tinygrad/tinygrad/tree/master/tinygrad/runtime/ops_amd.py) | Provides acceleration for AMD GPUs | LLVM (`DEV=AMD:LLVM`)<br>HIP/COMGR (`DEV=AMD:HIP`) | RDNA2 or newer GPUs.<br>You can select an interface via `AMD_IFACE=(KFD\|PCI\|USB)`. See [AMD interfaces](#amd-interfaces) for details. |
 | [QCOM](https://github.com/tinygrad/tinygrad/tree/master/tinygrad/runtime/ops_qcom.py) | Provides acceleration for QCOM GPUs | - | 6xx series GPUs |
 | [METAL](https://github.com/tinygrad/tinygrad/tree/master/tinygrad/runtime/ops_metal.py) | Utilizes Metal for acceleration on Apple devices | - | M1+ Macs; Metal 3.0+ for `bfloat` support |
-| [CUDA](https://github.com/tinygrad/tinygrad/tree/master/tinygrad/runtime/ops_cuda.py) | Utilizes CUDA for acceleration on NVIDIA GPUs | nvrtc (default)<br> PTX (`CUDA_PTX=1`) | NVIDIA GPU with CUDA support |
+| [CUDA](https://github.com/tinygrad/tinygrad/tree/master/tinygrad/runtime/ops_cuda.py) | Utilizes CUDA for acceleration on NVIDIA GPUs | nvrtc (default)<br> PTX (`DEV=CUDA:PTX`) | NVIDIA GPU with CUDA support |
 | [CL](https://github.com/tinygrad/tinygrad/tree/master/tinygrad/runtime/ops_cl.py) | Accelerates computations using OpenCL on GPUs | - | OpenCL 2.0 compatible device |
-| [CPU](https://github.com/tinygrad/tinygrad/tree/master/tinygrad/runtime/ops_cpu.py) | Runs on CPU using the clang or llvm compiler | Clang JIT (default)<br>LLVM IR (`CPU_LLVM=1`) | `clang` compiler in system `PATH` |
+| [CPU](https://github.com/tinygrad/tinygrad/tree/master/tinygrad/runtime/ops_cpu.py) | Runs on CPU using the clang or llvm compiler | Clang JIT (default)<br>LLVM IR (`DEV=CPU:LLVM`) | `clang` compiler in system `PATH` |
 | [WEBGPU](https://github.com/tinygrad/tinygrad/tree/master/tinygrad/runtime/ops_webgpu.py) | Runs on GPU using the Dawn WebGPU engine (used in Google Chrome) | - | Dawn library installed and discoverable. Binaries: [pydawn v0.3.0](https://github.com/wpmed92/pydawn/releases/tag/v0.3.0) |


--- a/extra/remu/README.md
+++ b/extra/remu/README.md
@ -22,7 +22,7 @@ This will produce a binary in the `extra/remu/target/release` directory.

 The latest binaries are released in https://github.com/Qazalin/remu/releases. Alternatively, you can [build locally](#build-locally).

-Tinygrad does not yet output RDNA3 kernels directly. You can either install comgr or use `AMD_LLVM=1` (default) if you have [LLVM@19](https://github.com/tinygrad/tinygrad/blob/e2ed673c946c8f1774d816c75e52a994c2dd8a88/.github/actions/setup-tinygrad/action.yml#L208).
+Tinygrad does not yet output RDNA3 kernels directly. You can either install comgr or use `DEV=AMD:LLVM` (default) if you have [LLVM@19](https://github.com/tinygrad/tinygrad/blob/e2ed673c946c8f1774d816c75e52a994c2dd8a88/.github/actions/setup-tinygrad/action.yml#L208).

 `PYTHONPATH="." MOCKGPU=1 DEV=AMD python test/test_tiny.py TestTiny.test_plus` runs an emulated RDNA3 kernel with Remu.

--- a/test/amd/hw/test_vop3.py
+++ b/test/amd/hw/test_vop3.py
@ -83,7 +83,7 @@ class TestFmacE64(unittest.TestCase):
    self.assertAlmostEqual(i2f(st.vgpr[0][2]), 7.0, places=5)

  def test_v_fmac_f32_e64_with_sgpr_sources(self):
-    """V_FMAC_F32_E64 with SGPR sources (common in AMD_LLVM output).
+    """V_FMAC_F32_E64 with SGPR sources (common in AMD:LLVM output).

    This tests the exact pattern that was failing: v_fmac_f32_e64(v[0], s[4], 0)
    where src0 is SGPR and src1 is inline constant 0.
@ -936,7 +936,7 @@ class TestF16Modifiers(unittest.TestCase):
  def test_v_fmac_f16_hi_dest(self):
    """v_fmac_f16 with .h destination: dst.h = src0 * src1 + dst.h.

-    This tests the case from AMD_LLVM sin(0) where V_FMAC_F16 writes to v0.h.
+    This tests the case from AMD:LLVM sin(0) where V_FMAC_F16 writes to v0.h.
    """
    instructions = [
      s_mov_b32(s[0], 0x38003c00),  # v0 = {hi=0.5, lo=1.0}
--- a/test/amd/test_compare_emulators.py
+++ b/test/amd/test_compare_emulators.py
@ -504,7 +504,7 @@ class TestTinygradKernels(unittest.TestCase):
    x_np = np.random.uniform(-2, 2, (33,)).astype(np.float32)
    self._test_kernel(lambda T: (T(x_np.tolist()) > 0.5).unsqueeze(-1).expand(33, 3).flatten().sum())

-  @unittest.skip("slow and broken with AMD_LLVM=1")
+  @unittest.skip("slow and broken with AMD:LLVM")
  def test_nonzero(self):
    """Test nonzero operation - counts and gathers indices of non-zero elements."""
    import numpy as np
--- a/test/amd/test_roundtrip.py
+++ b/test/amd/test_roundtrip.py
@ -60,11 +60,11 @@ class TestTinygradKernelRoundtrip(unittest.TestCase):
    from test.amd.test_compare_emulators import get_kernels_from_tinygrad
    from tinygrad.runtime.support.elf import elf_loader
    from tinygrad.runtime.support.compiler_amd import HIPCompiler, AMDLLVMCompiler
-    from tinygrad.helpers import AMD_LLVM
+    from tinygrad.helpers import DEV

    kernels, _, _ = get_kernels_from_tinygrad(op_fn)
    # rendered source can be C or llvmir
-    compiler = (AMDLLVMCompiler if AMD_LLVM else HIPCompiler)(get_target(arch))
+    compiler = (AMDLLVMCompiler if DEV.renderer == "LLVM" else HIPCompiler)(get_target(arch))

    # First pass: decode all instructions and collect info
    decoded_instrs: list[tuple] = []  # list of (ki, offset, orig_bytes, decoded, our_disasm, decode_ok, decode_err)
--- a/test/backend/test_ops.py
+++ b/test/backend/test_ops.py
@ -2,7 +2,7 @@ import time, math, unittest, functools, platform, warnings
 import numpy as np
 from typing import List, Callable
 import torch
-from tinygrad.helpers import getenv, IMAGE, DEBUG, CI, Context, CPU_LLVM, AMD_LLVM, EMULATE
+from tinygrad.helpers import getenv, CI, DEBUG, DEV, EMULATE, IMAGE, Context
 from tinygrad import Tensor, Device, dtypes
 from tinygrad.tensor import _to_np_dtype
 from tinygrad.device import is_dtype_supported
@ -560,7 +560,7 @@ class TestOps(unittest.TestCase):
    helper_test_op([(45,65), (45,65)], lambda x,y: x/y)
    helper_test_op([(), ()], lambda x,y: x/y)

-  @unittest.skipIf(Device.DEFAULT == "AMD" and AMD_LLVM, "AMD with LLVM backend generate rcp in FP division causes trunc/floor errors")
+  @unittest.skipIf(Device.DEFAULT == "AMD" and DEV.renderer == "LLVM", "AMD with LLVM backend generate rcp in FP division causes trunc/floor errors")
  def test_div_rounding_mode(self):
    for denominator in [-10, -5, -3, -2, -1, 1, 2, 3, 5, 10]:
      # int numerator
@ -843,7 +843,7 @@ class TestOps(unittest.TestCase):
    self.assertEqual(a, b)
    self.assertEqual(Tensor(-1).contiguous().idiv(4).item(), 0)  # NOTE this is trunc-div behaviour

-  @unittest.skipIf(getenv("NV_NAK"), "MUFU.SIN is not accurate enough")
+  @unittest.skipIf(DEV.renderer == "NAK", "MUFU.SIN is not accurate enough")
  def test_sin(self):
    helper_test_op([(45,65)], lambda x: x.sin())
    helper_test_op([()], lambda x: x.sin())
@ -853,7 +853,7 @@ class TestOps(unittest.TestCase):
      helper_test_op(None, lambda x: x.sin(), vals=[[1e1, 1e2, 1e3, 1e4, 1e5, 1e6, -1e1, -1e2, -1e3, -1e4, -1e5, -1e6]],
                    atol=3e-3, rtol=3e-3, grad_atol=3e-3, grad_rtol=3e-3)
  @unittest.skipIf(Device.DEFAULT == "WEBGPU" and platform.system() == "Windows", "Not accurate enough with DirectX backend")
-  @unittest.skipIf(getenv("NV_NAK"), "MUFU.SIN is not accurate enough")
+  @unittest.skipIf(DEV.renderer == "NAK", "MUFU.SIN is not accurate enough")
  def test_cos(self):
    helper_test_op([(45,65)], lambda x: x.cos())
    helper_test_op([()], lambda x: x.cos())
@ -862,7 +862,7 @@ class TestOps(unittest.TestCase):
      helper_test_op(None, lambda x: x.cos(), vals=[[1e1, 1e2, 1e3, 1e4, 1e5, 1e6, -1e1, -1e2, -1e3, -1e4, -1e5, -1e6]],
                    atol=3e-3, rtol=3e-3, grad_atol=3e-3, grad_rtol=3e-3)
  @unittest.skipIf(Device.DEFAULT == "WEBGPU" and platform.system() == "Windows", "Not accurate enough with DirectX backend")
-  @unittest.skipIf(getenv("NV_NAK"), "MUFU.SIN is not accurate enough")
+  @unittest.skipIf(DEV.renderer == "NAK", "MUFU.SIN is not accurate enough")
  def test_tan(self):
    # NOTE: backward has much higher diff with input close to pi/2 and -pi/2
    helper_test_op([(45,65)], lambda x: x.tan(), low=-1.5, high=1.5)
@ -1398,7 +1398,7 @@ class TestOps(unittest.TestCase):
                                                                         np.arange(64,128,dtype=np.float32).reshape(8,8)])
  def test_small_gemm_eye(self):
    helper_test_op(None, lambda x,y: x.matmul(y), lambda x,y: x@y, vals=[np.eye(8).astype(np.float32), np.eye(8).astype(np.float32)])
-  @unittest.skipIf(CI and Device.DEFAULT in ["NV", "CL", "CUDA"] or (Device.DEFAULT == "CPU" and CPU_LLVM) or IMAGE
+  @unittest.skipIf(CI and Device.DEFAULT in ["NV", "CL", "CUDA"] or (Device.DEFAULT == "CPU" and DEV.renderer == "LLVM") or IMAGE
  or (Device.DEFAULT == "WEBGPU" and platform.system() == "Windows"), "not supported on these in CI/IMAGE")
  @unittest.skipIf(Device.DEFAULT == "QCOM", "not precise enough")
  def test_gemm_fp16(self):
@ -2396,7 +2396,7 @@ class TestOps(unittest.TestCase):
      lambda x,w: torch.nn.functional.conv2d(x,w,stride=2),
      lambda x,w: Tensor.conv2d(x,w,stride=2))

-  @unittest.skipUnless(Device.DEFAULT == "CPU" and CPU_LLVM, "DEVECTORIZE=0 only for LLVM")
+  @unittest.skipUnless(Device.DEFAULT == "CPU" and DEV.renderer == "LLVM", "DEVECTORIZE=0 only for LLVM")
  def test_strided_conv2d_simple_vec(self):
    with Context(DEVECTORIZE=0): self.test_strided_conv2d_simple()

@ -2715,9 +2715,9 @@ class TestOps(unittest.TestCase):
      lambda x: Tensor.avg_pool2d(x, kernel_size=(11,28)), rtol=1e-5)

  def test_avg_pool3d(self):
-    # TODO: AMD_LLVM has larger atol
+    # TODO: AMD:LLVM has larger atol
    # TODO: DEV=PYTHON backward hangs?
-    atol = 1e-2 if AMD_LLVM else 1e-6
+    atol = 1e-2 if DEV.device == "AMD" and DEV.renderer == "LLVM" else 1e-6
    helper_test_op([(1,1,16,16,16)],
      lambda x: torch.nn.functional.avg_pool3d(x, kernel_size=(8,8,8), stride=5, padding=1, count_include_pad=False),
      lambda x: Tensor.avg_pool2d(x, kernel_size=(8,8,8), stride=5, padding=1, count_include_pad=False), atol=atol, rtol=1e-5, forward_only=True)
--- a/test/backend/test_transcendental.py
+++ b/test/backend/test_transcendental.py
@ -1,7 +1,7 @@
 import unittest
 from tinygrad import Tensor, Device, dtypes
 from tinygrad.tensor import _to_np_dtype
-from tinygrad.helpers import Context, getenv, CI, OSX
+from tinygrad.helpers import Context, getenv, CI, DEV, OSX
 from test.backend.test_schedule import check_schedule
 from test.backend.test_dtype_alu import ht, dtypes_float
 from tinygrad.device import is_dtype_supported
@ -188,7 +188,7 @@ class TestTranscendentalVectorized(unittest.TestCase):
    for vec_size in [1,2,3,4,5,127,128]: self._test_vectorized_op(Tensor.log2, np.log2, (0.001, 200), vec_size)

  @unittest.skipIf(Device.DEFAULT == "DSP", "requires int division")
-  @unittest.skipIf(getenv("NV_NAK"), "MUFU.SIN is not accurate enough")
+  @unittest.skipIf(DEV.renderer == "NAK", "MUFU.SIN is not accurate enough")
  @unittest.skipIf(Device.DEFAULT == "WEBGPU" and OSX, "WEBGPU Metal backend is not accurate enough")
  def test_sin_vectorized(self):
    for vec_size in [1,2,3,4,5,127,128]: self._test_vectorized_op(Tensor.sin, np.sin, (-100, 100), vec_size)
--- a/test/external/mlperf_stable_diffusion/external_test_eval.py
+++ b/test/external/mlperf_stable_diffusion/external_test_eval.py
@ -11,8 +11,7 @@ from examples.stable_diffusion import AutoencoderKL
 def set_eval_params():
  # override these as needed from cli
  for k,v in {"MODEL": "stable_diffusion", "GPUS": "8", "EVAL_SAMPLES": "600", "CONTEXT_BS": "816", "DENOISE_BS": "600", "DECODE_BS": "384",
-   "INCEPTION_BS": "560", "CLIP_BS": "240", "DATADIR": "/raid/datasets/stable_diffusion", "CKPTDIR": "/raid/weights/stable_diffusion",
-   "AMD_LLVM": "0"}.items():
+   "INCEPTION_BS": "560", "CLIP_BS": "240", "DATADIR": "/raid/datasets/stable_diffusion", "CKPTDIR": "/raid/weights/stable_diffusion"}.items():
    os.environ[k] = getenv(k, v)

 class TestEval(unittest.TestCase):
--- a/test/mockgpu/amd/README
+++ b/test/mockgpu/amd/README
@ -1,7 +1,7 @@
 An integrated environment for AMD GPU assembly and emulation

 Test with `pytest -n12 test/amd/`
-`AMD_LLVM=1 pytest -n12 test/amd/`
+`DEV=AMD:LLVM pytest -n12 test/amd/`

 * dsl.py -- helpers for the autogen instruction classes in `__init__.py`. should be standalone with init
 * test/mockgpu/amd/emu.py -- an emulator for RDNA that runs in tinygrad with `DEV=AMD MOCKGPU=1 PYTHON_REMU=1`
@ -20,13 +20,13 @@ test_llvm.py tests asm/disasm on the LLVM tests, confirming it behaves the same

 tinygrad's dtype tests should pass with and without LLVM. they run in about 12 seconds.

-`DEV=AMD PYTHON_REMU=1 MOCKGPU=1 AMD_LLVM=0 pytest -n=12 test/backend/test_dtype_alu.py test/backend/test_dtype.py`
-`DEV=AMD PYTHON_REMU=1 MOCKGPU=1 AMD_LLVM=1 pytest -n=12 test/backend/test_dtype_alu.py test/backend/test_dtype.py`
+`DEV=AMD PYTHON_REMU=1 MOCKGPU=1 pytest -n=12 test/backend/test_dtype_alu.py test/backend/test_dtype.py`
+`DEV=AMD:LLVM PYTHON_REMU=1 MOCKGPU=1 pytest -n=12 test/backend/test_dtype_alu.py test/backend/test_dtype.py`

 The ops tests also pass, but they are very slow, so you should run them one at a time.

-`SKIP_SLOW_TEST=1 DEV=AMD PYTHON_REMU=1 MOCKGPU=1 AMD_LLVM=0 pytest -n=12 test/backend/test_ops.py`
-`SKIP_SLOW_TEST=1 DEV=AMD PYTHON_REMU=1 MOCKGPU=1 AMD_LLVM=1 pytest -n=12 test/backend/test_ops.py`
+`SKIP_SLOW_TEST=1 DEV=AMD PYTHON_REMU=1 MOCKGPU=1 pytest -n=12 test/backend/test_ops.py`
+`SKIP_SLOW_TEST=1 DEV=AMD:LLVM PYTHON_REMU=1 MOCKGPU=1 pytest -n=12 test/backend/test_ops.py`

 When something is caught by main tinygrad tests, a local regression test should be added to `test/amd`.
 While working with tinygrad, you can dump the assembly with `DEBUG=7`. These tests all pass on real hardware
--- a/test/null/test_compile_failures.py
+++ b/test/null/test_compile_failures.py
@ -1,7 +1,7 @@
 import unittest, io
 from contextlib import redirect_stdout
 from tinygrad import Tensor, dtypes, Device
-from tinygrad.helpers import OSX, CPU_LLVM, CPU_LVP
+from tinygrad.helpers import OSX, DEV
 from tinygrad.device import is_dtype_supported
 from tinygrad.engine.realize import get_program

@ -18,7 +18,7 @@ class TestCompileFailures(unittest.TestCase):

 class TestDisassembly(unittest.TestCase):
  # TODO: fails on llvm. llvm.LLVMGetHostCPUName() returns "generic"
-  @unittest.skipUnless(Device.DEFAULT in ("CPU",) and not (CPU_LLVM or CPU_LVP) and OSX, "m series cpus support fp16 arithmetic")
+  @unittest.skipUnless(Device.DEFAULT in ("CPU",) and DEV.renderer not in ("LLVM", "LVP") and OSX, "m series cpus support fp16 arithmetic")
  def test_float16_alu(self):
    c = Tensor([1], dtype=dtypes.float16) + Tensor([1], dtype=dtypes.float16)
    s = c.schedule()[-1]
--- a/test/null/test_device.py
+++ b/test/null/test_device.py
@ -3,7 +3,7 @@ import unittest, os, subprocess
 from unittest.mock import patch
 from tinygrad import Tensor
 from tinygrad.device import Device, Compiler, enumerate_devices_str
-from tinygrad.helpers import diskcache_get, diskcache_put, getenv, Context, WIN, CI, OSX
+from tinygrad.helpers import diskcache_get, diskcache_put, getenv, Context, Target, WIN, CI, OSX, DEV
 from tinygrad.runtime.support.c import DLL

 class TestDevice(unittest.TestCase):
@ -39,6 +39,12 @@ class TestDevice(unittest.TestCase):
    self.assertNotEqual(result.returncode, 0)
    self.assertIn(b"deprecated", result.stderr)

+  def test_old_renderer_env_raises(self):  # legacy CPU_LLVM=1 in the env must make renderer lookup exit non-zero and print "deprecated"
+    child_env = {**os.environ, "DEV": "CPU", "CPU_LLVM": "1"}
+    proc = subprocess.run(['python3', '-c', 'from tinygrad import Device; Device[Device.DEFAULT].renderer'], env=child_env, capture_output=True)
+    self.assertNotEqual(proc.returncode, 0)
+    self.assertIn(b"deprecated", proc.stderr)
+
  @unittest.skipIf(WIN and CI, "skipping windows test") # TODO: subprocess causes memory violation?
  def test_env_overwrite_default_compiler(self):
    if Device.DEFAULT == "CPU":
@ -48,13 +54,11 @@ class TestDevice(unittest.TestCase):

      imports = "from tinygrad import Device; from tinygrad.runtime.support.compiler_cpu import CPULLVMCompiler, ClangJITCompiler"
      subprocess.run([f'python3 -c "{imports}; assert isinstance(Device[Device.DEFAULT].compiler, CPULLVMCompiler)"'],
-                        shell=True, check=True, env={**os.environ, "DEV": "CPU", "CPU_LLVM": "1"})
+                        shell=True, check=True, env={**os.environ, "DEV": "CPU:LLVM"})
      subprocess.run([f'python3 -c "{imports}; assert isinstance(Device[Device.DEFAULT].compiler, ClangJITCompiler)"'],
-                        shell=True, check=True, env={**os.environ, "DEV": "CPU", "CPU_LLVM": "0"})
-      subprocess.run([f'python3 -c "{imports}; assert isinstance(Device[Device.DEFAULT].compiler, CPULLVMCompiler)"'],
-                        shell=True, check=True, env={**os.environ, "DEV": "CPU", "CPU_CC": "LLVM"})
+                        shell=True, check=True, env={**os.environ, "DEV": "CPU"})
      subprocess.run([f'python3 -c "{imports}; assert isinstance(Device[Device.DEFAULT].compiler, ClangJITCompiler)"'],
-                        shell=True, check=True, env={**os.environ, "DEV": "CPU", "CPU_CC": "CLANGJIT"})
+                        shell=True, check=True, env={**os.environ, "DEV": "CPU:CLANGJIT"})
    elif Device.DEFAULT == "AMD":
      from tinygrad.runtime.support.compiler_amd import HIPCompiler, AMDLLVMCompiler
      try: _, _ = HIPCompiler(Device[Device.DEFAULT].arch), AMDLLVMCompiler(Device[Device.DEFAULT].arch)
@ -62,27 +66,25 @@ class TestDevice(unittest.TestCase):

      imports = "from tinygrad import Device; from tinygrad.runtime.support.compiler_amd import HIPCompiler, AMDLLVMCompiler"
      subprocess.run([f'python3 -c "{imports}; assert isinstance(Device[Device.DEFAULT].compiler, AMDLLVMCompiler)"'],
-                        shell=True, check=True, env={**os.environ, "DEV": "AMD", "AMD_LLVM": "1"})
+                        shell=True, check=True, env={**os.environ, "DEV": "AMD:LLVM"})
      subprocess.run([f'python3 -c "{imports}; assert isinstance(Device[Device.DEFAULT].compiler, HIPCompiler)"'],
-                        shell=True, check=True, env={**os.environ, "DEV": "AMD", "AMD_LLVM": "0"})
-      subprocess.run([f'python3 -c "{imports}; assert isinstance(Device[Device.DEFAULT].compiler, AMDLLVMCompiler)"'],
-                        shell=True, check=True, env={**os.environ, "DEV": "AMD", "AMD_CC": "LLVM"})
+                        shell=True, check=True, env={**os.environ, "DEV": "AMD"})
      subprocess.run([f'python3 -c "{imports}; assert isinstance(Device[Device.DEFAULT].compiler, HIPCompiler)"'],
-                        shell=True, check=True, env={**os.environ, "DEV": "AMD", "AMD_CC": "HIP"})
+                        shell=True, check=True, env={**os.environ, "DEV": "AMD:HIP"})
    else: self.skipTest("only run on CPU/AMD")

-  @unittest.skipIf((WIN and CI) or (not Device.DEFAULT == "CPU"), "skipping windows test")
+  @unittest.skipIf(WIN and CI, "skipping windows test")
  def test_env_online(self):
    from tinygrad.runtime.support.compiler_cpu import CPULLVMCompiler, ClangJITCompiler
    try: _, _ = CPULLVMCompiler(), ClangJITCompiler()
    except Exception as e: self.skipTest(f"skipping compiler test: not all compilers: {e}")

-    with Context(CPU_LLVM=1):
+    with Context(DEV="CPU:LLVM"):
      inst = Device["CPU"].compiler
      self.assertIsInstance(Device["CPU"].compiler, CPULLVMCompiler)
-    with Context(CPU_LLVM=0):
+    with Context(DEV="CPU"):
      self.assertIsInstance(Device["CPU"].compiler, ClangJITCompiler)
-    with Context(CPU_LLVM=1):
+    with Context(DEV="CPU:LLVM"):
      self.assertIsInstance(Device["CPU"].compiler, CPULLVMCompiler)
      assert inst is Device["CPU"].compiler  # cached

@ -94,7 +96,7 @@ class TestDevice(unittest.TestCase):
    except Exception as e: self.skipTest(f"skipping: LLVM not available: {e}")

    dev = Device["CPU"]
-    dev.cached_pair.clear()
+    dev.cached_renderer.clear()
    with patch("tinygrad.renderer.cstyle.ClangJITRenderer.__init__", side_effect=RuntimeError("broken")):
      self.assertIsInstance(dev.renderer.compiler, CPULLVMCompiler)

@ -104,6 +106,21 @@ class TestDevice(unittest.TestCase):
    with Context(DEV="NULL"): self.assertEqual(Tensor.empty(1).device, "NULL")
    self.assertEqual(Tensor.empty(1).device, orig_dev)

+class TestDevVar(unittest.TestCase):
+  def test_parse(self):
+    # each DEV string must parse into the expected Target and print back as the same string
+    cases = {"AMD": Target(device="AMD", renderer=""), "AMD:LLVM": Target(device="AMD", renderer="LLVM"), ":LLVM": Target("", "LLVM")}
+    for dev_str, expected in cases.items():
+      with Context(DEV=dev_str):
+        self.assertEqual(DEV.value, expected)
+        self.assertEqual(str(DEV.value), dev_str)
+
+  def test_target(self):
+    # (DEV setting, what DEV.target("CPU") must resolve to); a DEV naming another device must not leak its renderer
+    for dev_str, expected in [("CPU", Target("CPU")), ("CPU:LLVM", Target("CPU", "LLVM")), (":LLVM", Target("CPU", "LLVM")),
+                              ("AMD:LLVM", Target("CPU")), ("", Target("CPU"))]:
+      with Context(DEV=dev_str): self.assertEqual(DEV.target("CPU"), expected)
+
 class MockCompiler(Compiler):
  def __init__(self, key): super().__init__(key)
  def compile(self, src) -> bytes: return src.encode()
--- a/test/opt/test_tensor_cores.py
+++ b/test/opt/test_tensor_cores.py
@ -7,7 +7,7 @@ from tinygrad.tensor import _to_np_dtype
 from tinygrad.uop.ops import Ops
 from tinygrad.dtype import DType
 from tinygrad.device import is_dtype_supported
-from tinygrad.helpers import AMX, AMD_LLVM, CPU_LLVM, Context
+from tinygrad.helpers import AMX, DEV, Context
 from test.helpers import slow
 from tinygrad.engine.realize import CompiledRunner, get_program
 from tinygrad.codegen.opt import Opt, OptOps, KernelOptError
@ -75,9 +75,9 @@ class TestTensorCores(unittest.TestCase):
      a, b = Tensor.rand(m, k, dtype=tc.dtype_in), Tensor.rand(k, n, dtype=tc.dtype_in)
      r = a.matmul(b, dtype=tc.dtype_out)
      prg = get_program(r.schedule()[-1].ast, Device[Device.DEFAULT].renderer, opts=[Opt(op=OptOps.TC, axis=0, arg=(-1, 2, 1))])
-      if Device.DEFAULT == "CPU" and CPU_LLVM:
+      if Device.DEFAULT == "CPU" and DEV.renderer == "LLVM":
        assert "0x201000" in prg.src
-      elif Device.DEFAULT == "AMD" and AMD_LLVM:
+      elif Device.DEFAULT == "AMD" and DEV.renderer == "LLVM":
        assert "@llvm.amdgcn.wmma" in prg.src
      elif Device[Device.DEFAULT].renderer.suffix == "PTX":
        assert "mma.sync.aligned" in prg.src
--- a/test/unit/test_assign.py
+++ b/test/unit/test_assign.py
@ -4,7 +4,7 @@ import numpy as np
 from tinygrad import dtypes, Tensor, TinyJit, GlobalCounters, Variable
 from tinygrad.uop.ops import Ops
 from tinygrad.device import is_dtype_supported
-from tinygrad.helpers import temp, CI, CPU_LVP, Context
+from tinygrad.helpers import temp, CI, DEV, Context

 N = 200  # has to be bigger than the cache to fail

@ -193,7 +193,7 @@ class TestAssign(unittest.TestCase):
    new = a + times_a
    np.testing.assert_allclose(new.numpy(), 8)

-  @unittest.skipIf(CI and CPU_LVP, "flaky in CI")
+  @unittest.skipIf(CI and DEV.renderer == "LVP", "flaky in CI")
  def test_double_assign(self):
    a = Tensor.ones(4).contiguous().realize()
    a += 1
--- a/tinygrad/device.py
+++ b/tinygrad/device.py
@ -4,9 +4,8 @@ from collections import defaultdict
 from typing import Any, Generic, TypeVar, Iterator, Generator, TYPE_CHECKING
 import importlib, inspect, functools, pathlib, os, platform, contextlib, sys, re, atexit, pickle, decimal
 from tinygrad.helpers import BENCHMARKS, CI, OSX, LRU, getenv, diskcache_get, diskcache_put, DEBUG, GlobalCounters, flat_mv, PROFILE, temp, colored
-from tinygrad.helpers import Context, CCACHE, ALLOW_DEVICE_USAGE, MAX_BUFFER_SIZE, cpu_events, ProfileEvent, ProfilePointEvent, ContextVar
-from tinygrad.helpers import unwrap_class_type, suppress_finalizing, select_first_inited, DEV, VIZ, CPU_LLVM, CPU_LVP, NV_PTX, CUDA_PTX, NV_NAK
-from tinygrad.helpers import EMULATE, EMULATED_DTYPES, NULL_IR3, NULL_QCOMCL, IMAGE, FLOAT16, TracingKey, size_to_str
+from tinygrad.helpers import Context, CCACHE, ALLOW_DEVICE_USAGE, MAX_BUFFER_SIZE, cpu_events, ProfileEvent, ProfilePointEvent, unwrap_class_type
+from tinygrad.helpers import suppress_finalizing, select_first_inited, DEV, VIZ, EMULATE, EMULATED_DTYPES, IMAGE, FLOAT16, TracingKey, size_to_str
 from tinygrad.dtype import DType, PtrDType, dtypes, _to_np_dtype
 if TYPE_CHECKING: from tinygrad.renderer import Renderer

@ -40,7 +39,7 @@ class _Device:
    for device in ALL_DEVICES:
      with contextlib.suppress(Exception): yield self[device].device
  @property
-  def DEFAULT(self) -> str: return DEV.value.upper() if DEV else self._select_device
+  def DEFAULT(self) -> str: return DEV.device or self._select_device
  @DEFAULT.setter
  def DEFAULT(self, v): raise AttributeError(f'setting Device.DEFAULT is deprecated, use "with Context(DEV={v!r})" or "DEV.value = {v!r}"')
  @functools.cached_property
@ -284,20 +283,11 @@ class Compiled:
  def _renderer_name(self, r:type[Renderer]|functools.partial) -> str:
    return unwrap_class_type(r).__name__.upper().removesuffix("RENDERER").removeprefix(devname:=self.device.split(':')[0].upper()) or devname

-  def _renderer_var(self, r:type[Renderer]|functools.partial) -> ContextVar|None:
-    return ContextVar._cache.get(f"{self.device}_{self._renderer_name(r)}", None)
-
  def _select_renderer(self) -> Renderer:
-    # select forced compiler from global env var.
-    forced_comps = set([r for r in self.renderers if self._renderer_name(r) == val] if
-                       (ctrl:=ContextVar._cache.get(f"{self.device}_CC", None)) is not None and (val:=ctrl.value) else [])
-
-    # add forced compilers from individual env vars (only if global env var is not set, as it takes precedence).
-    if not forced_comps: forced_comps |= set(r for r in self.renderers if (en:=self._renderer_var(r)) is not None and en.value == 1)
-    if len(forced_comps) > 1: raise RuntimeError(f"{self.device}: multiple compilers set in env {forced_comps}")
-
-    return select_first_inited(list(forced_comps) if len(forced_comps)>0 else self.renderers, f"No renderer for {self.device} is available",
-                               self.cached_renderer)
+    assert (rn:=next((self._renderer_name(r) for r in self.renderers if getenv(f"{self.device}_{self._renderer_name(r)}")), None)) is None, \
+      f"{self.device}_{rn}=1 is deprecated, use DEV={self.device}:{rn} or {self.device}_CC={rn} instead"
+    renderers = [r for r in self.renderers if self._renderer_name(r) == rn] if (rn:=DEV.target(self.device).renderer) else self.renderers
+    return select_first_inited(renderers, f"No renderer for {self.device} is available", self.cached_renderer)

  def synchronize(self):
    """
@ -320,37 +310,45 @@ class Compiled:
 # TODO: move this to each Device
 # this only tracks if the dtype is natively supported, it may be supported in the frontend using decomps
 def is_dtype_supported(dtype:DType, device:str|None=None, arch:str|None=None) -> bool:
-  if device is None: device = Device.DEFAULT
+  target = DEV.target(device or Device.DEFAULT)
  if dtype == dtypes.bfloat16:
-    if device == "METAL": return not CI or BENCHMARKS
-    if device == "CUDA": return (not CI or BENCHMARKS) and not CUDA_PTX
-    if device == "NV": return (not CI or BENCHMARKS) and not NV_PTX and not NV_NAK
-    if device in {"CPU"}: return (not CI or BENCHMARKS) and platform.machine() in {"arm", "arm64", "aarch64", "x86_64", "amd64"} and not CPU_LVP
-    return device in {"AMD", "CL", "PYTHON", "NULL"}
+    match target.device:
+      case "METAL": return not CI or BENCHMARKS
+      case "CUDA": return (not CI or BENCHMARKS) and target.renderer != "PTX"
+      case "NV": return (not CI or BENCHMARKS) and target.renderer not in ("PTX", "NAK")
+      case "CPU": return (not CI or BENCHMARKS) and platform.machine() in {"arm", "arm64", "aarch64", "x86_64", "amd64"} and target.renderer != "LVP"
+      case "AMD" | "CL" | "PYTHON" | "NULL": return True
+      case _: return False
  if dtype in dtypes.fp8_ocp:
-    if device == "CUDA": return (not CI or BENCHMARKS) and not CUDA_PTX
-    if device == "NV": return (not CI or BENCHMARKS) and not NV_PTX and not NV_NAK
-    if device == "AMD":
-      # TODO: open the device to get arch of device, will be fixed after triple is in the device string
-      if arch is None: arch = getattr(Device[device].renderer, "arch", "")
-      return (not CI or BENCHMARKS) and arch == "gfx950"
-    return device in {"PYTHON", "NULL"}
-  if dtype in dtypes.fp8_fnuz: return device in {"PYTHON", "NULL"}
-  if device == "WEBGPU": return dtype in [dtypes.bool, dtypes.char, dtypes.uchar, dtypes.short,
-                                          dtypes.ushort, dtypes.float, dtypes.int32, dtypes.uint32, dtypes.half]
+    match target.device:
+      case "CUDA": return (not CI or BENCHMARKS) and target.renderer != "PTX"
+      case "NV": return (not CI or BENCHMARKS) and target.renderer not in ("PTX", "NAK")
+      case "AMD":
+        # TODO: open the device to get arch of device, will be fixed after triple is in the device string
+        if arch is None: arch = getattr(Device[target.device].renderer, "arch", "")
+        return (not CI or BENCHMARKS) and arch == "gfx950"
+      case "PYTHON" | "NULL": return True
+      case _: return False
+  if dtype in dtypes.fp8_fnuz: return target.device in {"PYTHON", "NULL"}
+  if target.device == "WEBGPU": return dtype in [dtypes.bool, dtypes.char, dtypes.uchar, dtypes.short,
+                                                 dtypes.ushort, dtypes.float, dtypes.int32, dtypes.uint32, dtypes.half]
  # for CI GPU and OSX, cl_khr_fp16 isn't supported
  # for CI LLVM, it segfaults because it can't link to the casting function
  # CI CUDA architecture is sm_35 but we need at least sm_70 to run fp16 ALUs
  # PYTHON supports half memoryview in 3.12+ https://github.com/python/cpython/issues/90751
-  # double can't be bitcast to anything without long support
  if dtype == dtypes.half:
-    if device == "CL": return (not CI or BENCHMARKS) and not OSX
-    if device == "QCOM": return bool(IMAGE) and bool(FLOAT16) # QCOM compiler is flaky with half
-    if device in ["CUDA", "NV"]: return (not CI or BENCHMARKS) or "CUDA" in EMULATE.value
-    if device == "CPU" and CPU_LLVM: return OSX
-    if device == "PYTHON": return sys.version_info >= (3, 12)
-  if dtype == dtypes.float64: return (device not in {"METAL", "QCOM"} and not (OSX and device == "CL") and not NULL_IR3 and not NULL_QCOMCL
-                                      and dtypes.long not in EMULATED_DTYPES.tolist(dtypes))
+    match target.device:
+      case "CL": return (not CI or BENCHMARKS) and not OSX
+      case "QCOM": return bool(IMAGE) and bool(FLOAT16) # QCOM compiler is flaky with half
+      case "CUDA" | "NV": return (not CI or BENCHMARKS) or "CUDA" in EMULATE.value
+      case "CPU" if target.renderer == "LLVM": return OSX
+      case "PYTHON": return sys.version_info >= (3, 12)
+  if dtype == dtypes.float64:
+    match target.device:
+      case _ if dtypes.long in EMULATED_DTYPES.tolist(dtypes): return False # double can't be bitcast to anything without long support
+      case "CL": return not OSX
+      case "NULL": return target.renderer not in ("IR3", "QCOMCL")
+      case "METAL" | "QCOM": return False
  return True

 if PROFILE:
@ -373,23 +371,16 @@ def enumerate_devices_str() -> Generator[str, None, None]:
    compilers_results, any_works = [], False
    try:
      d = Device[device]
-      default_renderers, default_renderer = d.renderers, d.renderer
-      try:
-        for r in default_renderers:
-          d.renderers = [r]
-          try:
-            # d.renderer, d.compiler = r(), c()
-            with Context(CACHELEVEL=0, **({f"{device}_CC": d._renderer_name(r)} if (ctrl:=f"{device}_CC") in ContextVar._cache else {})):
-              test = (Tensor([1,2,3], device=device) * 2).tolist()
-            if test != [2,4,6]: raise ValueError(f"got {test} instead of [2, 4, 6]")
-            set_text = f'({ctrl}={d._renderer_name(r)} to make default)' if (ctrl:=f"{device}_CC") in ContextVar._cache else ''
-            default_text = '(default)' if type(default_renderer) is type(d.renderer) else set_text
-            compilers_results.append(f"{colored('+', 'green')} {d._renderer_name(r)} {default_text}")
-            any_works = True
-          except Exception as e: compilers_results.append(f"{colored('-', 'yellow')} {d._renderer_name(r)}: {e}")
-      finally:
-        # put the defaults back!
-        d.renderers = default_renderers
+      default_renderer = d.renderer
+      for r in d.renderers:
+        try:
+          # d.renderer, d.compiler = r(), c()
+          with Context(CACHELEVEL=0, DEV=f"{device}:{d._renderer_name(r)}"): test = (Tensor([1,2,3], device=device) * 2).tolist()
+          if test != [2,4,6]: raise ValueError(f"got {test} instead of [2, 4, 6]")
+          default_text = '(default)' if type(default_renderer) is type(d.renderer) else f'(DEV={device}:{d._renderer_name(r)} to make default)'
+          compilers_results.append(f"{colored('+', 'green')} {d._renderer_name(r)} {default_text}")
+          any_works = True
+        except Exception as e: compilers_results.append(f"{colored('-', 'yellow')} {d._renderer_name(r)}: {e}")
      result = (colored('PASS', 'green') if any_works else f"{colored('FAIL', 'yellow')}") + ''.join([f'\n{" "*16} {x}' for x in compilers_results])
    except Exception as e:
      result = f"{colored('FAIL', 'red')} {e}"
--- a/tinygrad/helpers.py
+++ b/tinygrad/helpers.py
@ -4,7 +4,7 @@ START_TIME = time.perf_counter()
 import os, functools, platform, re, contextlib, operator, hashlib, pickle, sqlite3, tempfile, pathlib, string, ctypes, sys, gzip, getpass, gc
 from collections import defaultdict
 import subprocess, shutil, math, types, copyreg, inspect, importlib, decimal, itertools
-from dataclasses import dataclass, field
+from dataclasses import dataclass, field, replace
 from typing import ClassVar, Iterable, Any, TypeVar, Callable, Sequence, TypeGuard, Iterator, Generic, Generator, cast, overload

 T = TypeVar("T")
@ -176,7 +176,32 @@ class ContextVar(Generic[T]):
    assert isinstance(self.value, str)
    return [getattr(obj, x) if obj else x for x in self.value.split(',') if x]

-DEV, DEBUG, BEAM, NOOPT = ContextVar("DEV", ""), ContextVar("DEBUG", 0), ContextVar("BEAM", 0), ContextVar("NOOPT", 0)
+@dataclass(frozen=True)
+class Target:  # immutable (device, renderer) pair; either field is "" when unset
+  device: str = ""
+  renderer: str = ""
+
+  @staticmethod  # builds a Target from "DEVICE[:RENDERER]", upper-casing each ':'-separated field
+  def parse(s:str) -> Target: return Target(*map(str.upper, s.split(':')))
+  def __repr__(self) -> str: return f"{self.device}:{self.renderer}" if self.renderer else self.device  # "DEVICE" or "DEVICE:RENDERER"
+
+class _DEV(ContextVar):  # the DEV variable: its value is stored as a Target, so strings like "AMD:LLVM" carry device and renderer
+  _value = Target()
+  @property
+  def value(self) -> Target: return self._value
+  @value.setter
+  def value(self, v:str|Target): self._value = v if isinstance(v, Target) else Target.parse(v)  # plain strings are parsed on assignment
+  def __getattr__(self, k): return getattr(self.value, k)  # attributes not found on the var (DEV.device, DEV.renderer) come from the Target
+  # target to use for device `dev`: DEV's own value when it names `dev` or names no device, otherwise a bare Target for `dev`
+  def target(self, dev:str) -> Target:
+    t = self.value if self.device == dev or not self.device else Target(device=dev)
+    # TODO: remove this once DEV supports secondary targets
+    if (cv:=ContextVar._cache.get(f"{dev}_CC", None)) is not None and cv.value:
+      assert not t.renderer, f"renderer set in DEV and {dev}_CC"
+      return replace(t, device=dev, renderer=cv.value.upper())  # t.device is "" when DEV names no device, so set it here as well
+    return replace(t, device=dev)
+
+DEV, DEBUG, BEAM, NOOPT = _DEV("DEV", ""), ContextVar("DEBUG", 0), ContextVar("BEAM", 0), ContextVar("NOOPT", 0)
 IMAGE, FLOAT16, OPENPILOT_HACKS = ContextVar("IMAGE", 0), ContextVar("FLOAT16", 0), ContextVar("OPENPILOT_HACKS", 0)
 JIT, JIT_BATCH_SIZE = ContextVar("JIT", 2 if OSX and ARCH_X86 else 1), ContextVar("JIT_BATCH_SIZE", 32)
 WINO, CAPTURING, TRACEMETA = ContextVar("WINO", 0), ContextVar("CAPTURING", 1), ContextVar("TRACEMETA", 1)
@ -193,13 +218,9 @@ EMULATE, EMULATED_DTYPES = ContextVar("EMULATE", ""), ContextVar("EMULATED_DTYPE
 CAPTURE_PROCESS_REPLAY = ContextVar("CAPTURE_PROCESS_REPLAY", 0)
 CPU_COUNT = ContextVar("CPU_COUNT", max(1, len(os.sched_getaffinity(0)) if hasattr(os, "sched_getaffinity") else (os.cpu_count() or 1)))
 # Compilers
-CPU_CC, CPU_LLVM, CPU_LVP = ContextVar("CPU_CC", ""), ContextVar("CPU_LLVM", 0), ContextVar("CPU_LVP", 0)
-NV_CC, NV_PTX, NV_NAK, NV_NVCC = ContextVar("NV_CC", ""), ContextVar("NV_PTX", 0), ContextVar("NV_NAK", 0), ContextVar("NV_NVCC", 0)
-CUDA_CC, CUDA_PTX, CUDA_NVCC = ContextVar("CUDA_CC", ""), ContextVar("CUDA_PTX", 0), ContextVar("CUDA_NVCC", 0)
-NULL_QCOMCL, NULL_IR3, NULL_NAK = ContextVar("NULL_QCOMCL", 0), ContextVar("NULL_IR3", 0), ContextVar("NULL_NAK", 0)
+CPU_CC, NV_CC, CUDA_CC, NULL_CC = ContextVar("CPU_CC", ""), ContextVar("NV_CC", ""), ContextVar("CUDA_CC", ""), ContextVar("NULL_CC", "")
 NULL_ALLOW_COPYOUT = ContextVar("NULL_ALLOW_COPYOUT", 0)
-AMD_CC, AMD_LLVM, AMD_HIPCC  = ContextVar("AMD_CC", ""), ContextVar("AMD_LLVM", 0), ContextVar("AMD_HIPCC", 0)
-QCOM_CC, QCOM_IR3 = ContextVar("QCOM_CC", ""), ContextVar("QCOM_IR3", 0)
+AMD_CC, QCOM_CC = ContextVar("AMD_CC", ""), ContextVar("QCOM_CC", "")
 # VIZ implies PROFILE, but you can run PROFILE without VIZ
 VIZ = ContextVar("VIZ", 0)
 PROFILE = ContextVar("PROFILE", abs(VIZ.value))
--- a/tinygrad/runtime/autogen/init.py
+++ b/tinygrad/runtime/autogen/init.py
@ -127,7 +127,7 @@ def __getattr__(nm):
      return load("rocprof", "['rocprof-trace-decoder', p:='/usr/local/lib/rocprof-trace-decoder.so', p.replace('so','dylib')]",
                  [f"{{}}/include/{s}.h" for s in ["rocprof_trace_decoder", "trace_decoder_instrument", "trace_decoder_types"]],
                  srcs="https://github.com/ROCm/rocprof-trace-decoder/archive/dd0485100971522cc4cd8ae136bdda431061a04d.tar.gz")
-    case "mesa": return load("mesa", "([] if CPU_CC.value == 'LVP' or bool(CPU_LVP) else ['tinymesa']) + ['tinymesa_cpu']", [
+    case "mesa": return load("mesa", "([] if CPU_CC.value == 'LVP' or DEV.renderer == 'LVP' else ['tinymesa']) + ['tinymesa_cpu']", [
        *[f"{{}}/src/compiler/nir/{s}.h" for s in ["nir", "nir_builder", "nir_shader_compiler_options", "nir_serialize"]], "{}/gen/nir_intrinsics.h",
        *[f"{{}}/src/nouveau/{s}.h" for s in ["headers/nv_device_info", "compiler/nak"]],
        *[f"{{}}/src/gallium/auxiliary/gallivm/lp_bld{s}.h" for s in ["", "_passmgr", "_misc", "_type", "_init", "_nir", "_struct", "_jit_types",
@ -146,7 +146,7 @@ def __getattr__(nm):
          *[f"python3 src/compiler/{s}_h.py > gen/{s.split('/')[-1]}.h" for s in ["nir/nir_opcodes", "nir/nir_builder_opcodes"]],
          *[f"python3 src/compiler/nir/nir_{s}_h.py --outdir gen" for s in ["intrinsics", "intrinsics_indices"]]]), cwd=path, shell=True, check=True),
  srcs="https://gitlab.freedesktop.org/mesa/mesa/-/archive/mesa-25.2.7/mesa-25.2.7.tar.gz",
-  prolog=["from tinygrad.helpers import CPU_CC, CPU_LVP", "import gzip, base64"],
+  prolog=["from tinygrad.helpers import CPU_CC, DEV", "import gzip, base64"],
  epilog=lambda path: [system(f"{root}/extra/mesa/lvp_nir_options.sh {path}")])
    case "libclang":
      return load("libclang", clang_lib,
--- a/tinygrad/runtime/autogen/mesa.py
+++ b/tinygrad/runtime/autogen/mesa.py
@ -4,9 +4,9 @@ import ctypes
 from typing import Annotated, Literal, TypeAlias
 from tinygrad.runtime.support.c import _IO, _IOW, _IOR, _IOWR
 from tinygrad.runtime.support import c
-from tinygrad.helpers import CPU_CC, CPU_LVP
+from tinygrad.helpers import CPU_CC, DEV
 import gzip, base64
-dll = c.DLL('mesa', ([] if CPU_CC.value == 'LVP' or bool(CPU_LVP) else ['tinymesa']) + ['tinymesa_cpu'])
+dll = c.DLL('mesa', ([] if CPU_CC.value == 'LVP' or DEV.renderer == 'LVP' else ['tinymesa']) + ['tinymesa_cpu'])
 class struct_u_printf_info(ctypes.Structure): pass
 u_printf_info: TypeAlias = struct_u_printf_info
 uint32_t: TypeAlias = Annotated[int, ctypes.c_uint32]